Merge pull request #3659 from lonvia/custom-datrie-structure

Replace datrie library with a simple custom Python implementation
This commit is contained in:
Sarah Hoffmann
2025-02-24 16:49:42 +01:00
committed by GitHub
6 changed files with 134 additions and 22 deletions

View File

@@ -37,7 +37,6 @@ Furthermore the following Python libraries are required:
* [Jinja2](https://palletsprojects.com/p/jinja/) * [Jinja2](https://palletsprojects.com/p/jinja/)
* [PyICU](https://pypi.org/project/PyICU/) * [PyICU](https://pypi.org/project/PyICU/)
* [PyYaml](https://pyyaml.org/) (5.1+) * [PyYaml](https://pyyaml.org/) (5.1+)
* [datrie](https://github.com/pytries/datrie)
These will be installed automatically when using pip installation. These will be installed automatically when using pip installation.

View File

@@ -70,7 +70,7 @@ To set up the virtual environment with all necessary packages run:
virtualenv ~/nominatim-dev-venv virtualenv ~/nominatim-dev-venv
~/nominatim-dev-venv/bin/pip install\ ~/nominatim-dev-venv/bin/pip install\
psutil psycopg[binary] PyICU SQLAlchemy \ psutil psycopg[binary] PyICU SQLAlchemy \
python-dotenv jinja2 pyYAML datrie behave \ python-dotenv jinja2 pyYAML behave \
mkdocs mkdocstrings mkdocs-gen-files pytest pytest-asyncio flake8 \ mkdocs mkdocstrings mkdocs-gen-files pytest pytest-asyncio flake8 \
types-jinja2 types-markupsafe types-psutil types-psycopg2 \ types-jinja2 types-markupsafe types-psutil types-psycopg2 \
types-pygments types-pyyaml types-requests types-ujson \ types-pygments types-pyyaml types-requests types-ujson \

View File

@@ -19,7 +19,6 @@ dependencies = [
"python-dotenv", "python-dotenv",
"jinja2", "jinja2",
"pyYAML>=5.1", "pyYAML>=5.1",
"datrie",
"psutil", "psutil",
"PyICU" "PyICU"
] ]

View File

@@ -2,7 +2,7 @@
# #
# This file is part of Nominatim. (https://nominatim.org) # This file is part of Nominatim. (https://nominatim.org)
# #
# Copyright (C) 2024 by the Nominatim developer community. # Copyright (C) 2025 by the Nominatim developer community.
# For a full list of authors see the git log. # For a full list of authors see the git log.
""" """
Generic processor for names that creates abbreviation variants. Generic processor for names that creates abbreviation variants.
@@ -10,12 +10,11 @@ Generic processor for names that creates abbreviation variants.
from typing import Mapping, Dict, Any, Iterable, Iterator, Optional, List, cast from typing import Mapping, Dict, Any, Iterable, Iterator, Optional, List, cast
import itertools import itertools
import datrie
from ...errors import UsageError from ...errors import UsageError
from ...data.place_name import PlaceName from ...data.place_name import PlaceName
from .config_variants import get_variant_config from .config_variants import get_variant_config
from .generic_mutation import MutationVariantGenerator from .generic_mutation import MutationVariantGenerator
from .simple_trie import SimpleTrie
# Configuration section # Configuration section
@@ -25,8 +24,7 @@ def configure(rules: Mapping[str, Any], normalizer: Any, _: Any) -> Dict[str, An
""" """
config: Dict[str, Any] = {} config: Dict[str, Any] = {}
config['replacements'], config['chars'] = get_variant_config(rules.get('variants'), config['replacements'], _ = get_variant_config(rules.get('variants'), normalizer)
normalizer)
config['variant_only'] = rules.get('mode', '') == 'variant-only' config['variant_only'] = rules.get('mode', '') == 'variant-only'
# parse mutation rules # parse mutation rules
@@ -68,12 +66,8 @@ class GenericTokenAnalysis:
self.variant_only = config['variant_only'] self.variant_only = config['variant_only']
# Set up datrie # Set up datrie
if config['replacements']: self.replacements: Optional[SimpleTrie[List[str]]] = \
self.replacements = datrie.Trie(config['chars']) SimpleTrie(config['replacements']) if config['replacements'] else None
for src, repllist in config['replacements']:
self.replacements[src] = repllist
else:
self.replacements = None
# set up mutation rules # set up mutation rules
self.mutations = [MutationVariantGenerator(*cfg) for cfg in config['mutations']] self.mutations = [MutationVariantGenerator(*cfg) for cfg in config['mutations']]
@@ -116,10 +110,10 @@ class GenericTokenAnalysis:
pos = 0 pos = 0
force_space = False force_space = False
while pos < baselen: while pos < baselen:
full, repl = self.replacements.longest_prefix_item(baseform[pos:], frm = pos
(None, None)) repl, pos = self.replacements.longest_prefix(baseform, pos)
if full is not None: if repl is not None:
done = baseform[startpos:pos] done = baseform[startpos:frm]
partials = [v + done + r partials = [v + done + r
for v, r in itertools.product(partials, repl) for v, r in itertools.product(partials, repl)
if not force_space or r.startswith(' ')] if not force_space or r.startswith(' ')]
@@ -128,11 +122,10 @@ class GenericTokenAnalysis:
# to be helpful. Only use the original term. # to be helpful. Only use the original term.
startpos = 0 startpos = 0
break break
startpos = pos + len(full) if baseform[pos - 1] == ' ':
if full[-1] == ' ': pos -= 1
startpos -= 1
force_space = True force_space = True
pos = startpos startpos = pos
else: else:
pos += 1 pos += 1
force_space = False force_space = False

View File

@@ -0,0 +1,84 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2025 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Simple dict-based implementation of a trie structure.
"""
from typing import TypeVar, Generic, Tuple, Optional, List, Dict
from collections import defaultdict
T = TypeVar('T')


class SimpleTrie(Generic[T]):
    """ A simple read-only trie structure.
        This structure supports exactly one lookup operation,
        which is longest-prefix lookup.
    """

    def __init__(self, data: Optional[List[Tuple[str, T]]] = None) -> None:
        # Children keyed by single character; defaultdict lets _add()
        # create intermediate nodes on demand.
        self._tree: Dict[str, 'SimpleTrie[T]'] = defaultdict(SimpleTrie[T])
        # Value stored at this node, or None when the node is not a key end.
        self._value: Optional[T] = None
        # Multi-character edge label created by _make_compact(); empty
        # before compaction.
        self._prefix = ''

        if data:
            for key, value in data:
                self._add(key, 0, value)

            self._make_compact()

    def _add(self, word: str, pos: int, value: T) -> None:
        """ (Internal) Add a sub-word to the trie.

            The word is added from index 'pos'. If the sub-word to add
            is empty, then the trie saves the given value.
        """
        if pos < len(word):
            self._tree[word[pos]]._add(word, pos + 1, value)
        else:
            self._value = value

    def _make_compact(self) -> None:
        """ (Internal) Compress tree where there is exactly one subtree
            and no value.

            Compression works recursively starting at the leaf.
        """
        for t in self._tree.values():
            t._make_compact()

        if len(self._tree) == 1 and self._value is None:
            # Fold the only child into this node by extending the prefix.
            assert not self._prefix
            for k, v in self._tree.items():
                self._prefix = k + v._prefix
                self._tree = v._tree
                self._value = v._value

    def longest_prefix(self, word: str, start: int = 0) -> Tuple[Optional[T], int]:
        """ Return the longest prefix match for the given word starting at
            the position 'start'.

            The function returns a tuple with the value for the longest match
            and the position of the word after the match. If no match was
            found at all, the function returns (None, start).
        """
        cur = self
        pos = start
        result: Tuple[Optional[T], int] = None, start

        while True:
            if cur._prefix:
                # Compacted multi-character edge must match in full.
                if not word.startswith(cur._prefix, pos):
                    return result
                pos += len(cur._prefix)

            # Compare against None explicitly: falsy values like 0 or ''
            # are legitimate stored values and must still count as a match.
            if cur._value is not None:
                result = cur._value, pos

            if pos >= len(word) or word[pos] not in cur._tree:
                return result

            cur = cur._tree[word[pos]]
            pos += 1

View File

@@ -0,0 +1,37 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2025 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Tests for simplified trie structure.
"""
from nominatim_db.tokenizer.token_analysis.simple_trie import SimpleTrie
def test_single_item_trie():
    """ A single entry is compacted to one prefix edge; check matching,
        non-matching and offset lookups against it.
    """
    trie = SimpleTrie([('foob', 42)])

    assert trie.longest_prefix('foob') == (42, 4)
    assert trie.longest_prefix('afoobar') == (None, 0)
    assert trie.longest_prefix('afoobar', start=1) == (42, 5)
    assert trie.longest_prefix('123foofoo', 3) == (None, 3)
def test_complex_item_tree():
    """ Entries sharing prefixes force branching and partial compaction;
        verify the longest match wins and shorter matches fall back.
    """
    entries = [('a', 1), ('b', 2), ('auto', 3), ('buto', 4),
               ('automat', 5), ('bu', 6), ('bx', 7)]
    trie = SimpleTrie(entries)

    assert trie.longest_prefix('a') == (1, 1)
    assert trie.longest_prefix('au') == (1, 1)
    assert trie.longest_prefix('aut') == (1, 1)
    assert trie.longest_prefix('auto') == (3, 4)
    assert trie.longest_prefix('automat') == (5, 7)
    assert trie.longest_prefix('automatx') == (5, 7)
    assert trie.longest_prefix('butomat') == (4, 4)
    assert trie.longest_prefix('butomat', 1) == (None, 1)