replace datrie library with a more simple pure-Python class

This commit is contained in:
Sarah Hoffmann
2025-02-18 21:09:12 +01:00
parent f567ea89cc
commit 13db4c9731
3 changed files with 133 additions and 19 deletions

View File

@@ -2,7 +2,7 @@
# #
# This file is part of Nominatim. (https://nominatim.org) # This file is part of Nominatim. (https://nominatim.org)
# #
# Copyright (C) 2024 by the Nominatim developer community. # Copyright (C) 2025 by the Nominatim developer community.
# For a full list of authors see the git log. # For a full list of authors see the git log.
""" """
Generic processor for names that creates abbreviation variants. Generic processor for names that creates abbreviation variants.
@@ -10,12 +10,11 @@ Generic processor for names that creates abbreviation variants.
from typing import Mapping, Dict, Any, Iterable, Iterator, Optional, List, cast from typing import Mapping, Dict, Any, Iterable, Iterator, Optional, List, cast
import itertools import itertools
import datrie
from ...errors import UsageError from ...errors import UsageError
from ...data.place_name import PlaceName from ...data.place_name import PlaceName
from .config_variants import get_variant_config from .config_variants import get_variant_config
from .generic_mutation import MutationVariantGenerator from .generic_mutation import MutationVariantGenerator
from .simple_trie import SimpleTrie
# Configuration section # Configuration section
@@ -25,8 +24,7 @@ def configure(rules: Mapping[str, Any], normalizer: Any, _: Any) -> Dict[str, An
""" """
config: Dict[str, Any] = {} config: Dict[str, Any] = {}
config['replacements'], config['chars'] = get_variant_config(rules.get('variants'), config['replacements'], _ = get_variant_config(rules.get('variants'), normalizer)
normalizer)
config['variant_only'] = rules.get('mode', '') == 'variant-only' config['variant_only'] = rules.get('mode', '') == 'variant-only'
# parse mutation rules # parse mutation rules
@@ -68,12 +66,8 @@ class GenericTokenAnalysis:
self.variant_only = config['variant_only'] self.variant_only = config['variant_only']
# Set up datrie # Set up datrie
if config['replacements']: self.replacements: Optional[SimpleTrie[List[str]]] = \
self.replacements = datrie.Trie(config['chars']) SimpleTrie(config['replacements']) if config['replacements'] else None
for src, repllist in config['replacements']:
self.replacements[src] = repllist
else:
self.replacements = None
# set up mutation rules # set up mutation rules
self.mutations = [MutationVariantGenerator(*cfg) for cfg in config['mutations']] self.mutations = [MutationVariantGenerator(*cfg) for cfg in config['mutations']]
@@ -116,10 +110,10 @@ class GenericTokenAnalysis:
pos = 0 pos = 0
force_space = False force_space = False
while pos < baselen: while pos < baselen:
full, repl = self.replacements.longest_prefix_item(baseform[pos:], frm = pos
(None, None)) repl, pos = self.replacements.longest_prefix(baseform, pos)
if full is not None: if repl is not None:
done = baseform[startpos:pos] done = baseform[startpos:frm]
partials = [v + done + r partials = [v + done + r
for v, r in itertools.product(partials, repl) for v, r in itertools.product(partials, repl)
if not force_space or r.startswith(' ')] if not force_space or r.startswith(' ')]
@@ -128,11 +122,10 @@ class GenericTokenAnalysis:
# to be helpful. Only use the original term. # to be helpful. Only use the original term.
startpos = 0 startpos = 0
break break
startpos = pos + len(full) if baseform[pos - 1] == ' ':
if full[-1] == ' ': pos -= 1
startpos -= 1
force_space = True force_space = True
pos = startpos startpos = pos
else: else:
pos += 1 pos += 1
force_space = False force_space = False

View File

@@ -0,0 +1,84 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2025 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Simple dict-based implementation of a trie structure.
"""
from typing import TypeVar, Generic, Tuple, Optional, List, Dict
from collections import defaultdict
T = TypeVar('T')
class SimpleTrie(Generic[T]):
""" A simple read-only trie structure.
This structure supports examply one lookup operation,
which is longest-prefix lookup.
"""
def __init__(self, data: Optional[List[Tuple[str, T]]] = None) -> None:
self._tree: Dict[str, 'SimpleTrie[T]'] = defaultdict(SimpleTrie[T])
self._value: Optional[T] = None
self._prefix = ''
if data:
for key, value in data:
self._add(key, 0, value)
self._make_compact()
def _add(self, word: str, pos: int, value: T) -> None:
""" (Internal) Add a sub-word to the trie.
The word is added from index 'pos'. If the sub-word to add
is empty, then the trie saves the given value.
"""
if pos < len(word):
self._tree[word[pos]]._add(word, pos + 1, value)
else:
self._value = value
def _make_compact(self) -> None:
""" (Internal) Compress tree where there is exactly one subtree
and no value.
Compression works recursively starting at the leaf.
"""
for t in self._tree.values():
t._make_compact()
if len(self._tree) == 1 and self._value is None:
assert not self._prefix
for k, v in self._tree.items():
self._prefix = k + v._prefix
self._tree = v._tree
self._value = v._value
def longest_prefix(self, word: str, start: int = 0) -> Tuple[Optional[T], int]:
""" Return the longest prefix match for the given word starting at
the position 'start'.
The function returns a tuple with the value for the longest match and
the position of the word after the match. If no match was found at
all, the function returns (None, start).
"""
cur = self
pos = start
result: Tuple[Optional[T], int] = None, start
while True:
if cur._prefix:
if not word.startswith(cur._prefix, pos):
return result
pos += len(cur._prefix)
if cur._value:
result = cur._value, pos
if pos >= len(word) or word[pos] not in cur._tree:
return result
cur = cur._tree[word[pos]]
pos += 1

View File

@@ -0,0 +1,37 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2025 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Tests for simplified trie structure.
"""
from nominatim_db.tokenizer.token_analysis.simple_trie import SimpleTrie
def test_single_item_trie():
t = SimpleTrie([('foob', 42)])
assert t.longest_prefix('afoobar') == (None, 0)
assert t.longest_prefix('afoobar', start=1) == (42, 5)
assert t.longest_prefix('foob') == (42, 4)
assert t.longest_prefix('123foofoo', 3) == (None, 3)
def test_complex_item_tree():
t = SimpleTrie([('a', 1),
('b', 2),
('auto', 3),
('buto', 4),
('automat', 5),
('bu', 6),
('bx', 7)])
assert t.longest_prefix('a') == (1, 1)
assert t.longest_prefix('au') == (1, 1)
assert t.longest_prefix('aut') == (1, 1)
assert t.longest_prefix('auto') == (3, 4)
assert t.longest_prefix('automat') == (5, 7)
assert t.longest_prefix('automatx') == (5, 7)
assert t.longest_prefix('butomat') == (4, 4)
assert t.longest_prefix('butomat', 1) == (None, 1)