mirror of
https://github.com/osm-search/Nominatim.git
synced 2026-02-26 11:08:13 +00:00
replace datrie library with a simpler pure-Python class
This commit is contained in:
@@ -2,7 +2,7 @@
|
|||||||
#
|
#
|
||||||
# This file is part of Nominatim. (https://nominatim.org)
|
# This file is part of Nominatim. (https://nominatim.org)
|
||||||
#
|
#
|
||||||
# Copyright (C) 2024 by the Nominatim developer community.
|
# Copyright (C) 2025 by the Nominatim developer community.
|
||||||
# For a full list of authors see the git log.
|
# For a full list of authors see the git log.
|
||||||
"""
|
"""
|
||||||
Generic processor for names that creates abbreviation variants.
|
Generic processor for names that creates abbreviation variants.
|
||||||
@@ -10,12 +10,11 @@ Generic processor for names that creates abbreviation variants.
|
|||||||
from typing import Mapping, Dict, Any, Iterable, Iterator, Optional, List, cast
|
from typing import Mapping, Dict, Any, Iterable, Iterator, Optional, List, cast
|
||||||
import itertools
|
import itertools
|
||||||
|
|
||||||
import datrie
|
|
||||||
|
|
||||||
from ...errors import UsageError
|
from ...errors import UsageError
|
||||||
from ...data.place_name import PlaceName
|
from ...data.place_name import PlaceName
|
||||||
from .config_variants import get_variant_config
|
from .config_variants import get_variant_config
|
||||||
from .generic_mutation import MutationVariantGenerator
|
from .generic_mutation import MutationVariantGenerator
|
||||||
|
from .simple_trie import SimpleTrie
|
||||||
|
|
||||||
# Configuration section
|
# Configuration section
|
||||||
|
|
||||||
@@ -25,8 +24,7 @@ def configure(rules: Mapping[str, Any], normalizer: Any, _: Any) -> Dict[str, An
|
|||||||
"""
|
"""
|
||||||
config: Dict[str, Any] = {}
|
config: Dict[str, Any] = {}
|
||||||
|
|
||||||
config['replacements'], config['chars'] = get_variant_config(rules.get('variants'),
|
config['replacements'], _ = get_variant_config(rules.get('variants'), normalizer)
|
||||||
normalizer)
|
|
||||||
config['variant_only'] = rules.get('mode', '') == 'variant-only'
|
config['variant_only'] = rules.get('mode', '') == 'variant-only'
|
||||||
|
|
||||||
# parse mutation rules
|
# parse mutation rules
|
||||||
@@ -68,12 +66,8 @@ class GenericTokenAnalysis:
|
|||||||
self.variant_only = config['variant_only']
|
self.variant_only = config['variant_only']
|
||||||
|
|
||||||
# Set up datrie
|
# Set up datrie
|
||||||
if config['replacements']:
|
self.replacements: Optional[SimpleTrie[List[str]]] = \
|
||||||
self.replacements = datrie.Trie(config['chars'])
|
SimpleTrie(config['replacements']) if config['replacements'] else None
|
||||||
for src, repllist in config['replacements']:
|
|
||||||
self.replacements[src] = repllist
|
|
||||||
else:
|
|
||||||
self.replacements = None
|
|
||||||
|
|
||||||
# set up mutation rules
|
# set up mutation rules
|
||||||
self.mutations = [MutationVariantGenerator(*cfg) for cfg in config['mutations']]
|
self.mutations = [MutationVariantGenerator(*cfg) for cfg in config['mutations']]
|
||||||
@@ -116,10 +110,10 @@ class GenericTokenAnalysis:
|
|||||||
pos = 0
|
pos = 0
|
||||||
force_space = False
|
force_space = False
|
||||||
while pos < baselen:
|
while pos < baselen:
|
||||||
full, repl = self.replacements.longest_prefix_item(baseform[pos:],
|
frm = pos
|
||||||
(None, None))
|
repl, pos = self.replacements.longest_prefix(baseform, pos)
|
||||||
if full is not None:
|
if repl is not None:
|
||||||
done = baseform[startpos:pos]
|
done = baseform[startpos:frm]
|
||||||
partials = [v + done + r
|
partials = [v + done + r
|
||||||
for v, r in itertools.product(partials, repl)
|
for v, r in itertools.product(partials, repl)
|
||||||
if not force_space or r.startswith(' ')]
|
if not force_space or r.startswith(' ')]
|
||||||
@@ -128,11 +122,10 @@ class GenericTokenAnalysis:
|
|||||||
# to be helpful. Only use the original term.
|
# to be helpful. Only use the original term.
|
||||||
startpos = 0
|
startpos = 0
|
||||||
break
|
break
|
||||||
startpos = pos + len(full)
|
if baseform[pos - 1] == ' ':
|
||||||
if full[-1] == ' ':
|
pos -= 1
|
||||||
startpos -= 1
|
|
||||||
force_space = True
|
force_space = True
|
||||||
pos = startpos
|
startpos = pos
|
||||||
else:
|
else:
|
||||||
pos += 1
|
pos += 1
|
||||||
force_space = False
|
force_space = False
|
||||||
|
|||||||
84
src/nominatim_db/tokenizer/token_analysis/simple_trie.py
Normal file
84
src/nominatim_db/tokenizer/token_analysis/simple_trie.py
Normal file
@@ -0,0 +1,84 @@
|
|||||||
|
# SPDX-License-Identifier: GPL-3.0-or-later
|
||||||
|
#
|
||||||
|
# This file is part of Nominatim. (https://nominatim.org)
|
||||||
|
#
|
||||||
|
# Copyright (C) 2025 by the Nominatim developer community.
|
||||||
|
# For a full list of authors see the git log.
|
||||||
|
"""
|
||||||
|
Simple dict-based implementation of a trie structure.
|
||||||
|
"""
|
||||||
|
from typing import TypeVar, Generic, Tuple, Optional, List, Dict
|
||||||
|
from collections import defaultdict
|
||||||
|
|
||||||
|
T = TypeVar('T')
|
||||||
|
|
||||||
|
|
||||||
|
class SimpleTrie(Generic[T]):
    """ A simple read-only trie structure.

        This structure supports exactly one lookup operation,
        which is longest-prefix lookup.
    """

    def __init__(self, data: Optional[List[Tuple[str, T]]] = None) -> None:
        # Children keyed by a single character. defaultdict lets _add()
        # create missing sub-tries transparently.
        self._tree: Dict[str, 'SimpleTrie[T]'] = defaultdict(SimpleTrie[T])
        # Value stored at this node; None means no word ends here.
        self._value: Optional[T] = None
        # Compressed run of characters leading into this node,
        # filled in by _make_compact().
        self._prefix = ''

        if data:
            for key, value in data:
                self._add(key, 0, value)

            self._make_compact()

    def _add(self, word: str, pos: int, value: T) -> None:
        """ (Internal) Add a sub-word to the trie.

            The word is added from index 'pos'. If the sub-word to add
            is empty, then the trie saves the given value.
        """
        if pos < len(word):
            self._tree[word[pos]]._add(word, pos + 1, value)
        else:
            self._value = value

    def _make_compact(self) -> None:
        """ (Internal) Compress tree where there is exactly one subtree
            and no value.

            Compression works recursively starting at the leaf.
        """
        for t in self._tree.values():
            t._make_compact()

        if len(self._tree) == 1 and self._value is None:
            assert not self._prefix
            for k, v in self._tree.items():
                self._prefix = k + v._prefix
                self._tree = v._tree
                self._value = v._value

    def longest_prefix(self, word: str, start: int = 0) -> Tuple[Optional[T], int]:
        """ Return the longest prefix match for the given word starting at
            the position 'start'.

            The function returns a tuple with the value for the longest match and
            the position of the word after the match. If no match was found at
            all, the function returns (None, start).
        """
        cur = self
        pos = start
        result: Tuple[Optional[T], int] = None, start

        while True:
            if cur._prefix:
                if not word.startswith(cur._prefix, pos):
                    return result
                pos += len(cur._prefix)

            # Compare against None explicitly so that falsy values
            # (0, '', []) stored in the trie are still reported as matches.
            if cur._value is not None:
                result = cur._value, pos

            if pos >= len(word) or word[pos] not in cur._tree:
                return result

            cur = cur._tree[word[pos]]
            pos += 1
|
||||||
37
test/python/tokenizer/token_analysis/test_simple_trie.py
Normal file
37
test/python/tokenizer/token_analysis/test_simple_trie.py
Normal file
@@ -0,0 +1,37 @@
|
|||||||
|
# SPDX-License-Identifier: GPL-3.0-or-later
|
||||||
|
#
|
||||||
|
# This file is part of Nominatim. (https://nominatim.org)
|
||||||
|
#
|
||||||
|
# Copyright (C) 2025 by the Nominatim developer community.
|
||||||
|
# For a full list of authors see the git log.
|
||||||
|
"""
|
||||||
|
Tests for simplified trie structure.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from nominatim_db.tokenizer.token_analysis.simple_trie import SimpleTrie
|
||||||
|
|
||||||
|
def test_single_item_trie():
    # Trie with a single entry: matches only the exact key,
    # honouring the optional 'start' offset.
    trie = SimpleTrie([('foob', 42)])

    assert trie.longest_prefix('afoobar') == (None, 0)
    assert trie.longest_prefix('afoobar', start=1) == (42, 5)
    assert trie.longest_prefix('foob') == (42, 4)
    assert trie.longest_prefix('123foofoo', 3) == (None, 3)
|
||||||
|
|
||||||
|
def test_complex_item_tree():
    # Overlapping keys exercise the longest-prefix behaviour and
    # the internal compaction of single-child nodes.
    trie = SimpleTrie([('a', 1),
                       ('b', 2),
                       ('auto', 3),
                       ('buto', 4),
                       ('automat', 5),
                       ('bu', 6),
                       ('bx', 7)])

    for word, expected in [('a', (1, 1)),
                           ('au', (1, 1)),
                           ('aut', (1, 1)),
                           ('auto', (3, 4)),
                           ('automat', (5, 7)),
                           ('automatx', (5, 7)),
                           ('butomat', (4, 4))]:
        assert trie.longest_prefix(word) == expected

    assert trie.longest_prefix('butomat', 1) == (None, 1)
|
||||||
Reference in New Issue
Block a user