move abbreviation computation into import phase

This adds precomputation of abbreviated terms for names during import and removes the abbreviation of terms at query time. Basic import works, but it still needs thorough testing as well as speed improvements during import. This introduces a new dependency on the Python library datrie.
2026-02-16 05:18:00 +00:00 · 2021-05-28 22:06:13 +02:00
parent 6ba00e6aee
commit 8413075249
10 changed files with 665 additions and 206 deletions
--- a/test/python/test_tokenizer_icu_rule_loader.py
+++ b/test/python/test_tokenizer_icu_rule_loader.py
@@ -0,0 +1,75 @@
+"""
+Tests for converting a config file to ICU rules.
+"""
+import pytest
+from textwrap import dedent
+
+from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
+from nominatim.errors import UsageError
+
+from icu import Transliterator
+
+@pytest.fixture
+def cfgfile(tmp_path, suffix='.yaml'):  # factory fixture; `suffix` is the config file extension
+    def _create_config(suffixes, abbr):  # each string becomes one YAML list item of its section
+        content = dedent("""\
+        normalization:
+            - ":: NFD ()"
+            - "[[:Nonspacing Mark:] [:Cf:]] >"
+            - ":: lower ()"
+            - "[[:Punctuation:][:Space:]]+ > ' '"
+            - ":: NFC ()"
+        transliteration:
+            - "::  Latin ()"
+        """)
+        content += "compound_suffixes:\n"
+        content += '\n'.join(("    - " + s for s in suffixes)) + '\n'
+        content += "abbreviations:\n"
+        content += '\n'.join(("    - " + s for s in abbr)) + '\n'
+        fpath = tmp_path / ('test_config' + suffix)
+        fpath.write_text(dedent(content))  # NOTE(review): content already has column-0 lines, so this dedent looks like a no-op
+        return fpath
+    # Tests call the returned factory; it writes the file under tmp_path and returns its path.
+    return _create_config
+
+def test_missing_normalization(tmp_path):  # config without a 'normalization' section is rejected
+    fpath = tmp_path / ('test_config.yaml')
+    fpath.write_text(dedent("""\
+        normalizatio:
+            - ":: NFD ()"
+        """))
+    # The key above is deliberately misspelled, so constructing the loader must raise UsageError.
+    with pytest.raises(UsageError):
+        ICURuleLoader(fpath)
+
+
+def test_get_search_rules(cfgfile):  # search rules must compile and run as an ICU transliterator
+    fpath = cfgfile(['strasse', 'straße', 'weg'],
+                    ['strasse,straße => str',
+                     'prospekt => pr'])
+    # Config: three compound suffixes and two abbreviation rules.
+    loader = ICURuleLoader(fpath)
+    # Feed the generated rule text straight into ICU to check it is valid rule syntax.
+    rules = loader.get_search_rules()
+    trans = Transliterator.createFromRules("test", rules)
+    # Expected: a trailing suffix (or 'str') is split off as its own word; non-Latin input is romanised.
+    assert trans.transliterate(" Baumstraße ") == " baum straße "
+    assert trans.transliterate(" Baumstrasse ") == " baum strasse "
+    assert trans.transliterate(" Baumstr ") == " baum str "
+    assert trans.transliterate(" Baumwegstr ") == " baumweg str "
+    assert trans.transliterate(" Αθήνα ") == " athēna "
+    assert trans.transliterate(" проспект ") == " prospekt "
+
+
+def test_get_synonym_pairs(cfgfile):  # replacement pairs derived from suffixes and abbreviations
+    fpath = cfgfile(['Weg', 'Strasse'],
+                    ['Strasse => str,st'])
+    # Config input is capitalised; the expected pairs below are all lower-case.
+    loader = ICURuleLoader(fpath)
+    # Each pair is (source string, set of replacement strings), compared as an ordered list.
+    repl = loader.get_replacement_pairs()
+    # 'strasse' appears with and without a leading space; 'weg' (no abbreviation rule) only without.
+    assert repl == [(' strasse ', {' strasse ', ' str ', ' st '}),
+                    ('strasse ', {' strasse ', ' str ', ' st '}),
+                    ('weg ', {' weg '})]
+