introduce sanitizer step before token analysis

Sanitizer functions make it possible to transform name and address
tags before they are handed to the tokenizer. These transformations
are visible only to the tokenizer and thus only influence the search
terms and address match terms for a place.

Currently two sanitizers are implemented: one splits names with
multiple values, the other removes bracket additions. Both were
previously hard-coded in the tokenizer.
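
To make the intent concrete, here is a minimal plain-Python sketch of the
two transformations; the function names and signatures are illustrative
only and do not mirror Nominatim's actual sanitizer interface.

    import re

    def split_name_list(names, delimiter=';'):
        # Turn multi-value tags such as 'Halle (Saale);Halle an der Saale'
        # into one entry per value.
        return [part.strip()
                for name in names
                for part in name.split(delimiter)
                if part.strip()]

    def strip_brace_terms(names):
        # Keep each name and, where it carries a bracket addition like
        # 'Halle (Saale)', add the variant without the addition.
        result = []
        for name in names:
            result.append(name)
            stripped = re.sub(r'\s*\([^)]*\)', '', name).strip()
            if stripped and stripped != name:
                result.append(stripped)
        return result

    print(strip_brace_terms(split_name_list(['Halle (Saale);Halle an der Saale'])))
    # -> ['Halle (Saale)', 'Halle', 'Halle an der Saale']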
Sarah Hoffmann
2021-09-30 21:30:13 +02:00
parent 16daa57e47
commit 8171fe4571
8 changed files with 259 additions and 58 deletions


@@ -67,10 +67,12 @@ def analyzer(tokenizer_factory, test_config, monkeypatch,
     monkeypatch.undo()

 def _mk_analyser(norm=("[[:Punctuation:][:Space:]]+ > ' '",), trans=(':: upper()',),
-                 variants=('~gasse -> gasse', 'street => st', )):
+                 variants=('~gasse -> gasse', 'street => st', ),
+                 sanitizers=[]):
     cfgstr = {'normalization' : list(norm),
-              'transliteration' : list(trans),
-              'variants' : [ {'words': list(variants)}]}
+              'sanitizers' : sanitizers,
+              'transliteration' : list(trans),
+              'variants' : [ {'words': list(variants)}]}
     (test_config.project_dir / 'icu_tokenizer.yaml').write_text(yaml.dump(cfgstr))
     tok.loader = ICURuleLoader(test_config)
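
For reference, the configuration the extended fixture writes for a test
passing sanitizers=[{'step': 'split-name-list'}] would look roughly like
the dict below before yaml.dump serialises it (values taken from the
fixture defaults; key order in the YAML output may differ):

    import yaml

    cfgstr = {'normalization': ["[[:Punctuation:][:Space:]]+ > ' '"],
              'sanitizers': [{'step': 'split-name-list'}],
              'transliteration': [':: upper()'],
              'variants': [{'words': ['~gasse -> gasse', 'street => st']}]}
    print(yaml.dump(cfgstr))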
@@ -309,14 +311,15 @@ class TestPlaceNames:

     @pytest.fixture(autouse=True)
     def setup(self, analyzer, sql_functions):
-        with analyzer() as anl:
+        sanitizers = [{'step': 'split-name-list'},
+                      {'step': 'strip-brace-terms'}]
+        with analyzer(sanitizers=sanitizers) as anl:
             self.analyzer = anl
             yield anl

     def expect_name_terms(self, info, *expected_terms):
         tokens = self.analyzer.get_word_token_info(expected_terms)
         print (tokens)
         for token in tokens:
             assert token[2] is not None, "No token for {0}".format(token)
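
The list order in the fixture matters: splitting runs first so that the
brace stripping then sees each value on its own. A tiny standalone sketch
of that ordering, with simple list-in/list-out stand-ins for the two steps
(illustrative only, not the real sanitizer API):

    def run_steps(steps, names):
        # Apply each configured sanitizer step in list order.
        for step in steps:
            names = step(names)
        return names

    def split_values(names):
        # Stand-in for 'split-name-list'.
        return [p.strip() for n in names for p in n.split(';') if p.strip()]

    def strip_braces(names):
        # Stand-in for 'strip-brace-terms': keep the name, add the
        # variant with the '(...)' part removed.
        return [v for n in names
                for v in ([n, n.split('(', 1)[0].strip()] if '(' in n else [n])]

    print(run_steps([split_values, strip_braces], ['Halle (Saale);Madrid']))
    # -> ['Halle (Saale)', 'Halle', 'Madrid']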
@@ -324,9 +327,7 @@ class TestPlaceNames:

     def process_named_place(self, names):
-        place = {'name': names}
-        return self.analyzer.process_place(PlaceInfo(place))
+        return self.analyzer.process_place(PlaceInfo({'name': names}))

     def test_simple_names(self):