introduce sanitizer step before token analysis

Sanitizer functions allow name and address tags to be transformed before they are handed to the tokenizer. These transformations are visible only to the tokenizer and thus only influence the search terms and address match terms for a place. Currently two sanitizers are implemented, which are responsible for splitting names with multiple values and for removing bracket additions. Both were previously hard-coded in the tokenizer.
2026-02-26 11:08:13 +00:00 · 2021-09-30 21:30:13 +02:00
parent 16daa57e47
commit 8171fe4571
8 changed files with 259 additions and 58 deletions
--- a/nominatim/tokenizer/icu_rule_loader.py
+++ b/nominatim/tokenizer/icu_rule_loader.py
@@ -12,6 +12,7 @@ from icu import Transliterator
 from nominatim.db.properties import set_property, get_property
 from nominatim.errors import UsageError
 from nominatim.tokenizer.icu_name_processor import ICUNameProcessor
 from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
 import nominatim.tokenizer.icu_variants as variants
 LOG = logging.getLogger()
@@ -65,6 +66,9 @@ class ICURuleLoader:
        self.analysis_rules = self._get_section(rules, 'variants')
        self._parse_variant_list()
        # Load optional sanitizer rule set.
        self.sanitizer_rules = rules.get('sanitizers', [])
    def load_config_from_db(self, conn):
        """ Get previously saved parts of the configuration from the
@@ -85,6 +89,12 @@ class ICURuleLoader:
        set_property(conn, DBCFG_IMPORT_ANALYSIS_RULES, json.dumps(self.analysis_rules))
    def make_sanitizer(self):
        """ Create a place sanitizer from the configured rules.

            Returns a new PlaceSanitizer object that is initialised with
            the rule list saved in `self.sanitizer_rules`.
        """
        return PlaceSanitizer(self.sanitizer_rules)
    def make_token_analysis(self):
        """ Create a token analyser from the reviouly loaded rules.
        """
--- a/nominatim/tokenizer/icu_tokenizer.py
+++ b/nominatim/tokenizer/icu_tokenizer.py
@@ -13,6 +13,7 @@ from nominatim.db.connection import connect
 from nominatim.db.properties import set_property, get_property
 from nominatim.db.utils import CopyBuffer
 from nominatim.db.sql_preprocessor import SQLPreprocessor
 from nominatim.indexer.place_info import PlaceInfo
 from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
 from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
@@ -107,7 +108,8 @@ class LegacyICUTokenizer(AbstractTokenizer):
            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
-        return LegacyICUNameAnalyzer(self.dsn, self.loader.make_token_analysis())
+        return LegacyICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
                                     self.loader.make_token_analysis())
    def _install_php(self, phpdir):
@@ -187,10 +189,11 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
        normalization.
    """
-    def __init__(self, dsn, name_proc):
+    def __init__(self, dsn, sanitizer, token_analysis):
        self.conn = connect(dsn).connection
        self.conn.autocommit = True
-        self.name_processor = name_proc
+        self.sanitizer = sanitizer
        self.token_analysis = token_analysis
        self._cache = _TokenCache()
@@ -203,6 +206,19 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
            self.conn = None
    def _search_normalized(self, name):
        """ Return the search token transliteration of the given name.

            The computation is delegated to the `get_search_normalized()`
            function of the token analysis object of this analyzer.
        """
        return self.token_analysis.get_search_normalized(name)
    def _normalized(self, name):
        """ Return the normalized version of the given name with all
            non-relevant information removed.

            The computation is delegated to the `get_normalized()`
            function of the token analysis object of this analyzer.
        """
        return self.token_analysis.get_normalized(name)
    def get_word_token_info(self, words):
        """ Return token information for the given list of words.
            If a word starts with # it is assumed to be a full name
@@ -218,9 +234,9 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
        partial_tokens = {}
        for word in words:
            if word.startswith('#'):
-                full_tokens[word] = self.name_processor.get_search_normalized(word[1:])
+                full_tokens[word] = self._search_normalized(word[1:])
            else:
-                partial_tokens[word] = self.name_processor.get_search_normalized(word)
+                partial_tokens[word] = self._search_normalized(word)
        with self.conn.cursor() as cur:
            cur.execute("""SELECT word_token, word_id
@@ -251,7 +267,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
            This function takes minor shortcuts on transliteration.
        """
-        return self.name_processor.get_search_normalized(hnr)
+        return self._search_normalized(hnr)
    def update_postcodes_from_db(self):
        """ Update postcode tokens in the word table from the location_postcode
@@ -274,7 +290,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
                    if postcode is None:
                        to_delete.append(word)
                    else:
-                        copystr.add(self.name_processor.get_search_normalized(postcode),
+                        copystr.add(self._search_normalized(postcode),
                                    'P', postcode)
                if to_delete:
@@ -292,7 +308,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
            completely replaced. Otherwise the phrases are added to the
            already existing ones.
        """
-        norm_phrases = set(((self.name_processor.get_normalized(p[0]), p[1], p[2], p[3])
+        norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
                            for p in phrases))
        with self.conn.cursor() as cur:
@@ -322,7 +338,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
        added = 0
        with CopyBuffer() as copystr:
            for word, cls, typ, oper in to_add:
-                term = self.name_processor.get_search_normalized(word)
+                term = self._search_normalized(word)
                if term:
                    copystr.add(term, 'S', word,
                                json.dumps({'class': cls, 'type': typ,
@@ -356,9 +372,21 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
    def add_country_names(self, country_code, names):
        """ Add names for the given country to the search index.

            The names are wrapped into a PlaceInfo object and run through
            the sanitizer. Only the resulting name list is then handed
            on to `_add_country_full_names()`.
        """
        # Make sure any name preprocessing for country names applies.
        info = PlaceInfo({'name': names, 'country_code': country_code,
                          'rank_address': 4, 'class': 'boundary',
                          'type': 'administrative'})
        # Only the first element of the sanitizer result is used here.
        self._add_country_full_names(country_code,
                                     self.sanitizer.process_names(info)[0])
    def _add_country_full_names(self, country_code, names):
        """ Add names for the given country from an already sanitized
            name list.
        """
        word_tokens = set()
-        for name in self._compute_full_names(names):
+        for name in names:
-            norm_name = self.name_processor.get_search_normalized(name)
+            norm_name = self._search_normalized(name.name)
            if norm_name:
                word_tokens.add(norm_name)
@@ -384,12 +412,12 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
    def process_place(self, place):
        """ Determine tokenizer information about the given place.
-            Returns a JSON-serialisable structure that will be handed into
+            Returns a JSON-serializable structure that will be handed into
            the database via the token_info field.
        """
        token_info = _TokenInfo(self._cache)
-        names = place.name
+        names, address = self.sanitizer.process_names(place)
        if names:
            fulls, partials = self._compute_name_tokens(names)
@@ -397,9 +425,8 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
            token_info.add_names(fulls, partials)
            if place.is_country():
-                self.add_country_names(place.country_code, names)
+                self._add_country_full_names(place.country_code, names)
        address = place.address
        if address:
            self._process_place_address(token_info, address)
@@ -409,18 +436,18 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
    def _process_place_address(self, token_info, address):
        hnrs = []
        addr_terms = []
-        for key, value in address.items():
+        for item in address:
-            if key == 'postcode':
+            if item.kind == 'postcode':
-                self._add_postcode(value)
+                self._add_postcode(item.name)
-            elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
+            elif item.kind in ('housenumber', 'streetnumber', 'conscriptionnumber'):
-                hnrs.append(value)
+                hnrs.append(item.name)
-            elif key == 'street':
+            elif item.kind == 'street':
-                token_info.add_street(self._compute_partial_tokens(value))
+                token_info.add_street(self._compute_partial_tokens(item.name))
-            elif key == 'place':
+            elif item.kind == 'place':
-                token_info.add_place(self._compute_partial_tokens(value))
+                token_info.add_place(self._compute_partial_tokens(item.name))
-            elif not key.startswith('_') and \
+            elif not item.kind.startswith('_') and \
-                 key not in ('country', 'full'):
+                 item.kind not in ('country', 'full'):
-                addr_terms.append((key, self._compute_partial_tokens(value)))
+                addr_terms.append((item.kind, self._compute_partial_tokens(item.name)))
        if hnrs:
            hnrs = self._split_housenumbers(hnrs)
@@ -433,7 +460,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
        """ Normalize the given term, split it into partial words and return
            then token list for them.
        """
-        norm_name = self.name_processor.get_search_normalized(name)
+        norm_name = self._search_normalized(name)
        tokens = []
        need_lookup = []
@@ -456,19 +483,19 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
        return tokens
    def _compute_name_tokens(self, names):
        """ Computes the full name and partial name tokens for the given
            dictionary of names.
        """
        full_names = self._compute_full_names(names)
        full_tokens = set()
        partial_tokens = set()
-        for name in full_names:
+        for name in names:
-            norm_name = self.name_processor.get_normalized(name)
+            norm_name = self._normalized(name.name)
            full, part = self._cache.names.get(norm_name, (None, None))
            if full is None:
-                variants = self.name_processor.get_variants_ascii(norm_name)
+                variants = self.token_analysis.get_variants_ascii(norm_name)
                if not variants:
                    continue
@@ -485,23 +512,6 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
        return full_tokens, partial_tokens
    @staticmethod
    def _compute_full_names(names):
        """ Return the set of all full name word ids to be used with the
            given dictionary of names.
        """
        full_names = set()
        for name in (n.strip() for ns in names.values() for n in re.split('[;,]', ns)):
            if name:
                full_names.add(name)
                brace_idx = name.find('(')
                if brace_idx >= 0:
                    full_names.add(name[:brace_idx].strip())
        return full_names
    def _add_postcode(self, postcode):
        """ Make sure the normalized postcode is present in the word table.
        """
@@ -509,7 +519,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
            postcode = self.normalize_postcode(postcode)
            if postcode not in self._cache.postcodes:
-                term = self.name_processor.get_search_normalized(postcode)
+                term = self._search_normalized(postcode)
                if not term:
                    return
--- a/nominatim/tokenizer/place_sanitizer.py
+++ b/nominatim/tokenizer/place_sanitizer.py
@@ -0,0 +1,127 @@
 """
 Handler for cleaning name and address tags in place information before it
 is handed to the token analysis.
 """
 import importlib
 from nominatim.errors import UsageError
 class PlaceName:
    """ A single searchable name of a place together with its properties.

        Each object keeps the name string itself and two basic properties:
        * 'kind' is the name of the OSM key without any suffixes
          (i.e. with the part after the colon removed)
        * 'suffix' holds the suffix of the OSM tag, if any. The suffix
          is the part of the key after the first colon.

        Beyond that, arbitrary additional attributes may be attached to a
        name. Which of them are evaluated depends on the token analyser.
    """

    def __init__(self, name, kind, suffix):
        self.name, self.kind, self.suffix = name, kind, suffix
        # Additional free-form attributes, none to begin with.
        self.attr = {}

    def __repr__(self):
        return f"PlaceName(name='{self.name}',kind='{self.kind}',suffix='{self.suffix}')"

    def clone(self, name=None, kind=None, suffix=None, attr=None):
        """ Return a copy of this place name, optionally with the given
            parameters replaced.

            Parameters that are left out (or are otherwise falsy) keep
            the value of the original. The copy receives its own attribute
            dictionary filled with the attributes of the original; the keys
            given in 'attr' are then added or overwritten. Consequently the
            function cannot be used to remove an attribute from a name.
        """
        duplicate = PlaceName(name if name else self.name,
                              kind if kind else self.kind,
                              suffix if suffix else self.suffix)
        for source in (self.attr, attr):
            if source:
                duplicate.attr.update(source)

        return duplicate

    def set_attr(self, key, value):
        """ Set the property 'key' of the name to 'value', overwriting
            any value the property had before.
        """
        self.attr[key] = value

    def get_attr(self, key, default=None):
        """ Return the value of the property 'key'. When the property is
            not set, 'default' is returned instead.
        """
        try:
            return self.attr[key]
        except KeyError:
            return default

    def has_attr(self, key):
        """ Return True when the property 'key' is set on the name.
        """
        return key in self.attr
 class _ProcessInfo:
    """ Data holder that is handed to every handler function.

        The members 'names' and 'address' are mutable. A handler changes
        them either by modifying the list in place or by replacing the
        old content with a new list.
    """

    def __init__(self, place):
        self.place = place
        self.names = self._convert_name_dict(place.name)
        self.address = self._convert_name_dict(place.address)

    @staticmethod
    def _convert_name_dict(names):
        """ Turn a dictionary of names into a list of PlaceNames.

            Each dictionary key is divided at the first colon into the
            primary part of the key and the suffix. Keys without a colon
            get a suffix of None. Values and key parts are stripped of
            surrounding whitespace. An empty or missing dictionary results
            in an empty list.
        """
        if not names:
            return []

        converted = []
        for key, value in names.items():
            kind, colon, suffix = key.partition(':')
            converted.append(PlaceName(value.strip(),
                                       kind.strip(),
                                       suffix.strip() if colon else None))

        return converted
 class PlaceSanitizer:
    """ Controller that runs the configured sanitizer functions over the
        names and address of a place before the token analysers get to
        see them.
    """

    def __init__(self, rules):
        self.handlers = []

        # A missing or empty rule set simply yields no handlers.
        for rule in rules or ():
            if 'step' not in rule:
                raise UsageError("Sanitizer rule is missing the 'step' attribute.")
            # The step name maps to a module of the sanitizers package,
            # with dashes replaced by underscores.
            step_module = rule['step'].replace('-', '_')
            module = importlib.import_module('nominatim.tokenizer.sanitizers.' + step_module)
            self.handlers.append(module.create(rule))

    def process_names(self, place):
        """ Compute the sanitized names and address parts for the given
            place. All handlers are applied in the order in which they
            were configured. The result is a tuple
            (list of names, list of address names)
        """
        info = _ProcessInfo(place)

        for handler in self.handlers:
            handler(info)

        return info.names, info.address
--- a/nominatim/tokenizer/sanitizers/__init__.py
+++ b/nominatim/tokenizer/sanitizers/__init__.py
--- a/nominatim/tokenizer/sanitizers/split_name_list.py
+++ b/nominatim/tokenizer/sanitizers/split_name_list.py
@@ -0,0 +1,28 @@
 """
 Name processor that splits name values with multiple values into their components.
 """
 import re
 def create(func):
    """ Create a name processing function that splits name values with
        multiple values into their components.

        The optional parameter 'delimiters' can be used to define the
        characters that should be used for splitting. The default is ',;'.
        Every character is taken literally, including those that have
        a special meaning in a regular expression.

        The returned function expects an object with a 'names' list. Names
        that contain a delimiter are replaced by one clone per component.
        The components are stripped of surrounding whitespace and empty
        components are dropped. Names without a delimiter are kept as is.
    """
    # Escape the delimiters: characters like '^', ']', '-' or '\' would
    # otherwise change the meaning of the character class.
    regexp = re.compile('[{}]'.format(re.escape(func.get('delimiters', ',;'))))

    def _process(obj):
        if not obj.names:
            return

        new_names = []
        for name in obj.names:
            split_names = regexp.split(name.name)
            if len(split_names) == 1:
                new_names.append(name)
            else:
                # Empty parts must be skipped: clone() falls back to the
                # original, unsplit name when it is handed an empty name.
                parts = (part.strip() for part in split_names)
                new_names.extend(name.clone(name=part) for part in parts if part)

        obj.names = new_names

    return _process
--- a/nominatim/tokenizer/sanitizers/strip_brace_terms.py
+++ b/nominatim/tokenizer/sanitizers/strip_brace_terms.py
@@ -0,0 +1,22 @@
 """
 Sanitizer handling names with addendums in braces.
 """
 def create(_):
    """ Create a name processing function that creates additional name variants
        when a name has an addendum in brackets (e.g. "Halle (Saale)"). The
        additional variant only contains the main name without the bracket part.

        The configuration handed in is not used.
    """

    def _process(obj):
        """ Add variants for names that have a bracket extension.

            The variants are appended to the existing 'names' list of
            the object. Nothing is done when there are no names at all.
        """
        # Leave early so that a missing name list is never extended.
        if not obj.names:
            return

        # Collect first: the list must not grow while it is iterated.
        new_names = []
        for name in obj.names:
            if '(' in name.name:
                new_name = name.name.split('(')[0].strip()
                # Names that start with a bracket have no main part.
                if new_name:
                    new_names.append(name.clone(name=new_name))

        obj.names.extend(new_names)

    return _process
--- a/settings/icu_tokenizer.yaml
+++ b/settings/icu_tokenizer.yaml
@@ -24,6 +24,9 @@ transliteration:
    - "[^[:Ascii:]] >"
    - ":: lower ()"
    - ":: NFC ()"
 sanitizers:
    - step: split-name-list
    - step: strip-brace-terms
 variants:
    - !include icu-rules/variants-bg.yaml
    - !include icu-rules/variants-ca.yaml
--- a/test/python/test_tokenizer_icu.py
+++ b/test/python/test_tokenizer_icu.py
@@ -67,10 +67,12 @@ def analyzer(tokenizer_factory, test_config, monkeypatch,
    monkeypatch.undo()
    def _mk_analyser(norm=("[[:Punctuation:][:Space:]]+ > ' '",), trans=(':: upper()',),
-                     variants=('~gasse -> gasse', 'street => st', )):
+                     variants=('~gasse -> gasse', 'street => st', ),
                     sanitizers=[]):
        cfgstr = {'normalization' : list(norm),
-                   'transliteration' : list(trans),
+                  'sanitizers' : sanitizers,
-                   'variants' : [ {'words': list(variants)}]}
+                  'transliteration' : list(trans),
                  'variants' : [ {'words': list(variants)}]}
        (test_config.project_dir / 'icu_tokenizer.yaml').write_text(yaml.dump(cfgstr))
        tok.loader = ICURuleLoader(test_config)
@@ -309,14 +311,15 @@ class TestPlaceNames:
    @pytest.fixture(autouse=True)
    def setup(self, analyzer, sql_functions):
-        with analyzer() as anl:
+        sanitizers = [{'step': 'split-name-list'},
                      {'step': 'strip-brace-terms'}]
        with analyzer(sanitizers=sanitizers) as anl:
            self.analyzer = anl
            yield anl
    def expect_name_terms(self, info, *expected_terms):
        tokens = self.analyzer.get_word_token_info(expected_terms)
        print (tokens)
        for token in tokens:
            assert token[2] is not None, "No token for {0}".format(token)
@@ -324,9 +327,7 @@ class TestPlaceNames:
    def process_named_place(self, names):
-        place = {'name': names}
+        return self.analyzer.process_place(PlaceInfo({'name': names}))
        return self.analyzer.process_place(PlaceInfo(place))
    def test_simple_names(self):