introduce sanitizer step before token analysis

Sanitizer functions allow transforming name and address tags before they are handed to the tokenizer. These transformations are visible only to the tokenizer and thus only influence the search terms and address match terms for a place. Currently two sanitizers are implemented, which are responsible for splitting names with multiple values and removing bracket additions. Both were previously hard-coded in the tokenizer.
2026-03-11 13:24:07 +00:00 · 2021-09-30 21:30:13 +02:00
parent 16daa57e47
commit 8171fe4571
8 changed files with 259 additions and 58 deletions
--- a/nominatim/tokenizer/icu_tokenizer.py
+++ b/nominatim/tokenizer/icu_tokenizer.py
@@ -13,6 +13,7 @@ from nominatim.db.connection import connect
 from nominatim.db.properties import set_property, get_property
 from nominatim.db.utils import CopyBuffer
 from nominatim.db.sql_preprocessor import SQLPreprocessor
+from nominatim.indexer.place_info import PlaceInfo
 from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
 from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer

@@ -107,7 +108,8 @@ class LegacyICUTokenizer(AbstractTokenizer):

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
-        return LegacyICUNameAnalyzer(self.dsn, self.loader.make_token_analysis())
+        return LegacyICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
+                                     self.loader.make_token_analysis())


    def _install_php(self, phpdir):
@@ -187,10 +189,11 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
        normalization.
    """

-    def __init__(self, dsn, name_proc):
+    def __init__(self, dsn, sanitizer, token_analysis):
        self.conn = connect(dsn).connection
        self.conn.autocommit = True
-        self.name_processor = name_proc
+        self.sanitizer = sanitizer
+        self.token_analysis = token_analysis

        self._cache = _TokenCache()

@@ -203,6 +206,19 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
            self.conn = None


+    def _search_normalized(self, name):
+        """ Return the search token transliteration of the given name.
+        """
+        return self.token_analysis.get_search_normalized(name)
+
+
+    def _normalized(self, name):
+        """ Return the normalized version of the given name with all
+            non-relevant information removed.
+        """
+        return self.token_analysis.get_normalized(name)
+
+
    def get_word_token_info(self, words):
        """ Return token information for the given list of words.
            If a word starts with # it is assumed to be a full name
@@ -218,9 +234,9 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
        partial_tokens = {}
        for word in words:
            if word.startswith('#'):
-                full_tokens[word] = self.name_processor.get_search_normalized(word[1:])
+                full_tokens[word] = self._search_normalized(word[1:])
            else:
-                partial_tokens[word] = self.name_processor.get_search_normalized(word)
+                partial_tokens[word] = self._search_normalized(word)

        with self.conn.cursor() as cur:
            cur.execute("""SELECT word_token, word_id
@@ -251,7 +267,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):

            This function takes minor shortcuts on transliteration.
        """
-        return self.name_processor.get_search_normalized(hnr)
+        return self._search_normalized(hnr)

    def update_postcodes_from_db(self):
        """ Update postcode tokens in the word table from the location_postcode
@@ -274,7 +290,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
                    if postcode is None:
                        to_delete.append(word)
                    else:
-                        copystr.add(self.name_processor.get_search_normalized(postcode),
+                        copystr.add(self._search_normalized(postcode),
                                    'P', postcode)

                if to_delete:
@@ -292,7 +308,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
            completely replaced. Otherwise the phrases are added to the
            already existing ones.
        """
-        norm_phrases = set(((self.name_processor.get_normalized(p[0]), p[1], p[2], p[3])
+        norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
                            for p in phrases))

        with self.conn.cursor() as cur:
@@ -322,7 +338,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
        added = 0
        with CopyBuffer() as copystr:
            for word, cls, typ, oper in to_add:
-                term = self.name_processor.get_search_normalized(word)
+                term = self._search_normalized(word)
                if term:
                    copystr.add(term, 'S', word,
                                json.dumps({'class': cls, 'type': typ,
@@ -356,9 +372,21 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
    def add_country_names(self, country_code, names):
        """ Add names for the given country to the search index.
        """
+        # Make sure any name preprocessing for country names applies.
+        info = PlaceInfo({'name': names, 'country_code': country_code,
+                          'rank_address': 4, 'class': 'boundary',
+                          'type': 'administrative'})
+        self._add_country_full_names(country_code,
+                                     self.sanitizer.process_names(info)[0])
+
+
+    def _add_country_full_names(self, country_code, names):
+        """ Add names for the given country from an already sanitized
+            name list.
+        """
        word_tokens = set()
-        for name in self._compute_full_names(names):
-            norm_name = self.name_processor.get_search_normalized(name)
+        for name in names:
+            norm_name = self._search_normalized(name.name)
            if norm_name:
                word_tokens.add(norm_name)

@@ -384,12 +412,12 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
    def process_place(self, place):
        """ Determine tokenizer information about the given place.

-            Returns a JSON-serialisable structure that will be handed into
+            Returns a JSON-serializable structure that will be handed into
            the database via the token_info field.
        """
        token_info = _TokenInfo(self._cache)

-        names = place.name
+        names, address = self.sanitizer.process_names(place)

        if names:
            fulls, partials = self._compute_name_tokens(names)
@@ -397,9 +425,8 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
            token_info.add_names(fulls, partials)

            if place.is_country():
-                self.add_country_names(place.country_code, names)
+                self._add_country_full_names(place.country_code, names)

-        address = place.address
        if address:
            self._process_place_address(token_info, address)

@@ -409,18 +436,18 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
    def _process_place_address(self, token_info, address):
        hnrs = []
        addr_terms = []
-        for key, value in address.items():
-            if key == 'postcode':
-                self._add_postcode(value)
-            elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
-                hnrs.append(value)
-            elif key == 'street':
-                token_info.add_street(self._compute_partial_tokens(value))
-            elif key == 'place':
-                token_info.add_place(self._compute_partial_tokens(value))
-            elif not key.startswith('_') and \
-                 key not in ('country', 'full'):
-                addr_terms.append((key, self._compute_partial_tokens(value)))
+        for item in address:
+            if item.kind == 'postcode':
+                self._add_postcode(item.name)
+            elif item.kind in ('housenumber', 'streetnumber', 'conscriptionnumber'):
+                hnrs.append(item.name)
+            elif item.kind == 'street':
+                token_info.add_street(self._compute_partial_tokens(item.name))
+            elif item.kind == 'place':
+                token_info.add_place(self._compute_partial_tokens(item.name))
+            elif not item.kind.startswith('_') and \
+                 item.kind not in ('country', 'full'):
+                addr_terms.append((item.kind, self._compute_partial_tokens(item.name)))

        if hnrs:
            hnrs = self._split_housenumbers(hnrs)
@@ -433,7 +460,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
        """ Normalize the given term, split it into partial words and return
            then token list for them.
        """
-        norm_name = self.name_processor.get_search_normalized(name)
+        norm_name = self._search_normalized(name)

        tokens = []
        need_lookup = []
@@ -456,19 +483,19 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):

        return tokens

+
    def _compute_name_tokens(self, names):
        """ Computes the full name and partial name tokens for the given
            dictionary of names.
        """
-        full_names = self._compute_full_names(names)
        full_tokens = set()
        partial_tokens = set()

-        for name in full_names:
-            norm_name = self.name_processor.get_normalized(name)
+        for name in names:
+            norm_name = self._normalized(name.name)
            full, part = self._cache.names.get(norm_name, (None, None))
            if full is None:
-                variants = self.name_processor.get_variants_ascii(norm_name)
+                variants = self.token_analysis.get_variants_ascii(norm_name)
                if not variants:
                    continue

@@ -485,23 +512,6 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
        return full_tokens, partial_tokens


-    @staticmethod
-    def _compute_full_names(names):
-        """ Return the set of all full name word ids to be used with the
-            given dictionary of names.
-        """
-        full_names = set()
-        for name in (n.strip() for ns in names.values() for n in re.split('[;,]', ns)):
-            if name:
-                full_names.add(name)
-
-                brace_idx = name.find('(')
-                if brace_idx >= 0:
-                    full_names.add(name[:brace_idx].strip())
-
-        return full_names
-
-
    def _add_postcode(self, postcode):
        """ Make sure the normalized postcode is present in the word table.
        """
@@ -509,7 +519,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
            postcode = self.normalize_postcode(postcode)

            if postcode not in self._cache.postcodes:
-                term = self.name_processor.get_search_normalized(postcode)
+                term = self._search_normalized(postcode)
                if not term:
                    return