introduce sanitizer step before token analysis

Sanitizer functions make it possible to transform name and address tags before
they are handed to the tokenizer. These transformations are visible
only to the tokenizer and thus only have an influence on the
search terms and address match terms for a place.

Currently two sanitizers are implemented which are responsible for
splitting names with multiple values and removing bracket additions.
Both were previously hard-coded in the tokenizer.
This commit is contained in:
Sarah Hoffmann
2021-09-30 21:30:13 +02:00
parent 16daa57e47
commit 8171fe4571
8 changed files with 259 additions and 58 deletions

View File

@@ -12,6 +12,7 @@ from icu import Transliterator
from nominatim.db.properties import set_property, get_property
from nominatim.errors import UsageError
from nominatim.tokenizer.icu_name_processor import ICUNameProcessor
from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
import nominatim.tokenizer.icu_variants as variants
LOG = logging.getLogger()
@@ -65,6 +66,9 @@ class ICURuleLoader:
self.analysis_rules = self._get_section(rules, 'variants')
self._parse_variant_list()
# Load optional sanitizer rule set.
self.sanitizer_rules = rules.get('sanitizers', [])
def load_config_from_db(self, conn):
""" Get previously saved parts of the configuration from the
@@ -85,6 +89,12 @@ class ICURuleLoader:
set_property(conn, DBCFG_IMPORT_ANALYSIS_RULES, json.dumps(self.analysis_rules))
def make_sanitizer(self):
    """ Create a place sanitizer from the configured rules.

        Returns a PlaceSanitizer built from the optional 'sanitizers'
        section of the rule file (an empty list when that section was
        absent, yielding a sanitizer with no transformations).
    """
    return PlaceSanitizer(self.sanitizer_rules)
def make_token_analysis(self):
""" Create a token analyser from the reviouly loaded rules.
"""