introduce sanitizer step before token analysis

Sanitizer functions make it possible to transform name and address tags before
they are handed to the tokenizer. These transformations are visible
only to the tokenizer and thus only have an influence on the
search terms and address match terms for a place.

Currently two sanitizers are implemented which are responsible for
splitting names with multiple values and removing bracket additions.
Both were previously hard-coded in the tokenizer.
This commit is contained in:
Sarah Hoffmann
2021-09-30 21:30:13 +02:00
parent 16daa57e47
commit 8171fe4571
8 changed files with 259 additions and 58 deletions

View File

@@ -12,6 +12,7 @@ from icu import Transliterator
from nominatim.db.properties import set_property, get_property
from nominatim.errors import UsageError
from nominatim.tokenizer.icu_name_processor import ICUNameProcessor
from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
import nominatim.tokenizer.icu_variants as variants
LOG = logging.getLogger()
@@ -65,6 +66,9 @@ class ICURuleLoader:
self.analysis_rules = self._get_section(rules, 'variants')
self._parse_variant_list()
# Load optional sanitizer rule set.
self.sanitizer_rules = rules.get('sanitizers', [])
def load_config_from_db(self, conn):
""" Get previously saved parts of the configuration from the
@@ -85,6 +89,12 @@ class ICURuleLoader:
set_property(conn, DBCFG_IMPORT_ANALYSIS_RULES, json.dumps(self.analysis_rules))
def make_sanitizer(self):
    """ Create a place sanitizer from the configured rules.

        Returns a PlaceSanitizer built from the optional 'sanitizers'
        section of the rule file (an empty list when that section was
        absent, yielding a sanitizer with no transformations).
    """
    return PlaceSanitizer(self.sanitizer_rules)
def make_token_analysis(self):
""" Create a token analyser from the reviouly loaded rules.
"""