diff --git a/nominatim/tokenizer/icu_rule_loader.py b/nominatim/tokenizer/icu_rule_loader.py
index bd0739f2..330179bb 100644
--- a/nominatim/tokenizer/icu_rule_loader.py
+++ b/nominatim/tokenizer/icu_rule_loader.py
@@ -12,6 +12,7 @@ from icu import Transliterator
 from nominatim.db.properties import set_property, get_property
 from nominatim.errors import UsageError
 from nominatim.tokenizer.icu_name_processor import ICUNameProcessor
+from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
 import nominatim.tokenizer.icu_variants as variants
 
 LOG = logging.getLogger()
@@ -65,6 +66,9 @@ class ICURuleLoader:
         self.analysis_rules = self._get_section(rules, 'variants')
         self._parse_variant_list()
 
+        # Load optional sanitizer rule set.
+        self.sanitizer_rules = rules.get('sanitizers', [])
+
 
     def load_config_from_db(self, conn):
         """ Get previously saved parts of the configuration from the
@@ -85,6 +89,12 @@ class ICURuleLoader:
         set_property(conn, DBCFG_IMPORT_ANALYSIS_RULES,
                      json.dumps(self.analysis_rules))
 
+    def make_sanitizer(self):
+        """ Create a place sanitizer from the configured rules.
+        """
+        return PlaceSanitizer(self.sanitizer_rules)
+
+
     def make_token_analysis(self):
         """ Create a token analyser from the reviouly loaded rules.
         """
diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py
index 87906d71..2ece10f2 100644
--- a/nominatim/tokenizer/icu_tokenizer.py
+++ b/nominatim/tokenizer/icu_tokenizer.py
@@ -13,6 +13,7 @@ from nominatim.db.connection import connect
 from nominatim.db.properties import set_property, get_property
 from nominatim.db.utils import CopyBuffer
 from nominatim.db.sql_preprocessor import SQLPreprocessor
+from nominatim.indexer.place_info import PlaceInfo
 from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
 from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
 
@@ -107,7 +108,8 @@ class LegacyICUTokenizer(AbstractTokenizer):
 
             Analyzers are not thread-safe. You need to instantiate one per thread.
         """
-        return LegacyICUNameAnalyzer(self.dsn, self.loader.make_token_analysis())
+        return LegacyICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
+                                     self.loader.make_token_analysis())
 
 
     def _install_php(self, phpdir):
@@ -187,10 +189,11 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
         normalization.
     """
 
-    def __init__(self, dsn, name_proc):
+    def __init__(self, dsn, sanitizer, token_analysis):
         self.conn = connect(dsn).connection
         self.conn.autocommit = True
-        self.name_processor = name_proc
+        self.sanitizer = sanitizer
+        self.token_analysis = token_analysis
 
         self._cache = _TokenCache()
 
@@ -203,6 +206,19 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
             self.conn = None
 
 
+    def _search_normalized(self, name):
+        """ Return the search token transliteration of the given name.
+        """
+        return self.token_analysis.get_search_normalized(name)
+
+
+    def _normalized(self, name):
+        """ Return the normalized version of the given name with all
+            non-relevant information removed.
+        """
+        return self.token_analysis.get_normalized(name)
+
+
     def get_word_token_info(self, words):
         """ Return token information for the given list of words.
             If a word starts with # it is assumed to be a full name
@@ -218,9 +234,9 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
         partial_tokens = {}
         for word in words:
             if word.startswith('#'):
-                full_tokens[word] = self.name_processor.get_search_normalized(word[1:])
+                full_tokens[word] = self._search_normalized(word[1:])
             else:
-                partial_tokens[word] = self.name_processor.get_search_normalized(word)
+                partial_tokens[word] = self._search_normalized(word)
 
         with self.conn.cursor() as cur:
             cur.execute("""SELECT word_token, word_id
@@ -251,7 +267,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
 
             This function takes minor shortcuts on transliteration.
         """
-        return self.name_processor.get_search_normalized(hnr)
+        return self._search_normalized(hnr)
 
     def update_postcodes_from_db(self):
         """ Update postcode tokens in the word table from the location_postcode
@@ -274,7 +290,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
                     if postcode is None:
                         to_delete.append(word)
                     else:
-                        copystr.add(self.name_processor.get_search_normalized(postcode),
+                        copystr.add(self._search_normalized(postcode),
                                     'P', postcode)
 
                 if to_delete:
@@ -292,7 +308,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
             completely replaced. Otherwise the phrases are added to the
             already existing ones.
         """
-        norm_phrases = set(((self.name_processor.get_normalized(p[0]), p[1], p[2], p[3])
+        norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
                             for p in phrases))
 
         with self.conn.cursor() as cur:
@@ -322,7 +338,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
         added = 0
         with CopyBuffer() as copystr:
             for word, cls, typ, oper in to_add:
-                term = self.name_processor.get_search_normalized(word)
+                term = self._search_normalized(word)
                 if term:
                     copystr.add(term, 'S', word,
                                 json.dumps({'class': cls, 'type': typ,
@@ -356,9 +372,21 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
     def add_country_names(self, country_code, names):
         """ Add names for the given country to the search index.
         """
+        # Make sure any name preprocessing for country names applies.
+        info = PlaceInfo({'name': names, 'country_code': country_code,
+                          'rank_address': 4, 'class': 'boundary',
+                          'type': 'administrative'})
+        self._add_country_full_names(country_code,
+                                     self.sanitizer.process_names(info)[0])
+
+
+    def _add_country_full_names(self, country_code, names):
+        """ Add names for the given country from an already sanitized
+            name list.
+        """
         word_tokens = set()
-        for name in self._compute_full_names(names):
-            norm_name = self.name_processor.get_search_normalized(name)
+        for name in names:
+            norm_name = self._search_normalized(name.name)
             if norm_name:
                 word_tokens.add(norm_name)
 
@@ -384,12 +412,12 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
     def process_place(self, place):
         """ Determine tokenizer information about the given place.
 
-            Returns a JSON-serialisable structure that will be handed into
+            Returns a JSON-serializable structure that will be handed into
             the database via the token_info field.
""" token_info = _TokenInfo(self._cache) - names = place.name + names, address = self.sanitizer.process_names(place) if names: fulls, partials = self._compute_name_tokens(names) @@ -397,9 +425,8 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer): token_info.add_names(fulls, partials) if place.is_country(): - self.add_country_names(place.country_code, names) + self._add_country_full_names(place.country_code, names) - address = place.address if address: self._process_place_address(token_info, address) @@ -409,18 +436,18 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer): def _process_place_address(self, token_info, address): hnrs = [] addr_terms = [] - for key, value in address.items(): - if key == 'postcode': - self._add_postcode(value) - elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'): - hnrs.append(value) - elif key == 'street': - token_info.add_street(self._compute_partial_tokens(value)) - elif key == 'place': - token_info.add_place(self._compute_partial_tokens(value)) - elif not key.startswith('_') and \ - key not in ('country', 'full'): - addr_terms.append((key, self._compute_partial_tokens(value))) + for item in address: + if item.kind == 'postcode': + self._add_postcode(item.name) + elif item.kind in ('housenumber', 'streetnumber', 'conscriptionnumber'): + hnrs.append(item.name) + elif item.kind == 'street': + token_info.add_street(self._compute_partial_tokens(item.name)) + elif item.kind == 'place': + token_info.add_place(self._compute_partial_tokens(item.name)) + elif not item.kind.startswith('_') and \ + item.kind not in ('country', 'full'): + addr_terms.append((item.kind, self._compute_partial_tokens(item.name))) if hnrs: hnrs = self._split_housenumbers(hnrs) @@ -433,7 +460,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer): """ Normalize the given term, split it into partial words and return then token list for them. """ - norm_name = self.name_processor.get_search_normalized(name) + norm_name = self._search_normalized(name) tokens = [] need_lookup = [] @@ -456,19 +483,19 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer): return tokens + def _compute_name_tokens(self, names): """ Computes the full name and partial name tokens for the given dictionary of names. """ - full_names = self._compute_full_names(names) full_tokens = set() partial_tokens = set() - for name in full_names: - norm_name = self.name_processor.get_normalized(name) + for name in names: + norm_name = self._normalized(name.name) full, part = self._cache.names.get(norm_name, (None, None)) if full is None: - variants = self.name_processor.get_variants_ascii(norm_name) + variants = self.token_analysis.get_variants_ascii(norm_name) if not variants: continue @@ -485,23 +512,6 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer): return full_tokens, partial_tokens - @staticmethod - def _compute_full_names(names): - """ Return the set of all full name word ids to be used with the - given dictionary of names. - """ - full_names = set() - for name in (n.strip() for ns in names.values() for n in re.split('[;,]', ns)): - if name: - full_names.add(name) - - brace_idx = name.find('(') - if brace_idx >= 0: - full_names.add(name[:brace_idx].strip()) - - return full_names - - def _add_postcode(self, postcode): """ Make sure the normalized postcode is present in the word table. 
""" @@ -509,7 +519,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer): postcode = self.normalize_postcode(postcode) if postcode not in self._cache.postcodes: - term = self.name_processor.get_search_normalized(postcode) + term = self._search_normalized(postcode) if not term: return diff --git a/nominatim/tokenizer/place_sanitizer.py b/nominatim/tokenizer/place_sanitizer.py new file mode 100644 index 00000000..5961dcf0 --- /dev/null +++ b/nominatim/tokenizer/place_sanitizer.py @@ -0,0 +1,127 @@ +""" +Handler for cleaning name and address tags in place information before it +is handed to the token analysis. +""" +import importlib + +from nominatim.errors import UsageError + +class PlaceName: + """ A searchable name for a place together with properties. + Every name object saves the name proper and two basic properties: + * 'kind' describes the name of the OSM key used without any suffixes + (i.e. the part after the colon removed) + * 'suffix' contains the suffix of the OSM tag, if any. The suffix + is the part of the key after the first colon. + In addition to that, the name may have arbitrary additional attributes. + Which attributes are used, depends on the token analyser. + """ + + def __init__(self, name, kind, suffix): + self.name = name + self.kind = kind + self.suffix = suffix + self.attr = {} + + + def __repr__(self): + return f"PlaceName(name='{self.name}',kind='{self.kind}',suffix='{self.suffix}')" + + + def clone(self, name=None, kind=None, suffix=None, attr=None): + """ Create a deep copy of the place name, optionally with the + given parameters replaced. In the attribute list only the given + keys are updated. The list is not replaced completely. + In particular, the function cannot to be used to remove an + attribute from a place name. + """ + newobj = PlaceName(name or self.name, + kind or self.kind, + suffix or self.suffix) + + newobj.attr.update(self.attr) + if attr: + newobj.attr.update(attr) + + return newobj + + + def set_attr(self, key, value): + """ Add the given property to the name. If the property was already + set, then the value is overwritten. + """ + self.attr[key] = value + + + def get_attr(self, key, default=None): + """ Return the given property or the value of 'default' if it + is not set. + """ + return self.attr.get(key, default) + + + def has_attr(self, key): + """ Check if the given attribute is set. + """ + return key in self.attr + + +class _ProcessInfo: + """ Container class for information handed into to handler functions. + The 'names' and 'address' members are mutable. A handler must change + them by either modifying the lists place or replacing the old content + with a new list. + """ + + def __init__(self, place): + self.place = place + self.names = self._convert_name_dict(place.name) + self.address = self._convert_name_dict(place.address) + + + @staticmethod + def _convert_name_dict(names): + """ Convert a dictionary of names into a list of PlaceNames. + The dictionary key is split into the primary part of the key + and the suffix (the part after an optional colon). + """ + out = [] + + if names: + for key, value in names.items(): + parts = key.split(':', 1) + out.append(PlaceName(value.strip(), + parts[0].strip(), + parts[1].strip() if len(parts) > 1 else None)) + + return out + + +class PlaceSanitizer: + """ Controller class which applies sanitizer functions on the place + names and address before they are used by the token analysers. 
+ """ + + def __init__(self, rules): + self.handlers = [] + + if rules: + for func in rules: + if 'step' not in func: + raise UsageError("Sanitizer rule is missing the 'step' attribute.") + module_name = 'nominatim.tokenizer.sanitizers.' + func['step'].replace('-', '_') + handler_module = importlib.import_module(module_name) + self.handlers.append(handler_module.create(func)) + + + def process_names(self, place): + """ Extract a sanitized list of names and address parts from the + given place. The function returns a tuple + (list of names, list of address names) + """ + obj = _ProcessInfo(place) + + for func in self.handlers: + func(obj) + + return obj.names, obj.address diff --git a/nominatim/tokenizer/sanitizers/__init__.py b/nominatim/tokenizer/sanitizers/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/nominatim/tokenizer/sanitizers/split_name_list.py b/nominatim/tokenizer/sanitizers/split_name_list.py new file mode 100644 index 00000000..93651f3e --- /dev/null +++ b/nominatim/tokenizer/sanitizers/split_name_list.py @@ -0,0 +1,28 @@ +""" +Name processor that splits name values with multiple values into their components. +""" +import re + +def create(func): + """ Create a name processing function that splits name values with + multiple values into their components. The optional parameter + 'delimiters' can be used to define the characters that should be used + for splitting. The default is ',;'. + """ + regexp = re.compile('[{}]'.format(func.get('delimiters', ',;'))) + + def _process(obj): + if not obj.names: + return + + new_names = [] + for name in obj.names: + split_names = regexp.split(name.name) + if len(split_names) == 1: + new_names.append(name) + else: + new_names.extend(name.clone(name=n) for n in split_names) + + obj.names = new_names + + return _process diff --git a/nominatim/tokenizer/sanitizers/strip_brace_terms.py b/nominatim/tokenizer/sanitizers/strip_brace_terms.py new file mode 100644 index 00000000..4423d305 --- /dev/null +++ b/nominatim/tokenizer/sanitizers/strip_brace_terms.py @@ -0,0 +1,22 @@ +""" +Sanitizer handling names with addendums in braces. +""" + +def create(_): + """ Create a name processing function that creates additional name variants + when a name has an addendum in brackets (e.g. "Halle (Saale)"). The + additional variant only contains the main name without the bracket part. + """ + def _process(obj): + """ Add variants for names that have a bracket extension. 
+ """ + new_names = [] + if obj.names: + for name in (n for n in obj.names if '(' in n.name): + new_name = name.name.split('(')[0].strip() + if new_name: + new_names.append(name.clone(name=new_name)) + + obj.names.extend(new_names) + + return _process diff --git a/settings/icu_tokenizer.yaml b/settings/icu_tokenizer.yaml index c0c8c043..08b7a7ff 100644 --- a/settings/icu_tokenizer.yaml +++ b/settings/icu_tokenizer.yaml @@ -24,6 +24,9 @@ transliteration: - "[^[:Ascii:]] >" - ":: lower ()" - ":: NFC ()" +sanitizers: + - step: split-name-list + - step: strip-brace-terms variants: - !include icu-rules/variants-bg.yaml - !include icu-rules/variants-ca.yaml diff --git a/test/python/test_tokenizer_icu.py b/test/python/test_tokenizer_icu.py index 4b7c56d5..9a6f5a94 100644 --- a/test/python/test_tokenizer_icu.py +++ b/test/python/test_tokenizer_icu.py @@ -67,10 +67,12 @@ def analyzer(tokenizer_factory, test_config, monkeypatch, monkeypatch.undo() def _mk_analyser(norm=("[[:Punctuation:][:Space:]]+ > ' '",), trans=(':: upper()',), - variants=('~gasse -> gasse', 'street => st', )): + variants=('~gasse -> gasse', 'street => st', ), + sanitizers=[]): cfgstr = {'normalization' : list(norm), - 'transliteration' : list(trans), - 'variants' : [ {'words': list(variants)}]} + 'sanitizers' : sanitizers, + 'transliteration' : list(trans), + 'variants' : [ {'words': list(variants)}]} (test_config.project_dir / 'icu_tokenizer.yaml').write_text(yaml.dump(cfgstr)) tok.loader = ICURuleLoader(test_config) @@ -309,14 +311,15 @@ class TestPlaceNames: @pytest.fixture(autouse=True) def setup(self, analyzer, sql_functions): - with analyzer() as anl: + sanitizers = [{'step': 'split-name-list'}, + {'step': 'strip-brace-terms'}] + with analyzer(sanitizers=sanitizers) as anl: self.analyzer = anl yield anl def expect_name_terms(self, info, *expected_terms): tokens = self.analyzer.get_word_token_info(expected_terms) - print (tokens) for token in tokens: assert token[2] is not None, "No token for {0}".format(token) @@ -324,9 +327,7 @@ class TestPlaceNames: def process_named_place(self, names): - place = {'name': names} - - return self.analyzer.process_place(PlaceInfo(place)) + return self.analyzer.process_place(PlaceInfo({'name': names})) def test_simple_names(self):