Merge pull request #2602 from lonvia/filter-bad-housenumbers

Handle mistagged housenumbers like names
2026-02-26 11:08:13 +00:00 · 2022-02-07 16:27:04 +01:00
parent 39ede26b5c 7d19209fa1
commit 02894ca4a4
12 changed files with 199 additions and 82 deletions
--- a/.pylintrc
+++ b/.pylintrc
@@ -10,6 +10,7 @@ ignored-modules=icu,datrie
 # closing added here because it sometimes triggers a false positive with
 # 'with' statements.
 ignored-classes=NominatimArgs,closing
-disable=too-few-public-methods,duplicate-code
+# 'too-many-ancestors' is triggered already by deriving from UserDict
 disable=too-few-public-methods,duplicate-code,too-many-ancestors
 good-names=i,x,y,fd,db
--- a/nominatim/tokenizer/icu_rule_loader.py
+++ b/nominatim/tokenizer/icu_rule_loader.py
@@ -45,7 +45,7 @@ class ICURuleLoader:
        rules = config.load_sub_configuration('icu_tokenizer.yaml',
                                              config='TOKENIZER_CONFIG')
-        # Make sure country information is available to analyzers and sanatizers.
+        # Make sure country information is available to analyzers and sanitizers.
        nominatim.tools.country_info.setup_country_config(config)
        self.normalization_rules = self._cfg_to_icu_rules(rules, 'normalization')
--- a/nominatim/tokenizer/place_sanitizer.py
+++ b/nominatim/tokenizer/place_sanitizer.py
@@ -11,6 +11,7 @@ is handed to the token analysis.
 import importlib
 from nominatim.errors import UsageError
 from nominatim.tokenizer.sanitizers.config import SanitizerConfig
 class PlaceName:
    """ A searchable name for a place together with properties.
@@ -117,7 +118,7 @@ class PlaceSanitizer:
                    raise UsageError("Sanitizer rule is missing the 'step' attribute.")
                module_name = 'nominatim.tokenizer.sanitizers.' + func['step'].replace('-', '_')
                handler_module = importlib.import_module(module_name)
-                self.handlers.append(handler_module.create(func))
+                self.handlers.append(handler_module.create(SanitizerConfig(func)))
    def process_names(self, place):
--- a/nominatim/tokenizer/sanitizers/clean_housenumbers.py
+++ b/nominatim/tokenizer/sanitizers/clean_housenumbers.py
@@ -19,15 +19,22 @@ Arguments:
                 where each string is a regular expression. An address item
                 is considered a house number if the 'kind' fully matches any
                 of the given regular expressions. (default: 'housenumber')
-
+    convert-to-name: Define house numbers that should be treated as a name
                     instead of a house number. Either takes a single string
                     or a list of strings, where each string is a regular
                     expression that must match the full house number value.
 """
-from nominatim.tokenizer.sanitizers.helpers import create_split_regex, create_kind_filter
+import re
 class _HousenumberSanitizer:
    def __init__(self, config):
-        self.filter_kind = create_kind_filter(config, 'housenumber')
+        self.filter_kind = config.get_filter_kind('housenumber')
-        self.split_regexp = create_split_regex(config)
+        self.split_regexp = config.get_delimiter()
        nameregexps = config.get_string_list('convert-to-name', [])
        self.is_name_regexp = [re.compile(r) for r in nameregexps]
    def __call__(self, obj):
@@ -37,8 +44,11 @@ class _HousenumberSanitizer:
        new_address = []
        for item in obj.address:
            if self.filter_kind(item):
-                new_address.extend(item.clone(kind='housenumber', name=n)
+                if self._treat_as_name(item.name):
-                                   for n in self.sanitize(item.name))
+                    obj.names.append(item.clone(kind='housenumber'))
                else:
                    new_address.extend(item.clone(kind='housenumber', name=n)
                                       for n in self.sanitize(item.name))
            else:
                # Don't touch other address items.
                new_address.append(item)
@@ -62,6 +72,10 @@ class _HousenumberSanitizer:
        yield hnr
    def _treat_as_name(self, housenumber):
        return any(r.fullmatch(housenumber) is not None for r in self.is_name_regexp)
 def create(config):
    """ Create a housenumber processing function.
    """
--- a/nominatim/tokenizer/sanitizers/config.py
+++ b/nominatim/tokenizer/sanitizers/config.py
@@ -0,0 +1,82 @@
 # SPDX-License-Identifier: GPL-2.0-only
 #
 # This file is part of Nominatim. (https://nominatim.org)
 #
 # Copyright (C) 2022 by the Nominatim developer community.
 # For a full list of authors see the git log.
 """
 Configuration for Sanitizers.
 """
 from collections import UserDict
 import re
 from nominatim.errors import UsageError
 class SanitizerConfig(UserDict):
    """ Dictionary with configuration options for a sanitizer.

        In addition to the usual dictionary functions, the class provides
        accessors to standard sanitizer options that are used by many of the
        sanitizers.
    """

    def get_string_list(self, param, default=tuple()):
        """ Extract a configuration parameter as a string list.

            If the parameter value is a simple string, it is returned as a
            one-item list (an empty string yields an empty list). If the
            parameter value does not exist, a list copy of the given default
            is returned, or None when the default is None. If the parameter
            value is a list or tuple, it is checked to contain only strings
            and then returned as a new list.

            Raises a UsageError when the value is neither a string nor a
            list/tuple consisting only of strings.
        """
        values = self.data.get(param, None)

        if values is None:
            return None if default is None else list(default)

        if isinstance(values, str):
            return [values] if values else []

        if not isinstance(values, (list, tuple)) \
           or any(not isinstance(value, str) for value in values):
            raise UsageError(f"Parameter '{param}' must be string or list of strings.")

        # Always hand out a fresh list: callers get the documented type even
        # for tuple input and cannot modify the configuration by accident.
        return list(values)

    def get_delimiter(self, default=',;'):
        """ Return the 'delimiters' parameter in the configuration as a
            compiled regular expression that can be used to split the names on the
            delimiters. The regular expression makes sure that the resulting names
            are stripped and that repeated delimiters
            are ignored but it will still create empty fields on occasion. The
            code needs to filter those.

            The 'default' parameter defines the delimiter set to be used when
            not explicitly configured.

            Raises a UsageError when the delimiter set is empty.
        """
        delimiter_set = set(self.data.get('delimiters', default))
        if not delimiter_set:
            raise UsageError("Empty 'delimiters' parameter not allowed for sanitizer.")

        # re.escape() only escapes characters with a special meaning. Blindly
        # prefixing a backslash would turn delimiters like 'd', 'n' or '1'
        # into regex escapes (digit class, newline, octal code).
        # Sorting keeps the generated pattern reproducible.
        char_class = ''.join(re.escape(d) for d in sorted(delimiter_set))

        return re.compile(r'\s*[{}]+\s*'.format(char_class))

    def get_filter_kind(self, *default):
        """ Return a filter function for the name kind from the 'filter-kind'
            config parameter. The filter function takes a name item and returns
            True when the item passes the filter.

            If the parameter is empty, the filter lets all items pass. If the
            parameter is a string, it is interpreted as a single regular expression
            that must match the full kind string. If the parameter is a list then
            any of the regular expressions in the list must match to pass.

            The positional arguments are the kind expressions to use when
            'filter-kind' is not configured.
        """
        filters = self.get_string_list('filter-kind', default)

        if not filters:
            return lambda _: True

        regexes = [re.compile(regex) for regex in filters]

        return lambda name: any(regex.fullmatch(name.kind) for regex in regexes)
--- a/nominatim/tokenizer/sanitizers/helpers.py
+++ b/nominatim/tokenizer/sanitizers/helpers.py
@@ -1,52 +0,0 @@
 # SPDX-License-Identifier: GPL-2.0-only
 #
 # This file is part of Nominatim. (https://nominatim.org)
 #
 # Copyright (C) 2022 by the Nominatim developer community.
 # For a full list of authors see the git log.
 """
 Helper functions for sanitizers.
 """
 import re
 from nominatim.errors import UsageError
 def create_split_regex(config, default=',;'):
    """ Converts the 'delimiters' parameter in the configuration into a
        compiled regular expression that can be used to split the names on the
        delimiters. The regular expression makes sure that the resulting names
        are stripped and that repeated delimiters
        are ignored but it will still create empty fields on occasion. The
        code needs to filter those.

        The 'default' parameter defines the delimiter set to be used when
        not explicitly configured.

        Raises a UsageError when the delimiter set is empty.
    """
    delimiter_set = set(config.get('delimiters', default))
    if not delimiter_set:
        raise UsageError("Empty 'delimiters' parameter not allowed for sanitizer.")

    # re.escape() only escapes characters with a special meaning. Blindly
    # prefixing a backslash would turn delimiters like 'd', 'n' or '1'
    # into regex escapes (digit class, newline, octal code).
    # Sorting keeps the generated pattern reproducible.
    char_class = ''.join(re.escape(d) for d in sorted(delimiter_set))

    return re.compile(r'\s*[{}]+\s*'.format(char_class))
 def create_kind_filter(config, default=None):
    """ Build a predicate on the kind of a name item from the 'filter-kind'
        entry of the given configuration. 'default' is used when the entry
        is not present.

        An empty setting results in a predicate that accepts every item.
        A string is compiled as one regular expression which has to match
        the complete kind. Any other value is taken as a collection of
        regular expressions of which at least one has to match the
        complete kind.
    """
    kind_patterns = config.get('filter-kind', default)

    if isinstance(kind_patterns, str) and kind_patterns:
        single = re.compile(kind_patterns)
        return lambda name: single.fullmatch(name.kind)

    if kind_patterns:
        compiled = [re.compile(pattern) for pattern in kind_patterns]
        return lambda name: any(expr.fullmatch(name.kind) for expr in compiled)

    # Nothing configured: every item passes.
    return lambda _: True
--- a/nominatim/tokenizer/sanitizers/split_name_list.py
+++ b/nominatim/tokenizer/sanitizers/split_name_list.py
@@ -11,13 +11,11 @@ Arguments:
    delimiters: Define the set of characters to be used for
                splitting the list. (default: ',;')
 """
-from nominatim.tokenizer.sanitizers.helpers import create_split_regex
+def create(config):
 def create(func):
    """ Create a name processing function that splits name values with
        multiple values into their components.
    """
-    regexp = create_split_regex(func)
+    regexp = config.get_delimiter()
    def _process(obj):
        if not obj.names:
--- a/nominatim/tokenizer/sanitizers/tag_analyzer_by_language.py
+++ b/nominatim/tokenizer/sanitizers/tag_analyzer_by_language.py
@@ -31,21 +31,20 @@ Arguments:
 """
 from nominatim.tools import country_info
 from nominatim.tokenizer.sanitizers.helpers import create_kind_filter
 class _AnalyzerByLanguage:
    """ Processor for tagging the language of names in a place.
    """
    def __init__(self, config):
-        self.filter_kind = create_kind_filter(config)
+        self.filter_kind = config.get_filter_kind()
        self.replace = config.get('mode', 'replace') != 'append'
        self.whitelist = config.get('whitelist')
-        self.__compute_default_languages(config.get('use-defaults', 'no'))
+        self._compute_default_languages(config.get('use-defaults', 'no'))
-    def __compute_default_languages(self, use_defaults):
+    def _compute_default_languages(self, use_defaults):
        self.deflangs = {}
        if use_defaults in ('mono', 'all'):
--- a/settings/icu_tokenizer.yaml
+++ b/settings/icu_tokenizer.yaml
@@ -25,13 +25,15 @@ transliteration:
    - "[^a-z0-9[:Space:]] >"
    - ":: NFC ()"
 sanitizers:
    - step: split-name-list
    - step: strip-brace-terms
    - step: clean-housenumbers
      filter-kind:
        - housenumber
        - conscriptionnumber
        - streetnumber
      convert-to-name:
        - (\A|.*,)[^\d,]{3,}(,.*|\Z)
    - step: split-name-list
    - step: strip-brace-terms
    - step: tag-analyzer-by-language
      filter-kind: [".*name.*"]
      whitelist: [bg,ca,cs,da,de,el,en,es,et,eu,fi,fr,gl,hu,it,ja,mg,ms,nl,no,pl,pt,ro,ru,sk,sl,sv,tr,uk,vi]
--- a/test/bdd/db/query/housenumbers.feature
+++ b/test/bdd/db/query/housenumbers.feature
@@ -53,3 +53,17 @@ Feature: Searching of house numbers
        | 2;4;12 |
        | 2,4,12 |
        | 2, 4, 12 |
    Scenario: A name mapped as a housenumber is found
        Given the places
         | osm | class    | type | housenr | geometry |
         | N1  | building | yes  | Warring | 9        |
        And the places
         | osm | class   | type | name       | geometry |
         | W10 | highway | path | Chester St | 1,2,3    |
        When importing
        When sending search query "Chester St Warring"
        Then results contain
         | osm |
         | N1  |
--- a/test/python/tokenizer/sanitizers/test_clean_housenumbers.py
+++ b/test/python/tokenizer/sanitizers/test_clean_housenumbers.py
@@ -42,3 +42,27 @@ def test_housenumber_lists(sanitize, number):
 def test_filter_kind(sanitize):
    assert sanitize(housenumber='34', number='4', badnumber='65') == \
            [('badnumber', '65'), ('housenumber', '34'), ('housenumber', '4')]
@pytest.mark.parametrize('number', ('6523', 'n/a', '4'))
 def test_convert_to_name_converted(number):
    sanitizer_args = {'step': 'clean-housenumbers',
                      'convert-to-name': (r'\d+', 'n/a')}
    place = PlaceInfo({'address': {'housenumber': number}})
    names, address = PlaceSanitizer([sanitizer_args]).process_names(place)
    assert ('housenumber', number) in set((p.kind, p.name) for p in names)
    assert 'housenumber' not in set(p.kind for p in address)
@pytest.mark.parametrize('number', ('a54', 'n.a', 'bow'))
 def test_convert_to_name_unconverted(number):
    sanitizer_args = {'step': 'clean-housenumbers',
                      'convert-to-name': (r'\d+', 'n/a')}
    place = PlaceInfo({'address': {'housenumber': number}})
    names, address = PlaceSanitizer([sanitizer_args]).process_names(place)
    assert 'housenumber' not in set(p.kind for p in names)
    assert ('housenumber', number) in set((p.kind, p.name) for p in address)
--- a/test/python/tokenizer/sanitizers/test_sanitizer_config.py
+++ b/test/python/tokenizer/sanitizers/test_sanitizer_config.py
@@ -5,17 +5,51 @@
 # Copyright (C) 2022 by the Nominatim developer community.
 # For a full list of authors see the git log.
 """
-Tests for sanitizer helper functions.
+Tests for sanitizer configuration helper functions.
 """
 import pytest
 from nominatim.errors import UsageError
 from nominatim.tokenizer.place_sanitizer import PlaceName
-import nominatim.tokenizer.sanitizers.helpers as helpers
+from nominatim.tokenizer.sanitizers.config import SanitizerConfig
 def test_string_list_default_empty():
    """ A missing parameter without an explicit default gives an empty list.
    """
    config = SanitizerConfig()
    result = config.get_string_list('op')

    assert result == []
 def test_string_list_default_none():
    """ A missing parameter with a default of None gives None.
    """
    config = SanitizerConfig()
    result = config.get_string_list('op', default=None)

    assert result is None
 def test_string_list_default_something():
    """ A missing parameter gives back the content of the supplied default.
    """
    config = SanitizerConfig()
    result = config.get_string_list('op', default=['a', 'b'])

    assert result == ['a', 'b']
 def test_string_list_value_string():
    """ A plain string value is wrapped into a one-item list and takes
        precedence over the default.
    """
    config = SanitizerConfig({'op': 't'})
    result = config.get_string_list('op', default=['a', 'b'])

    assert result == ['t']
 def test_string_list_value_list():
    """ A list of strings is handed back with the same content.
    """
    config = SanitizerConfig({'op': ['1', '2']})
    result = config.get_string_list('op')

    assert result == ['1', '2']
 def test_string_list_value_empty():
    """ An empty string value gives an empty list, not the default.
    """
    config = SanitizerConfig({'op': ''})
    result = config.get_string_list('op', default=['a', 'b'])

    assert result == []
 def test_string_list_value_dict():
    """ A dictionary value is rejected with a UsageError.
    """
    config = SanitizerConfig({'op': {'1': 'a'}})

    with pytest.raises(UsageError):
        config.get_string_list('op')
 def test_string_list_value_int_list():
    """ A list containing non-string items is rejected with a UsageError.
    """
    config = SanitizerConfig({'op': [1, 2]})

    with pytest.raises(UsageError):
        config.get_string_list('op')
@pytest.mark.parametrize('inp', ('fg34', 'f\\f', 'morning [glory]', '56.78'))
 def test_create_split_regex_no_params_unsplit(inp):
-    regex = helpers.create_split_regex({})
+    regex = SanitizerConfig().get_delimiter()
    assert list(regex.split(inp)) == [inp]
@@ -26,14 +60,14 @@ def test_create_split_regex_no_params_unsplit(inp):
                                      ('1,  3  ,5', ['1', '3', '5'])
                                     ])
 def test_create_split_regex_no_params_split(inp, outp):
-    regex = helpers.create_split_regex({})
+    regex = SanitizerConfig().get_delimiter()
    assert list(regex.split(inp)) == outp
@pytest.mark.parametrize('delimiter', ['.', '\\', '[]', '   ', '/.*+'])
 def test_create_split_regex_custom(delimiter):
-    regex = helpers.create_split_regex({'delimiters': delimiter})
+    regex = SanitizerConfig({'delimiters': delimiter}).get_delimiter()
    assert list(regex.split(f'out{delimiter}house')) == ['out', 'house']
    assert list(regex.split('out,house')) == ['out,house']
@@ -41,39 +75,39 @@ def test_create_split_regex_custom(delimiter):
 def test_create_split_regex_empty_delimiter():
    with pytest.raises(UsageError):
-        regex = helpers.create_split_regex({'delimiters': ''})
+        regex = SanitizerConfig({'delimiters': ''}).get_delimiter()
@pytest.mark.parametrize('inp', ('name', 'name:de', 'na\\me', '.*'))
 def test_create_kind_filter_no_params(inp):
-    filt = helpers.create_kind_filter({})
+    filt = SanitizerConfig().get_filter_kind()
    assert filt(PlaceName('something', inp, ''))
@pytest.mark.parametrize('kind', ('de', 'name:de', 'ende'))
 def test_create_kind_filter_custom_regex_positive(kind):
-    filt = helpers.create_kind_filter({'filter-kind': '.*de'})
+    filt = SanitizerConfig({'filter-kind': '.*de'}).get_filter_kind()
    assert filt(PlaceName('something', kind, ''))
@pytest.mark.parametrize('kind', ('de ', '123', '', 'bedece'))
 def test_create_kind_filter_custom_regex_negative(kind):
-    filt = helpers.create_kind_filter({'filter-kind': '.*de'})
+    filt = SanitizerConfig({'filter-kind': '.*de'}).get_filter_kind()
    assert not filt(PlaceName('something', kind, ''))
@pytest.mark.parametrize('kind', ('name', 'fr', 'name:fr', 'frfr', '34'))
 def test_create_kind_filter_many_positive(kind):
-    filt = helpers.create_kind_filter({'filter-kind': ['.*fr', 'name', r'\d+']})
+    filt = SanitizerConfig({'filter-kind': ['.*fr', 'name', r'\d+']}).get_filter_kind()
    assert filt(PlaceName('something', kind, ''))
@pytest.mark.parametrize('kind', ('name:de', 'fridge', 'a34', '.*', '\\'))
 def test_create_kind_filter_many_negative(kind):
-    filt = helpers.create_kind_filter({'filter-kind': ['.*fr', 'name', r'\d+']})
+    filt = SanitizerConfig({'filter-kind': ['.*fr', 'name', r'\d+']}).get_filter_kind()
    assert not filt(PlaceName('something', kind, ''))