clean_housenumbers: make kinds and delimiters configurable

Also adds unit tests for various options.
2026-02-26 11:08:13 +00:00 · 2022-01-20 12:07:12 +01:00
parent 206ee87188
commit 4774e45218
7 changed files with 133 additions and 20 deletions
--- a/nominatim/tokenizer/sanitizers/clean_housenumbers.py
+++ b/nominatim/tokenizer/sanitizers/clean_housenumbers.py
@@ -6,13 +6,19 @@
 # For a full list of authors see the git log.
 """
 Sanitizer that cleans and normalizes housenumbers.
 Arguments:
    delimiters: Define the set of characters to be used for
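                splitting a list of housenumbers into parts. (default: ',;')
    filter-kind: Define the address kinds that are treated as housenumbers.
                 Expects a list of kind names. (default: housenumber)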
 """
-import re
+from nominatim.tokenizer.sanitizers.helpers import create_split_regex
 class _HousenumberSanitizer:
    def __init__(self, config):
-        pass
+        self.kinds = config.get('filter-kind', ('housenumber', ))
        self.split_regexp = create_split_regex(config)
    def __call__(self, obj):
@@ -21,7 +27,7 @@ class _HousenumberSanitizer:
        new_address = []
        for item in obj.address:
-            if item.kind in ('housenumber', 'streetnumber', 'conscriptionnumber'):
+            if item.kind in self.kinds:
                new_address.extend(item.clone(kind='housenumber', name=n) for n in self.sanitize(item.name))
            else:
                # Don't touch other address items.
@@ -36,13 +42,9 @@ class _HousenumberSanitizer:
            The function works as a generator that yields all valid housenumbers
            that can be created from the value.
        """
-        for hnr in self._split_number(value):
+        for hnr in self.split_regexp.split(value):
-            yield from self._regularize(hnr)
+            if hnr:
-
+                yield from self._regularize(hnr)
    def _split_number(self, hnr):
        for part in re.split(r'[;,]', hnr):
            yield part.strip()
    def _regularize(self, hnr):
--- a/nominatim/tokenizer/sanitizers/helpers.py
+++ b/nominatim/tokenizer/sanitizers/helpers.py
@@ -0,0 +1,29 @@
 # SPDX-License-Identifier: GPL-2.0-only
 #
 # This file is part of Nominatim. (https://nominatim.org)
 #
 # Copyright (C) 2022 by the Nominatim developer community.
 # For a full list of authors see the git log.
 """
 Helper functions for sanitizers.
 """
 import re
 from nominatim.errors import UsageError
 def create_split_regex(config, default=',;'):
     """ Convert the 'delimiters' parameter of the configuration into a
         compiled regular expression that can be used to split names on
         the delimiters.

         Every character of the parameter value is one delimiter. The
         expression swallows whitespace directly around the delimiters
         together with them and treats a run of delimiters as a single
         one. It will still create empty fields on occasion (for example
         for a leading or trailing delimiter). The calling code needs to
         filter those.

         The 'default' parameter defines the delimiter set to be used when
         not explicitly configured.

         Raises a UsageError when the set of delimiters is empty.
     """
     delimiter_set = set(config.get('delimiters', default))
     if not delimiter_set:
         raise UsageError("Empty 'delimiters' parameter not allowed for sanitizer.")

     # Use re.escape() instead of blindly prefixing a backslash: a plain
     # backslash turns letters and digits into regex escapes ('\d', '\s',
     # '\x', ...) instead of matching the literal character.
     # Sorting only makes the generated pattern reproducible.
     char_class = ''.join(re.escape(d) for d in sorted(delimiter_set))

     return re.compile('\\s*[{}]+\\s*'.format(char_class))
--- a/nominatim/tokenizer/sanitizers/split_name_list.py
+++ b/nominatim/tokenizer/sanitizers/split_name_list.py
@@ -9,21 +9,16 @@ Sanitizer that splits lists of names into their components.
 Arguments:
    delimiters: Define the set of characters to be used for
-                splitting the list. (default: `,;`)
+                splitting the list. (default: ',;')
 """
 import re
 from nominatim.errors import UsageError
 from nominatim.tokenizer.sanitizers.helpers import create_split_regex
 def create(func):
    """ Create a name processing function that splits name values with
        multiple values into their components.
    """
-    delimiter_set = set(func.get('delimiters', ',;'))
+    regexp = create_split_regex(func)
    if not delimiter_set:
        raise UsageError("Set of delimiters in split-name-list sanitizer is empty.")
    regexp = re.compile('\\s*[{}]\\s*'.format(''.join('\\' + d for d in delimiter_set)))
    def _process(obj):
        if not obj.names:
--- a/nominatim/tokenizer/sanitizers/tag_analyzer_by_language.py
+++ b/nominatim/tokenizer/sanitizers/tag_analyzer_by_language.py
@@ -13,7 +13,7 @@ Arguments:
    filter-kind: Restrict the names the sanitizer should be applied to
                 to the given tags. The parameter expects a list of
-                 regular expressions which are matched against `kind`.
+                 regular expressions which are matched against 'kind'.
                 Note that a match against the full string is expected.
    whitelist: Restrict the set of languages that should be tagged.
               Expects a list of acceptable suffixes. When unset,
--- a/test/python/tokenizer/sanitizers/test_clean_housenumbers.py
+++ b/test/python/tokenizer/sanitizers/test_clean_housenumbers.py
@@ -0,0 +1,44 @@
 # SPDX-License-Identifier: GPL-2.0-only
 #
 # This file is part of Nominatim. (https://nominatim.org)
 #
 # Copyright (C) 2022 by the Nominatim developer community.
 # For a full list of authors see the git log.
 """
 Tests for the sanitizer that normalizes housenumbers.
 """
 import pytest
 from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
 from nominatim.indexer.place_info import PlaceInfo
@pytest.fixture
def sanitize(request):
    """ Provide a function that runs the 'clean-housenumbers' sanitizer.

        Keyword arguments of every 'sanitizer_params' marker on the
        requesting test are added to the sanitizer configuration, with
        underscores in the keys replaced by dashes. The returned function
        takes the address parts as keyword arguments and returns the
        sorted list of (kind, name) tuples of the processed address.
    """
    step_config = {'step': 'clean-housenumbers'}

    for marker in request.node.iter_markers(name="sanitizer_params"):
        for key, value in marker.kwargs.items():
            step_config[key.replace('_', '-')] = value

    def _run(**kwargs):
        place = PlaceInfo({'address': kwargs})
        _, address = PlaceSanitizer([step_config]).process_names(place)

        return sorted((part.kind, part.name) for part in address)

    return _run
 def test_simple_number(sanitize):
     """ The plain housenumber '34' yields exactly one housenumber item. """
     result = sanitize(housenumber='34')

     assert result == [('housenumber', '34')]
@pytest.mark.parametrize('number', ['1;2;3', '1,2,3', '1; 3 ,2',
                                    '2,,3,1', '1;2;3;;', ';3;2;1'])
def test_housenumber_lists(sanitize, number):
    """ Lists with default delimiters are split into single housenumbers.

        Order, surrounding whitespace and repeated, leading or trailing
        delimiters must not influence the (sorted) result.
    """
    expected = [('housenumber', hnr) for hnr in ('1', '2', '3')]

    assert sanitize(housenumber=number) == expected
@pytest.mark.sanitizer_params(filter_kind=('number', 'streetnumber'))
def test_filter_kind(sanitize):
    """ Address items of a kind listed in 'filter-kind' come back as
        housenumbers, the other items keep their kind.
    """
    result = sanitize(housenumber='34', number='4', badnumber='65')

    assert result == [('badnumber', '65'),
                      ('housenumber', '34'),
                      ('housenumber', '4')]
--- a/test/python/tokenizer/sanitizers/test_helpers.py
+++ b/test/python/tokenizer/sanitizers/test_helpers.py
@@ -0,0 +1,43 @@
 # SPDX-License-Identifier: GPL-2.0-only
 #
 # This file is part of Nominatim. (https://nominatim.org)
 #
 # Copyright (C) 2022 by the Nominatim developer community.
 # For a full list of authors see the git log.
 """
 Tests for sanitizer helper functions.
 """
 import pytest
 from nominatim.errors import UsageError
 import nominatim.tokenizer.sanitizers.helpers as helpers
@pytest.mark.parametrize('inp', ('fg34', 'f\\f', 'morning [glory]', '56.78'))
def test_create_split_regex_no_params_unsplit(inp):
    """ Input without one of the default delimiters stays in one piece. """
    parts = helpers.create_split_regex({}).split(inp)

    assert list(parts) == [inp]
@pytest.mark.parametrize('inp,outp', [('here,there', ['here', 'there']),
                                      ('ying;;yang', ['ying', 'yang']),
                                      (';a; ;c;d,', ['', 'a', '', 'c', 'd', '']),
                                      ('1,  3  ,5', ['1', '3', '5'])
                                     ])
def test_create_split_regex_no_params_split(inp, outp):
    """ The default delimiters split the input into the expected parts,
        including the empty fields the expression may produce.
    """
    splitter = helpers.create_split_regex({})
    parts = list(splitter.split(inp))

    assert parts == outp
@pytest.mark.parametrize('delimiter', ['.', '\\', '[]', '   ', '/.*+'])
def test_create_split_regex_custom(delimiter):
    """ A configured delimiter set is used for splitting and replaces the
        default delimiters.
    """
    splitter = helpers.create_split_regex({'delimiters': delimiter})

    joined = f'out{delimiter}house'
    assert list(splitter.split(joined)) == ['out', 'house']

    # The default delimiter ',' must no longer split.
    assert list(splitter.split('out,house')) == ['out,house']
 def test_create_split_regex_empty_delimiter():
     """ An empty 'delimiters' setting must be rejected with a UsageError. """
     with pytest.raises(UsageError):
         # The return value is irrelevant, only the raised error matters.
         helpers.create_split_regex({'delimiters': ''})
--- a/test/python/tokenizer/sanitizers/test_split_name_list.py
+++ b/test/python/tokenizer/sanitizers/test_split_name_list.py
@@ -5,7 +5,7 @@
 # Copyright (C) 2022 by the Nominatim developer community.
 # For a full list of authors see the git log.
 """
-Tests for the sanitizer that splitts multivalue lists.
+Tests for the sanitizer that splits multivalue lists.
 """
 import pytest