clean_housenumbers: make kinds and delimiters configurable

Also adds unit tests for various options.
This commit is contained in:
Sarah Hoffmann
2022-01-20 12:07:12 +01:00
parent 206ee87188
commit 4774e45218
7 changed files with 133 additions and 20 deletions

View File

@@ -6,13 +6,19 @@
# For a full list of authors see the git log.
"""
Sanitizer that cleans and normalizes housenumbers.
Arguments:
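delimiters: Define the set of characters to be used for
splitting a list of housenumbers into parts. (default: ',;')
filter-kind: Define the kinds of address items that are treated as
housenumbers. (default: 'housenumber')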
"""
import re
from nominatim.tokenizer.sanitizers.helpers import create_split_regex
class _HousenumberSanitizer:
def __init__(self, config):
pass
self.kinds = config.get('filter-kind', ('housenumber', ))
self.split_regexp = create_split_regex(config)
def __call__(self, obj):
@@ -21,7 +27,7 @@ class _HousenumberSanitizer:
new_address = []
for item in obj.address:
if item.kind in ('housenumber', 'streetnumber', 'conscriptionnumber'):
if item.kind in self.kinds:
new_address.extend(item.clone(kind='housenumber', name=n) for n in self.sanitize(item.name))
else:
# Don't touch other address items.
@@ -36,13 +42,9 @@ class _HousenumberSanitizer:
The function works as a generator that yields all valid housenumbers
that can be created from the value.
"""
for hnr in self._split_number(value):
yield from self._regularize(hnr)
def _split_number(self, hnr):
for part in re.split(r'[;,]', hnr):
yield part.strip()
for hnr in self.split_regexp.split(value):
if hnr:
yield from self._regularize(hnr)
def _regularize(self, hnr):

View File

@@ -0,0 +1,29 @@
# SPDX-License-Identifier: GPL-2.0-only
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2022 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Helper functions for sanitizers.
"""
import re
from nominatim.errors import UsageError
def create_split_regex(config, default=',;'):
    """ Convert the 'delimiters' parameter in the configuration into a
        compiled regular expression that can be used to split names on
        the delimiters.

        The regular expression swallows whitespace around the delimiters,
        so the resulting names are stripped next to a split point, and it
        treats a run of repeated delimiters as a single split point. It can
        still produce empty fields (for example when the value starts or
        ends with a delimiter); the calling code needs to filter those.

        Arguments:
            config: Mapping-like sanitizer configuration. Only the
                    'delimiters' entry is read.
            default: Delimiter set to be used when 'delimiters' is not
                     explicitly configured.

        Returns the compiled regular expression.

        Raises a UsageError when the resulting set of delimiters is empty.
    """
    delimiter_set = set(config.get('delimiters', default))
    if not delimiter_set:
        raise UsageError("Empty 'delimiters' parameter not allowed for sanitizer.")

    # Use re.escape() instead of blindly prefixing a backslash: a prefixed
    # letter would turn into a regex escape ('d' -> '\d' matches digits,
    # 'n' -> newline, 'x' -> invalid pattern) instead of the literal
    # character. Sorting keeps the generated pattern deterministic.
    char_class = ''.join(re.escape(d) for d in sorted(delimiter_set))

    return re.compile(r'\s*[{}]+\s*'.format(char_class))

View File

@@ -9,21 +9,16 @@ Sanitizer that splits lists of names into their components.
Arguments:
delimiters: Define the set of characters to be used for
splitting the list. (default: `,;`)
splitting the list. (default: ',;')
"""
import re
from nominatim.errors import UsageError
from nominatim.tokenizer.sanitizers.helpers import create_split_regex
def create(func):
""" Create a name processing function that splits name values with
multiple values into their components.
"""
delimiter_set = set(func.get('delimiters', ',;'))
if not delimiter_set:
raise UsageError("Set of delimiters in split-name-list sanitizer is empty.")
regexp = re.compile('\\s*[{}]\\s*'.format(''.join('\\' + d for d in delimiter_set)))
regexp = create_split_regex(func)
def _process(obj):
if not obj.names:

View File

@@ -13,7 +13,7 @@ Arguments:
filter-kind: Restrict the names the sanitizer should be applied to
to the given tags. The parameter expects a list of
regular expressions which are matched against `kind`.
regular expressions which are matched against 'kind'.
Note that a match against the full string is expected.
whitelist: Restrict the set of languages that should be tagged.
Expects a list of acceptable suffixes. When unset,