add support for external sanitizer modules

This commit is contained in:
Sarah Hoffmann
2022-07-25 16:10:19 +02:00
parent 7b7203c149
commit 6d41046b15
9 changed files with 142 additions and 94 deletions

View File

@@ -45,6 +45,7 @@ class ICURuleLoader:
""" """
def __init__(self, config: Configuration) -> None: def __init__(self, config: Configuration) -> None:
self.config = config
rules = config.load_sub_configuration('icu_tokenizer.yaml', rules = config.load_sub_configuration('icu_tokenizer.yaml',
config='TOKENIZER_CONFIG') config='TOKENIZER_CONFIG')
@@ -92,7 +93,7 @@ class ICURuleLoader:
def make_sanitizer(self) -> PlaceSanitizer: def make_sanitizer(self) -> PlaceSanitizer:
""" Create a place sanitizer from the configured rules. """ Create a place sanitizer from the configured rules.
""" """
return PlaceSanitizer(self.sanitizer_rules) return PlaceSanitizer(self.sanitizer_rules, self.config)
def make_token_analysis(self) -> ICUTokenAnalysis: def make_token_analysis(self) -> ICUTokenAnalysis:

View File

@@ -9,9 +9,9 @@ Handler for cleaning name and address tags in place information before it
is handed to the token analysis. is handed to the token analysis.
""" """
from typing import Optional, List, Mapping, Sequence, Callable, Any, Tuple from typing import Optional, List, Mapping, Sequence, Callable, Any, Tuple
import importlib
from nominatim.errors import UsageError from nominatim.errors import UsageError
from nominatim.config import Configuration
from nominatim.tokenizer.sanitizers.config import SanitizerConfig from nominatim.tokenizer.sanitizers.config import SanitizerConfig
from nominatim.tokenizer.sanitizers.base import SanitizerHandler, ProcessInfo, PlaceName from nominatim.tokenizer.sanitizers.base import SanitizerHandler, ProcessInfo, PlaceName
from nominatim.data.place_info import PlaceInfo from nominatim.data.place_info import PlaceInfo
@@ -22,16 +22,21 @@ class PlaceSanitizer:
names and address before they are used by the token analysers. names and address before they are used by the token analysers.
""" """
def __init__(self, rules: Optional[Sequence[Mapping[str, Any]]]) -> None: def __init__(self, rules: Optional[Sequence[Mapping[str, Any]]],
config: Configuration) -> None:
self.handlers: List[Callable[[ProcessInfo], None]] = [] self.handlers: List[Callable[[ProcessInfo], None]] = []
if rules: if rules:
for func in rules: for func in rules:
if 'step' not in func: if 'step' not in func:
raise UsageError("Sanitizer rule is missing the 'step' attribute.") raise UsageError("Sanitizer rule is missing the 'step' attribute.")
module_name = 'nominatim.tokenizer.sanitizers.' + func['step'].replace('-', '_') if not isinstance(func['step'], str):
handler_module: SanitizerHandler = importlib.import_module(module_name) raise UsageError("'step' attribute must be a simple string.")
self.handlers.append(handler_module.create(SanitizerConfig(func)))
module: SanitizerHandler = \
config.load_plugin_module(func['step'], 'nominatim.tokenizer.sanitizers')
self.handlers.append(module.create(SanitizerConfig(func)))
def process_names(self, place: PlaceInfo) -> Tuple[List[PlaceName], List[PlaceName]]: def process_names(self, place: PlaceInfo) -> Tuple[List[PlaceName], List[PlaceName]]:

View File

@@ -31,6 +31,11 @@ def test_load_default_module(test_config):
assert isinstance(module.NOMINATIM_VERSION, tuple) assert isinstance(module.NOMINATIM_VERSION, tuple)
def test_load_default_module_with_hyphen(test_config):
module = test_config.load_plugin_module('place-info', 'nominatim.data')
assert isinstance(module.PlaceInfo, object)
def test_load_plugin_module(test_config, tmp_path): def test_load_plugin_module(test_config, tmp_path):
(tmp_path / 'project' / 'testpath').mkdir() (tmp_path / 'project' / 'testpath').mkdir()

View File

@@ -13,14 +13,14 @@ from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
from nominatim.data.place_info import PlaceInfo from nominatim.data.place_info import PlaceInfo
@pytest.fixture @pytest.fixture
def sanitize(request): def sanitize(request, def_config):
sanitizer_args = {'step': 'clean-housenumbers'} sanitizer_args = {'step': 'clean-housenumbers'}
for mark in request.node.iter_markers(name="sanitizer_params"): for mark in request.node.iter_markers(name="sanitizer_params"):
sanitizer_args.update({k.replace('_', '-') : v for k,v in mark.kwargs.items()}) sanitizer_args.update({k.replace('_', '-') : v for k,v in mark.kwargs.items()})
def _run(**kwargs): def _run(**kwargs):
place = PlaceInfo({'address': kwargs}) place = PlaceInfo({'address': kwargs})
_, address = PlaceSanitizer([sanitizer_args]).process_names(place) _, address = PlaceSanitizer([sanitizer_args], def_config).process_names(place)
return sorted([(p.kind, p.name) for p in address]) return sorted([(p.kind, p.name) for p in address])
@@ -45,24 +45,24 @@ def test_filter_kind(sanitize):
@pytest.mark.parametrize('number', ('6523', 'n/a', '4')) @pytest.mark.parametrize('number', ('6523', 'n/a', '4'))
def test_convert_to_name_converted(number): def test_convert_to_name_converted(def_config, number):
sanitizer_args = {'step': 'clean-housenumbers', sanitizer_args = {'step': 'clean-housenumbers',
'convert-to-name': (r'\d+', 'n/a')} 'convert-to-name': (r'\d+', 'n/a')}
place = PlaceInfo({'address': {'housenumber': number}}) place = PlaceInfo({'address': {'housenumber': number}})
names, address = PlaceSanitizer([sanitizer_args]).process_names(place) names, address = PlaceSanitizer([sanitizer_args], def_config).process_names(place)
assert ('housenumber', number) in set((p.kind, p.name) for p in names) assert ('housenumber', number) in set((p.kind, p.name) for p in names)
assert 'housenumber' not in set(p.kind for p in address) assert 'housenumber' not in set(p.kind for p in address)
@pytest.mark.parametrize('number', ('a54', 'n.a', 'bow')) @pytest.mark.parametrize('number', ('a54', 'n.a', 'bow'))
def test_convert_to_name_unconverted(number): def test_convert_to_name_unconverted(def_config, number):
sanitizer_args = {'step': 'clean-housenumbers', sanitizer_args = {'step': 'clean-housenumbers',
'convert-to-name': (r'\d+', 'n/a')} 'convert-to-name': (r'\d+', 'n/a')}
place = PlaceInfo({'address': {'housenumber': number}}) place = PlaceInfo({'address': {'housenumber': number}})
names, address = PlaceSanitizer([sanitizer_args]).process_names(place) names, address = PlaceSanitizer([sanitizer_args], def_config).process_names(place)
assert 'housenumber' not in set(p.kind for p in names) assert 'housenumber' not in set(p.kind for p in names)
assert ('housenumber', number) in set((p.kind, p.name) for p in address) assert ('housenumber', number) in set((p.kind, p.name) for p in address)

View File

@@ -25,7 +25,7 @@ def sanitize(def_config, request):
if country is not None: if country is not None:
pi['country_code'] = country pi['country_code'] = country
_, address = PlaceSanitizer([sanitizer_args]).process_names(PlaceInfo(pi)) _, address = PlaceSanitizer([sanitizer_args], def_config).process_names(PlaceInfo(pi))
return sorted([(p.kind, p.name) for p in address]) return sorted([(p.kind, p.name) for p in address])

View File

@@ -14,58 +14,66 @@ from nominatim.data.place_info import PlaceInfo
from nominatim.errors import UsageError from nominatim.errors import UsageError
def run_sanitizer_on(**kwargs): class TestSplitName:
place = PlaceInfo({'name': kwargs})
name, _ = PlaceSanitizer([{'step': 'split-name-list'}]).process_names(place)
return sorted([(p.name, p.kind, p.suffix) for p in name]) @pytest.fixture(autouse=True)
def setup_country(self, def_config):
self.config = def_config
def sanitize_with_delimiter(delimiter, name): def run_sanitizer_on(self, **kwargs):
place = PlaceInfo({'name': {'name': name}}) place = PlaceInfo({'name': kwargs})
san = PlaceSanitizer([{'step': 'split-name-list', 'delimiters': delimiter}]) name, _ = PlaceSanitizer([{'step': 'split-name-list'}], self.config).process_names(place)
name, _ = san.process_names(place)
return sorted([p.name for p in name]) return sorted([(p.name, p.kind, p.suffix) for p in name])
def test_simple(): def sanitize_with_delimiter(self, delimiter, name):
assert run_sanitizer_on(name='ABC') == [('ABC', 'name', None)] place = PlaceInfo({'name': {'name': name}})
assert run_sanitizer_on(name='') == [('', 'name', None)] san = PlaceSanitizer([{'step': 'split-name-list', 'delimiters': delimiter}],
self.config)
name, _ = san.process_names(place)
return sorted([p.name for p in name])
def test_splits(): def test_simple(self):
assert run_sanitizer_on(name='A;B;C') == [('A', 'name', None), assert self.run_sanitizer_on(name='ABC') == [('ABC', 'name', None)]
('B', 'name', None), assert self.run_sanitizer_on(name='') == [('', 'name', None)]
('C', 'name', None)]
assert run_sanitizer_on(short_name=' House, boat ') == [('House', 'short_name', None),
('boat', 'short_name', None)]
def test_empty_fields(): def test_splits(self):
assert run_sanitizer_on(name='A;;B') == [('A', 'name', None), assert self.run_sanitizer_on(name='A;B;C') == [('A', 'name', None),
('B', 'name', None)] ('B', 'name', None),
assert run_sanitizer_on(name='A; ,B') == [('A', 'name', None), ('C', 'name', None)]
('B', 'name', None)] assert self.run_sanitizer_on(short_name=' House, boat ') == [('House', 'short_name', None),
assert run_sanitizer_on(name=' ;B') == [('B', 'name', None)] ('boat', 'short_name', None)]
assert run_sanitizer_on(name='B,') == [('B', 'name', None)]
def test_custom_delimiters(): def test_empty_fields(self):
assert sanitize_with_delimiter(':', '12:45,3') == ['12', '45,3'] assert self.run_sanitizer_on(name='A;;B') == [('A', 'name', None),
assert sanitize_with_delimiter('\\', 'a;\\b!#@ \\') == ['a;', 'b!#@'] ('B', 'name', None)]
assert sanitize_with_delimiter('[]', 'foo[to]be') == ['be', 'foo', 'to'] assert self.run_sanitizer_on(name='A; ,B') == [('A', 'name', None),
assert sanitize_with_delimiter(' ', 'morning sun') == ['morning', 'sun'] ('B', 'name', None)]
assert self.run_sanitizer_on(name=' ;B') == [('B', 'name', None)]
assert self.run_sanitizer_on(name='B,') == [('B', 'name', None)]
def test_empty_delimiter_set(): def test_custom_delimiters(self):
with pytest.raises(UsageError): assert self.sanitize_with_delimiter(':', '12:45,3') == ['12', '45,3']
sanitize_with_delimiter('', 'abc') assert self.sanitize_with_delimiter('\\', 'a;\\b!#@ \\') == ['a;', 'b!#@']
assert self.sanitize_with_delimiter('[]', 'foo[to]be') == ['be', 'foo', 'to']
assert self.sanitize_with_delimiter(' ', 'morning sun') == ['morning', 'sun']
def test_no_name_list(): def test_empty_delimiter_set(self):
with pytest.raises(UsageError):
self.sanitize_with_delimiter('', 'abc')
def test_no_name_list(def_config):
place = PlaceInfo({'address': {'housenumber': '3'}}) place = PlaceInfo({'address': {'housenumber': '3'}})
name, address = PlaceSanitizer([{'step': 'split-name-list'}]).process_names(place) name, address = PlaceSanitizer([{'step': 'split-name-list'}], def_config).process_names(place)
assert not name assert not name
assert len(address) == 1 assert len(address) == 1

View File

@@ -12,39 +12,45 @@ import pytest
from nominatim.tokenizer.place_sanitizer import PlaceSanitizer from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
from nominatim.data.place_info import PlaceInfo from nominatim.data.place_info import PlaceInfo
def run_sanitizer_on(**kwargs): class TestStripBrace:
place = PlaceInfo({'name': kwargs})
name, _ = PlaceSanitizer([{'step': 'strip-brace-terms'}]).process_names(place)
return sorted([(p.name, p.kind, p.suffix) for p in name]) @pytest.fixture(autouse=True)
def setup_country(self, def_config):
self.config = def_config
def run_sanitizer_on(self, **kwargs):
place = PlaceInfo({'name': kwargs})
name, _ = PlaceSanitizer([{'step': 'strip-brace-terms'}], self.config).process_names(place)
return sorted([(p.name, p.kind, p.suffix) for p in name])
def test_no_braces(): def test_no_braces(self):
assert run_sanitizer_on(name='foo', ref='23') == [('23', 'ref', None), assert self.run_sanitizer_on(name='foo', ref='23') == [('23', 'ref', None),
('foo', 'name', None)] ('foo', 'name', None)]
def test_simple_braces(): def test_simple_braces(self):
assert run_sanitizer_on(name='Halle (Saale)', ref='3')\ assert self.run_sanitizer_on(name='Halle (Saale)', ref='3')\
== [('3', 'ref', None), ('Halle', 'name', None), ('Halle (Saale)', 'name', None)] == [('3', 'ref', None), ('Halle', 'name', None), ('Halle (Saale)', 'name', None)]
assert run_sanitizer_on(name='ack ( bar')\ assert self.run_sanitizer_on(name='ack ( bar')\
== [('ack', 'name', None), ('ack ( bar', 'name', None)] == [('ack', 'name', None), ('ack ( bar', 'name', None)]
def test_only_braces(): def test_only_braces(self):
assert run_sanitizer_on(name='(maybe)') == [('(maybe)', 'name', None)] assert self.run_sanitizer_on(name='(maybe)') == [('(maybe)', 'name', None)]
def test_double_braces(): def test_double_braces(self):
assert run_sanitizer_on(name='a((b))') == [('a', 'name', None), assert self.run_sanitizer_on(name='a((b))') == [('a', 'name', None),
('a((b))', 'name', None)] ('a((b))', 'name', None)]
assert run_sanitizer_on(name='a (b) (c)') == [('a', 'name', None), assert self.run_sanitizer_on(name='a (b) (c)') == [('a', 'name', None),
('a (b) (c)', 'name', None)] ('a (b) (c)', 'name', None)]
def test_no_names(): def test_no_names(def_config):
place = PlaceInfo({'address': {'housenumber': '3'}}) place = PlaceInfo({'address': {'housenumber': '3'}})
name, address = PlaceSanitizer([{'step': 'strip-brace-terms'}]).process_names(place) name, address = PlaceSanitizer([{'step': 'strip-brace-terms'}], def_config).process_names(place)
assert not name assert not name
assert len(address) == 1 assert len(address) == 1

View File

@@ -15,11 +15,16 @@ from nominatim.data.country_info import setup_country_config
class TestWithDefaults: class TestWithDefaults:
@staticmethod @pytest.fixture(autouse=True)
def run_sanitizer_on(country, **kwargs): def setup_country(self, def_config):
self.config = def_config
def run_sanitizer_on(self, country, **kwargs):
place = PlaceInfo({'name': {k.replace('_', ':'): v for k, v in kwargs.items()}, place = PlaceInfo({'name': {k.replace('_', ':'): v for k, v in kwargs.items()},
'country_code': country}) 'country_code': country})
name, _ = PlaceSanitizer([{'step': 'tag-analyzer-by-language'}]).process_names(place) name, _ = PlaceSanitizer([{'step': 'tag-analyzer-by-language'}],
self.config).process_names(place)
return sorted([(p.name, p.kind, p.suffix, p.attr) for p in name]) return sorted([(p.name, p.kind, p.suffix, p.attr) for p in name])
@@ -44,12 +49,17 @@ class TestWithDefaults:
class TestFilterKind: class TestFilterKind:
@staticmethod @pytest.fixture(autouse=True)
def run_sanitizer_on(filt, **kwargs): def setup_country(self, def_config):
self.config = def_config
def run_sanitizer_on(self, filt, **kwargs):
place = PlaceInfo({'name': {k.replace('_', ':'): v for k, v in kwargs.items()}, place = PlaceInfo({'name': {k.replace('_', ':'): v for k, v in kwargs.items()},
'country_code': 'de'}) 'country_code': 'de'})
name, _ = PlaceSanitizer([{'step': 'tag-analyzer-by-language', name, _ = PlaceSanitizer([{'step': 'tag-analyzer-by-language',
'filter-kind': filt}]).process_names(place) 'filter-kind': filt}],
self.config).process_names(place)
return sorted([(p.name, p.kind, p.suffix, p.attr) for p in name]) return sorted([(p.name, p.kind, p.suffix, p.attr) for p in name])
@@ -94,14 +104,16 @@ class TestDefaultCountry:
@pytest.fixture(autouse=True) @pytest.fixture(autouse=True)
def setup_country(self, def_config): def setup_country(self, def_config):
setup_country_config(def_config) setup_country_config(def_config)
self.config = def_config
@staticmethod
def run_sanitizer_append(mode, country, **kwargs): def run_sanitizer_append(self, mode, country, **kwargs):
place = PlaceInfo({'name': {k.replace('_', ':'): v for k, v in kwargs.items()}, place = PlaceInfo({'name': {k.replace('_', ':'): v for k, v in kwargs.items()},
'country_code': country}) 'country_code': country})
name, _ = PlaceSanitizer([{'step': 'tag-analyzer-by-language', name, _ = PlaceSanitizer([{'step': 'tag-analyzer-by-language',
'use-defaults': mode, 'use-defaults': mode,
'mode': 'append'}]).process_names(place) 'mode': 'append'}],
self.config).process_names(place)
assert all(isinstance(p.attr, dict) for p in name) assert all(isinstance(p.attr, dict) for p in name)
assert all(len(p.attr) <= 1 for p in name) assert all(len(p.attr) <= 1 for p in name)
@@ -111,13 +123,13 @@ class TestDefaultCountry:
return sorted([(p.name, p.attr.get('analyzer', '')) for p in name]) return sorted([(p.name, p.attr.get('analyzer', '')) for p in name])
@staticmethod def run_sanitizer_replace(self, mode, country, **kwargs):
def run_sanitizer_replace(mode, country, **kwargs):
place = PlaceInfo({'name': {k.replace('_', ':'): v for k, v in kwargs.items()}, place = PlaceInfo({'name': {k.replace('_', ':'): v for k, v in kwargs.items()},
'country_code': country}) 'country_code': country})
name, _ = PlaceSanitizer([{'step': 'tag-analyzer-by-language', name, _ = PlaceSanitizer([{'step': 'tag-analyzer-by-language',
'use-defaults': mode, 'use-defaults': mode,
'mode': 'replace'}]).process_names(place) 'mode': 'replace'}],
self.config).process_names(place)
assert all(isinstance(p.attr, dict) for p in name) assert all(isinstance(p.attr, dict) for p in name)
assert all(len(p.attr) <= 1 for p in name) assert all(len(p.attr) <= 1 for p in name)
@@ -131,7 +143,8 @@ class TestDefaultCountry:
place = PlaceInfo({'name': {'name': 'something'}}) place = PlaceInfo({'name': {'name': 'something'}})
name, _ = PlaceSanitizer([{'step': 'tag-analyzer-by-language', name, _ = PlaceSanitizer([{'step': 'tag-analyzer-by-language',
'use-defaults': 'all', 'use-defaults': 'all',
'mode': 'replace'}]).process_names(place) 'mode': 'replace'}],
self.config).process_names(place)
assert len(name) == 1 assert len(name) == 1
assert name[0].name == 'something' assert name[0].name == 'something'
@@ -199,14 +212,19 @@ class TestDefaultCountry:
class TestCountryWithWhitelist: class TestCountryWithWhitelist:
@staticmethod @pytest.fixture(autouse=True)
def run_sanitizer_on(mode, country, **kwargs): def setup_country(self, def_config):
self.config = def_config
def run_sanitizer_on(self, mode, country, **kwargs):
place = PlaceInfo({'name': {k.replace('_', ':'): v for k, v in kwargs.items()}, place = PlaceInfo({'name': {k.replace('_', ':'): v for k, v in kwargs.items()},
'country_code': country}) 'country_code': country})
name, _ = PlaceSanitizer([{'step': 'tag-analyzer-by-language', name, _ = PlaceSanitizer([{'step': 'tag-analyzer-by-language',
'use-defaults': mode, 'use-defaults': mode,
'mode': 'replace', 'mode': 'replace',
'whitelist': ['de', 'fr', 'ru']}]).process_names(place) 'whitelist': ['de', 'fr', 'ru']}],
self.config).process_names(place)
assert all(isinstance(p.attr, dict) for p in name) assert all(isinstance(p.attr, dict) for p in name)
assert all(len(p.attr) <= 1 for p in name) assert all(len(p.attr) <= 1 for p in name)
@@ -238,12 +256,17 @@ class TestCountryWithWhitelist:
class TestWhiteList: class TestWhiteList:
@staticmethod @pytest.fixture(autouse=True)
def run_sanitizer_on(whitelist, **kwargs): def setup_country(self, def_config):
self.config = def_config
def run_sanitizer_on(self, whitelist, **kwargs):
place = PlaceInfo({'name': {k.replace('_', ':'): v for k, v in kwargs.items()}}) place = PlaceInfo({'name': {k.replace('_', ':'): v for k, v in kwargs.items()}})
name, _ = PlaceSanitizer([{'step': 'tag-analyzer-by-language', name, _ = PlaceSanitizer([{'step': 'tag-analyzer-by-language',
'mode': 'replace', 'mode': 'replace',
'whitelist': whitelist}]).process_names(place) 'whitelist': whitelist}],
self.config).process_names(place)
assert all(isinstance(p.attr, dict) for p in name) assert all(isinstance(p.attr, dict) for p in name)
assert all(len(p.attr) <= 1 for p in name) assert all(len(p.attr) <= 1 for p in name)

View File

@@ -47,8 +47,8 @@ def test_placeinfo_has_attr():
assert not place.has_attr('whatever') assert not place.has_attr('whatever')
def test_sanitizer_default(): def test_sanitizer_default(def_config):
san = sanitizer.PlaceSanitizer([{'step': 'split-name-list'}]) san = sanitizer.PlaceSanitizer([{'step': 'split-name-list'}], def_config)
name, address = san.process_names(PlaceInfo({'name': {'name:de:de': '1;2;3'}, name, address = san.process_names(PlaceInfo({'name': {'name:de:de': '1;2;3'},
'address': {'street': 'Bald'}})) 'address': {'street': 'Bald'}}))
@@ -63,8 +63,8 @@ def test_sanitizer_default():
@pytest.mark.parametrize('rules', [None, []]) @pytest.mark.parametrize('rules', [None, []])
def test_sanitizer_empty_list(rules): def test_sanitizer_empty_list(def_config, rules):
san = sanitizer.PlaceSanitizer(rules) san = sanitizer.PlaceSanitizer(rules, def_config)
name, address = san.process_names(PlaceInfo({'name': {'name:de:de': '1;2;3'}})) name, address = san.process_names(PlaceInfo({'name': {'name:de:de': '1;2;3'}}))
@@ -72,6 +72,6 @@ def test_sanitizer_empty_list(rules):
assert all(isinstance(n, sanitizer.PlaceName) for n in name) assert all(isinstance(n, sanitizer.PlaceName) for n in name)
def test_sanitizer_missing_step_definition(): def test_sanitizer_missing_step_definition(def_config):
with pytest.raises(UsageError): with pytest.raises(UsageError):
san = sanitizer.PlaceSanitizer([{'id': 'split-name-list'}]) san = sanitizer.PlaceSanitizer([{'id': 'split-name-list'}], def_config)