add support for external sanitizer modules

This commit is contained in:
Sarah Hoffmann
2022-07-25 16:10:19 +02:00
parent 7b7203c149
commit 6d41046b15
9 changed files with 142 additions and 94 deletions

View File

@@ -13,14 +13,14 @@ from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
from nominatim.data.place_info import PlaceInfo
@pytest.fixture
def sanitize(request):
def sanitize(request, def_config):
sanitizer_args = {'step': 'clean-housenumbers'}
for mark in request.node.iter_markers(name="sanitizer_params"):
sanitizer_args.update({k.replace('_', '-') : v for k,v in mark.kwargs.items()})
def _run(**kwargs):
place = PlaceInfo({'address': kwargs})
_, address = PlaceSanitizer([sanitizer_args]).process_names(place)
_, address = PlaceSanitizer([sanitizer_args], def_config).process_names(place)
return sorted([(p.kind, p.name) for p in address])
@@ -45,24 +45,24 @@ def test_filter_kind(sanitize):
@pytest.mark.parametrize('number', ('6523', 'n/a', '4'))
def test_convert_to_name_converted(number):
def test_convert_to_name_converted(def_config, number):
sanitizer_args = {'step': 'clean-housenumbers',
'convert-to-name': (r'\d+', 'n/a')}
place = PlaceInfo({'address': {'housenumber': number}})
names, address = PlaceSanitizer([sanitizer_args]).process_names(place)
names, address = PlaceSanitizer([sanitizer_args], def_config).process_names(place)
assert ('housenumber', number) in set((p.kind, p.name) for p in names)
assert 'housenumber' not in set(p.kind for p in address)
@pytest.mark.parametrize('number', ('a54', 'n.a', 'bow'))
def test_convert_to_name_unconverted(number):
def test_convert_to_name_unconverted(def_config, number):
sanitizer_args = {'step': 'clean-housenumbers',
'convert-to-name': (r'\d+', 'n/a')}
place = PlaceInfo({'address': {'housenumber': number}})
names, address = PlaceSanitizer([sanitizer_args]).process_names(place)
names, address = PlaceSanitizer([sanitizer_args], def_config).process_names(place)
assert 'housenumber' not in set(p.kind for p in names)
assert ('housenumber', number) in set((p.kind, p.name) for p in address)

View File

@@ -25,7 +25,7 @@ def sanitize(def_config, request):
if country is not None:
pi['country_code'] = country
_, address = PlaceSanitizer([sanitizer_args]).process_names(PlaceInfo(pi))
_, address = PlaceSanitizer([sanitizer_args], def_config).process_names(PlaceInfo(pi))
return sorted([(p.kind, p.name) for p in address])

View File

@@ -14,58 +14,66 @@ from nominatim.data.place_info import PlaceInfo
from nominatim.errors import UsageError
def run_sanitizer_on(**kwargs):
place = PlaceInfo({'name': kwargs})
name, _ = PlaceSanitizer([{'step': 'split-name-list'}]).process_names(place)
class TestSplitName:
return sorted([(p.name, p.kind, p.suffix) for p in name])
@pytest.fixture(autouse=True)
def setup_country(self, def_config):
self.config = def_config
def sanitize_with_delimiter(delimiter, name):
place = PlaceInfo({'name': {'name': name}})
san = PlaceSanitizer([{'step': 'split-name-list', 'delimiters': delimiter}])
name, _ = san.process_names(place)
def run_sanitizer_on(self, **kwargs):
place = PlaceInfo({'name': kwargs})
name, _ = PlaceSanitizer([{'step': 'split-name-list'}], self.config).process_names(place)
return sorted([p.name for p in name])
return sorted([(p.name, p.kind, p.suffix) for p in name])
def test_simple():
assert run_sanitizer_on(name='ABC') == [('ABC', 'name', None)]
assert run_sanitizer_on(name='') == [('', 'name', None)]
def sanitize_with_delimiter(self, delimiter, name):
place = PlaceInfo({'name': {'name': name}})
san = PlaceSanitizer([{'step': 'split-name-list', 'delimiters': delimiter}],
self.config)
name, _ = san.process_names(place)
return sorted([p.name for p in name])
def test_splits():
assert run_sanitizer_on(name='A;B;C') == [('A', 'name', None),
('B', 'name', None),
('C', 'name', None)]
assert run_sanitizer_on(short_name=' House, boat ') == [('House', 'short_name', None),
('boat', 'short_name', None)]
def test_simple(self):
assert self.run_sanitizer_on(name='ABC') == [('ABC', 'name', None)]
assert self.run_sanitizer_on(name='') == [('', 'name', None)]
def test_empty_fields():
assert run_sanitizer_on(name='A;;B') == [('A', 'name', None),
('B', 'name', None)]
assert run_sanitizer_on(name='A; ,B') == [('A', 'name', None),
('B', 'name', None)]
assert run_sanitizer_on(name=' ;B') == [('B', 'name', None)]
assert run_sanitizer_on(name='B,') == [('B', 'name', None)]
def test_splits(self):
assert self.run_sanitizer_on(name='A;B;C') == [('A', 'name', None),
('B', 'name', None),
('C', 'name', None)]
assert self.run_sanitizer_on(short_name=' House, boat ') == [('House', 'short_name', None),
('boat', 'short_name', None)]
def test_custom_delimiters():
assert sanitize_with_delimiter(':', '12:45,3') == ['12', '45,3']
assert sanitize_with_delimiter('\\', 'a;\\b!#@ \\') == ['a;', 'b!#@']
assert sanitize_with_delimiter('[]', 'foo[to]be') == ['be', 'foo', 'to']
assert sanitize_with_delimiter(' ', 'morning sun') == ['morning', 'sun']
def test_empty_fields(self):
assert self.run_sanitizer_on(name='A;;B') == [('A', 'name', None),
('B', 'name', None)]
assert self.run_sanitizer_on(name='A; ,B') == [('A', 'name', None),
('B', 'name', None)]
assert self.run_sanitizer_on(name=' ;B') == [('B', 'name', None)]
assert self.run_sanitizer_on(name='B,') == [('B', 'name', None)]
def test_empty_delimiter_set():
with pytest.raises(UsageError):
sanitize_with_delimiter('', 'abc')
def test_custom_delimiters(self):
assert self.sanitize_with_delimiter(':', '12:45,3') == ['12', '45,3']
assert self.sanitize_with_delimiter('\\', 'a;\\b!#@ \\') == ['a;', 'b!#@']
assert self.sanitize_with_delimiter('[]', 'foo[to]be') == ['be', 'foo', 'to']
assert self.sanitize_with_delimiter(' ', 'morning sun') == ['morning', 'sun']
def test_no_name_list():
def test_empty_delimiter_set(self):
with pytest.raises(UsageError):
self.sanitize_with_delimiter('', 'abc')
def test_no_name_list(def_config):
place = PlaceInfo({'address': {'housenumber': '3'}})
name, address = PlaceSanitizer([{'step': 'split-name-list'}]).process_names(place)
name, address = PlaceSanitizer([{'step': 'split-name-list'}], def_config).process_names(place)
assert not name
assert len(address) == 1

View File

@@ -12,39 +12,45 @@ import pytest
from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
from nominatim.data.place_info import PlaceInfo
def run_sanitizer_on(**kwargs):
place = PlaceInfo({'name': kwargs})
name, _ = PlaceSanitizer([{'step': 'strip-brace-terms'}]).process_names(place)
class TestStripBrace:
return sorted([(p.name, p.kind, p.suffix) for p in name])
@pytest.fixture(autouse=True)
def setup_country(self, def_config):
self.config = def_config
def run_sanitizer_on(self, **kwargs):
place = PlaceInfo({'name': kwargs})
name, _ = PlaceSanitizer([{'step': 'strip-brace-terms'}], self.config).process_names(place)
return sorted([(p.name, p.kind, p.suffix) for p in name])
def test_no_braces():
assert run_sanitizer_on(name='foo', ref='23') == [('23', 'ref', None),
('foo', 'name', None)]
def test_no_braces(self):
assert self.run_sanitizer_on(name='foo', ref='23') == [('23', 'ref', None),
('foo', 'name', None)]
def test_simple_braces():
assert run_sanitizer_on(name='Halle (Saale)', ref='3')\
== [('3', 'ref', None), ('Halle', 'name', None), ('Halle (Saale)', 'name', None)]
assert run_sanitizer_on(name='ack ( bar')\
== [('ack', 'name', None), ('ack ( bar', 'name', None)]
def test_simple_braces(self):
assert self.run_sanitizer_on(name='Halle (Saale)', ref='3')\
== [('3', 'ref', None), ('Halle', 'name', None), ('Halle (Saale)', 'name', None)]
assert self.run_sanitizer_on(name='ack ( bar')\
== [('ack', 'name', None), ('ack ( bar', 'name', None)]
def test_only_braces():
assert run_sanitizer_on(name='(maybe)') == [('(maybe)', 'name', None)]
def test_only_braces(self):
assert self.run_sanitizer_on(name='(maybe)') == [('(maybe)', 'name', None)]
def test_double_braces():
assert run_sanitizer_on(name='a((b))') == [('a', 'name', None),
('a((b))', 'name', None)]
assert run_sanitizer_on(name='a (b) (c)') == [('a', 'name', None),
('a (b) (c)', 'name', None)]
def test_double_braces(self):
assert self.run_sanitizer_on(name='a((b))') == [('a', 'name', None),
('a((b))', 'name', None)]
assert self.run_sanitizer_on(name='a (b) (c)') == [('a', 'name', None),
('a (b) (c)', 'name', None)]
def test_no_names():
def test_no_names(def_config):
place = PlaceInfo({'address': {'housenumber': '3'}})
name, address = PlaceSanitizer([{'step': 'strip-brace-terms'}]).process_names(place)
name, address = PlaceSanitizer([{'step': 'strip-brace-terms'}], def_config).process_names(place)
assert not name
assert len(address) == 1

View File

@@ -15,11 +15,16 @@ from nominatim.data.country_info import setup_country_config
class TestWithDefaults:
@staticmethod
def run_sanitizer_on(country, **kwargs):
@pytest.fixture(autouse=True)
def setup_country(self, def_config):
self.config = def_config
def run_sanitizer_on(self, country, **kwargs):
place = PlaceInfo({'name': {k.replace('_', ':'): v for k, v in kwargs.items()},
'country_code': country})
name, _ = PlaceSanitizer([{'step': 'tag-analyzer-by-language'}]).process_names(place)
name, _ = PlaceSanitizer([{'step': 'tag-analyzer-by-language'}],
self.config).process_names(place)
return sorted([(p.name, p.kind, p.suffix, p.attr) for p in name])
@@ -44,12 +49,17 @@ class TestWithDefaults:
class TestFilterKind:
@staticmethod
def run_sanitizer_on(filt, **kwargs):
@pytest.fixture(autouse=True)
def setup_country(self, def_config):
self.config = def_config
def run_sanitizer_on(self, filt, **kwargs):
place = PlaceInfo({'name': {k.replace('_', ':'): v for k, v in kwargs.items()},
'country_code': 'de'})
name, _ = PlaceSanitizer([{'step': 'tag-analyzer-by-language',
'filter-kind': filt}]).process_names(place)
'filter-kind': filt}],
self.config).process_names(place)
return sorted([(p.name, p.kind, p.suffix, p.attr) for p in name])
@@ -94,14 +104,16 @@ class TestDefaultCountry:
@pytest.fixture(autouse=True)
def setup_country(self, def_config):
setup_country_config(def_config)
self.config = def_config
@staticmethod
def run_sanitizer_append(mode, country, **kwargs):
def run_sanitizer_append(self, mode, country, **kwargs):
place = PlaceInfo({'name': {k.replace('_', ':'): v for k, v in kwargs.items()},
'country_code': country})
name, _ = PlaceSanitizer([{'step': 'tag-analyzer-by-language',
'use-defaults': mode,
'mode': 'append'}]).process_names(place)
'mode': 'append'}],
self.config).process_names(place)
assert all(isinstance(p.attr, dict) for p in name)
assert all(len(p.attr) <= 1 for p in name)
@@ -111,13 +123,13 @@ class TestDefaultCountry:
return sorted([(p.name, p.attr.get('analyzer', '')) for p in name])
@staticmethod
def run_sanitizer_replace(mode, country, **kwargs):
def run_sanitizer_replace(self, mode, country, **kwargs):
place = PlaceInfo({'name': {k.replace('_', ':'): v for k, v in kwargs.items()},
'country_code': country})
name, _ = PlaceSanitizer([{'step': 'tag-analyzer-by-language',
'use-defaults': mode,
'mode': 'replace'}]).process_names(place)
'mode': 'replace'}],
self.config).process_names(place)
assert all(isinstance(p.attr, dict) for p in name)
assert all(len(p.attr) <= 1 for p in name)
@@ -131,7 +143,8 @@ class TestDefaultCountry:
place = PlaceInfo({'name': {'name': 'something'}})
name, _ = PlaceSanitizer([{'step': 'tag-analyzer-by-language',
'use-defaults': 'all',
'mode': 'replace'}]).process_names(place)
'mode': 'replace'}],
self.config).process_names(place)
assert len(name) == 1
assert name[0].name == 'something'
@@ -199,14 +212,19 @@ class TestDefaultCountry:
class TestCountryWithWhitelist:
@staticmethod
def run_sanitizer_on(mode, country, **kwargs):
@pytest.fixture(autouse=True)
def setup_country(self, def_config):
self.config = def_config
def run_sanitizer_on(self, mode, country, **kwargs):
place = PlaceInfo({'name': {k.replace('_', ':'): v for k, v in kwargs.items()},
'country_code': country})
name, _ = PlaceSanitizer([{'step': 'tag-analyzer-by-language',
'use-defaults': mode,
'mode': 'replace',
'whitelist': ['de', 'fr', 'ru']}]).process_names(place)
'whitelist': ['de', 'fr', 'ru']}],
self.config).process_names(place)
assert all(isinstance(p.attr, dict) for p in name)
assert all(len(p.attr) <= 1 for p in name)
@@ -238,12 +256,17 @@ class TestCountryWithWhitelist:
class TestWhiteList:
@staticmethod
def run_sanitizer_on(whitelist, **kwargs):
@pytest.fixture(autouse=True)
def setup_country(self, def_config):
self.config = def_config
def run_sanitizer_on(self, whitelist, **kwargs):
place = PlaceInfo({'name': {k.replace('_', ':'): v for k, v in kwargs.items()}})
name, _ = PlaceSanitizer([{'step': 'tag-analyzer-by-language',
'mode': 'replace',
'whitelist': whitelist}]).process_names(place)
'whitelist': whitelist}],
self.config).process_names(place)
assert all(isinstance(p.attr, dict) for p in name)
assert all(len(p.attr) <= 1 for p in name)

View File

@@ -47,8 +47,8 @@ def test_placeinfo_has_attr():
assert not place.has_attr('whatever')
def test_sanitizer_default():
san = sanitizer.PlaceSanitizer([{'step': 'split-name-list'}])
def test_sanitizer_default(def_config):
san = sanitizer.PlaceSanitizer([{'step': 'split-name-list'}], def_config)
name, address = san.process_names(PlaceInfo({'name': {'name:de:de': '1;2;3'},
'address': {'street': 'Bald'}}))
@@ -63,8 +63,8 @@ def test_sanitizer_default():
@pytest.mark.parametrize('rules', [None, []])
def test_sanitizer_empty_list(rules):
san = sanitizer.PlaceSanitizer(rules)
def test_sanitizer_empty_list(def_config, rules):
san = sanitizer.PlaceSanitizer(rules, def_config)
name, address = san.process_names(PlaceInfo({'name': {'name:de:de': '1;2;3'}}))
@@ -72,6 +72,6 @@ def test_sanitizer_empty_list(rules):
assert all(isinstance(n, sanitizer.PlaceName) for n in name)
def test_sanitizer_missing_step_definition():
def test_sanitizer_missing_step_definition(def_config):
with pytest.raises(UsageError):
san = sanitizer.PlaceSanitizer([{'id': 'split-name-list'}])
san = sanitizer.PlaceSanitizer([{'id': 'split-name-list'}], def_config)