mirror of
https://github.com/osm-search/Nominatim.git
synced 2026-02-26 11:08:13 +00:00
add tests for sanitizer tagging language
This commit is contained in:
@@ -124,7 +124,7 @@ class ICURuleLoader:
|
|||||||
else:
|
else:
|
||||||
LOG.fatal("ICU tokenizer configuration has two token "
|
LOG.fatal("ICU tokenizer configuration has two token "
|
||||||
"analyzers with id '%s'.", name)
|
"analyzers with id '%s'.", name)
|
||||||
UsageError("Syntax error in ICU tokenizer config.")
|
raise UsageError("Syntax error in ICU tokenizer config.")
|
||||||
self.analysis[name] = TokenAnalyzerRule(section, self.normalization_rules)
|
self.analysis[name] = TokenAnalyzerRule(section, self.normalization_rules)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -15,30 +15,30 @@ class _AnalyzerByLanguage:
|
|||||||
else:
|
else:
|
||||||
self.regexes = None
|
self.regexes = None
|
||||||
|
|
||||||
self.use_defaults = config.get('use-defaults', 'no')
|
|
||||||
if self.use_defaults not in ('mono', 'all'):
|
|
||||||
self.use_defaults = False
|
|
||||||
|
|
||||||
self.replace = config.get('mode', 'replace') != 'append'
|
self.replace = config.get('mode', 'replace') != 'append'
|
||||||
self.whitelist = config.get('whitelist')
|
self.whitelist = config.get('whitelist')
|
||||||
|
|
||||||
# Compute the languages to use when no suffix is given.
|
self.__compute_default_languages(config.get('use-defaults', 'no'))
|
||||||
self.deflangs = {}
|
|
||||||
for ccode, prop in country_info.iterate():
|
|
||||||
clangs = prop['languages']
|
|
||||||
if len(clangs) == 1 or self.use_defaults == 'all':
|
|
||||||
if self.whitelist:
|
|
||||||
self.deflangs[ccode] = [l for l in clangs if l in self.whitelist]
|
|
||||||
else:
|
|
||||||
self.deflangs[ccode] = clangs
|
|
||||||
|
|
||||||
|
|
||||||
|
def __compute_default_languages(self, use_defaults):
|
||||||
|
self.deflangs = {}
|
||||||
|
|
||||||
|
if use_defaults in ('mono', 'all'):
|
||||||
|
for ccode, prop in country_info.iterate():
|
||||||
|
clangs = prop['languages']
|
||||||
|
if len(clangs) == 1 or use_defaults == 'all':
|
||||||
|
if self.whitelist:
|
||||||
|
self.deflangs[ccode] = [l for l in clangs if l in self.whitelist]
|
||||||
|
else:
|
||||||
|
self.deflangs[ccode] = clangs
|
||||||
|
|
||||||
|
|
||||||
def _kind_matches(self, kind):
|
def _kind_matches(self, kind):
|
||||||
if self.regexes is None:
|
if self.regexes is None:
|
||||||
return True
|
return True
|
||||||
|
|
||||||
return any(regex.search(kind) for regex in self.regexes)
|
return any(regex.fullmatch(kind) for regex in self.regexes)
|
||||||
|
|
||||||
|
|
||||||
def _suffix_matches(self, suffix):
|
def _suffix_matches(self, suffix):
|
||||||
@@ -59,10 +59,8 @@ class _AnalyzerByLanguage:
|
|||||||
if name.suffix:
|
if name.suffix:
|
||||||
langs = [name.suffix] if self._suffix_matches(name.suffix) else None
|
langs = [name.suffix] if self._suffix_matches(name.suffix) else None
|
||||||
else:
|
else:
|
||||||
if self.use_defaults:
|
langs = self.deflangs.get(obj.place.country_code)
|
||||||
langs = self.deflangs.get(obj.place.country_code)
|
|
||||||
if self.use_defaults == 'mono' and len(langs) > 1:
|
|
||||||
langs = None
|
|
||||||
|
|
||||||
if langs:
|
if langs:
|
||||||
if self.replace:
|
if self.replace:
|
||||||
|
|||||||
@@ -0,0 +1,259 @@
|
|||||||
|
"""
|
||||||
|
Tests for the sanitizer that enables language-dependent analyzers.
|
||||||
|
"""
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from nominatim.indexer.place_info import PlaceInfo
|
||||||
|
from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
|
||||||
|
from nominatim.tools.country_info import setup_country_config
|
||||||
|
|
||||||
|
class TestWithDefaults:
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def run_sanitizer_on(country, **kwargs):
|
||||||
|
place = PlaceInfo({'name': {k.replace('_', ':'): v for k, v in kwargs.items()},
|
||||||
|
'country_code': country})
|
||||||
|
name, _ = PlaceSanitizer([{'step': 'tag-analyzer-by-language'}]).process_names(place)
|
||||||
|
|
||||||
|
return sorted([(p.name, p.kind, p.suffix, p.attr) for p in name])
|
||||||
|
|
||||||
|
|
||||||
|
def test_no_names(self):
|
||||||
|
assert self.run_sanitizer_on('de') == []
|
||||||
|
|
||||||
|
|
||||||
|
def test_simple(self):
|
||||||
|
res = self.run_sanitizer_on('fr', name='Foo',name_de='Zoo', ref_abc='M')
|
||||||
|
|
||||||
|
assert res == [('Foo', 'name', None, {}),
|
||||||
|
('M', 'ref', 'abc', {'analyzer': 'abc'}),
|
||||||
|
('Zoo', 'name', 'de', {'analyzer': 'de'})]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('suffix', ['DE', 'asbc'])
|
||||||
|
def test_illegal_suffix(self, suffix):
|
||||||
|
assert self.run_sanitizer_on('fr', **{'name_' + suffix: 'Foo'}) \
|
||||||
|
== [('Foo', 'name', suffix, {})]
|
||||||
|
|
||||||
|
|
||||||
|
class TestFilterKind:
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def run_sanitizer_on(filt, **kwargs):
|
||||||
|
place = PlaceInfo({'name': {k.replace('_', ':'): v for k, v in kwargs.items()},
|
||||||
|
'country_code': 'de'})
|
||||||
|
name, _ = PlaceSanitizer([{'step': 'tag-analyzer-by-language',
|
||||||
|
'filter-kind': filt}]).process_names(place)
|
||||||
|
|
||||||
|
return sorted([(p.name, p.kind, p.suffix, p.attr) for p in name])
|
||||||
|
|
||||||
|
|
||||||
|
def test_single_exact_name(self):
|
||||||
|
res = self.run_sanitizer_on(['name'], name_fr='A', ref_fr='12',
|
||||||
|
shortname_fr='C', name='D')
|
||||||
|
|
||||||
|
assert res == [('12', 'ref', 'fr', {}),
|
||||||
|
('A', 'name', 'fr', {'analyzer': 'fr'}),
|
||||||
|
('C', 'shortname', 'fr', {}),
|
||||||
|
('D', 'name', None, {})]
|
||||||
|
|
||||||
|
|
||||||
|
def test_single_pattern(self):
|
||||||
|
res = self.run_sanitizer_on(['.*name'],
|
||||||
|
name_fr='A', ref_fr='12', namexx_fr='B',
|
||||||
|
shortname_fr='C', name='D')
|
||||||
|
|
||||||
|
assert res == [('12', 'ref', 'fr', {}),
|
||||||
|
('A', 'name', 'fr', {'analyzer': 'fr'}),
|
||||||
|
('B', 'namexx', 'fr', {}),
|
||||||
|
('C', 'shortname', 'fr', {'analyzer': 'fr'}),
|
||||||
|
('D', 'name', None, {})]
|
||||||
|
|
||||||
|
|
||||||
|
def test_multiple_patterns(self):
|
||||||
|
res = self.run_sanitizer_on(['.*name', 'ref'],
|
||||||
|
name_fr='A', ref_fr='12', oldref_fr='X',
|
||||||
|
namexx_fr='B', shortname_fr='C', name='D')
|
||||||
|
|
||||||
|
assert res == [('12', 'ref', 'fr', {'analyzer': 'fr'}),
|
||||||
|
('A', 'name', 'fr', {'analyzer': 'fr'}),
|
||||||
|
('B', 'namexx', 'fr', {}),
|
||||||
|
('C', 'shortname', 'fr', {'analyzer': 'fr'}),
|
||||||
|
('D', 'name', None, {}),
|
||||||
|
('X', 'oldref', 'fr', {})]
|
||||||
|
|
||||||
|
|
||||||
|
class TestDefaultCountry:
|
||||||
|
|
||||||
|
@pytest.fixture(autouse=True)
|
||||||
|
def setup_country(self, def_config):
|
||||||
|
setup_country_config(def_config)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def run_sanitizer_append(mode, country, **kwargs):
|
||||||
|
place = PlaceInfo({'name': {k.replace('_', ':'): v for k, v in kwargs.items()},
|
||||||
|
'country_code': country})
|
||||||
|
name, _ = PlaceSanitizer([{'step': 'tag-analyzer-by-language',
|
||||||
|
'use-defaults': mode,
|
||||||
|
'mode': 'append'}]).process_names(place)
|
||||||
|
|
||||||
|
assert all(isinstance(p.attr, dict) for p in name)
|
||||||
|
assert all(len(p.attr) <= 1 for p in name)
|
||||||
|
assert all(not p.attr or ('analyzer' in p.attr and p.attr['analyzer'])
|
||||||
|
for p in name)
|
||||||
|
|
||||||
|
return sorted([(p.name, p.attr.get('analyzer', '')) for p in name])
|
||||||
|
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def run_sanitizer_replace(mode, country, **kwargs):
|
||||||
|
place = PlaceInfo({'name': {k.replace('_', ':'): v for k, v in kwargs.items()},
|
||||||
|
'country_code': country})
|
||||||
|
name, _ = PlaceSanitizer([{'step': 'tag-analyzer-by-language',
|
||||||
|
'use-defaults': mode,
|
||||||
|
'mode': 'replace'}]).process_names(place)
|
||||||
|
|
||||||
|
assert all(isinstance(p.attr, dict) for p in name)
|
||||||
|
assert all(len(p.attr) <= 1 for p in name)
|
||||||
|
assert all(not p.attr or ('analyzer' in p.attr and p.attr['analyzer'])
|
||||||
|
for p in name)
|
||||||
|
|
||||||
|
return sorted([(p.name, p.attr.get('analyzer', '')) for p in name])
|
||||||
|
|
||||||
|
|
||||||
|
def test_missing_country(self):
|
||||||
|
place = PlaceInfo({'name': {'name': 'something'}})
|
||||||
|
name, _ = PlaceSanitizer([{'step': 'tag-analyzer-by-language',
|
||||||
|
'use-defaults': 'all',
|
||||||
|
'mode': 'replace'}]).process_names(place)
|
||||||
|
|
||||||
|
assert len(name) == 1
|
||||||
|
assert name[0].name == 'something'
|
||||||
|
assert name[0].suffix is None
|
||||||
|
assert 'analyzer' not in name[0].attr
|
||||||
|
|
||||||
|
|
||||||
|
def test_mono_unknown_country(self):
|
||||||
|
expect = [('XX', '')]
|
||||||
|
|
||||||
|
assert self.run_sanitizer_replace('mono', 'xx', name='XX') == expect
|
||||||
|
assert self.run_sanitizer_append('mono', 'xx', name='XX') == expect
|
||||||
|
|
||||||
|
|
||||||
|
def test_mono_monoling_replace(self):
|
||||||
|
res = self.run_sanitizer_replace('mono', 'de', name='Foo')
|
||||||
|
|
||||||
|
assert res == [('Foo', 'de')]
|
||||||
|
|
||||||
|
|
||||||
|
def test_mono_monoling_append(self):
|
||||||
|
res = self.run_sanitizer_append('mono', 'de', name='Foo')
|
||||||
|
|
||||||
|
assert res == [('Foo', ''), ('Foo', 'de')]
|
||||||
|
|
||||||
|
|
||||||
|
def test_mono_multiling(self):
|
||||||
|
expect = [('XX', '')]
|
||||||
|
|
||||||
|
assert self.run_sanitizer_replace('mono', 'ch', name='XX') == expect
|
||||||
|
assert self.run_sanitizer_append('mono', 'ch', name='XX') == expect
|
||||||
|
|
||||||
|
|
||||||
|
def test_all_unknown_country(self):
|
||||||
|
expect = [('XX', '')]
|
||||||
|
|
||||||
|
assert self.run_sanitizer_replace('all', 'xx', name='XX') == expect
|
||||||
|
assert self.run_sanitizer_append('all', 'xx', name='XX') == expect
|
||||||
|
|
||||||
|
|
||||||
|
def test_all_monoling_replace(self):
|
||||||
|
res = self.run_sanitizer_replace('all', 'de', name='Foo')
|
||||||
|
|
||||||
|
assert res == [('Foo', 'de')]
|
||||||
|
|
||||||
|
|
||||||
|
def test_all_monoling_append(self):
|
||||||
|
res = self.run_sanitizer_append('all', 'de', name='Foo')
|
||||||
|
|
||||||
|
assert res == [('Foo', ''), ('Foo', 'de')]
|
||||||
|
|
||||||
|
|
||||||
|
def test_all_multiling_append(self):
|
||||||
|
res = self.run_sanitizer_append('all', 'ch', name='XX')
|
||||||
|
|
||||||
|
assert res == [('XX', ''),
|
||||||
|
('XX', 'de'), ('XX', 'fr'), ('XX', 'it'), ('XX', 'rm')]
|
||||||
|
|
||||||
|
|
||||||
|
def test_all_multiling_replace(self):
|
||||||
|
res = self.run_sanitizer_replace('all', 'ch', name='XX')
|
||||||
|
|
||||||
|
assert res == [('XX', 'de'), ('XX', 'fr'), ('XX', 'it'), ('XX', 'rm')]
|
||||||
|
|
||||||
|
|
||||||
|
class TestCountryWithWhitelist:
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def run_sanitizer_on(mode, country, **kwargs):
|
||||||
|
place = PlaceInfo({'name': {k.replace('_', ':'): v for k, v in kwargs.items()},
|
||||||
|
'country_code': country})
|
||||||
|
name, _ = PlaceSanitizer([{'step': 'tag-analyzer-by-language',
|
||||||
|
'use-defaults': mode,
|
||||||
|
'mode': 'replace',
|
||||||
|
'whitelist': ['de', 'fr', 'ru']}]).process_names(place)
|
||||||
|
|
||||||
|
assert all(isinstance(p.attr, dict) for p in name)
|
||||||
|
assert all(len(p.attr) <= 1 for p in name)
|
||||||
|
assert all(not p.attr or ('analyzer' in p.attr and p.attr['analyzer'])
|
||||||
|
for p in name)
|
||||||
|
|
||||||
|
return sorted([(p.name, p.attr.get('analyzer', '')) for p in name])
|
||||||
|
|
||||||
|
|
||||||
|
def test_mono_monoling(self):
|
||||||
|
assert self.run_sanitizer_on('mono', 'de', name='Foo') == [('Foo', 'de')]
|
||||||
|
assert self.run_sanitizer_on('mono', 'pt', name='Foo') == [('Foo', '')]
|
||||||
|
|
||||||
|
|
||||||
|
def test_mono_multiling(self):
|
||||||
|
assert self.run_sanitizer_on('mono', 'ca', name='Foo') == [('Foo', '')]
|
||||||
|
|
||||||
|
|
||||||
|
def test_all_monoling(self):
|
||||||
|
assert self.run_sanitizer_on('all', 'de', name='Foo') == [('Foo', 'de')]
|
||||||
|
assert self.run_sanitizer_on('all', 'pt', name='Foo') == [('Foo', '')]
|
||||||
|
|
||||||
|
|
||||||
|
def test_all_multiling(self):
|
||||||
|
assert self.run_sanitizer_on('all', 'ca', name='Foo') == [('Foo', 'fr')]
|
||||||
|
assert self.run_sanitizer_on('all', 'ch', name='Foo') \
|
||||||
|
== [('Foo', 'de'), ('Foo', 'fr')]
|
||||||
|
|
||||||
|
|
||||||
|
class TestWhiteList:
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def run_sanitizer_on(whitelist, **kwargs):
|
||||||
|
place = PlaceInfo({'name': {k.replace('_', ':'): v for k, v in kwargs.items()}})
|
||||||
|
name, _ = PlaceSanitizer([{'step': 'tag-analyzer-by-language',
|
||||||
|
'mode': 'replace',
|
||||||
|
'whitelist': whitelist}]).process_names(place)
|
||||||
|
|
||||||
|
assert all(isinstance(p.attr, dict) for p in name)
|
||||||
|
assert all(len(p.attr) <= 1 for p in name)
|
||||||
|
assert all(not p.attr or ('analyzer' in p.attr and p.attr['analyzer'])
|
||||||
|
for p in name)
|
||||||
|
|
||||||
|
return sorted([(p.name, p.attr.get('analyzer', '')) for p in name])
|
||||||
|
|
||||||
|
|
||||||
|
def test_in_whitelist(self):
|
||||||
|
assert self.run_sanitizer_on(['de', 'xx'], ref_xx='123') == [('123', 'xx')]
|
||||||
|
|
||||||
|
|
||||||
|
def test_not_in_whitelist(self):
|
||||||
|
assert self.run_sanitizer_on(['de', 'xx'], ref_yy='123') == [('123', '')]
|
||||||
|
|
||||||
|
|
||||||
|
def test_empty_whitelist(self):
|
||||||
|
assert self.run_sanitizer_on([], ref_yy='123') == [('123', '')]
|
||||||
Reference in New Issue
Block a user