enable flake for Python tests
@@ -2,7 +2,7 @@
 #
 # This file is part of Nominatim. (https://nominatim.org)
 #
-# Copyright (C) 2024 by the Nominatim developer community.
+# Copyright (C) 2025 by the Nominatim developer community.
 # For a full list of authors see the git log.
 """
 Tests for the sanitizer that normalizes housenumbers.
@@ -12,11 +12,12 @@ import pytest
 from nominatim_db.tokenizer.place_sanitizer import PlaceSanitizer
 from nominatim_db.data.place_info import PlaceInfo

+
 @pytest.fixture
 def sanitize(request, def_config):
     sanitizer_args = {'step': 'clean-housenumbers'}
     for mark in request.node.iter_markers(name="sanitizer_params"):
-        sanitizer_args.update({k.replace('_', '-') : v for k,v in mark.kwargs.items()})
+        sanitizer_args.update({k.replace('_', '-'): v for k, v in mark.kwargs.items()})

     def _run(**kwargs):
         place = PlaceInfo({'address': kwargs})

@@ -2,7 +2,7 @@
 #
 # This file is part of Nominatim. (https://nominatim.org)
 #
-# Copyright (C) 2024 by the Nominatim developer community.
+# Copyright (C) 2025 by the Nominatim developer community.
 # For a full list of authors see the git log.
 """
 Tests for the sanitizer that normalizes postcodes.
@@ -13,12 +13,13 @@ from nominatim_db.tokenizer.place_sanitizer import PlaceSanitizer
 from nominatim_db.data.place_info import PlaceInfo
 from nominatim_db.data import country_info

+
 @pytest.fixture
 def sanitize(def_config, request):
     country_info.setup_country_config(def_config)
     sanitizer_args = {'step': 'clean-postcodes'}
     for mark in request.node.iter_markers(name="sanitizer_params"):
-        sanitizer_args.update({k.replace('_', '-') : v for k,v in mark.kwargs.items()})
+        sanitizer_args.update({k.replace('_', '-'): v for k, v in mark.kwargs.items()})

     def _run(country=None, **kwargs):
         pi = {'address': kwargs}

@@ -2,7 +2,7 @@
 #
 # This file is part of Nominatim. (https://nominatim.org)
 #
-# Copyright (C) 2024 by the Nominatim developer community.
+# Copyright (C) 2025 by the Nominatim developer community.
 # For a full list of authors see the git log.
 """
 Tests for sanitizer that clean up TIGER tags.
@@ -12,16 +12,17 @@ import pytest
 from nominatim_db.tokenizer.place_sanitizer import PlaceSanitizer
 from nominatim_db.data.place_info import PlaceInfo


 class TestCleanTigerTags:

     @pytest.fixture(autouse=True)
     def setup_country(self, def_config):
         self.config = def_config


     def run_sanitizer_on(self, addr):
         place = PlaceInfo({'address': addr})
-        _, outaddr = PlaceSanitizer([{'step': 'clean-tiger-tags'}], self.config).process_names(place)
+        _, outaddr = PlaceSanitizer([{'step': 'clean-tiger-tags'}],
+                                    self.config).process_names(place)

         return sorted([(p.name, p.kind, p.suffix) for p in outaddr])
@@ -31,13 +32,11 @@ class TestCleanTigerTags:
         assert self.run_sanitizer_on({'tiger:county': inname})\
             == [(outname, 'county', 'tiger')]

-
     @pytest.mark.parametrize('name', ('Hamilton', 'Big, Road', ''))
     def test_badly_formatted(self, name):
         assert self.run_sanitizer_on({'tiger:county': name})\
             == [(name, 'county', 'tiger')]

-
     def test_unmatched(self):
         assert self.run_sanitizer_on({'tiger:country': 'US'})\
             == [('US', 'tiger', 'country')]

@@ -2,7 +2,7 @@
 #
 # This file is part of Nominatim. (https://nominatim.org)
 #
-# Copyright (C) 2024 by the Nominatim developer community.
+# Copyright (C) 2025 by the Nominatim developer community.
 # For a full list of authors see the git log.
 """
 Tests for the sanitizer that normalizes housenumbers.
@@ -22,18 +22,15 @@ class TestWithDefault:
     def run_sanitizer_on(self, type, **kwargs):

         place = PlaceInfo({type: {k.replace('_', ':'): v for k, v in kwargs.items()},
-                            'country_code': 'de', 'rank_address': 30})
+                           'country_code': 'de', 'rank_address': 30})

         sanitizer_args = {'step': 'delete-tags'}

         name, address = PlaceSanitizer([sanitizer_args],
-                        self.config).process_names(place)
-
-        return {
-            'name': sorted([(p.name, p.kind, p.suffix or '') for p in name]),
-            'address': sorted([(p.name, p.kind, p.suffix or '') for p in address])
-        }
+                                       self.config).process_names(place)
+
+        return {'name': sorted([(p.name, p.kind, p.suffix or '') for p in name]),
+                'address': sorted([(p.name, p.kind, p.suffix or '') for p in address])}

     def test_on_name(self):
         res = self.run_sanitizer_on('name', name='foo', ref='bar', ref_abc='baz')
@@ -44,7 +41,7 @@ class TestWithDefault:
         res = self.run_sanitizer_on('address', name='foo', ref='bar', ref_abc='baz')

         assert res.get('address') == [('bar', 'ref', ''), ('baz', 'ref', 'abc'),
-                                       ('foo', 'name', '')]
+                                      ('foo', 'name', '')]


 class TestTypeField:
@@ -56,15 +53,13 @@ class TestTypeField:
     def run_sanitizer_on(self, type, **kwargs):

         place = PlaceInfo({'name': {k.replace('_', ':'): v for k, v in kwargs.items()},
-                            'country_code': 'de', 'rank_address': 30})
+                           'country_code': 'de', 'rank_address': 30})

-        sanitizer_args = {
-            'step': 'delete-tags',
-            'type': type,
-        }
+        sanitizer_args = {'step': 'delete-tags',
+                          'type': type}

         name, _ = PlaceSanitizer([sanitizer_args],
-                        self.config).process_names(place)
+                                 self.config).process_names(place)

         return sorted([(p.name, p.kind, p.suffix or '') for p in name])

@@ -77,7 +72,8 @@ class TestTypeField:
         res = self.run_sanitizer_on('address', name='foo', ref='bar', ref_abc='baz')

         assert res == [('bar', 'ref', ''), ('baz', 'ref', 'abc'),
-                        ('foo', 'name', '')]
+                       ('foo', 'name', '')]


 class TestFilterKind:

@@ -88,15 +84,13 @@ class TestFilterKind:
     def run_sanitizer_on(self, filt, **kwargs):

         place = PlaceInfo({'name': {k.replace('_', ':'): v for k, v in kwargs.items()},
-                            'country_code': 'de', 'rank_address': 30})
+                           'country_code': 'de', 'rank_address': 30})

-        sanitizer_args = {
-            'step': 'delete-tags',
-            'filter-kind': filt,
-        }
+        sanitizer_args = {'step': 'delete-tags',
+                          'filter-kind': filt}

         name, _ = PlaceSanitizer([sanitizer_args],
-                        self.config).process_names(place)
+                                 self.config).process_names(place)

         return sorted([(p.name, p.kind, p.suffix or '') for p in name])

@@ -106,7 +100,6 @@ class TestFilterKind:

         assert res == [('bar', 'ref', 'abc'), ('foo', 'ref', '')]

-
     def test_single_pattern(self):
         res = self.run_sanitizer_on(['.*name'],
                                     name_fr='foo', ref_fr='foo', namexx_fr='bar',
@@ -114,7 +107,6 @@ class TestFilterKind:

         assert res == [('bar', 'namexx', 'fr'), ('foo', 'ref', 'fr')]

-
     def test_multiple_patterns(self):
         res = self.run_sanitizer_on(['.*name', 'ref'],
                                     name_fr='foo', ref_fr='foo', oldref_fr='foo',
@@ -132,19 +124,16 @@ class TestRankAddress:
     def run_sanitizer_on(self, rank_addr, **kwargs):

         place = PlaceInfo({'name': {k.replace('_', ':'): v for k, v in kwargs.items()},
-                            'country_code': 'de', 'rank_address': 30})
+                           'country_code': 'de', 'rank_address': 30})

-        sanitizer_args = {
-            'step': 'delete-tags',
-            'rank_address': rank_addr
-        }
+        sanitizer_args = {'step': 'delete-tags',
+                          'rank_address': rank_addr}

         name, _ = PlaceSanitizer([sanitizer_args],
-                        self.config).process_names(place)
+                                 self.config).process_names(place)

         return sorted([(p.name, p.kind, p.suffix or '') for p in name])

-
     def test_single_rank(self):
         res = self.run_sanitizer_on('30', name='foo', ref='bar')

@@ -185,33 +174,29 @@ class TestSuffix:
     def run_sanitizer_on(self, suffix, **kwargs):

         place = PlaceInfo({'name': {k.replace('_', ':'): v for k, v in kwargs.items()},
-                            'country_code': 'de', 'rank_address': 30})
+                           'country_code': 'de', 'rank_address': 30})

-        sanitizer_args = {
-            'step': 'delete-tags',
-            'suffix': suffix,
-        }
+        sanitizer_args = {'step': 'delete-tags',
+                          'suffix': suffix}

         name, _ = PlaceSanitizer([sanitizer_args],
-                        self.config).process_names(place)
+                                 self.config).process_names(place)

         return sorted([(p.name, p.kind, p.suffix or '') for p in name])

-
     def test_single_suffix(self):
         res = self.run_sanitizer_on('abc', name='foo', name_abc='foo',
-                                     name_pqr='bar', ref='bar', ref_abc='baz')
+                                    name_pqr='bar', ref='bar', ref_abc='baz')

         assert res == [('bar', 'name', 'pqr'), ('bar', 'ref', ''), ('foo', 'name', '')]

     def test_multiple_suffix(self):
         res = self.run_sanitizer_on(['abc.*', 'pqr'], name='foo', name_abcxx='foo',
-                                     ref_pqr='bar', name_pqrxx='baz')
+                                    ref_pqr='bar', name_pqrxx='baz')

         assert res == [('baz', 'name', 'pqrxx'), ('foo', 'name', '')]

-

 class TestCountryCodes:

     @pytest.fixture(autouse=True)
@@ -221,19 +206,16 @@ class TestCountryCodes:
     def run_sanitizer_on(self, country_code, **kwargs):

         place = PlaceInfo({'name': {k.replace('_', ':'): v for k, v in kwargs.items()},
-                            'country_code': 'de', 'rank_address': 30})
+                           'country_code': 'de', 'rank_address': 30})

-        sanitizer_args = {
-            'step': 'delete-tags',
-            'country_code': country_code,
-        }
+        sanitizer_args = {'step': 'delete-tags',
+                          'country_code': country_code}

         name, _ = PlaceSanitizer([sanitizer_args],
-                        self.config).process_names(place)
+                                 self.config).process_names(place)

         return sorted([(p.name, p.kind) for p in name])

-
     def test_single_country_code_pass(self):
         res = self.run_sanitizer_on('de', name='foo', ref='bar')

@@ -259,6 +241,7 @@ class TestCountryCodes:

         assert res == [('bar', 'ref'), ('foo', 'name')]

+
 class TestAllParameters:

     @pytest.fixture(autouse=True)
@@ -268,7 +251,7 @@ class TestAllParameters:
     def run_sanitizer_on(self, country_code, rank_addr, suffix, **kwargs):

         place = PlaceInfo({'name': {k.replace('_', ':'): v for k, v in kwargs.items()},
-                            'country_code': 'de', 'rank_address': 30})
+                           'country_code': 'de', 'rank_address': 30})

         sanitizer_args = {
             'step': 'delete-tags',
@@ -281,11 +264,10 @@ class TestAllParameters:
         }

         name, _ = PlaceSanitizer([sanitizer_args],
-                        self.config).process_names(place)
+                                 self.config).process_names(place)

         return sorted([(p.name, p.kind, p.suffix or '') for p in name])

-
     def test_string_arguments_pass(self):
         res = self.run_sanitizer_on('de', '25-30', r'[\s\S]*',
                                     name='foo', ref='foo', name_abc='bar', ref_abc='baz')

@@ -2,7 +2,7 @@
 #
 # This file is part of Nominatim. (https://nominatim.org)
 #
-# Copyright (C) 2024 by the Nominatim developer community.
+# Copyright (C) 2025 by the Nominatim developer community.
 # For a full list of authors see the git log.
 """
 Tests for sanitizer configuration helper functions.
@@ -12,6 +12,7 @@ import pytest
 from nominatim_db.errors import UsageError
 from nominatim_db.tokenizer.sanitizers.config import SanitizerConfig

+
 def test_string_list_default_empty():
     assert SanitizerConfig().get_string_list('op') == []

@@ -53,7 +54,7 @@ def test_create_split_regex_no_params_unsplit(inp):
                          ('ying;;yang', ['ying', 'yang']),
                          (';a; ;c;d,', ['', 'a', '', 'c', 'd', '']),
                          ('1, 3 ,5', ['1', '3', '5'])
-                        ])
+                         ])
 def test_create_split_regex_no_params_split(inp, outp):
     regex = SanitizerConfig().get_delimiter()

@@ -70,7 +71,7 @@ def test_create_split_regex_custom(delimiter):

 def test_create_split_regex_empty_delimiter():
     with pytest.raises(UsageError):
-        regex = SanitizerConfig({'delimiters': ''}).get_delimiter()
+        SanitizerConfig({'delimiters': ''}).get_delimiter()


 @pytest.mark.parametrize('inp', ('name', 'name:de', 'na\\me', '.*', ''))
@@ -96,12 +97,12 @@ def test_create_name_filter_no_param_default_fail_all(inp):

 def test_create_name_filter_no_param_default_invalid_string():
     with pytest.raises(ValueError):
-        filt = SanitizerConfig().get_filter('name', 'abc')
+        SanitizerConfig().get_filter('name', 'abc')


 def test_create_name_filter_no_param_default_empty_list():
     with pytest.raises(ValueError):
-        filt = SanitizerConfig().get_filter('name', [])
+        SanitizerConfig().get_filter('name', [])


 @pytest.mark.parametrize('kind', ('de', 'name:de', 'ende'))
@@ -121,7 +122,7 @@ def test_create_kind_filter_default_negetive(kind):
 @pytest.mark.parametrize('kind', ('lang', 'lang:de', 'langxx'))
 def test_create_kind_filter_custom_regex_positive(kind):
     filt = SanitizerConfig({'filter-kind': 'lang.*'}
-                          ).get_filter('filter-kind', ['.*fr'])
+                           ).get_filter('filter-kind', ['.*fr'])

     assert filt(kind)

@@ -136,7 +137,7 @@ def test_create_kind_filter_custom_regex_negative(kind):
 @pytest.mark.parametrize('kind', ('name', 'fr', 'name:fr', 'frfr', '34'))
 def test_create_kind_filter_many_positive(kind):
     filt = SanitizerConfig({'filter-kind': ['.*fr', 'name', r'\d+']}
-                          ).get_filter('filter-kind')
+                           ).get_filter('filter-kind')

     assert filt(kind)

@@ -144,6 +145,6 @@ def test_create_kind_filter_many_positive(kind):
 @pytest.mark.parametrize('kind', ('name:de', 'fridge', 'a34', '.*', '\\'))
 def test_create_kind_filter_many_negative(kind):
     filt = SanitizerConfig({'filter-kind': ['.*fr', 'name', r'\d+']}
-                          ).get_filter('filter-kind')
+                           ).get_filter('filter-kind')

     assert not filt(kind)

@@ -2,7 +2,7 @@
 #
 # This file is part of Nominatim. (https://nominatim.org)
 #
-# Copyright (C) 2024 by the Nominatim developer community.
+# Copyright (C) 2025 by the Nominatim developer community.
 # For a full list of authors see the git log.
 """
 Tests for the sanitizer that splits multivalue lists.
@@ -14,20 +14,19 @@ from nominatim_db.data.place_info import PlaceInfo

 from nominatim_db.errors import UsageError


 class TestSplitName:

     @pytest.fixture(autouse=True)
     def setup_country(self, def_config):
         self.config = def_config


     def run_sanitizer_on(self, **kwargs):
         place = PlaceInfo({'name': kwargs})
         name, _ = PlaceSanitizer([{'step': 'split-name-list'}], self.config).process_names(place)

         return sorted([(p.name, p.kind, p.suffix) for p in name])

-
     def sanitize_with_delimiter(self, delimiter, name):
         place = PlaceInfo({'name': {'name': name}})
         san = PlaceSanitizer([{'step': 'split-name-list', 'delimiters': delimiter}],
@@ -36,12 +35,10 @@ class TestSplitName:

         return sorted([p.name for p in name])

-
     def test_simple(self):
         assert self.run_sanitizer_on(name='ABC') == [('ABC', 'name', None)]
         assert self.run_sanitizer_on(name='') == [('', 'name', None)]

-
     def test_splits(self):
         assert self.run_sanitizer_on(name='A;B;C') == [('A', 'name', None),
                                                        ('B', 'name', None),
@@ -49,7 +46,6 @@ class TestSplitName:
         assert self.run_sanitizer_on(short_name=' House, boat ') == [('House', 'short_name', None),
                                                                      ('boat', 'short_name', None)]

-
     def test_empty_fields(self):
         assert self.run_sanitizer_on(name='A;;B') == [('A', 'name', None),
                                                       ('B', 'name', None)]
@@ -58,14 +54,12 @@ class TestSplitName:
         assert self.run_sanitizer_on(name=' ;B') == [('B', 'name', None)]
         assert self.run_sanitizer_on(name='B,') == [('B', 'name', None)]

-
     def test_custom_delimiters(self):
         assert self.sanitize_with_delimiter(':', '12:45,3') == ['12', '45,3']
         assert self.sanitize_with_delimiter('\\', 'a;\\b!#@ \\') == ['a;', 'b!#@']
         assert self.sanitize_with_delimiter('[]', 'foo[to]be') == ['be', 'foo', 'to']
         assert self.sanitize_with_delimiter(' ', 'morning sun') == ['morning', 'sun']

-
     def test_empty_delimiter_set(self):
         with pytest.raises(UsageError):
             self.sanitize_with_delimiter('', 'abc')

@@ -2,7 +2,7 @@
 #
 # This file is part of Nominatim. (https://nominatim.org)
 #
-# Copyright (C) 2024 by the Nominatim developer community.
+# Copyright (C) 2025 by the Nominatim developer community.
 # For a full list of authors see the git log.
 """
 Tests for the sanitizer that handles braced suffixes.
@@ -12,6 +12,7 @@ import pytest
 from nominatim_db.tokenizer.place_sanitizer import PlaceSanitizer
 from nominatim_db.data.place_info import PlaceInfo

+
 class TestStripBrace:

     @pytest.fixture(autouse=True)
@@ -24,23 +25,19 @@ class TestStripBrace:

         return sorted([(p.name, p.kind, p.suffix) for p in name])

-
     def test_no_braces(self):
         assert self.run_sanitizer_on(name='foo', ref='23') == [('23', 'ref', None),
                                                                ('foo', 'name', None)]

-
     def test_simple_braces(self):
-        assert self.run_sanitizer_on(name='Halle (Saale)', ref='3')\
-            == [('3', 'ref', None), ('Halle', 'name', None), ('Halle (Saale)', 'name', None)]
-        assert self.run_sanitizer_on(name='ack ( bar')\
-            == [('ack', 'name', None), ('ack ( bar', 'name', None)]
-
+        assert self.run_sanitizer_on(name='Halle (Saale)', ref='3') \
+            == [('3', 'ref', None), ('Halle', 'name', None), ('Halle (Saale)', 'name', None)]
+        assert self.run_sanitizer_on(name='ack ( bar') \
+            == [('ack', 'name', None), ('ack ( bar', 'name', None)]

     def test_only_braces(self):
         assert self.run_sanitizer_on(name='(maybe)') == [('(maybe)', 'name', None)]

-
     def test_double_braces(self):
         assert self.run_sanitizer_on(name='a((b))') == [('a', 'name', None),
                                                         ('a((b))', 'name', None)]

@@ -2,7 +2,7 @@
 #
 # This file is part of Nominatim. (https://nominatim.org)
 #
-# Copyright (C) 2024 by the Nominatim developer community.
+# Copyright (C) 2025 by the Nominatim developer community.
 # For a full list of authors see the git log.
 """
 Tests for the sanitizer that enables language-dependent analyzers.
@@ -13,13 +13,13 @@ from nominatim_db.data.place_info import PlaceInfo
 from nominatim_db.tokenizer.place_sanitizer import PlaceSanitizer
 from nominatim_db.data.country_info import setup_country_config


 class TestWithDefaults:

     @pytest.fixture(autouse=True)
     def setup_country(self, def_config):
         self.config = def_config


     def run_sanitizer_on(self, country, **kwargs):
         place = PlaceInfo({'name': {k.replace('_', ':'): v for k, v in kwargs.items()},
                            'country_code': country})
@@ -28,19 +28,16 @@ class TestWithDefaults:

         return sorted([(p.name, p.kind, p.suffix, p.attr) for p in name])

-
     def test_no_names(self):
         assert self.run_sanitizer_on('de') == []

-
     def test_simple(self):
-        res = self.run_sanitizer_on('fr', name='Foo',name_de='Zoo', ref_abc='M')
+        res = self.run_sanitizer_on('fr', name='Foo', name_de='Zoo', ref_abc='M')

         assert res == [('Foo', 'name', None, {}),
                        ('M', 'ref', 'abc', {'analyzer': 'abc'}),
                        ('Zoo', 'name', 'de', {'analyzer': 'de'})]

-
     @pytest.mark.parametrize('suffix', ['DE', 'asbc'])
     def test_illegal_suffix(self, suffix):
         assert self.run_sanitizer_on('fr', **{'name_' + suffix: 'Foo'}) \
@@ -53,7 +50,6 @@ class TestFilterKind:
     def setup_country(self, def_config):
         self.config = def_config

-
     def run_sanitizer_on(self, filt, **kwargs):
         place = PlaceInfo({'name': {k.replace('_', ':'): v for k, v in kwargs.items()},
                            'country_code': 'de'})
@@ -63,17 +59,15 @@ class TestFilterKind:

         return sorted([(p.name, p.kind, p.suffix, p.attr) for p in name])

-
     def test_single_exact_name(self):
         res = self.run_sanitizer_on(['name'], name_fr='A', ref_fr='12',
-                                     shortname_fr='C', name='D')
+                                    shortname_fr='C', name='D')

         assert res == [('12', 'ref', 'fr', {}),
                        ('A', 'name', 'fr', {'analyzer': 'fr'}),
                        ('C', 'shortname', 'fr', {}),
                        ('D', 'name', None, {})]

-
     def test_single_pattern(self):
         res = self.run_sanitizer_on(['.*name'],
                                     name_fr='A', ref_fr='12', namexx_fr='B',
@@ -85,7 +79,6 @@ class TestFilterKind:
                        ('C', 'shortname', 'fr', {'analyzer': 'fr'}),
                        ('D', 'name', None, {})]

-
     def test_multiple_patterns(self):
         res = self.run_sanitizer_on(['.*name', 'ref'],
                                     name_fr='A', ref_fr='12', oldref_fr='X',
@@ -106,7 +99,6 @@ class TestDefaultCountry:
         setup_country_config(def_config)
         self.config = def_config

-
     def run_sanitizer_append(self, mode, country, **kwargs):
         place = PlaceInfo({'name': {k.replace('_', ':'): v for k, v in kwargs.items()},
                            'country_code': country})
@@ -122,7 +114,6 @@ class TestDefaultCountry:

         return sorted([(p.name, p.attr.get('analyzer', '')) for p in name])

-
     def run_sanitizer_replace(self, mode, country, **kwargs):
         place = PlaceInfo({'name': {k.replace('_', ':'): v for k, v in kwargs.items()},
                            'country_code': country})
@@ -138,7 +129,6 @@ class TestDefaultCountry:

         return sorted([(p.name, p.attr.get('analyzer', '')) for p in name])

-
     def test_missing_country(self):
         place = PlaceInfo({'name': {'name': 'something'}})
         name, _ = PlaceSanitizer([{'step': 'tag-analyzer-by-language',
@@ -151,59 +141,50 @@ class TestDefaultCountry:
         assert name[0].suffix is None
         assert 'analyzer' not in name[0].attr

-
     def test_mono_unknown_country(self):
         expect = [('XX', '')]

         assert self.run_sanitizer_replace('mono', 'xx', name='XX') == expect
         assert self.run_sanitizer_append('mono', 'xx', name='XX') == expect

-
     def test_mono_monoling_replace(self):
         res = self.run_sanitizer_replace('mono', 'de', name='Foo')

         assert res == [('Foo', 'de')]

-
     def test_mono_monoling_append(self):
         res = self.run_sanitizer_append('mono', 'de', name='Foo')

         assert res == [('Foo', ''), ('Foo', 'de')]

-
     def test_mono_multiling(self):
         expect = [('XX', '')]

         assert self.run_sanitizer_replace('mono', 'ch', name='XX') == expect
         assert self.run_sanitizer_append('mono', 'ch', name='XX') == expect

-
     def test_all_unknown_country(self):
         expect = [('XX', '')]

         assert self.run_sanitizer_replace('all', 'xx', name='XX') == expect
         assert self.run_sanitizer_append('all', 'xx', name='XX') == expect

-
     def test_all_monoling_replace(self):
         res = self.run_sanitizer_replace('all', 'de', name='Foo')

         assert res == [('Foo', 'de')]

-
     def test_all_monoling_append(self):
         res = self.run_sanitizer_append('all', 'de', name='Foo')

         assert res == [('Foo', ''), ('Foo', 'de')]

-
     def test_all_multiling_append(self):
         res = self.run_sanitizer_append('all', 'ch', name='XX')

         assert res == [('XX', ''),
                        ('XX', 'de'), ('XX', 'fr'), ('XX', 'it'), ('XX', 'rm')]

-
     def test_all_multiling_replace(self):
         res = self.run_sanitizer_replace('all', 'ch', name='XX')

@@ -216,7 +197,6 @@ class TestCountryWithWhitelist:
     def setup_country(self, def_config):
         self.config = def_config

-
     def run_sanitizer_on(self, mode, country, **kwargs):
         place = PlaceInfo({'name': {k.replace('_', ':'): v for k, v in kwargs.items()},
                            'country_code': country})
@@ -233,21 +213,17 @@ class TestCountryWithWhitelist:

         return sorted([(p.name, p.attr.get('analyzer', '')) for p in name])

-
     def test_mono_monoling(self):
         assert self.run_sanitizer_on('mono', 'de', name='Foo') == [('Foo', 'de')]
         assert self.run_sanitizer_on('mono', 'pt', name='Foo') == [('Foo', '')]

-
     def test_mono_multiling(self):
         assert self.run_sanitizer_on('mono', 'ca', name='Foo') == [('Foo', '')]

-
     def test_all_monoling(self):
         assert self.run_sanitizer_on('all', 'de', name='Foo') == [('Foo', 'de')]
         assert self.run_sanitizer_on('all', 'pt', name='Foo') == [('Foo', '')]

-
     def test_all_multiling(self):
         assert self.run_sanitizer_on('all', 'ca', name='Foo') == [('Foo', 'fr')]
         assert self.run_sanitizer_on('all', 'ch', name='Foo') \
@@ -260,7 +236,6 @@ class TestWhiteList:
     def setup_country(self, def_config):
         self.config = def_config

-
     def run_sanitizer_on(self, whitelist, **kwargs):
         place = PlaceInfo({'name': {k.replace('_', ':'): v for k, v in kwargs.items()}})
         name, _ = PlaceSanitizer([{'step': 'tag-analyzer-by-language',
@@ -275,14 +250,11 @@ class TestWhiteList:

         return sorted([(p.name, p.attr.get('analyzer', '')) for p in name])

-
     def test_in_whitelist(self):
         assert self.run_sanitizer_on(['de', 'xx'], ref_xx='123') == [('123', 'xx')]

-
     def test_not_in_whitelist(self):
         assert self.run_sanitizer_on(['de', 'xx'], ref_yy='123') == [('123', '')]

-
     def test_empty_whitelist(self):
         assert self.run_sanitizer_on([], ref_yy='123') == [('123', '')]

@@ -2,86 +2,86 @@
 #
 # This file is part of Nominatim. (https://nominatim.org)
 #
-# Copyright (C) 2024 by the Nominatim developer community.
+# Copyright (C) 2025 by the Nominatim developer community.
 # For a full list of authors see the git log.
-from typing import Mapping, Optional, List
 import pytest

 from nominatim_db.data.place_info import PlaceInfo
 from nominatim_db.data.place_name import PlaceName
 from nominatim_db.tokenizer.place_sanitizer import PlaceSanitizer


 class TestTagJapanese:
     @pytest.fixture(autouse=True)
     def setup_country(self, def_config):
         self.config = def_config

-    def run_sanitizer_on(self,type, **kwargs):
+    def run_sanitizer_on(self, type, **kwargs):
         place = PlaceInfo({
             'address': kwargs,
             'country_code': 'jp'
         })
         sanitizer_args = {'step': 'tag-japanese'}
         _, address = PlaceSanitizer([sanitizer_args], self.config).process_names(place)
-        tmp_list = [(p.name,p.kind) for p in address]
+        tmp_list = [(p.name, p.kind) for p in address]
         return sorted(tmp_list)

     def test_on_address(self):
         res = self.run_sanitizer_on('address', name='foo', ref='bar', ref_abc='baz')
-        assert res == [('bar','ref'),('baz','ref_abc'),('foo','name')]
+        assert res == [('bar', 'ref'), ('baz', 'ref_abc'), ('foo', 'name')]

     def test_housenumber(self):
         res = self.run_sanitizer_on('address', housenumber='2')
-        assert res == [('2','housenumber')]
+        assert res == [('2', 'housenumber')]

     def test_blocknumber(self):
         res = self.run_sanitizer_on('address', block_number='6')
-        assert res == [('6','housenumber')]
+        assert res == [('6', 'housenumber')]

     def test_neighbourhood(self):
         res = self.run_sanitizer_on('address', neighbourhood='8')
-        assert res == [('8','place')]
+        assert res == [('8', 'place')]

     def test_quarter(self):
         res = self.run_sanitizer_on('address', quarter='kase')
-        assert res==[('kase','place')]
+        assert res == [('kase', 'place')]

     def test_housenumber_blocknumber(self):
         res = self.run_sanitizer_on('address', housenumber='2', block_number='6')
-        assert res == [('6-2','housenumber')]
+        assert res == [('6-2', 'housenumber')]

     def test_quarter_neighbourhood(self):
         res = self.run_sanitizer_on('address', quarter='kase', neighbourhood='8')
-        assert res == [('kase8','place')]
+        assert res == [('kase8', 'place')]

     def test_blocknumber_housenumber_quarter(self):
         res = self.run_sanitizer_on('address', block_number='6', housenumber='2', quarter='kase')
-        assert res == [('6-2','housenumber'),('kase','place')]
+        assert res == [('6-2', 'housenumber'), ('kase', 'place')]

     def test_blocknumber_housenumber_quarter_neighbourhood(self):
         res = self.run_sanitizer_on('address', block_number='6', housenumber='2', neighbourhood='8')
-        assert res == [('6-2','housenumber'),('8','place')]
+        assert res == [('6-2', 'housenumber'), ('8', 'place')]

     def test_blocknumber_quarter_neighbourhood(self):
-        res = self.run_sanitizer_on('address',block_number='6', quarter='kase', neighbourhood='8')
-        assert res == [('6','housenumber'),('kase8','place')]
+        res = self.run_sanitizer_on('address', block_number='6', quarter='kase', neighbourhood='8')
+        assert res == [('6', 'housenumber'), ('kase8', 'place')]

     def test_blocknumber_quarter(self):
-        res = self.run_sanitizer_on('address',block_number='6', quarter='kase')
-        assert res == [('6','housenumber'),('kase','place')]
+        res = self.run_sanitizer_on('address', block_number='6', quarter='kase')
+        assert res == [('6', 'housenumber'), ('kase', 'place')]

     def test_blocknumber_neighbourhood(self):
-        res = self.run_sanitizer_on('address',block_number='6', neighbourhood='8')
-        assert res == [('6','housenumber'),('8','place')]
+        res = self.run_sanitizer_on('address', block_number='6', neighbourhood='8')
+        assert res == [('6', 'housenumber'), ('8', 'place')]

     def test_housenumber_quarter_neighbourhood(self):
-        res = self.run_sanitizer_on('address',housenumber='2', quarter='kase', neighbourhood='8')
-        assert res == [('2','housenumber'),('kase8','place')]
+        res = self.run_sanitizer_on('address', housenumber='2', quarter='kase', neighbourhood='8')
+        assert res == [('2', 'housenumber'), ('kase8', 'place')]

     def test_housenumber_quarter(self):
-        res = self.run_sanitizer_on('address',housenumber='2', quarter='kase')
-        assert res == [('2','housenumber'),('kase','place')]
+        res = self.run_sanitizer_on('address', housenumber='2', quarter='kase')
+        assert res == [('2', 'housenumber'), ('kase', 'place')]

     def test_housenumber_blocknumber_neighbourhood_quarter(self):
-        res = self.run_sanitizer_on('address', block_number='6', housenumber='2', quarter='kase', neighbourhood='8')
-        assert res == [('6-2','housenumber'),('kase8','place')]
+        res = self.run_sanitizer_on('address', block_number='6', housenumber='2',
+                                    quarter='kase', neighbourhood='8')
+        assert res == [('6-2', 'housenumber'), ('kase8', 'place')]

@@ -2,7 +2,7 @@
 #
 # This file is part of Nominatim. (https://nominatim.org)
 #
-# Copyright (C) 2024 by the Nominatim developer community.
+# Copyright (C) 2025 by the Nominatim developer community.
 # For a full list of authors see the git log.
 """
 Tests for creating new tokenizers.
@@ -27,7 +27,6 @@ class TestFactory:
     def init_env(self, project_env, property_table, tokenizer_mock):
         self.config = project_env

-
     def test_setup_dummy_tokenizer(self, temp_db_conn):
         tokenizer = factory.create_tokenizer(self.config)

@@ -37,7 +36,6 @@ class TestFactory:

         assert properties.get_property(temp_db_conn, 'tokenizer') == 'dummy'

-
     def test_setup_tokenizer_dir_exists(self):
         (self.config.project_dir / 'tokenizer').mkdir()

@@ -46,14 +44,12 @@ class TestFactory:
         assert isinstance(tokenizer, DummyTokenizer)
         assert tokenizer.init_state == "new"

-
     def test_setup_tokenizer_dir_failure(self):
         (self.config.project_dir / 'tokenizer').write_text("foo")

         with pytest.raises(UsageError):
             factory.create_tokenizer(self.config)

-
     def test_load_tokenizer(self):
         factory.create_tokenizer(self.config)

@@ -62,7 +58,6 @@ class TestFactory:
         assert isinstance(tokenizer, DummyTokenizer)
         assert tokenizer.init_state == "loaded"

-
     def test_load_repopulate_tokenizer_dir(self):
         factory.create_tokenizer(self.config)

@@ -71,7 +66,6 @@ class TestFactory:
         factory.get_tokenizer_for_db(self.config)
         assert (self.config.project_dir / 'tokenizer').exists()

-
     def test_load_missing_property(self, temp_db_cursor):
         factory.create_tokenizer(self.config)

@@ -2,7 +2,7 @@
 #
 # This file is part of Nominatim. (https://nominatim.org)
 #
-# Copyright (C) 2024 by the Nominatim developer community.
+# Copyright (C) 2025 by the Nominatim developer community.
 # For a full list of authors see the git log.
 """
 Tests for ICU tokenizer.
@@ -20,6 +20,7 @@ from nominatim_db.data.place_info import PlaceInfo

 from mock_icu_word_table import MockIcuWordTable

+
 @pytest.fixture
 def word_table(temp_db_conn):
     return MockIcuWordTable(temp_db_conn)
@@ -89,6 +90,7 @@ def analyzer(tokenizer_factory, test_config, monkeypatch,

     return _mk_analyser

+
 @pytest.fixture
 def sql_functions(temp_db_conn, def_config, src_dir):
     orig_sql = def_config.lib_dir.sql
@@ -152,19 +154,19 @@ LANGUAGE plpgsql;
         """)

-

 def test_init_new(tokenizer_factory, test_config, db_prop):
     tok = tokenizer_factory()
     tok.init_new_db(test_config)

-    assert db_prop(nominatim_db.tokenizer.icu_rule_loader.DBCFG_IMPORT_NORM_RULES) \
-        .startswith(':: lower ();')
+    prop = db_prop(nominatim_db.tokenizer.icu_rule_loader.DBCFG_IMPORT_NORM_RULES)
+
+    assert prop.startswith(':: lower ();')


 def test_init_word_table(tokenizer_factory, test_config, place_row, temp_db_cursor):
-    place_row(names={'name' : 'Test Area', 'ref' : '52'})
-    place_row(names={'name' : 'No Area'})
-    place_row(names={'name' : 'Holzstrasse'})
+    place_row(names={'name': 'Test Area', 'ref': '52'})
+    place_row(names={'name': 'No Area'})
+    place_row(names={'name': 'Holzstrasse'})

     tok = tokenizer_factory()
     tok.init_new_db(test_config)
@@ -259,12 +261,10 @@ class TestPostcodes:
         self.analyzer = anl
         yield anl

-
     def process_postcode(self, cc, postcode):
         return self.analyzer.process_place(PlaceInfo({'country_code': cc,
                                                       'address': {'postcode': postcode}}))

-
     def test_update_postcodes_deleted(self, word_table):
         word_table.add_postcode(' 1234', '1234')
         word_table.add_postcode(' 5678', '5678')
@@ -273,20 +273,17 @@ class TestPostcodes:

         assert word_table.count() == 0

-
     def test_process_place_postcode_simple(self, word_table):
         info = self.process_postcode('de', '12345')

         assert info['postcode'] == '12345'

-
     def test_process_place_postcode_with_space(self, word_table):
         info = self.process_postcode('in', '123 567')

         assert info['postcode'] == '123567'

-

 def test_update_special_phrase_empty_table(analyzer, word_table):
     with analyzer() as anl:
         anl.update_special_phrases([
@@ -296,9 +293,9 @@ def test_update_special_phrase_empty_table(analyzer, word_table):
         ], True)

     assert word_table.get_special() \
-               == {('KÖNIG BEI', 'König bei', 'amenity', 'royal', 'near'),
-                   ('KÖNIGE', 'Könige', 'amenity', 'royal', None),
-                   ('STREET', 'street', 'highway', 'primary', 'in')}
+        == {('KÖNIG BEI', 'König bei', 'amenity', 'royal', 'near'),
+            ('KÖNIGE', 'Könige', 'amenity', 'royal', None),
+            ('STREET', 'street', 'highway', 'primary', 'in')}


 def test_update_special_phrase_delete_all(analyzer, word_table):
@@ -339,9 +336,9 @@ def test_update_special_phrase_modify(analyzer, word_table):
         ], True)

     assert word_table.get_special() \
-               == {('PRISON', 'prison', 'amenity', 'prison', 'in'),
-                   ('BAR', 'bar', 'highway', 'road', None),
-                   ('GARDEN', 'garden', 'leisure', 'garden', 'near')}
+        == {('PRISON', 'prison', 'amenity', 'prison', 'in'),
+            ('BAR', 'bar', 'highway', 'road', None),
+            ('GARDEN', 'garden', 'leisure', 'garden', 'near')}


 def test_add_country_names_new(analyzer, word_table):
@@ -370,7 +367,6 @@ class TestPlaceNames:
         self.analyzer = anl
         yield anl

-
     def expect_name_terms(self, info, *expected_terms):
         tokens = self.analyzer.get_word_token_info(expected_terms)
         for token in tokens:
@@ -378,34 +374,29 @@ class TestPlaceNames:

         assert eval(info['names']) == set((t[2] for t in tokens))

-
     def process_named_place(self, names):
         return self.analyzer.process_place(PlaceInfo({'name': names}))

-
     def test_simple_names(self):
         info = self.process_named_place({'name': 'Soft bAr', 'ref': '34'})

         self.expect_name_terms(info, '#Soft bAr', '#34', 'Soft', 'bAr', '34')

-
-    @pytest.mark.parametrize('sep', [',' , ';'])
+    @pytest.mark.parametrize('sep', [',', ';'])
     def test_names_with_separator(self, sep):
         info = self.process_named_place({'name': sep.join(('New York', 'Big Apple'))})

         self.expect_name_terms(info, '#New York', '#Big Apple',
                                'new', 'york', 'big', 'apple')

-
     def test_full_names_with_bracket(self):
         info = self.process_named_place({'name': 'Houseboat (left)'})

         self.expect_name_terms(info, '#Houseboat (left)', '#Houseboat',
                                'houseboat', 'left')

-
     def test_country_name(self, word_table):
-        place = PlaceInfo({'name' : {'name': 'Norge'},
+        place = PlaceInfo({'name': {'name': 'Norge'},
                            'country_code': 'no',
                            'rank_address': 4,
                            'class': 'boundary',
@@ -427,18 +418,15 @@ class TestPlaceAddress:
         self.analyzer = anl
         yield anl

-
     @pytest.fixture
     def getorcreate_hnr_id(self, temp_db_cursor):
         temp_db_cursor.execute("""CREATE OR REPLACE FUNCTION getorcreate_hnr_id(lookup_term TEXT)
                                   RETURNS INTEGER AS $$
                                     SELECT -nextval('seq_word')::INTEGER; $$ LANGUAGE SQL""")

-
     def process_address(self, **kwargs):
         return self.analyzer.process_place(PlaceInfo({'address': kwargs}))

-
     def name_token_set(self, *expected_terms):
         tokens = self.analyzer.get_word_token_info(expected_terms)
         for token in tokens:
@@ -446,14 +434,12 @@ class TestPlaceAddress:

         return set((t[2] for t in tokens))

-
     @pytest.mark.parametrize('pcode', ['12345', 'AB 123', '34-345'])
     def test_process_place_postcode(self, word_table, pcode):
         info = self.process_address(postcode=pcode)

         assert info['postcode'] == pcode

-
     @pytest.mark.parametrize('hnr', ['123a', '1', '101'])
     def test_process_place_housenumbers_simple(self, hnr, getorcreate_hnr_id):
         info = self.process_address(housenumber=hnr)
@@ -461,7 +447,6 @@ class TestPlaceAddress:
         assert info['hnr'] == hnr.upper()
         assert info['hnr_tokens'] == "{-1}"

-
     def test_process_place_housenumbers_duplicates(self, getorcreate_hnr_id):
         info = self.process_address(housenumber='134',
                                     conscriptionnumber='134',
@@ -470,7 +455,6 @@ class TestPlaceAddress:
         assert set(info['hnr'].split(';')) == set(('134', '99A'))
         assert info['hnr_tokens'] == "{-1,-2}"

-
     def test_process_place_housenumbers_cached(self, getorcreate_hnr_id):
         info = self.process_address(housenumber="45")
         assert info['hnr_tokens'] == "{-1}"
@@ -484,37 +468,32 @@ class TestPlaceAddress:
         info = self.process_address(housenumber="41")
         assert eval(info['hnr_tokens']) == {-3}

-
     def test_process_place_street(self):
-        self.analyzer.process_place(PlaceInfo({'name': {'name' : 'Grand Road'}}))
+        self.analyzer.process_place(PlaceInfo({'name': {'name': 'Grand Road'}}))
         info = self.process_address(street='Grand Road')

         assert eval(info['street']) == self.name_token_set('#Grand Road')

-
     def test_process_place_nonexisting_street(self):
         info = self.process_address(street='Grand Road')

         assert info['street'] == '{}'

-
     def test_process_place_multiple_street_tags(self):
-        self.analyzer.process_place(PlaceInfo({'name': {'name' : 'Grand Road',
+        self.analyzer.process_place(PlaceInfo({'name': {'name': 'Grand Road',
                                                         'ref': '05989'}}))
         info = self.process_address(**{'street': 'Grand Road',
-                                        'street:sym_ul': '05989'})
+                                       'street:sym_ul': '05989'})

         assert eval(info['street']) == self.name_token_set('#Grand Road', '#05989')

-
     def test_process_place_street_empty(self):
         info = self.process_address(street='🜵')

         assert info['street'] == '{}'

-
     def test_process_place_street_from_cache(self):
-        self.analyzer.process_place(PlaceInfo({'name': {'name' : 'Grand Road'}}))
+        self.analyzer.process_place(PlaceInfo({'name': {'name': 'Grand Road'}}))
         self.process_address(street='Grand Road')

         # request address again
@@ -522,25 +501,21 @@ class TestPlaceAddress:

         assert eval(info['street']) == self.name_token_set('#Grand Road')

-
     def test_process_place_place(self):
         info = self.process_address(place='Honu Lulu')

         assert eval(info['place']) == self.name_token_set('HONU', 'LULU', '#HONU LULU')

-
     def test_process_place_place_extra(self):
         info = self.process_address(**{'place:en': 'Honu Lulu'})

         assert 'place' not in info

-
     def test_process_place_place_empty(self):
         info = self.process_address(place='🜵')

         assert 'place' not in info

-
     def test_process_place_address_terms(self):
         info = self.process_address(country='de', city='Zwickau', state='Sachsen',
                                     suburb='Zwickau', street='Hauptstr',
@@ -549,19 +524,17 @@ class TestPlaceAddress:
         city = self.name_token_set('ZWICKAU', '#ZWICKAU')
         state = self.name_token_set('SACHSEN', '#SACHSEN')

-        result = {k: eval(v) for k,v in info['addr'].items()}
+        result = {k: eval(v) for k, v in info['addr'].items()}

         assert result == {'city': city, 'suburb': city, 'state': state}

-
     def test_process_place_multiple_address_terms(self):
         info = self.process_address(**{'city': 'Bruxelles', 'city:de': 'Brüssel'})

-        result = {k: eval(v) for k,v in info['addr'].items()}
+        result = {k: eval(v) for k, v in info['addr'].items()}

         assert result == {'city': self.name_token_set('Bruxelles', '#Bruxelles')}

-
     def test_process_place_address_terms_empty(self):
         info = self.process_address(country='de', city=' ', street='Hauptstr',
                                     full='right behind the church')
@@ -575,22 +548,21 @@ class TestPlaceHousenumberWithAnalyser:
     def setup(self, analyzer, sql_functions):
         hnr = {'step': 'clean-housenumbers',
                'filter-kind': ['housenumber', 'conscriptionnumber', 'streetnumber']}
-        with analyzer(trans=(":: upper()", "'🜵' > ' '"), sanitizers=[hnr], with_housenumber=True) as anl:
+        with analyzer(trans=(":: upper()", "'🜵' > ' '"), sanitizers=[hnr],
+                      with_housenumber=True) as anl:
            self.analyzer = anl
            yield anl

-
     @pytest.fixture
     def getorcreate_hnr_id(self, temp_db_cursor):
-        temp_db_cursor.execute("""CREATE OR REPLACE FUNCTION create_analyzed_hnr_id(norm_term TEXT, lookup_terms TEXT[])
-                                  RETURNS INTEGER AS $$
-                                    SELECT -nextval('seq_word')::INTEGER; $$ LANGUAGE SQL""")
+        temp_db_cursor.execute("""
+            CREATE OR REPLACE FUNCTION create_analyzed_hnr_id(norm_term TEXT, lookup_terms TEXT[])
+            RETURNS INTEGER AS $$
+              SELECT -nextval('seq_word')::INTEGER; $$ LANGUAGE SQL""")

     def process_address(self, **kwargs):
         return self.analyzer.process_place(PlaceInfo({'address': kwargs}))

-
     def name_token_set(self, *expected_terms):
         tokens = self.analyzer.get_word_token_info(expected_terms)
         for token in tokens:
@@ -598,7 +570,6 @@ class TestPlaceHousenumberWithAnalyser:

         return set((t[2] for t in tokens))

-
     @pytest.mark.parametrize('hnr', ['123 a', '1', '101'])
     def test_process_place_housenumbers_simple(self, hnr, getorcreate_hnr_id):
         info = self.process_address(housenumber=hnr)
@@ -606,7 +577,6 @@ class TestPlaceHousenumberWithAnalyser:
         assert info['hnr'] == hnr.upper()
         assert info['hnr_tokens'] == "{-1}"

-
     def test_process_place_housenumbers_duplicates(self, getorcreate_hnr_id):
         info = self.process_address(housenumber='134',
                                     conscriptionnumber='134',
@@ -615,7 +585,6 @@ class TestPlaceHousenumberWithAnalyser:
         assert set(info['hnr'].split(';')) == set(('134', '99 A'))
         assert info['hnr_tokens'] == "{-1,-2}"

-
     def test_process_place_housenumbers_cached(self, getorcreate_hnr_id):
         info = self.process_address(housenumber="45")
         assert info['hnr_tokens'] == "{-1}"
@@ -637,7 +606,6 @@ class TestUpdateWordTokens:
         table_factory('search_name', 'place_id BIGINT, name_vector INT[]')
         self.tok = tokenizer_factory()

-
     @pytest.fixture
     def search_entry(self, temp_db_cursor):
         place_id = itertools.count(1000)
@@ -648,7 +616,6 @@ class TestUpdateWordTokens:

         return _insert

-
     @pytest.fixture(params=['simple', 'analyzed'])
     def add_housenumber(self, request, word_table):
         if request.param == 'simple':
@@ -660,7 +627,6 @@ class TestUpdateWordTokens:

         return _make

-
     @pytest.mark.parametrize('hnr', ('1a', '1234567', '34 5'))
     def test_remove_unused_housenumbers(self, add_housenumber, word_table, hnr):
         word_table.add_housenumber(1000, hnr)
@@ -669,7 +635,6 @@ class TestUpdateWordTokens:
         self.tok.update_word_tokens()
         assert word_table.count_housenumbers() == 0

-
     def test_keep_unused_numeral_housenumbers(self, add_housenumber, word_table):
         add_housenumber(1000, '5432')

@@ -677,8 +642,8 @@ class TestUpdateWordTokens:
         self.tok.update_word_tokens()
         assert word_table.count_housenumbers() == 1

-
-    def test_keep_housenumbers_from_search_name_table(self, add_housenumber, word_table, search_entry):
+    def test_keep_housenumbers_from_search_name_table(self, add_housenumber,
+                                                      word_table, search_entry):
         add_housenumber(9999, '5432a')
         add_housenumber(9991, '9 a')
         search_entry(123, 9999, 34)
@@ -687,8 +652,8 @@ class TestUpdateWordTokens:
         self.tok.update_word_tokens()
         assert word_table.count_housenumbers() == 1

-
-    def test_keep_housenumbers_from_placex_table(self, add_housenumber, word_table, placex_table):
+    def test_keep_housenumbers_from_placex_table(self, add_housenumber, word_table,
+                                                 placex_table):
         add_housenumber(9999, '5432a')
         add_housenumber(9990, '34z')
         placex_table.add(housenumber='34z')
@@ -698,8 +663,8 @@ class TestUpdateWordTokens:
         self.tok.update_word_tokens()
         assert word_table.count_housenumbers() == 1

-
-    def test_keep_housenumbers_from_placex_table_hnr_list(self, add_housenumber, word_table, placex_table):
+    def test_keep_housenumbers_from_placex_table_hnr_list(self, add_housenumber,
+                                                          word_table, placex_table):
         add_housenumber(9991, '9 b')
         add_housenumber(9990, '34z')
         placex_table.add(housenumber='9 a;9 b;9 c')

@@ -2,7 +2,7 @@
|
||||
#
|
||||
# This file is part of Nominatim. (https://nominatim.org)
|
||||
#
|
||||
# Copyright (C) 2024 by the Nominatim developer community.
|
||||
# Copyright (C) 2025 by the Nominatim developer community.
|
||||
# For a full list of authors see the git log.
|
||||
"""
|
||||
Tests for converting a config file to ICU rules.
|
||||
@@ -19,17 +19,16 @@ from icu import Transliterator
|
||||
|
||||
CONFIG_SECTIONS = ('normalization', 'transliteration', 'token-analysis')
|
||||
|
||||
|
||||
class TestIcuRuleLoader:
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def init_env(self, project_env):
|
||||
self.project_env = project_env
|
||||
|
||||
|
||||
def write_config(self, content):
|
||||
(self.project_env.project_dir / 'icu_tokenizer.yaml').write_text(dedent(content))
|
||||
|
||||
|
||||
def config_rules(self, *variants):
|
||||
content = dedent("""\
|
||||
normalization:
|
||||
@@ -49,14 +48,12 @@ class TestIcuRuleLoader:
|
||||
content += '\n'.join((" - " + s for s in variants)) + '\n'
|
||||
self.write_config(content)
|
||||
|
||||
|
||||
def get_replacements(self, *variants):
|
||||
self.config_rules(*variants)
|
||||
loader = ICURuleLoader(self.project_env)
|
||||
rules = loader.analysis[None].config['replacements']
|
||||
|
||||
return sorted((k, sorted(v)) for k,v in rules)
|
||||
|
||||
return sorted((k, sorted(v)) for k, v in rules)
|
||||
|
||||
def test_empty_rule_set(self):
|
||||
self.write_config("""\
|
||||
@@ -72,16 +69,14 @@ class TestIcuRuleLoader:
|
||||
assert rules.get_normalization_rules() == ''
|
||||
assert rules.get_transliteration_rules() == ''
|
||||
|
||||
|
||||
@pytest.mark.parametrize("section", CONFIG_SECTIONS)
|
||||
def test_missing_section(self, section):
|
||||
rule_cfg = { s: [] for s in CONFIG_SECTIONS if s != section}
|
||||
rule_cfg = {s: [] for s in CONFIG_SECTIONS if s != section}
|
||||
self.write_config(yaml.dump(rule_cfg))
|
||||
|
||||
with pytest.raises(UsageError):
|
||||
ICURuleLoader(self.project_env)
|
||||
|
||||
|
||||
def test_get_search_rules(self):
|
||||
self.config_rules()
|
||||
loader = ICURuleLoader(self.project_env)
|
||||
@@ -97,7 +92,6 @@ class TestIcuRuleLoader:
|
||||
assert trans.transliterate(" Αθήνα ") == " athēna "
|
||||
assert trans.transliterate(" проспект ") == " prospekt "
|
||||
|
||||
|
||||
def test_get_normalization_rules(self):
|
||||
self.config_rules()
|
||||
loader = ICURuleLoader(self.project_env)
|
||||
@@ -106,7 +100,6 @@ class TestIcuRuleLoader:
|
||||
|
||||
assert trans.transliterate(" проспект-Prospekt ") == " проспект prospekt "
|
||||
|
||||
|
||||
def test_get_transliteration_rules(self):
|
||||
self.config_rules()
|
||||
loader = ICURuleLoader(self.project_env)
|
||||
@@ -115,7 +108,6 @@ class TestIcuRuleLoader:
|
||||
|
||||
assert trans.transliterate(" проспект-Prospekt ") == " prospekt Prospekt "
|
||||
|
||||
|
||||
def test_transliteration_rules_from_file(self):
|
||||
self.write_config("""\
|
||||
normalization:
|
||||
@@ -135,7 +127,6 @@ class TestIcuRuleLoader:
|
||||
|
||||
assert trans.transliterate(" axxt ") == " byt "
|
||||
|
||||
|
||||
def test_search_rules(self):
|
||||
self.config_rules('~street => s,st', 'master => mstr')
|
||||
proc = ICURuleLoader(self.project_env).make_token_analysis()
|
||||
@@ -144,7 +135,6 @@ class TestIcuRuleLoader:
|
||||
assert proc.search.transliterate('Earnes St').strip() == 'earnes st'
|
||||
assert proc.search.transliterate('Nostreet').strip() == 'nostreet'

@pytest.mark.parametrize("variant", ['foo > bar', 'foo -> bar -> bar',
'~foo~ -> bar', 'fo~ o -> bar'])
def test_invalid_variant_description(self, variant):
@@ -157,25 +147,21 @@ class TestIcuRuleLoader:

assert repl == [(' foo ', [' bar', ' foo'])]

def test_replace_full(self):
repl = self.get_replacements("foo => bar")

assert repl == [(' foo ', [' bar'])]

def test_add_suffix_no_decompose(self):
repl = self.get_replacements("~berg |-> bg")

assert repl == [(' berg ', [' berg', ' bg']),
('berg ', ['berg', 'bg'])]

def test_replace_suffix_no_decompose(self):
repl = self.get_replacements("~berg |=> bg")

assert repl == [(' berg ', [' bg']),('berg ', ['bg'])]
assert repl == [(' berg ', [' bg']), ('berg ', ['bg'])]

def test_add_suffix_decompose(self):
repl = self.get_replacements("~berg -> bg")
@@ -183,26 +169,22 @@ class TestIcuRuleLoader:
assert repl == [(' berg ', [' berg', ' bg', 'berg', 'bg']),
('berg ', [' berg', ' bg', 'berg', 'bg'])]

def test_replace_suffix_decompose(self):
repl = self.get_replacements("~berg => bg")

assert repl == [(' berg ', [' bg', 'bg']),
('berg ', [' bg', 'bg'])]

def test_add_prefix_no_compose(self):
repl = self.get_replacements("hinter~ |-> hnt")

assert repl == [(' hinter', [' hinter', ' hnt']),
(' hinter ', [' hinter', ' hnt'])]

def test_replace_prefix_no_compose(self):
repl = self.get_replacements("hinter~ |=> hnt")

assert repl == [(' hinter', [' hnt']), (' hinter ', [' hnt'])]
assert repl == [(' hinter', [' hnt']), (' hinter ', [' hnt'])]

def test_add_prefix_compose(self):
repl = self.get_replacements("hinter~-> h")
@@ -210,45 +192,38 @@ class TestIcuRuleLoader:
assert repl == [(' hinter', [' h', ' h ', ' hinter', ' hinter ']),
(' hinter ', [' h', ' h', ' hinter', ' hinter'])]

def test_replace_prefix_compose(self):
repl = self.get_replacements("hinter~=> h")

assert repl == [(' hinter', [' h', ' h ']),
(' hinter ', [' h', ' h'])]

def test_add_beginning_only(self):
repl = self.get_replacements("^Premier -> Pr")

assert repl == [('^ premier ', ['^ pr', '^ premier'])]

def test_replace_beginning_only(self):
repl = self.get_replacements("^Premier => Pr")

assert repl == [('^ premier ', ['^ pr'])]

def test_add_final_only(self):
repl = self.get_replacements("road$ -> rd")

assert repl == [(' road ^', [' rd ^', ' road ^'])]

def test_replace_final_only(self):
repl = self.get_replacements("road$ => rd")

assert repl == [(' road ^', [' rd ^'])]

def test_decompose_only(self):
repl = self.get_replacements("~foo -> foo")

assert repl == [(' foo ', [' foo', 'foo']),
('foo ', [' foo', 'foo'])]

def test_add_suffix_decompose_end_only(self):
repl = self.get_replacements("~berg |-> bg", "~berg$ -> bg")

@@ -257,7 +232,6 @@ class TestIcuRuleLoader:
('berg ', ['berg', 'bg']),
('berg ^', [' berg ^', ' bg ^', 'berg ^', 'bg ^'])]

def test_replace_suffix_decompose_end_only(self):
repl = self.get_replacements("~berg |=> bg", "~berg$ => bg")

@@ -266,7 +240,6 @@ class TestIcuRuleLoader:
('berg ', ['bg']),
('berg ^', [' bg ^', 'bg ^'])]

def test_add_multiple_suffix(self):
repl = self.get_replacements("~berg,~burg -> bg")
@@ -2,7 +2,7 @@
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# Copyright (C) 2025 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Tests for execution of the sanitization step.
@@ -50,13 +50,13 @@ def test_placeinfo_has_attr():
def test_sanitizer_default(def_config):
san = sanitizer.PlaceSanitizer([{'step': 'split-name-list'}], def_config)

name, address = san.process_names(PlaceInfo({'name': {'name:de:de': '1;2;3'},
'address': {'street': 'Bald'}}))
name, address = san.process_names(PlaceInfo({'name': {'name:de:de': '1;2;3'},
'address': {'street': 'Bald'}}))

assert len(name) == 3
assert all(isinstance(n, sanitizer.PlaceName) for n in name)
assert all(n.kind == 'name' for n in name)
assert all(n.suffix == 'de:de' for n in name)
assert all(n.kind == 'name' for n in name)
assert all(n.suffix == 'de:de' for n in name)

assert len(address) == 1
assert all(isinstance(n, sanitizer.PlaceName) for n in address)
@@ -66,7 +66,7 @@ def test_sanitizer_default(def_config):
def test_sanitizer_empty_list(def_config, rules):
san = sanitizer.PlaceSanitizer(rules, def_config)

name, address = san.process_names(PlaceInfo({'name': {'name:de:de': '1;2;3'}}))
name, address = san.process_names(PlaceInfo({'name': {'name:de:de': '1;2;3'}}))

assert len(name) == 1
assert all(isinstance(n, sanitizer.PlaceName) for n in name)
@@ -74,4 +74,4 @@ def test_sanitizer_empty_list(def_config, rules):

def test_sanitizer_missing_step_definition(def_config):
with pytest.raises(UsageError):
san = sanitizer.PlaceSanitizer([{'id': 'split-name-list'}], def_config)
sanitizer.PlaceSanitizer([{'id': 'split-name-list'}], def_config)
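For orientation, a minimal sketch of the API these tests exercise; 'config' stands in for the def_config fixture and 'sanitizer' is the module alias used throughout this file:

    san = sanitizer.PlaceSanitizer([{'step': 'split-name-list'}], config)
    name, address = san.process_names(
        PlaceInfo({'name': {'name:de:de': '1;2;3'}}))

    # The semicolon-separated value is split into three PlaceName objects,
    # each with kind 'name' and suffix 'de:de' (cf. the assertions above).
    assert len(name) == 3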
@@ -2,7 +2,7 @@
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# Copyright (C) 2025 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Tests for special postcode analysis and variant generation.
@@ -13,7 +13,6 @@ from icu import Transliterator

import nominatim_db.tokenizer.token_analysis.postcodes as module
from nominatim_db.data.place_name import PlaceName
from nominatim_db.errors import UsageError

DEFAULT_NORMALIZATION = """ :: NFD ();
'🜳' > ' ';
@@ -27,9 +26,10 @@ DEFAULT_TRANSLITERATION = """ :: Latin ();
'🜵' > ' ';
"""

@pytest.fixture
def analyser():
rules = { 'analyzer': 'postcodes'}
rules = {'analyzer': 'postcodes'}
config = module.configure(rules, DEFAULT_NORMALIZATION)

trans = Transliterator.createFromRules("test_trans", DEFAULT_TRANSLITERATION)

@@ -2,7 +2,7 @@
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# Copyright (C) 2025 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Tests for import name normalisation and variant generation.
@@ -26,8 +26,9 @@ DEFAULT_TRANSLITERATION = """ :: Latin ();
'🜵' > ' ';
"""

def make_analyser(*variants, variant_only=False):
rules = { 'analyzer': 'generic', 'variants': [{'words': variants}]}
rules = {'analyzer': 'generic', 'variants': [{'words': variants}]}
if variant_only:
rules['mode'] = 'variant-only'
trans = Transliterator.createFromRules("test_trans", DEFAULT_TRANSLITERATION)
@@ -43,7 +44,7 @@ def get_normalized_variants(proc, name):

def test_no_variants():
rules = { 'analyzer': 'generic' }
rules = {'analyzer': 'generic'}
trans = Transliterator.createFromRules("test_trans", DEFAULT_TRANSLITERATION)
norm = Transliterator.createFromRules("test_norm", DEFAULT_NORMALIZATION)
config = module.configure(rules, norm, trans)
@@ -62,35 +63,36 @@ def test_variants_empty():

VARIANT_TESTS = [
(('~strasse,~straße -> str', '~weg => weg'), "hallo", {'hallo'}),
(('weg => wg',), "holzweg", {'holzweg'}),
(('weg -> wg',), "holzweg", {'holzweg'}),
(('~weg => weg',), "holzweg", {'holz weg', 'holzweg'}),
(('~weg -> weg',), "holzweg", {'holz weg', 'holzweg'}),
(('~weg => w',), "holzweg", {'holz w', 'holzw'}),
(('~weg -> w',), "holzweg", {'holz weg', 'holzweg', 'holz w', 'holzw'}),
(('~weg => weg',), "Meier Weg", {'meier weg', 'meierweg'}),
(('~weg -> weg',), "Meier Weg", {'meier weg', 'meierweg'}),
(('~weg => w',), "Meier Weg", {'meier w', 'meierw'}),
(('~weg -> w',), "Meier Weg", {'meier weg', 'meierweg', 'meier w', 'meierw'}),
(('weg => wg',), "Meier Weg", {'meier wg'}),
(('weg -> wg',), "Meier Weg", {'meier weg', 'meier wg'}),
(('~strasse,~straße -> str', '~weg => weg'), "Bauwegstraße",
(('~strasse,~straße -> str', '~weg => weg'), "hallo", {'hallo'}),
(('weg => wg',), "holzweg", {'holzweg'}),
(('weg -> wg',), "holzweg", {'holzweg'}),
(('~weg => weg',), "holzweg", {'holz weg', 'holzweg'}),
(('~weg -> weg',), "holzweg", {'holz weg', 'holzweg'}),
(('~weg => w',), "holzweg", {'holz w', 'holzw'}),
(('~weg -> w',), "holzweg", {'holz weg', 'holzweg', 'holz w', 'holzw'}),
(('~weg => weg',), "Meier Weg", {'meier weg', 'meierweg'}),
(('~weg -> weg',), "Meier Weg", {'meier weg', 'meierweg'}),
(('~weg => w',), "Meier Weg", {'meier w', 'meierw'}),
(('~weg -> w',), "Meier Weg", {'meier weg', 'meierweg', 'meier w', 'meierw'}),
(('weg => wg',), "Meier Weg", {'meier wg'}),
(('weg -> wg',), "Meier Weg", {'meier weg', 'meier wg'}),
(('~strasse,~straße -> str', '~weg => weg'), "Bauwegstraße",
{'bauweg straße', 'bauweg str', 'bauwegstraße', 'bauwegstr'}),
(('am => a', 'bach => b'), "am bach", {'a b'}),
(('am => a', '~bach => b'), "am bach", {'a b'}),
(('am -> a', '~bach -> b'), "am bach", {'am bach', 'a bach', 'am b', 'a b'}),
(('am -> a', '~bach -> b'), "ambach", {'ambach', 'am bach', 'amb', 'am b'}),
(('saint -> s,st', 'street -> st'), "Saint Johns Street",
(('am => a', 'bach => b'), "am bach", {'a b'}),
(('am => a', '~bach => b'), "am bach", {'a b'}),
(('am -> a', '~bach -> b'), "am bach", {'am bach', 'a bach', 'am b', 'a b'}),
(('am -> a', '~bach -> b'), "ambach", {'ambach', 'am bach', 'amb', 'am b'}),
(('saint -> s,st', 'street -> st'), "Saint Johns Street",
{'saint johns street', 's johns street', 'st johns street',
'saint johns st', 's johns st', 'st johns st'}),
(('river$ -> r',), "River Bend Road", {'river bend road'}),
(('river$ -> r',), "Bent River", {'bent river', 'bent r'}),
(('^north => n',), "North 2nd Street", {'n 2nd street'}),
(('^north => n',), "Airport North", {'airport north'}),
(('am -> a',), "am am am am am am am am", {'am am am am am am am am'}),
(('am => a',), "am am am am am am am am", {'a a a a a a a a'})
]
(('river$ -> r',), "River Bend Road", {'river bend road'}),
(('river$ -> r',), "Bent River", {'bent river', 'bent r'}),
(('^north => n',), "North 2nd Street", {'n 2nd street'}),
(('^north => n',), "Airport North", {'airport north'}),
(('am -> a',), "am am am am am am am am", {'am am am am am am am am'}),
(('am => a',), "am am am am am am am am", {'a a a a a a a a'})
]
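Each row is (rules, input name, expected variant set). Reading across the table: '=>' rewrites the matched term outright, '->' additionally keeps the original spelling, and a '~' prefix lets the term match word-internally, yielding both the attached and the detached decomposition. A sketch of how a single row is exercised, reusing the make_analyser and get_normalized_variants helpers defined earlier in this file:

    proc = make_analyser('~weg => weg')
    # 'holzweg' decomposes: the suffix may stay attached or split off.
    assert set(get_normalized_variants(proc, 'holzweg')) == {'holz weg', 'holzweg'}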

@pytest.mark.parametrize("rules,name,variants", VARIANT_TESTS)
def test_variants(rules, name, variants):
@@ -103,10 +105,11 @@ def test_variants(rules, name, variants):

VARIANT_ONLY_TESTS = [
(('weg => wg',), "hallo", set()),
(('weg => wg',), "Meier Weg", {'meier wg'}),
(('weg -> wg',), "Meier Weg", {'meier wg'}),
]
(('weg => wg',), "hallo", set()),
(('weg => wg',), "Meier Weg", {'meier wg'}),
(('weg -> wg',), "Meier Weg", {'meier wg'}),
]

@pytest.mark.parametrize("rules,name,variants", VARIANT_ONLY_TESTS)
def test_variants_only(rules, name, variants):
@@ -122,17 +125,15 @@ class TestGetReplacements:

@staticmethod
def configure_rules(*variants):
rules = { 'analyzer': 'generic', 'variants': [{'words': variants}]}
rules = {'analyzer': 'generic', 'variants': [{'words': variants}]}
trans = Transliterator.createFromRules("test_trans", DEFAULT_TRANSLITERATION)
norm = Transliterator.createFromRules("test_norm", DEFAULT_NORMALIZATION)
return module.configure(rules, norm, trans)

def get_replacements(self, *variants):
config = self.configure_rules(*variants)

return sorted((k, sorted(v)) for k,v in config['replacements'])
return sorted((k, sorted(v)) for k, v in config['replacements'])

@pytest.mark.parametrize("variant", ['foo > bar', 'foo -> bar -> bar',
'~foo~ -> bar', 'fo~ o -> bar'])
@@ -140,38 +141,32 @@ class TestGetReplacements:
with pytest.raises(UsageError):
self.configure_rules(variant)

@pytest.mark.parametrize("rule", ["!!! -> bar", "bar => !!!"])
def test_ignore_unnormalizable_terms(self, rule):
repl = self.get_replacements(rule)

assert repl == []

def test_add_full(self):
repl = self.get_replacements("foo -> bar")

assert repl == [(' foo ', [' bar', ' foo'])]

def test_replace_full(self):
repl = self.get_replacements("foo => bar")

assert repl == [(' foo ', [' bar'])]

def test_add_suffix_no_decompose(self):
repl = self.get_replacements("~berg |-> bg")

assert repl == [(' berg ', [' berg', ' bg']),
('berg ', ['berg', 'bg'])]

def test_replace_suffix_no_decompose(self):
repl = self.get_replacements("~berg |=> bg")

assert repl == [(' berg ', [' bg']),('berg ', ['bg'])]
assert repl == [(' berg ', [' bg']), ('berg ', ['bg'])]

def test_add_suffix_decompose(self):
repl = self.get_replacements("~berg -> bg")
@@ -179,26 +174,22 @@ class TestGetReplacements:
assert repl == [(' berg ', [' berg', ' bg', 'berg', 'bg']),
('berg ', [' berg', ' bg', 'berg', 'bg'])]

def test_replace_suffix_decompose(self):
repl = self.get_replacements("~berg => bg")

assert repl == [(' berg ', [' bg', 'bg']),
('berg ', [' bg', 'bg'])]

def test_add_prefix_no_compose(self):
repl = self.get_replacements("hinter~ |-> hnt")

assert repl == [(' hinter', [' hinter', ' hnt']),
(' hinter ', [' hinter', ' hnt'])]

def test_replace_prefix_no_compose(self):
repl = self.get_replacements("hinter~ |=> hnt")

assert repl == [(' hinter', [' hnt']), (' hinter ', [' hnt'])]
assert repl == [(' hinter', [' hnt']), (' hinter ', [' hnt'])]

def test_add_prefix_compose(self):
repl = self.get_replacements("hinter~-> h")
@@ -206,45 +197,38 @@ class TestGetReplacements:
assert repl == [(' hinter', [' h', ' h ', ' hinter', ' hinter ']),
(' hinter ', [' h', ' h', ' hinter', ' hinter'])]

def test_replace_prefix_compose(self):
repl = self.get_replacements("hinter~=> h")

assert repl == [(' hinter', [' h', ' h ']),
(' hinter ', [' h', ' h'])]

def test_add_beginning_only(self):
repl = self.get_replacements("^Premier -> Pr")

assert repl == [('^ premier ', ['^ pr', '^ premier'])]

def test_replace_beginning_only(self):
repl = self.get_replacements("^Premier => Pr")

assert repl == [('^ premier ', ['^ pr'])]

def test_add_final_only(self):
repl = self.get_replacements("road$ -> rd")

assert repl == [(' road ^', [' rd ^', ' road ^'])]

def test_replace_final_only(self):
repl = self.get_replacements("road$ => rd")

assert repl == [(' road ^', [' rd ^'])]

def test_decompose_only(self):
repl = self.get_replacements("~foo -> foo")

assert repl == [(' foo ', [' foo', 'foo']),
('foo ', [' foo', 'foo'])]

def test_add_suffix_decompose_end_only(self):
repl = self.get_replacements("~berg |-> bg", "~berg$ -> bg")

@@ -253,7 +237,6 @@ class TestGetReplacements:
('berg ', ['berg', 'bg']),
('berg ^', [' berg ^', ' bg ^', 'berg ^', 'bg ^'])]

def test_replace_suffix_decompose_end_only(self):
repl = self.get_replacements("~berg |=> bg", "~berg$ => bg")

@@ -262,7 +245,6 @@ class TestGetReplacements:
('berg ', ['bg']),
('berg ^', [' bg ^', 'bg ^'])]

@pytest.mark.parametrize('rule', ["~berg,~burg -> bg",
"~berg, ~burg -> bg",
"~berg,,~burg -> bg"])

@@ -2,7 +2,7 @@
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# Copyright (C) 2025 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Tests for generic token analysis, mutation part.
@@ -24,37 +24,34 @@ DEFAULT_TRANSLITERATION = """ :: Latin ();
'🜵' > ' ';
"""

class TestMutationNoVariants:

def make_analyser(self, *mutations):
rules = { 'analyzer': 'generic',
'mutations': [ {'pattern': m[0], 'replacements': m[1]}
for m in mutations]
}
rules = {'analyzer': 'generic',
'mutations': [{'pattern': m[0], 'replacements': m[1]}
for m in mutations]
}
trans = Transliterator.createFromRules("test_trans", DEFAULT_TRANSLITERATION)
norm = Transliterator.createFromRules("test_norm", DEFAULT_NORMALIZATION)
config = module.configure(rules, norm, trans)

self.analysis = module.create(norm, trans, config)

def variants(self, name):
norm = Transliterator.createFromRules("test_norm", DEFAULT_NORMALIZATION)
return set(self.analysis.compute_variants(norm.transliterate(name).strip()))

@pytest.mark.parametrize('pattern', ('(capture)', ['a list']))
def test_bad_pattern(self, pattern):
with pytest.raises(UsageError):
self.make_analyser((pattern, ['b']))

@pytest.mark.parametrize('replacements', (None, 'a string'))
def test_bad_replacement(self, replacements):
with pytest.raises(UsageError):
self.make_analyser(('a', replacements))

def test_simple_replacement(self):
self.make_analyser(('a', ['b']))

@@ -62,27 +59,23 @@ class TestMutationNoVariants:
assert self.variants('abba') == {'bbbb'}
assert self.variants('2 aar') == {'2 bbr'}

def test_multichar_replacement(self):
self.make_analyser(('1 1', ['1 1 1']))

assert self.variants('1 1456') == {'1 1 1456'}
assert self.variants('1 1 1') == {'1 1 1 1'}

def test_removement_replacement(self):
self.make_analyser((' ', [' ', '']))

assert self.variants('A 345') == {'a 345', 'a345'}
assert self.variants('a g b') == {'a g b', 'ag b', 'a gb', 'agb'}

def test_regex_pattern(self):
self.make_analyser(('[^a-z]+', ['XXX', ' ']))

assert self.variants('a-34n12') == {'aXXXnXXX', 'aXXXn', 'a nXXX', 'a n'}

def test_multiple_mutations(self):
self.make_analyser(('ä', ['ä', 'ae']), ('ö', ['ö', 'oe']))
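The mutation tests above show that every pattern match expands independently, so the variant count is the product of the choices per match. A standalone sketch of that combinatorics on a hypothetical name (itertools only, independent of the analyser):

    from itertools import product

    # Two rules, two spellings each: 2 * 2 = 4 variants.
    choices = [['ä', 'ae'], ['ö', 'oe']]
    variants = {'h{}user d{}rfer'.format(a, o) for a, o in product(*choices)}
    assert len(variants) == 4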

@@ -10,6 +10,7 @@ Tests for simplified trie structure.

from nominatim_db.tokenizer.token_analysis.simple_trie import SimpleTrie

def test_single_item_trie():
t = SimpleTrie([('foob', 42)])

@@ -18,6 +19,7 @@ def test_single_item_trie():
assert t.longest_prefix('foob') == (42, 4)
assert t.longest_prefix('123foofoo', 3) == (None, 3)
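These assertions pin down the lookup contract: longest_prefix(word, start) returns a (value, end position) pair for the longest key matching at the start offset (the second argument appears to default to 0), and (None, start) when no key matches. Restated as a sketch with inline commentary:

    t = SimpleTrie([('foob', 42)])

    # 'foob' is consumed up to index 4; starting at offset 3 of '123foofoo'
    # only 'foofoo' remains, and no key is a prefix of it.
    assert t.longest_prefix('foob') == (42, 4)
    assert t.longest_prefix('123foofoo', 3) == (None, 3)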

def test_complex_item_tree():
t = SimpleTrie([('a', 1),
('b', 2),