mirror of
https://github.com/osm-search/Nominatim.git
synced 2026-02-15 02:47:59 +00:00
overhaul the token analysis interface
The functional split between the two functions is now that the first one creates the ID that is used in the word table and the second one creates the variants. There is no longer a requirement that the ID is the normalized version. We might later reintroduce the requirement that a normalized version be available, but it doesn't necessarily need to be through the ID. The function that creates the ID now gets the full PlaceName. That way it might take into account attributes that were set by the sanitizers. Finally, rename both functions to something more sane.
This commit is contained in:
@@ -12,6 +12,7 @@ import pytest
|
||||
from icu import Transliterator
|
||||
|
||||
import nominatim.tokenizer.token_analysis.postcodes as module
|
||||
from nominatim.data.place_name import PlaceName
|
||||
from nominatim.errors import UsageError
|
||||
|
||||
DEFAULT_NORMALIZATION = """ :: NFD ();
|
||||
@@ -39,22 +40,22 @@ def analyser():
|
||||
|
||||
def get_normalized_variants(proc, name):
|
||||
norm = Transliterator.createFromRules("test_norm", DEFAULT_NORMALIZATION)
|
||||
return proc.get_variants_ascii(norm.transliterate(name).strip())
|
||||
return proc.compute_variants(norm.transliterate(name).strip())
|
||||
|
||||
|
||||
@pytest.mark.parametrize('name,norm', [('12', '12'),
|
||||
('A 34 ', 'A 34'),
|
||||
('34-av', '34-AV')])
|
||||
def test_normalize(analyser, name, norm):
|
||||
assert analyser.normalize(name) == norm
|
||||
def test_get_canonical_id(analyser, name, norm):
|
||||
assert analyser.get_canonical_id(PlaceName(name=name, kind='', suffix='')) == norm
|
||||
|
||||
|
||||
@pytest.mark.parametrize('postcode,variants', [('12345', {'12345'}),
|
||||
('AB-998', {'ab 998', 'ab998'}),
|
||||
('23 FGH D3', {'23 fgh d3', '23fgh d3',
|
||||
'23 fghd3', '23fghd3'})])
|
||||
def test_get_variants_ascii(analyser, postcode, variants):
|
||||
out = analyser.get_variants_ascii(postcode)
|
||||
def test_compute_variants(analyser, postcode, variants):
|
||||
out = analyser.compute_variants(postcode)
|
||||
|
||||
assert len(out) == len(set(out))
|
||||
assert set(out) == variants
|
||||
|
||||
@@ -39,7 +39,7 @@ def make_analyser(*variants, variant_only=False):
|
||||
|
||||
def get_normalized_variants(proc, name):
|
||||
norm = Transliterator.createFromRules("test_norm", DEFAULT_NORMALIZATION)
|
||||
return proc.get_variants_ascii(norm.transliterate(name).strip())
|
||||
return proc.compute_variants(norm.transliterate(name).strip())
|
||||
|
||||
|
||||
def test_no_variants():
|
||||
|
||||
@@ -40,7 +40,7 @@ class TestMutationNoVariants:
|
||||
|
||||
def variants(self, name):
|
||||
norm = Transliterator.createFromRules("test_norm", DEFAULT_NORMALIZATION)
|
||||
return set(self.analysis.get_variants_ascii(norm.transliterate(name).strip()))
|
||||
return set(self.analysis.compute_variants(norm.transliterate(name).strip()))
|
||||
|
||||
|
||||
@pytest.mark.parametrize('pattern', ('(capture)', ['a list']))
|
||||
|
||||
Reference in New Issue
Block a user