forked from hans/Nominatim
handle postcodes properly on word table updates
update_postcodes_from_db() needs to do the full postcode treatment in order to derive the correct word table entries.
This commit is contained in:
@@ -72,7 +72,8 @@ def analyzer(tokenizer_factory, test_config, monkeypatch,
|
||||
|
||||
def _mk_analyser(norm=("[[:Punctuation:][:Space:]]+ > ' '",), trans=(':: upper()',),
|
||||
variants=('~gasse -> gasse', 'street => st', ),
|
||||
sanitizers=[], with_housenumber=False):
|
||||
sanitizers=[], with_housenumber=False,
|
||||
with_postcode=False):
|
||||
cfgstr = {'normalization': list(norm),
|
||||
'sanitizers': sanitizers,
|
||||
'transliteration': list(trans),
|
||||
@@ -81,6 +82,9 @@ def analyzer(tokenizer_factory, test_config, monkeypatch,
|
||||
if with_housenumber:
|
||||
cfgstr['token-analysis'].append({'id': '@housenumber',
|
||||
'analyzer': 'housenumbers'})
|
||||
if with_postcode:
|
||||
cfgstr['token-analysis'].append({'id': '@postcode',
|
||||
'analyzer': 'postcodes'})
|
||||
(test_config.project_dir / 'icu_tokenizer.yaml').write_text(yaml.dump(cfgstr))
|
||||
tok.loader = nominatim.tokenizer.icu_rule_loader.ICURuleLoader(test_config)
|
||||
|
||||
@@ -246,28 +250,69 @@ def test_normalize_postcode(analyzer):
|
||||
anl.normalize_postcode('38 Б') == '38 Б'
|
||||
|
||||
|
||||
def test_update_postcodes_from_db_empty(analyzer, table_factory, word_table):
|
||||
table_factory('location_postcode', 'postcode TEXT',
|
||||
content=(('1234',), ('12 34',), ('AB23',), ('1234',)))
|
||||
class TestPostcodes:
|
||||
|
||||
with analyzer() as anl:
|
||||
anl.update_postcodes_from_db()
|
||||
|
||||
assert word_table.count() == 3
|
||||
assert word_table.get_postcodes() == {'1234', '12 34', 'AB23'}
|
||||
@pytest.fixture(autouse=True)
def setup(self, analyzer, sql_functions):
    """Open a postcode-enabled analyzer for each test of this class.

    The analyzer factory is called with a single 'clean-postcodes'
    sanitizer step and ``with_postcode=True``. The opened analyzer is
    stored in ``self.analyzer`` and yielded while its context manager
    is still active. ``sql_functions`` is only requested as a fixture
    and is not used in the body.
    """
    postcode_cleaner = {'step': 'clean-postcodes'}

    with analyzer(sanitizers=[postcode_cleaner], with_postcode=True) as opened:
        self.analyzer = opened
        yield opened
|
||||
|
||||
|
||||
def test_update_postcodes_from_db_add_and_remove(analyzer, table_factory, word_table):
|
||||
table_factory('location_postcode', 'postcode TEXT',
|
||||
content=(('1234',), ('45BC', ), ('XX45', )))
|
||||
word_table.add_postcode(' 1234', '1234')
|
||||
word_table.add_postcode(' 5678', '5678')
|
||||
def process_postcode(self, cc, postcode):
    """Run ``self.analyzer.process_place()`` on a place that carries only
    a country code and a postcode in its address, and return the result.
    """
    place = PlaceInfo({'country_code': cc,
                       'address': {'postcode': postcode}})
    return self.analyzer.process_place(place)
|
||||
|
||||
with analyzer() as anl:
|
||||
anl.update_postcodes_from_db()
|
||||
|
||||
assert word_table.count() == 3
|
||||
assert word_table.get_postcodes() == {'1234', '45BC', 'XX45'}
|
||||
def test_update_postcodes_from_db_empty(self, table_factory, word_table):
    """Update postcodes from ``location_postcode`` when no postcode was
    added to the word table beforehand.

    Four rows (de, se, bm, fr) are loaded; afterwards the word table is
    expected to report a count of 5 and exactly three postcode strings.
    """
    rows = (('de', '12345'), ('se', '132 34'),
            ('bm', 'AB23'), ('fr', '12345'))
    table_factory('location_postcode', 'country_code TEXT, postcode TEXT',
                  content=rows)

    self.analyzer.update_postcodes_from_db()

    expected = {'12345', '132 34@132 34', 'AB 23@AB 23'}
    assert word_table.count() == 5
    assert word_table.get_postcodes() == expected
|
||||
|
||||
|
||||
def test_update_postcodes_from_db_ambigious(self, table_factory, word_table):
    """Update postcodes when the same postcode string appears for two
    countries (in and sg).

    The word table is expected to report a count of 3 and to contain
    both a plain '123456' entry and a '123456@123 456' entry.
    """
    rows = (('in', '123456'), ('sg', '123456'))
    table_factory('location_postcode', 'country_code TEXT, postcode TEXT',
                  content=rows)

    self.analyzer.update_postcodes_from_db()

    expected = {'123456', '123456@123 456'}
    assert word_table.count() == 3
    assert word_table.get_postcodes() == expected
|
||||
|
||||
|
||||
def test_update_postcodes_from_db_add_and_remove(self, table_factory, word_table):
    """Update postcodes when the word table already holds entries.

    '1234' and '5678' are added to the word table up front, while
    ``location_postcode`` lists '1234', 'BC 45' and 'XX45'. After the
    update '5678' must no longer be among the postcodes and the two
    'bm' postcodes must be present.
    """
    rows = (('ch', '1234'), ('bm', 'BC 45'), ('bm', 'XX45'))
    table_factory('location_postcode', 'country_code TEXT, postcode TEXT',
                  content=rows)

    # Entries that exist before the update runs.
    for token, postcode in ((' 1234', '1234'), (' 5678', '5678')):
        word_table.add_postcode(token, postcode)

    self.analyzer.update_postcodes_from_db()

    expected = {'1234', 'BC 45@BC 45', 'XX 45@XX 45'}
    assert word_table.count() == 5
    assert word_table.get_postcodes() == expected
|
||||
|
||||
|
||||
def test_process_place_postcode_simple(self, word_table):
    """Process a place with country 'de' and postcode '12345'.

    The returned info must carry the postcode unchanged and the word
    table must contain exactly the entry '12345'.
    """
    token_info = self.process_postcode('de', '12345')
    assert token_info['postcode'] == '12345'

    expected = {'12345'}
    assert word_table.get_postcodes() == expected
|
||||
|
||||
|
||||
def test_process_place_postcode_with_space(self, word_table):
    """Process a place with country 'in' and postcode '123 567'.

    The returned info must carry '123567' (no space) and the word table
    must contain exactly the entry '123567@123 567'.
    """
    token_info = self.process_postcode('in', '123 567')
    assert token_info['postcode'] == '123567'

    expected = {'123567@123 567'}
    assert word_table.get_postcodes() == expected
|
||||
|
||||
|
||||
|
||||
def test_update_special_phrase_empty_table(analyzer, word_table):
|
||||
|
||||
@@ -0,0 +1,60 @@
|
||||
# SPDX-License-Identifier: GPL-2.0-only
|
||||
#
|
||||
# This file is part of Nominatim. (https://nominatim.org)
|
||||
#
|
||||
# Copyright (C) 2022 by the Nominatim developer community.
|
||||
# For a full list of authors see the git log.
|
||||
"""
|
||||
Tests for special postcode analysis and variant generation.
|
||||
"""
|
||||
import pytest
|
||||
|
||||
from icu import Transliterator
|
||||
|
||||
import nominatim.tokenizer.token_analysis.postcodes as module
|
||||
from nominatim.errors import UsageError
|
||||
|
||||
# Rule string used as the normalization in these tests: it is passed to
# module.configure() and compiled with Transliterator.createFromRules().
DEFAULT_NORMALIZATION = """ :: NFD ();
|
||||
'🜳' > ' ';
|
||||
[[:Nonspacing Mark:] [:Cf:]] >;
|
||||
:: lower ();
|
||||
[[:Punctuation:][:Space:]]+ > ' ';
|
||||
:: NFC ();
|
||||
"""
|
||||
|
||||
# Rule string compiled with Transliterator.createFromRules() to build the
# transliterator handed to module.create() in the analyser fixture.
DEFAULT_TRANSLITERATION = """ :: Latin ();
|
||||
'🜵' > ' ';
|
||||
"""
|
||||
|
||||
@pytest.fixture
def analyser():
    """Create a postcode token analyser from the module under test.

    The module is configured with the rule ``{'analyzer': 'postcodes'}``
    and the default normalization rules; the analyser is then created
    from freshly compiled normalization and transliteration rule sets.
    """
    settings = module.configure({'analyzer': 'postcodes'}, DEFAULT_NORMALIZATION)

    transliterator = Transliterator.createFromRules("test_trans",
                                                    DEFAULT_TRANSLITERATION)
    normalizer = Transliterator.createFromRules("test_norm",
                                                DEFAULT_NORMALIZATION)

    return module.create(normalizer, transliterator, settings)
|
||||
|
||||
|
||||
def get_normalized_variants(proc, name):
    """Return ``proc.get_variants_ascii()`` for *name* after running it
    through the default normalization rules and stripping surrounding
    whitespace.
    """
    normalizer = Transliterator.createFromRules("test_norm", DEFAULT_NORMALIZATION)
    normalized_name = normalizer.transliterate(name).strip()
    return proc.get_variants_ascii(normalized_name)
|
||||
|
||||
|
||||
@pytest.mark.parametrize('name,norm', [
    ('12', '12'),
    ('A 34 ', 'A 34'),
    ('34-av', '34-AV'),
])
def test_normalize(analyser, name, norm):
    """Check that ``analyser.normalize()`` yields the expected string for
    each parametrized input.
    """
    normalized = analyser.normalize(name)
    assert normalized == norm
|
||||
|
||||
|
||||
@pytest.mark.parametrize('postcode,variants', [
    ('12345', {'12345'}),
    ('AB-998', {'ab 998', 'ab998'}),
    ('23 FGH D3', {'23 fgh d3', '23fgh d3', '23 fghd3', '23fghd3'}),
])
def test_get_variants_ascii(analyser, postcode, variants):
    """Check that ``analyser.get_variants_ascii()`` returns exactly the
    expected variants and that no variant is returned twice.
    """
    generated = analyser.get_variants_ascii(postcode)
    distinct = set(generated)

    # Equal lengths mean the result contains no duplicates.
    assert len(generated) == len(distinct)
    assert distinct == variants
|
||||
Reference in New Issue
Block a user