mirror of
https://github.com/osm-search/Nominatim.git
synced 2026-02-26 11:08:13 +00:00
handle postcodes properly on word table updates
update_postcodes_from_db() needs to do the full postcode treatment in order to derive the correct word table entries.
This commit is contained in:
@@ -290,33 +290,72 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
|
|||||||
""" Update postcode tokens in the word table from the location_postcode
|
""" Update postcode tokens in the word table from the location_postcode
|
||||||
table.
|
table.
|
||||||
"""
|
"""
|
||||||
to_delete = []
|
analyzer = self.token_analysis.analysis.get('@postcode')
|
||||||
|
|
||||||
with self.conn.cursor() as cur:
|
with self.conn.cursor() as cur:
|
||||||
# This finds us the rows in location_postcode and word that are
|
# First get all postcode names currently in the word table.
|
||||||
# missing in the other table.
|
cur.execute("SELECT DISTINCT word FROM word WHERE type = 'P'")
|
||||||
cur.execute("""SELECT * FROM
|
word_entries = set((entry[0] for entry in cur))
|
||||||
(SELECT pc, word FROM
|
|
||||||
(SELECT distinct(postcode) as pc FROM location_postcode) p
|
|
||||||
FULL JOIN
|
|
||||||
(SELECT word FROM word WHERE type = 'P') w
|
|
||||||
ON pc = word) x
|
|
||||||
WHERE pc is null or word is null""")
|
|
||||||
|
|
||||||
with CopyBuffer() as copystr:
|
# Then compute the required postcode names from the postcode table.
|
||||||
for postcode, word in cur:
|
needed_entries = set()
|
||||||
if postcode is None:
|
cur.execute("SELECT country_code, postcode FROM location_postcode")
|
||||||
to_delete.append(word)
|
for cc, postcode in cur:
|
||||||
else:
|
info = PlaceInfo({'country_code': cc,
|
||||||
copystr.add(self._search_normalized(postcode),
|
'class': 'place', 'type': 'postcode',
|
||||||
'P', postcode)
|
'address': {'postcode': postcode}})
|
||||||
|
address = self.sanitizer.process_names(info)[1]
|
||||||
|
for place in address:
|
||||||
|
if place.kind == 'postcode':
|
||||||
|
if analyzer is None:
|
||||||
|
postcode_name = place.name.strip().upper()
|
||||||
|
variant_base = None
|
||||||
|
else:
|
||||||
|
postcode_name = analyzer.normalize(place.name)
|
||||||
|
variant_base = place.get_attr("variant")
|
||||||
|
|
||||||
|
if variant_base:
|
||||||
|
needed_entries.add(f'{postcode_name}@{variant_base}')
|
||||||
|
else:
|
||||||
|
needed_entries.add(postcode_name)
|
||||||
|
break
|
||||||
|
|
||||||
|
# Now update the word table.
|
||||||
|
self._delete_unused_postcode_words(word_entries - needed_entries)
|
||||||
|
self._add_missing_postcode_words(needed_entries - word_entries)
|
||||||
|
|
||||||
|
def _delete_unused_postcode_words(self, tokens):
    """ Remove the given postcode names from the word table.

        Only rows of type 'P' whose `word` is listed in `tokens` are
        deleted. Nothing is sent to the database when `tokens` is empty.
    """
    if not tokens:
        return

    with self.conn.cursor() as cur:
        obsolete = list(tokens)
        cur.execute("DELETE FROM word WHERE type = 'P' and word = any(%s)",
                    (obsolete, ))
|
||||||
|
|
||||||
|
def _add_missing_postcode_words(self, tokens):
|
||||||
|
if not tokens:
|
||||||
|
return
|
||||||
|
|
||||||
|
analyzer = self.token_analysis.analysis.get('@postcode')
|
||||||
|
terms = []
|
||||||
|
|
||||||
|
for postcode_name in tokens:
|
||||||
|
if '@' in postcode_name:
|
||||||
|
term, variant = postcode_name.split('@', 2)
|
||||||
|
term = self._search_normalized(term)
|
||||||
|
variants = {term}
|
||||||
|
if analyzer is not None:
|
||||||
|
variants.update(analyzer.get_variants_ascii(variant))
|
||||||
|
variants = list(variants)
|
||||||
|
else:
|
||||||
|
variants = [self._search_normalized(postcode_name)]
|
||||||
|
terms.append((postcode_name, variants))
|
||||||
|
|
||||||
|
if terms:
|
||||||
|
with self.conn.cursor() as cur:
|
||||||
|
cur.execute_values("""SELECT create_postcode_word(pc, var)
|
||||||
|
FROM (VALUES %s) AS v(pc, var)""",
|
||||||
|
terms)
|
||||||
|
|
||||||
if to_delete:
|
|
||||||
cur.execute("""DELETE FROM WORD
|
|
||||||
WHERE type ='P' and word = any(%s)
|
|
||||||
""", (to_delete, ))
|
|
||||||
|
|
||||||
copystr.copy_out(cur, 'word',
|
|
||||||
columns=['word_token', 'type', 'word'])
|
|
||||||
|
|
||||||
|
|
||||||
def update_special_phrases(self, phrases, should_replace):
|
def update_special_phrases(self, phrases, should_replace):
|
||||||
@@ -616,7 +655,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
|
|||||||
postcode_name = analyzer.normalize(item.name)
|
postcode_name = analyzer.normalize(item.name)
|
||||||
variant_base = item.get_attr("variant")
|
variant_base = item.get_attr("variant")
|
||||||
|
|
||||||
if variant_base is not None:
|
if variant_base:
|
||||||
postcode = f'{postcode_name}@{variant_base}'
|
postcode = f'{postcode_name}@{variant_base}'
|
||||||
else:
|
else:
|
||||||
postcode = postcode_name
|
postcode = postcode_name
|
||||||
@@ -627,7 +666,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
variants = {term}
|
variants = {term}
|
||||||
if analyzer is not None and variant_base is not None:
|
if analyzer is not None and variant_base:
|
||||||
variants.update(analyzer.get_variants_ascii(variant_base))
|
variants.update(analyzer.get_variants_ascii(variant_base))
|
||||||
|
|
||||||
with self.conn.cursor() as cur:
|
with self.conn.cursor() as cur:
|
||||||
|
|||||||
@@ -25,8 +25,18 @@ def create(normalizer, transliterator, config): # pylint: disable=W0613
|
|||||||
"""
|
"""
|
||||||
return PostcodeTokenAnalysis(normalizer, transliterator)
|
return PostcodeTokenAnalysis(normalizer, transliterator)
|
||||||
|
|
||||||
|
|
||||||
class PostcodeTokenAnalysis:
|
class PostcodeTokenAnalysis:
|
||||||
""" Detects common housenumber patterns and normalizes them.
|
""" Special normalization and variant generation for postcodes.
|
||||||
|
|
||||||
|
This analyser must not be used with anything but postcodes as
|
||||||
|
it follows some special rules: `normalize` doesn't necessarily
|
||||||
|
need to return a standard form as per normalization rules. It
|
||||||
|
needs to return the canonical form of the postcode that is also
|
||||||
|
used for output. `get_variants_ascii` then needs to ensure that
|
||||||
|
the generated variants once more follow the standard normalization
|
||||||
|
and transliteration, so that postcodes are correctly recognised by
|
||||||
|
the search algorithm.
|
||||||
"""
|
"""
|
||||||
def __init__(self, norm, trans):
|
def __init__(self, norm, trans):
|
||||||
self.norm = norm
|
self.norm = norm
|
||||||
@@ -44,11 +54,12 @@ class PostcodeTokenAnalysis:
|
|||||||
def get_variants_ascii(self, norm_name):
|
def get_variants_ascii(self, norm_name):
|
||||||
""" Compute the spelling variants for the given normalized postcode.
|
""" Compute the spelling variants for the given normalized postcode.
|
||||||
|
|
||||||
The official form creates one variant. If a 'lookup version' is
|
Takes the canonical form of the postcode, normalizes it using the
|
||||||
given, then it will create variants with optional spaces.
|
standard rules and then creates variants of the result where
|
||||||
|
all spaces are optional.
|
||||||
"""
|
"""
|
||||||
# Postcodes follow their own transliteration rules.
|
# Postcodes follow their own transliteration rules.
|
||||||
# Make sure at this point, that the terms are normalized in a way
|
# Make sure at this point, that the terms are normalized in a way
|
||||||
# that they are searchable with the standard transliteration rules.
|
# that they are searchable with the standard transliteration rules.
|
||||||
return [self.trans.transliterate(term) for term in
|
return [self.trans.transliterate(term) for term in
|
||||||
self.mutator.generate([self.norm.transliterate(norm_name)])]
|
self.mutator.generate([self.norm.transliterate(norm_name)]) if term]
|
||||||
|
|||||||
@@ -18,13 +18,18 @@ from nominatim.tokenizer import factory as tokenizer_factory
|
|||||||
def check_database_integrity(context):
|
def check_database_integrity(context):
|
||||||
""" Check some generic constraints on the tables.
|
""" Check some generic constraints on the tables.
|
||||||
"""
|
"""
|
||||||
# place_addressline should not have duplicate (place_id, address_place_id)
|
with context.db.cursor() as cur:
|
||||||
cur = context.db.cursor()
|
# place_addressline should not have duplicate (place_id, address_place_id)
|
||||||
cur.execute("""SELECT count(*) FROM
|
cur.execute("""SELECT count(*) FROM
|
||||||
(SELECT place_id, address_place_id, count(*) as c
|
(SELECT place_id, address_place_id, count(*) as c
|
||||||
FROM place_addressline GROUP BY place_id, address_place_id) x
|
FROM place_addressline GROUP BY place_id, address_place_id) x
|
||||||
WHERE c > 1""")
|
WHERE c > 1""")
|
||||||
assert cur.fetchone()[0] == 0, "Duplicates found in place_addressline"
|
assert cur.fetchone()[0] == 0, "Duplicates found in place_addressline"
|
||||||
|
|
||||||
|
# word table must not have empty word_tokens
|
||||||
|
cur.execute("SELECT count(*) FROM word WHERE word_token = ''")
|
||||||
|
assert cur.fetchone()[0] == 0, "Empty word tokens found in word table"
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
################################ GIVEN ##################################
|
################################ GIVEN ##################################
|
||||||
|
|||||||
@@ -72,7 +72,8 @@ def analyzer(tokenizer_factory, test_config, monkeypatch,
|
|||||||
|
|
||||||
def _mk_analyser(norm=("[[:Punctuation:][:Space:]]+ > ' '",), trans=(':: upper()',),
|
def _mk_analyser(norm=("[[:Punctuation:][:Space:]]+ > ' '",), trans=(':: upper()',),
|
||||||
variants=('~gasse -> gasse', 'street => st', ),
|
variants=('~gasse -> gasse', 'street => st', ),
|
||||||
sanitizers=[], with_housenumber=False):
|
sanitizers=[], with_housenumber=False,
|
||||||
|
with_postcode=False):
|
||||||
cfgstr = {'normalization': list(norm),
|
cfgstr = {'normalization': list(norm),
|
||||||
'sanitizers': sanitizers,
|
'sanitizers': sanitizers,
|
||||||
'transliteration': list(trans),
|
'transliteration': list(trans),
|
||||||
@@ -81,6 +82,9 @@ def analyzer(tokenizer_factory, test_config, monkeypatch,
|
|||||||
if with_housenumber:
|
if with_housenumber:
|
||||||
cfgstr['token-analysis'].append({'id': '@housenumber',
|
cfgstr['token-analysis'].append({'id': '@housenumber',
|
||||||
'analyzer': 'housenumbers'})
|
'analyzer': 'housenumbers'})
|
||||||
|
if with_postcode:
|
||||||
|
cfgstr['token-analysis'].append({'id': '@postcode',
|
||||||
|
'analyzer': 'postcodes'})
|
||||||
(test_config.project_dir / 'icu_tokenizer.yaml').write_text(yaml.dump(cfgstr))
|
(test_config.project_dir / 'icu_tokenizer.yaml').write_text(yaml.dump(cfgstr))
|
||||||
tok.loader = nominatim.tokenizer.icu_rule_loader.ICURuleLoader(test_config)
|
tok.loader = nominatim.tokenizer.icu_rule_loader.ICURuleLoader(test_config)
|
||||||
|
|
||||||
@@ -246,28 +250,69 @@ def test_normalize_postcode(analyzer):
|
|||||||
anl.normalize_postcode('38 Б') == '38 Б'
|
anl.normalize_postcode('38 Б') == '38 Б'
|
||||||
|
|
||||||
|
|
||||||
def test_update_postcodes_from_db_empty(analyzer, table_factory, word_table):
|
class TestPostcodes:
|
||||||
table_factory('location_postcode', 'postcode TEXT',
|
|
||||||
content=(('1234',), ('12 34',), ('AB23',), ('1234',)))
|
|
||||||
|
|
||||||
with analyzer() as anl:
|
@pytest.fixture(autouse=True)
|
||||||
anl.update_postcodes_from_db()
|
def setup(self, analyzer, sql_functions):
|
||||||
|
sanitizers = [{'step': 'clean-postcodes'}]
|
||||||
assert word_table.count() == 3
|
with analyzer(sanitizers=sanitizers, with_postcode=True) as anl:
|
||||||
assert word_table.get_postcodes() == {'1234', '12 34', 'AB23'}
|
self.analyzer = anl
|
||||||
|
yield anl
|
||||||
|
|
||||||
|
|
||||||
def test_update_postcodes_from_db_add_and_remove(analyzer, table_factory, word_table):
|
def process_postcode(self, cc, postcode):
|
||||||
table_factory('location_postcode', 'postcode TEXT',
|
return self.analyzer.process_place(PlaceInfo({'country_code': cc,
|
||||||
content=(('1234',), ('45BC', ), ('XX45', )))
|
'address': {'postcode': postcode}}))
|
||||||
word_table.add_postcode(' 1234', '1234')
|
|
||||||
word_table.add_postcode(' 5678', '5678')
|
|
||||||
|
|
||||||
with analyzer() as anl:
|
|
||||||
anl.update_postcodes_from_db()
|
|
||||||
|
|
||||||
assert word_table.count() == 3
|
def test_update_postcodes_from_db_empty(self, table_factory, word_table):
|
||||||
assert word_table.get_postcodes() == {'1234', '45BC', 'XX45'}
|
table_factory('location_postcode', 'country_code TEXT, postcode TEXT',
|
||||||
|
content=(('de', '12345'), ('se', '132 34'),
|
||||||
|
('bm', 'AB23'), ('fr', '12345')))
|
||||||
|
|
||||||
|
self.analyzer.update_postcodes_from_db()
|
||||||
|
|
||||||
|
assert word_table.count() == 5
|
||||||
|
assert word_table.get_postcodes() == {'12345', '132 34@132 34', 'AB 23@AB 23'}
|
||||||
|
|
||||||
|
|
||||||
|
def test_update_postcodes_from_db_ambigious(self, table_factory, word_table):
    """ The postcode '123456' listed for both 'in' and 'sg' must end up
        as the two names '123456' and '123456@123 456' (three word rows).
    """
    rows = (('in', '123456'), ('sg', '123456'))
    table_factory('location_postcode', 'country_code TEXT, postcode TEXT',
                  content=rows)

    self.analyzer.update_postcodes_from_db()

    assert word_table.count() == 3
    expected = {'123456', '123456@123 456'}
    assert word_table.get_postcodes() == expected
|
||||||
|
|
||||||
|
|
||||||
|
def test_update_postcodes_from_db_add_and_remove(self, table_factory, word_table):
    """ Pre-existing word entries that are no longer needed ('5678') must
        disappear, while entries for newly listed postcodes are created.
    """
    rows = (('ch', '1234'), ('bm', 'BC 45'), ('bm', 'XX45'))
    table_factory('location_postcode', 'country_code TEXT, postcode TEXT',
                  content=rows)
    for token, postcode in ((' 1234', '1234'), (' 5678', '5678')):
        word_table.add_postcode(token, postcode)

    self.analyzer.update_postcodes_from_db()

    assert word_table.count() == 5
    expected = {'1234', 'BC 45@BC 45', 'XX 45@XX 45'}
    assert word_table.get_postcodes() == expected
|
||||||
|
|
||||||
|
|
||||||
|
def test_process_place_postcode_simple(self, word_table):
    """ Processing the postcode '12345' for 'de' returns it unchanged and
        leaves exactly that postcode name in the word table.
    """
    result = self.process_postcode('de', '12345')

    assert result['postcode'] == '12345'

    stored = word_table.get_postcodes()
    assert stored == {'12345', }
|
||||||
|
|
||||||
|
|
||||||
|
def test_process_place_postcode_with_space(self, word_table):
    """ Processing '123 567' for 'in' returns '123567' and stores the
        postcode name '123567@123 567' in the word table.
    """
    result = self.process_postcode('in', '123 567')

    assert result['postcode'] == '123567'

    stored = word_table.get_postcodes()
    assert stored == {'123567@123 567', }
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def test_update_special_phrase_empty_table(analyzer, word_table):
|
def test_update_special_phrase_empty_table(analyzer, word_table):
|
||||||
|
|||||||
@@ -0,0 +1,60 @@
|
|||||||
|
# SPDX-License-Identifier: GPL-2.0-only
|
||||||
|
#
|
||||||
|
# This file is part of Nominatim. (https://nominatim.org)
|
||||||
|
#
|
||||||
|
# Copyright (C) 2022 by the Nominatim developer community.
|
||||||
|
# For a full list of authors see the git log.
|
||||||
|
"""
|
||||||
|
Tests for special postcode analysis and variant generation.
|
||||||
|
"""
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from icu import Transliterator
|
||||||
|
|
||||||
|
import nominatim.tokenizer.token_analysis.postcodes as module
|
||||||
|
from nominatim.errors import UsageError
|
||||||
|
|
||||||
|
DEFAULT_NORMALIZATION = """ :: NFD ();
|
||||||
|
'🜳' > ' ';
|
||||||
|
[[:Nonspacing Mark:] [:Cf:]] >;
|
||||||
|
:: lower ();
|
||||||
|
[[:Punctuation:][:Space:]]+ > ' ';
|
||||||
|
:: NFC ();
|
||||||
|
"""
|
||||||
|
|
||||||
|
DEFAULT_TRANSLITERATION = """ :: Latin ();
|
||||||
|
'🜵' > ' ';
|
||||||
|
"""
|
||||||
|
|
||||||
|
@pytest.fixture
def analyser():
    """ Provide a postcode analysis object created from the module's
        default test normalization and transliteration rules.
    """
    config = module.configure({ 'analyzer': 'postcodes'}, DEFAULT_NORMALIZATION)

    transliterator = Transliterator.createFromRules("test_trans",
                                                    DEFAULT_TRANSLITERATION)
    normalizer = Transliterator.createFromRules("test_norm",
                                                DEFAULT_NORMALIZATION)

    return module.create(normalizer, transliterator, config)
|
||||||
|
|
||||||
|
|
||||||
|
def get_normalized_variants(proc, name):
    """ Return the variants `proc` produces for `name` after the name was
        run through the default normalization rules and stripped of
        surrounding whitespace.
    """
    normalizer = Transliterator.createFromRules("test_norm", DEFAULT_NORMALIZATION)
    normalized_name = normalizer.transliterate(name).strip()

    return proc.get_variants_ascii(normalized_name)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('name,norm', [
    ('12', '12'),
    ('A 34 ', 'A 34'),
    ('34-av', '34-AV'),
])
def test_normalize(analyser, name, norm):
    """ `normalize` must return the expected canonical form for each
        sample postcode.
    """
    result = analyser.normalize(name)

    assert result == norm
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('postcode,variants', [
    ('12345', {'12345'}),
    ('AB-998', {'ab 998', 'ab998'}),
    ('23 FGH D3', {'23 fgh d3', '23fgh d3',
                   '23 fghd3', '23fghd3'}),
])
def test_get_variants_ascii(analyser, postcode, variants):
    """ `get_variants_ascii` must return exactly the expected variants
        and must not list any variant twice.
    """
    out = analyser.get_variants_ascii(postcode)
    unique = set(out)

    assert len(out) == len(unique)
    assert unique == variants
|
||||||
Reference in New Issue
Block a user