mirror of
https://github.com/osm-search/Nominatim.git
synced 2026-02-26 11:08:13 +00:00
remove postcode computation for word table during import
This commit is contained in:
@@ -381,76 +381,15 @@ class ICUNameAnalyzer(AbstractAnalyzer):
|
|||||||
return postcode.strip().upper()
|
return postcode.strip().upper()
|
||||||
|
|
||||||
def update_postcodes_from_db(self) -> None:
|
def update_postcodes_from_db(self) -> None:
|
||||||
""" Update postcode tokens in the word table from the location_postcode
|
""" Postcode update.
|
||||||
table.
|
|
||||||
|
Removes all postcodes from the word table because they are not
|
||||||
|
needed. Postcodes are recognised by pattern.
|
||||||
"""
|
"""
|
||||||
assert self.conn is not None
|
assert self.conn is not None
|
||||||
analyzer = self.token_analysis.analysis.get('@postcode')
|
|
||||||
|
|
||||||
with self.conn.cursor() as cur:
|
with self.conn.cursor() as cur:
|
||||||
# First get all postcode names currently in the word table.
|
cur.execute("DELETE FROM word WHERE type = 'P'")
|
||||||
cur.execute("SELECT DISTINCT word FROM word WHERE type = 'P'")
|
|
||||||
word_entries = set((entry[0] for entry in cur))
|
|
||||||
|
|
||||||
# Then compute the required postcode names from the postcode table.
|
|
||||||
needed_entries = set()
|
|
||||||
cur.execute("SELECT country_code, postcode FROM location_postcode")
|
|
||||||
for cc, postcode in cur:
|
|
||||||
info = PlaceInfo({'country_code': cc,
|
|
||||||
'class': 'place', 'type': 'postcode',
|
|
||||||
'address': {'postcode': postcode}})
|
|
||||||
address = self.sanitizer.process_names(info)[1]
|
|
||||||
for place in address:
|
|
||||||
if place.kind == 'postcode':
|
|
||||||
if analyzer is None:
|
|
||||||
postcode_name = place.name.strip().upper()
|
|
||||||
variant_base = None
|
|
||||||
else:
|
|
||||||
postcode_name = analyzer.get_canonical_id(place)
|
|
||||||
variant_base = place.get_attr("variant")
|
|
||||||
|
|
||||||
if variant_base:
|
|
||||||
needed_entries.add(f'{postcode_name}@{variant_base}')
|
|
||||||
else:
|
|
||||||
needed_entries.add(postcode_name)
|
|
||||||
break
|
|
||||||
|
|
||||||
# Now update the word table.
|
|
||||||
self._delete_unused_postcode_words(word_entries - needed_entries)
|
|
||||||
self._add_missing_postcode_words(needed_entries - word_entries)
|
|
||||||
|
|
||||||
def _delete_unused_postcode_words(self, tokens: Iterable[str]) -> None:
|
|
||||||
assert self.conn is not None
|
|
||||||
if tokens:
|
|
||||||
with self.conn.cursor() as cur:
|
|
||||||
cur.execute("DELETE FROM word WHERE type = 'P' and word = any(%s)",
|
|
||||||
(list(tokens), ))
|
|
||||||
|
|
||||||
def _add_missing_postcode_words(self, tokens: Iterable[str]) -> None:
|
|
||||||
assert self.conn is not None
|
|
||||||
if not tokens:
|
|
||||||
return
|
|
||||||
|
|
||||||
analyzer = self.token_analysis.analysis.get('@postcode')
|
|
||||||
terms = []
|
|
||||||
|
|
||||||
for postcode_name in tokens:
|
|
||||||
if '@' in postcode_name:
|
|
||||||
term, variant = postcode_name.split('@', 2)
|
|
||||||
term = self._search_normalized(term)
|
|
||||||
if analyzer is None:
|
|
||||||
variants = [term]
|
|
||||||
else:
|
|
||||||
variants = analyzer.compute_variants(variant)
|
|
||||||
if term not in variants:
|
|
||||||
variants.append(term)
|
|
||||||
else:
|
|
||||||
variants = [self._search_normalized(postcode_name)]
|
|
||||||
terms.append((postcode_name, variants))
|
|
||||||
|
|
||||||
if terms:
|
|
||||||
with self.conn.cursor() as cur:
|
|
||||||
cur.executemany("""SELECT create_postcode_word(%s, %s)""", terms)
|
|
||||||
|
|
||||||
def update_special_phrases(self, phrases: Iterable[Tuple[str, str, str, str]],
|
def update_special_phrases(self, phrases: Iterable[Tuple[str, str, str, str]],
|
||||||
should_replace: bool) -> None:
|
should_replace: bool) -> None:
|
||||||
@@ -718,32 +657,9 @@ class ICUNameAnalyzer(AbstractAnalyzer):
|
|||||||
analyzer = self.token_analysis.analysis.get('@postcode')
|
analyzer = self.token_analysis.analysis.get('@postcode')
|
||||||
|
|
||||||
if analyzer is None:
|
if analyzer is None:
|
||||||
postcode_name = item.name.strip().upper()
|
return item.name.strip().upper()
|
||||||
variant_base = None
|
|
||||||
else:
|
else:
|
||||||
postcode_name = analyzer.get_canonical_id(item)
|
return analyzer.get_canonical_id(item)
|
||||||
variant_base = item.get_attr("variant")
|
|
||||||
|
|
||||||
if variant_base:
|
|
||||||
postcode = f'{postcode_name}@{variant_base}'
|
|
||||||
else:
|
|
||||||
postcode = postcode_name
|
|
||||||
|
|
||||||
if postcode not in self._cache.postcodes:
|
|
||||||
term = self._search_normalized(postcode_name)
|
|
||||||
if not term:
|
|
||||||
return None
|
|
||||||
|
|
||||||
variants = {term}
|
|
||||||
if analyzer is not None and variant_base:
|
|
||||||
variants.update(analyzer.compute_variants(variant_base))
|
|
||||||
|
|
||||||
with self.conn.cursor() as cur:
|
|
||||||
cur.execute("SELECT create_postcode_word(%s, %s)",
|
|
||||||
(postcode, list(variants)))
|
|
||||||
self._cache.postcodes.add(postcode)
|
|
||||||
|
|
||||||
return postcode_name
|
|
||||||
|
|
||||||
|
|
||||||
class _TokenInfo:
|
class _TokenInfo:
|
||||||
@@ -836,5 +752,4 @@ class _TokenCache:
|
|||||||
self.names: Dict[str, Tuple[int, List[int]]] = {}
|
self.names: Dict[str, Tuple[int, List[int]]] = {}
|
||||||
self.partials: Dict[str, int] = {}
|
self.partials: Dict[str, int] = {}
|
||||||
self.fulls: Dict[str, List[int]] = {}
|
self.fulls: Dict[str, List[int]] = {}
|
||||||
self.postcodes: Set[str] = set()
|
|
||||||
self.housenumbers: Dict[str, Tuple[Optional[int], Optional[str]]] = {}
|
self.housenumbers: Dict[str, Tuple[Optional[int], Optional[str]]] = {}
|
||||||
|
|||||||
@@ -102,12 +102,11 @@ async def test_splitting_in_transliteration(conn):
|
|||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
@pytest.mark.parametrize('term,order', [('23456', ['P', 'H', 'W', 'w']),
|
@pytest.mark.parametrize('term,order', [('23456', ['P', 'H', 'W', 'w']),
|
||||||
('3', ['H', 'P', 'W', 'w'])
|
('3', ['H', 'W', 'w'])
|
||||||
])
|
])
|
||||||
async def test_penalty_postcodes_and_housenumbers(conn, term, order):
|
async def test_penalty_postcodes_and_housenumbers(conn, term, order):
|
||||||
ana = await tok.create_query_analyzer(conn)
|
ana = await tok.create_query_analyzer(conn)
|
||||||
|
|
||||||
await add_word(conn, 1, term, 'P', None)
|
|
||||||
await add_word(conn, 2, term, 'H', term)
|
await add_word(conn, 2, term, 'H', term)
|
||||||
await add_word(conn, 3, term, 'w', term)
|
await add_word(conn, 3, term, 'w', term)
|
||||||
await add_word(conn, 4, term, 'W', term)
|
await add_word(conn, 4, term, 'W', term)
|
||||||
@@ -179,8 +178,10 @@ async def test_add_unknown_housenumbers(conn):
|
|||||||
assert query.nodes[1].starting[0].ttype == qmod.TOKEN_HOUSENUMBER
|
assert query.nodes[1].starting[0].ttype == qmod.TOKEN_HOUSENUMBER
|
||||||
assert len(query.nodes[1].starting[0].tokens) == 1
|
assert len(query.nodes[1].starting[0].tokens) == 1
|
||||||
assert query.nodes[1].starting[0].tokens[0].token == 1
|
assert query.nodes[1].starting[0].tokens[0].token == 1
|
||||||
assert not query.nodes[2].starting
|
assert query.nodes[2].has_tokens(3, qmod.TOKEN_POSTCODE)
|
||||||
assert not query.nodes[3].starting
|
assert not query.nodes[2].has_tokens(3, qmod.TOKEN_HOUSENUMBER)
|
||||||
|
assert not query.nodes[2].has_tokens(4, qmod.TOKEN_HOUSENUMBER)
|
||||||
|
assert not query.nodes[3].has_tokens(4, qmod.TOKEN_HOUSENUMBER)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
|
|||||||
@@ -265,37 +265,13 @@ class TestPostcodes:
|
|||||||
'address': {'postcode': postcode}}))
|
'address': {'postcode': postcode}}))
|
||||||
|
|
||||||
|
|
||||||
def test_update_postcodes_from_db_empty(self, table_factory, word_table):
|
def test_update_postcodes_deleted(self, word_table):
|
||||||
table_factory('location_postcode', 'country_code TEXT, postcode TEXT',
|
|
||||||
content=(('de', '12345'), ('se', '132 34'),
|
|
||||||
('bm', 'AB23'), ('fr', '12345')))
|
|
||||||
|
|
||||||
self.analyzer.update_postcodes_from_db()
|
|
||||||
|
|
||||||
assert word_table.count() == 5
|
|
||||||
assert word_table.get_postcodes() == {'12345', '132 34@132 34', 'AB 23@AB 23'}
|
|
||||||
|
|
||||||
|
|
||||||
def test_update_postcodes_from_db_ambigious(self, table_factory, word_table):
|
|
||||||
table_factory('location_postcode', 'country_code TEXT, postcode TEXT',
|
|
||||||
content=(('in', '123456'), ('sg', '123456')))
|
|
||||||
|
|
||||||
self.analyzer.update_postcodes_from_db()
|
|
||||||
|
|
||||||
assert word_table.count() == 3
|
|
||||||
assert word_table.get_postcodes() == {'123456', '123456@123 456'}
|
|
||||||
|
|
||||||
|
|
||||||
def test_update_postcodes_from_db_add_and_remove(self, table_factory, word_table):
|
|
||||||
table_factory('location_postcode', 'country_code TEXT, postcode TEXT',
|
|
||||||
content=(('ch', '1234'), ('bm', 'BC 45'), ('bm', 'XX45')))
|
|
||||||
word_table.add_postcode(' 1234', '1234')
|
word_table.add_postcode(' 1234', '1234')
|
||||||
word_table.add_postcode(' 5678', '5678')
|
word_table.add_postcode(' 5678', '5678')
|
||||||
|
|
||||||
self.analyzer.update_postcodes_from_db()
|
self.analyzer.update_postcodes_from_db()
|
||||||
|
|
||||||
assert word_table.count() == 5
|
assert word_table.count() == 0
|
||||||
assert word_table.get_postcodes() == {'1234', 'BC 45@BC 45', 'XX 45@XX 45'}
|
|
||||||
|
|
||||||
|
|
||||||
def test_process_place_postcode_simple(self, word_table):
|
def test_process_place_postcode_simple(self, word_table):
|
||||||
@@ -303,16 +279,12 @@ class TestPostcodes:
|
|||||||
|
|
||||||
assert info['postcode'] == '12345'
|
assert info['postcode'] == '12345'
|
||||||
|
|
||||||
assert word_table.get_postcodes() == {'12345', }
|
|
||||||
|
|
||||||
|
|
||||||
def test_process_place_postcode_with_space(self, word_table):
|
def test_process_place_postcode_with_space(self, word_table):
|
||||||
info = self.process_postcode('in', '123 567')
|
info = self.process_postcode('in', '123 567')
|
||||||
|
|
||||||
assert info['postcode'] == '123567'
|
assert info['postcode'] == '123567'
|
||||||
|
|
||||||
assert word_table.get_postcodes() == {'123567@123 567', }
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def test_update_special_phrase_empty_table(analyzer, word_table):
|
def test_update_special_phrase_empty_table(analyzer, word_table):
|
||||||
@@ -477,9 +449,9 @@ class TestPlaceAddress:
|
|||||||
|
|
||||||
@pytest.mark.parametrize('pcode', ['12345', 'AB 123', '34-345'])
|
@pytest.mark.parametrize('pcode', ['12345', 'AB 123', '34-345'])
|
||||||
def test_process_place_postcode(self, word_table, pcode):
|
def test_process_place_postcode(self, word_table, pcode):
|
||||||
self.process_address(postcode=pcode)
|
info = self.process_address(postcode=pcode)
|
||||||
|
|
||||||
assert word_table.get_postcodes() == {pcode, }
|
assert info['postcode'] == pcode
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('hnr', ['123a', '1', '101'])
|
@pytest.mark.parametrize('hnr', ['123a', '1', '101'])
|
||||||
|
|||||||
Reference in New Issue
Block a user