remove postcode computation for word table during import

This commit is contained in:
Sarah Hoffmann
2025-03-01 10:20:33 +01:00
parent b2af358f66
commit a574b98e4a
3 changed files with 16 additions and 128 deletions

View File

@@ -381,76 +381,15 @@ class ICUNameAnalyzer(AbstractAnalyzer):
return postcode.strip().upper() return postcode.strip().upper()
def update_postcodes_from_db(self) -> None: def update_postcodes_from_db(self) -> None:
""" Update postcode tokens in the word table from the location_postcode """ Postcode update.
table.
Removes all postcodes from the word table because they are not
needed. Postcodes are recognised by pattern.
""" """
assert self.conn is not None assert self.conn is not None
analyzer = self.token_analysis.analysis.get('@postcode')
with self.conn.cursor() as cur: with self.conn.cursor() as cur:
# First get all postcode names currently in the word table. cur.execute("DELETE FROM word WHERE type = 'P'")
cur.execute("SELECT DISTINCT word FROM word WHERE type = 'P'")
word_entries = set((entry[0] for entry in cur))
# Then compute the required postcode names from the postcode table.
needed_entries = set()
cur.execute("SELECT country_code, postcode FROM location_postcode")
for cc, postcode in cur:
info = PlaceInfo({'country_code': cc,
'class': 'place', 'type': 'postcode',
'address': {'postcode': postcode}})
address = self.sanitizer.process_names(info)[1]
for place in address:
if place.kind == 'postcode':
if analyzer is None:
postcode_name = place.name.strip().upper()
variant_base = None
else:
postcode_name = analyzer.get_canonical_id(place)
variant_base = place.get_attr("variant")
if variant_base:
needed_entries.add(f'{postcode_name}@{variant_base}')
else:
needed_entries.add(postcode_name)
break
# Now update the word table.
self._delete_unused_postcode_words(word_entries - needed_entries)
self._add_missing_postcode_words(needed_entries - word_entries)
def _delete_unused_postcode_words(self, tokens: Iterable[str]) -> None:
assert self.conn is not None
if tokens:
with self.conn.cursor() as cur:
cur.execute("DELETE FROM word WHERE type = 'P' and word = any(%s)",
(list(tokens), ))
def _add_missing_postcode_words(self, tokens: Iterable[str]) -> None:
assert self.conn is not None
if not tokens:
return
analyzer = self.token_analysis.analysis.get('@postcode')
terms = []
for postcode_name in tokens:
if '@' in postcode_name:
term, variant = postcode_name.split('@', 2)
term = self._search_normalized(term)
if analyzer is None:
variants = [term]
else:
variants = analyzer.compute_variants(variant)
if term not in variants:
variants.append(term)
else:
variants = [self._search_normalized(postcode_name)]
terms.append((postcode_name, variants))
if terms:
with self.conn.cursor() as cur:
cur.executemany("""SELECT create_postcode_word(%s, %s)""", terms)
def update_special_phrases(self, phrases: Iterable[Tuple[str, str, str, str]], def update_special_phrases(self, phrases: Iterable[Tuple[str, str, str, str]],
should_replace: bool) -> None: should_replace: bool) -> None:
@@ -718,32 +657,9 @@ class ICUNameAnalyzer(AbstractAnalyzer):
analyzer = self.token_analysis.analysis.get('@postcode') analyzer = self.token_analysis.analysis.get('@postcode')
if analyzer is None: if analyzer is None:
postcode_name = item.name.strip().upper() return item.name.strip().upper()
variant_base = None
else: else:
postcode_name = analyzer.get_canonical_id(item) return analyzer.get_canonical_id(item)
variant_base = item.get_attr("variant")
if variant_base:
postcode = f'{postcode_name}@{variant_base}'
else:
postcode = postcode_name
if postcode not in self._cache.postcodes:
term = self._search_normalized(postcode_name)
if not term:
return None
variants = {term}
if analyzer is not None and variant_base:
variants.update(analyzer.compute_variants(variant_base))
with self.conn.cursor() as cur:
cur.execute("SELECT create_postcode_word(%s, %s)",
(postcode, list(variants)))
self._cache.postcodes.add(postcode)
return postcode_name
class _TokenInfo: class _TokenInfo:
@@ -836,5 +752,4 @@ class _TokenCache:
self.names: Dict[str, Tuple[int, List[int]]] = {} self.names: Dict[str, Tuple[int, List[int]]] = {}
self.partials: Dict[str, int] = {} self.partials: Dict[str, int] = {}
self.fulls: Dict[str, List[int]] = {} self.fulls: Dict[str, List[int]] = {}
self.postcodes: Set[str] = set()
self.housenumbers: Dict[str, Tuple[Optional[int], Optional[str]]] = {} self.housenumbers: Dict[str, Tuple[Optional[int], Optional[str]]] = {}

View File

@@ -102,12 +102,11 @@ async def test_splitting_in_transliteration(conn):
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize('term,order', [('23456', ['P', 'H', 'W', 'w']), @pytest.mark.parametrize('term,order', [('23456', ['P', 'H', 'W', 'w']),
('3', ['H', 'P', 'W', 'w']) ('3', ['H', 'W', 'w'])
]) ])
async def test_penalty_postcodes_and_housenumbers(conn, term, order): async def test_penalty_postcodes_and_housenumbers(conn, term, order):
ana = await tok.create_query_analyzer(conn) ana = await tok.create_query_analyzer(conn)
await add_word(conn, 1, term, 'P', None)
await add_word(conn, 2, term, 'H', term) await add_word(conn, 2, term, 'H', term)
await add_word(conn, 3, term, 'w', term) await add_word(conn, 3, term, 'w', term)
await add_word(conn, 4, term, 'W', term) await add_word(conn, 4, term, 'W', term)
@@ -179,8 +178,10 @@ async def test_add_unknown_housenumbers(conn):
assert query.nodes[1].starting[0].ttype == qmod.TOKEN_HOUSENUMBER assert query.nodes[1].starting[0].ttype == qmod.TOKEN_HOUSENUMBER
assert len(query.nodes[1].starting[0].tokens) == 1 assert len(query.nodes[1].starting[0].tokens) == 1
assert query.nodes[1].starting[0].tokens[0].token == 1 assert query.nodes[1].starting[0].tokens[0].token == 1
assert not query.nodes[2].starting assert query.nodes[2].has_tokens(3, qmod.TOKEN_POSTCODE)
assert not query.nodes[3].starting assert not query.nodes[2].has_tokens(3, qmod.TOKEN_HOUSENUMBER)
assert not query.nodes[2].has_tokens(4, qmod.TOKEN_HOUSENUMBER)
assert not query.nodes[3].has_tokens(4, qmod.TOKEN_HOUSENUMBER)
@pytest.mark.asyncio @pytest.mark.asyncio

View File

@@ -265,37 +265,13 @@ class TestPostcodes:
'address': {'postcode': postcode}})) 'address': {'postcode': postcode}}))
def test_update_postcodes_from_db_empty(self, table_factory, word_table): def test_update_postcodes_deleted(self, word_table):
table_factory('location_postcode', 'country_code TEXT, postcode TEXT',
content=(('de', '12345'), ('se', '132 34'),
('bm', 'AB23'), ('fr', '12345')))
self.analyzer.update_postcodes_from_db()
assert word_table.count() == 5
assert word_table.get_postcodes() == {'12345', '132 34@132 34', 'AB 23@AB 23'}
def test_update_postcodes_from_db_ambigious(self, table_factory, word_table):
table_factory('location_postcode', 'country_code TEXT, postcode TEXT',
content=(('in', '123456'), ('sg', '123456')))
self.analyzer.update_postcodes_from_db()
assert word_table.count() == 3
assert word_table.get_postcodes() == {'123456', '123456@123 456'}
def test_update_postcodes_from_db_add_and_remove(self, table_factory, word_table):
table_factory('location_postcode', 'country_code TEXT, postcode TEXT',
content=(('ch', '1234'), ('bm', 'BC 45'), ('bm', 'XX45')))
word_table.add_postcode(' 1234', '1234') word_table.add_postcode(' 1234', '1234')
word_table.add_postcode(' 5678', '5678') word_table.add_postcode(' 5678', '5678')
self.analyzer.update_postcodes_from_db() self.analyzer.update_postcodes_from_db()
assert word_table.count() == 5 assert word_table.count() == 0
assert word_table.get_postcodes() == {'1234', 'BC 45@BC 45', 'XX 45@XX 45'}
def test_process_place_postcode_simple(self, word_table): def test_process_place_postcode_simple(self, word_table):
@@ -303,16 +279,12 @@ class TestPostcodes:
assert info['postcode'] == '12345' assert info['postcode'] == '12345'
assert word_table.get_postcodes() == {'12345', }
def test_process_place_postcode_with_space(self, word_table): def test_process_place_postcode_with_space(self, word_table):
info = self.process_postcode('in', '123 567') info = self.process_postcode('in', '123 567')
assert info['postcode'] == '123567' assert info['postcode'] == '123567'
assert word_table.get_postcodes() == {'123567@123 567', }
def test_update_special_phrase_empty_table(analyzer, word_table): def test_update_special_phrase_empty_table(analyzer, word_table):
@@ -477,9 +449,9 @@ class TestPlaceAddress:
@pytest.mark.parametrize('pcode', ['12345', 'AB 123', '34-345']) @pytest.mark.parametrize('pcode', ['12345', 'AB 123', '34-345'])
def test_process_place_postcode(self, word_table, pcode): def test_process_place_postcode(self, word_table, pcode):
self.process_address(postcode=pcode) info = self.process_address(postcode=pcode)
assert word_table.get_postcodes() == {pcode, } assert info['postcode'] == pcode
@pytest.mark.parametrize('hnr', ['123a', '1', '101']) @pytest.mark.parametrize('hnr', ['123a', '1', '101'])