icu: move housenumber token computation out of TokenInfo

This was the last function to use the cache. There is a cleaner
separation of responsibilities now.
This commit is contained in:
Sarah Hoffmann
2022-02-15 21:20:47 +01:00
parent 0bb59b2e22
commit 243725aae1

View File

@@ -282,13 +282,6 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
return postcode.strip().upper() return postcode.strip().upper()
def _make_standard_hnr(self, hnr):
""" Create a normalised version of a housenumber.
This function takes minor shortcuts on transliteration.
"""
return self._search_normalized(hnr)
def update_postcodes_from_db(self): def update_postcodes_from_db(self):
""" Update postcode tokens in the word table from the location_postcode """ Update postcode tokens in the word table from the location_postcode
table. table.
@@ -456,7 +449,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
Returns a JSON-serializable structure that will be handed into Returns a JSON-serializable structure that will be handed into
the database via the token_info field. the database via the token_info field.
""" """
token_info = _TokenInfo(self._cache) token_info = _TokenInfo()
names, address = self.sanitizer.process_names(place) names, address = self.sanitizer.process_names(place)
@@ -475,6 +468,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
def _process_place_address(self, token_info, address): def _process_place_address(self, token_info, address):
hnr_tokens = set()
hnrs = set() hnrs = set()
addr_terms = [] addr_terms = []
streets = [] streets = []
@@ -482,9 +476,10 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
if item.kind == 'postcode': if item.kind == 'postcode':
self._add_postcode(item.name) self._add_postcode(item.name)
elif item.kind == 'housenumber': elif item.kind == 'housenumber':
norm_name = self._make_standard_hnr(item.name) token, hnr = self._compute_housenumber_token(item)
if norm_name: if token is not None:
hnrs.add(norm_name) hnr_tokens.add(token)
hnrs.add(hnr)
elif item.kind == 'street': elif item.kind == 'street':
streets.extend(self._retrieve_full_tokens(item.name)) streets.extend(self._retrieve_full_tokens(item.name))
elif item.kind == 'place': elif item.kind == 'place':
@@ -495,7 +490,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
addr_terms.append((item.kind, self._compute_partial_tokens(item.name))) addr_terms.append((item.kind, self._compute_partial_tokens(item.name)))
if hnrs: if hnrs:
token_info.add_housenumbers(self.conn, hnrs) token_info.add_housenumbers(hnr_tokens, hnrs)
if addr_terms: if addr_terms:
token_info.add_address_terms(addr_terms) token_info.add_address_terms(addr_terms)
@@ -504,6 +499,24 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
token_info.add_street(streets) token_info.add_street(streets)
def _compute_housenumber_token(self, hnr):
""" Normalize the housenumber and return the word token and the
canonical form.
"""
norm_name = self._search_normalized(hnr.name)
if not norm_name:
return None, None
token = self._cache.housenumbers.get(norm_name)
if token is None:
with self.conn.cursor() as cur:
cur.execute("SELECT getorcreate_hnr_id(%s)", (norm_name, ))
token = cur.fetchone()[0]
self._cache.housenumbers[norm_name] = token
return token, norm_name
def _compute_partial_tokens(self, name): def _compute_partial_tokens(self, name):
""" Normalize the given term, split it into partial words and return """ Normalize the given term, split it into partial words and return
then token list for them. then token list for them.
@@ -612,8 +625,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
class _TokenInfo: class _TokenInfo:
""" Collect token information to be sent back to the database. """ Collect token information to be sent back to the database.
""" """
def __init__(self, cache): def __init__(self):
self._cache = cache
self.data = {} self.data = {}
@staticmethod @staticmethod
@@ -627,11 +639,11 @@ class _TokenInfo:
self.data['names'] = self._mk_array(itertools.chain(fulls, partials)) self.data['names'] = self._mk_array(itertools.chain(fulls, partials))
def add_housenumbers(self, conn, hnrs): def add_housenumbers(self, tokens, hnrs):
""" Extract housenumber information from a list of normalised """ Extract housenumber information from a list of normalised
housenumbers. housenumbers.
""" """
self.data['hnr_tokens'] = self._mk_array(self._cache.get_hnr_tokens(conn, hnrs)) self.data['hnr_tokens'] = self._mk_array(tokens)
self.data['hnr'] = ';'.join(hnrs) self.data['hnr'] = ';'.join(hnrs)
@@ -670,29 +682,3 @@ class _TokenCache:
self.fulls = {} self.fulls = {}
self.postcodes = set() self.postcodes = set()
self.housenumbers = {} self.housenumbers = {}
def get_hnr_tokens(self, conn, terms):
    """ Get token ids for a list of housenumbers, looking them up in the
        database if necessary. `terms` is an iterable of normalized
        housenumbers.
    """
    tokens = []
    pending = []

    # Single pass over the input: EAFP on the cache, collecting
    # unknown housenumbers for one batched database round-trip.
    for term in terms:
        try:
            tokens.append(self.housenumbers[term])
        except KeyError:
            pending.append(term)

    if pending:
        with conn.cursor() as cur:
            cur.execute("SELECT nr, getorcreate_hnr_id(nr) FROM unnest(%s) as nr",
                        (pending, ))
            for term, tid in cur:
                self.housenumbers[term] = tid
                tokens.append(tid)

    return tokens