revert to using full names for street name matching

Using partial names turned out to not work well because there are
often similarly named streets next to each other. It also
prevents us from being able to take into account all addr:street:*
tags.

This change gets all the full term tokens for the addr:street tags
from the DB. As they are used for matching only, we can assume that
the term must already be there or there will be no match. This
avoids creating unused full name tags.
This commit is contained in:
Sarah Hoffmann
2021-12-06 11:38:38 +01:00
parent bb175cc958
commit 44cfce1ca4
2 changed files with 61 additions and 4 deletions

View File

@@ -409,13 +409,16 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
def _process_place_address(self, token_info, address):
hnrs = []
addr_terms = []
streets = []
for item in address:
if item.kind == 'postcode':
self._add_postcode(item.name)
elif item.kind in ('housenumber', 'streetnumber', 'conscriptionnumber'):
hnrs.append(item.name)
elif item.kind == 'street':
token_info.add_street(self._compute_partial_tokens(item.name))
token = self._retrieve_full_token(item.name)
if token:
streets.append(token)
elif item.kind == 'place':
token_info.add_place(self._compute_partial_tokens(item.name))
elif not item.kind.startswith('_') and \
@@ -429,6 +432,9 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
if addr_terms:
token_info.add_address_terms(addr_terms)
if streets:
token_info.add_street(streets)
def _compute_partial_tokens(self, name):
""" Normalize the given term, split it into partial words and return
@@ -458,6 +464,31 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
return tokens
def _retrieve_full_token(self, name):
    """ Get the full name token for the given name, if it exists.

        The name is only retrieved for the standard analyser.
        Returns the id of the full-word ('W') entry for the
        normalized name, or None when no such entry exists.
    """
    norm_name = self._normalized(name)

    # Return cached if possible.
    if norm_name in self._cache.fulls:
        return self._cache.fulls[norm_name]

    # Otherwise compute: the name cache may already hold the full
    # token from an earlier name computation.
    full, _ = self._cache.names.get(norm_name, (None, None))

    if full is None:
        # Fall back to looking the term up in the database. This is
        # used for matching only, so the term must already be there
        # or there will be no match; no new entry is created here.
        with self.conn.cursor() as cur:
            cur.execute("SELECT word_id FROM word WHERE word = %s and type = 'W' LIMIT 1",
                        (norm_name, ))
            if cur.rowcount > 0:
                full = cur.fetchone()[0]

    # Cache the result, including negative (None) lookups.
    self._cache.fulls[norm_name] = full

    return full
def _compute_name_tokens(self, names):
""" Computes the full name and partial name tokens for the given
dictionary of names.
@@ -561,8 +592,7 @@ class _TokenInfo:
def add_street(self, tokens):
    """ Add addr:street match terms.

        The token list is stored unconditionally; the caller is
        expected to invoke this only when tokens were actually found.
    """
    # Resolves the leftover diff residue: the old in-method
    # `if tokens:` guard was removed in favour of guarding at the
    # call site, leaving a single unconditional assignment.
    self.data['street'] = self._mk_array(tokens)
def add_place(self, tokens):
@@ -591,6 +621,7 @@ class _TokenCache:
def __init__(self):
self.names = {}
self.partials = {}
self.fulls = {}
self.postcodes = set()
self.housenumbers = {}