move housenumber handling to tokenizer

Normalization and token computation are now done in the tokenizer.
The tokenizer keeps a cache of the hundred most used house numbers
to keep the number of calls to the database low.
This commit is contained in:
Sarah Hoffmann
2021-04-25 11:47:29 +02:00
parent d711f5a81e
commit d8ed1bfc60
5 changed files with 101 additions and 34 deletions

View File

@@ -4,6 +4,8 @@ Main work horse for indexing (computing addresses) the database.
import logging
import select
import psycopg2.extras
from nominatim.indexer.progress import ProgressLogger
from nominatim.indexer import runners
from nominatim.db.async_connection import DBConnection
@@ -176,6 +178,7 @@ class Indexer:
LOG.warning("Starting %s (using batch size %s)", runner.name(), batch)
with connect(self.dsn) as conn:
psycopg2.extras.register_hstore(conn)
with conn.cursor() as cur:
total_tuples = cur.scalar(runner.sql_count_objects())
LOG.debug("Total number of rows: %i", total_tuples)

View File

@@ -195,6 +195,8 @@ class LegacyNameAnalyzer:
self.conn.autocommit = True
psycopg2.extras.register_hstore(self.conn)
self._cache = _TokenCache(self.conn)
def __enter__(self):
    """ Enter the context manager. The analyzer itself is returned as
        the context object.
    """
    return self
@@ -217,16 +219,23 @@ class LegacyNameAnalyzer:
Returns a JSON-serialisable structure that will be handed into
the database via the token_info field.
"""
token_info = _TokenInfo()
token_info = _TokenInfo(self._cache)
token_info.add_names(self.conn, place.get('name'), place.get('country_feature'))
address = place.get('address')
if address:
token_info.add_housenumbers(self.conn, address)
return token_info.data
class _TokenInfo:
def __init__(self):
""" Collect token information to be sent back to the database.
"""
def __init__(self, cache):
self.cache = cache
self.data = {}
@@ -245,3 +254,52 @@ class _TokenInfo:
if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
cur.execute("SELECT create_country(%s, %s)",
(names, country_feature.lower()))
def add_housenumbers(self, conn, address):
    """ Collect the housenumber tokens for the given address and save
        them in the token data.

        The values of the address keys 'housenumber', 'streetnumber' and
        'conscriptionnumber' are taken into account. When none of them is
        present, the token data is left untouched. Otherwise the entries
        'hnr_tokens' and 'hnr' are set.

        `conn` is only used when the tokens cannot be served from the
        cache.
    """
    hnr_keys = ('housenumber', 'streetnumber', 'conscriptionnumber')
    numbers = [value for key, value in address.items() if key in hnr_keys]

    if not numbers:
        return

    # Fast path: a single value that is known to the cache needs no
    # database access at all.
    if len(numbers) == 1:
        cached = self.cache.get_housenumber(numbers[0])
        if cached is not None:
            self.data['hnr_tokens'] = cached
            self.data['hnr'] = numbers[0]
            return

    # Values may contain lists of numbers separated by semicolon or comma.
    parts = [part.strip()
             for number in numbers
             for part in re.split(r'[;,]', number)]

    # Drop duplicates. Note that the order of the numbers is not preserved.
    if len(parts) > 1:
        parts = list(set(parts))

    with conn.cursor() as cur:
        cur.execute("SELECT (create_housenumbers(%s)).* ", (parts, ))
        # The SQL function is expected to yield exactly two columns.
        tokens, normalized = cur.fetchone()
        self.data['hnr_tokens'] = tokens
        self.data['hnr'] = normalized
class _TokenCache:
    """ Keeps token information in memory so that the database does not
        have to be queried for it repeatedly.

        The cache is not thread-safe. Every analyzer needs to create
        an instance of its own.
    """

    def __init__(self, conn):
        """ Fill the cache using the database connection `conn`.
        """
        lookup = {}
        # Look up (or create) the tokens for the housenumbers 1 to 100
        # in one go and remember them.
        with conn.cursor() as cur:
            cur.execute("""SELECT i, ARRAY[getorcreate_housenumber_id(i::text)]::text
FROM generate_series(1, 100) as i""")
            for row in cur:
                # Keys are the string form of the number because lookups
                # are done with the housenumber as it appears in the data.
                lookup[str(row[0])] = row[1]

        self._cached_housenumbers = lookup

    def get_housenumber(self, number):
        """ Return the cached token for the housenumber `number` or
            None when the number is not in the cache.
        """
        return self._cached_housenumbers.get(number)

View File

@@ -129,6 +129,9 @@ def change_housenumber_transliteration(conn, **_):
The database schema switched from saving raw housenumbers in
placex.housenumber to saving transliterated ones.
Note: the function create_housenumber_id() has been dropped in later
versions.
"""
with conn.cursor() as cur:
cur.execute("""CREATE OR REPLACE FUNCTION create_housenumber_id(housenumber TEXT)