move housenumber handling to tokenizer

Normalization and token computation are now done in the tokenizer.
The tokenizer keeps a cache of the hundred most used house numbers
to keep the number of calls to the database low.
This commit is contained in:
Sarah Hoffmann
2021-04-25 11:47:29 +02:00
parent d711f5a81e
commit d8ed1bfc60
5 changed files with 101 additions and 34 deletions

View File

@@ -292,7 +292,6 @@ CREATE OR REPLACE FUNCTION create_poi_search_terms(obj_place_id BIGINT,
parent_place_id BIGINT, parent_place_id BIGINT,
address HSTORE, address HSTORE,
country TEXT, country TEXT,
housenumber TEXT,
token_info JSONB, token_info JSONB,
geometry GEOMETRY, geometry GEOMETRY,
OUT name_vector INTEGER[], OUT name_vector INTEGER[],
@@ -302,6 +301,7 @@ DECLARE
parent_name_vector INTEGER[]; parent_name_vector INTEGER[];
parent_address_vector INTEGER[]; parent_address_vector INTEGER[];
addr_place_ids INTEGER[]; addr_place_ids INTEGER[];
hnr_vector INTEGER[];
addr_item RECORD; addr_item RECORD;
parent_address_place_ids BIGINT[]; parent_address_place_ids BIGINT[];
@@ -358,9 +358,10 @@ BEGIN
-- This is unusual for the search_name table but prevents that the place -- This is unusual for the search_name table but prevents that the place
-- is returned when we only search for the street/place. -- is returned when we only search for the street/place.
IF housenumber is not null and not nameaddress_vector <@ parent_address_vector THEN hnr_vector := token_get_housenumber_search_tokens(token_info);
name_vector := array_merge(name_vector,
ARRAY[getorcreate_housenumber_id(make_standard_name(housenumber))]); IF hnr_vector is not null and not nameaddress_vector <@ parent_address_vector THEN
name_vector := array_merge(name_vector, hnr_vector);
END IF; END IF;
IF not address ? 'street' and address ? 'place' THEN IF not address ? 'street' and address ? 'place' THEN
@@ -370,7 +371,7 @@ BEGIN
nameaddress_vector := array_merge(nameaddress_vector, addr_place_ids); nameaddress_vector := array_merge(nameaddress_vector, addr_place_ids);
-- If there is a housenumber, also add the place name as a name, -- If there is a housenumber, also add the place name as a name,
-- so we can search it by the usual housenumber+place algorithms. -- so we can search it by the usual housenumber+place algorithms.
IF housenumber is not null THEN IF hnr_vector is not null THEN
name_vector := array_merge(name_vector, name_vector := array_merge(name_vector,
ARRAY[getorcreate_name_id(make_standard_name(address->'place'))]); ARRAY[getorcreate_name_id(make_standard_name(address->'place'))]);
END IF; END IF;
@@ -812,21 +813,8 @@ BEGIN
{% if debug %}RAISE WARNING 'Copy over address tags';{% endif %} {% if debug %}RAISE WARNING 'Copy over address tags';{% endif %}
-- housenumber is a computed field, so start with an empty value -- housenumber is a computed field, so start with an empty value
NEW.housenumber := NULL; NEW.housenumber := token_normalized_housenumber(NEW.token_info);
IF NEW.address is not NULL THEN IF NEW.address is not NULL THEN
IF NEW.address ? 'conscriptionnumber' THEN
IF NEW.address ? 'streetnumber' THEN
NEW.housenumber := (NEW.address->'conscriptionnumber') || '/' || (NEW.address->'streetnumber');
ELSE
NEW.housenumber := NEW.address->'conscriptionnumber';
END IF;
ELSEIF NEW.address ? 'streetnumber' THEN
NEW.housenumber := NEW.address->'streetnumber';
ELSEIF NEW.address ? 'housenumber' THEN
NEW.housenumber := NEW.address->'housenumber';
END IF;
NEW.housenumber := create_housenumber_id(NEW.housenumber);
addr_street := NEW.address->'street'; addr_street := NEW.address->'street';
addr_place := NEW.address->'place'; addr_place := NEW.address->'place';
@@ -940,8 +928,7 @@ BEGIN
SELECT * INTO name_vector, nameaddress_vector SELECT * INTO name_vector, nameaddress_vector
FROM create_poi_search_terms(NEW.place_id, FROM create_poi_search_terms(NEW.place_id,
NEW.partition, NEW.parent_place_id, NEW.partition, NEW.parent_place_id,
NEW.address, NEW.address, NEW.country_code,
NEW.country_code, NEW.housenumber,
NEW.token_info, NEW.centroid); NEW.token_info, NEW.centroid);
IF array_length(name_vector, 1) is not NULL THEN IF array_length(name_vector, 1) is not NULL THEN

View File

@@ -7,6 +7,7 @@ AS $$
SELECT (info->>'names')::INTEGER[] SELECT (info->>'names')::INTEGER[]
$$ LANGUAGE SQL IMMUTABLE STRICT; $$ LANGUAGE SQL IMMUTABLE STRICT;
-- Get tokens for matching the place name against others. -- Get tokens for matching the place name against others.
-- --
-- This should usually be restricted to full name tokens. -- This should usually be restricted to full name tokens.
@@ -17,6 +18,22 @@ AS $$
$$ LANGUAGE SQL IMMUTABLE STRICT; $$ LANGUAGE SQL IMMUTABLE STRICT;
-- Extract the housenumber search tokens of a place from its token info.
--
-- Reads the 'hnr_tokens' entry of the JSONB structure and converts its
-- text value into an integer array. The result is NULL when the entry
-- is absent; the function is STRICT, so a NULL `info` yields NULL too.
CREATE OR REPLACE FUNCTION token_get_housenumber_search_tokens(info JSONB)
  RETURNS INTEGER[]
AS $$
  SELECT CAST(info ->> 'hnr_tokens' AS INTEGER[])
$$ LANGUAGE SQL IMMUTABLE STRICT;
-- Extract the normalized housenumber of a place from its token info.
--
-- Returns the text stored under the 'hnr' key of the JSONB structure,
-- i.e. the form of the housenumber that can be matched during search.
-- The result is NULL when the key is absent; the function is STRICT,
-- so a NULL `info` yields NULL as well.
CREATE OR REPLACE FUNCTION token_normalized_housenumber(info JSONB)
  RETURNS TEXT
AS $$
  SELECT info ->> 'hnr';
$$ LANGUAGE SQL IMMUTABLE STRICT;
-- Return token info that should be saved permanently in the database. -- Return token info that should be saved permanently in the database.
CREATE OR REPLACE FUNCTION token_strip_info(info JSONB) CREATE OR REPLACE FUNCTION token_strip_info(info JSONB)
RETURNS JSONB RETURNS JSONB
@@ -75,26 +92,25 @@ END;
$$ $$
LANGUAGE plpgsql; LANGUAGE plpgsql;
-- Create housenumber tokens from an OSM addr:housenumber. -- Create housenumber tokens from an OSM addr:housenumber.
-- The housnumber is split at comma and semicolon as necessary. -- The housnumber is split at comma and semicolon as necessary.
-- The function returns the normalized form of the housenumber suitable -- The function returns the normalized form of the housenumber suitable
-- for comparison. -- for comparison.
CREATE OR REPLACE FUNCTION create_housenumber_id(housenumber TEXT) CREATE OR REPLACE FUNCTION create_housenumbers(housenumbers TEXT[],
RETURNS TEXT OUT tokens TEXT,
OUT normtext TEXT)
AS $$ AS $$
DECLARE
normtext TEXT;
BEGIN BEGIN
SELECT array_to_string(array_agg(trans), ';') SELECT array_to_string(array_agg(trans), ';'), array_agg(tid)::TEXT
INTO normtext INTO normtext, tokens
FROM (SELECT lookup_word as trans, getorcreate_housenumber_id(lookup_word) FROM (SELECT lookup_word as trans, getorcreate_housenumber_id(lookup_word) as tid
FROM (SELECT make_standard_name(h) as lookup_word FROM (SELECT make_standard_name(h) as lookup_word
FROM regexp_split_to_table(housenumber, '[,;]') h) x) y; FROM unnest(housenumbers) h) x) y;
return normtext;
END; END;
$$ LANGUAGE plpgsql STABLE STRICT; $$ LANGUAGE plpgsql STABLE STRICT;
CREATE OR REPLACE FUNCTION getorcreate_housenumber_id(lookup_word TEXT) CREATE OR REPLACE FUNCTION getorcreate_housenumber_id(lookup_word TEXT)
RETURNS INTEGER RETURNS INTEGER
AS $$ AS $$

View File

@@ -4,6 +4,8 @@ Main work horse for indexing (computing addresses) the database.
import logging import logging
import select import select
import psycopg2.extras
from nominatim.indexer.progress import ProgressLogger from nominatim.indexer.progress import ProgressLogger
from nominatim.indexer import runners from nominatim.indexer import runners
from nominatim.db.async_connection import DBConnection from nominatim.db.async_connection import DBConnection
@@ -176,6 +178,7 @@ class Indexer:
LOG.warning("Starting %s (using batch size %s)", runner.name(), batch) LOG.warning("Starting %s (using batch size %s)", runner.name(), batch)
with connect(self.dsn) as conn: with connect(self.dsn) as conn:
psycopg2.extras.register_hstore(conn)
with conn.cursor() as cur: with conn.cursor() as cur:
total_tuples = cur.scalar(runner.sql_count_objects()) total_tuples = cur.scalar(runner.sql_count_objects())
LOG.debug("Total number of rows: %i", total_tuples) LOG.debug("Total number of rows: %i", total_tuples)

View File

@@ -195,6 +195,8 @@ class LegacyNameAnalyzer:
self.conn.autocommit = True self.conn.autocommit = True
psycopg2.extras.register_hstore(self.conn) psycopg2.extras.register_hstore(self.conn)
self._cache = _TokenCache(self.conn)
def __enter__(self): def __enter__(self):
return self return self
@@ -217,16 +219,23 @@ class LegacyNameAnalyzer:
Returns a JSON-serialisable structure that will be handed into Returns a JSON-serialisable structure that will be handed into
the database via the token_info field. the database via the token_info field.
""" """
token_info = _TokenInfo() token_info = _TokenInfo(self._cache)
token_info.add_names(self.conn, place.get('name'), place.get('country_feature')) token_info.add_names(self.conn, place.get('name'), place.get('country_feature'))
address = place.get('address')
if address:
token_info.add_housenumbers(self.conn, address)
return token_info.data return token_info.data
class _TokenInfo: class _TokenInfo:
""" Collect token information to be sent back to the database.
def __init__(self): """
def __init__(self, cache):
self.cache = cache
self.data = {} self.data = {}
@@ -245,3 +254,52 @@ class _TokenInfo:
if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature): if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
cur.execute("SELECT create_country(%s, %s)", cur.execute("SELECT create_country(%s, %s)",
(names, country_feature.lower())) (names, country_feature.lower()))
def add_housenumbers(self, conn, address):
    """ Extract housenumber information from the address.

        Collects the values of the address tags `housenumber`,
        `streetnumber` and `conscriptionnumber`. When any is present,
        `self.data['hnr_tokens']` and `self.data['hnr']` are filled,
        either from the token cache or from the result of the database
        function `create_housenumbers()`. Nothing is changed when the
        address carries none of these tags.

        `conn` is only used when the cache cannot answer the request.
    """
    hnrs = [v for k, v in address.items()
            if k in ('housenumber', 'streetnumber', 'conscriptionnumber')]

    if not hnrs:
        return

    if len(hnrs) == 1:
        # Fast path: a single value known to the cache needs no
        # database round trip.
        token = self.cache.get_housenumber(hnrs[0])
        if token is not None:
            self.data['hnr_tokens'] = token
            self.data['hnr'] = hnrs[0]
            return

    # A tag value may hold several numbers separated by ';' or ','.
    # Duplicates are dropped while keeping the first-seen order, so that
    # the list sent to the database is the same on every run (a plain
    # set() would make the order depend on string hashing).
    simple_list = []
    seen = set()
    for hnr in hnrs:
        for part in re.split(r'[;,]', hnr):
            part = part.strip()
            if part not in seen:
                seen.add(part)
                simple_list.append(part)

    with conn.cursor() as cur:
        # Call the function in the FROM clause: '(func(..)).*' is expanded
        # by PostgreSQL into one function call per output column, which
        # would run the token creation twice.
        cur.execute("SELECT * FROM create_housenumbers(%s)", (simple_list, ))
        self.data['hnr_tokens'], self.data['hnr'] = cur.fetchone()
class _TokenCache:
""" Cache for token information to avoid repeated database queries.
This cache is not thread-safe and needs to be instantiated per
analyzer.
"""
def __init__(self, conn):
# Lookup houseunumbers up to 100 and cache them
with conn.cursor() as cur:
cur.execute("""SELECT i, ARRAY[getorcreate_housenumber_id(i::text)]::text
FROM generate_series(1, 100) as i""")
self._cached_housenumbers = {str(r[0]) : r[1] for r in cur}
def get_housenumber(self, number):
""" Get a housenumber token from the cache.
"""
return self._cached_housenumbers.get(number)

View File

@@ -129,6 +129,9 @@ def change_housenumber_transliteration(conn, **_):
The database schema switched from saving raw housenumbers in The database schema switched from saving raw housenumbers in
placex.housenumber to saving transliterated ones. placex.housenumber to saving transliterated ones.
Note: the function create_housenumber_id() has been dropped in later
versions.
""" """
with conn.cursor() as cur: with conn.cursor() as cur:
cur.execute("""CREATE OR REPLACE FUNCTION create_housenumber_id(housenumber TEXT) cur.execute("""CREATE OR REPLACE FUNCTION create_housenumber_id(housenumber TEXT)