mirror of
https://github.com/osm-search/Nominatim.git
synced 2026-02-26 11:08:13 +00:00
move houseunumber handling to tokenizer
Normalization and token computation are now done in the tokenizer. The tokenizer keeps a cache to the hundred most used house numbers to keep the numbers of calls to the database low.
This commit is contained in:
@@ -292,7 +292,6 @@ CREATE OR REPLACE FUNCTION create_poi_search_terms(obj_place_id BIGINT,
|
|||||||
parent_place_id BIGINT,
|
parent_place_id BIGINT,
|
||||||
address HSTORE,
|
address HSTORE,
|
||||||
country TEXT,
|
country TEXT,
|
||||||
housenumber TEXT,
|
|
||||||
token_info JSONB,
|
token_info JSONB,
|
||||||
geometry GEOMETRY,
|
geometry GEOMETRY,
|
||||||
OUT name_vector INTEGER[],
|
OUT name_vector INTEGER[],
|
||||||
@@ -302,6 +301,7 @@ DECLARE
|
|||||||
parent_name_vector INTEGER[];
|
parent_name_vector INTEGER[];
|
||||||
parent_address_vector INTEGER[];
|
parent_address_vector INTEGER[];
|
||||||
addr_place_ids INTEGER[];
|
addr_place_ids INTEGER[];
|
||||||
|
hnr_vector INTEGER[];
|
||||||
|
|
||||||
addr_item RECORD;
|
addr_item RECORD;
|
||||||
parent_address_place_ids BIGINT[];
|
parent_address_place_ids BIGINT[];
|
||||||
@@ -358,9 +358,10 @@ BEGIN
|
|||||||
-- This is unusual for the search_name table but prevents that the place
|
-- This is unusual for the search_name table but prevents that the place
|
||||||
-- is returned when we only search for the street/place.
|
-- is returned when we only search for the street/place.
|
||||||
|
|
||||||
IF housenumber is not null and not nameaddress_vector <@ parent_address_vector THEN
|
hnr_vector := token_get_housenumber_search_tokens(token_info);
|
||||||
name_vector := array_merge(name_vector,
|
|
||||||
ARRAY[getorcreate_housenumber_id(make_standard_name(housenumber))]);
|
IF hnr_vector is not null and not nameaddress_vector <@ parent_address_vector THEN
|
||||||
|
name_vector := array_merge(name_vector, hnr_vector);
|
||||||
END IF;
|
END IF;
|
||||||
|
|
||||||
IF not address ? 'street' and address ? 'place' THEN
|
IF not address ? 'street' and address ? 'place' THEN
|
||||||
@@ -370,7 +371,7 @@ BEGIN
|
|||||||
nameaddress_vector := array_merge(nameaddress_vector, addr_place_ids);
|
nameaddress_vector := array_merge(nameaddress_vector, addr_place_ids);
|
||||||
-- If there is a housenumber, also add the place name as a name,
|
-- If there is a housenumber, also add the place name as a name,
|
||||||
-- so we can search it by the usual housenumber+place algorithms.
|
-- so we can search it by the usual housenumber+place algorithms.
|
||||||
IF housenumber is not null THEN
|
IF hnr_vector is not null THEN
|
||||||
name_vector := array_merge(name_vector,
|
name_vector := array_merge(name_vector,
|
||||||
ARRAY[getorcreate_name_id(make_standard_name(address->'place'))]);
|
ARRAY[getorcreate_name_id(make_standard_name(address->'place'))]);
|
||||||
END IF;
|
END IF;
|
||||||
@@ -812,21 +813,8 @@ BEGIN
|
|||||||
|
|
||||||
{% if debug %}RAISE WARNING 'Copy over address tags';{% endif %}
|
{% if debug %}RAISE WARNING 'Copy over address tags';{% endif %}
|
||||||
-- housenumber is a computed field, so start with an empty value
|
-- housenumber is a computed field, so start with an empty value
|
||||||
NEW.housenumber := NULL;
|
NEW.housenumber := token_normalized_housenumber(NEW.token_info);
|
||||||
IF NEW.address is not NULL THEN
|
IF NEW.address is not NULL THEN
|
||||||
IF NEW.address ? 'conscriptionnumber' THEN
|
|
||||||
IF NEW.address ? 'streetnumber' THEN
|
|
||||||
NEW.housenumber := (NEW.address->'conscriptionnumber') || '/' || (NEW.address->'streetnumber');
|
|
||||||
ELSE
|
|
||||||
NEW.housenumber := NEW.address->'conscriptionnumber';
|
|
||||||
END IF;
|
|
||||||
ELSEIF NEW.address ? 'streetnumber' THEN
|
|
||||||
NEW.housenumber := NEW.address->'streetnumber';
|
|
||||||
ELSEIF NEW.address ? 'housenumber' THEN
|
|
||||||
NEW.housenumber := NEW.address->'housenumber';
|
|
||||||
END IF;
|
|
||||||
NEW.housenumber := create_housenumber_id(NEW.housenumber);
|
|
||||||
|
|
||||||
addr_street := NEW.address->'street';
|
addr_street := NEW.address->'street';
|
||||||
addr_place := NEW.address->'place';
|
addr_place := NEW.address->'place';
|
||||||
|
|
||||||
@@ -940,8 +928,7 @@ BEGIN
|
|||||||
SELECT * INTO name_vector, nameaddress_vector
|
SELECT * INTO name_vector, nameaddress_vector
|
||||||
FROM create_poi_search_terms(NEW.place_id,
|
FROM create_poi_search_terms(NEW.place_id,
|
||||||
NEW.partition, NEW.parent_place_id,
|
NEW.partition, NEW.parent_place_id,
|
||||||
NEW.address,
|
NEW.address, NEW.country_code,
|
||||||
NEW.country_code, NEW.housenumber,
|
|
||||||
NEW.token_info, NEW.centroid);
|
NEW.token_info, NEW.centroid);
|
||||||
|
|
||||||
IF array_length(name_vector, 1) is not NULL THEN
|
IF array_length(name_vector, 1) is not NULL THEN
|
||||||
|
|||||||
@@ -7,6 +7,7 @@ AS $$
|
|||||||
SELECT (info->>'names')::INTEGER[]
|
SELECT (info->>'names')::INTEGER[]
|
||||||
$$ LANGUAGE SQL IMMUTABLE STRICT;
|
$$ LANGUAGE SQL IMMUTABLE STRICT;
|
||||||
|
|
||||||
|
|
||||||
-- Get tokens for matching the place name against others.
|
-- Get tokens for matching the place name against others.
|
||||||
--
|
--
|
||||||
-- This should usually be restricted to full name tokens.
|
-- This should usually be restricted to full name tokens.
|
||||||
@@ -17,6 +18,22 @@ AS $$
|
|||||||
$$ LANGUAGE SQL IMMUTABLE STRICT;
|
$$ LANGUAGE SQL IMMUTABLE STRICT;
|
||||||
|
|
||||||
|
|
||||||
|
-- Return the housenumber tokens applicable for the place.
|
||||||
|
CREATE OR REPLACE FUNCTION token_get_housenumber_search_tokens(info JSONB)
|
||||||
|
RETURNS INTEGER[]
|
||||||
|
AS $$
|
||||||
|
SELECT (info->>'hnr_tokens')::INTEGER[]
|
||||||
|
$$ LANGUAGE SQL IMMUTABLE STRICT;
|
||||||
|
|
||||||
|
|
||||||
|
-- Return the housenumber in the form that it can be matched during search.
|
||||||
|
CREATE OR REPLACE FUNCTION token_normalized_housenumber(info JSONB)
|
||||||
|
RETURNS TEXT
|
||||||
|
AS $$
|
||||||
|
SELECT info->>'hnr';
|
||||||
|
$$ LANGUAGE SQL IMMUTABLE STRICT;
|
||||||
|
|
||||||
|
|
||||||
-- Return token info that should be saved permanently in the database.
|
-- Return token info that should be saved permanently in the database.
|
||||||
CREATE OR REPLACE FUNCTION token_strip_info(info JSONB)
|
CREATE OR REPLACE FUNCTION token_strip_info(info JSONB)
|
||||||
RETURNS JSONB
|
RETURNS JSONB
|
||||||
@@ -75,26 +92,25 @@ END;
|
|||||||
$$
|
$$
|
||||||
LANGUAGE plpgsql;
|
LANGUAGE plpgsql;
|
||||||
|
|
||||||
|
|
||||||
-- Create housenumber tokens from an OSM addr:housenumber.
|
-- Create housenumber tokens from an OSM addr:housenumber.
|
||||||
-- The housnumber is split at comma and semicolon as necessary.
|
-- The housnumber is split at comma and semicolon as necessary.
|
||||||
-- The function returns the normalized form of the housenumber suitable
|
-- The function returns the normalized form of the housenumber suitable
|
||||||
-- for comparison.
|
-- for comparison.
|
||||||
CREATE OR REPLACE FUNCTION create_housenumber_id(housenumber TEXT)
|
CREATE OR REPLACE FUNCTION create_housenumbers(housenumbers TEXT[],
|
||||||
RETURNS TEXT
|
OUT tokens TEXT,
|
||||||
|
OUT normtext TEXT)
|
||||||
AS $$
|
AS $$
|
||||||
DECLARE
|
|
||||||
normtext TEXT;
|
|
||||||
BEGIN
|
BEGIN
|
||||||
SELECT array_to_string(array_agg(trans), ';')
|
SELECT array_to_string(array_agg(trans), ';'), array_agg(tid)::TEXT
|
||||||
INTO normtext
|
INTO normtext, tokens
|
||||||
FROM (SELECT lookup_word as trans, getorcreate_housenumber_id(lookup_word)
|
FROM (SELECT lookup_word as trans, getorcreate_housenumber_id(lookup_word) as tid
|
||||||
FROM (SELECT make_standard_name(h) as lookup_word
|
FROM (SELECT make_standard_name(h) as lookup_word
|
||||||
FROM regexp_split_to_table(housenumber, '[,;]') h) x) y;
|
FROM unnest(housenumbers) h) x) y;
|
||||||
|
|
||||||
return normtext;
|
|
||||||
END;
|
END;
|
||||||
$$ LANGUAGE plpgsql STABLE STRICT;
|
$$ LANGUAGE plpgsql STABLE STRICT;
|
||||||
|
|
||||||
|
|
||||||
CREATE OR REPLACE FUNCTION getorcreate_housenumber_id(lookup_word TEXT)
|
CREATE OR REPLACE FUNCTION getorcreate_housenumber_id(lookup_word TEXT)
|
||||||
RETURNS INTEGER
|
RETURNS INTEGER
|
||||||
AS $$
|
AS $$
|
||||||
|
|||||||
@@ -4,6 +4,8 @@ Main work horse for indexing (computing addresses) the database.
|
|||||||
import logging
|
import logging
|
||||||
import select
|
import select
|
||||||
|
|
||||||
|
import psycopg2.extras
|
||||||
|
|
||||||
from nominatim.indexer.progress import ProgressLogger
|
from nominatim.indexer.progress import ProgressLogger
|
||||||
from nominatim.indexer import runners
|
from nominatim.indexer import runners
|
||||||
from nominatim.db.async_connection import DBConnection
|
from nominatim.db.async_connection import DBConnection
|
||||||
@@ -176,6 +178,7 @@ class Indexer:
|
|||||||
LOG.warning("Starting %s (using batch size %s)", runner.name(), batch)
|
LOG.warning("Starting %s (using batch size %s)", runner.name(), batch)
|
||||||
|
|
||||||
with connect(self.dsn) as conn:
|
with connect(self.dsn) as conn:
|
||||||
|
psycopg2.extras.register_hstore(conn)
|
||||||
with conn.cursor() as cur:
|
with conn.cursor() as cur:
|
||||||
total_tuples = cur.scalar(runner.sql_count_objects())
|
total_tuples = cur.scalar(runner.sql_count_objects())
|
||||||
LOG.debug("Total number of rows: %i", total_tuples)
|
LOG.debug("Total number of rows: %i", total_tuples)
|
||||||
|
|||||||
@@ -195,6 +195,8 @@ class LegacyNameAnalyzer:
|
|||||||
self.conn.autocommit = True
|
self.conn.autocommit = True
|
||||||
psycopg2.extras.register_hstore(self.conn)
|
psycopg2.extras.register_hstore(self.conn)
|
||||||
|
|
||||||
|
self._cache = _TokenCache(self.conn)
|
||||||
|
|
||||||
|
|
||||||
def __enter__(self):
|
def __enter__(self):
|
||||||
return self
|
return self
|
||||||
@@ -217,16 +219,23 @@ class LegacyNameAnalyzer:
|
|||||||
Returns a JSON-serialisable structure that will be handed into
|
Returns a JSON-serialisable structure that will be handed into
|
||||||
the database via the token_info field.
|
the database via the token_info field.
|
||||||
"""
|
"""
|
||||||
token_info = _TokenInfo()
|
token_info = _TokenInfo(self._cache)
|
||||||
|
|
||||||
token_info.add_names(self.conn, place.get('name'), place.get('country_feature'))
|
token_info.add_names(self.conn, place.get('name'), place.get('country_feature'))
|
||||||
|
|
||||||
|
address = place.get('address')
|
||||||
|
|
||||||
|
if address:
|
||||||
|
token_info.add_housenumbers(self.conn, address)
|
||||||
|
|
||||||
return token_info.data
|
return token_info.data
|
||||||
|
|
||||||
|
|
||||||
class _TokenInfo:
|
class _TokenInfo:
|
||||||
|
""" Collect token information to be sent back to the database.
|
||||||
def __init__(self):
|
"""
|
||||||
|
def __init__(self, cache):
|
||||||
|
self.cache = cache
|
||||||
self.data = {}
|
self.data = {}
|
||||||
|
|
||||||
|
|
||||||
@@ -245,3 +254,52 @@ class _TokenInfo:
|
|||||||
if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
|
if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
|
||||||
cur.execute("SELECT create_country(%s, %s)",
|
cur.execute("SELECT create_country(%s, %s)",
|
||||||
(names, country_feature.lower()))
|
(names, country_feature.lower()))
|
||||||
|
|
||||||
|
|
||||||
|
def add_housenumbers(self, conn, address):
|
||||||
|
""" Extract housenumber information from the address.
|
||||||
|
"""
|
||||||
|
hnrs = [v for k, v in address.items()
|
||||||
|
if k in ('housenumber', 'streetnumber', 'conscriptionnumber')]
|
||||||
|
|
||||||
|
if not hnrs:
|
||||||
|
return
|
||||||
|
|
||||||
|
if len(hnrs) == 1:
|
||||||
|
token = self.cache.get_housenumber(hnrs[0])
|
||||||
|
if token is not None:
|
||||||
|
self.data['hnr_tokens'] = token
|
||||||
|
self.data['hnr'] = hnrs[0]
|
||||||
|
return
|
||||||
|
|
||||||
|
# split numbers if necessary
|
||||||
|
simple_list = []
|
||||||
|
for hnr in hnrs:
|
||||||
|
simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))
|
||||||
|
|
||||||
|
if len(simple_list) > 1:
|
||||||
|
simple_list = list(set(simple_list))
|
||||||
|
|
||||||
|
with conn.cursor() as cur:
|
||||||
|
cur.execute("SELECT (create_housenumbers(%s)).* ", (simple_list, ))
|
||||||
|
self.data['hnr_tokens'], self.data['hnr'] = cur.fetchone()
|
||||||
|
|
||||||
|
|
||||||
|
class _TokenCache:
|
||||||
|
""" Cache for token information to avoid repeated database queries.
|
||||||
|
|
||||||
|
This cache is not thread-safe and needs to be instantiated per
|
||||||
|
analyzer.
|
||||||
|
"""
|
||||||
|
def __init__(self, conn):
|
||||||
|
# Lookup houseunumbers up to 100 and cache them
|
||||||
|
with conn.cursor() as cur:
|
||||||
|
cur.execute("""SELECT i, ARRAY[getorcreate_housenumber_id(i::text)]::text
|
||||||
|
FROM generate_series(1, 100) as i""")
|
||||||
|
self._cached_housenumbers = {str(r[0]) : r[1] for r in cur}
|
||||||
|
|
||||||
|
|
||||||
|
def get_housenumber(self, number):
|
||||||
|
""" Get a housenumber token from the cache.
|
||||||
|
"""
|
||||||
|
return self._cached_housenumbers.get(number)
|
||||||
|
|||||||
@@ -129,6 +129,9 @@ def change_housenumber_transliteration(conn, **_):
|
|||||||
|
|
||||||
The database schema switched from saving raw housenumbers in
|
The database schema switched from saving raw housenumbers in
|
||||||
placex.housenumber to saving transliterated ones.
|
placex.housenumber to saving transliterated ones.
|
||||||
|
|
||||||
|
Note: the function create_housenumber_id() has been dropped in later
|
||||||
|
versions.
|
||||||
"""
|
"""
|
||||||
with conn.cursor() as cur:
|
with conn.cursor() as cur:
|
||||||
cur.execute("""CREATE OR REPLACE FUNCTION create_housenumber_id(housenumber TEXT)
|
cur.execute("""CREATE OR REPLACE FUNCTION create_housenumber_id(housenumber TEXT)
|
||||||
|
|||||||
Reference in New Issue
Block a user