move houseunumber handling to tokenizer

Normalization and token computation are now done in the tokenizer. The tokenizer keeps a cache to the hundred most used house numbers to keep the numbers of calls to the database low.
2026-02-26 11:08:13 +00:00 · 2021-04-25 11:47:29 +02:00
parent d711f5a81e
commit d8ed1bfc60
5 changed files with 101 additions and 34 deletions
--- a/lib-sql/functions/placex_triggers.sql
+++ b/lib-sql/functions/placex_triggers.sql
@@ -292,7 +292,6 @@ CREATE OR REPLACE FUNCTION create_poi_search_terms(obj_place_id BIGINT,
                                                   parent_place_id BIGINT,
                                                   address HSTORE,
                                                   country TEXT,
                                                   housenumber TEXT,
                                                   token_info JSONB,
                                                   geometry GEOMETRY,
                                                   OUT name_vector INTEGER[],
@@ -302,6 +301,7 @@ DECLARE
  parent_name_vector INTEGER[];
  parent_address_vector INTEGER[];
  addr_place_ids INTEGER[];
  hnr_vector INTEGER[];
  addr_item RECORD;
  parent_address_place_ids BIGINT[];
@@ -358,9 +358,10 @@ BEGIN
  -- This is unusual for the search_name table but prevents that the place
  -- is returned when we only search for the street/place.
-  IF housenumber is not null and not nameaddress_vector <@ parent_address_vector THEN
+  hnr_vector := token_get_housenumber_search_tokens(token_info);
-    name_vector := array_merge(name_vector,
+
-                               ARRAY[getorcreate_housenumber_id(make_standard_name(housenumber))]);
+  IF hnr_vector is not null and not nameaddress_vector <@ parent_address_vector THEN
    name_vector := array_merge(name_vector, hnr_vector);
  END IF;
  IF not address ? 'street' and address ? 'place' THEN
@@ -370,7 +371,7 @@ BEGIN
      nameaddress_vector := array_merge(nameaddress_vector, addr_place_ids);
      -- If there is a housenumber, also add the place name as a name,
      -- so we can search it by the usual housenumber+place algorithms.
-      IF housenumber is not null THEN
+      IF hnr_vector is not null THEN
        name_vector := array_merge(name_vector,
                                   ARRAY[getorcreate_name_id(make_standard_name(address->'place'))]);
      END IF;
@@ -812,21 +813,8 @@ BEGIN
  {% if debug %}RAISE WARNING 'Copy over address tags';{% endif %}
  -- housenumber is a computed field, so start with an empty value
-  NEW.housenumber := NULL;
+  NEW.housenumber := token_normalized_housenumber(NEW.token_info);
  IF NEW.address is not NULL THEN
      IF NEW.address ? 'conscriptionnumber' THEN
        IF NEW.address ? 'streetnumber' THEN
            NEW.housenumber := (NEW.address->'conscriptionnumber') || '/' || (NEW.address->'streetnumber');
        ELSE
            NEW.housenumber := NEW.address->'conscriptionnumber';
        END IF;
      ELSEIF NEW.address ? 'streetnumber' THEN
        NEW.housenumber := NEW.address->'streetnumber';
      ELSEIF NEW.address ? 'housenumber' THEN
        NEW.housenumber := NEW.address->'housenumber';
      END IF;
      NEW.housenumber := create_housenumber_id(NEW.housenumber);
      addr_street := NEW.address->'street';
      addr_place := NEW.address->'place';
@@ -940,8 +928,7 @@ BEGIN
        SELECT * INTO name_vector, nameaddress_vector
          FROM create_poi_search_terms(NEW.place_id,
                                       NEW.partition, NEW.parent_place_id,
-                                       NEW.address,
+                                       NEW.address, NEW.country_code,
                                       NEW.country_code, NEW.housenumber,
                                       NEW.token_info, NEW.centroid);
        IF array_length(name_vector, 1) is not NULL THEN
--- a/lib-sql/tokenizer/legacy_tokenizer.sql
+++ b/lib-sql/tokenizer/legacy_tokenizer.sql
@@ -7,6 +7,7 @@ AS $$
  SELECT (info->>'names')::INTEGER[]
 $$ LANGUAGE SQL IMMUTABLE STRICT;
 -- Get tokens for matching the place name against others.
 --
 -- This should usually be restricted to full name tokens.
@@ -17,6 +18,22 @@ AS $$
 $$ LANGUAGE SQL IMMUTABLE STRICT;
 -- Return the housenumber tokens applicable for the place.
 CREATE OR REPLACE FUNCTION token_get_housenumber_search_tokens(info JSONB)
  RETURNS INTEGER[]
 AS $$
  SELECT (info->>'hnr_tokens')::INTEGER[]
 $$ LANGUAGE SQL IMMUTABLE STRICT;
 -- Return the housenumber in the form that it can be matched during search.
 CREATE OR REPLACE FUNCTION token_normalized_housenumber(info JSONB)
  RETURNS TEXT
 AS $$
  SELECT info->>'hnr';
 $$ LANGUAGE SQL IMMUTABLE STRICT;
 -- Return token info that should be saved permanently in the database.
 CREATE OR REPLACE FUNCTION token_strip_info(info JSONB)
  RETURNS JSONB
@@ -75,26 +92,25 @@ END;
 $$
 LANGUAGE plpgsql;
 -- Create housenumber tokens from an OSM addr:housenumber.
 -- The housnumber is split at comma and semicolon as necessary.
 -- The function returns the normalized form of the housenumber suitable
 -- for comparison.
-CREATE OR REPLACE FUNCTION create_housenumber_id(housenumber TEXT)
+CREATE OR REPLACE FUNCTION create_housenumbers(housenumbers TEXT[],
-  RETURNS TEXT
+                                               OUT tokens TEXT,
                                               OUT normtext TEXT)
  AS $$
 DECLARE
  normtext TEXT;
 BEGIN
-  SELECT array_to_string(array_agg(trans), ';')
+  SELECT array_to_string(array_agg(trans), ';'), array_agg(tid)::TEXT
-    INTO normtext
+    INTO normtext, tokens
-    FROM (SELECT lookup_word as trans, getorcreate_housenumber_id(lookup_word)
+    FROM (SELECT lookup_word as trans, getorcreate_housenumber_id(lookup_word) as tid
          FROM (SELECT make_standard_name(h) as lookup_word
-                FROM regexp_split_to_table(housenumber, '[,;]') h) x) y;
+                FROM unnest(housenumbers) h) x) y;
  return normtext;
 END;
 $$ LANGUAGE plpgsql STABLE STRICT;
 CREATE OR REPLACE FUNCTION getorcreate_housenumber_id(lookup_word TEXT)
  RETURNS INTEGER
  AS $$
--- a/nominatim/indexer/indexer.py
+++ b/nominatim/indexer/indexer.py
@@ -4,6 +4,8 @@ Main work horse for indexing (computing addresses) the database.
 import logging
 import select
 import psycopg2.extras
 from nominatim.indexer.progress import ProgressLogger
 from nominatim.indexer import runners
 from nominatim.db.async_connection import DBConnection
@@ -176,6 +178,7 @@ class Indexer:
        LOG.warning("Starting %s (using batch size %s)", runner.name(), batch)
        with connect(self.dsn) as conn:
            psycopg2.extras.register_hstore(conn)
            with conn.cursor() as cur:
                total_tuples = cur.scalar(runner.sql_count_objects())
                LOG.debug("Total number of rows: %i", total_tuples)
--- a/nominatim/tokenizer/legacy_tokenizer.py
+++ b/nominatim/tokenizer/legacy_tokenizer.py
@@ -195,6 +195,8 @@ class LegacyNameAnalyzer:
        self.conn.autocommit = True
        psycopg2.extras.register_hstore(self.conn)
        self._cache = _TokenCache(self.conn)
    def __enter__(self):
        return self
@@ -217,16 +219,23 @@ class LegacyNameAnalyzer:
            Returns a JSON-serialisable structure that will be handed into
            the database via the token_info field.
        """
-        token_info = _TokenInfo()
+        token_info = _TokenInfo(self._cache)
        token_info.add_names(self.conn, place.get('name'), place.get('country_feature'))
        address = place.get('address')
        if address:
            token_info.add_housenumbers(self.conn, address)
        return token_info.data
 class _TokenInfo:
-
+    """ Collect token information to be sent back to the database.
-    def __init__(self):
+    """
    def __init__(self, cache):
        self.cache = cache
        self.data = {}
@@ -245,3 +254,52 @@ class _TokenInfo:
            if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
                cur.execute("SELECT create_country(%s, %s)",
                            (names, country_feature.lower()))
    def add_housenumbers(self, conn, address):
        """ Extract housenumber information from the address.
        """
        hnrs = [v for k, v in address.items()
                if k in ('housenumber', 'streetnumber', 'conscriptionnumber')]
        if not hnrs:
            return
        if len(hnrs) == 1:
            token = self.cache.get_housenumber(hnrs[0])
            if token is not None:
                self.data['hnr_tokens'] = token
                self.data['hnr'] = hnrs[0]
                return
        # split numbers if necessary
        simple_list = []
        for hnr in hnrs:
            simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))
        if len(simple_list) > 1:
            simple_list = list(set(simple_list))
        with conn.cursor() as cur:
            cur.execute("SELECT (create_housenumbers(%s)).* ", (simple_list, ))
            self.data['hnr_tokens'], self.data['hnr'] = cur.fetchone()
 class _TokenCache:
    """ Cache for token information to avoid repeated database queries.
        This cache is not thread-safe and needs to be instantiated per
        analyzer.
    """
    def __init__(self, conn):
        # Lookup houseunumbers up to 100 and cache them
        with conn.cursor() as cur:
            cur.execute("""SELECT i, ARRAY[getorcreate_housenumber_id(i::text)]::text
                           FROM generate_series(1, 100) as i""")
            self._cached_housenumbers = {str(r[0]) : r[1] for r in cur}
    def get_housenumber(self, number):
        """ Get a housenumber token from the cache.
        """
        return self._cached_housenumbers.get(number)
--- a/nominatim/tools/migration.py
+++ b/nominatim/tools/migration.py
@@ -129,6 +129,9 @@ def change_housenumber_transliteration(conn, **_):
        The database schema switched from saving raw housenumbers in
        placex.housenumber to saving transliterated ones.
        Note: the function create_housenumber_id() has been dropped in later
              versions.
    """
    with conn.cursor() as cur:
        cur.execute("""CREATE OR REPLACE FUNCTION create_housenumber_id(housenumber TEXT)