move postcode normalization into tokenizer

This commit is contained in:
Sarah Hoffmann
2021-04-25 18:26:36 +02:00
parent d8ed1bfc60
commit ffc2d82b0e
12 changed files with 181 additions and 74 deletions

View File

@@ -116,7 +116,8 @@ class SetupAll:
if args.continue_at is None or args.continue_at == 'load-data':
LOG.warning('Calculate postcodes')
postcodes.import_postcodes(args.config.get_libpq_dsn(), args.project_dir)
postcodes.import_postcodes(args.config.get_libpq_dsn(), args.project_dir,
tokenizer)
if args.continue_at is None or args.continue_at in ('load-data', 'indexing'):
if args.continue_at is not None and args.continue_at != 'load-data':

View File

@@ -147,7 +147,7 @@ class Indexer:
if maxrank == 30:
self._index(runners.RankRunner(0, analyzer))
self._index(runners.InterpolationRunner(), 20)
self._index(runners.InterpolationRunner(analyzer), 20)
self._index(runners.RankRunner(30, analyzer), 20)
else:
self._index(runners.RankRunner(maxrank, analyzer))

View File

@@ -25,7 +25,7 @@ class AbstractPlacexRunner:
SET indexed_status = 0, address = v.addr, token_info = v.ti
FROM (VALUES {}) as v(id, addr, ti)
WHERE place_id = v.id
""".format(','.join(["(%s, %s::hstore, %s::json)"] * num_places))
""".format(','.join(["(%s, %s::hstore, %s::jsonb)"] * num_places))
def index_places(self, worker, places):
@@ -82,6 +82,10 @@ class InterpolationRunner:
location_property_osmline.
"""
def __init__(self, analyzer):
    # Tokenizer name analyzer; used in index_places() to compute the
    # token_info written back for each interpolation line.
    self.analyzer = analyzer
@staticmethod
def name():
    """ Return a descriptive name for this runner (used for logging/progress).
    """
    return "interpolation lines (location_property_osmline)"
@@ -93,15 +97,30 @@ class InterpolationRunner:
@staticmethod
def sql_get_objects():
return """SELECT place_id FROM location_property_osmline
return """SELECT place_id, get_interpolation_address(address, osm_id) as address
FROM location_property_osmline
WHERE indexed_status > 0
ORDER BY geometry_sector"""
@staticmethod
def index_places(worker, ids):
worker.perform(""" UPDATE location_property_osmline
SET indexed_status = 0 WHERE place_id IN ({})
""".format(','.join((str(i[0]) for i in ids))))
@functools.lru_cache(maxsize=1)
def _index_sql(num_places):
return """ UPDATE location_property_osmline
SET indexed_status = 0, address = v.addr, token_info = v.ti
FROM (VALUES {}) as v(id, addr, ti)
WHERE place_id = v.id
""".format(','.join(["(%s, %s::hstore, %s::jsonb)"] * num_places))
def index_places(self, worker, places):
    """ Write back the given batch of interpolation lines, adding the
        token information computed by the name analyzer.
    """
    params = []
    for place in places:
        params.append(place['place_id'])
        params.append(place['address'])
        params.append(psycopg2.extras.Json(self.analyzer.process_place(place)))

    worker.perform(self._index_sql(len(places)), params)
class PostcodeRunner:

View File

@@ -1,6 +1,7 @@
"""
Tokenizer implementing normalisation as used before Nominatim 4.
"""
from collections import OrderedDict
import logging
import re
import shutil
@@ -213,6 +214,15 @@ class LegacyNameAnalyzer:
self.conn.close()
self.conn = None
def add_postcodes_from_db(self):
    """ Add postcodes from the location_postcode table to the word table.
    """
    with self.conn.cursor() as cur:
        # count() merely forces create_postcode_id() to be evaluated for
        # every distinct postcode; the count value itself is discarded.
        cur.execute("""SELECT count(create_postcode_id(pc))
                       FROM (SELECT distinct(postcode) as pc
                             FROM location_postcode) x""")
def process_place(self, place):
""" Determine tokenizer information about the given place.
@@ -226,11 +236,25 @@ class LegacyNameAnalyzer:
address = place.get('address')
if address:
self._add_postcode(address.get('postcode'))
token_info.add_housenumbers(self.conn, address)
return token_info.data
def _add_postcode(self, postcode):
""" Make sure the normalized postcode is present in the word table.
"""
if not postcode or re.search(r'[:,;]', postcode) is not None:
return
def _create_postcode_from_db(pcode):
with self.conn.cursor() as cur:
cur.execute('SELECT create_postcode_id(%s)', (pcode, ))
self._cache.postcodes.get(postcode.strip().upper(), _create_postcode_from_db)
class _TokenInfo:
""" Collect token information to be sent back to the database.
"""
@@ -285,6 +309,32 @@ class _TokenInfo:
self.data['hnr_tokens'], self.data['hnr'] = cur.fetchone()
class _LRU:
""" Least recently used cache that accepts a generator function to
produce the item when there is a cache miss.
"""
def __init__(self, maxsize=128):
self.data = OrderedDict()
self.maxsize = maxsize
def get(self, key, generator):
""" Get the item with the given key from the cache. If nothing
is found in the cache, generate the value through the
generator function and store it in the cache.
"""
value = self.data.get(key)
if value is not None:
self.data.move_to_end(key)
else:
value = generator(key)
if len(self.data) >= self.maxsize:
self.data.popitem(last=False)
self.data[key] = value
return value
class _TokenCache:
""" Cache for token information to avoid repeated database queries.
@@ -292,6 +342,9 @@ class _TokenCache:
analyzer.
"""
def __init__(self, conn):
# various LRU caches
self.postcodes = _LRU(maxsize=32)
# Lookup housenumbers up to 100 and cache them
with conn.cursor() as cur:
cur.execute("""SELECT i, ARRAY[getorcreate_housenumber_id(i::text)]::text

View File

@@ -6,7 +6,7 @@ of artificial postcode centroids.
from nominatim.db.utils import execute_file
from nominatim.db.connection import connect
def import_postcodes(dsn, project_dir):
def import_postcodes(dsn, project_dir, tokenizer):
""" Set up the initial list of postcodes.
"""
@@ -41,10 +41,11 @@ def import_postcodes(dsn, project_dir):
INSERT INTO location_postcode
(place_id, indexed_status, country_code, postcode, geometry)
SELECT nextval('seq_place'), 1, country_code,
upper(trim (both ' ' from address->'postcode')) as pc,
token_normalized_postcode(address->'postcode') as pc,
ST_Centroid(ST_Collect(ST_Centroid(geometry)))
FROM placex
WHERE address ? 'postcode' AND address->'postcode' NOT SIMILAR TO '%(,|;)%'
WHERE address ? 'postcode'
and token_normalized_postcode(address->'postcode') is not null
AND geometry IS NOT null
GROUP BY country_code, pc
""")
@@ -52,9 +53,10 @@ def import_postcodes(dsn, project_dir):
cur.execute("""
INSERT INTO location_postcode
(place_id, indexed_status, country_code, postcode, geometry)
SELECT nextval('seq_place'), 1, 'us', postcode,
SELECT nextval('seq_place'), 1, 'us',
token_normalized_postcode(postcode),
ST_SetSRID(ST_Point(x,y),4326)
FROM us_postcode WHERE postcode NOT IN
FROM us_postcode WHERE token_normalized_postcode(postcode) NOT IN
(SELECT postcode FROM location_postcode
WHERE country_code = 'us')
""")
@@ -62,8 +64,9 @@ def import_postcodes(dsn, project_dir):
cur.execute("""
INSERT INTO location_postcode
(place_id, indexed_status, country_code, postcode, geometry)
SELECT nextval('seq_place'), 1, 'gb', postcode, geometry
FROM gb_postcode WHERE postcode NOT IN
SELECT nextval('seq_place'), 1, 'gb',
token_normalized_postcode(postcode), geometry
FROM gb_postcode WHERE token_normalized_postcode(postcode) NOT IN
(SELECT postcode FROM location_postcode
WHERE country_code = 'gb')
""")
@@ -72,9 +75,7 @@ def import_postcodes(dsn, project_dir):
DELETE FROM word WHERE class='place' and type='postcode'
and word NOT IN (SELECT postcode FROM location_postcode)
""")
cur.execute("""
SELECT count(getorcreate_postcode_id(v)) FROM
(SELECT distinct(postcode) as v FROM location_postcode) p
""")
conn.commit()
with tokenizer.name_analyzer() as analyzer:
analyzer.add_postcodes_from_db()