Mirror of https://github.com/osm-search/Nominatim.git
introduce name analyzer
The name analyzer is the actual workhorse of the tokenizer. It is instantiated per thread and provides all functions for analysing names and queries.
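The intended call pattern, as a minimal sketch mirroring the docstring added below (`tokenizer` is any tokenizer instance and `place` an assumed placex row dict):

    # One analyzer per thread; the context manager releases the
    # underlying database connection on exit.
    with tokenizer.name_analyzer() as analyzer:
        token_info = analyzer.process_place(place)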
--- a/nominatim/indexer/indexer.py
+++ b/nominatim/indexer/indexer.py
@@ -124,8 +124,9 @@ class Indexer:
         LOG.warning("Starting indexing boundaries using %s threads",
                     self.num_threads)
 
-        for rank in range(max(minrank, 4), min(maxrank, 26)):
-            self._index(runners.BoundaryRunner(rank))
+        with self.tokenizer.name_analyzer() as analyzer:
+            for rank in range(max(minrank, 4), min(maxrank, 26)):
+                self._index(runners.BoundaryRunner(rank, analyzer))
 
     def index_by_rank(self, minrank, maxrank):
         """ Index all entries of placex in the given rank range (inclusive)
@@ -138,15 +139,16 @@ class Indexer:
         LOG.warning("Starting indexing rank (%i to %i) using %i threads",
                     minrank, maxrank, self.num_threads)
 
-        for rank in range(max(1, minrank), maxrank):
-            self._index(runners.RankRunner(rank))
+        with self.tokenizer.name_analyzer() as analyzer:
+            for rank in range(max(1, minrank), maxrank):
+                self._index(runners.RankRunner(rank, analyzer))
 
-        if maxrank == 30:
-            self._index(runners.RankRunner(0))
-            self._index(runners.InterpolationRunner(), 20)
-            self._index(runners.RankRunner(30), 20)
-        else:
-            self._index(runners.RankRunner(maxrank))
+            if maxrank == 30:
+                self._index(runners.RankRunner(0, analyzer))
+                self._index(runners.InterpolationRunner(), 20)
+                self._index(runners.RankRunner(30, analyzer), 20)
+            else:
+                self._index(runners.RankRunner(maxrank, analyzer))
 
 
     def index_postcodes(self):
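The `with` block is hoisted above the rank loop, so one analyzer (and one extra database connection) serves every runner of an indexing pass. Since analyzers are not thread-safe, a caller that parallelises by thread would open one per thread; a hypothetical, runnable sketch with a stub standing in for `tokenizer.name_analyzer()`:

    import threading

    class _StubAnalyzer:
        # Stand-in mirroring the DummyNameAnalyzer from the test diff
        # below; real code would use tokenizer.name_analyzer() instead.
        def __enter__(self):
            return self
        def __exit__(self, *exc):
            pass
        def process_place(self, place):
            return {}

    def worker(rank):
        with _StubAnalyzer() as analyzer:   # one analyzer per thread
            analyzer.process_place({'rank': rank})

    threads = [threading.Thread(target=worker, args=(r,)) for r in (4, 5, 6)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()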
--- a/nominatim/indexer/runners.py
+++ b/nominatim/indexer/runners.py
@@ -2,6 +2,10 @@
 Mix-ins that provide the actual commands for the indexer for various indexing
 tasks.
 """
+import functools
+
+import psycopg2.extras
+
 # pylint: disable=C0111
 
 class AbstractPlacexRunner:
@@ -9,28 +13,26 @@ class AbstractPlacexRunner:
     """
     SELECT_SQL = 'SELECT place_id, (placex_prepare_update(placex)).* FROM placex'
 
-    def __init__(self, rank):
+    def __init__(self, rank, analyzer):
         self.rank = rank
-        self._sql_terms = 0
-        self._cached_index_sql = None
-
-    def _index_sql(self, num_places):
-        if num_places != self._sql_terms:
-            self._cached_index_sql = \
-                """ UPDATE placex
-                    SET indexed_status = 0, address = v.addr
-                    FROM (VALUES {}) as v(id, addr)
-                    WHERE place_id = v.id
-                """.format(','.join(["(%s, %s::hstore)"] * num_places))
-            self._sql_terms = num_places
-
-        return self._cached_index_sql
+        self.analyzer = analyzer
+
+
+    @staticmethod
+    @functools.lru_cache(maxsize=1)
+    def _index_sql(num_places):
+        return """ UPDATE placex
+                   SET indexed_status = 0, address = v.addr, token_info = v.ti
+                   FROM (VALUES {}) as v(id, addr, ti)
+                   WHERE place_id = v.id
+               """.format(','.join(["(%s, %s::hstore, %s::json)"] * num_places))
 
 
     def index_places(self, worker, places):
         values = []
         for place in places:
             values.extend((place[x] for x in ('place_id', 'address')))
+            values.append(psycopg2.extras.Json(self.analyzer.process_place(place)))
 
         worker.perform(self._index_sql(len(places)), values)
 
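The memoisation replacing the hand-rolled cache is worth spelling out: `_index_sql` builds one UPDATE with as many `VALUES` tuples as there are places in the batch, and `functools.lru_cache(maxsize=1)` keeps the last statement around, so as long as the batch size stays constant the string is formatted only once. A standalone sketch of the same pattern (module-level here for brevity):

    import functools

    @functools.lru_cache(maxsize=1)
    def index_sql(num_places):
        # Rebuilt only when the batch size changes.
        return """ UPDATE placex
                   SET indexed_status = 0, address = v.addr, token_info = v.ti
                   FROM (VALUES {}) as v(id, addr, ti)
                   WHERE place_id = v.id
               """.format(','.join(["(%s, %s::hstore, %s::json)"] * num_places))

    index_sql(1000)   # formats the statement
    index_sql(1000)   # cache hit: same string, no formatting
    index_sql(3)      # a final short batch evicts the cached entry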
--- a/nominatim/tokenizer/legacy_tokenizer.py
+++ b/nominatim/tokenizer/legacy_tokenizer.py
@@ -5,6 +5,7 @@ import logging
 import shutil
 
 import psycopg2
+import psycopg2.extras
 
 from nominatim.db.connection import connect
 from nominatim.db import properties
@@ -140,6 +141,24 @@ class LegacyTokenizer:
             self._save_config(conn, config)
 
 
+    def name_analyzer(self):
+        """ Create a new analyzer for tokenizing names and queries
+            using this tokenizer. Analyzers are context managers and should
+            be used accordingly:
+
+            ```
+            with tokenizer.name_analyzer() as analyzer:
+                analyzer.tokenize()
+            ```
+
+            When used outside the with construct, the caller must ensure to
+            call the close() function before destructing the analyzer.
+
+            Analyzers are not thread-safe. You need to instantiate one per thread.
+        """
+        return LegacyNameAnalyzer(self.dsn)
+
+
     def _init_db_tables(self, config):
         """ Set up the word table and fill it with pre-computed word
             frequencies.
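For the non-context-manager path the docstring mentions, the caller's obligation sketched out (`tokenizer` and `place` assumed as before):

    analyzer = tokenizer.name_analyzer()
    try:
        info = analyzer.process_place(place)
    finally:
        analyzer.close()   # must be called explicitly outside `with`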
@@ -159,3 +178,42 @@ class LegacyTokenizer:
         """
         properties.set_property(conn, DBCFG_NORMALIZATION, self.normalization)
         properties.set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
+
+
+class LegacyNameAnalyzer:
+    """ The legacy analyzer uses the special PostgreSQL module for
+        splitting names.
+
+        Each instance opens a connection to the database to request the
+        normalization.
+    """
+
+    def __init__(self, dsn):
+        self.conn = connect(dsn).connection
+        self.conn.autocommit = True
+        psycopg2.extras.register_hstore(self.conn)
+
+
+    def __enter__(self):
+        return self
+
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        self.close()
+
+
+    def close(self):
+        """ Free all resources used by the analyzer.
+        """
+        if self.conn:
+            self.conn.close()
+            self.conn = None
+
+
+    def process_place(self, place):
+        """ Determine tokenizer information about the given place.
+
+            Returns a JSON-serialisable structure that will be handed into
+            the database via the token_info field.
+        """
+        return {}
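In `__init__`, `register_hstore` teaches this one connection to adapt between `hstore` values and Python dicts, and autocommit keeps the lookup queries from accumulating in an open transaction. A minimal standalone sketch (assuming a reachable database with the hstore extension installed; the DSN is made up):

    import psycopg2
    import psycopg2.extras

    conn = psycopg2.connect("dbname=nominatim")   # assumed DSN
    conn.autocommit = True
    psycopg2.extras.register_hstore(conn)         # hstore <-> dict adaptation

    with conn.cursor() as cur:
        cur.execute("SELECT 'name=>Berlin'::hstore")
        print(cur.fetchone()[0])                  # {'name': 'Berlin'}
    conn.close()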
--- a/test/python/dummy_tokenizer.py
+++ b/test/python/dummy_tokenizer.py
@@ -23,3 +23,30 @@ class DummyTokenizer:
     def init_from_project(self):
         assert self.init_state == None
         self.init_state = "loaded"
+
+
+    def name_analyzer(self):
+        return DummyNameAnalyzer()
+
+
+class DummyNameAnalyzer:
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        self.close()
+
+
+    def close(self):
+        """ Free all resources used by the analyzer.
+        """
+        pass
+
+    def process_place(self, place):
+        """ Determine tokenizer information about the given place.
+
+            Returns a JSON-serialisable structure that will be handed into
+            the database via the token_info field.
+        """
+        return {}
--- a/test/python/test_indexing.py
+++ b/test/python/test_indexing.py
@@ -29,6 +29,7 @@ class IndexerTestDB:
                                partition SMALLINT,
                                admin_level SMALLINT,
                                address HSTORE,
+                               token_info JSONB,
                                geometry_sector INTEGER)""")
             cur.execute("""CREATE TABLE location_property_osmline (
                                place_id BIGINT,
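The new `token_info JSONB` column matches what `index_places` writes: the dict from `process_place` is wrapped in `psycopg2.extras.Json`, which adapts it to a `json` query parameter. A sketch of that round trip (`cur` is an assumed open cursor; the dict content is made up):

    import psycopg2.extras

    cur.execute("UPDATE placex SET token_info = %s WHERE place_id = %s",
                (psycopg2.extras.Json({'names': [1, 2, 3]}), 1))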