move word table and normalisation SQL into tokenizer

Creating and populating the word table is now the responsibility
of the tokenizer.

The get_maxwordfreq() function has been replaced with a
simple template parameter to the SQL during function installation.
The number is read from the property table in the database to
ensure that it does not change after installation.
This commit is contained in:
Sarah Hoffmann
2021-04-22 22:47:34 +02:00
parent b5540dc35c
commit fbbdd31399
15 changed files with 117 additions and 53 deletions

View File

@@ -46,6 +46,7 @@ class UpdateRefresh:
@staticmethod
def run(args):
from ..tools import refresh
from ..tokenizer import factory as tokenizer_factory
if args.postcodes:
LOG.warning("Update postcodes centroid")
@@ -66,6 +67,8 @@ class UpdateRefresh:
with connect(args.config.get_libpq_dsn()) as conn:
refresh.create_functions(conn, args.config,
args.diffs, args.enable_debug_statements)
tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)
tokenizer.update_sql_functions(args.config)
if args.wiki_data:
data_path = Path(args.config.WIKIPEDIA_DATA_PATH

View File

@@ -100,15 +100,19 @@ class SetupAll:
if args.continue_at is None or args.continue_at == 'load-data':
LOG.warning('Initialise tables')
with connect(args.config.get_libpq_dsn()) as conn:
database_import.truncate_data_tables(conn, args.config.MAX_WORD_FREQUENCY)
database_import.truncate_data_tables(conn)
LOG.warning('Load data into placex table')
database_import.load_data(args.config.get_libpq_dsn(),
args.data_dir,
args.threads or psutil.cpu_count() or 1)
LOG.warning("Setting up tokenizer")
tokenizer = tokenizer_factory.create_tokenizer(args.config)
if args.continue_at is None or args.continue_at == 'load-data':
# (re)initialise the tokenizer data
tokenizer = tokenizer_factory.create_tokenizer(args.config)
else:
# just load the tokenizer
tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)
if args.continue_at is None or args.continue_at == 'load-data':
LOG.warning('Calculate postcodes')

View File

@@ -89,8 +89,6 @@ class SQLPreprocessor:
self.env.globals['db'] = db_info
self.env.globals['sql'] = _setup_postgres_sql(conn)
self.env.globals['postgres'] = _setup_postgresql_features(conn)
self.env.globals['modulepath'] = config.DATABASE_MODULE_PATH or \
str((config.project_dir / 'module').resolve())
def run_sql_file(self, conn, name, **kwargs):

View File

@@ -8,9 +8,12 @@ import psycopg2
from nominatim.db.connection import connect
from nominatim.db import properties
from nominatim.db import utils as db_utils
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.errors import UsageError
DBCFG_NORMALIZATION = "tokenizer_normalization"
DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
LOG = logging.getLogger()
@@ -53,6 +56,9 @@ def _install_module(config_module_path, src_dir, module_dir):
def _check_module(module_dir, conn):
""" Try to use the PostgreSQL module to confirm that it is correctly
installed and accessible from PostgreSQL.
"""
with conn.cursor() as cur:
try:
cur.execute("""CREATE FUNCTION nominatim_test_import_func(text)
@@ -91,7 +97,11 @@ class LegacyTokenizer:
with connect(self.dsn) as conn:
_check_module(module_dir, conn)
self._save_config(conn)
self._save_config(conn, config)
conn.commit()
self.update_sql_functions(config)
self._init_db_tables(config)
def init_from_project(self):
@@ -101,6 +111,19 @@ class LegacyTokenizer:
self.normalization = properties.get_property(conn, DBCFG_NORMALIZATION)
def update_sql_functions(self, config):
    """ Reimport the SQL functions for this tokenizer.

        The maximum word frequency is read back from the database
        properties rather than from the configuration, so that the
        value chosen at installation time is preserved.

        Parameters:
            config: project configuration; supplies DATABASE_MODULE_PATH
                    and project_dir for resolving the module location.
    """
    with connect(self.dsn) as conn:
        max_word_freq = properties.get_property(conn, DBCFG_MAXWORDFREQ)
        # Prefer an explicitly configured module path; otherwise fall
        # back to the 'module' directory inside the project directory.
        modulepath = config.DATABASE_MODULE_PATH or \
            str((config.project_dir / 'module').resolve())
        sqlp = SQLPreprocessor(conn, config)
        # max_word_freq and modulepath are template parameters
        # substituted into the SQL file before execution.
        sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer.sql',
                          max_word_freq=max_word_freq,
                          modulepath=modulepath)
def migrate_database(self, config):
""" Initialise the project directory of an existing database for
use with this tokenizer.
@@ -114,11 +137,25 @@ class LegacyTokenizer:
with connect(self.dsn) as conn:
_check_module(module_dir, conn)
self._save_config(conn)
self._save_config(conn, config)
def _save_config(self, conn):
def _init_db_tables(self, config):
    """ Set up the word table and fill it with pre-computed word
        frequencies.

        Parameters:
            config: project configuration; lib_dir.data must point at
                    the directory containing the shipped 'words.sql'.
    """
    with connect(self.dsn) as conn:
        sqlp = SQLPreprocessor(conn, config)
        sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_tables.sql')
        # Commit so the freshly created tables are visible to the
        # separate connection opened by execute_file() below.
        conn.commit()

    LOG.warning("Precomputing word tokens")
    # Bulk-load the pre-computed word frequency data shipped with the
    # source tree.
    db_utils.execute_file(self.dsn, config.lib_dir.data / 'words.sql')
def _save_config(self, conn, config):
    """ Save the configuration that needs to remain stable for the given
        database as database properties.

        Storing MAX_WORD_FREQUENCY in the database ensures the SQL
        functions keep using the value from installation time even if
        the project configuration changes later.

        Parameters:
            conn:   open database connection (caller is responsible
                    for committing).
            config: project configuration providing MAX_WORD_FREQUENCY.
    """
    properties.set_property(conn, DBCFG_NORMALIZATION, self.normalization)
    properties.set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)

View File

@@ -160,11 +160,10 @@ def create_partition_tables(conn, config):
sql.run_sql_file(conn, 'partition-tables.src.sql')
def truncate_data_tables(conn, max_word_frequency=None):
def truncate_data_tables(conn):
""" Truncate all data tables to prepare for a fresh load.
"""
with conn.cursor() as cur:
cur.execute('TRUNCATE word')
cur.execute('TRUNCATE placex')
cur.execute('TRUNCATE place_addressline')
cur.execute('TRUNCATE location_area')
@@ -183,23 +182,13 @@ def truncate_data_tables(conn, max_word_frequency=None):
for table in [r[0] for r in list(cur)]:
cur.execute('TRUNCATE ' + table)
if max_word_frequency is not None:
# Used by getorcreate_word_id to ignore frequent partial words.
cur.execute("""CREATE OR REPLACE FUNCTION get_maxwordfreq()
RETURNS integer AS $$
SELECT {} as maxwordfreq;
$$ LANGUAGE SQL IMMUTABLE
""".format(max_word_frequency))
conn.commit()
conn.commit()
_COPY_COLUMNS = 'osm_type, osm_id, class, type, name, admin_level, address, extratags, geometry'
def load_data(dsn, data_dir, threads):
def load_data(dsn, threads):
""" Copy data into the word and placex table.
"""
# Pre-calculate the most important terms in the word list.
db_utils.execute_file(dsn, data_dir / 'words.sql')
sel = selectors.DefaultSelector()
# Then copy data from place to placex in <threads - 1> chunks.
place_threads = max(1, threads - 1)

View File

@@ -49,6 +49,8 @@ def migrate(config, paths):
if has_run_migration:
LOG.warning('Updating SQL functions.')
refresh.create_functions(conn, config)
tokenizer = tokenizer_factory.get_tokenizer_for_db(config)
tokenizer.update_sql_functions(config)
properties.set_property(conn, 'database_version',
'{0[0]}.{0[1]}.{0[2]}-{0[3]}'.format(NOMINATIM_VERSION))