recreate word table when refreshing counts

The counting touches a large part of the word table, leaving bloated tables and indexes. Thus recreate the table instead and swap it in.
2024-02-04 16:43:33 +01:00
parent 33c0f249b1
commit 81eed0680c
10 changed files with 130 additions and 82 deletions
--- a/nominatim/clicmd/refresh.py
+++ b/nominatim/clicmd/refresh.py
@@ -110,7 +110,7 @@ class UpdateRefresh:

        if args.word_counts:
            LOG.warning('Recompute word statistics')
-            self._get_tokenizer(args.config).update_statistics()
+            self._get_tokenizer(args.config).update_statistics(args.config)

        if args.address_levels:
            LOG.warning('Updating address levels')
--- a/nominatim/clicmd/setup.py
+++ b/nominatim/clicmd/setup.py
@@ -169,7 +169,7 @@ class SetupAll:
        tokenizer.finalize_import(args.config)

        LOG.warning('Recompute word counts')
-        tokenizer.update_statistics()
+        tokenizer.update_statistics(args.config)

        webdir = args.project_dir / 'website'
        LOG.warning('Setup website at %s', webdir)
--- a/nominatim/db/sql_preprocessor.py
+++ b/nominatim/db/sql_preprocessor.py
@@ -90,6 +90,18 @@ class SQLPreprocessor:
        self.env.globals['postgres'] = _setup_postgresql_features(conn)


+    def run_string(self, conn: Connection, template: str, **kwargs: Any) -> None:
+        """ Execute the given SQL template string on the connection.
+            The keyword arguments may supply additional parameters
+            for preprocessing.
+        """
+        sql = self.env.from_string(template).render(**kwargs)
+
+        with conn.cursor() as cur:
+            cur.execute(sql)
+        conn.commit()
+
+
    def run_sql_file(self, conn: Connection, name: str, **kwargs: Any) -> None:
        """ Execute the given SQL file on the connection. The keyword arguments
            may supply additional parameters for preprocessing.
--- a/nominatim/tokenizer/base.py
+++ b/nominatim/tokenizer/base.py
@@ -201,7 +201,7 @@ class AbstractTokenizer(ABC):


    @abstractmethod
-    def update_statistics(self) -> None:
+    def update_statistics(self, config: Configuration) -> None:
        """ Recompute any tokenizer statistics necessary for efficient lookup.
            This function is meant to be called from time to time by the user
            to improve performance. However, the tokenizer must not depend on
--- a/nominatim/tokenizer/icu_tokenizer.py
+++ b/nominatim/tokenizer/icu_tokenizer.py
@@ -31,6 +31,11 @@ DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"

 LOG = logging.getLogger()

+WORD_TYPES =(('country_names', 'C'),
+             ('postcodes', 'P'),
+             ('full_word', 'W'),
+             ('housenumbers', 'H'))
+
 def create(dsn: str, data_dir: Path) -> 'ICUTokenizer':
    """ Create a new instance of the tokenizer provided by this module.
    """
@@ -62,7 +67,8 @@ class ICUTokenizer(AbstractTokenizer):

        if init_db:
            self.update_sql_functions(config)
-            self._init_db_tables(config)
+            self._setup_db_tables(config, 'word')
+            self._create_base_indices(config, 'word')


    def init_from_project(self, config: Configuration) -> None:
@@ -80,9 +86,7 @@ class ICUTokenizer(AbstractTokenizer):
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """
-        with connect(self.dsn) as conn:
-            sqlp = SQLPreprocessor(conn, config)
-            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')
+        self._create_lookup_indices(config, 'word')


    def update_sql_functions(self, config: Configuration) -> None:
@@ -100,24 +104,35 @@ class ICUTokenizer(AbstractTokenizer):
        self.init_from_project(config)


-    def update_statistics(self) -> None:
+    def update_statistics(self, config: Configuration) -> None:
        """ Recompute frequencies for all name words.
        """
        with connect(self.dsn) as conn:
-            if conn.table_exists('search_name'):
-                with conn.cursor() as cur:
-                    cur.drop_table("word_frequencies")
-                    LOG.info("Computing word frequencies")
-                    cur.execute("""CREATE TEMP TABLE word_frequencies AS
-                                     SELECT unnest(name_vector) as id, count(*)
-                                     FROM search_name GROUP BY id""")
-                    cur.execute("CREATE INDEX ON word_frequencies(id)")
-                    LOG.info("Update word table with recomputed frequencies")
-                    cur.execute("""UPDATE word
-                                   SET info = info || jsonb_build_object('count', count)
-                                   FROM word_frequencies WHERE word_id = id""")
-                    cur.drop_table("word_frequencies")
+            if not conn.table_exists('search_name'):
+                return
+
+            with conn.cursor() as cur:
+                LOG.info('Computing word frequencies')
+                cur.drop_table('word_frequencies')
+                cur.execute("""CREATE TEMP TABLE word_frequencies AS
+                                 SELECT unnest(name_vector) as id, count(*)
+                                 FROM search_name GROUP BY id""")
+                cur.execute('CREATE INDEX ON word_frequencies(id)')
+                LOG.info('Update word table with recomputed frequencies')
+                cur.drop_table('tmp_word')
+                cur.execute("""CREATE TABLE tmp_word AS
+                                SELECT word_id, word_token, type, word,
+                                       (CASE WHEN wf.count is null THEN info
+                                          ELSE info || jsonb_build_object('count', wf.count)
+                                        END) as info
+                                FROM word LEFT JOIN word_frequencies wf
+                                  ON word.word_id = wf.id""")
+                cur.drop_table('word_frequencies')
            conn.commit()
+        self._create_base_indices(config, 'tmp_word')
+        self._create_lookup_indices(config, 'tmp_word')
+        self._move_temporary_word_table('tmp_word')
+


    def _cleanup_housenumbers(self) -> None:
@@ -219,16 +234,81 @@ class ICUTokenizer(AbstractTokenizer):
            self.loader.save_config_to_db(conn)


-    def _init_db_tables(self, config: Configuration) -> None:
+    def _setup_db_tables(self, config: Configuration, table_name: str) -> None:
+        """ Set up the word table and fill it with pre-computed word
+            frequencies.
+        """
+        with connect(self.dsn) as conn:
+            with conn.cursor() as cur:
+                cur.drop_table(table_name)
+            sqlp = SQLPreprocessor(conn, config)
+            sqlp.run_string(conn, """
+                CREATE TABLE {{table_name}} (
+                      word_id INTEGER,
+                      word_token text NOT NULL,
+                      type text NOT NULL,
+                      word text,
+                      info jsonb
+                    ) {{db.tablespace.search_data}};
+                GRANT SELECT ON {{table_name}} TO "{{config.DATABASE_WEBUSER}}";
+
+                DROP SEQUENCE IF EXISTS seq_{{table_name}};
+                CREATE SEQUENCE seq_{{table_name}} start 1;
+                GRANT SELECT ON seq_{{table_name}} to "{{config.DATABASE_WEBUSER}}";
+            """, table_name=table_name)
+
+
+    def _create_base_indices(self, config: Configuration, table_name: str) -> None:
        """ Set up the word table and fill it with pre-computed word
            frequencies.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
-            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
+            sqlp.run_string(conn,
+                            """CREATE INDEX idx_{{table_name}}_word_token ON {{table_name}}
+                               USING BTREE (word_token) {{db.tablespace.search_index}}""",
+                            table_name=table_name)
+            for name, ctype in WORD_TYPES:
+                sqlp.run_string(conn,
+                                """CREATE INDEX idx_{{table_name}}_{{idx_name}} ON {{table_name}}
+                                   USING BTREE (word) {{db.tablespace.address_index}}
+                                   WHERE type = '{{column_type}}'
+                                """,
+                                table_name=table_name, idx_name=name,
+                                column_type=ctype)
+
+
+    def _create_lookup_indices(self, config: Configuration, table_name: str) -> None:
+        """ Create addtional indexes used when running the API.
+        """
+        with connect(self.dsn) as conn:
+            sqlp = SQLPreprocessor(conn, config)
+            # Index required for details lookup.
+            sqlp.run_string(conn, """
+                CREATE INDEX IF NOT EXISTS idx_{{table_name}}_word_id
+                  ON {{table_name}} USING BTREE (word_id) {{db.tablespace.search_index}}
+            """,
+            table_name=table_name)
+
+
+    def _move_temporary_word_table(self, old: str) -> None:
+        """ Rename all tables and indexes used by the tokenizer.
+        """
+        with connect(self.dsn) as conn:
+            with conn.cursor() as cur:
+                cur.drop_table('word')
+                cur.execute(f"ALTER TABLE {old} RENAME TO word")
+                for idx in ('word_token', 'word_id'):
+                    cur.execute(f"""ALTER INDEX idx_{old}_{idx}
+                                      RENAME TO idx_word_{idx}""")
+                for name, _ in WORD_TYPES:
+                    cur.execute(f"""ALTER INDEX idx_{old}_{name}
+                                    RENAME TO idx_word_{name}""")
            conn.commit()


+
+
 class ICUNameAnalyzer(AbstractAnalyzer):
    """ The ICU analyzer uses the ICU library for splitting names.

--- a/nominatim/tokenizer/legacy_tokenizer.py
+++ b/nominatim/tokenizer/legacy_tokenizer.py
@@ -210,7 +210,7 @@ class LegacyTokenizer(AbstractTokenizer):
            self._save_config(conn, config)


-    def update_statistics(self) -> None:
+    def update_statistics(self, _: Configuration) -> None:
        """ Recompute the frequency of full words.
        """
        with connect(self.dsn) as conn: