extend word statistics to address index

Word frequency in names is not sufficient to interpolate word
frequency in the address because names of towns, states, etc. are
much more frequently used than, say, street names.
This commit is contained in:
Sarah Hoffmann
2024-03-13 15:10:25 +01:00
parent 9c48726691
commit bb5de9b955
4 changed files with 88 additions and 19 deletions

View File

@@ -102,6 +102,7 @@ class Token(ABC):
     lookup_word: str
     is_indexed: bool
+    addr_count: int = 1

     @abstractmethod
     def get_category(self) -> Tuple[str, str]:

View File

@@ -110,7 +110,8 @@ class UpdateRefresh:
         if args.word_counts:
             LOG.warning('Recompute word statistics')
-            self._get_tokenizer(args.config).update_statistics(args.config)
+            self._get_tokenizer(args.config).update_statistics(args.config,
+                                                               threads=args.threads or 1)

         if args.address_levels:
             LOG.warning('Updating address levels')

View File

@@ -168,7 +168,7 @@ class SetupAll:
             tokenizer.finalize_import(args.config)

             LOG.warning('Recompute word counts')
-            tokenizer.update_statistics(args.config)
+            tokenizer.update_statistics(args.config, threads=num_threads)

             webdir = args.project_dir / 'website'
             LOG.warning('Setup website at %s', webdir)

View File

@@ -104,7 +104,7 @@ class ICUTokenizer(AbstractTokenizer):
         self.init_from_project(config)

-    def update_statistics(self, config: Configuration) -> None:
+    def update_statistics(self, config: Configuration, threads: int = 2) -> None:
         """ Recompute frequencies for all name words.
         """
         with connect(self.dsn) as conn:
@@ -112,22 +112,89 @@ class ICUTokenizer(AbstractTokenizer):
                 return

             with conn.cursor() as cur:
-                LOG.info('Computing word frequencies')
-                cur.drop_table('word_frequencies')
-                cur.execute("""CREATE TEMP TABLE word_frequencies AS
-                                 SELECT unnest(name_vector) as id, count(*)
-                                 FROM search_name GROUP BY id""")
-                cur.execute('CREATE INDEX ON word_frequencies(id)')
-                LOG.info('Update word table with recomputed frequencies')
-                cur.drop_table('tmp_word')
-                cur.execute("""CREATE TABLE tmp_word AS
-                                SELECT word_id, word_token, type, word,
-                                       (CASE WHEN wf.count is null THEN info
-                                          ELSE info || jsonb_build_object('count', wf.count)
-                                        END) as info
-                                FROM word LEFT JOIN word_frequencies wf
-                                     ON word.word_id = wf.id""")
-                cur.drop_table('word_frequencies')
+                cur.execute('ANALYSE search_name')
+                if threads > 1:
+                    cur.execute('SET max_parallel_workers_per_gather TO %s',
+                                (min(threads, 6),))
+
+                if conn.server_version_tuple() < (12, 0):
+                    LOG.info('Computing word frequencies')
+                    cur.drop_table('word_frequencies')
+                    cur.drop_table('addressword_frequencies')
+                    cur.execute("""CREATE TEMP TABLE word_frequencies AS
+                                     SELECT unnest(name_vector) as id, count(*)
+                                     FROM search_name GROUP BY id""")
+                    cur.execute('CREATE INDEX ON word_frequencies(id)')
+                    cur.execute("""CREATE TEMP TABLE addressword_frequencies AS
+                                     SELECT unnest(nameaddress_vector) as id, count(*)
+                                     FROM search_name GROUP BY id""")
+                    cur.execute('CREATE INDEX ON addressword_frequencies(id)')
+                    cur.execute("""CREATE OR REPLACE FUNCTION word_freq_update(wid INTEGER,
+                                                                               INOUT info JSONB)
+                                   AS $$
+                                   DECLARE rec RECORD;
+                                   BEGIN
+                                   IF info is null THEN
+                                     info = '{}'::jsonb;
+                                   END IF;
+                                   FOR rec IN SELECT count FROM word_frequencies WHERE id = wid
+                                   LOOP
+                                     info = info || jsonb_build_object('count', rec.count);
+                                   END LOOP;
+                                   FOR rec IN SELECT count FROM addressword_frequencies WHERE id = wid
+                                   LOOP
+                                     info = info || jsonb_build_object('addr_count', rec.count);
+                                   END LOOP;
+                                   IF info = '{}'::jsonb THEN
+                                     info = null;
+                                   END IF;
+                                   END;
+                                   $$ LANGUAGE plpgsql IMMUTABLE;
+                                """)
+                    LOG.info('Update word table with recomputed frequencies')
+                    cur.drop_table('tmp_word')
+                    cur.execute("""CREATE TABLE tmp_word AS
+                                    SELECT word_id, word_token, type, word,
+                                           word_freq_update(word_id, info) as info
+                                    FROM word
+                                """)
+                    cur.drop_table('word_frequencies')
+                    cur.drop_table('addressword_frequencies')
+                else:
+                    LOG.info('Computing word frequencies')
+                    cur.drop_table('word_frequencies')
+                    cur.execute("""
+                      CREATE TEMP TABLE word_frequencies AS
+                      WITH word_freq AS MATERIALIZED (
+                               SELECT unnest(name_vector) as id, count(*)
+                                     FROM search_name GROUP BY id),
+                           addr_freq AS MATERIALIZED (
+                               SELECT unnest(nameaddress_vector) as id, count(*)
+                                     FROM search_name GROUP BY id)
+                      SELECT coalesce(a.id, w.id) as id,
+                             (CASE WHEN w.count is null THEN '{}'::JSONB
+                                  ELSE jsonb_build_object('count', w.count) END
+                              ||
+                              CASE WHEN a.count is null THEN '{}'::JSONB
+                                  ELSE jsonb_build_object('addr_count', a.count) END) as info
+                      FROM word_freq w FULL JOIN addr_freq a ON a.id = w.id;
+                      """)
+                    cur.execute('CREATE UNIQUE INDEX ON word_frequencies(id) INCLUDE(info)')
+                    cur.execute('ANALYSE word_frequencies')
+                    LOG.info('Update word table with recomputed frequencies')
+                    cur.drop_table('tmp_word')
+                    cur.execute("""CREATE TABLE tmp_word AS
+                                    SELECT word_id, word_token, type, word,
+                                           (CASE WHEN wf.info is null THEN word.info
+                                            ELSE coalesce(word.info, '{}'::jsonb) || wf.info
+                                            END) as info
+                                    FROM word LEFT JOIN word_frequencies wf
+                                         ON word.word_id = wf.id
+                                """)
+                    cur.drop_table('word_frequencies')
+
+            with conn.cursor() as cur:
+                cur.execute('SET max_parallel_workers_per_gather TO 0')
+
             sqlp = SQLPreprocessor(conn, config)
             sqlp.run_string(conn,