make word recount a tokenizer-specific function

This commit is contained in:
Sarah Hoffmann
2021-10-19 11:21:16 +02:00
parent c86cfefc48
commit e8e2502e2f
6 changed files with 49 additions and 19 deletions

View File

@@ -1,11 +0,0 @@
-- Recompute the frequency of full-word search terms in the word table.
-- Remove any leftover frequency table from a previous (possibly aborted) run.
DROP TABLE IF EXISTS word_frequencies;
-- Count how often each word id occurs in the name vectors of search_name.
CREATE TABLE word_frequencies AS
SELECT unnest(name_vector) as id, count(*) FROM search_name GROUP BY id;
-- Index the counts so the UPDATE join below can look them up efficiently.
CREATE INDEX idx_word_frequencies ON word_frequencies(id);
-- Copy the counts back into the word table. Only full words are updated:
-- full-word tokens are marked by a leading space in word_token.
UPDATE word SET search_name_count = count
FROM word_frequencies
WHERE word_token like ' %' and word_id = id;
-- The frequency table is only a scratch area; drop it again.
DROP TABLE word_frequencies;

View File

@@ -71,8 +71,8 @@ class UpdateRefresh:
"Postcode updates on a frozen database is not possible.") "Postcode updates on a frozen database is not possible.")
if args.word_counts: if args.word_counts:
LOG.warning('Recompute frequency of full-word search terms') LOG.warning('Recompute word statistics')
refresh.recompute_word_counts(args.config.get_libpq_dsn(), args.sqllib_dir) self._get_tokenizer(args.config).update_statistics()
if args.address_levels: if args.address_levels:
cfg = Path(args.config.ADDRESS_LEVEL_CONFIG) cfg = Path(args.config.ADDRESS_LEVEL_CONFIG)

View File

@@ -205,6 +205,16 @@ class AbstractTokenizer(ABC):
pass pass
@abstractmethod
def update_statistics(self) -> None:
    """ Recompute any tokenizer statistics necessary for efficient lookup.

        This function is meant to be called from time to time by the user
        to improve performance. However, the tokenizer must not depend on
        it to be called in order to work.
    """
    pass
@abstractmethod @abstractmethod
def name_analyzer(self) -> AbstractAnalyzer: def name_analyzer(self) -> AbstractAnalyzer:
""" Create a new analyzer for tokenizing names and queries """ Create a new analyzer for tokenizing names and queries

View File

@@ -93,6 +93,25 @@ class LegacyICUTokenizer(AbstractTokenizer):
return None return None
def update_statistics(self):
    """ Recompute frequencies for all name words.

        Counts how often every token id appears in the name vectors of
        the search_name table and stores the result in the 'count' field
        of the word table's info column.
    """
    with connect(self.dsn) as db:
        with db.cursor() as cursor:
            # Clear out any scratch table left behind by an earlier run.
            cursor.drop_table("word_frequencies")
            LOG.info("Computing word frequencies")
            frequency_sql = """CREATE TEMP TABLE word_frequencies AS
                             SELECT unnest(name_vector) as id, count(*)
                             FROM search_name GROUP BY id"""
            cursor.execute(frequency_sql)
            # The index makes the join in the UPDATE below efficient.
            cursor.execute("CREATE INDEX ON word_frequencies(id)")
            LOG.info("Update word table with recomputed frequencies")
            update_sql = """UPDATE word
                           SET info = info || jsonb_build_object('count', count)
                           FROM word_frequencies WHERE word_id = id"""
            cursor.execute(update_sql)
            # The frequency table is only needed during the update.
            cursor.drop_table("word_frequencies")
        db.commit()
def name_analyzer(self): def name_analyzer(self):
""" Create a new analyzer for tokenizing names and queries """ Create a new analyzer for tokenizing names and queries
using this tokenizer. Analyzers are context managers and should using this tokenizer. Analyzers are context managers and should

View File

@@ -186,6 +186,24 @@ class LegacyTokenizer(AbstractTokenizer):
self._save_config(conn, config) self._save_config(conn, config)
def update_statistics(self):
    """ Recompute the frequency of full words.

        Counts how often every token id appears in the name vectors of
        the search_name table and writes the result into the
        search_name_count column of the word table for full-word tokens
        (tokens with a leading space).
    """
    with connect(self.dsn) as db:
        with db.cursor() as cursor:
            # Clear out any scratch table left behind by an earlier run.
            cursor.drop_table("word_frequencies")
            LOG.info("Computing word frequencies")
            frequency_sql = """CREATE TEMP TABLE word_frequencies AS
                             SELECT unnest(name_vector) as id, count(*)
                             FROM search_name GROUP BY id"""
            cursor.execute(frequency_sql)
            # The index makes the join in the UPDATE below efficient.
            cursor.execute("CREATE INDEX ON word_frequencies(id)")
            LOG.info("Update word table with recomputed frequencies")
            update_sql = """UPDATE word SET search_name_count = count
                           FROM word_frequencies
                           WHERE word_token like ' %' and word_id = id"""
            cursor.execute(update_sql)
            # The frequency table is only needed during the update.
            cursor.drop_table("word_frequencies")
        db.commit()
def name_analyzer(self): def name_analyzer(self):
""" Create a new analyzer for tokenizing names and queries """ Create a new analyzer for tokenizing names and queries
using this tokenizer. Analyzers are context managers and should using this tokenizer. Analyzers are context managers and should

View File

@@ -14,12 +14,6 @@ from nominatim.version import NOMINATIM_VERSION
LOG = logging.getLogger() LOG = logging.getLogger()
def recompute_word_counts(dsn, sql_dir):
    """ Compute the frequency of full-word search terms.

        Runs the 'words_from_search_name.sql' script from `sql_dir`
        against the database described by the libpq DSN `dsn`.
    """
    execute_file(dsn, sql_dir / 'words_from_search_name.sql')
def _add_address_level_rows_from_entry(rows, entry): def _add_address_level_rows_from_entry(rows, entry):
""" Converts a single entry from the JSON format for address rank """ Converts a single entry from the JSON format for address rank
descriptions into a flat format suitable for inserting into a descriptions into a flat format suitable for inserting into a