Merge pull request #2472 from lonvia/word-count-computation

Fix word count computation for ICU tokenizer
2021-10-19 14:58:57 +02:00
parent c86cfefc48 824562357b
commit a0f5613a23
8 changed files with 61 additions and 61 deletions
--- a/lib-sql/words_from_search_name.sql
+++ b/lib-sql/words_from_search_name.sql
@@ -1,11 +0,0 @@
 -- Recompute word.search_name_count from the contents of search_name.
 -- Drop the helper table first in case an earlier run left it behind.
 DROP TABLE IF EXISTS word_frequencies;
 -- Count how often each word id occurs across all name_vector arrays.
 CREATE TABLE word_frequencies AS
 SELECT unnest(name_vector) as id, count(*) FROM search_name GROUP BY id;
 -- Index the ids that the UPDATE below joins on.
 CREATE INDEX idx_word_frequencies ON word_frequencies(id);
 -- Only rows whose word_token starts with a space receive the new count.
 UPDATE word SET search_name_count = count
  FROM word_frequencies
 WHERE word_token like ' %' and word_id = id;
 -- The helper table is no longer needed once the counts are copied.
 DROP TABLE word_frequencies;
--- a/nominatim/clicmd/refresh.py
+++ b/nominatim/clicmd/refresh.py
@@ -71,8 +71,8 @@ class UpdateRefresh:
                          "Postcode updates on a frozen database is not possible.")
        if args.word_counts:
-            LOG.warning('Recompute frequency of full-word search terms')
+            LOG.warning('Recompute word statistics')
-            refresh.recompute_word_counts(args.config.get_libpq_dsn(), args.sqllib_dir)
+            self._get_tokenizer(args.config).update_statistics()
        if args.address_levels:
            cfg = Path(args.config.ADDRESS_LEVEL_CONFIG)
--- a/nominatim/tokenizer/base.py
+++ b/nominatim/tokenizer/base.py
@@ -205,6 +205,16 @@ class AbstractTokenizer(ABC):
        pass
    @abstractmethod
    def update_statistics(self) -> None:
        """ Bring the statistics the tokenizer keeps for efficient lookup
            up to date.

            Users are expected to trigger this function occasionally to
            improve performance. It is an optimisation only: the tokenizer
            must work correctly even when the function is never called.
        """
    @abstractmethod
    def name_analyzer(self) -> AbstractAnalyzer:
        """ Create a new analyzer for tokenizing names and queries
--- a/nominatim/tokenizer/icu_tokenizer.py
+++ b/nominatim/tokenizer/icu_tokenizer.py
@@ -2,7 +2,6 @@
 Tokenizer implementing normalisation as used before Nominatim 4 but using
 libICU instead of the PostgreSQL module.
 """
 from collections import Counter
 import itertools
 import json
 import logging
@@ -93,6 +92,25 @@ class LegacyICUTokenizer(AbstractTokenizer):
        return None
    def update_statistics(self):
        """ Refresh the usage count stored with the words.

            Counts how often each word id occurs in the name vectors of
            the search_name table and merges the result under the key
            'count' into the info column of the matching word rows.
        """
        # The statements are named up front so that the transaction below
        # reads as a plain sequence of steps.
        count_sql = """CREATE TEMP TABLE word_frequencies AS
                                 SELECT unnest(name_vector) as id, count(*)
                                 FROM search_name GROUP BY id"""
        index_sql = "CREATE INDEX ON word_frequencies(id)"
        update_sql = """UPDATE word
                               SET info = info || jsonb_build_object('count', count)
                               FROM word_frequencies WHERE word_id = id"""

        with connect(self.dsn) as connection:
            with connection.cursor() as cursor:
                # Drop the scratch table before it is (re)created.
                cursor.drop_table("word_frequencies")

                LOG.info("Computing word frequencies")
                cursor.execute(count_sql)
                cursor.execute(index_sql)

                LOG.info("Update word table with recomputed frequencies")
                cursor.execute(update_sql)

                # The scratch table has served its purpose.
                cursor.drop_table("word_frequencies")
            connection.commit()
    def name_analyzer(self):
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
@@ -142,43 +160,6 @@ class LegacyICUTokenizer(AbstractTokenizer):
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
            conn.commit()
            LOG.warning("Precomputing word tokens")
            # get partial words and their frequencies
            words = self._count_partial_terms(conn)
            # copy them back into the word table
            with CopyBuffer() as copystr:
                for term, cnt in words.items():
                    copystr.add('w', term, json.dumps({'count': cnt}))
                with conn.cursor() as cur:
                    copystr.copy_out(cur, 'word',
                                     columns=['type', 'word_token', 'info'])
                    cur.execute("""UPDATE word SET word_id = nextval('seq_word')
                                   WHERE word_id is null and type = 'w'""")
            conn.commit()
    def _count_partial_terms(self, conn):
        """ Tally the terms of all multi-word names in the place table.

            Every distinct name value shorter than 75 characters is run
            through the search transliteration. When the result contains
            a space, it is split at whitespace and each distinct term is
            credited with the number of occurrences of the name.

            Returns a Counter mapping each term to its accumulated count.
        """
        query = """ SELECT v, count(*) FROM
                              (SELECT svals(name) as v FROM place)x
                            WHERE length(v) < 75 GROUP BY v"""
        term_counts = Counter()
        analysis = self.loader.make_token_analysis()

        with conn.cursor(name="words") as cursor:
            cursor.execute(query)

            for name, occurrences in cursor:
                normalized = analysis.search.transliterate(name)
                # Empty results and single-word names contribute nothing.
                if not normalized or ' ' not in normalized:
                    continue
                # A term repeated within one name is counted only once.
                for term in set(normalized.split()):
                    term_counts[term] += occurrences

        return term_counts
 class LegacyICUNameAnalyzer(AbstractAnalyzer):
    """ The legacy analyzer uses the ICU library for splitting names.
--- a/nominatim/tokenizer/legacy_tokenizer.py
+++ b/nominatim/tokenizer/legacy_tokenizer.py
@@ -186,6 +186,24 @@ class LegacyTokenizer(AbstractTokenizer):
            self._save_config(conn, config)
    def update_statistics(self):
        """ Refresh the usage count of the words.

            Counts how often each word id occurs in the name vectors of
            the search_name table and writes the result into the
            search_name_count column of the matching word rows whose
            token starts with a space.
        """
        # The statements are named up front so that the transaction below
        # reads as a plain sequence of steps.
        count_sql = """CREATE TEMP TABLE word_frequencies AS
                                 SELECT unnest(name_vector) as id, count(*)
                                 FROM search_name GROUP BY id"""
        index_sql = "CREATE INDEX ON word_frequencies(id)"
        update_sql = """UPDATE word SET search_name_count = count
                               FROM word_frequencies
                               WHERE word_token like ' %' and word_id = id"""

        with connect(self.dsn) as connection:
            with connection.cursor() as cursor:
                # Drop the scratch table before it is (re)created.
                cursor.drop_table("word_frequencies")

                LOG.info("Computing word frequencies")
                cursor.execute(count_sql)
                cursor.execute(index_sql)

                LOG.info("Update word table with recomputed frequencies")
                cursor.execute(update_sql)

                # The scratch table has served its purpose.
                cursor.drop_table("word_frequencies")
            connection.commit()
    def name_analyzer(self):
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
--- a/nominatim/tools/refresh.py
+++ b/nominatim/tools/refresh.py
@@ -14,12 +14,6 @@ from nominatim.version import NOMINATIM_VERSION
 LOG = logging.getLogger()
 def recompute_word_counts(dsn, sql_dir):
    """ Recalculate how often full-word search terms are used.

        Hands the script 'words_from_search_name.sql' from the given
        SQL directory to execute_file() together with the DSN.
    """
    script = sql_dir / 'words_from_search_name.sql'
    execute_file(dsn, script)
 def _add_address_level_rows_from_entry(rows, entry):
    """ Converts a single entry from the JSON format for address rank
        descriptions into a flat format suitable for inserting into a
--- a/test/python/test_cli.py
+++ b/test/python/test_cli.py
@@ -144,6 +144,7 @@ class TestCliWithDb:
            def __init__(self, *args, **kwargs):
                # Flags recording which tokenizer hooks have been invoked.
                # All constructor arguments are accepted and ignored.
                self.update_sql_functions_called = False
                self.finalize_import_called = False
                self.update_statistics_called = False
            def update_sql_functions(self, *args):
                # Only record the call; the arguments are ignored.
                self.update_sql_functions_called = True
@@ -151,6 +152,10 @@ class TestCliWithDb:
            def finalize_import(self, *args):
                # Only record the call; the arguments are ignored.
                self.finalize_import_called = True
            def update_statistics(self):
                # Only record the call so that tests can assert on it.
                self.update_statistics_called = True
        tok = DummyTokenizer()
        monkeypatch.setattr(nominatim.tokenizer.factory, 'get_tokenizer_for_db',
                            lambda *args: tok)
@@ -316,7 +321,6 @@ class TestCliWithDb:
        assert func.called == 1
    @pytest.mark.parametrize("command,func", [
                             ('word-counts', 'recompute_word_counts'),
                             ('address-levels', 'load_address_levels_from_file'),
                             ('wiki-data', 'import_wikipedia_articles'),
                             ('importance', 'recompute_importance'),
@@ -329,6 +333,11 @@ class TestCliWithDb:
        assert func_mock.called == 1
    def test_refresh_word_count(self):
        """ 'refresh --word-counts' must exit successfully and call the
            tokenizer's update_statistics().
        """
        # Use the full option name: the abbreviated '--word-count' is only
        # accepted because the argument parser matches unique prefixes.
        assert self.call_nominatim('refresh', '--word-counts') == 0
        assert self.tokenizer_mock.update_statistics_called
    def test_refresh_postcodes(self, mock_func_factory, place_table):
        func_mock = mock_func_factory(nominatim.tools.postcodes, 'update_postcodes')
        idx_mock = mock_func_factory(nominatim.indexer.indexer.Indexer, 'index_postcodes')
--- a/test/python/test_tokenizer_icu.py
+++ b/test/python/test_tokenizer_icu.py
@@ -160,7 +160,7 @@ def test_init_new(tokenizer_factory, test_config, monkeypatch, db_prop):
    assert db_prop(icu_tokenizer.DBCFG_TERM_NORMALIZATION) == ':: lower();'
-def test_init_word_table(tokenizer_factory, test_config, place_row, word_table):
+def test_init_word_table(tokenizer_factory, test_config, place_row, temp_db_cursor):
    place_row(names={'name' : 'Test Area', 'ref' : '52'})
    place_row(names={'name' : 'No Area'})
    place_row(names={'name' : 'Holzstrasse'})
@@ -168,8 +168,7 @@ def test_init_word_table(tokenizer_factory, test_config, place_row, word_table):
    tok = tokenizer_factory()
    tok.init_new_db(test_config)
-    assert word_table.get_partial_words() == {('test', 1),
+    assert temp_db_cursor.table_exists('word')
                                              ('no', 1), ('area', 2)}
 def test_init_from_project(monkeypatch, test_config, tokenizer_factory):