Merge pull request #2472 from lonvia/word-count-computation

Fix word count computation for ICU tokenizer
Authored by Sarah Hoffmann on 2021-10-19 14:58:57 +02:00, committed by GitHub.
8 changed files with 61 additions and 61 deletions

View File

@@ -1,11 +0,0 @@
-DROP TABLE IF EXISTS word_frequencies;
-CREATE TABLE word_frequencies AS
-  SELECT unnest(name_vector) as id, count(*) FROM search_name GROUP BY id;
-
-CREATE INDEX idx_word_frequencies ON word_frequencies(id);
-
-UPDATE word SET search_name_count = count
-  FROM word_frequencies
-  WHERE word_token like ' %' and word_id = id;
-
-DROP TABLE word_frequencies;

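For reference, the deleted words_from_search_name.sql recomputed the frequency of every full word over all search_name entries and was run as a standalone script. A rough Python equivalent, as a sketch only (psycopg2 and the wrapper function name are illustrative, not part of this change):

    import psycopg2

    def recompute_full_word_counts(dsn):
        # Sketch of the deleted script: count occurrences of every word id
        # in search_name, then copy the counts for full words (those whose
        # word_token starts with a blank) into the word table.
        with psycopg2.connect(dsn) as conn:
            with conn.cursor() as cur:
                cur.execute("DROP TABLE IF EXISTS word_frequencies")
                cur.execute("""CREATE TABLE word_frequencies AS
                                 SELECT unnest(name_vector) as id, count(*)
                                 FROM search_name GROUP BY id""")
                cur.execute("CREATE INDEX idx_word_frequencies ON word_frequencies(id)")
                cur.execute("""UPDATE word SET search_name_count = count
                               FROM word_frequencies
                               WHERE word_token like ' %' and word_id = id""")
                cur.execute("DROP TABLE word_frequencies")
        # psycopg2's connection context manager commits the transaction on success.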
View File

@@ -71,8 +71,8 @@ class UpdateRefresh:
                           "Postcode updates on a frozen database are not possible.")
 
         if args.word_counts:
-            LOG.warning('Recompute frequency of full-word search terms')
-            refresh.recompute_word_counts(args.config.get_libpq_dsn(), args.sqllib_dir)
+            LOG.warning('Recompute word statistics')
+            self._get_tokenizer(args.config).update_statistics()
 
         if args.address_levels:
             cfg = Path(args.config.ADDRESS_LEVEL_CONFIG)

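The refresh command now delegates the work to whichever tokenizer the database was imported with, instead of running a fixed SQL script. Done by hand, the same path looks roughly like this (a sketch; get_tokenizer_for_db is the factory also patched in the tests below, and config stands for a loaded Nominatim configuration):

    from nominatim.tokenizer.factory import get_tokenizer_for_db

    def refresh_word_counts(config):
        # Instantiate the tokenizer recorded in the database and let it
        # recompute its own statistics.
        tokenizer = get_tokenizer_for_db(config)
        tokenizer.update_statistics()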
View File

@@ -205,6 +205,16 @@ class AbstractTokenizer(ABC):
         pass
 
 
+    @abstractmethod
+    def update_statistics(self) -> None:
+        """ Recompute any tokenizer statistics necessary for efficient lookup.
+            This function is meant to be called from time to time by the user
+            to improve performance. However, the tokenizer must not depend on
+            it being called in order to work.
+        """
+        pass
+
+
     @abstractmethod
     def name_analyzer(self) -> AbstractAnalyzer:
         """ Create a new analyzer for tokenizing names and queries

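Every tokenizer implementation must now provide this hook. A minimal sketch of a conforming subclass (the class is hypothetical; the other abstract methods are elided):

    from nominatim.tokenizer.base import AbstractTokenizer

    class NullStatisticsTokenizer(AbstractTokenizer):
        """ Hypothetical tokenizer that keeps no lookup statistics. """

        def update_statistics(self) -> None:
            # Nothing to recompute. Per the contract above, search must
            # keep working even if this is never called, so a no-op is a
            # valid implementation.
            pass

        # ... the remaining abstract methods would be implemented here.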
View File

@@ -2,7 +2,6 @@
 Tokenizer implementing normalisation as used before Nominatim 4 but using
 libICU instead of the PostgreSQL module.
 """
-from collections import Counter
 import itertools
 import json
 import logging
@@ -93,6 +92,25 @@ class LegacyICUTokenizer(AbstractTokenizer):
         return None
 
 
+    def update_statistics(self):
+        """ Recompute frequencies for all name words.
+        """
+        with connect(self.dsn) as conn:
+            with conn.cursor() as cur:
+                cur.drop_table("word_frequencies")
+                LOG.info("Computing word frequencies")
+                cur.execute("""CREATE TEMP TABLE word_frequencies AS
+                                 SELECT unnest(name_vector) as id, count(*)
+                                 FROM search_name GROUP BY id""")
+                cur.execute("CREATE INDEX ON word_frequencies(id)")
+                LOG.info("Update word table with recomputed frequencies")
+                cur.execute("""UPDATE word
+                               SET info = info || jsonb_build_object('count', count)
+                               FROM word_frequencies WHERE word_id = id""")
+                cur.drop_table("word_frequencies")
+            conn.commit()
+
+
     def name_analyzer(self):
         """ Create a new analyzer for tokenizing names and queries
             using this tokenizer. Analyzers are context managers and should
@@ -142,43 +160,6 @@
             sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
             conn.commit()
 
-            LOG.warning("Precomputing word tokens")
-
-            # get partial words and their frequencies
-            words = self._count_partial_terms(conn)
-
-            # copy them back into the word table
-            with CopyBuffer() as copystr:
-                for term, cnt in words.items():
-                    copystr.add('w', term, json.dumps({'count': cnt}))
-
-                with conn.cursor() as cur:
-                    copystr.copy_out(cur, 'word',
-                                     columns=['type', 'word_token', 'info'])
-                    cur.execute("""UPDATE word SET word_id = nextval('seq_word')
-                                   WHERE word_id is null and type = 'w'""")
-
-            conn.commit()
-
-    def _count_partial_terms(self, conn):
-        """ Count the partial terms from the names in the place table.
-        """
-        words = Counter()
-        analysis = self.loader.make_token_analysis()
-
-        with conn.cursor(name="words") as cur:
-            cur.execute(""" SELECT v, count(*) FROM
-                              (SELECT svals(name) as v FROM place)x
-                            WHERE length(v) < 75 GROUP BY v""")
-
-            for name, cnt in cur:
-                word = analysis.search.transliterate(name)
-                if word and ' ' in word:
-                    for term in set(word.split()):
-                        words[term] += cnt
-
-        return words
-
 
 class LegacyICUNameAnalyzer(AbstractAnalyzer):
     """ The legacy analyzer uses the ICU library for splitting names.

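Note where the count lands: the ICU word table keeps it under a 'count' key in the jsonb info column rather than in a dedicated column. Reading a recomputed frequency back could look like this (a sketch; the function is illustrative and assumes the connect() helper comes from nominatim.db.connection, as elsewhere in this code base):

    from nominatim.db.connection import connect

    def full_word_count(dsn, word_id):
        # Return the frequency written by update_statistics(), or 0 when
        # no count has been stored for this word yet.
        with connect(dsn) as conn:
            with conn.cursor() as cur:
                cur.execute("SELECT info->>'count' FROM word WHERE word_id = %s",
                            (word_id, ))
                row = cur.fetchone()
        return int(row[0]) if row and row[0] is not None else 0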
View File

@@ -186,6 +186,24 @@ class LegacyTokenizer(AbstractTokenizer):
             self._save_config(conn, config)
 
 
+    def update_statistics(self):
+        """ Recompute the frequency of full words.
+        """
+        with connect(self.dsn) as conn:
+            with conn.cursor() as cur:
+                cur.drop_table("word_frequencies")
+                LOG.info("Computing word frequencies")
+                cur.execute("""CREATE TEMP TABLE word_frequencies AS
+                                 SELECT unnest(name_vector) as id, count(*)
+                                 FROM search_name GROUP BY id""")
+                cur.execute("CREATE INDEX ON word_frequencies(id)")
+                LOG.info("Update word table with recomputed frequencies")
+                cur.execute("""UPDATE word SET search_name_count = count
+                               FROM word_frequencies
+                               WHERE word_token like ' %' and word_id = id""")
+                cur.drop_table("word_frequencies")
+            conn.commit()
+
     def name_analyzer(self):
         """ Create a new analyzer for tokenizing names and queries
             using this tokenizer. Analyzers are context managers and should

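The body is intentionally the same as the ICU version above; only the final UPDATE differs, because the legacy schema stores the count in the dedicated search_name_count column instead of jsonb. For comparison (illustrative queries only, parameterized by word_id):

    # Where the recomputed frequency ends up, per tokenizer:
    LEGACY_COUNT_SQL = "SELECT search_name_count FROM word WHERE word_id = %s"
    ICU_COUNT_SQL = "SELECT (info->>'count')::int FROM word WHERE word_id = %s"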
View File

@@ -14,12 +14,6 @@ from nominatim.version import NOMINATIM_VERSION
 
 LOG = logging.getLogger()
 
-def recompute_word_counts(dsn, sql_dir):
-    """ Compute the frequency of full-word search terms.
-    """
-    execute_file(dsn, sql_dir / 'words_from_search_name.sql')
-
-
 def _add_address_level_rows_from_entry(rows, entry):
     """ Converts a single entry from the JSON format for address rank
         descriptions into a flat format suitable for inserting into a

View File

@@ -144,6 +144,7 @@ class TestCliWithDb:
             def __init__(self, *args, **kwargs):
                 self.update_sql_functions_called = False
                 self.finalize_import_called = False
+                self.update_statistics_called = False
 
             def update_sql_functions(self, *args):
                 self.update_sql_functions_called = True
@@ -151,6 +152,10 @@ class TestCliWithDb:
             def finalize_import(self, *args):
                 self.finalize_import_called = True
 
+            def update_statistics(self):
+                self.update_statistics_called = True
+
+
         tok = DummyTokenizer()
         monkeypatch.setattr(nominatim.tokenizer.factory, 'get_tokenizer_for_db',
                             lambda *args: tok)
@@ -316,7 +321,6 @@ class TestCliWithDb:
         assert func.called == 1
 
     @pytest.mark.parametrize("command,func", [
-                             ('word-counts', 'recompute_word_counts'),
                              ('address-levels', 'load_address_levels_from_file'),
                              ('wiki-data', 'import_wikipedia_articles'),
                              ('importance', 'recompute_importance'),
@@ -329,6 +333,11 @@ class TestCliWithDb:
         assert func_mock.called == 1
 
+    def test_refresh_word_count(self):
+        assert self.call_nominatim('refresh', '--word-count') == 0
+        assert self.tokenizer_mock.update_statistics_called
+
+
     def test_refresh_postcodes(self, mock_func_factory, place_table):
         func_mock = mock_func_factory(nominatim.tools.postcodes, 'update_postcodes')
         idx_mock = mock_func_factory(nominatim.indexer.indexer.Indexer, 'index_postcodes')

View File

@@ -160,7 +160,7 @@ def test_init_new(tokenizer_factory, test_config, monkeypatch, db_prop):
     assert db_prop(icu_tokenizer.DBCFG_TERM_NORMALIZATION) == ':: lower();'
 
 
-def test_init_word_table(tokenizer_factory, test_config, place_row, word_table):
+def test_init_word_table(tokenizer_factory, test_config, place_row, temp_db_cursor):
     place_row(names={'name' : 'Test Area', 'ref' : '52'})
     place_row(names={'name' : 'No Area'})
     place_row(names={'name' : 'Holzstrasse'})
@@ -168,8 +168,7 @@ def test_init_word_table(tokenizer_factory, test_config, place_row, word_table):
     tok = tokenizer_factory()
     tok.init_new_db(test_config)
 
-    assert word_table.get_partial_words() == {('test', 1),
-                                              ('no', 1), ('area', 2)}
+    assert temp_db_cursor.table_exists('word')
 
 
 def test_init_from_project(monkeypatch, test_config, tokenizer_factory):