Merge pull request #2472 from lonvia/word-count-computation
Fix word count computation for ICU tokenizer
@@ -1,11 +0,0 @@
-DROP TABLE IF EXISTS word_frequencies;
-CREATE TABLE word_frequencies AS
-  SELECT unnest(name_vector) as id, count(*) FROM search_name GROUP BY id;
-
-CREATE INDEX idx_word_frequencies ON word_frequencies(id);
-
-UPDATE word SET search_name_count = count
-  FROM word_frequencies
-  WHERE word_token like ' %' and word_id = id;
-
-DROP TABLE word_frequencies;
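
The deleted script recomputed full-word frequencies as a standalone SQL file; the commit moves this logic into the tokenizers themselves (see the update_statistics() implementations below). For reference, a minimal psycopg2 sketch of the same recomputation; the dsn value is an assumption:

    # Sketch of the full-word frequency recomputation the deleted SQL file
    # performed: tally word ids from the search index into a temp table,
    # then copy the counts onto the word table.
    import psycopg2

    def recompute_full_word_counts(dsn='dbname=nominatim'):
        conn = psycopg2.connect(dsn)
        with conn:
            with conn.cursor() as cur:
                cur.execute("""CREATE TEMP TABLE word_frequencies AS
                                 SELECT unnest(name_vector) AS id, count(*)
                                 FROM search_name GROUP BY id""")
                cur.execute("CREATE INDEX ON word_frequencies(id)")
                # full-word tokens carry a leading space in the word table
                cur.execute("""UPDATE word SET search_name_count = count
                               FROM word_frequencies
                               WHERE word_token LIKE ' %' AND word_id = id""")
                cur.execute("DROP TABLE word_frequencies")
        conn.close()   # 'with conn' committed the transaction on clean exit
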
@@ -71,8 +71,8 @@ class UpdateRefresh:
                           "Postcode updates on a frozen database is not possible.")

         if args.word_counts:
-            LOG.warning('Recompute frequency of full-word search terms')
-            refresh.recompute_word_counts(args.config.get_libpq_dsn(), args.sqllib_dir)
+            LOG.warning('Recompute word statistics')
+            self._get_tokenizer(args.config).update_statistics()

         if args.address_levels:
             cfg = Path(args.config.ADDRESS_LEVEL_CONFIG)
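
With this change the CLI no longer knows about the SQL details; it resolves whichever tokenizer the database was imported with and delegates. A condensed sketch of that dispatch, using the factory function that the tests below monkeypatch (treating it as the helper behind _get_tokenizer is an assumption):

    # The refresh command now only looks up the configured tokenizer and
    # calls its statistics hook; each tokenizer owns its own SQL.
    from nominatim.tokenizer import factory as tokenizer_factory

    def refresh_word_counts(config):
        tokenizer = tokenizer_factory.get_tokenizer_for_db(config)
        tokenizer.update_statistics()
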
@@ -205,6 +205,16 @@ class AbstractTokenizer(ABC):
         pass


+    @abstractmethod
+    def update_statistics(self) -> None:
+        """ Recompute any tokenizer statistics necessary for efficient lookup.
+            This function is meant to be called from time to time by the user
+            to improve performance. However, the tokenizer must not depend on
+            it to be called in order to work.
+        """
+        pass
+
+
     @abstractmethod
     def name_analyzer(self) -> AbstractAnalyzer:
         """ Create a new analyzer for tokenizing names and queries
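
Maintenance code can now program against the abstract interface without knowing the concrete tokenizer. A small sketch of such a caller (the module path follows this commit's tokenizer base module; the wrapper function itself is hypothetical):

    # Works with any conforming tokenizer, legacy or ICU.
    from nominatim.tokenizer.base import AbstractTokenizer

    def refresh_statistics(tokenizer: AbstractTokenizer) -> None:
        # Safe to call opportunistically: per the docstring above,
        # implementations must not depend on this ever being called.
        tokenizer.update_statistics()
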
@@ -2,7 +2,6 @@
 Tokenizer implementing normalisation as used before Nominatim 4 but using
 libICU instead of the PostgreSQL module.
 """
-from collections import Counter
 import itertools
 import json
 import logging
@@ -93,6 +92,25 @@ class LegacyICUTokenizer(AbstractTokenizer):
         return None


+    def update_statistics(self):
+        """ Recompute frequencies for all name words.
+        """
+        with connect(self.dsn) as conn:
+            with conn.cursor() as cur:
+                cur.drop_table("word_frequencies")
+                LOG.info("Computing word frequencies")
+                cur.execute("""CREATE TEMP TABLE word_frequencies AS
+                                 SELECT unnest(name_vector) as id, count(*)
+                                 FROM search_name GROUP BY id""")
+                cur.execute("CREATE INDEX ON word_frequencies(id)")
+                LOG.info("Update word table with recomputed frequencies")
+                cur.execute("""UPDATE word
+                               SET info = info || jsonb_build_object('count', count)
+                               FROM word_frequencies WHERE word_id = id""")
+                cur.drop_table("word_frequencies")
+            conn.commit()
+
+
     def name_analyzer(self):
         """ Create a new analyzer for tokenizing names and queries
             using this tokinzer. Analyzers are context managers and should
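
Unlike the legacy schema, the ICU word table keeps per-word metadata in a JSONB info column, so the count is merged in with the || operator instead of being written to a dedicated search_name_count column. A small, self-contained illustration of those merge semantics (values hypothetical):

    # Python analogue of the JSONB merge above: '||' adds or overwrites the
    # 'count' key while preserving the other keys of the info object.
    info = {'word': 'main street'}     # existing info column value
    patch = {'count': 42}              # jsonb_build_object('count', count)
    merged = {**info, **patch}         # semantics of info || patch
    assert merged == {'word': 'main street', 'count': 42}
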
@@ -142,43 +160,6 @@ class LegacyICUTokenizer(AbstractTokenizer):
             sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
             conn.commit()

-            LOG.warning("Precomputing word tokens")
-
-            # get partial words and their frequencies
-            words = self._count_partial_terms(conn)
-
-            # copy them back into the word table
-            with CopyBuffer() as copystr:
-                for term, cnt in words.items():
-                    copystr.add('w', term, json.dumps({'count': cnt}))
-
-                with conn.cursor() as cur:
-                    copystr.copy_out(cur, 'word',
-                                     columns=['type', 'word_token', 'info'])
-                    cur.execute("""UPDATE word SET word_id = nextval('seq_word')
-                                   WHERE word_id is null and type = 'w'""")
-
-            conn.commit()
-
-
-    def _count_partial_terms(self, conn):
-        """ Count the partial terms from the names in the place table.
-        """
-        words = Counter()
-        analysis = self.loader.make_token_analysis()
-
-        with conn.cursor(name="words") as cur:
-            cur.execute(""" SELECT v, count(*) FROM
-                              (SELECT svals(name) as v FROM place)x
-                            WHERE length(v) < 75 GROUP BY v""")
-
-            for name, cnt in cur:
-                word = analysis.search.transliterate(name)
-                if word and ' ' in word:
-                    for term in set(word.split()):
-                        words[term] += cnt
-
-        return words
-
-
 class LegacyICUNameAnalyzer(AbstractAnalyzer):
     """ The legacy analyzer uses the ICU library for splitting names.
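
The removed helper pre-counted partial terms (the individual words of multi-word names) while setting up the word table; after this commit those counts are derived from search_name by update_statistics() instead. For reference, the core of the retired technique in standalone form, with str.lower() standing in for the ICU transliteration step (an assumption for illustration only):

    # Retired partial-term counting: split each multi-word name and tally
    # every distinct term once per name occurrence.
    from collections import Counter

    def count_partial_terms(names_with_counts):
        words = Counter()
        for name, cnt in names_with_counts:
            word = name.lower()       # stand-in for analysis.search.transliterate()
            if word and ' ' in word:  # single-word names yield no partials
                for term in set(word.split()):
                    words[term] += cnt
        return words

    # Reproduces the expectation that the test change below removes:
    assert count_partial_terms([('Test Area', 1), ('No Area', 1)]) == \
        Counter({'area': 2, 'test': 1, 'no': 1})
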
@@ -186,6 +186,24 @@ class LegacyTokenizer(AbstractTokenizer):
             self._save_config(conn, config)


+    def update_statistics(self):
+        """ Recompute the frequency of full words.
+        """
+        with connect(self.dsn) as conn:
+            with conn.cursor() as cur:
+                cur.drop_table("word_frequencies")
+                LOG.info("Computing word frequencies")
+                cur.execute("""CREATE TEMP TABLE word_frequencies AS
+                                 SELECT unnest(name_vector) as id, count(*)
+                                 FROM search_name GROUP BY id""")
+                cur.execute("CREATE INDEX ON word_frequencies(id)")
+                LOG.info("Update word table with recomputed frequencies")
+                cur.execute("""UPDATE word SET search_name_count = count
+                               FROM word_frequencies
+                               WHERE word_token like ' %' and word_id = id""")
+                cur.drop_table("word_frequencies")
+            conn.commit()
+
+
     def name_analyzer(self):
         """ Create a new analyzer for tokenizing names and queries
             using this tokinzer. Analyzers are context managers and should
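
The legacy implementation differs from the ICU one only in the final UPDATE: counts go into the dedicated search_name_count column, and the word_token like ' %' filter restricts the update to full words, which the legacy word table stores with a leading space (partial terms have none). A tiny illustration of that filter with hypothetical tokens:

    # Equivalent of the SQL filter word_token LIKE ' %': keep only
    # full-word tokens, which start with a space in the legacy schema.
    tokens = [' main street', 'main', ' street', 'street']
    full_words = [t for t in tokens if t.startswith(' ')]
    assert full_words == [' main street', ' street']
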
@@ -14,12 +14,6 @@ from nominatim.version import NOMINATIM_VERSION
 LOG = logging.getLogger()


-def recompute_word_counts(dsn, sql_dir):
-    """ Compute the frequency of full-word search terms.
-    """
-    execute_file(dsn, sql_dir / 'words_from_search_name.sql')
-
-
 def _add_address_level_rows_from_entry(rows, entry):
     """ Converts a single entry from the JSON format for address rank
         descriptions into a flat format suitable for inserting into a
@@ -144,6 +144,7 @@ class TestCliWithDb:
             def __init__(self, *args, **kwargs):
                 self.update_sql_functions_called = False
                 self.finalize_import_called = False
+                self.update_statistics_called = False

             def update_sql_functions(self, *args):
                 self.update_sql_functions_called = True
@@ -151,6 +152,10 @@ class TestCliWithDb:
             def finalize_import(self, *args):
                 self.finalize_import_called = True

+            def update_statistics(self):
+                self.update_statistics_called = True
+
+
         tok = DummyTokenizer()
         monkeypatch.setattr(nominatim.tokenizer.factory, 'get_tokenizer_for_db',
                             lambda *args: tok)
@@ -316,7 +321,6 @@ class TestCliWithDb:
         assert func.called == 1

     @pytest.mark.parametrize("command,func", [
-        ('word-counts', 'recompute_word_counts'),
         ('address-levels', 'load_address_levels_from_file'),
         ('wiki-data', 'import_wikipedia_articles'),
         ('importance', 'recompute_importance'),
@@ -329,6 +333,11 @@ class TestCliWithDb:
         assert func_mock.called == 1


+    def test_refresh_word_count(self):
+        assert self.call_nominatim('refresh', '--word-count') == 0
+        assert self.tokenizer_mock.update_statistics_called
+
+
     def test_refresh_postcodes(self, mock_func_factory, place_table):
         func_mock = mock_func_factory(nominatim.tools.postcodes, 'update_postcodes')
         idx_mock = mock_func_factory(nominatim.indexer.indexer.Indexer, 'index_postcodes')
@@ -160,7 +160,7 @@ def test_init_new(tokenizer_factory, test_config, monkeypatch, db_prop):
     assert db_prop(icu_tokenizer.DBCFG_TERM_NORMALIZATION) == ':: lower();'


-def test_init_word_table(tokenizer_factory, test_config, place_row, word_table):
+def test_init_word_table(tokenizer_factory, test_config, place_row, temp_db_cursor):
     place_row(names={'name' : 'Test Area', 'ref' : '52'})
     place_row(names={'name' : 'No Area'})
     place_row(names={'name' : 'Holzstrasse'})
@@ -168,8 +168,7 @@ def test_init_word_table(tokenizer_factory, test_config, place_row, word_table):
     tok = tokenizer_factory()
     tok.init_new_db(test_config)

-    assert word_table.get_partial_words() == {('test', 1),
-                                              ('no', 1), ('area', 2)}
+    assert temp_db_cursor.table_exists('word')


 def test_init_from_project(monkeypatch, test_config, tokenizer_factory):