forked from hans/Nominatim
Merge pull request #2472 from lonvia/word-count-computation
Fix word count computation for ICU tokenizer
This commit is contained in:
@@ -1,11 +0,0 @@
|
|||||||
DROP TABLE IF EXISTS word_frequencies;
|
|
||||||
CREATE TABLE word_frequencies AS
|
|
||||||
SELECT unnest(name_vector) as id, count(*) FROM search_name GROUP BY id;
|
|
||||||
|
|
||||||
CREATE INDEX idx_word_frequencies ON word_frequencies(id);
|
|
||||||
|
|
||||||
UPDATE word SET search_name_count = count
|
|
||||||
FROM word_frequencies
|
|
||||||
WHERE word_token like ' %' and word_id = id;
|
|
||||||
|
|
||||||
DROP TABLE word_frequencies;
|
|
||||||
@@ -71,8 +71,8 @@ class UpdateRefresh:
|
|||||||
"Postcode updates on a frozen database is not possible.")
|
"Postcode updates on a frozen database is not possible.")
|
||||||
|
|
||||||
if args.word_counts:
|
if args.word_counts:
|
||||||
LOG.warning('Recompute frequency of full-word search terms')
|
LOG.warning('Recompute word statistics')
|
||||||
refresh.recompute_word_counts(args.config.get_libpq_dsn(), args.sqllib_dir)
|
self._get_tokenizer(args.config).update_statistics()
|
||||||
|
|
||||||
if args.address_levels:
|
if args.address_levels:
|
||||||
cfg = Path(args.config.ADDRESS_LEVEL_CONFIG)
|
cfg = Path(args.config.ADDRESS_LEVEL_CONFIG)
|
||||||
|
|||||||
@@ -205,6 +205,16 @@ class AbstractTokenizer(ABC):
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
def update_statistics(self) -> None:
|
||||||
|
""" Recompute any tokenizer statistics necessary for efficient lookup.
|
||||||
|
This function is meant to be called from time to time by the user
|
||||||
|
to improve performance. However, the tokenizer must not depend on
|
||||||
|
it to be called in order to work.
|
||||||
|
"""
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def name_analyzer(self) -> AbstractAnalyzer:
|
def name_analyzer(self) -> AbstractAnalyzer:
|
||||||
""" Create a new analyzer for tokenizing names and queries
|
""" Create a new analyzer for tokenizing names and queries
|
||||||
|
|||||||
@@ -2,7 +2,6 @@
|
|||||||
Tokenizer implementing normalisation as used before Nominatim 4 but using
|
Tokenizer implementing normalisation as used before Nominatim 4 but using
|
||||||
libICU instead of the PostgreSQL module.
|
libICU instead of the PostgreSQL module.
|
||||||
"""
|
"""
|
||||||
from collections import Counter
|
|
||||||
import itertools
|
import itertools
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
@@ -93,6 +92,25 @@ class LegacyICUTokenizer(AbstractTokenizer):
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def update_statistics(self):
|
||||||
|
""" Recompute frequencies for all name words.
|
||||||
|
"""
|
||||||
|
with connect(self.dsn) as conn:
|
||||||
|
with conn.cursor() as cur:
|
||||||
|
cur.drop_table("word_frequencies")
|
||||||
|
LOG.info("Computing word frequencies")
|
||||||
|
cur.execute("""CREATE TEMP TABLE word_frequencies AS
|
||||||
|
SELECT unnest(name_vector) as id, count(*)
|
||||||
|
FROM search_name GROUP BY id""")
|
||||||
|
cur.execute("CREATE INDEX ON word_frequencies(id)")
|
||||||
|
LOG.info("Update word table with recomputed frequencies")
|
||||||
|
cur.execute("""UPDATE word
|
||||||
|
SET info = info || jsonb_build_object('count', count)
|
||||||
|
FROM word_frequencies WHERE word_id = id""")
|
||||||
|
cur.drop_table("word_frequencies")
|
||||||
|
conn.commit()
|
||||||
|
|
||||||
|
|
||||||
def name_analyzer(self):
|
def name_analyzer(self):
|
||||||
""" Create a new analyzer for tokenizing names and queries
|
""" Create a new analyzer for tokenizing names and queries
|
||||||
using this tokinzer. Analyzers are context managers and should
|
using this tokinzer. Analyzers are context managers and should
|
||||||
@@ -142,43 +160,6 @@ class LegacyICUTokenizer(AbstractTokenizer):
|
|||||||
sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
|
sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
|
||||||
conn.commit()
|
conn.commit()
|
||||||
|
|
||||||
LOG.warning("Precomputing word tokens")
|
|
||||||
|
|
||||||
# get partial words and their frequencies
|
|
||||||
words = self._count_partial_terms(conn)
|
|
||||||
|
|
||||||
# copy them back into the word table
|
|
||||||
with CopyBuffer() as copystr:
|
|
||||||
for term, cnt in words.items():
|
|
||||||
copystr.add('w', term, json.dumps({'count': cnt}))
|
|
||||||
|
|
||||||
with conn.cursor() as cur:
|
|
||||||
copystr.copy_out(cur, 'word',
|
|
||||||
columns=['type', 'word_token', 'info'])
|
|
||||||
cur.execute("""UPDATE word SET word_id = nextval('seq_word')
|
|
||||||
WHERE word_id is null and type = 'w'""")
|
|
||||||
|
|
||||||
conn.commit()
|
|
||||||
|
|
||||||
def _count_partial_terms(self, conn):
|
|
||||||
""" Count the partial terms from the names in the place table.
|
|
||||||
"""
|
|
||||||
words = Counter()
|
|
||||||
analysis = self.loader.make_token_analysis()
|
|
||||||
|
|
||||||
with conn.cursor(name="words") as cur:
|
|
||||||
cur.execute(""" SELECT v, count(*) FROM
|
|
||||||
(SELECT svals(name) as v FROM place)x
|
|
||||||
WHERE length(v) < 75 GROUP BY v""")
|
|
||||||
|
|
||||||
for name, cnt in cur:
|
|
||||||
word = analysis.search.transliterate(name)
|
|
||||||
if word and ' ' in word:
|
|
||||||
for term in set(word.split()):
|
|
||||||
words[term] += cnt
|
|
||||||
|
|
||||||
return words
|
|
||||||
|
|
||||||
|
|
||||||
class LegacyICUNameAnalyzer(AbstractAnalyzer):
|
class LegacyICUNameAnalyzer(AbstractAnalyzer):
|
||||||
""" The legacy analyzer uses the ICU library for splitting names.
|
""" The legacy analyzer uses the ICU library for splitting names.
|
||||||
|
|||||||
@@ -186,6 +186,24 @@ class LegacyTokenizer(AbstractTokenizer):
|
|||||||
self._save_config(conn, config)
|
self._save_config(conn, config)
|
||||||
|
|
||||||
|
|
||||||
|
def update_statistics(self):
|
||||||
|
""" Recompute the frequency of full words.
|
||||||
|
"""
|
||||||
|
with connect(self.dsn) as conn:
|
||||||
|
with conn.cursor() as cur:
|
||||||
|
cur.drop_table("word_frequencies")
|
||||||
|
LOG.info("Computing word frequencies")
|
||||||
|
cur.execute("""CREATE TEMP TABLE word_frequencies AS
|
||||||
|
SELECT unnest(name_vector) as id, count(*)
|
||||||
|
FROM search_name GROUP BY id""")
|
||||||
|
cur.execute("CREATE INDEX ON word_frequencies(id)")
|
||||||
|
LOG.info("Update word table with recomputed frequencies")
|
||||||
|
cur.execute("""UPDATE word SET search_name_count = count
|
||||||
|
FROM word_frequencies
|
||||||
|
WHERE word_token like ' %' and word_id = id""")
|
||||||
|
cur.drop_table("word_frequencies")
|
||||||
|
conn.commit()
|
||||||
|
|
||||||
def name_analyzer(self):
|
def name_analyzer(self):
|
||||||
""" Create a new analyzer for tokenizing names and queries
|
""" Create a new analyzer for tokenizing names and queries
|
||||||
using this tokinzer. Analyzers are context managers and should
|
using this tokinzer. Analyzers are context managers and should
|
||||||
|
|||||||
@@ -14,12 +14,6 @@ from nominatim.version import NOMINATIM_VERSION
|
|||||||
LOG = logging.getLogger()
|
LOG = logging.getLogger()
|
||||||
|
|
||||||
|
|
||||||
def recompute_word_counts(dsn, sql_dir):
|
|
||||||
""" Compute the frequency of full-word search terms.
|
|
||||||
"""
|
|
||||||
execute_file(dsn, sql_dir / 'words_from_search_name.sql')
|
|
||||||
|
|
||||||
|
|
||||||
def _add_address_level_rows_from_entry(rows, entry):
|
def _add_address_level_rows_from_entry(rows, entry):
|
||||||
""" Converts a single entry from the JSON format for address rank
|
""" Converts a single entry from the JSON format for address rank
|
||||||
descriptions into a flat format suitable for inserting into a
|
descriptions into a flat format suitable for inserting into a
|
||||||
|
|||||||
@@ -144,6 +144,7 @@ class TestCliWithDb:
|
|||||||
def __init__(self, *args, **kwargs):
|
def __init__(self, *args, **kwargs):
|
||||||
self.update_sql_functions_called = False
|
self.update_sql_functions_called = False
|
||||||
self.finalize_import_called = False
|
self.finalize_import_called = False
|
||||||
|
self.update_statistics_called = False
|
||||||
|
|
||||||
def update_sql_functions(self, *args):
|
def update_sql_functions(self, *args):
|
||||||
self.update_sql_functions_called = True
|
self.update_sql_functions_called = True
|
||||||
@@ -151,6 +152,10 @@ class TestCliWithDb:
|
|||||||
def finalize_import(self, *args):
|
def finalize_import(self, *args):
|
||||||
self.finalize_import_called = True
|
self.finalize_import_called = True
|
||||||
|
|
||||||
|
def update_statistics(self):
|
||||||
|
self.update_statistics_called = True
|
||||||
|
|
||||||
|
|
||||||
tok = DummyTokenizer()
|
tok = DummyTokenizer()
|
||||||
monkeypatch.setattr(nominatim.tokenizer.factory, 'get_tokenizer_for_db',
|
monkeypatch.setattr(nominatim.tokenizer.factory, 'get_tokenizer_for_db',
|
||||||
lambda *args: tok)
|
lambda *args: tok)
|
||||||
@@ -316,7 +321,6 @@ class TestCliWithDb:
|
|||||||
assert func.called == 1
|
assert func.called == 1
|
||||||
|
|
||||||
@pytest.mark.parametrize("command,func", [
|
@pytest.mark.parametrize("command,func", [
|
||||||
('word-counts', 'recompute_word_counts'),
|
|
||||||
('address-levels', 'load_address_levels_from_file'),
|
('address-levels', 'load_address_levels_from_file'),
|
||||||
('wiki-data', 'import_wikipedia_articles'),
|
('wiki-data', 'import_wikipedia_articles'),
|
||||||
('importance', 'recompute_importance'),
|
('importance', 'recompute_importance'),
|
||||||
@@ -329,6 +333,11 @@ class TestCliWithDb:
|
|||||||
assert func_mock.called == 1
|
assert func_mock.called == 1
|
||||||
|
|
||||||
|
|
||||||
|
def test_refresh_word_count(self):
|
||||||
|
assert self.call_nominatim('refresh', '--word-count') == 0
|
||||||
|
assert self.tokenizer_mock.update_statistics_called
|
||||||
|
|
||||||
|
|
||||||
def test_refresh_postcodes(self, mock_func_factory, place_table):
|
def test_refresh_postcodes(self, mock_func_factory, place_table):
|
||||||
func_mock = mock_func_factory(nominatim.tools.postcodes, 'update_postcodes')
|
func_mock = mock_func_factory(nominatim.tools.postcodes, 'update_postcodes')
|
||||||
idx_mock = mock_func_factory(nominatim.indexer.indexer.Indexer, 'index_postcodes')
|
idx_mock = mock_func_factory(nominatim.indexer.indexer.Indexer, 'index_postcodes')
|
||||||
|
|||||||
@@ -160,7 +160,7 @@ def test_init_new(tokenizer_factory, test_config, monkeypatch, db_prop):
|
|||||||
assert db_prop(icu_tokenizer.DBCFG_TERM_NORMALIZATION) == ':: lower();'
|
assert db_prop(icu_tokenizer.DBCFG_TERM_NORMALIZATION) == ':: lower();'
|
||||||
|
|
||||||
|
|
||||||
def test_init_word_table(tokenizer_factory, test_config, place_row, word_table):
|
def test_init_word_table(tokenizer_factory, test_config, place_row, temp_db_cursor):
|
||||||
place_row(names={'name' : 'Test Area', 'ref' : '52'})
|
place_row(names={'name' : 'Test Area', 'ref' : '52'})
|
||||||
place_row(names={'name' : 'No Area'})
|
place_row(names={'name' : 'No Area'})
|
||||||
place_row(names={'name' : 'Holzstrasse'})
|
place_row(names={'name' : 'Holzstrasse'})
|
||||||
@@ -168,8 +168,7 @@ def test_init_word_table(tokenizer_factory, test_config, place_row, word_table):
|
|||||||
tok = tokenizer_factory()
|
tok = tokenizer_factory()
|
||||||
tok.init_new_db(test_config)
|
tok.init_new_db(test_config)
|
||||||
|
|
||||||
assert word_table.get_partial_words() == {('test', 1),
|
assert temp_db_cursor.table_exists('word')
|
||||||
('no', 1), ('area', 2)}
|
|
||||||
|
|
||||||
|
|
||||||
def test_init_from_project(monkeypatch, test_config, tokenizer_factory):
|
def test_init_from_project(monkeypatch, test_config, tokenizer_factory):
|
||||||
|
|||||||
Reference in New Issue
Block a user