make word recount a tokenizer-specific function

This commit is contained in:
Sarah Hoffmann
2021-10-19 11:21:16 +02:00
parent c86cfefc48
commit e8e2502e2f
6 changed files with 49 additions and 19 deletions

View File

@@ -1,11 +0,0 @@
-- Recompute the frequency of full-word search terms in the word table.
-- Remove any leftover frequency table from a previous (possibly aborted) run.
DROP TABLE IF EXISTS word_frequencies;
-- Count how often each word id occurs in the name vectors of search_name.
CREATE TABLE word_frequencies AS
SELECT unnest(name_vector) as id, count(*) FROM search_name GROUP BY id;
-- Index the counts so the UPDATE join below can look them up efficiently.
CREATE INDEX idx_word_frequencies ON word_frequencies(id);
-- Copy the counts back into the word table. Only full words are updated:
-- full-word tokens are marked by a leading space in word_token.
UPDATE word SET search_name_count = count
FROM word_frequencies
WHERE word_token like ' %' and word_id = id;
-- The frequency table is only a scratch area; drop it again.
DROP TABLE word_frequencies;

View File

@@ -71,8 +71,8 @@ class UpdateRefresh:
"Postcode updates on a frozen database is not possible.") "Postcode updates on a frozen database is not possible.")
if args.word_counts: if args.word_counts:
LOG.warning('Recompute frequency of full-word search terms') LOG.warning('Recompute word statistics')
refresh.recompute_word_counts(args.config.get_libpq_dsn(), args.sqllib_dir) self._get_tokenizer(args.config).update_statistics()
if args.address_levels: if args.address_levels:
cfg = Path(args.config.ADDRESS_LEVEL_CONFIG) cfg = Path(args.config.ADDRESS_LEVEL_CONFIG)

View File

@@ -205,6 +205,16 @@ class AbstractTokenizer(ABC):
pass pass
@abstractmethod
def update_statistics(self) -> None:
    """ Recompute any tokenizer statistics necessary for efficient lookup.

        This function is meant to be called from time to time by the user
        to improve performance. However, the tokenizer must not depend on
        it to be called in order to work.
    """
    pass
@abstractmethod @abstractmethod
def name_analyzer(self) -> AbstractAnalyzer: def name_analyzer(self) -> AbstractAnalyzer:
""" Create a new analyzer for tokenizing names and queries """ Create a new analyzer for tokenizing names and queries

View File

@@ -93,6 +93,25 @@ class LegacyICUTokenizer(AbstractTokenizer):
return None return None
def update_statistics(self):
    """ Recompute frequencies for all name words.

        Counts how often every token id appears in the name vectors of
        the search_name table and stores the result in the 'count' field
        of the word table's info column.
    """
    with connect(self.dsn) as db:
        with db.cursor() as cursor:
            # Clear out any scratch table left behind by an earlier run.
            cursor.drop_table("word_frequencies")
            LOG.info("Computing word frequencies")
            frequency_sql = """CREATE TEMP TABLE word_frequencies AS
                             SELECT unnest(name_vector) as id, count(*)
                             FROM search_name GROUP BY id"""
            cursor.execute(frequency_sql)
            # The index makes the join in the UPDATE below efficient.
            cursor.execute("CREATE INDEX ON word_frequencies(id)")
            LOG.info("Update word table with recomputed frequencies")
            update_sql = """UPDATE word
                           SET info = info || jsonb_build_object('count', count)
                           FROM word_frequencies WHERE word_id = id"""
            cursor.execute(update_sql)
            # The frequency table is only needed during the update.
            cursor.drop_table("word_frequencies")
        db.commit()
def name_analyzer(self): def name_analyzer(self):
""" Create a new analyzer for tokenizing names and queries """ Create a new analyzer for tokenizing names and queries
using this tokenizer. Analyzers are context managers and should using this tokenizer. Analyzers are context managers and should

View File

@@ -186,6 +186,24 @@ class LegacyTokenizer(AbstractTokenizer):
self._save_config(conn, config) self._save_config(conn, config)
def update_statistics(self):
    """ Recompute the frequency of full words.

        Counts how often every token id appears in the name vectors of
        the search_name table and writes the result into the
        search_name_count column of the word table for full-word tokens
        (tokens with a leading space).
    """
    with connect(self.dsn) as db:
        with db.cursor() as cursor:
            # Clear out any scratch table left behind by an earlier run.
            cursor.drop_table("word_frequencies")
            LOG.info("Computing word frequencies")
            frequency_sql = """CREATE TEMP TABLE word_frequencies AS
                             SELECT unnest(name_vector) as id, count(*)
                             FROM search_name GROUP BY id"""
            cursor.execute(frequency_sql)
            # The index makes the join in the UPDATE below efficient.
            cursor.execute("CREATE INDEX ON word_frequencies(id)")
            LOG.info("Update word table with recomputed frequencies")
            update_sql = """UPDATE word SET search_name_count = count
                           FROM word_frequencies
                           WHERE word_token like ' %' and word_id = id"""
            cursor.execute(update_sql)
            # The frequency table is only needed during the update.
            cursor.drop_table("word_frequencies")
        db.commit()
def name_analyzer(self): def name_analyzer(self):
""" Create a new analyzer for tokenizing names and queries """ Create a new analyzer for tokenizing names and queries
using this tokenizer. Analyzers are context managers and should using this tokenizer. Analyzers are context managers and should

View File

@@ -14,12 +14,6 @@ from nominatim.version import NOMINATIM_VERSION
LOG = logging.getLogger() LOG = logging.getLogger()
def recompute_word_counts(dsn, sql_dir):
    """ Compute the frequency of full-word search terms.

        Runs the 'words_from_search_name.sql' script from `sql_dir`
        against the database described by the libpq DSN `dsn`.
    """
    execute_file(dsn, sql_dir / 'words_from_search_name.sql')
def _add_address_level_rows_from_entry(rows, entry): def _add_address_level_rows_from_entry(rows, entry):
""" Converts a single entry from the JSON format for address rank """ Converts a single entry from the JSON format for address rank
descriptions into a flat format suitable for inserting into a descriptions into a flat format suitable for inserting into a