mirror of
https://github.com/osm-search/Nominatim.git
synced 2026-02-26 11:08:13 +00:00
make word recount a tokenizer-specific function
This commit is contained in:
@@ -71,8 +71,8 @@ class UpdateRefresh:
|
||||
"Postcode updates on a frozen database is not possible.")
|
||||
|
||||
if args.word_counts:
|
||||
LOG.warning('Recompute frequency of full-word search terms')
|
||||
refresh.recompute_word_counts(args.config.get_libpq_dsn(), args.sqllib_dir)
|
||||
LOG.warning('Recompute word statistics')
|
||||
self._get_tokenizer(args.config).update_statistics()
|
||||
|
||||
if args.address_levels:
|
||||
cfg = Path(args.config.ADDRESS_LEVEL_CONFIG)
|
||||
|
||||
@@ -205,6 +205,16 @@ class AbstractTokenizer(ABC):
|
||||
pass
|
||||
|
||||
|
||||
@abstractmethod
def update_statistics(self) -> None:
    """ Recompute any tokenizer statistics that speed up lookups.

        Intended to be invoked occasionally by the user as a
        maintenance task. Implementations must not rely on this
        ever being called: the tokenizer has to remain fully
        functional without it.
    """
|
||||
|
||||
|
||||
@abstractmethod
|
||||
def name_analyzer(self) -> AbstractAnalyzer:
|
||||
""" Create a new analyzer for tokenizing names and queries
|
||||
|
||||
@@ -93,6 +93,25 @@ class LegacyICUTokenizer(AbstractTokenizer):
|
||||
return None
|
||||
|
||||
|
||||
def update_statistics(self):
    """ Recalculate usage counts for all full-word search terms.

        Rebuilds the per-word occurrence counts from the current
        content of the search_name table and stores them in the
        ``info`` JSONB column of the word table.
    """
    with connect(self.dsn) as db:
        with db.cursor() as cursor:
            # Remove any leftover from an aborted previous run.
            cursor.drop_table("word_frequencies")
            LOG.info("Computing word frequencies")
            cursor.execute("""CREATE TEMP TABLE word_frequencies AS
                             SELECT unnest(name_vector) as id, count(*)
                             FROM search_name GROUP BY id""")
            # Index speeds up the join in the UPDATE below.
            cursor.execute("CREATE INDEX ON word_frequencies(id)")
            LOG.info("Update word table with recomputed frequencies")
            cursor.execute("""UPDATE word
                              SET info = info || jsonb_build_object('count', count)
                              FROM word_frequencies WHERE word_id = id""")
            # Temp table is no longer needed once counts are merged.
            cursor.drop_table("word_frequencies")
        db.commit()
|
||||
|
||||
|
||||
def name_analyzer(self):
|
||||
""" Create a new analyzer for tokenizing names and queries
|
||||
using this tokenizer. Analyzers are context managers and should
|
||||
|
||||
@@ -186,6 +186,24 @@ class LegacyTokenizer(AbstractTokenizer):
|
||||
self._save_config(conn, config)
|
||||
|
||||
|
||||
def update_statistics(self):
    """ Recalculate how often each full word appears in search names.

        Rebuilds the counts from the current search_name table and
        writes them into the ``search_name_count`` column of the
        word table (full-word tokens only, i.e. tokens starting
        with a space).
    """
    with connect(self.dsn) as db:
        with db.cursor() as cursor:
            # Remove any leftover from an aborted previous run.
            cursor.drop_table("word_frequencies")
            LOG.info("Computing word frequencies")
            cursor.execute("""CREATE TEMP TABLE word_frequencies AS
                             SELECT unnest(name_vector) as id, count(*)
                             FROM search_name GROUP BY id""")
            # Index speeds up the join in the UPDATE below.
            cursor.execute("CREATE INDEX ON word_frequencies(id)")
            LOG.info("Update word table with recomputed frequencies")
            cursor.execute("""UPDATE word SET search_name_count = count
                              FROM word_frequencies
                              WHERE word_token like ' %' and word_id = id""")
            # Temp table is no longer needed once counts are merged.
            cursor.drop_table("word_frequencies")
        db.commit()
|
||||
|
||||
def name_analyzer(self):
|
||||
""" Create a new analyzer for tokenizing names and queries
|
||||
using this tokenizer. Analyzers are context managers and should
|
||||
|
||||
@@ -14,12 +14,6 @@ from nominatim.version import NOMINATIM_VERSION
|
||||
LOG = logging.getLogger()
|
||||
|
||||
|
||||
def recompute_word_counts(dsn, sql_dir):
    """ Recalculate the frequency of full-word search terms by running
        the dedicated SQL script against the database.
    """
    script = sql_dir / 'words_from_search_name.sql'
    execute_file(dsn, script)
|
||||
|
||||
|
||||
def _add_address_level_rows_from_entry(rows, entry):
|
||||
""" Converts a single entry from the JSON format for address rank
|
||||
descriptions into a flat format suitable for inserting into a
|
||||
|
||||
Reference in New Issue
Block a user