mirror of
https://github.com/osm-search/Nominatim.git
synced 2026-02-26 11:08:13 +00:00
make word recount a tokenizer-specific function
This commit is contained in:
@@ -1,11 +0,0 @@
|
|||||||
-- Recompute the frequency of full-word search terms.
-- Counts are derived from how often each term id occurs in search_name
-- and written back to the word table's search_name_count column.

-- Start from a clean slate in case a previous run was interrupted.
DROP TABLE IF EXISTS word_frequencies;

-- Gather per-term occurrence counts from the search index.
CREATE TABLE word_frequencies AS
    SELECT unnest(name_vector) as id, count(*) FROM search_name GROUP BY id;

-- Index the counts so the UPDATE join below can look them up quickly.
CREATE INDEX idx_word_frequencies ON word_frequencies(id);

-- Full words are marked by a leading blank in word_token;
-- only those rows receive the recomputed counts.
UPDATE word SET search_name_count = count
    FROM word_frequencies
    WHERE word_token like ' %' and word_id = id;

-- The scratch table is no longer needed.
DROP TABLE word_frequencies;
|
|
||||||
@@ -71,8 +71,8 @@ class UpdateRefresh:
|
|||||||
"Postcode updates on a frozen database is not possible.")
|
"Postcode updates on a frozen database is not possible.")
|
||||||
|
|
||||||
if args.word_counts:
|
if args.word_counts:
|
||||||
LOG.warning('Recompute frequency of full-word search terms')
|
LOG.warning('Recompute word statistics')
|
||||||
refresh.recompute_word_counts(args.config.get_libpq_dsn(), args.sqllib_dir)
|
self._get_tokenizer(args.config).update_statistics()
|
||||||
|
|
||||||
if args.address_levels:
|
if args.address_levels:
|
||||||
cfg = Path(args.config.ADDRESS_LEVEL_CONFIG)
|
cfg = Path(args.config.ADDRESS_LEVEL_CONFIG)
|
||||||
|
|||||||
@@ -205,6 +205,16 @@ class AbstractTokenizer(ABC):
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
@abstractmethod
def update_statistics(self) -> None:
    """ Recompute any tokenizer statistics necessary for efficient lookup.

        This function is meant to be called from time to time by the user
        to improve performance. However, the tokenizer must not depend on
        it to be called in order to work.
    """
    # Abstract hook: concrete tokenizers supply the actual recomputation.
    ...
|
||||||
|
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def name_analyzer(self) -> AbstractAnalyzer:
|
def name_analyzer(self) -> AbstractAnalyzer:
|
||||||
""" Create a new analyzer for tokenizing names and queries
|
""" Create a new analyzer for tokenizing names and queries
|
||||||
|
|||||||
@@ -93,6 +93,25 @@ class LegacyICUTokenizer(AbstractTokenizer):
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def update_statistics(self):
    """ Recompute frequencies for all name words.

        Collects per-word occurrence counts from search_name into a
        temporary table, then merges them into the word table's JSON
        info column as a 'count' entry.
    """
    with connect(self.dsn) as db:
        with db.cursor() as cursor:
            # Remove any leftover scratch table from an earlier run.
            cursor.drop_table("word_frequencies")
            LOG.info("Computing word frequencies")
            cursor.execute("""CREATE TEMP TABLE word_frequencies AS
                             SELECT unnest(name_vector) as id, count(*)
                             FROM search_name GROUP BY id""")
            # Index the scratch table so the UPDATE join is fast.
            cursor.execute("CREATE INDEX ON word_frequencies(id)")
            LOG.info("Update word table with recomputed frequencies")
            cursor.execute("""UPDATE word
                           SET info = info || jsonb_build_object('count', count)
                           FROM word_frequencies WHERE word_id = id""")
            cursor.drop_table("word_frequencies")
        # Commit after the cursor is closed; all statements above are
        # part of one transaction.
        db.commit()
|
||||||
|
|
||||||
|
|
||||||
def name_analyzer(self):
|
def name_analyzer(self):
|
||||||
""" Create a new analyzer for tokenizing names and queries
|
""" Create a new analyzer for tokenizing names and queries
|
||||||
using this tokenizer. Analyzers are context managers and should
|
using this tokenizer. Analyzers are context managers and should
|
||||||
|
|||||||
@@ -186,6 +186,24 @@ class LegacyTokenizer(AbstractTokenizer):
|
|||||||
self._save_config(conn, config)
|
self._save_config(conn, config)
|
||||||
|
|
||||||
|
|
||||||
|
def update_statistics(self):
    """ Recompute the frequency of full words.

        Aggregates occurrence counts from search_name in a temporary
        table and writes them into word.search_name_count for all
        full-word tokens (those starting with a blank).
    """
    with connect(self.dsn) as db_conn:
        with db_conn.cursor() as db_cur:
            # Clear out any scratch table left by an interrupted run.
            db_cur.drop_table("word_frequencies")
            LOG.info("Computing word frequencies")
            db_cur.execute("""CREATE TEMP TABLE word_frequencies AS
                             SELECT unnest(name_vector) as id, count(*)
                             FROM search_name GROUP BY id""")
            # Index speeds up the join in the UPDATE below.
            db_cur.execute("CREATE INDEX ON word_frequencies(id)")
            LOG.info("Update word table with recomputed frequencies")
            db_cur.execute("""UPDATE word SET search_name_count = count
                           FROM word_frequencies
                           WHERE word_token like ' %' and word_id = id""")
            db_cur.drop_table("word_frequencies")
        # Persist the recomputed counts in one transaction.
        db_conn.commit()
|
||||||
|
|
||||||
def name_analyzer(self):
|
def name_analyzer(self):
|
||||||
""" Create a new analyzer for tokenizing names and queries
|
""" Create a new analyzer for tokenizing names and queries
|
||||||
using this tokenizer. Analyzers are context managers and should
|
using this tokenizer. Analyzers are context managers and should
|
||||||
|
|||||||
@@ -14,12 +14,6 @@ from nominatim.version import NOMINATIM_VERSION
|
|||||||
LOG = logging.getLogger()
|
LOG = logging.getLogger()
|
||||||
|
|
||||||
|
|
||||||
def recompute_word_counts(dsn, sql_dir):
    """ Compute the frequency of full-word search terms.

        The actual work happens inside the database: this merely runs
        the SQL script that rebuilds the counts from search_name.
    """
    script = sql_dir / 'words_from_search_name.sql'
    execute_file(dsn, script)
|
|
||||||
|
|
||||||
|
|
||||||
def _add_address_level_rows_from_entry(rows, entry):
|
def _add_address_level_rows_from_entry(rows, entry):
|
||||||
""" Converts a single entry from the JSON format for address rank
|
""" Converts a single entry from the JSON format for address rank
|
||||||
descriptions into a flat format suitable for inserting into a
|
descriptions into a flat format suitable for inserting into a
|
||||||
|
|||||||
Reference in New Issue
Block a user