mirror of
https://github.com/osm-search/Nominatim.git
synced 2026-03-07 02:24:08 +00:00
make word recount a tokenizer-specific function
This commit is contained in:
@@ -205,6 +205,16 @@ class AbstractTokenizer(ABC):
|
||||
pass
|
||||
|
||||
|
||||
@abstractmethod
|
||||
def update_statistics(self) -> None:
|
||||
""" Recompute any tokenizer statistics necessary for efficient lookup.
|
||||
This function is meant to be called from time to time by the user
|
||||
to improve performance. However, the tokenizer must not depend on
|
||||
it to be called in order to work.
|
||||
"""
|
||||
pass
|
||||
|
||||
|
||||
@abstractmethod
|
||||
def name_analyzer(self) -> AbstractAnalyzer:
|
||||
""" Create a new analyzer for tokenizing names and queries
|
||||
|
||||
@@ -93,6 +93,25 @@ class LegacyICUTokenizer(AbstractTokenizer):
|
||||
return None
|
||||
|
||||
|
||||
def update_statistics(self):
|
||||
""" Recompute frequencies for all name words.
|
||||
"""
|
||||
with connect(self.dsn) as conn:
|
||||
with conn.cursor() as cur:
|
||||
cur.drop_table("word_frequencies")
|
||||
LOG.info("Computing word frequencies")
|
||||
cur.execute("""CREATE TEMP TABLE word_frequencies AS
|
||||
SELECT unnest(name_vector) as id, count(*)
|
||||
FROM search_name GROUP BY id""")
|
||||
cur.execute("CREATE INDEX ON word_frequencies(id)")
|
||||
LOG.info("Update word table with recomputed frequencies")
|
||||
cur.execute("""UPDATE word
|
||||
SET info = info || jsonb_build_object('count', count)
|
||||
FROM word_frequencies WHERE word_id = id""")
|
||||
cur.drop_table("word_frequencies")
|
||||
conn.commit()
|
||||
|
||||
|
||||
def name_analyzer(self):
|
||||
""" Create a new analyzer for tokenizing names and queries
|
||||
using this tokinzer. Analyzers are context managers and should
|
||||
|
||||
@@ -186,6 +186,24 @@ class LegacyTokenizer(AbstractTokenizer):
|
||||
self._save_config(conn, config)
|
||||
|
||||
|
||||
def update_statistics(self):
|
||||
""" Recompute the frequency of full words.
|
||||
"""
|
||||
with connect(self.dsn) as conn:
|
||||
with conn.cursor() as cur:
|
||||
cur.drop_table("word_frequencies")
|
||||
LOG.info("Computing word frequencies")
|
||||
cur.execute("""CREATE TEMP TABLE word_frequencies AS
|
||||
SELECT unnest(name_vector) as id, count(*)
|
||||
FROM search_name GROUP BY id""")
|
||||
cur.execute("CREATE INDEX ON word_frequencies(id)")
|
||||
LOG.info("Update word table with recomputed frequencies")
|
||||
cur.execute("""UPDATE word SET search_name_count = count
|
||||
FROM word_frequencies
|
||||
WHERE word_token like ' %' and word_id = id""")
|
||||
cur.drop_table("word_frequencies")
|
||||
conn.commit()
|
||||
|
||||
def name_analyzer(self):
|
||||
""" Create a new analyzer for tokenizing names and queries
|
||||
using this tokinzer. Analyzers are context managers and should
|
||||
|
||||
Reference in New Issue
Block a user