Mirror of https://github.com/osm-search/Nominatim.git, synced 2026-02-14 10:27:57 +00:00.
icu: no longer precompute terms
The ICU analyzer no longer drops frequent partials, so it is no longer necessary to know the frequencies in advance.
This commit is contained in:
@@ -2,7 +2,6 @@
|
||||
Tokenizer implementing normalisation as used before Nominatim 4 but using
|
||||
libICU instead of the PostgreSQL module.
|
||||
"""
|
||||
from collections import Counter
|
||||
import itertools
|
||||
import json
|
||||
import logging
|
||||
@@ -161,43 +160,6 @@ class LegacyICUTokenizer(AbstractTokenizer):
|
||||
sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
|
||||
conn.commit()
|
||||
|
||||
LOG.warning("Precomputing word tokens")
|
||||
|
||||
# get partial words and their frequencies
|
||||
words = self._count_partial_terms(conn)
|
||||
|
||||
# copy them back into the word table
|
||||
with CopyBuffer() as copystr:
|
||||
for term, cnt in words.items():
|
||||
copystr.add('w', term, json.dumps({'count': cnt}))
|
||||
|
||||
with conn.cursor() as cur:
|
||||
copystr.copy_out(cur, 'word',
|
||||
columns=['type', 'word_token', 'info'])
|
||||
cur.execute("""UPDATE word SET word_id = nextval('seq_word')
|
||||
WHERE word_id is null and type = 'w'""")
|
||||
|
||||
conn.commit()
|
||||
|
||||
def _count_partial_terms(self, conn):
    """ Compute how often each partial term occurs in the names
        of the place table.

        Only names that transliterate to more than one word contribute
        partials; each distinct partial of a name is counted once per
        name, weighted by how many places carry that name.

        Returns a Counter mapping transliterated partial terms to
        their frequency.
    """
    counts = Counter()
    analyzer = self.loader.make_token_analysis()

    # Server-side (named) cursor so the full name list is streamed
    # rather than materialised in memory at once.
    with conn.cursor(name="words") as cursor:
        cursor.execute(""" SELECT v, count(*) FROM
                          (SELECT svals(name) as v FROM place)x
                        WHERE length(v) < 75 GROUP BY v""")

        for value, freq in cursor:
            transliterated = analyzer.search.transliterate(value)
            # Single-word names yield no partials, so skip them.
            if transliterated and ' ' in transliterated:
                # Deduplicate within one name, then add its frequency
                # to every distinct partial in a single update.
                counts.update(dict.fromkeys(set(transliterated.split()), freq))

    return counts
|
||||
|
||||
|
||||
class LegacyICUNameAnalyzer(AbstractAnalyzer):
|
||||
""" The legacy analyzer uses the ICU library for splitting names.
|
||||
|
||||
Reference in New Issue
Block a user