icu: no longer precompute terms

The ICU analyzer no longer drops frequent partials, so it is no
longer necessary to know the frequencies in advance.
This commit is contained in:
Sarah Hoffmann
2021-10-19 11:50:06 +02:00
parent e8e2502e2f
commit ec7184c533

View File

@@ -2,7 +2,6 @@
Tokenizer implementing normalisation as used before Nominatim 4 but using
libICU instead of the PostgreSQL module.
"""
from collections import Counter
import itertools
import json
import logging
@@ -161,43 +160,6 @@ class LegacyICUTokenizer(AbstractTokenizer):
sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
conn.commit()
LOG.warning("Precomputing word tokens")
# get partial words and their frequencies
words = self._count_partial_terms(conn)
# copy them back into the word table
with CopyBuffer() as copystr:
for term, cnt in words.items():
copystr.add('w', term, json.dumps({'count': cnt}))
with conn.cursor() as cur:
copystr.copy_out(cur, 'word',
columns=['type', 'word_token', 'info'])
cur.execute("""UPDATE word SET word_id = nextval('seq_word')
WHERE word_id is null and type = 'w'""")
conn.commit()
def _count_partial_terms(self, conn):
    """ Compute occurrence counts for the partial terms derived from
        the names in the place table.

        Each distinct name value is transliterated with the configured
        token analysis; for multi-word results every unique word
        contributes the name's frequency to its count.
    """
    term_counts = Counter()
    analyzer = self.loader.make_token_analysis()

    # Server-side (named) cursor so the grouped name list streams
    # instead of being materialised client-side in one go.
    with conn.cursor(name="words") as cur:
        cur.execute(""" SELECT v, count(*) FROM
                      (SELECT svals(name) as v FROM place)x
                    WHERE length(v) < 75 GROUP BY v""")

        for name, freq in cur:
            transliterated = analyzer.search.transliterate(name)
            # Only multi-word names yield partial terms.
            if not transliterated or ' ' not in transliterated:
                continue
            for partial in set(transliterated.split()):
                term_counts[partial] += freq

    return term_counts
class LegacyICUNameAnalyzer(AbstractAnalyzer):
""" The legacy analyzer uses the ICU library for splitting names.