icu: no longer precompute terms

The ICU analyzer no longer drops frequent partials, so it is no
longer necessary to know the frequencies in advance.
This commit is contained in:
Sarah Hoffmann
2021-10-19 11:50:06 +02:00
parent e8e2502e2f
commit ec7184c533

View File

@@ -2,7 +2,6 @@
 Tokenizer implementing normalisation as used before Nominatim 4 but using
 libICU instead of the PostgreSQL module.
 """
-from collections import Counter
 import itertools
 import json
 import logging
@@ -161,43 +160,6 @@ class LegacyICUTokenizer(AbstractTokenizer):
             sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
             conn.commit()
-        LOG.warning("Precomputing word tokens")
-        # get partial words and their frequencies
-        words = self._count_partial_terms(conn)
-        # copy them back into the word table
-        with CopyBuffer() as copystr:
-            for term, cnt in words.items():
-                copystr.add('w', term, json.dumps({'count': cnt}))
-            with conn.cursor() as cur:
-                copystr.copy_out(cur, 'word',
-                                 columns=['type', 'word_token', 'info'])
-                cur.execute("""UPDATE word SET word_id = nextval('seq_word')
-                               WHERE word_id is null and type = 'w'""")
-        conn.commit()
-    def _count_partial_terms(self, conn):
-        """ Count the partial terms from the names in the place table.
-        """
-        words = Counter()
-        analysis = self.loader.make_token_analysis()
-        with conn.cursor(name="words") as cur:
-            cur.execute(""" SELECT v, count(*) FROM
-                              (SELECT svals(name) as v FROM place)x
-                            WHERE length(v) < 75 GROUP BY v""")
-            for name, cnt in cur:
-                word = analysis.search.transliterate(name)
-                if word and ' ' in word:
-                    for term in set(word.split()):
-                        words[term] += cnt
-        return words
 class LegacyICUNameAnalyzer(AbstractAnalyzer):
     """ The legacy analyzer uses the ICU library for splitting names.