mirror of
https://github.com/osm-search/Nominatim.git
synced 2026-03-11 13:24:07 +00:00
icu: no longer precompute terms
The ICU analyzer no longer drops frequent partials, so it is no longer necessary to know the frequencies in advance.
This commit is contained in:
@@ -2,7 +2,6 @@
|
|||||||
Tokenizer implementing normalisation as used before Nominatim 4 but using
|
Tokenizer implementing normalisation as used before Nominatim 4 but using
|
||||||
libICU instead of the PostgreSQL module.
|
libICU instead of the PostgreSQL module.
|
||||||
"""
|
"""
|
||||||
from collections import Counter
|
|
||||||
import itertools
|
import itertools
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
@@ -161,43 +160,6 @@ class LegacyICUTokenizer(AbstractTokenizer):
|
|||||||
sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
|
sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
|
||||||
conn.commit()
|
conn.commit()
|
||||||
|
|
||||||
LOG.warning("Precomputing word tokens")
|
|
||||||
|
|
||||||
# get partial words and their frequencies
|
|
||||||
words = self._count_partial_terms(conn)
|
|
||||||
|
|
||||||
# copy them back into the word table
|
|
||||||
with CopyBuffer() as copystr:
|
|
||||||
for term, cnt in words.items():
|
|
||||||
copystr.add('w', term, json.dumps({'count': cnt}))
|
|
||||||
|
|
||||||
with conn.cursor() as cur:
|
|
||||||
copystr.copy_out(cur, 'word',
|
|
||||||
columns=['type', 'word_token', 'info'])
|
|
||||||
cur.execute("""UPDATE word SET word_id = nextval('seq_word')
|
|
||||||
WHERE word_id is null and type = 'w'""")
|
|
||||||
|
|
||||||
conn.commit()
|
|
||||||
|
|
||||||
def _count_partial_terms(self, conn):
    """ Tally the partial terms found in the names of the place table.

        Every distinct name value shorter than 75 characters is read
        together with its number of occurrences and transliterated with
        the search transliterator of a fresh token analysis. Names whose
        transliteration consists of more than one word are split on
        whitespace; each distinct term of such a name is credited with
        the occurrence count of the name.

        Returns a Counter mapping term -> accumulated count.
    """
    frequencies = Counter()
    analyzer = self.loader.make_token_analysis()

    with conn.cursor(name="words") as cur:
        cur.execute(""" SELECT v, count(*) FROM
                          (SELECT svals(name) as v FROM place)x
                        WHERE length(v) < 75 GROUP BY v""")

        for name, occurrences in cur:
            normalized = analyzer.search.transliterate(name)
            # Single-word (or empty) results yield no partial terms.
            if not normalized or ' ' not in normalized:
                continue
            # De-duplicate so a term repeated within one name counts once.
            for part in set(normalized.split()):
                frequencies[part] += occurrences

    return frequencies
|
|
||||||
|
|
||||||
|
|
||||||
class LegacyICUNameAnalyzer(AbstractAnalyzer):
|
class LegacyICUNameAnalyzer(AbstractAnalyzer):
|
||||||
""" The legacy analyzer uses the ICU library for splitting names.
|
""" The legacy analyzer uses the ICU library for splitting names.
|
||||||
|
|||||||
Reference in New Issue
Block a user