icu: no longer precompute terms

The ICU analyzer no longer drops frequent partials, so it is no
longer necessary to know the frequencies in advance.
This commit is contained in:
Sarah Hoffmann
2021-10-19 11:50:06 +02:00
parent e8e2502e2f
commit ec7184c533

View File

@@ -2,7 +2,6 @@
 Tokenizer implementing normalisation as used before Nominatim 4 but using
 libICU instead of the PostgreSQL module.
 """
-from collections import Counter
 import itertools
 import json
 import logging
@@ -161,43 +160,6 @@ class LegacyICUTokenizer(AbstractTokenizer):
             sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
             conn.commit()
-        LOG.warning("Precomputing word tokens")
-        # get partial words and their frequencies
-        words = self._count_partial_terms(conn)
-        # copy them back into the word table
-        with CopyBuffer() as copystr:
-            for term, cnt in words.items():
-                copystr.add('w', term, json.dumps({'count': cnt}))
-            with conn.cursor() as cur:
-                copystr.copy_out(cur, 'word',
-                                 columns=['type', 'word_token', 'info'])
-                cur.execute("""UPDATE word SET word_id = nextval('seq_word')
-                               WHERE word_id is null and type = 'w'""")
-        conn.commit()
-    def _count_partial_terms(self, conn):
-        """ Count the partial terms from the names in the place table.
-        """
-        words = Counter()
-        analysis = self.loader.make_token_analysis()
-        with conn.cursor(name="words") as cur:
-            cur.execute(""" SELECT v, count(*) FROM
-                              (SELECT svals(name) as v FROM place)x
-                            WHERE length(v) < 75 GROUP BY v""")
-            for name, cnt in cur:
-                word = analysis.search.transliterate(name)
-                if word and ' ' in word:
-                    for term in set(word.split()):
-                        words[term] += cnt
-        return words
 class LegacyICUNameAnalyzer(AbstractAnalyzer):
     """ The legacy analyzer uses the ICU library for splitting names.