move abbreviation computation into import phase

This adds precomputation of abbreviated terms for names during import and removes the abbreviation of terms in the query. Basic import works, but it still needs thorough testing as well as speed improvements during import. This change introduces a new dependency on the Python library datrie.
2021-05-28 22:06:13 +02:00
parent 6ba00e6aee
commit 8413075249
10 changed files with 665 additions and 206 deletions
--- a/nominatim/tokenizer/legacy_icu_tokenizer.py
+++ b/nominatim/tokenizer/legacy_icu_tokenizer.py
@@ -18,11 +18,11 @@ import psycopg2.extras
 from nominatim.db.connection import connect
 from nominatim.db.properties import set_property, get_property
 from nominatim.db.sql_preprocessor import SQLPreprocessor
+from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
+from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules

-DBCFG_NORMALIZATION = "tokenizer_normalization"
 DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
-DBCFG_TRANSLITERATION = "tokenizer_transliteration"
-DBCFG_ABBREVIATIONS = "tokenizer_abbreviations"
+DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"

 LOG = logging.getLogger()

@@ -41,9 +41,9 @@ class LegacyICUTokenizer:
    def __init__(self, dsn, data_dir):
        self.dsn = dsn
        self.data_dir = data_dir
-        self.normalization = None
-        self.transliteration = None
-        self.abbreviations = None
+        self.naming_rules = None
+        self.term_normalization = None
+        self.max_word_frequency = None


    def init_new_db(self, config, init_db=True):
@@ -55,14 +55,14 @@ class LegacyICUTokenizer:
        if config.TOKENIZER_CONFIG:
            cfgfile = Path(config.TOKENIZER_CONFIG)
        else:
-            cfgfile = config.config_dir / 'legacy_icu_tokenizer.json'
+            cfgfile = config.config_dir / 'legacy_icu_tokenizer.yaml'

-        rules = json.loads(cfgfile.read_text())
-        self._load_transliteration(rules['normalization'], cfgfile.parent)
-        self.abbreviations = rules["abbreviations"]
-        self.normalization = config.TERM_NORMALIZATION
+        loader = ICURuleLoader(cfgfile)
+        self.naming_rules = ICUNameProcessorRules(loader=loader)
+        self.term_normalization = config.TERM_NORMALIZATION
+        self.max_word_frequency = config.MAX_WORD_FREQUENCY

-        self._install_php(config)
+        self._install_php(config.lib_dir.php)
        self._save_config(config)

        if init_db:
@@ -70,19 +70,13 @@ class LegacyICUTokenizer:
            self._init_db_tables(config)


-    def _load_transliteration(self, rules, cfg_path):
-        if isinstance(rules, str):
-            self.transliteration = (cfg_path / rules).read_text().replace('\n', ' ')
-        else:
-            self.transliteration = ';'.join(rules) + ';'
-
    def init_from_project(self):
        """ Initialise the tokenizer from the project directory.
        """
        with connect(self.dsn) as conn:
-            self.normalization = get_property(conn, DBCFG_NORMALIZATION)
-            self.transliteration = get_property(conn, DBCFG_TRANSLITERATION)
-            self.abbreviations = json.loads(get_property(conn, DBCFG_ABBREVIATIONS))
+            self.naming_rules = ICUNameProcessorRules(conn=conn)
+            self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION)
+            self.max_word_frequency = get_property(conn, DBCFG_MAXWORDFREQ)


    def finalize_import(self, config):
@@ -132,26 +126,20 @@ class LegacyICUTokenizer:

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
-        norm = Transliterator.createFromRules("normalizer", self.normalization)
-        trans = Transliterator.createFromRules("trans", self.transliteration)
-        return LegacyICUNameAnalyzer(self.dsn, norm, trans, self.abbreviations)
+        return LegacyICUNameAnalyzer(self.dsn, ICUNameProcessor(self.naming_rules))


-    def _install_php(self, config):
+    def _install_php(self, phpdir):
        """ Install the php script for the tokenizer.
        """
-        abbr_inverse = list(zip(*self.abbreviations))
        php_file = self.data_dir / "tokenizer.php"
        php_file.write_text(dedent("""\
            <?php
-            @define('CONST_Max_Word_Frequency', {1.MAX_WORD_FREQUENCY});
-            @define('CONST_Term_Normalization_Rules', "{0.normalization}");
-            @define('CONST_Transliteration', "{0.transliteration}");
-            @define('CONST_Abbreviations', array(array('{2}'), array('{3}')));
-            require_once('{1.lib_dir.php}/tokenizer/legacy_icu_tokenizer.php');
-            """.format(self, config,
-                       "','".join(abbr_inverse[0]),
-                       "','".join(abbr_inverse[1]))))
+            @define('CONST_Max_Word_Frequency', {0.max_word_frequency});
+            @define('CONST_Term_Normalization_Rules', "{0.term_normalization}");
+            @define('CONST_Transliteration', "{0.naming_rules.search_rules}");
+            require_once('{1}/tokenizer/legacy_icu_tokenizer.php');
+            """.format(self, phpdir)))


    def _save_config(self, config):
@@ -159,10 +147,10 @@ class LegacyICUTokenizer:
            database as database properties.
        """
        with connect(self.dsn) as conn:
-            set_property(conn, DBCFG_NORMALIZATION, self.normalization)
+            self.naming_rules.save_rules(conn)
+
            set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
-            set_property(conn, DBCFG_TRANSLITERATION, self.transliteration)
-            set_property(conn, DBCFG_ABBREVIATIONS, json.dumps(self.abbreviations))
+            set_property(conn, DBCFG_TERM_NORMALIZATION, self.term_normalization)


    def _init_db_tables(self, config):
@@ -178,15 +166,14 @@ class LegacyICUTokenizer:

            # get partial words and their frequencies
            words = Counter()
-            with self.name_analyzer() as analyzer:
-                with conn.cursor(name="words") as cur:
-                    cur.execute("SELECT svals(name) as v, count(*) FROM place GROUP BY v")
+            name_proc = ICUNameProcessor(self.naming_rules)
+            with conn.cursor(name="words") as cur:
+                cur.execute("SELECT svals(name) as v, count(*) FROM place GROUP BY v")

-                    for name, cnt in cur:
-                        term = analyzer.make_standard_word(name)
-                        if term:
-                            for word in term.split():
-                                words[word] += cnt
+                for name, cnt in cur:
+                    for word in name_proc.get_variants_ascii(name_proc.get_normalized(name)):
+                        for term in word.split():
+                            words[term] += cnt

            # copy them back into the word table
            copystr = io.StringIO(''.join(('{}\t{}\n'.format(*args) for args in words.items())))
@@ -208,12 +195,10 @@ class LegacyICUNameAnalyzer:
        normalization.
    """

-    def __init__(self, dsn, normalizer, transliterator, abbreviations):
+    def __init__(self, dsn, name_proc):
        self.conn = connect(dsn).connection
        self.conn.autocommit = True
-        self.normalizer = normalizer
-        self.transliterator = transliterator
-        self.abbreviations = abbreviations
+        self.name_processor = name_proc

        self._cache = _TokenCache()

@@ -248,9 +233,9 @@ class LegacyICUNameAnalyzer:
        tokens = {}
        for word in words:
            if word.startswith('#'):
-                tokens[word] = ' ' + self.make_standard_word(word[1:])
+                tokens[word] = ' ' + self.name_processor.get_normalized(word[1:])
            else:
-                tokens[word] = self.make_standard_word(word)
+                tokens[word] = self.name_processor.get_normalized(word)

        with conn.cursor() as cur:
            cur.execute("""SELECT word_token, word_id
@@ -263,12 +248,6 @@ class LegacyICUNameAnalyzer:
        return [(k, v, ids[v]) for k, v in tokens.items()]


-    def normalize(self, phrase):
-        """ Normalize the given phrase, i.e. remove all properties that
-            are irrelevant for search.
-        """
-        return self.normalizer.transliterate(phrase)
-
    @staticmethod
    def normalize_postcode(postcode):
        """ Convert the postcode to a standardized form.
@@ -279,27 +258,12 @@ class LegacyICUNameAnalyzer:
        return postcode.strip().upper()


-    @functools.lru_cache(maxsize=1024)
-    def make_standard_word(self, name):
-        """ Create the normalised version of the input.
-        """
-        norm = ' ' + self.transliterator.transliterate(name) + ' '
-        for full, abbr in self.abbreviations:
-            if full in norm:
-                norm = norm.replace(full, abbr)
-
-        return norm.strip()
-
-
    def _make_standard_hnr(self, hnr):
        """ Create a normalised version of a housenumber.

            This function takes minor shortcuts on transliteration.
        """
-        if hnr.isdigit():
-            return hnr
-
-        return self.transliterator.transliterate(hnr)
+        return self.name_processor.get_search_normalized(hnr)

    def update_postcodes_from_db(self):
        """ Update postcode tokens in the word table from the location_postcode
@@ -325,7 +289,7 @@ class LegacyICUNameAnalyzer:
                else:
                    copystr.write(postcode)
                    copystr.write('\t ')
-                    copystr.write(self.transliterator.transliterate(postcode))
+                    copystr.write(self.name_processor.get_search_normalized(postcode))
                    copystr.write('\tplace\tpostcode\t0\n')

            if to_delete:
@@ -344,7 +308,7 @@ class LegacyICUNameAnalyzer:
    def update_special_phrases(self, phrases, should_replace):
        """ Replace the search index for special phrases with the new phrases.
        """
-        norm_phrases = set(((self.normalize(p[0]), p[1], p[2], p[3])
+        norm_phrases = set(((self.name_processor.get_search_normalized(p[0]), p[1], p[2], p[3])
                            for p in phrases))

        with self.conn.cursor() as cur:
@@ -362,7 +326,7 @@ class LegacyICUNameAnalyzer:
            if to_add:
                copystr = io.StringIO()
                for word, cls, typ, oper in to_add:
-                    term = self.make_standard_word(word)
+                    term = self.name_processor.get_search_normalized(word)
                    if term:
                        copystr.write(word)
                        copystr.write('\t ')
@@ -395,15 +359,11 @@ class LegacyICUNameAnalyzer:
    def add_country_names(self, country_code, names):
        """ Add names for the given country to the search index.
        """
-        full_names = set((self.make_standard_word(n) for n in names))
-        full_names.discard('')
-        self._add_normalized_country_names(country_code, full_names)
+        word_tokens = set()
+        for name in self._compute_full_names(names):
+            if name:
+                word_tokens.add(' ' + self.name_processor.get_search_normalized(name))

-
-    def _add_normalized_country_names(self, country_code, names):
-        """ Add names for the given country to the search index.
-        """
-        word_tokens = set((' ' + name for name in names))
        with self.conn.cursor() as cur:
            # Get existing names
            cur.execute("SELECT word_token FROM word WHERE country_code = %s",
@@ -429,14 +389,13 @@ class LegacyICUNameAnalyzer:
        names = place.get('name')

        if names:
-            full_names = self._compute_full_names(names)
+            fulls, partials = self._compute_name_tokens(names)

-            token_info.add_names(self.conn, full_names)
+            token_info.add_names(fulls, partials)

            country_feature = place.get('country_feature')
            if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
-                self._add_normalized_country_names(country_feature.lower(),
-                                                   full_names)
+                self.add_country_names(country_feature.lower(), names)

        address = place.get('address')

@@ -449,38 +408,60 @@ class LegacyICUNameAnalyzer:
                elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
                    hnrs.append(value)
                elif key == 'street':
-                    token_info.add_street(self.conn, self.make_standard_word(value))
+                    token_info.add_street(*self._compute_name_tokens({'name': value}))
                elif key == 'place':
-                    token_info.add_place(self.conn, self.make_standard_word(value))
+                    token_info.add_place(*self._compute_name_tokens({'name': value}))
                elif not key.startswith('_') and \
                     key not in ('country', 'full'):
-                    addr_terms.append((key, self.make_standard_word(value)))
+                    addr_terms.append((key, *self._compute_name_tokens({'name': value})))

            if hnrs:
                hnrs = self._split_housenumbers(hnrs)
                token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])

            if addr_terms:
-                token_info.add_address_terms(self.conn, addr_terms)
+                token_info.add_address_terms(addr_terms)

        return token_info.data


+    def _compute_name_tokens(self, names):
+        """ Computes the full name and partial name tokens for the given
+            dictionary of names.
+        """
+        full_names = self._compute_full_names(names)
+        full_tokens = set()
+        partial_tokens = set()
+
+        for name in full_names:
+            norm_name = self.name_processor.get_normalized(name)
+            full, part = self._cache.names.get(norm_name, (None, None))
+            if full is None:
+                variants = self.name_processor.get_variants_ascii(norm_name)
+                with self.conn.cursor() as cur:
+                    cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
+                                (norm_name, variants))
+                    full, part = cur.fetchone()
+
+                self._cache.names[norm_name] = (full, part)
+
+            full_tokens.add(full)
+            partial_tokens.update(part)
+
+        return full_tokens, partial_tokens
+
+
    def _compute_full_names(self, names):
        """ Return the set of all full name word ids to be used with the
            given dictionary of names.
        """
        full_names = set()
        for name in (n for ns in names.values() for n in re.split('[;,]', ns)):
-            word = self.make_standard_word(name)
-            if word:
-                full_names.add(word)
+            full_names.add(name.strip())

-                brace_split = name.split('(', 2)
-                if len(brace_split) > 1:
-                    word = self.make_standard_word(brace_split[0])
-                    if word:
-                        full_names.add(word)
+            brace_idx = name.find('(')
+            if brace_idx >= 0:
+                full_names.add(name[:brace_idx].strip())

        return full_names

@@ -492,7 +473,7 @@ class LegacyICUNameAnalyzer:
            postcode = self.normalize_postcode(postcode)

            if postcode not in self._cache.postcodes:
-                term = self.make_standard_word(postcode)
+                term = self.name_processor.get_search_normalized(postcode)
                if not term:
                    return

@@ -508,6 +489,7 @@ class LegacyICUNameAnalyzer:
                                """, (' ' + term, postcode))
                self._cache.postcodes.add(postcode)

+
    @staticmethod
    def _split_housenumbers(hnrs):
        if len(hnrs) > 1 or ',' in hnrs[0] or ';' in hnrs[0]:
@@ -530,7 +512,7 @@ class _TokenInfo:
    """ Collect token information to be sent back to the database.
    """
    def __init__(self, cache):
-        self.cache = cache
+        self._cache = cache
        self.data = {}

    @staticmethod
@@ -538,86 +520,44 @@ class _TokenInfo:
        return '{%s}' % ','.join((str(s) for s in tokens))


-    def add_names(self, conn, names):
+    def add_names(self, fulls, partials):
        """ Adds token information for the normalised names.
        """
-        # Start with all partial names
-        terms = set((part for ns in names for part in ns.split()))
-        # Add the full names
-        terms.update((' ' + n for n in names))
-
-        self.data['names'] = self._mk_array(self.cache.get_term_tokens(conn, terms))
+        self.data['names'] = self._mk_array(itertools.chain(fulls, partials))


    def add_housenumbers(self, conn, hnrs):
        """ Extract housenumber information from a list of normalised
            housenumbers.
        """
-        self.data['hnr_tokens'] = self._mk_array(self.cache.get_hnr_tokens(conn, hnrs))
+        self.data['hnr_tokens'] = self._mk_array(self._cache.get_hnr_tokens(conn, hnrs))
        self.data['hnr'] = ';'.join(hnrs)


-    def add_street(self, conn, street):
+    def add_street(self, fulls, partials):
        """ Add addr:street match terms.
        """
-        if not street:
-            return
-
-        term = ' ' + street
-
-        tid = self.cache.names.get(term)
-
-        if tid is None:
-            with conn.cursor() as cur:
-                cur.execute("""SELECT word_id FROM word
-                                WHERE word_token = %s
-                                      and class is null and type is null""",
-                            (term, ))
-                if cur.rowcount > 0:
-                    tid = cur.fetchone()[0]
-                    self.cache.names[term] = tid
-
-        if tid is not None:
-            self.data['street'] = '{%d}' % tid
+        if fulls:
+            self.data['street'] = self._mk_array(fulls)


-    def add_place(self, conn, place):
+    def add_place(self, fulls, partials):
        """ Add addr:place search and match terms.
        """
-        if not place:
-            return
-
-        partial_ids = self.cache.get_term_tokens(conn, place.split())
-        tid = self.cache.get_term_tokens(conn, [' ' + place])
-
-        self.data['place_search'] = self._mk_array(itertools.chain(partial_ids, tid))
-        self.data['place_match'] = '{%s}' % tid[0]
+        if fulls:
+            self.data['place_search'] = self._mk_array(itertools.chain(fulls, partials))
+            self.data['place_match'] = self._mk_array(fulls)


-    def add_address_terms(self, conn, terms):
+    def add_address_terms(self, terms):
        """ Add additional address terms.
        """
        tokens = {}

-        for key, value in terms:
-            if not value:
-                continue
-            partial_ids = self.cache.get_term_tokens(conn, value.split())
-            term = ' ' + value
-            tid = self.cache.names.get(term)
-
-            if tid is None:
-                with conn.cursor() as cur:
-                    cur.execute("""SELECT word_id FROM word
-                                    WHERE word_token = %s
-                                          and class is null and type is null""",
-                                (term, ))
-                    if cur.rowcount > 0:
-                        tid = cur.fetchone()[0]
-                        self.cache.names[term] = tid
-
-            tokens[key] = [self._mk_array(partial_ids),
-                           '{%s}' % ('' if tid is None else str(tid))]
+        for key, fulls, partials in terms:
+            if fulls:
+                tokens[key] = [self._mk_array(itertools.chain(fulls, partials)),
+                               self._mk_array(fulls)]

        if tokens:
            self.data['addr'] = tokens
@@ -635,32 +575,6 @@ class _TokenCache:
        self.housenumbers = {}


-    def get_term_tokens(self, conn, terms):
-        """ Get token ids for a list of terms, looking them up in the database
-            if necessary.
-        """
-        tokens = []
-        askdb = []
-
-        for term in terms:
-            token = self.names.get(term)
-            if token is None:
-                askdb.append(term)
-            elif token != 0:
-                tokens.append(token)
-
-        if askdb:
-            with conn.cursor() as cur:
-                cur.execute("SELECT term, getorcreate_term_id(term) FROM unnest(%s) as term",
-                            (askdb, ))
-                for term, tid in cur:
-                    self.names[term] = tid
-                    if tid != 0:
-                        tokens.append(tid)
-
-        return tokens
-
-
    def get_hnr_tokens(self, conn, terms):
        """ Get token ids for a list of housenumbers, looking them up in the
            database if necessary.