move amenity creation to tokenizer

The BDD tests still use the old-style amenity creation scripts
because we don't yet have a simple means of importing a
hand-crafted test file of special phrases.
This commit is contained in:
Sarah Hoffmann
2021-04-27 21:50:35 +02:00
parent bef300305e
commit 7cb7cf848d
9 changed files with 256 additions and 483 deletions

View File

@@ -22,10 +22,13 @@ class ImportSpecialPhrases:
@staticmethod
def run(args):
from ..tokenizer import factory as tokenizer_factory
if args.import_from_wiki:
LOG.warning('Special phrases importation starting')
tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)
with connect(args.config.get_libpq_dsn()) as db_connection:
SpecialPhrasesImporter(
args.config, args.phplib_dir, db_connection
).import_from_wiki()
).import_from_wiki(tokenizer)
return 0

View File

@@ -6,6 +6,7 @@ import logging
import re
import shutil
from icu import Transliterator
import psycopg2
import psycopg2.extras
@@ -158,7 +159,9 @@ class LegacyTokenizer:
Analyzers are not thread-safe. You need to instantiate one per thread.
"""
return LegacyNameAnalyzer(self.dsn)
normalizer = Transliterator.createFromRules("phrase normalizer",
self.normalization)
return LegacyNameAnalyzer(self.dsn, normalizer)
def _init_db_tables(self, config):
@@ -182,7 +185,6 @@ class LegacyTokenizer:
properties.set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
class LegacyNameAnalyzer:
""" The legacy analyzer uses the special Postgresql module for
splitting names.
@@ -191,9 +193,10 @@ class LegacyNameAnalyzer:
normalization.
"""
def __init__(self, dsn):
def __init__(self, dsn, normalizer):
self.conn = connect(dsn).connection
self.conn.autocommit = True
self.normalizer = normalizer
psycopg2.extras.register_hstore(self.conn)
self._cache = _TokenCache(self.conn)
@@ -215,6 +218,13 @@ class LegacyNameAnalyzer:
self.conn = None
def normalize(self, phrase):
""" Normalize the given phrase, i.e. remove all properties that
are irrelevant for search.
"""
return self.normalizer.transliterate(phrase)
def add_postcodes_from_db(self):
""" Add postcodes from the location_postcode table to the word table.
"""
@@ -224,6 +234,47 @@ class LegacyNameAnalyzer:
FROM location_postcode) x""")
def update_special_phrases(self, phrases):
""" Replace the search index for special phrases with the new phrases.
"""
norm_phrases = set(((self.normalize(p[0]), p[1], p[2], p[3])
for p in phrases))
with self.conn.cursor() as cur:
# Get the old phrases.
existing_phrases = set()
cur.execute("""SELECT word, class, type, operator FROM word
WHERE class != 'place'
OR (type != 'house' AND type != 'postcode')""")
for label, cls, typ, oper in cur:
existing_phrases.add((label, cls, typ, oper or '-'))
to_add = norm_phrases - existing_phrases
to_delete = existing_phrases - norm_phrases
if to_add:
psycopg2.extras.execute_values(
cur,
""" INSERT INTO word (word_id, word_token, word, class, type,
search_name_count, operator)
(SELECT nextval('seq_word'), make_standard_name(name), name,
class, type, 0,
CASE WHEN op in ('in', 'near') THEN op ELSE null END
FROM (VALUES %s) as v(name, class, type, op))""",
to_add)
if to_delete:
psycopg2.extras.execute_values(
cur,
""" DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
WHERE word = name and class = in_class and type = in_type
and ((op = '-' and operator is null) or op = operator)""",
to_delete)
LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
len(norm_phrases), len(to_add), len(to_delete))
def add_country_names(self, country_code, names):
""" Add names for the given country to the search index.
"""

View File

@@ -24,9 +24,6 @@ class SpecialPhrasesImporterStatistics():
self.tables_deleted = 0
self.tables_ignored = 0
self.global_phrases_invalid = 0
self.global_phrases_added = 0
self.global_phrases_ignored = 0
self.global_phrases_deleted = 0
def _set_lang_values_to_0(self):
"""
@@ -34,8 +31,6 @@ class SpecialPhrasesImporterStatistics():
lang to 0.
"""
self.lang_phrases_invalid = 0
self.lang_phrases_added = 0
self.lang_phrases_ignored = 0
def notify_one_phrase_invalid(self):
"""
@@ -45,29 +40,6 @@ class SpecialPhrasesImporterStatistics():
self.lang_phrases_invalid += 1
self.global_phrases_invalid += 1
def notify_one_phrase_added(self):
"""
Add +1 to the count of entries
added to the db.
"""
self.lang_phrases_added += 1
self.global_phrases_added += 1
def notify_one_phrase_ignored(self):
"""
Add +1 to the count of ignored
entries as it was already in the db.
"""
self.lang_phrases_ignored += 1
self.global_phrases_ignored += 1
def notify_one_phrase_deleted(self):
"""
Add +1 to the count of phrases deleted
from the database.
"""
self.global_phrases_deleted += 1
def notify_one_table_created(self):
"""
Add +1 to the count of created tables.
@@ -97,12 +69,6 @@ class SpecialPhrasesImporterStatistics():
LOG.info('- %s phrases were invalid.', self.global_phrases_invalid)
if self.global_phrases_invalid > 0:
LOG.info(' Those invalid phrases have been skipped.')
LOG.info('- %s phrases were ignored as they are already in the database',
self.global_phrases_ignored)
LOG.info('- %s phrases were added to the database', self.global_phrases_added)
LOG.info('- %s phrases were deleted from the database', self.global_phrases_deleted)
if self.global_phrases_deleted > 0:
LOG.info(' They were deleted as they are not valid anymore.')
LOG.info('- %s tables were ignored as they already exist on the database',
self.tables_ignored)
LOG.info('- %s tables were created', self.tables_created)
@@ -126,9 +92,6 @@ class SpecialPhrasesImporterStatistics():
LOG.info('- %s phrases were invalid.', self.lang_phrases_invalid)
if self.lang_phrases_invalid > 0:
LOG.info(' Those invalid phrases have been skipped.')
LOG.info('- %s phrases were ignored as they are already in the database',
self.lang_phrases_ignored)
LOG.info('- %s phrases were added to the database', self.lang_phrases_added)
LOG.info('====================================================================')
if self.lang_phrases_invalid > 0:

View File

@@ -9,7 +9,6 @@ import re
import subprocess
import json
from icu import Transliterator
from psycopg2.sql import Identifier, Literal, SQL
from nominatim.tools.exec_utils import get_url
@@ -33,21 +32,14 @@ class SpecialPhrasesImporter():
r'\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([\-YN])'
)
self.sanity_check_pattern = re.compile(r'^\w+$')
self.transliterator = Transliterator.createFromRules("special-phrases normalizer",
self.config.TERM_NORMALIZATION)
#This set will contain all existing phrases from the word table which
#no longer exist on the wiki.
#It contains tuples with the following format: (normalized_word, class, type, operator)
self.words_phrases_to_delete = set()
#This set will contain the phrases which still exist from the wiki.
#It is used to prevent duplicates on the wiki by removing them from
#the word_phrases_to_delete only at the end.
self.words_phrases_still_exist = set()
# This set will contain all existing phrases to be added.
# It contains tuples with the following format: (label, class, type, operator)
self.word_phrases = set()
#This set will contain all existing place_classtype tables which don't match any
#special phrase class/type pair on the wiki.
self.table_phrases_to_delete = set()
def import_from_wiki(self, languages=None):
def import_from_wiki(self, tokenizer, languages=None):
"""
Iterate through all specified languages and
extract corresponding special phrases from the wiki.
@@ -55,7 +47,6 @@ class SpecialPhrasesImporter():
if languages is not None and not isinstance(languages, list):
raise TypeError('The \'languages\' argument should be of type list.')
self._fetch_existing_words_phrases()
self._fetch_existing_place_classtype_tables()
#Get all languages to process.
@@ -71,30 +62,15 @@ class SpecialPhrasesImporter():
self.statistics_handler.notify_current_lang_done(lang)
self._create_place_classtype_table_and_indexes(class_type_pairs)
self._remove_non_existent_phrases_from_db()
self._remove_non_existent_tables_from_db()
self.db_connection.commit()
with tokenizer.name_analyzer() as analyzer:
analyzer.update_special_phrases(self.word_phrases)
LOG.warning('Import done.')
self.statistics_handler.notify_import_done()
def _fetch_existing_words_phrases(self):
"""
Fetch existing special phrases from the word table.
Fill the word_phrases_to_delete set of the class.
"""
#Only extract special phrases terms:
#If class=place and type=house then it is a housenumber term.
#If class=place and type=postcode then it is a postcode term.
word_query = """
SELECT word, class, type, operator FROM word
WHERE class != 'place' OR (type != 'house' AND type != 'postcode')
"""
with self.db_connection.cursor() as db_cursor:
db_cursor.execute(SQL(word_query))
for row in db_cursor:
row[3] = '-' if row[3] is None else row[3]
self.words_phrases_to_delete.add(
(row[0], row[1], row[2], row[3])
)
def _fetch_existing_place_classtype_tables(self):
"""
@@ -176,7 +152,6 @@ class SpecialPhrasesImporter():
for match in matches:
phrase_label = match[0].strip()
normalized_label = self.transliterator.transliterate(phrase_label)
phrase_class = match[1].strip()
phrase_type = match[2].strip()
phrase_operator = match[3].strip()
@@ -198,20 +173,6 @@ class SpecialPhrasesImporter():
):
continue
#Check if the phrase already exists in the database.
if (
(normalized_label, phrase_class, phrase_type, phrase_operator)
in self.words_phrases_to_delete
):
#Remove this phrase from the ones to delete as it still exist on the wiki.
self.words_phrases_still_exist.add(
(normalized_label, phrase_class, phrase_type, phrase_operator)
)
class_type_pairs.add((phrase_class, phrase_type))
self.statistics_handler.notify_one_phrase_ignored()
#Dont need to add this phrase as it already exists in the word table.
continue
#sanity check, in case somebody added garbage in the wiki
if not self._check_sanity(lang, phrase_class, phrase_type):
self.statistics_handler.notify_one_phrase_invalid()
@@ -219,35 +180,11 @@ class SpecialPhrasesImporter():
class_type_pairs.add((phrase_class, phrase_type))
self._process_amenity(
phrase_label, normalized_label, phrase_class,
phrase_type, phrase_operator
)
self.statistics_handler.notify_one_phrase_added()
self.word_phrases.add((phrase_label, phrase_class,
phrase_type, phrase_operator))
return class_type_pairs
def _process_amenity(self, phrase_label, normalized_label,
phrase_class, phrase_type, phrase_operator):
# pylint: disable-msg=too-many-arguments
"""
Add phrase lookup and corresponding class and
type to the word table based on the operator.
"""
with self.db_connection.cursor() as db_cursor:
if phrase_operator == 'near':
db_cursor.execute("""SELECT getorcreate_amenityoperator(
make_standard_name(%s), %s, %s, %s, 'near')""",
(phrase_label, normalized_label, phrase_class, phrase_type))
elif phrase_operator == 'in':
db_cursor.execute("""SELECT getorcreate_amenityoperator(
make_standard_name(%s), %s, %s, %s, 'in')""",
(phrase_label, normalized_label, phrase_class, phrase_type))
else:
db_cursor.execute("""SELECT getorcreate_amenity(
make_standard_name(%s), %s, %s, %s)""",
(phrase_label, normalized_label, phrase_class, phrase_type))
def _create_place_classtype_table_and_indexes(self, class_type_pairs):
"""
@@ -339,33 +276,15 @@ class SpecialPhrasesImporter():
.format(Identifier(table_name),
Identifier(self.config.DATABASE_WEBUSER)))
def _remove_non_existent_phrases_from_db(self):
def _remove_non_existent_tables_from_db(self):
"""
Remove special phrases which doesn't exist on the wiki anymore.
Delete from the word table and delete the place_classtype tables.
Delete the place_classtype tables.
"""
LOG.warning('Cleaning database...')
self.words_phrases_to_delete = self.words_phrases_to_delete - self.words_phrases_still_exist
#Array containing all queries to execute. Contain tuples of format (query, parameters)
queries_parameters = []
#Delete phrases from the word table which are not on the wiki anymore.
for phrase_to_delete in self.words_phrases_to_delete:
self.statistics_handler.notify_one_phrase_deleted()
if phrase_to_delete[3] == '-':
query = """
DELETE FROM word WHERE word = %s AND class = %s AND type = %s AND operator IS null
"""
parameters = (phrase_to_delete[0], phrase_to_delete[1], phrase_to_delete[2], )
queries_parameters.append((query, parameters))
else:
query = """
DELETE FROM word WHERE word = %s AND class = %s AND type = %s AND operator = %s
"""
parameters = (phrase_to_delete[0], phrase_to_delete[1],
phrase_to_delete[2], phrase_to_delete[3], )
queries_parameters.append((query, parameters))
#Delete place_classtype tables corresponding to class/type which are not on the wiki anymore
for table in self.table_phrases_to_delete:
self.statistics_handler.notify_one_table_deleted()