move default country name creation to tokenizer

The new function is also used when a country is updated. All SQL functions related to country names have been removed.
2021-04-27 11:37:18 +02:00
parent dc700c25b6
commit bef300305e
8 changed files with 105 additions and 134 deletions
--- a/nominatim/clicmd/setup.py
+++ b/nominatim/clicmd/setup.py
@@ -133,7 +133,8 @@ class SetupAll:
            database_import.create_search_indices(conn, args.config,
                                                  drop=args.no_updates)
            LOG.warning('Create search index for default country names.')
-            database_import.create_country_names(conn, args.config)
+            database_import.create_country_names(conn, tokenizer,
+                                                 args.config.LANGUAGES)

        webdir = args.project_dir / 'website'
        LOG.warning('Setup website at %s', webdir)
--- a/nominatim/tokenizer/legacy_tokenizer.py
+++ b/nominatim/tokenizer/legacy_tokenizer.py
@@ -223,6 +223,21 @@ class LegacyNameAnalyzer:
                           FROM (SELECT distinct(postcode) as pc
                                 FROM location_postcode) x""")

+
+    def add_country_names(self, country_code, names):
+        """ Add names for the given country to the search index.
+        """
+        with self.conn.cursor() as cur:
+            cur.execute(
+                """INSERT INTO word (word_id, word_token, country_code)
+                   (SELECT nextval('seq_word'), lookup_token, %s
+                      FROM (SELECT ' ' || make_standard_name(n) as lookup_token
+                            FROM unnest(%s)n) y
+                      WHERE NOT EXISTS(SELECT * FROM word
+                                       WHERE word_token = lookup_token and country_code = %s))
+                """, (country_code, names, country_code))
+
+
    def process_place(self, place):
        """ Determine tokenizer information about the given place.

@@ -231,7 +246,14 @@ class LegacyNameAnalyzer:
        """
        token_info = _TokenInfo(self._cache)

-        token_info.add_names(self.conn, place.get('name'), place.get('country_feature'))
+        names = place.get('name')
+
+        if names:
+            token_info.add_names(self.conn, names)
+
+            country_feature = place.get('country_feature')
+            if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
+                self.add_country_names(country_feature.lower(), list(names.values()))

        address = place.get('address')

@@ -279,22 +301,14 @@ class _TokenInfo:
        self.data = {}


-    def add_names(self, conn, names, country_feature):
+    def add_names(self, conn, names):
        """ Add token information for the names of the place.
        """
-        if not names:
-            return
-
        with conn.cursor() as cur:
            # Create the token IDs for all names.
            self.data['names'] = cur.scalar("SELECT make_keywords(%s)::text",
                                            (names, ))

-            # Add country tokens to word table if necessary.
-            if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
-                cur.execute("SELECT create_country(%s, %s)",
-                            (names, country_feature.lower()))
-

    def add_housenumbers(self, conn, hnrs):
        """ Extract housenumber information from the address.
@@ -334,7 +348,8 @@ class _TokenInfo:
        """
        def _get_place(name):
            with conn.cursor() as cur:
-                cur.execute("""SELECT (addr_ids_from_name(%s) || getorcreate_name_id(make_standard_name(%s), ''))::text,
+                cur.execute("""SELECT (addr_ids_from_name(%s)
+                                       || getorcreate_name_id(make_standard_name(%s), ''))::text,
                                      word_ids_from_name(%s)::text""",
                            (name, name, name))
                return cur.fetchone()
--- a/nominatim/tools/database_import.py
+++ b/nominatim/tools/database_import.py
@@ -8,6 +8,7 @@ import subprocess
 from pathlib import Path

 import psutil
+import psycopg2.extras

 from nominatim.db.connection import connect, get_pg_env
 from nominatim.db import utils as db_utils
@@ -250,34 +251,37 @@ def create_search_indices(conn, config, drop=False):

    sql.run_sql_file(conn, 'indices.sql', drop=drop)

-def create_country_names(conn, config):
-    """ Create search index for default country names.
+def create_country_names(conn, tokenizer, languages=None):
+    """ Add default country names to search index. `languages` is a comma-
+        separated list of language codes as used in OSM. If `languages` is not
+        empty then only name translations for the given languages are added
+        to the index.
    """
+    if languages:
+        languages = languages.split(',')
+
+    def _include_key(key):
+        return key == 'name' or \
+               (key.startswith('name:') \
+                and (not languages or key[5:] in languages))

    with conn.cursor() as cur:
-        cur.execute("""SELECT getorcreate_country(make_standard_name('uk'), 'gb')""")
-        cur.execute("""SELECT getorcreate_country(make_standard_name('united states'), 'us')""")
-        cur.execute("""SELECT COUNT(*) FROM
-                       (SELECT getorcreate_country(make_standard_name(country_code),
-                       country_code) FROM country_name WHERE country_code is not null) AS x""")
-        cur.execute("""SELECT COUNT(*) FROM
-                       (SELECT getorcreate_country(make_standard_name(name->'name'), country_code) 
-                       FROM country_name WHERE name ? 'name') AS x""")
-        sql_statement = """SELECT COUNT(*) FROM (SELECT getorcreate_country(make_standard_name(v),
-                           country_code) FROM (SELECT country_code, skeys(name)
-                           AS k, svals(name) AS v FROM country_name) x WHERE k"""
+        psycopg2.extras.register_hstore(cur)
+        cur.execute("""SELECT country_code, name FROM country_name
+                       WHERE country_code is not null""")

-        languages = config.LANGUAGES
+        with tokenizer.name_analyzer() as analyzer:
+            for code, name in cur:
+                names = [code]
+                if code == 'gb':
+                    names.append('UK')
+                if code == 'us':
+                    names.append('United States')
+
+                # country names (only in languages as provided)
+                if name:
+                    names.extend((v for k, v in name.items() if _include_key(k)))
+
+                analyzer.add_country_names(code, names)

-        if languages:
-            sql_statement = "{} IN (".format(sql_statement)
-            delim = ''
-            for language in languages.split(','):
-                sql_statement = "{}{}'name:{}'".format(sql_statement, delim, language)
-                delim = ', '
-            sql_statement = '{})'.format(sql_statement)
-        else:
-            sql_statement = "{} LIKE 'name:%'".format(sql_statement)
-        sql_statement = "{}) v".format(sql_statement)
-        cur.execute(sql_statement)
    conn.commit()