Mirror of https://github.com/osm-search/Nominatim.git
bdd: run full import on tests
This uncovered a couple of outdated/wrong tests, which have also been fixed.
@@ -390,17 +390,18 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
 
     def add_country_names(self, country_code, names):
-        """ Add names for the given country to the search index.
+        """ Add default names for the given country to the search index.
         """
         # Make sure any name preprocessing for country names applies.
         info = PlaceInfo({'name': names, 'country_code': country_code,
                           'rank_address': 4, 'class': 'boundary',
                           'type': 'administrative'})
         self._add_country_full_names(country_code,
-                                     self.sanitizer.process_names(info)[0])
+                                     self.sanitizer.process_names(info)[0],
+                                     internal=True)
 
 
-    def _add_country_full_names(self, country_code, names):
+    def _add_country_full_names(self, country_code, names, internal=False):
         """ Add names for the given country from an already sanitized
             name list.
         """
 
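The effect of the new `internal` flag is easiest to see as set arithmetic. The sketch below (standalone and illustrative only, not Nominatim code; all names are made up) reproduces the gone/new token computation that `_add_country_full_names` performs in the hunks that follow:

    # Standalone sketch of the token bookkeeping introduced by this commit.
    # Function and variable names are illustrative, not Nominatim API.
    def plan_token_changes(word_tokens, existing_internal, existing_external,
                           internal=False):
        """Return (gone, new) token sets, mirroring _add_country_full_names."""
        # Tokens of the class being imported that vanished from the input.
        existing = existing_internal if internal else existing_external
        gone = existing - word_tokens
        if internal:
            # An internal import upgrades matching external rows: delete them
            # so they can be re-inserted below with the internal marker.
            gone |= existing_external & word_tokens
        # Never re-insert tokens that already exist as internal.
        new = word_tokens - existing_internal
        if not internal:
            # External imports must not duplicate existing external rows either.
            new -= existing_external
        return gone, new

    # 'berlin' was stored as external and now arrives as internal, so it is
    # deleted and re-added with the internal marker; 'deutschland' is gone.
    gone, new = plan_token_changes({'germany', 'berlin'},
                                   existing_internal={'deutschland'},
                                   existing_external={'berlin'},
                                   internal=True)
    assert gone == {'deutschland', 'berlin'}
    assert new == {'germany', 'berlin'}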
@@ -412,21 +413,18 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
 
         with self.conn.cursor() as cur:
             # Get existing names
-            cur.execute("""SELECT word_token FROM word
-                            WHERE type = 'C' and word = %s""",
+            cur.execute("""SELECT word_token, coalesce(info ? 'internal', false) as is_internal
+                             FROM word
+                             WHERE type = 'C' and word = %s""",
                         (country_code, ))
-            existing_tokens = {t[0] for t in cur}
-
-            # Only add those names that are not yet in the list.
-            new_tokens = word_tokens - existing_tokens
-            if new_tokens:
-                cur.execute("""INSERT INTO word (word_token, type, word)
-                               (SELECT token, 'C', %s
-                                FROM unnest(%s) as token)
-                            """, (country_code, list(new_tokens)))
+            existing_tokens = {True: set(), False: set()} # internal/external names
+            for word in cur:
+                existing_tokens[word[1]].add(word[0])
 
             # Delete names that no longer exist.
-            gone_tokens = existing_tokens - word_tokens
+            gone_tokens = existing_tokens[internal] - word_tokens
+            if internal:
+                gone_tokens.update(existing_tokens[False] & word_tokens)
             if gone_tokens:
                 cur.execute("""DELETE FROM word
                                USING unnest(%s) as token
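One detail in the new query: Postgres's JSONB key-existence operator `?` yields NULL when `info` itself is NULL, which is why the result is wrapped in `coalesce(..., false)`; otherwise the `{True: ..., False: ...}` bucketing would hit a `KeyError` on `None`. A minimal standalone sketch of the same lookup (assuming an open psycopg2-style connection `conn`; 'de' is just an example country code):

    # Minimal sketch, assuming a psycopg2-style connection `conn` to a
    # database with Nominatim's word table.
    with conn.cursor() as cur:
        cur.execute("""SELECT word_token,
                              coalesce(info ? 'internal', false) as is_internal
                         FROM word
                        WHERE type = 'C' and word = %s""", ('de', ))
        buckets = {True: set(), False: set()}  # internal / external tokens
        for word_token, is_internal in cur:
            # Thanks to coalesce(), is_internal is always True or False,
            # never None, so it is safe to use as a dict key here.
            buckets[is_internal].add(word_token)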
@@ -434,6 +432,23 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
                                and word_token = token""",
                             (list(gone_tokens), country_code))
 
+            # Only add those names that are not yet in the list.
+            new_tokens = word_tokens - existing_tokens[True]
+            if not internal:
+                new_tokens -= existing_tokens[False]
+            if new_tokens:
+                if internal:
+                    sql = """INSERT INTO word (word_token, type, word, info)
+                               (SELECT token, 'C', %s, '{"internal": "yes"}'
+                                FROM unnest(%s) as token)
+                          """
+                else:
+                    sql = """INSERT INTO word (word_token, type, word)
+                               (SELECT token, 'C', %s
+                                FROM unnest(%s) as token)
+                          """
+                cur.execute(sql, (country_code, list(new_tokens)))
+
 
     def process_place(self, place):
         """ Determine tokenizer information about the given place.
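Taken together: the default country names set up through `add_country_names` at import time are now always stored with the internal marker, while callers using the `internal=False` default (country names coming in through regular place processing) keep inserting rows without it. A schematic usage example; the actual word_token values depend on the configured sanitizers and ICU transliteration rules, so the table below is an illustration, not exact output:

    # Schematic only: `analyzer` stands for a LegacyICUNameAnalyzer
    # instance; real tokens depend on sanitizer/transliteration config.
    analyzer.add_country_names('de', {'name': 'Deutschland',
                                      'name:en': 'Germany'})
    # Resulting rows in the word table (schematic):
    #   word_token  | type | word | info
    #   ------------+------+------+---------------------
    #   deutschland | C    | de   | {"internal": "yes"}
    #   germany     | C    | de   | {"internal": "yes"}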