Resolve conflicts

2026-02-16 15:47:58 +00:00 · 2021-05-17 13:52:35 +02:00
parent a33f2c0f5b 925726222f
commit 3206bf59df
37 changed files with 7285 additions and 6729 deletions
--- a/nominatim/tokenizer/legacy_icu_tokenizer.py
+++ b/nominatim/tokenizer/legacy_icu_tokenizer.py
@@ -263,6 +263,16 @@ class LegacyICUNameAnalyzer:
        """
        return self.normalizer.transliterate(phrase)

+    @staticmethod
+    def normalize_postcode(postcode):
+        """ Convert the postcode to a standardized form.
+
+            This function must yield exactly the same result as the SQL function
+            'token_normalized_postcode()'.
+        """
+        return postcode.strip().upper()
+
+
    @functools.lru_cache(maxsize=1024)
    def make_standard_word(self, name):
        """ Create the normalised version of the input.
@@ -285,25 +295,44 @@ class LegacyICUNameAnalyzer:

        return self.transliterator.transliterate(hnr)

-    def add_postcodes_from_db(self):
-        """ Add postcodes from the location_postcode table to the word table.
+    def update_postcodes_from_db(self):
+        """ Update postcode tokens in the word table from the location_postcode
+            table.
        """
+        to_delete = []
        copystr = io.StringIO()
        with self.conn.cursor() as cur:
-            cur.execute("SELECT distinct(postcode) FROM location_postcode")
-            for (postcode, ) in cur:
-                copystr.write(postcode)
-                copystr.write('\t ')
-                copystr.write(self.transliterator.transliterate(postcode))
-                copystr.write('\tplace\tpostcode\t0\n')
+            # This finds us the rows in location_postcode and word that are
+            # missing in the other table.
+            cur.execute("""SELECT * FROM
+                            (SELECT pc, word FROM
+                              (SELECT distinct(postcode) as pc FROM location_postcode) p
+                              FULL JOIN
+                              (SELECT word FROM word
+                                WHERE class ='place' and type = 'postcode') w
+                              ON pc = word) x
+                           WHERE pc is null or word is null""")

-            copystr.seek(0)
-            cur.copy_from(copystr, 'word',
-                          columns=['word', 'word_token', 'class', 'type',
-                                   'search_name_count'])
-            # Don't really need an ID for postcodes....
-            # cur.execute("""UPDATE word SET word_id = nextval('seq_word')
-            #                WHERE word_id is null and type = 'postcode'""")
+            for postcode, word in cur:
+                if postcode is None:
+                    to_delete.append(word)
+                else:
+                    copystr.write(postcode)
+                    copystr.write('\t ')
+                    copystr.write(self.transliterator.transliterate(postcode))
+                    copystr.write('\tplace\tpostcode\t0\n')
+
+            if to_delete:
+                cur.execute("""DELETE FROM WORD
+                               WHERE class ='place' and type = 'postcode'
+                                     and word = any(%s)
+                            """, (to_delete, ))
+
+            if copystr.getvalue():
+                copystr.seek(0)
+                cur.copy_from(copystr, 'word',
+                              columns=['word', 'word_token', 'class', 'type',
+                                       'search_name_count'])


    def update_special_phrases(self, phrases, should_replace):
@@ -435,22 +464,25 @@ class LegacyICUNameAnalyzer:
    def _add_postcode(self, postcode):
        """ Make sure the normalized postcode is present in the word table.
        """
-        if re.search(r'[:,;]', postcode) is None and not postcode in self._cache.postcodes:
-            term = self.make_standard_word(postcode)
-            if not term:
-                return
+        if re.search(r'[:,;]', postcode) is None:
+            postcode = self.normalize_postcode(postcode)

-            with self.conn.cursor() as cur:
-                # no word_id needed for postcodes
-                cur.execute("""INSERT INTO word (word, word_token, class, type,
-                                                 search_name_count)
-                               (SELECT pc, %s, 'place', 'postcode', 0
-                                FROM (VALUES (%s)) as v(pc)
-                                WHERE NOT EXISTS
-                                 (SELECT * FROM word
-                                  WHERE word = pc and class='place' and type='postcode'))
-                            """, (' ' + term, postcode))
-            self._cache.postcodes.add(postcode)
+            if postcode not in self._cache.postcodes:
+                term = self.make_standard_word(postcode)
+                if not term:
+                    return
+
+                with self.conn.cursor() as cur:
+                    # no word_id needed for postcodes
+                    cur.execute("""INSERT INTO word (word, word_token, class, type,
+                                                     search_name_count)
+                                   (SELECT pc, %s, 'place', 'postcode', 0
+                                    FROM (VALUES (%s)) as v(pc)
+                                    WHERE NOT EXISTS
+                                     (SELECT * FROM word
+                                      WHERE word = pc and class='place' and type='postcode'))
+                                """, (' ' + term, postcode))
+                self._cache.postcodes.add(postcode)

    @staticmethod
    def _split_housenumbers(hnrs):
--- a/nominatim/tokenizer/legacy_tokenizer.py
+++ b/nominatim/tokenizer/legacy_tokenizer.py
@@ -305,13 +305,51 @@ class LegacyNameAnalyzer:
        return self.normalizer.transliterate(phrase)


-    def add_postcodes_from_db(self):
-        """ Add postcodes from the location_postcode table to the word table.
+    @staticmethod
+    def normalize_postcode(postcode):
+        """ Convert the postcode to a standardized form.
+
+            This function must yield exactly the same result as the SQL function
+            'token_normalized_postcode()'.
+        """
+        return postcode.strip().upper()
+
+
+    def update_postcodes_from_db(self):
+        """ Update postcode tokens in the word table from the location_postcode
+            table.
        """
        with self.conn.cursor() as cur:
-            cur.execute("""SELECT count(create_postcode_id(pc))
-                           FROM (SELECT distinct(postcode) as pc
-                                 FROM location_postcode) x""")
+            # This finds us the rows in location_postcode and word that are
+            # missing in the other table.
+            cur.execute("""SELECT * FROM
+                            (SELECT pc, word FROM
+                              (SELECT distinct(postcode) as pc FROM location_postcode) p
+                              FULL JOIN
+                              (SELECT word FROM word
+                                WHERE class ='place' and type = 'postcode') w
+                              ON pc = word) x
+                           WHERE pc is null or word is null""")
+
+            to_delete = []
+            to_add = []
+
+            for postcode, word in cur:
+                if postcode is None:
+                    to_delete.append(word)
+                else:
+                    to_add.append(postcode)
+
+            if to_delete:
+                cur.execute("""DELETE FROM WORD
+                               WHERE class ='place' and type = 'postcode'
+                                     and word = any(%s)
+                            """, (to_delete, ))
+            if to_add:
+                cur.execute("""SELECT count(create_postcode_id(pc))
+                               FROM unnest(%s) as pc
+                            """, (to_add, ))
+


    def update_special_phrases(self, phrases, should_replace):
@@ -416,12 +454,8 @@ class LegacyNameAnalyzer:
    def _add_postcode(self, postcode):
        """ Make sure the normalized postcode is present in the word table.
        """
-        def _create_postcode_from_db(pcode):
-            with self.conn.cursor() as cur:
-                cur.execute('SELECT create_postcode_id(%s)', (pcode, ))
-
        if re.search(r'[:,;]', postcode) is None:
-            self._cache.postcodes.get(postcode.strip().upper(), _create_postcode_from_db)
+            self._cache.add_postcode(self.conn, self.normalize_postcode(postcode))


 class _TokenInfo:
@@ -552,16 +586,19 @@ class _TokenCache:
                           FROM generate_series(1, 100) as i""")
            self._cached_housenumbers = {str(r[0]) : r[1] for r in cur}

-        # Get postcodes that are already saved
-        postcodes = OrderedDict()
-        with conn.cursor() as cur:
-            cur.execute("""SELECT word FROM word
-                           WHERE class ='place' and type = 'postcode'""")
-            for row in cur:
-                postcodes[row[0]] = None
-        self.postcodes = _LRU(maxsize=32, init_data=postcodes)
+        # For postcodes remember the ones that have already been added
+        self.postcodes = set()

    def get_housenumber(self, number):
        """ Get a housenumber token from the cache.
        """
        return self._cached_housenumbers.get(number)
+
+
+    def add_postcode(self, conn, postcode):
+        """ Make sure the given postcode is in the database.
+        """
+        if postcode not in self.postcodes:
+            with conn.cursor() as cur:
+                cur.execute('SELECT create_postcode_id(%s)', (postcode, ))
+            self.postcodes.add(postcode)