switch postcode tokens to new word table layout

2026-02-26 11:08:13 +00:00 · 2021-07-20 12:11:12 +02:00
parent 5ab0a63fd6
commit 5394b1fa1b
3 changed files with 26 additions and 17 deletions
--- a/lib-php/tokenizer/legacy_icu_tokenizer.php
+++ b/lib-php/tokenizer/legacy_icu_tokenizer.php
@@ -147,7 +147,7 @@ class Tokenizer
    {
        // Check which tokens we have, get the ID numbers and the
        // country code / postcode stored in the info column.
        $sSQL = 'SELECT word_id, word_token, type,';
-        $sSQL .= "      info->>'cc' as country";
+        $sSQL .= "      info->>'cc' as country, info->>'postcode' as postcode";
        $sSQL .= ' FROM word WHERE word_token in (';
        $sSQL .= join(',', $this->oDB->getDBQuotedList($aTokens)).')';
@@ -171,6 +171,16 @@ class Tokenizer
                case 'H':  // house number tokens
                    $oToken = new Token\HouseNumber($iId, $aWord['word_token']);
                    break;
                case 'P':  // postcode tokens
                    // Postcodes are not normalized, so they may have content
                    // that makes SQL injection possible. Reject postcodes
                    // that are missing or would need special escaping, i.e.
                    // whose escaped form differs from the stored value.
                    // NOTE(review): inside a switch a bare 'continue' acts
                    // like 'break' in PHP; if the intent is to skip to the
                    // next word of an enclosing loop, 'continue 2' is needed
                    // here and in the default branch - confirm.
                    if ($aWord['postcode'] === null
                        || pg_escape_string($aWord['postcode']) != $aWord['postcode']
                    ) {
                       continue;
                    }
                    $oToken = new Token\Postcode($iId, $aWord['postcode'], null);
                    break;
                default:
                    continue;
            }
--- a/lib-sql/tokenizer/icu_tokenizer_tables.sql
+++ b/lib-sql/tokenizer/icu_tokenizer_tables.sql
@@ -10,7 +10,12 @@ CREATE INDEX idx_word_word_token ON word
    USING BTREE (word_token) {{db.tablespace.search_index}};
 -- Used when updating country names from the boundary relation.
 CREATE INDEX idx_word_country_names ON word
-    USING btree((info->>'cc')) WHERE type = 'C';
+    USING btree((info->>'cc')) {{db.tablespace.address_index}}
    WHERE type = 'C';
 -- Used when inserting new postcodes on updates.
 CREATE INDEX idx_word_postcodes ON word
    USING btree((info->>'postcode')) {{db.tablespace.address_index}}
    WHERE type = 'P';
 GRANT SELECT ON word TO "{{config.DATABASE_WEBUSER}}";
 DROP SEQUENCE IF EXISTS seq_word;
--- a/nominatim/tokenizer/legacy_icu_tokenizer.py
+++ b/nominatim/tokenizer/legacy_icu_tokenizer.py
@@ -276,8 +276,7 @@ class LegacyICUNameAnalyzer:
                            (SELECT pc, word FROM
                              (SELECT distinct(postcode) as pc FROM location_postcode) p
                              FULL JOIN
-                              (SELECT word FROM word
+                              (SELECT info->>'postcode' as word FROM word WHERE type = 'P') w
                                WHERE class ='place' and type = 'postcode') w
                              ON pc = word) x
                           WHERE pc is null or word is null""")
@@ -286,20 +285,16 @@ class LegacyICUNameAnalyzer:
                    if postcode is None:
                        to_delete.append(word)
                    else:
-                        copystr.add(
+                        copystr.add(self.name_processor.get_search_normalized(postcode),
-                            postcode,
+                                    'P', {'postcode': postcode})
                            ' ' + self.name_processor.get_search_normalized(postcode),
                            'place', 'postcode', 0)
                if to_delete:
                    cur.execute("""DELETE FROM WORD
-                                   WHERE class ='place' and type = 'postcode'
+                                   WHERE class ='P' and info->>'postcode' = any(%s)
                                         and word = any(%s)
                                """, (to_delete, ))
                copystr.copy_out(cur, 'word',
-                                 columns=['word', 'word_token', 'class', 'type',
+                                 columns=['word_token', 'type', 'info'])
                                          'search_name_count'])
    def update_special_phrases(self, phrases, should_replace):
@@ -503,14 +498,13 @@ class LegacyICUNameAnalyzer:
                with self.conn.cursor() as cur:
                    # no word_id needed for postcodes
-                    cur.execute("""INSERT INTO word (word, word_token, class, type,
+                    cur.execute("""INSERT INTO word (word_token, type, info)
-                                                     search_name_count)
+                                   (SELECT %s, 'P', json_build_object('postcode', pc)
                                   (SELECT pc, %s, 'place', 'postcode', 0
                                    FROM (VALUES (%s)) as v(pc)
                                    WHERE NOT EXISTS
                                     (SELECT * FROM word
-                                      WHERE word = pc and class='place' and type='postcode'))
+                                      WHERE type = 'P' and info->>postcode = pc))
-                                """, (' ' + term, postcode))
+                                """, (term, postcode))
                self._cache.postcodes.add(postcode)