handle postcodes properly on word table updates

update_postcodes_from_db() needs to do the full postcode treatment in order to derive the correct word table entries.
2026-02-26 11:08:13 +00:00 · 2022-06-21 22:05:35 +02:00
parent 5be320368c
commit 612d34930b
5 changed files with 214 additions and 54 deletions
--- a/nominatim/tokenizer/icu_tokenizer.py
+++ b/nominatim/tokenizer/icu_tokenizer.py
@@ -290,33 +290,72 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
        """ Update postcode tokens in the word table from the location_postcode
            table.
        """
-        to_delete = []
+        analyzer = self.token_analysis.analysis.get('@postcode')
        with self.conn.cursor() as cur:
-            # This finds us the rows in location_postcode and word that are
+            # First get all postcode names currently in the word table.
-            # missing in the other table.
+            cur.execute("SELECT DISTINCT word FROM word WHERE type = 'P'")
-            cur.execute("""SELECT * FROM
+            word_entries = set((entry[0] for entry in cur))
                            (SELECT pc, word FROM
                              (SELECT distinct(postcode) as pc FROM location_postcode) p
                              FULL JOIN
                              (SELECT word FROM word WHERE type = 'P') w
                              ON pc = word) x
                           WHERE pc is null or word is null""")
-            with CopyBuffer() as copystr:
+            # Then compute the required postcode names from the postcode table.
-                for postcode, word in cur:
+            needed_entries = set()
-                    if postcode is None:
+            cur.execute("SELECT country_code, postcode FROM location_postcode")
-                        to_delete.append(word)
+            for cc, postcode in cur:
-                    else:
+                info = PlaceInfo({'country_code': cc,
-                        copystr.add(self._search_normalized(postcode),
+                                  'class': 'place', 'type': 'postcode',
-                                    'P', postcode)
+                                  'address': {'postcode': postcode}})
                address = self.sanitizer.process_names(info)[1]
                for place in address:
                    if place.kind == 'postcode':
                        if analyzer is None:
                            postcode_name = place.name.strip().upper()
                            variant_base = None
                        else:
                            postcode_name = analyzer.normalize(place.name)
                            variant_base = place.get_attr("variant")
                        if variant_base:
                            needed_entries.add(f'{postcode_name}@{variant_base}')
                        else:
                            needed_entries.add(postcode_name)
                        break
        # Now update the word table.
        self._delete_unused_postcode_words(word_entries - needed_entries)
        self._add_missing_postcode_words(needed_entries - word_entries)
    def _delete_unused_postcode_words(self, tokens):
        """ Remove the postcode entries (type 'P') whose word is listed
            in `tokens` from the word table. Nothing is sent to the
            database when `tokens` is empty.
        """
        if not tokens:
            return

        obsolete = list(tokens)
        with self.conn.cursor() as cur:
            cur.execute("DELETE FROM word WHERE type = 'P' and word = any(%s)",
                        (obsolete, ))
    def _add_missing_postcode_words(self, tokens):
        """ Create word table entries for the given postcode names.

            Each name in `tokens` is either a plain postcode or has the
            form '<postcode>@<variant base>'. For a plain name the only
            lookup term is the search-normalized name. For the second form
            the lookup terms are the search-normalized postcode plus, when
            a '@postcode' token analyzer is configured, the variants that
            the analyzer generates for the variant base.

            Nothing is done when `tokens` is empty.
        """
        if not tokens:
            return

        analyzer = self.token_analysis.analysis.get('@postcode')
        terms = []

        for postcode_name in tokens:
            if '@' in postcode_name:
                # Split at the first '@' only. A larger maxsplit could yield
                # three parts and break the two-name unpacking.
                term, variant = postcode_name.split('@', 1)
                variants = {self._search_normalized(term)}
                if analyzer is not None:
                    variants.update(analyzer.get_variants_ascii(variant))
                # The set is only used for de-duplication. Always hand a
                # list to the database layer, also without an analyzer.
                variants = list(variants)
            else:
                variants = [self._search_normalized(postcode_name)]
            terms.append((postcode_name, variants))

        if terms:
            with self.conn.cursor() as cur:
                cur.execute_values("""SELECT create_postcode_word(pc, var)
                                      FROM (VALUES %s) AS v(pc, var)""",
                                   terms)
                if to_delete:
                    cur.execute("""DELETE FROM WORD
                                   WHERE type ='P' and word = any(%s)
                                """, (to_delete, ))
                copystr.copy_out(cur, 'word',
                                 columns=['word_token', 'type', 'word'])
    def update_special_phrases(self, phrases, should_replace):
@@ -616,7 +655,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
            postcode_name = analyzer.normalize(item.name)
            variant_base = item.get_attr("variant")
-        if variant_base is not None:
+        if variant_base:
            postcode = f'{postcode_name}@{variant_base}'
        else:
            postcode = postcode_name
@@ -627,7 +666,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
                return None
            variants = {term}
-            if analyzer is not None and variant_base is not None:
+            if analyzer is not None and variant_base:
                variants.update(analyzer.get_variants_ascii(variant_base))
            with self.conn.cursor() as cur:
--- a/nominatim/tokenizer/token_analysis/postcodes.py
+++ b/nominatim/tokenizer/token_analysis/postcodes.py
@@ -25,8 +25,18 @@ def create(normalizer, transliterator, config): # pylint: disable=W0613
    """
    return PostcodeTokenAnalysis(normalizer, transliterator)
 class PostcodeTokenAnalysis:
-    """ Detects common housenumber patterns and normalizes them.
+    """ Special normalization and variant generation for postcodes.
        This analyser must not be used with anything but postcodes as
        it follows some special rules: `normalize` doesn't necessarily
        need to return a standard form as per normalization rules. It
        needs to return the canonical form of the postcode that is also
        used for output. `get_variants_ascii` then needs to ensure that
        the generated variants once more follow the standard normalization
        and transliteration, so that postcodes are correctly recognised by
        the search algorithm.
    """
    def __init__(self, norm, trans):
        self.norm = norm
@@ -44,11 +54,12 @@ class PostcodeTokenAnalysis:
    def get_variants_ascii(self, norm_name):
        """ Compute the spelling variants for the given normalized postcode.
-            The official form creates one variant. If a 'lookup version' is
+            Takes the canonical form of the postcode, normalizes it using the
-            given, then it will create variants with optional spaces.
+            standard rules and then creates variants of the result where
            all spaces are optional.
        """
        # Postcodes follow their own transliteration rules.
        # Make sure at this point, that the terms are normalized in a way
        # that they are searchable with the standard transliteration rules.
        return [self.trans.transliterate(term) for term in
-                self.mutator.generate([self.norm.transliterate(norm_name)])]
+                self.mutator.generate([self.norm.transliterate(norm_name)]) if term]
--- a/test/bdd/steps/steps_db_ops.py
+++ b/test/bdd/steps/steps_db_ops.py
@@ -18,13 +18,18 @@ from nominatim.tokenizer import factory as tokenizer_factory
 def check_database_integrity(context):
    """ Check some generic constraints on the tables.
    """
-    # place_addressline should not have duplicate (place_id, address_place_id)
+    with context.db.cursor() as cur:
-    cur = context.db.cursor()
+        # place_addressline should not have duplicate (place_id, address_place_id)
-    cur.execute("""SELECT count(*) FROM
+        cur.execute("""SELECT count(*) FROM
-                    (SELECT place_id, address_place_id, count(*) as c
+                        (SELECT place_id, address_place_id, count(*) as c
-                     FROM place_addressline GROUP BY place_id, address_place_id) x
+                         FROM place_addressline GROUP BY place_id, address_place_id) x
-                   WHERE c > 1""")
+                       WHERE c > 1""")
-    assert cur.fetchone()[0] == 0, "Duplicates found in place_addressline"
+        assert cur.fetchone()[0] == 0, "Duplicates found in place_addressline"
        # word table must not have empty word_tokens
        cur.execute("SELECT count(*) FROM word WHERE word_token = ''")
        assert cur.fetchone()[0] == 0, "Empty word tokens found in word table"
 ################################ GIVEN ##################################
--- a/test/python/tokenizer/test_icu.py
+++ b/test/python/tokenizer/test_icu.py
@@ -72,7 +72,8 @@ def analyzer(tokenizer_factory, test_config, monkeypatch,
    def _mk_analyser(norm=("[[:Punctuation:][:Space:]]+ > ' '",), trans=(':: upper()',),
                     variants=('~gasse -> gasse', 'street => st', ),
-                     sanitizers=[], with_housenumber=False):
+                     sanitizers=[], with_housenumber=False,
                     with_postcode=False):
        cfgstr = {'normalization': list(norm),
                  'sanitizers': sanitizers,
                  'transliteration': list(trans),
@@ -81,6 +82,9 @@ def analyzer(tokenizer_factory, test_config, monkeypatch,
        if with_housenumber:
            cfgstr['token-analysis'].append({'id': '@housenumber',
                                             'analyzer': 'housenumbers'})
        if with_postcode:
            cfgstr['token-analysis'].append({'id': '@postcode',
                                             'analyzer': 'postcodes'})
        (test_config.project_dir / 'icu_tokenizer.yaml').write_text(yaml.dump(cfgstr))
        tok.loader = nominatim.tokenizer.icu_rule_loader.ICURuleLoader(test_config)
@@ -246,28 +250,69 @@ def test_normalize_postcode(analyzer):
        anl.normalize_postcode('38 Б') == '38 Б'
-def test_update_postcodes_from_db_empty(analyzer, table_factory, word_table):
+class TestPostcodes:
    table_factory('location_postcode', 'postcode TEXT',
                  content=(('1234',), ('12 34',), ('AB23',), ('1234',)))
-    with analyzer() as anl:
+    @pytest.fixture(autouse=True)
-        anl.update_postcodes_from_db()
+    def setup(self, analyzer, sql_functions):
-
+        sanitizers = [{'step': 'clean-postcodes'}]
-    assert word_table.count() == 3
+        with analyzer(sanitizers=sanitizers, with_postcode=True) as anl:
-    assert word_table.get_postcodes() == {'1234', '12 34', 'AB23'}
+            self.analyzer = anl
            yield anl
-def test_update_postcodes_from_db_add_and_remove(analyzer, table_factory, word_table):
+    def process_postcode(self, cc, postcode):
-    table_factory('location_postcode', 'postcode TEXT',
+        return self.analyzer.process_place(PlaceInfo({'country_code': cc,
-                  content=(('1234',), ('45BC', ), ('XX45', )))
+                                                      'address': {'postcode': postcode}}))
    word_table.add_postcode(' 1234', '1234')
    word_table.add_postcode(' 5678', '5678')
    with analyzer() as anl:
        anl.update_postcodes_from_db()
-    assert word_table.count() == 3
+    def test_update_postcodes_from_db_empty(self, table_factory, word_table):
-    assert word_table.get_postcodes() == {'1234', '45BC', 'XX45'}
+        table_factory('location_postcode', 'country_code TEXT, postcode TEXT',
                      content=(('de', '12345'), ('se', '132 34'),
                               ('bm', 'AB23'), ('fr', '12345')))
        self.analyzer.update_postcodes_from_db()
        assert word_table.count() == 5
        assert word_table.get_postcodes() == {'12345', '132 34@132 34', 'AB 23@AB 23'}
    def test_update_postcodes_from_db_ambigious(self, table_factory, word_table):
        """ The same postcode string listed for two countries ('in', 'sg')
            must end up as two distinct postcode words and three rows in
            the word table.
        """
        rows = (('in', '123456'), ('sg', '123456'))
        table_factory('location_postcode', 'country_code TEXT, postcode TEXT',
                      content=rows)

        self.analyzer.update_postcodes_from_db()

        expected = {'123456', '123456@123 456'}
        assert word_table.count() == 3
        assert word_table.get_postcodes() == expected
    def test_update_postcodes_from_db_add_and_remove(self, table_factory, word_table):
        """ Starting from a word table that already holds '1234' and
            '5678', the update must leave exactly the postcodes derived
            from location_postcode.
        """
        rows = (('ch', '1234'), ('bm', 'BC 45'), ('bm', 'XX45'))
        table_factory('location_postcode', 'country_code TEXT, postcode TEXT',
                      content=rows)
        # Pre-existing entries: one that is still needed, one that is stale.
        for token, postcode in ((' 1234', '1234'), (' 5678', '5678')):
            word_table.add_postcode(token, postcode)

        self.analyzer.update_postcodes_from_db()

        expected = {'1234', 'BC 45@BC 45', 'XX 45@XX 45'}
        assert word_table.count() == 5
        assert word_table.get_postcodes() == expected
    def test_process_place_postcode_simple(self, word_table):
        """ Processing the postcode '12345' for 'de' returns it unchanged
            and adds it as a plain postcode word.
        """
        token_info = self.process_postcode('de', '12345')

        assert token_info['postcode'] == '12345'

        stored = word_table.get_postcodes()
        assert stored == {'12345', }
    def test_process_place_postcode_with_space(self, word_table):
        """ Processing '123 567' for 'in' returns the postcode '123567'
            and stores the word entry '123567@123 567'.
        """
        token_info = self.process_postcode('in', '123 567')

        assert token_info['postcode'] == '123567'

        stored = word_table.get_postcodes()
        assert stored == {'123567@123 567', }
 def test_update_special_phrase_empty_table(analyzer, word_table):
--- a/test/python/tokenizer/token_analysis/test_analysis_postcodes.py
+++ b/test/python/tokenizer/token_analysis/test_analysis_postcodes.py
@@ -0,0 +1,60 @@
 # SPDX-License-Identifier: GPL-2.0-only
 #
 # This file is part of Nominatim. (https://nominatim.org)
 #
 # Copyright (C) 2022 by the Nominatim developer community.
 # For a full list of authors see the git log.
 """
 Tests for special postcode analysis and variant generation.
 """
 import pytest
 from icu import Transliterator
 import nominatim.tokenizer.token_analysis.postcodes as module
 from nominatim.errors import UsageError
 DEFAULT_NORMALIZATION = """ :: NFD ();
                            '🜳' > ' ';
                            [[:Nonspacing Mark:] [:Cf:]] >;
                            :: lower ();
                            [[:Punctuation:][:Space:]]+ > ' ';
                            :: NFC ();
                        """
 DEFAULT_TRANSLITERATION = """ ::  Latin ();
                              '🜵' > ' ';
                          """
@pytest.fixture
def analyser():
    """ Provide a postcode token analysis object that is built from the
        default normalization and transliteration rules of this test file.
    """
    config = module.configure({'analyzer': 'postcodes'}, DEFAULT_NORMALIZATION)

    transliterator = Transliterator.createFromRules("test_trans",
                                                    DEFAULT_TRANSLITERATION)
    normalizer = Transliterator.createFromRules("test_norm",
                                                DEFAULT_NORMALIZATION)

    return module.create(normalizer, transliterator, config)
 def get_normalized_variants(proc, name):
     """ Normalize `name` with the default normalization rules, strip
         surrounding whitespace and return the variants that `proc`
         computes for the result.
     """
     normalizer = Transliterator.createFromRules("test_norm", DEFAULT_NORMALIZATION)
     normalized = normalizer.transliterate(name).strip()

     return proc.get_variants_ascii(normalized)
@pytest.mark.parametrize('name,norm', [
    ('12', '12'),
    ('A 34 ', 'A 34'),
    ('34-av', '34-AV'),
])
def test_normalize(analyser, name, norm):
    """ `normalize` must return the expected form for each sample. """
    result = analyser.normalize(name)

    assert result == norm
@pytest.mark.parametrize('postcode,variants', [
    ('12345', {'12345'}),
    ('AB-998', {'ab 998', 'ab998'}),
    ('23 FGH D3', {'23 fgh d3', '23fgh d3', '23 fghd3', '23fghd3'}),
])
def test_get_variants_ascii(analyser, postcode, variants):
    """ The generated variants must be free of duplicates and match the
        expected set exactly.
    """
    generated = analyser.get_variants_ascii(postcode)
    unique = set(generated)

    assert len(generated) == len(unique)
    assert unique == variants