introduce and use analyzer for postcodes

This commit is contained in:
Sarah Hoffmann
2022-05-24 21:45:06 +02:00
parent 18864afa8a
commit ca7b46511d
5 changed files with 114 additions and 20 deletions

View File

@@ -223,3 +223,26 @@ BEGIN
END; END;
$$ $$
LANGUAGE plpgsql; LANGUAGE plpgsql;
-- Make sure the given postcode is listed in the word table.
--
-- postcode      normalized postcode, stored in the 'word' column
-- lookup_terms  transliterated search terms, stored as 'word_token' rows
--
-- Returns TRUE when the postcode was already present (nothing inserted),
-- FALSE when new rows were added.
CREATE OR REPLACE FUNCTION create_postcode_word(postcode TEXT, lookup_terms TEXT[])
RETURNS BOOLEAN
AS $$
DECLARE
existing INTEGER;
BEGIN
-- Check whether any word entry of type 'P' already stores this postcode.
SELECT count(*) INTO existing
FROM word WHERE word = postcode and type = 'P';
IF existing > 0 THEN
RETURN TRUE;
END IF;
-- postcodes don't need word ids
-- One row per lookup term, all pointing at the same postcode word.
INSERT INTO word (word_token, type, word)
SELECT lookup_term, 'P', postcode FROM unnest(lookup_terms) as lookup_term;
RETURN FALSE;
END;
$$
LANGUAGE plpgsql;

View File

@@ -11,7 +11,6 @@ libICU instead of the PostgreSQL module.
import itertools import itertools
import json import json
import logging import logging
import re
from textwrap import dedent from textwrap import dedent
from nominatim.db.connection import connect from nominatim.db.connection import connect
@@ -473,7 +472,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
def _process_place_address(self, token_info, address): def _process_place_address(self, token_info, address):
for item in address: for item in address:
if item.kind == 'postcode': if item.kind == 'postcode':
self._add_postcode(item.name) token_info.set_postcode(self._add_postcode(item))
elif item.kind == 'housenumber': elif item.kind == 'housenumber':
token_info.add_housenumber(*self._compute_housenumber_token(item)) token_info.add_housenumber(*self._compute_housenumber_token(item))
elif item.kind == 'street': elif item.kind == 'street':
@@ -605,26 +604,36 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
return full_tokens, partial_tokens return full_tokens, partial_tokens
def _add_postcode(self, postcode): def _add_postcode(self, item):
""" Make sure the normalized postcode is present in the word table. """ Make sure the normalized postcode is present in the word table.
""" """
if re.search(r'[:,;]', postcode) is None: analyzer = self.token_analysis.get_analyzer('@postcode')
postcode = self.normalize_postcode(postcode)
if postcode not in self._cache.postcodes: if analyzer is None:
term = self._search_normalized(postcode) postcode_name = item.name.strip().upper()
if not term: variant_base = None
return else:
postcode_name = analyzer.normalize(item.name)
variant_base = item.get_attr("variant")
with self.conn.cursor() as cur: if variant_base is not None:
# no word_id needed for postcodes postcode = f'{postcode_name}@{variant_base}'
cur.execute("""INSERT INTO word (word_token, type, word) else:
(SELECT %s, 'P', pc FROM (VALUES (%s)) as v(pc) postcode = postcode_name
WHERE NOT EXISTS
(SELECT * FROM word if postcode not in self._cache.postcodes:
WHERE type = 'P' and word = pc)) term = self._search_normalized(postcode_name)
""", (term, postcode)) if not term:
self._cache.postcodes.add(postcode) return
variants = {term}
if analyzer is not None and variant_base is not None:
variants.update(analyzer.get_variants_ascii(variant_base))
with self.conn.cursor() as cur:
cur.execute("SELECT create_postcode_word(%s, %s)",
(postcode, list(variants)))
self._cache.postcodes.add(postcode)
class _TokenInfo: class _TokenInfo:
@@ -637,6 +646,7 @@ class _TokenInfo:
self.street_tokens = set() self.street_tokens = set()
self.place_tokens = set() self.place_tokens = set()
self.address_tokens = {} self.address_tokens = {}
self.postcode = None
@staticmethod @staticmethod
@@ -701,6 +711,11 @@ class _TokenInfo:
if partials: if partials:
self.address_tokens[key] = self._mk_array(partials) self.address_tokens[key] = self._mk_array(partials)
def set_postcode(self, postcode):
""" Set the postcode to the given one.
"""
self.postcode = postcode
class _TokenCache: class _TokenCache:
""" Cache for token information to avoid repeated database queries. """ Cache for token information to avoid repeated database queries.

View File

@@ -98,7 +98,7 @@ class _PostcodeSanitizer:
obj.address.pop(pos) obj.address.pop(pos)
else: else:
postcode.name = formatted[0] postcode.name = formatted[0]
postcode.set_attr('lookup', formatted[1]) postcode.set_attr('variant', formatted[1])
def scan(self, postcode, country): def scan(self, postcode, country):

View File

@@ -0,0 +1,54 @@
# SPDX-License-Identifier: GPL-2.0-only
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2022 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Specialized processor for postcodes. Supports a 'variant' attribute on the
token, which produces spelling variants with optional spaces.
"""
from nominatim.tokenizer.token_analysis.generic_mutation import MutationVariantGenerator
### Configuration section
def configure(rules, normalization_rules): # pylint: disable=W0613
    """ Set up the analysis configuration.

        The postcode analyzer has no configurable options, so both
        arguments are ignored and no configuration object is produced.
    """
    return None
### Analysis section
def create(normalizer, transliterator, config): # pylint: disable=W0613
    """ Create a new token analysis instance for this module.

        The config argument is ignored because this analyzer takes no
        options; only the normalizer and transliterator are used.
    """
    analysis = PostcodeTokenAnalysis(normalizer, transliterator)
    return analysis
class PostcodeTokenAnalysis:
    """ Special token analysis for postcodes.

        Brings a postcode into its standard form and computes the
        transliterated spelling variants used for lookup, where every
        space in the variant base may be present or absent.
        (The original docstring — "Detects common housenumber patterns" —
        was copied from the housenumber analyzer and did not describe
        this class.)
    """
    def __init__(self, norm, trans):
        # norm: transliterator applying the normalization rules
        # trans: transliterator producing the searchable ASCII form
        self.norm = norm
        self.trans = trans
        # Each space in a term expands into two variants: with and
        # without the space.
        self.mutator = MutationVariantGenerator(' ', (' ', ''))

    def normalize(self, name):
        """ Return the standard form of the postcode: surrounding
            whitespace stripped and letters upper-cased.
        """
        return name.strip().upper()

    def get_variants_ascii(self, norm_name):
        """ Compute the spelling variants for the given normalized postcode.

            The official form creates one variant. If a 'lookup version' is
            given, then it will create variants with optional spaces.
        """
        # Postcodes follow their own transliteration rules.
        # Make sure at this point, that the terms are normalized in a way
        # that they are searchable with the standard transliteration rules.
        return [self.trans.transliterate(term) for term in
                self.mutator.generate([self.norm.transliterate(norm_name)])]

View File

@@ -34,7 +34,7 @@ sanitizers:
- (\A|.*,)[^\d,]{3,}(,.*|\Z) - (\A|.*,)[^\d,]{3,}(,.*|\Z)
- step: clean-postcodes - step: clean-postcodes
convert-to-address: yes convert-to-address: yes
default-pattern: [A-Z0-9- ]{3,12} default-pattern: "[A-Z0-9- ]{3,12}"
- step: split-name-list - step: split-name-list
- step: strip-brace-terms - step: strip-brace-terms
- step: tag-analyzer-by-language - step: tag-analyzer-by-language
@@ -46,6 +46,8 @@ token-analysis:
- analyzer: generic - analyzer: generic
- id: "@housenumber" - id: "@housenumber"
analyzer: housenumbers analyzer: housenumbers
- id: "@postcode"
analyzer: postcodes
- id: bg - id: bg
analyzer: generic analyzer: generic
mode: variant-only mode: variant-only