introduce and use analyzer for postcodes

This commit is contained in:
Sarah Hoffmann
2022-05-24 21:45:06 +02:00
parent 18864afa8a
commit ca7b46511d
5 changed files with 114 additions and 20 deletions

View File

@@ -223,3 +223,26 @@ BEGIN
END; END;
$$ $$
LANGUAGE plpgsql; LANGUAGE plpgsql;
-- Make sure the given postcode is listed in the word table.
--
-- postcode      normalized postcode, stored in the 'word' column
-- lookup_terms  transliterated search terms, stored as 'word_token' rows
--
-- Returns TRUE when the postcode was already present (nothing inserted),
-- FALSE when new rows were added.
CREATE OR REPLACE FUNCTION create_postcode_word(postcode TEXT, lookup_terms TEXT[])
RETURNS BOOLEAN
AS $$
DECLARE
existing INTEGER;
BEGIN
-- Check whether any word entry of type 'P' already stores this postcode.
SELECT count(*) INTO existing
FROM word WHERE word = postcode and type = 'P';
IF existing > 0 THEN
RETURN TRUE;
END IF;
-- postcodes don't need word ids
-- One row per lookup term, all pointing at the same postcode word.
INSERT INTO word (word_token, type, word)
SELECT lookup_term, 'P', postcode FROM unnest(lookup_terms) as lookup_term;
RETURN FALSE;
END;
$$
LANGUAGE plpgsql;

View File

@@ -11,7 +11,6 @@ libICU instead of the PostgreSQL module.
import itertools import itertools
import json import json
import logging import logging
import re
from textwrap import dedent from textwrap import dedent
from nominatim.db.connection import connect from nominatim.db.connection import connect
@@ -473,7 +472,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
def _process_place_address(self, token_info, address): def _process_place_address(self, token_info, address):
for item in address: for item in address:
if item.kind == 'postcode': if item.kind == 'postcode':
self._add_postcode(item.name) token_info.set_postcode(self._add_postcode(item))
elif item.kind == 'housenumber': elif item.kind == 'housenumber':
token_info.add_housenumber(*self._compute_housenumber_token(item)) token_info.add_housenumber(*self._compute_housenumber_token(item))
elif item.kind == 'street': elif item.kind == 'street':
@@ -605,26 +604,36 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
return full_tokens, partial_tokens return full_tokens, partial_tokens
def _add_postcode(self, postcode): def _add_postcode(self, item):
""" Make sure the normalized postcode is present in the word table. """ Make sure the normalized postcode is present in the word table.
""" """
if re.search(r'[:,;]', postcode) is None: analyzer = self.token_analysis.get_analyzer('@postcode')
postcode = self.normalize_postcode(postcode)
if postcode not in self._cache.postcodes: if analyzer is None:
term = self._search_normalized(postcode) postcode_name = item.name.strip().upper()
if not term: variant_base = None
return else:
postcode_name = analyzer.normalize(item.name)
variant_base = item.get_attr("variant")
with self.conn.cursor() as cur: if variant_base is not None:
# no word_id needed for postcodes postcode = f'{postcode_name}@{variant_base}'
cur.execute("""INSERT INTO word (word_token, type, word) else:
(SELECT %s, 'P', pc FROM (VALUES (%s)) as v(pc) postcode = postcode_name
WHERE NOT EXISTS
(SELECT * FROM word if postcode not in self._cache.postcodes:
WHERE type = 'P' and word = pc)) term = self._search_normalized(postcode_name)
""", (term, postcode)) if not term:
self._cache.postcodes.add(postcode) return
variants = {term}
if analyzer is not None and variant_base is not None:
variants.update(analyzer.get_variants_ascii(variant_base))
with self.conn.cursor() as cur:
cur.execute("SELECT create_postcode_word(%s, %s)",
(postcode, list(variants)))
self._cache.postcodes.add(postcode)
class _TokenInfo: class _TokenInfo:
@@ -637,6 +646,7 @@ class _TokenInfo:
self.street_tokens = set() self.street_tokens = set()
self.place_tokens = set() self.place_tokens = set()
self.address_tokens = {} self.address_tokens = {}
self.postcode = None
@staticmethod @staticmethod
@@ -701,6 +711,11 @@ class _TokenInfo:
if partials: if partials:
self.address_tokens[key] = self._mk_array(partials) self.address_tokens[key] = self._mk_array(partials)
def set_postcode(self, postcode):
""" Set the postcode to the given one.
"""
self.postcode = postcode
class _TokenCache: class _TokenCache:
""" Cache for token information to avoid repeated database queries. """ Cache for token information to avoid repeated database queries.

View File

@@ -98,7 +98,7 @@ class _PostcodeSanitizer:
obj.address.pop(pos) obj.address.pop(pos)
else: else:
postcode.name = formatted[0] postcode.name = formatted[0]
postcode.set_attr('lookup', formatted[1]) postcode.set_attr('variant', formatted[1])
def scan(self, postcode, country): def scan(self, postcode, country):

View File

@@ -0,0 +1,54 @@
# SPDX-License-Identifier: GPL-2.0-only
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2022 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Specialized processor for postcodes. Supports a 'variant' attribute on the
token, which produces spelling variants with optional spaces.
"""
from nominatim.tokenizer.token_analysis.generic_mutation import MutationVariantGenerator
### Configuration section
def configure(rules, normalization_rules): # pylint: disable=W0613
    """ Set up the analysis configuration.

        The postcode analyzer has no configurable options, so both
        arguments are ignored and no configuration object is produced.
    """
    return None
### Analysis section
def create(normalizer, transliterator, config): # pylint: disable=W0613
    """ Create a new token analysis instance for this module.

        The config argument is ignored because this analyzer takes no
        options; only the normalizer and transliterator are used.
    """
    analysis = PostcodeTokenAnalysis(normalizer, transliterator)
    return analysis
class PostcodeTokenAnalysis:
    """ Special token analysis for postcodes.

        Brings a postcode into its standard form and computes the
        transliterated spelling variants used for lookup, where every
        space in the variant base may be present or absent.
        (The original docstring — "Detects common housenumber patterns" —
        was copied from the housenumber analyzer and did not describe
        this class.)
    """
    def __init__(self, norm, trans):
        # norm: transliterator applying the normalization rules
        # trans: transliterator producing the searchable ASCII form
        self.norm = norm
        self.trans = trans
        # Each space in a term expands into two variants: with and
        # without the space.
        self.mutator = MutationVariantGenerator(' ', (' ', ''))

    def normalize(self, name):
        """ Return the standard form of the postcode: surrounding
            whitespace stripped and letters upper-cased.
        """
        return name.strip().upper()

    def get_variants_ascii(self, norm_name):
        """ Compute the spelling variants for the given normalized postcode.

            The official form creates one variant. If a 'lookup version' is
            given, then it will create variants with optional spaces.
        """
        # Postcodes follow their own transliteration rules.
        # Make sure at this point, that the terms are normalized in a way
        # that they are searchable with the standard transliteration rules.
        return [self.trans.transliterate(term) for term in
                self.mutator.generate([self.norm.transliterate(norm_name)])]

View File

@@ -34,7 +34,7 @@ sanitizers:
- (\A|.*,)[^\d,]{3,}(,.*|\Z) - (\A|.*,)[^\d,]{3,}(,.*|\Z)
- step: clean-postcodes - step: clean-postcodes
convert-to-address: yes convert-to-address: yes
default-pattern: [A-Z0-9- ]{3,12} default-pattern: "[A-Z0-9- ]{3,12}"
- step: split-name-list - step: split-name-list
- step: strip-brace-terms - step: strip-brace-terms
- step: tag-analyzer-by-language - step: tag-analyzer-by-language
@@ -46,6 +46,8 @@ token-analysis:
- analyzer: generic - analyzer: generic
- id: "@housenumber" - id: "@housenumber"
analyzer: housenumbers analyzer: housenumbers
- id: "@postcode"
analyzer: postcodes
- id: bg - id: bg
analyzer: generic analyzer: generic
mode: variant-only mode: variant-only