move postcode matcher into a separate file

This commit is contained in:
Sarah Hoffmann
2022-06-06 23:37:04 +02:00
parent bf86b45178
commit 80ea13437d
5 changed files with 103 additions and 73 deletions

View File

View File

@@ -0,0 +1,97 @@
# SPDX-License-Identifier: GPL-2.0-only
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2022 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Functions for formatting postcodes according to their country-specific
format.
"""
import re
from nominatim.errors import UsageError
from nominatim.tools import country_info
class CountryPostcodeMatcher:
""" Matches and formats a postcode according to a format definition
of the given country.
"""
def __init__(self, country_code, config):
if 'pattern' not in config:
raise UsageError("Field 'pattern' required for 'postcode' "
f"for country '{country_code}'")
pc_pattern = config['pattern'].replace('d', '[0-9]').replace('l', '[A-Z]')
self.norm_pattern = re.compile(f'\\s*(?:{country_code.upper()}[ -]?)?(.*)\\s*')
self.pattern = re.compile(pc_pattern)
self.output = config.get('output', r'\g<0>')
def match(self, postcode):
""" Match the given postcode against the postcode pattern for this
matcher. Returns a `re.Match` object if the match was successful
and None otherwise.
"""
# Upper-case, strip spaces and leading country code.
normalized = self.norm_pattern.fullmatch(postcode.upper())
if normalized:
return self.pattern.fullmatch(normalized.group(1))
return None
def normalize(self, match):
""" Return the default format of the postcode for the given match.
`match` must be a `re.Match` object previously returned by
`match()`
"""
return match.expand(self.output)
class PostcodeFormatter:
    """ Collection of the country-specific postcode matchers together
        with a fallback matcher for countries without their own format.
    """

    def __init__(self):
        # Objects without a country code can't have a postcode per definition.
        self.country_without_postcode = {None}
        self.country_matcher = {}
        self.default_matcher = CountryPostcodeMatcher('', {'pattern': '.*'})

        for ccode, prop in country_info.iterate('postcode'):
            if isinstance(prop, dict):
                # A dictionary holds the format definition of the country.
                self.country_matcher[ccode] = CountryPostcodeMatcher(ccode, prop)
                continue

            if prop is False:
                # An explicit False marks a country without postcodes.
                self.country_without_postcode.add(ccode)
                continue

            raise UsageError(f"Invalid entry 'postcode' for country '{ccode}'")

    def _matcher_for(self, country_code):
        """ Return the matcher registered for the country or the default
            matcher when the country has none.
        """
        return self.country_matcher.get(country_code, self.default_matcher)

    def set_default_pattern(self, pattern):
        """ Replace the fallback matcher with one built from `pattern`.
            The fallback is used for every country that has no matcher
            of its own.
        """
        self.default_matcher = CountryPostcodeMatcher('', {'pattern': pattern})

    def match(self, country_code, postcode):
        """ Check the postcode against the format of the given country.
            Returns None when the country is registered as having no
            postcodes. Otherwise returns whatever the responsible matcher's
            `match()` returns: a `re.Match` object on success or None
            on failure.
        """
        if country_code in self.country_without_postcode:
            return None

        matcher = self._matcher_for(country_code)
        return matcher.match(postcode)

    def normalize(self, country_code, match):
        """ Format the postcode of a successful match in the default
            output format of the country. `match` must be a `re.Match`
            object previously returned by `match()`.
        """
        matcher = self._matcher_for(country_code)
        return matcher.normalize(match)

View File

@@ -607,7 +607,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
def _add_postcode(self, item): def _add_postcode(self, item):
""" Make sure the normalized postcode is present in the word table. """ Make sure the normalized postcode is present in the word table.
""" """
analyzer = self.token_analysis.get_analyzer('@postcode') analyzer = self.token_analysis.analysis.get('@postcode')
if analyzer is None: if analyzer is None:
postcode_name = item.name.strip().upper() postcode_name = item.name.strip().upper()

View File

@@ -16,70 +16,17 @@ Arguments:
When set to 'no', non-conforming postcodes are not When set to 'no', non-conforming postcodes are not
searchable either. searchable either.
""" """
import re from nominatim.data.postcode_format import PostcodeFormatter
from nominatim.errors import UsageError
from nominatim.tools import country_info
class _PostcodeMatcher:
""" Matches and formats a postcode according to the format definition.
"""
def __init__(self, country_code, config):
if 'pattern' not in config:
raise UsageError("Field 'pattern' required for 'postcode' "
f"for country '{country_code}'")
pc_pattern = config['pattern'].replace('d', '[0-9]').replace('l', '[A-Z]')
self.norm_pattern = re.compile(f'\\s*(?:{country_code.upper()}[ -]?)?(.*)\\s*')
self.pattern = re.compile(pc_pattern)
self.output = config.get('output', r'\g<0>')
def match(self, postcode):
""" Match the given postcode against the postcode pattern for this
matcher. Returns a `re.Match` object if the match was successful
and None otherwise.
"""
# Upper-case, strip spaces and leading country code.
normalized = self.norm_pattern.fullmatch(postcode.upper())
if normalized:
return self.pattern.fullmatch(normalized.group(1))
return None
def normalize(self, match):
""" Return the default format of the postcode for the given match.
`match` must be a `re.Match` object previously returned by
`match()`
"""
return match.expand(self.output)
class _PostcodeSanitizer: class _PostcodeSanitizer:
def __init__(self, config): def __init__(self, config):
self.convert_to_address = config.get_bool('convert-to-address', True) self.convert_to_address = config.get_bool('convert-to-address', True)
# Objects without a country code can't have a postcode per definition. self.matcher = PostcodeFormatter()
self.country_without_postcode = {None}
self.country_matcher = {}
for ccode, prop in country_info.iterate('postcode'):
if prop is False:
self.country_without_postcode.add(ccode)
elif isinstance(prop, dict):
self.country_matcher[ccode] = _PostcodeMatcher(ccode, prop)
else:
raise UsageError(f"Invalid entry 'postcode' for country '{ccode}'")
default_pattern = config.get('default-pattern') default_pattern = config.get('default-pattern')
if default_pattern is not None and isinstance(default_pattern, str): if default_pattern is not None and isinstance(default_pattern, str):
self.default_matcher = _PostcodeMatcher('', {'pattern': default_pattern}) self.matcher.set_default_pattern(default_pattern)
else:
self.default_matcher = None
def __call__(self, obj): def __call__(self, obj):
@@ -106,18 +53,11 @@ class _PostcodeSanitizer:
normalized version. Returns None if the postcode does not normalized version. Returns None if the postcode does not
correspond to the oficial format of the given country. correspond to the oficial format of the given country.
""" """
if country in self.country_without_postcode: match = self.matcher.match(country, postcode)
return None
matcher = self.country_matcher.get(country, self.default_matcher)
if matcher is None:
return postcode.upper(), ''
match = matcher.match(postcode)
if match is None: if match is None:
return None return None
return matcher.normalize(match), ' '.join(match.groups()) return self.matcher.normalize(country, match), ' '.join(match.groups())

View File

@@ -437,13 +437,6 @@ class TestPlaceAddress:
assert word_table.get_postcodes() == {pcode, } assert word_table.get_postcodes() == {pcode, }
@pytest.mark.parametrize('pcode', ['12:23', 'ab;cd;f', '123;836'])
def test_process_place_bad_postcode(self, word_table, pcode):
self.process_address(postcode=pcode)
assert not word_table.get_postcodes()
@pytest.mark.parametrize('hnr', ['123a', '1', '101']) @pytest.mark.parametrize('hnr', ['123a', '1', '101'])
def test_process_place_housenumbers_simple(self, hnr, getorcreate_hnr_id): def test_process_place_housenumbers_simple(self, hnr, getorcreate_hnr_id):
info = self.process_address(housenumber=hnr) info = self.process_address(housenumber=hnr)