mirror of
https://github.com/osm-search/Nominatim.git
synced 2026-02-26 11:08:13 +00:00
move postcode matcher in a separate file
This commit is contained in:
0
nominatim/data/__init__.py
Normal file
0
nominatim/data/__init__.py
Normal file
97
nominatim/data/postcode_format.py
Normal file
97
nominatim/data/postcode_format.py
Normal file
@@ -0,0 +1,97 @@
|
|||||||
|
# SPDX-License-Identifier: GPL-2.0-only
|
||||||
|
#
|
||||||
|
# This file is part of Nominatim. (https://nominatim.org)
|
||||||
|
#
|
||||||
|
# Copyright (C) 2022 by the Nominatim developer community.
|
||||||
|
# For a full list of authors see the git log.
|
||||||
|
"""
|
||||||
|
Functions for formatting postcodes according to their country-specific
|
||||||
|
format.
|
||||||
|
"""
|
||||||
|
import re
|
||||||
|
|
||||||
|
from nominatim.errors import UsageError
|
||||||
|
from nominatim.tools import country_info
|
||||||
|
|
||||||
|
class CountryPostcodeMatcher:
    """ Matches and formats a postcode according to a format definition
        of the given country.

        The format definition is a dictionary with the mandatory field
        'pattern' and the optional field 'output'. In the pattern every
        'd' is expanded to the character class for a digit and every 'l'
        to the character class for an upper-case ASCII letter; the rest
        is used as a regular expression as is. 'output' is a replacement
        template as understood by `re.Match.expand()`. When it is
        missing, the matched postcode is returned unchanged.
    """

    def __init__(self, country_code, config):
        """ Create a matcher for the country with the given country code.

            Raises a UsageError when `config` has no field 'pattern'.
        """
        if 'pattern' not in config:
            raise UsageError("Field 'pattern' required for 'postcode' "
                             f"for country '{country_code}'")

        pc_pattern = config['pattern'].replace('d', '[0-9]').replace('l', '[A-Z]')

        # The postcode pattern is embedded into the normalisation pattern
        # instead of a catch-all '(.*)'. A greedy catch-all swallows
        # trailing whitespace, which then makes the strict match fail.
        # Embedding the pattern also lets the regex engine backtrack over
        # the optional country prefix when the postcode itself starts with
        # the letters of the country code.
        prefix = re.escape(country_code.upper())
        self.norm_pattern = re.compile(f'\\s*(?:{prefix}[ -]?)?({pc_pattern})\\s*')
        self.pattern = re.compile(pc_pattern)

        self.output = config.get('output', r'\g<0>')

    def match(self, postcode):
        """ Match the given postcode against the postcode pattern for this
            matcher. Returns a `re.Match` object if the match was successful
            and None otherwise.

            The postcode is upper-cased before matching. Surrounding
            whitespace and a leading country code (optionally followed by
            a single space or dash) are not part of the returned match.
        """
        # Upper-case, strip spaces and leading country code.
        normalized = self.norm_pattern.fullmatch(postcode.upper())

        if normalized is None:
            return None

        # Match again against the bare pattern, so that group numbers
        # in the result correspond to the groups of the configured pattern.
        return self.pattern.fullmatch(normalized.group(1))

    def normalize(self, match):
        """ Return the default format of the postcode for the given match.
            `match` must be a `re.Match` object previously returned by
            `match()`
        """
        return match.expand(self.output)
|
||||||
|
|
||||||
|
|
||||||
|
class PostcodeFormatter:
    """ Collection of the postcode formats of all countries together
        with functions to match and format postcodes per country.
    """

    def __init__(self):
        """ Set up the matchers from the 'postcode' entries that
            `country_info.iterate()` reports.

            Raises a UsageError when an entry is neither False nor a
            dictionary.
        """
        # Fallback for countries without a format definition: accept anything.
        self.default_matcher = CountryPostcodeMatcher('', {'pattern': '.*'})
        self.country_matcher = {}
        # Objects without a country code can't have a postcode per definition.
        self.country_without_postcode = {None}

        for code, definition in country_info.iterate('postcode'):
            if definition is False:
                self.country_without_postcode.add(code)
                continue

            if not isinstance(definition, dict):
                raise UsageError(f"Invalid entry 'postcode' for country '{code}'")

            self.country_matcher[code] = CountryPostcodeMatcher(code, definition)

    def _matcher_for(self, country_code):
        """ Return the matcher of the given country or the default matcher
            when the country has no matcher of its own.
        """
        return self.country_matcher.get(country_code, self.default_matcher)

    def set_default_pattern(self, pattern):
        """ Set the postcode match pattern to use for countries that do
            not have a specific pattern. Countries marked as being without
            postcode are rejected by `match()` before any pattern is used.
        """
        self.default_matcher = CountryPostcodeMatcher('', {'pattern': pattern})

    def match(self, country_code, postcode):
        """ Match the given postcode against the postcode pattern of the
            given country, falling back to the default pattern when the
            country has none. Returns a `re.Match` object on success.
            Returns None when the match failed or when the country is
            marked as having no postcodes.
        """
        if country_code not in self.country_without_postcode:
            return self._matcher_for(country_code).match(postcode)

        return None

    def normalize(self, country_code, match):
        """ Return the default format of the postcode for the given match.
            `match` must be a `re.Match` object previously returned by
            `match()` for the same country code.
        """
        return self._matcher_for(country_code).normalize(match)
|
||||||
@@ -607,7 +607,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
|
|||||||
def _add_postcode(self, item):
|
def _add_postcode(self, item):
|
||||||
""" Make sure the normalized postcode is present in the word table.
|
""" Make sure the normalized postcode is present in the word table.
|
||||||
"""
|
"""
|
||||||
analyzer = self.token_analysis.get_analyzer('@postcode')
|
analyzer = self.token_analysis.analysis.get('@postcode')
|
||||||
|
|
||||||
if analyzer is None:
|
if analyzer is None:
|
||||||
postcode_name = item.name.strip().upper()
|
postcode_name = item.name.strip().upper()
|
||||||
|
|||||||
@@ -16,70 +16,17 @@ Arguments:
|
|||||||
When set to 'no', non-conforming postcodes are not
|
When set to 'no', non-conforming postcodes are not
|
||||||
searchable either.
|
searchable either.
|
||||||
"""
|
"""
|
||||||
import re
|
from nominatim.data.postcode_format import PostcodeFormatter
|
||||||
|
|
||||||
from nominatim.errors import UsageError
|
|
||||||
from nominatim.tools import country_info
|
|
||||||
|
|
||||||
class _PostcodeMatcher:
|
|
||||||
""" Matches and formats a postcode according to the format definition.
|
|
||||||
"""
|
|
||||||
def __init__(self, country_code, config):
|
|
||||||
if 'pattern' not in config:
|
|
||||||
raise UsageError("Field 'pattern' required for 'postcode' "
|
|
||||||
f"for country '{country_code}'")
|
|
||||||
|
|
||||||
pc_pattern = config['pattern'].replace('d', '[0-9]').replace('l', '[A-Z]')
|
|
||||||
|
|
||||||
self.norm_pattern = re.compile(f'\\s*(?:{country_code.upper()}[ -]?)?(.*)\\s*')
|
|
||||||
self.pattern = re.compile(pc_pattern)
|
|
||||||
|
|
||||||
self.output = config.get('output', r'\g<0>')
|
|
||||||
|
|
||||||
|
|
||||||
def match(self, postcode):
|
|
||||||
""" Match the given postcode against the postcode pattern for this
|
|
||||||
matcher. Returns a `re.Match` object if the match was successful
|
|
||||||
and None otherwise.
|
|
||||||
"""
|
|
||||||
# Upper-case, strip spaces and leading country code.
|
|
||||||
normalized = self.norm_pattern.fullmatch(postcode.upper())
|
|
||||||
|
|
||||||
if normalized:
|
|
||||||
return self.pattern.fullmatch(normalized.group(1))
|
|
||||||
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
def normalize(self, match):
|
|
||||||
""" Return the default format of the postcode for the given match.
|
|
||||||
`match` must be a `re.Match` object previously returned by
|
|
||||||
`match()`
|
|
||||||
"""
|
|
||||||
return match.expand(self.output)
|
|
||||||
|
|
||||||
|
|
||||||
class _PostcodeSanitizer:
|
class _PostcodeSanitizer:
|
||||||
|
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
self.convert_to_address = config.get_bool('convert-to-address', True)
|
self.convert_to_address = config.get_bool('convert-to-address', True)
|
||||||
# Objects without a country code can't have a postcode per definition.
|
self.matcher = PostcodeFormatter()
|
||||||
self.country_without_postcode = {None}
|
|
||||||
self.country_matcher = {}
|
|
||||||
|
|
||||||
for ccode, prop in country_info.iterate('postcode'):
|
|
||||||
if prop is False:
|
|
||||||
self.country_without_postcode.add(ccode)
|
|
||||||
elif isinstance(prop, dict):
|
|
||||||
self.country_matcher[ccode] = _PostcodeMatcher(ccode, prop)
|
|
||||||
else:
|
|
||||||
raise UsageError(f"Invalid entry 'postcode' for country '{ccode}'")
|
|
||||||
|
|
||||||
default_pattern = config.get('default-pattern')
|
default_pattern = config.get('default-pattern')
|
||||||
if default_pattern is not None and isinstance(default_pattern, str):
|
if default_pattern is not None and isinstance(default_pattern, str):
|
||||||
self.default_matcher = _PostcodeMatcher('', {'pattern': default_pattern})
|
self.matcher.set_default_pattern(default_pattern)
|
||||||
else:
|
|
||||||
self.default_matcher = None
|
|
||||||
|
|
||||||
|
|
||||||
def __call__(self, obj):
|
def __call__(self, obj):
|
||||||
@@ -106,18 +53,11 @@ class _PostcodeSanitizer:
|
|||||||
normalized version. Returns None if the postcode does not
|
normalized version. Returns None if the postcode does not
|
||||||
correspond to the official format of the given country.
|
correspond to the official format of the given country.
|
||||||
"""
|
"""
|
||||||
if country in self.country_without_postcode:
|
match = self.matcher.match(country, postcode)
|
||||||
return None
|
|
||||||
|
|
||||||
matcher = self.country_matcher.get(country, self.default_matcher)
|
|
||||||
if matcher is None:
|
|
||||||
return postcode.upper(), ''
|
|
||||||
|
|
||||||
match = matcher.match(postcode)
|
|
||||||
if match is None:
|
if match is None:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
return matcher.normalize(match), ' '.join(match.groups())
|
return self.matcher.normalize(country, match), ' '.join(match.groups())
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -437,13 +437,6 @@ class TestPlaceAddress:
|
|||||||
assert word_table.get_postcodes() == {pcode, }
|
assert word_table.get_postcodes() == {pcode, }
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('pcode', ['12:23', 'ab;cd;f', '123;836'])
|
|
||||||
def test_process_place_bad_postcode(self, word_table, pcode):
|
|
||||||
self.process_address(postcode=pcode)
|
|
||||||
|
|
||||||
assert not word_table.get_postcodes()
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('hnr', ['123a', '1', '101'])
|
@pytest.mark.parametrize('hnr', ['123a', '1', '101'])
|
||||||
def test_process_place_housenumbers_simple(self, hnr, getorcreate_hnr_id):
|
def test_process_place_housenumbers_simple(self, hnr, getorcreate_hnr_id):
|
||||||
info = self.process_address(housenumber=hnr)
|
info = self.process_address(housenumber=hnr)
|
||||||
|
|||||||
Reference in New Issue
Block a user