move postcode matcher in a separate file

This commit is contained in:
Sarah Hoffmann
2022-06-06 23:37:04 +02:00
parent bf86b45178
commit 80ea13437d
5 changed files with 103 additions and 73 deletions

View File

@@ -607,7 +607,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
def _add_postcode(self, item):
""" Make sure the normalized postcode is present in the word table.
"""
analyzer = self.token_analysis.get_analyzer('@postcode')
analyzer = self.token_analysis.analysis.get('@postcode')
if analyzer is None:
postcode_name = item.name.strip().upper()

View File

@@ -16,70 +16,17 @@ Arguments:
When set to 'no', non-conforming postcodes are not
searchable either.
"""
import re
from nominatim.errors import UsageError
from nominatim.tools import country_info
class _PostcodeMatcher:
    """ Matches and formats a postcode according to the format definition.

        The format definition is a dictionary with a mandatory 'pattern'
        entry and an optional 'output' entry. In the pattern every 'd' is
        replaced with a digit class and every 'l' with an upper-case letter
        class before it is compiled as a regular expression. 'output' is a
        replacement template as understood by `re.Match.expand()`; when it
        is missing, the complete matched text is returned unchanged.
    """

    def __init__(self, country_code, config):
        """ Create a matcher for `country_code` from the format `config`.

            Raises a UsageError when `config` has no 'pattern' entry.
        """
        if 'pattern' not in config:
            raise UsageError("Field 'pattern' required for 'postcode' "
                             f"for country '{country_code}'")

        pc_pattern = config['pattern'].replace('d', '[0-9]').replace('l', '[A-Z]')

        # The capture group must be non-greedy. A greedy '.*' swallows
        # trailing whitespace, which then ends up in the candidate string
        # and makes the full match against the postcode pattern fail.
        # The country code is escaped so that it is always taken literally.
        prefix = re.escape(country_code.upper())
        self.norm_pattern = re.compile(rf'\s*(?:{prefix}[ -]?)?(.*?)\s*')
        self.pattern = re.compile(pc_pattern)

        self.output = config.get('output', r'\g<0>')


    def match(self, postcode):
        """ Match the given postcode against the postcode pattern for this
            matcher. Returns a `re.Match` object if the match was successful
            and None otherwise.
        """
        # Upper-case, strip surrounding spaces and leading country code.
        normalized = self.norm_pattern.fullmatch(postcode.upper())

        if normalized:
            return self.pattern.fullmatch(normalized.group(1))

        return None


    def normalize(self, match):
        """ Return the default format of the postcode for the given match.
            `match` must be a `re.Match` object previously returned by
            `match()`
        """
        return match.expand(self.output)
from nominatim.data.postcode_format import PostcodeFormatter
class _PostcodeSanitizer:
def __init__(self, config):
self.convert_to_address = config.get_bool('convert-to-address', True)
# Objects without a country code can't have a postcode per definition.
self.country_without_postcode = {None}
self.country_matcher = {}
for ccode, prop in country_info.iterate('postcode'):
if prop is False:
self.country_without_postcode.add(ccode)
elif isinstance(prop, dict):
self.country_matcher[ccode] = _PostcodeMatcher(ccode, prop)
else:
raise UsageError(f"Invalid entry 'postcode' for country '{ccode}'")
self.matcher = PostcodeFormatter()
default_pattern = config.get('default-pattern')
if default_pattern is not None and isinstance(default_pattern, str):
self.default_matcher = _PostcodeMatcher('', {'pattern': default_pattern})
else:
self.default_matcher = None
self.matcher.set_default_pattern(default_pattern)
def __call__(self, obj):
@@ -106,18 +53,11 @@ class _PostcodeSanitizer:
normalized version. Returns None if the postcode does not
correspond to the official format of the given country.
"""
if country in self.country_without_postcode:
return None
matcher = self.country_matcher.get(country, self.default_matcher)
if matcher is None:
return postcode.upper(), ''
match = matcher.match(postcode)
match = self.matcher.match(country, postcode)
if match is None:
return None
return matcher.normalize(match), ' '.join(match.groups())
return self.matcher.normalize(country, match), ' '.join(match.groups())