mirror of
https://github.com/osm-search/Nominatim.git
synced 2026-02-26 11:08:13 +00:00
initial postcode cleaner for simple patterns
Moves postcodes that are either in countries without a postcode system or don't correspond to the local pattern for postcodes into a field for a normal address part. Makes them searchable but not as a special address. This has two consequences: they are no longer a skippable part of the address and the postcodes cannot be searched on their own.
This commit is contained in:
99
nominatim/tokenizer/sanitizers/clean_postcodes.py
Normal file
99
nominatim/tokenizer/sanitizers/clean_postcodes.py
Normal file
@@ -0,0 +1,99 @@
|
|||||||
|
# SPDX-License-Identifier: GPL-2.0-only
|
||||||
|
#
|
||||||
|
# This file is part of Nominatim. (https://nominatim.org)
|
||||||
|
#
|
||||||
|
# Copyright (C) 2022 by the Nominatim developer community.
|
||||||
|
# For a full list of authors see the git log.
|
||||||
|
"""
|
||||||
|
Sanitizer that filters postcodes by their officially allowed pattern.
|
||||||
|
|
||||||
|
Arguments:
|
||||||
|
convert-to-address: If set to 'yes' (the default), then postcodes that do
|
||||||
|
not conform with their country-specific pattern are
|
||||||
|
converted to an address component. That means that
|
||||||
|
the postcode does not take part when computing the
|
||||||
|
postcode centroids of a country but is still searchable.
|
||||||
|
When set to 'no', non-conforming postcodes are not
|
||||||
|
searchable either.
|
||||||
|
"""
|
||||||
|
import re
|
||||||
|
|
||||||
|
from nominatim.errors import UsageError
|
||||||
|
from nominatim.tools import country_info
|
||||||
|
|
||||||
|
class _PostcodeMatcher:
    """ Checks postcodes against the format definition of a single country
        and brings conforming postcodes into their normalized form.
    """

    def __init__(self, country_code, config):
        """ Compile the matcher from the 'postcode' configuration of the
            country `country_code`.

            `config` must contain a field 'pattern'. Within the pattern,
            every 'd' stands for a single digit and every 'l' for a single
            upper-case ASCII letter. Everything else is handed on to the
            regular expression compiler unchanged.

            Raises a UsageError when the field 'pattern' is missing.
        """
        if 'pattern' not in config:
            raise UsageError(
                f"Field 'pattern' required for 'postcode' for country '{country_code}'")

        # Expand the placeholder characters into regex character classes.
        # The expansions contain neither 'd' nor 'l', so one translation
        # pass yields the same result as replacing them one after the other.
        placeholders = str.maketrans({'d': '[0-9]', 'l': '[A-Z]'})
        expression = config['pattern'].translate(placeholders)

        self.pattern = re.compile(expression)

    def normalize(self, postcode):
        """ Return the normalized version of the postcode. If the given
            postcode does not correspond to the usage-pattern, return None.

            Normalization removes surrounding whitespace and converts the
            postcode to upper case. The result must match the country
            pattern over its full length.
        """
        candidate = postcode.strip().upper()

        if self.pattern.fullmatch(candidate) is None:
            return None

        return candidate
|
||||||
|
|
||||||
|
|
||||||
|
class _PostcodeSanitizer:
    """ Sanitizer step that checks all address parts of kind 'postcode'
        against the postcode rules of the country of the place.

        Postcodes that pass the check are replaced with their normalized
        form. Postcodes that fail are either turned into address parts of
        kind 'unofficial_postcode' or removed from the address, depending
        on the setting 'convert-to-address'.
    """

    def __init__(self, config):
        """ Read the sanitizer settings from `config` and set up the
            per-country postcode rules from the country configuration.

            Raises a UsageError when the 'postcode' entry of a country is
            neither False nor a dictionary.
        """
        self.convert_to_address = config.get_bool('convert-to-address', True)
        # Objects without a country code can't have a postcode per definition.
        self.country_without_postcode = {None}
        self.country_matcher = {}

        for ccode, prop in country_info.iterate('postcode'):
            if prop is False:
                # The country is explicitly marked as having no postcodes.
                self.country_without_postcode.add(ccode)
            elif isinstance(prop, dict):
                self.country_matcher[ccode] = _PostcodeMatcher(ccode, prop)
            else:
                raise UsageError(f"Invalid entry 'postcode' for country '{ccode}'")

    def __call__(self, obj):
        """ Sanitize the postcodes in the address list of `obj` in place.

            Valid postcodes get their name replaced by the normalized
            form. Invalid ones are converted to 'unofficial_postcode' or
            dropped. Address parts of any other kind are left untouched.
        """
        if not obj.address:
            return

        # Collect the surviving items in a separate list instead of
        # popping from obj.address while iterating over it. Removing an
        # item during iteration shifts the following items to the left,
        # so that the item right after a dropped postcode would never
        # be checked.
        kept = []
        for item in obj.address:
            if item.kind == 'postcode':
                formatted = self.scan(item.name, obj.place.country_code)

                if formatted is not None:
                    item.name = formatted
                elif self.convert_to_address:
                    item.kind = 'unofficial_postcode'
                else:
                    # Non-conforming postcode that must not be searchable.
                    continue

            kept.append(item)

        # Slice assignment keeps the identity of the address list, just
        # like the in-place removal did before.
        obj.address[:] = kept

    def scan(self, postcode, country):
        """ Check the postcode for correct formatting and return the
            normalized version. Returns None if the postcode does not
            correspond to the official format of the given country.

            For countries that have no postcode pattern configured, any
            postcode is accepted and returned upper-cased with surrounding
            whitespace removed.
        """
        if country in self.country_without_postcode:
            return None

        matcher = self.country_matcher.get(country)
        if matcher is not None:
            return matcher.normalize(postcode)

        # No rules known for this country. Apply the same basic
        # normalization that the pattern matchers use.
        return postcode.strip().upper()
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def create(config):
    """ Create a postcode sanitizing function.

        Returns a callable _PostcodeSanitizer that is set up from the
        sanitizer configuration `config`.
    """
    return _PostcodeSanitizer(config)
|
||||||
54
test/python/tokenizer/sanitizers/test_clean_postcodes.py
Normal file
54
test/python/tokenizer/sanitizers/test_clean_postcodes.py
Normal file
@@ -0,0 +1,54 @@
|
|||||||
|
# SPDX-License-Identifier: GPL-2.0-only
|
||||||
|
#
|
||||||
|
# This file is part of Nominatim. (https://nominatim.org)
|
||||||
|
#
|
||||||
|
# Copyright (C) 2022 by the Nominatim developer community.
|
||||||
|
# For a full list of authors see the git log.
|
||||||
|
"""
|
||||||
|
Tests for the sanitizer that normalizes postcodes.
|
||||||
|
"""
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
|
||||||
|
from nominatim.indexer.place_info import PlaceInfo
|
||||||
|
from nominatim.tools import country_info
|
||||||
|
|
||||||
|
@pytest.fixture
def sanitize(def_config, request):
    """ Provide a function that runs the 'clean-postcodes' sanitizer step
        over an address and returns the resulting address parts as a
        sorted list of (kind, name) tuples.

        Keyword arguments of 'sanitizer_params' markers on the test are
        added to the configuration of the step, with underscores in the
        parameter names replaced by dashes.
    """
    country_info.setup_country_config(def_config)

    step_config = {'step': 'clean-postcodes'}
    for marker in request.node.iter_markers(name="sanitizer_params"):
        for key, value in marker.kwargs.items():
            step_config[key.replace('_', '-')] = value

    def _run(country=None, **kwargs):
        # The keyword arguments make up the address of the place.
        place = {'address': kwargs}
        if country is not None:
            place['country_code'] = country

        sanitizer = PlaceSanitizer([step_config])
        _, address = sanitizer.process_names(PlaceInfo(place))

        return sorted((part.kind, part.name) for part in address)

    return _run
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("country", (None, 'ae'))
def test_postcode_no_country(sanitize, country):
    # With the default settings the postcode is expected to be kept as an
    # 'unofficial_postcode', both without a country and for 'ae'
    # (presumably configured as a country without postcodes).
    result = sanitize(country=country, postcode='23231')

    assert result == [('unofficial_postcode', '23231')]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("country", (None, 'ae'))
@pytest.mark.sanitizer_params(convert_to_address=False)
def test_postcode_no_country_drop(sanitize, country):
    # With convert-to-address switched off, the postcode is expected to
    # disappear from the address completely.
    result = sanitize(country=country, postcode='23231')

    assert result == []
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("postcode", ('12345', ' 34009 '))
def test_postcode_pass_good_format(sanitize, postcode):
    # Postcodes accepted for 'de' are expected to stay postcodes, with
    # surrounding whitespace removed.
    expected = [('postcode', postcode.strip())]

    assert sanitize(country='de', postcode=postcode) == expected
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("postcode", ('123456', '', ' ', '.....'))
@pytest.mark.sanitizer_params(convert_to_address=False)
def test_postcode_drop_bad_format(sanitize, postcode):
    # With convert-to-address switched off, postcodes rejected for 'de'
    # are expected to be removed from the address.
    result = sanitize(country='de', postcode=postcode)

    assert result == []
|
||||||
Reference in New Issue
Block a user