mirror of
https://github.com/osm-search/Nominatim.git
synced 2026-03-12 05:44:06 +00:00
split code into submodules
This commit is contained in:
80
src/nominatim_db/tokenizer/sanitizers/clean_postcodes.py
Normal file
80
src/nominatim_db/tokenizer/sanitizers/clean_postcodes.py
Normal file
@@ -0,0 +1,80 @@
|
||||
# SPDX-License-Identifier: GPL-3.0-or-later
|
||||
#
|
||||
# This file is part of Nominatim. (https://nominatim.org)
|
||||
#
|
||||
# Copyright (C) 2024 by the Nominatim developer community.
|
||||
# For a full list of authors see the git log.
|
||||
"""
|
||||
Sanitizer that filters postcodes by their officially allowed pattern.
|
||||
|
||||
Arguments:
|
||||
convert-to-address: If set to 'yes' (the default), then postcodes that do
|
||||
not conform with their country-specific pattern are
|
||||
converted to an address component. That means that
|
||||
the postcode does not take part when computing the
|
||||
postcode centroids of a country but is still searchable.
|
||||
When set to 'no', non-conforming postcodes are not
|
||||
searchable either.
|
||||
default-pattern: Pattern to use, when there is none available for the
|
||||
country in question. Warning: will not be used for
|
||||
objects that have no country assigned. These are always
|
||||
assumed to have no postcode.
|
||||
"""
|
||||
from typing import Callable, Optional, Tuple
|
||||
|
||||
from ...data.postcode_format import PostcodeFormatter
|
||||
from .base import ProcessInfo
|
||||
from .config import SanitizerConfig
|
||||
|
||||
class _PostcodeSanitizer:
|
||||
|
||||
def __init__(self, config: SanitizerConfig) -> None:
|
||||
self.convert_to_address = config.get_bool('convert-to-address', True)
|
||||
self.matcher = PostcodeFormatter()
|
||||
|
||||
default_pattern = config.get('default-pattern')
|
||||
if default_pattern is not None and isinstance(default_pattern, str):
|
||||
self.matcher.set_default_pattern(default_pattern)
|
||||
|
||||
|
||||
def __call__(self, obj: ProcessInfo) -> None:
|
||||
if not obj.address:
|
||||
return
|
||||
|
||||
postcodes = ((i, o) for i, o in enumerate(obj.address) if o.kind == 'postcode')
|
||||
|
||||
for pos, postcode in postcodes:
|
||||
formatted = self.scan(postcode.name, obj.place.country_code)
|
||||
|
||||
if formatted is None:
|
||||
if self.convert_to_address:
|
||||
postcode.kind = 'unofficial_postcode'
|
||||
else:
|
||||
obj.address.pop(pos)
|
||||
else:
|
||||
postcode.name = formatted[0]
|
||||
postcode.set_attr('variant', formatted[1])
|
||||
|
||||
|
||||
def scan(self, postcode: str, country: Optional[str]) -> Optional[Tuple[str, str]]:
|
||||
""" Check the postcode for correct formatting and return the
|
||||
normalized version. Returns None if the postcode does not
|
||||
correspond to the official format of the given country.
|
||||
"""
|
||||
match = self.matcher.match(country, postcode)
|
||||
if match is None:
|
||||
return None
|
||||
|
||||
assert country is not None
|
||||
|
||||
return self.matcher.normalize(country, match),\
|
||||
' '.join(filter(lambda p: p is not None, match.groups()))
|
||||
|
||||
|
||||
|
||||
|
||||
def create(config: SanitizerConfig) -> Callable[[ProcessInfo], None]:
|
||||
""" Create a function that filters postcodes by their officially allowed pattern.
|
||||
"""
|
||||
|
||||
return _PostcodeSanitizer(config)
|
||||
Reference in New Issue
Block a user