mirror of
https://github.com/osm-search/Nominatim.git
synced 2026-02-16 15:47:58 +00:00
132 lines
5.0 KiB
Python
132 lines
5.0 KiB
Python
# SPDX-License-Identifier: GPL-2.0-only
|
|
#
|
|
# This file is part of Nominatim. (https://nominatim.org)
|
|
#
|
|
# Copyright (C) 2022 by the Nominatim developer community.
|
|
# For a full list of authors see the git log.
|
|
"""
|
|
Configuration for Sanitizers.
|
|
"""
|
|
from typing import Sequence, Optional, Pattern, Callable, Any, TYPE_CHECKING
|
|
from collections import UserDict
|
|
import re
|
|
|
|
from nominatim.errors import UsageError
|
|
|
|
# Work around `collections.UserDict` not being subscriptable at runtime
# before Python 3.9: the parametrised form is only shown to the type
# checker, the interpreter always gets the plain class.
# See https://github.com/python/typing/issues/60#issuecomment-869757075
if TYPE_CHECKING:
    _BaseUserDict = UserDict[str, Any]
else:
    _BaseUserDict = UserDict
|
|
|
|
class SanitizerConfig(_BaseUserDict):
    """ The `SanitizerConfig` class is a read-only dictionary
        with configuration options for the sanitizer.
        In addition to the usual dictionary functions, the class provides
        accessors to standard sanitizer options that are used by many of the
        sanitizers.
    """

    def get_string_list(self, param: str, default: Sequence[str] = tuple()) -> Sequence[str]:
        """ Extract a configuration parameter as a string list.

            Arguments:
                param: Name of the configuration parameter.
                default: Value to return, when the parameter is missing.

            Returns:
                If the parameter value is a simple string, it is returned as a
                one-item list (an empty string yields an empty list). If the
                parameter value does not exist, a list copy of the given
                default is returned (or None when the default is None).
                If the parameter value is a list or tuple, it is checked to
                contain only strings before being returned.

            Raises:
                UsageError: when the value is neither a string nor a
                    list/tuple consisting only of strings.
        """
        values = self.data.get(param, None)

        if values is None:
            # Hand out a copy so that callers cannot modify the default.
            return None if default is None else list(default)

        if isinstance(values, str):
            return [values] if values else []

        if not isinstance(values, (list, tuple)) \
           or any(not isinstance(value, str) for value in values):
            raise UsageError(f"Parameter '{param}' must be string or list of strings.")

        return values

    def get_bool(self, param: str, default: Optional[bool] = None) -> bool:
        """ Extract a configuration parameter as a boolean.

            Arguments:
                param: Name of the configuration parameter. The parameter must
                       contain one of the yaml boolean values or an
                       UsageError will be raised.
                default: Value to return, when the parameter is missing.
                         When set to `None`, the parameter must be defined.

            Returns:
                Boolean value of the given parameter.

            Raises:
                UsageError: when the value (or the default used in its
                    place) is not a boolean.
        """
        value = self.data.get(param, default)

        # A missing parameter with a `None` default ends up here as well.
        if not isinstance(value, bool):
            raise UsageError(f"Parameter '{param}' must be a boolean value ('yes' or 'no').")

        return value

    def get_delimiter(self, default: str = ',;') -> Pattern[str]:
        """ Return the 'delimiters' parameter in the configuration as a
            compiled regular expression that can be used to split strings on
            these delimiters.

            Arguments:
                default: Delimiters to be used when 'delimiters' parameter
                         is not explicitly configured.

            Returns:
                A regular expression pattern which can be used to
                split a string. The regular expression makes sure that the
                resulting names are stripped and that repeated delimiters
                are ignored. It may still create empty fields on occasion. The
                code needs to filter those.

            Raises:
                UsageError: when the set of delimiters is empty.
        """
        delimiter_set = set(self.data.get('delimiters', default))
        if not delimiter_set:
            raise UsageError("Empty 'delimiter' parameter not allowed for sanitizer.")

        # re.escape() keeps every delimiter a literal character inside the
        # character class. Blindly prefixing a backslash would turn letters
        # and digits into regex escapes (e.g. 'd' -> '\d') or invalid ones.
        # Sorting makes the resulting pattern independent of set ordering.
        char_class = ''.join(re.escape(d) for d in sorted(delimiter_set))

        return re.compile(rf'\s*[{char_class}]+\s*')

    def get_filter_kind(self, *default: str) -> Callable[[str], bool]:
        """ Return a filter function for the name kind from the 'filter-kind'
            config parameter.

            If the 'filter-kind' parameter is empty, the filter lets all items
            pass. If the parameter is a string, it is interpreted as a single
            regular expression that must match the full kind string.
            If the parameter is a list then
            any of the regular expressions in the list must match to pass.

            Arguments:
                default: Filters to be used, when the 'filter-kind' parameter
                         is not specified. If omitted then the default is to
                         let all names pass.

            Returns:
                A filter function which takes a name string and returns
                True when the item passes the filter.
        """
        filters = self.get_string_list('filter-kind', default)

        if not filters:
            return lambda _: True

        # Compile once here; the returned closure is called per name.
        regexes = [re.compile(regex) for regex in filters]

        return lambda name: any(regex.fullmatch(name) for regex in regexes)
|