mirror of
https://github.com/osm-search/Nominatim.git
synced 2026-03-11 13:24:07 +00:00
add documentation for sanitizer interface
Also switches mkdocstrings to 0.18 with the rather unfortunate consequence that now mkdocstrings-python-legacy is needed as well.
This commit is contained in:
@@ -14,14 +14,20 @@ from nominatim.data.place_info import PlaceInfo
|
||||
from nominatim.typing import Protocol, Final
|
||||
|
||||
class PlaceName:
|
||||
""" A searchable name for a place together with properties.
|
||||
Every name object saves the name proper and two basic properties:
|
||||
* 'kind' describes the name of the OSM key used without any suffixes
|
||||
""" Each name and address part of a place is encapsulated in an object of
|
||||
this class. It saves not only the name proper but also describes the
|
||||
kind of name with two properties:
|
||||
|
||||
* `kind` describes the name of the OSM key used without any suffixes
|
||||
(i.e. the part after the colon removed)
|
||||
* 'suffix' contains the suffix of the OSM tag, if any. The suffix
|
||||
* `suffix` contains the suffix of the OSM tag, if any. The suffix
|
||||
is the part of the key after the first colon.
|
||||
In addition to that, the name may have arbitrary additional attributes.
|
||||
Which attributes are used, depends on the token analyser.
|
||||
|
||||
In addition to that, a name may have arbitrary additional attributes.
|
||||
How attributes are used, depends on the sanatizers and token analysers.
|
||||
The exception is is the 'analyzer' attribute. This apptribute determines
|
||||
which token analysis module will be used to finalize the treatment of
|
||||
names.
|
||||
"""
|
||||
|
||||
def __init__(self, name: str, kind: str, suffix: Optional[str]):
|
||||
@@ -113,7 +119,13 @@ class SanitizerHandler(Protocol):
|
||||
|
||||
def create(self, config: SanitizerConfig) -> Callable[[ProcessInfo], None]:
|
||||
"""
|
||||
A sanitizer must define a single function `create`. It takes the
|
||||
dictionary with the configuration information for the sanitizer and
|
||||
returns a function that transforms name and address.
|
||||
Create a function for sanitizing a place.
|
||||
|
||||
Arguments:
|
||||
config: A dictionary with the additional configuration options
|
||||
specified in the tokenizer configuration
|
||||
|
||||
Return:
|
||||
The result must be a callable that takes a place description
|
||||
and transforms name and address as reuqired.
|
||||
"""
|
||||
|
||||
@@ -21,8 +21,8 @@ else:
|
||||
_BaseUserDict = UserDict
|
||||
|
||||
class SanitizerConfig(_BaseUserDict):
|
||||
""" Dictionary with configuration options for a sanitizer.
|
||||
|
||||
""" The `SanitizerConfig` class is a read-only dictionary
|
||||
with configuration options for the sanitizer.
|
||||
In addition to the usual dictionary function, the class provides
|
||||
accessors to standard sanatizer options that are used by many of the
|
||||
sanitizers.
|
||||
@@ -30,10 +30,16 @@ class SanitizerConfig(_BaseUserDict):
|
||||
|
||||
def get_string_list(self, param: str, default: Sequence[str] = tuple()) -> Sequence[str]:
|
||||
""" Extract a configuration parameter as a string list.
|
||||
If the parameter value is a simple string, it is returned as a
|
||||
one-item list. If the parameter value does not exist, the given
|
||||
default is returned. If the parameter value is a list, it is checked
|
||||
to contain only strings before being returned.
|
||||
|
||||
Arguments:
|
||||
param: Name of the configuration parameter.
|
||||
default: Value to return, when the parameter is missing.
|
||||
|
||||
Returns:
|
||||
If the parameter value is a simple string, it is returned as a
|
||||
one-item list. If the parameter value does not exist, the given
|
||||
default is returned. If the parameter value is a list, it is
|
||||
checked to contain only strings before being returned.
|
||||
"""
|
||||
values = self.data.get(param, None)
|
||||
|
||||
@@ -54,9 +60,16 @@ class SanitizerConfig(_BaseUserDict):
|
||||
|
||||
def get_bool(self, param: str, default: Optional[bool] = None) -> bool:
|
||||
""" Extract a configuration parameter as a boolean.
|
||||
The parameter must be one of the yaml boolean values or an
|
||||
user error will be raised. If `default` is given, then the parameter
|
||||
may also be missing or empty.
|
||||
|
||||
Arguments:
|
||||
param: Name of the configuration parameter. The parameter must
|
||||
contain one of the yaml boolean values or an
|
||||
UsageError will be raised.
|
||||
default: Value to return, when the parameter is missing.
|
||||
When set to `None`, the parameter must be defined.
|
||||
|
||||
Returns:
|
||||
Boolean value of the given parameter.
|
||||
"""
|
||||
value = self.data.get(param, default)
|
||||
|
||||
@@ -67,15 +80,20 @@ class SanitizerConfig(_BaseUserDict):
|
||||
|
||||
|
||||
def get_delimiter(self, default: str = ',;') -> Pattern[str]:
|
||||
""" Return the 'delimiter' parameter in the configuration as a
|
||||
compiled regular expression that can be used to split the names on the
|
||||
delimiters. The regular expression makes sure that the resulting names
|
||||
are stripped and that repeated delimiters
|
||||
are ignored but it will still create empty fields on occasion. The
|
||||
code needs to filter those.
|
||||
""" Return the 'delimiters' parameter in the configuration as a
|
||||
compiled regular expression that can be used to split names on these
|
||||
delimiters.
|
||||
|
||||
The 'default' parameter defines the delimiter set to be used when
|
||||
not explicitly configured.
|
||||
Arguments:
|
||||
default: Delimiters to be used, when 'delimiters' parameter
|
||||
is not explicitly configured.
|
||||
|
||||
Returns:
|
||||
A regular expression pattern, which can be used to
|
||||
split a string. The regular expression makes sure that the
|
||||
resulting names are stripped and that repeated delimiters
|
||||
are ignored. It may still create empty fields on occasion. The
|
||||
code needs to filter those.
|
||||
"""
|
||||
delimiter_set = set(self.data.get('delimiters', default))
|
||||
if not delimiter_set:
|
||||
@@ -86,13 +104,22 @@ class SanitizerConfig(_BaseUserDict):
|
||||
|
||||
def get_filter_kind(self, *default: str) -> Callable[[str], bool]:
|
||||
""" Return a filter function for the name kind from the 'filter-kind'
|
||||
config parameter. The filter functions takes a name item and returns
|
||||
True when the item passes the filter.
|
||||
config parameter.
|
||||
|
||||
If the parameter is empty, the filter lets all items pass. If the
|
||||
parameter is a string, it is interpreted as a single regular expression
|
||||
that must match the full kind string. If the parameter is a list then
|
||||
If the 'filter-kind' parameter is empty, the filter lets all items
|
||||
pass. If the parameter is a string, it is interpreted as a single
|
||||
regular expression that must match the full kind string.
|
||||
If the parameter is a list then
|
||||
any of the regular expressions in the list must match to pass.
|
||||
|
||||
Arguments:
|
||||
default: Filters to be used, when the 'filter-kind' parameter
|
||||
is not specified. If omitted then the default is to
|
||||
let all names pass.
|
||||
|
||||
Returns:
|
||||
A filter function which takes a name string and returns
|
||||
True when the item passes the filter.
|
||||
"""
|
||||
filters = self.get_string_list('filter-kind', default)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user