add documentation for sanitizer interface

Also switches mkdocstrings to 0.18 with the rather unfortunate
consequence that now mkdocstrings-python-legacy is needed as well.
This commit is contained in:
Sarah Hoffmann
2022-07-28 22:00:29 +02:00
parent a8b037669a
commit 3746befd88
7 changed files with 185 additions and 50 deletions

View File

@@ -14,14 +14,20 @@ from nominatim.data.place_info import PlaceInfo
from nominatim.typing import Protocol, Final
class PlaceName:
""" A searchable name for a place together with properties.
Every name object saves the name proper and two basic properties:
* 'kind' describes the name of the OSM key used without any suffixes
""" Each name and address part of a place is encapsulated in an object of
this class. It saves not only the name proper but also describes the
kind of name with two properties:
* `kind` describes the name of the OSM key used without any suffixes
(i.e. the part after the colon removed)
* 'suffix' contains the suffix of the OSM tag, if any. The suffix
* `suffix` contains the suffix of the OSM tag, if any. The suffix
is the part of the key after the first colon.
In addition to that, the name may have arbitrary additional attributes.
Which attributes are used, depends on the token analyser.
In addition to that, a name may have arbitrary additional attributes.
How attributes are used depends on the sanitizers and token analysers.
The exception is the 'analyzer' attribute. This attribute determines
which token analysis module will be used to finalize the treatment of
names.
"""
def __init__(self, name: str, kind: str, suffix: Optional[str]):
@@ -113,7 +119,13 @@ class SanitizerHandler(Protocol):
def create(self, config: SanitizerConfig) -> Callable[[ProcessInfo], None]:
"""
A sanitizer must define a single function `create`. It takes the
dictionary with the configuration information for the sanitizer and
returns a function that transforms name and address.
Create a function for sanitizing a place.
Arguments:
config: A dictionary with the additional configuration options
specified in the tokenizer configuration
Returns:
The result must be a callable that takes a place description
and transforms name and address as required.
"""

View File

@@ -21,8 +21,8 @@ else:
_BaseUserDict = UserDict
class SanitizerConfig(_BaseUserDict):
""" Dictionary with configuration options for a sanitizer.
""" The `SanitizerConfig` class is a read-only dictionary
with configuration options for the sanitizer.
In addition to the usual dictionary function, the class provides
accessors to standard sanitizer options that are used by many of the
sanitizers.
@@ -30,10 +30,16 @@ class SanitizerConfig(_BaseUserDict):
def get_string_list(self, param: str, default: Sequence[str] = tuple()) -> Sequence[str]:
""" Extract a configuration parameter as a string list.
If the parameter value is a simple string, it is returned as a
one-item list. If the parameter value does not exist, the given
default is returned. If the parameter value is a list, it is checked
to contain only strings before being returned.
Arguments:
param: Name of the configuration parameter.
default: Value to return, when the parameter is missing.
Returns:
If the parameter value is a simple string, it is returned as a
one-item list. If the parameter value does not exist, the given
default is returned. If the parameter value is a list, it is
checked to contain only strings before being returned.
"""
values = self.data.get(param, None)
@@ -54,9 +60,16 @@ class SanitizerConfig(_BaseUserDict):
def get_bool(self, param: str, default: Optional[bool] = None) -> bool:
""" Extract a configuration parameter as a boolean.
The parameter must be one of the yaml boolean values or a
user error will be raised. If `default` is given, then the parameter
may also be missing or empty.
Arguments:
param: Name of the configuration parameter. The parameter must
contain one of the yaml boolean values or a
UsageError will be raised.
default: Value to return, when the parameter is missing.
When set to `None`, the parameter must be defined.
Returns:
Boolean value of the given parameter.
"""
value = self.data.get(param, default)
@@ -67,15 +80,20 @@ class SanitizerConfig(_BaseUserDict):
def get_delimiter(self, default: str = ',;') -> Pattern[str]:
""" Return the 'delimiter' parameter in the configuration as a
compiled regular expression that can be used to split the names on the
delimiters. The regular expression makes sure that the resulting names
are stripped and that repeated delimiters
are ignored but it will still create empty fields on occasion. The
code needs to filter those.
""" Return the 'delimiters' parameter in the configuration as a
compiled regular expression that can be used to split names on these
delimiters.
The 'default' parameter defines the delimiter set to be used when
not explicitly configured.
Arguments:
default: Delimiters to be used, when 'delimiters' parameter
is not explicitly configured.
Returns:
A regular expression pattern, which can be used to
split a string. The regular expression makes sure that the
resulting names are stripped and that repeated delimiters
are ignored. It may still create empty fields on occasion. The
code needs to filter those.
"""
delimiter_set = set(self.data.get('delimiters', default))
if not delimiter_set:
@@ -86,13 +104,22 @@ class SanitizerConfig(_BaseUserDict):
def get_filter_kind(self, *default: str) -> Callable[[str], bool]:
""" Return a filter function for the name kind from the 'filter-kind'
config parameter. The filter function takes a name item and returns
True when the item passes the filter.
config parameter.
If the parameter is empty, the filter lets all items pass. If the
parameter is a string, it is interpreted as a single regular expression
that must match the full kind string. If the parameter is a list then
If the 'filter-kind' parameter is empty, the filter lets all items
pass. If the parameter is a string, it is interpreted as a single
regular expression that must match the full kind string.
If the parameter is a list then
any of the regular expressions in the list must match to pass.
Arguments:
default: Filters to be used, when the 'filter-kind' parameter
is not specified. If omitted then the default is to
let all names pass.
Returns:
A filter function which takes a name string and returns
True when the item passes the filter.
"""
filters = self.get_string_list('filter-kind', default)