mirror of
https://github.com/osm-search/Nominatim.git
synced 2026-03-11 21:34:06 +00:00
add documentation for sanitizer interface
Also switches mkdocstrings to 0.18 with the rather unfortunate consequence that now mkdocstrings-python-legacy is needed as well.
This commit is contained in:
@@ -11,8 +11,8 @@ the tokenizer.
|
||||
from typing import Optional, Mapping, Any
|
||||
|
||||
class PlaceInfo:
|
||||
""" Data class containing all information the tokenizer gets about a
|
||||
place it should process the names for.
|
||||
""" This data class contains all information the tokenizer can access
|
||||
about a place.
|
||||
"""
|
||||
|
||||
def __init__(self, info: Mapping[str, Any]) -> None:
|
||||
@@ -21,16 +21,25 @@ class PlaceInfo:
|
||||
|
||||
@property
|
||||
def name(self) -> Optional[Mapping[str, str]]:
|
||||
""" A dictionary with the names of the place or None if the place
|
||||
has no names.
|
||||
""" A dictionary with the names of the place. Keys and values represent
|
||||
the full key and value of the corresponding OSM tag. Which tags
|
||||
are saved as names is determined by the import style.
|
||||
The property may be None if the place has no names.
|
||||
"""
|
||||
return self._info.get('name')
|
||||
|
||||
|
||||
@property
|
||||
def address(self) -> Optional[Mapping[str, str]]:
|
||||
""" A dictionary with the address elements of the place
|
||||
or None if no address information is available.
|
||||
""" A dictionary with the address elements of the place. The key
|
||||
usually corresponds to the suffix part of the key of an OSM
|
||||
'addr:*' or 'isin:*' tag. There are also some special keys like
|
||||
`country` or `country_code` which merge OSM keys that contain
|
||||
the same information. See [Import Styles][1] for details.
|
||||
|
||||
The property may be None if the place has no address information.
|
||||
|
||||
[1]: ../customize/Import-Styles.md
|
||||
"""
|
||||
return self._info.get('address')
|
||||
|
||||
@@ -38,28 +47,30 @@ class PlaceInfo:
|
||||
@property
|
||||
def country_code(self) -> Optional[str]:
|
||||
""" The country code of the country the place is in. Guaranteed
|
||||
to be a two-letter lower-case string or None, if no country
|
||||
could be found.
|
||||
to be a two-letter lower-case string. If the place is not inside
|
||||
any country, the property is set to None.
|
||||
"""
|
||||
return self._info.get('country_code')
|
||||
|
||||
|
||||
@property
|
||||
def rank_address(self) -> int:
|
||||
""" The computed rank address before rank correction.
|
||||
""" The [rank address][1] before any rank correction is applied.
|
||||
|
||||
[1]: ../customize/Ranking.md#address-rank
|
||||
"""
|
||||
return self._info.get('rank_address', 0)
|
||||
|
||||
|
||||
def is_a(self, key: str, value: str) -> bool:
|
||||
""" Check if the place's primary tag corresponds to the given
|
||||
""" Set to True when the place's primary tag corresponds to the given
|
||||
key and value.
|
||||
"""
|
||||
return self._info.get('class') == key and self._info.get('type') == value
|
||||
|
||||
|
||||
def is_country(self) -> bool:
|
||||
""" Check if the place is a valid country boundary.
|
||||
""" Set to True when the place is a valid country boundary.
|
||||
"""
|
||||
return self.rank_address == 4 \
|
||||
and self.is_a('boundary', 'administrative') \
|
||||
|
||||
@@ -14,14 +14,20 @@ from nominatim.data.place_info import PlaceInfo
|
||||
from nominatim.typing import Protocol, Final
|
||||
|
||||
class PlaceName:
|
||||
""" A searchable name for a place together with properties.
|
||||
Every name object saves the name proper and two basic properties:
|
||||
* 'kind' describes the name of the OSM key used without any suffixes
|
||||
""" Each name and address part of a place is encapsulated in an object of
|
||||
this class. It saves not only the name proper but also describes the
|
||||
kind of name with two properties:
|
||||
|
||||
* `kind` describes the name of the OSM key used without any suffixes
|
||||
(i.e. the part after the colon removed)
|
||||
* 'suffix' contains the suffix of the OSM tag, if any. The suffix
|
||||
* `suffix` contains the suffix of the OSM tag, if any. The suffix
|
||||
is the part of the key after the first colon.
|
||||
In addition to that, the name may have arbitrary additional attributes.
|
||||
Which attributes are used, depends on the token analyser.
|
||||
|
||||
In addition to that, a name may have arbitrary additional attributes.
|
||||
How attributes are used depends on the sanitizers and token analysers.
|
||||
The exception is the 'analyzer' attribute. This attribute determines
|
||||
which token analysis module will be used to finalize the treatment of
|
||||
names.
|
||||
"""
|
||||
|
||||
def __init__(self, name: str, kind: str, suffix: Optional[str]):
|
||||
@@ -113,7 +119,13 @@ class SanitizerHandler(Protocol):
|
||||
|
||||
def create(self, config: SanitizerConfig) -> Callable[[ProcessInfo], None]:
|
||||
"""
|
||||
A sanitizer must define a single function `create`. It takes the
|
||||
dictionary with the configuration information for the sanitizer and
|
||||
returns a function that transforms name and address.
|
||||
Create a function for sanitizing a place.
|
||||
|
||||
Arguments:
|
||||
config: A dictionary with the additional configuration options
|
||||
specified in the tokenizer configuration
|
||||
|
||||
Return:
|
||||
The result must be a callable that takes a place description
|
||||
and transforms name and address as required.
|
||||
"""
|
||||
|
||||
@@ -21,8 +21,8 @@ else:
|
||||
_BaseUserDict = UserDict
|
||||
|
||||
class SanitizerConfig(_BaseUserDict):
|
||||
""" Dictionary with configuration options for a sanitizer.
|
||||
|
||||
""" The `SanitizerConfig` class is a read-only dictionary
|
||||
with configuration options for the sanitizer.
|
||||
In addition to the usual dictionary functions, the class provides
|
||||
accessors to standard sanitizer options that are used by many of the
|
||||
sanitizers.
|
||||
@@ -30,10 +30,16 @@ class SanitizerConfig(_BaseUserDict):
|
||||
|
||||
def get_string_list(self, param: str, default: Sequence[str] = tuple()) -> Sequence[str]:
|
||||
""" Extract a configuration parameter as a string list.
|
||||
If the parameter value is a simple string, it is returned as a
|
||||
one-item list. If the parameter value does not exist, the given
|
||||
default is returned. If the parameter value is a list, it is checked
|
||||
to contain only strings before being returned.
|
||||
|
||||
Arguments:
|
||||
param: Name of the configuration parameter.
|
||||
default: Value to return, when the parameter is missing.
|
||||
|
||||
Returns:
|
||||
If the parameter value is a simple string, it is returned as a
|
||||
one-item list. If the parameter value does not exist, the given
|
||||
default is returned. If the parameter value is a list, it is
|
||||
checked to contain only strings before being returned.
|
||||
"""
|
||||
values = self.data.get(param, None)
|
||||
|
||||
@@ -54,9 +60,16 @@ class SanitizerConfig(_BaseUserDict):
|
||||
|
||||
def get_bool(self, param: str, default: Optional[bool] = None) -> bool:
|
||||
""" Extract a configuration parameter as a boolean.
|
||||
The parameter must be one of the yaml boolean values or an
|
||||
user error will be raised. If `default` is given, then the parameter
|
||||
may also be missing or empty.
|
||||
|
||||
Arguments:
|
||||
param: Name of the configuration parameter. The parameter must
|
||||
contain one of the yaml boolean values or an
|
||||
UsageError will be raised.
|
||||
default: Value to return, when the parameter is missing.
|
||||
When set to `None`, the parameter must be defined.
|
||||
|
||||
Returns:
|
||||
Boolean value of the given parameter.
|
||||
"""
|
||||
value = self.data.get(param, default)
|
||||
|
||||
@@ -67,15 +80,20 @@ class SanitizerConfig(_BaseUserDict):
|
||||
|
||||
|
||||
def get_delimiter(self, default: str = ',;') -> Pattern[str]:
|
||||
""" Return the 'delimiter' parameter in the configuration as a
|
||||
compiled regular expression that can be used to split the names on the
|
||||
delimiters. The regular expression makes sure that the resulting names
|
||||
are stripped and that repeated delimiters
|
||||
are ignored but it will still create empty fields on occasion. The
|
||||
code needs to filter those.
|
||||
""" Return the 'delimiters' parameter in the configuration as a
|
||||
compiled regular expression that can be used to split names on these
|
||||
delimiters.
|
||||
|
||||
The 'default' parameter defines the delimiter set to be used when
|
||||
not explicitly configured.
|
||||
Arguments:
|
||||
default: Delimiters to be used, when 'delimiters' parameter
|
||||
is not explicitly configured.
|
||||
|
||||
Returns:
|
||||
A regular expression pattern, which can be used to
|
||||
split a string. The regular expression makes sure that the
|
||||
resulting names are stripped and that repeated delimiters
|
||||
are ignored. It may still create empty fields on occasion. The
|
||||
code needs to filter those.
|
||||
"""
|
||||
delimiter_set = set(self.data.get('delimiters', default))
|
||||
if not delimiter_set:
|
||||
@@ -86,13 +104,22 @@ class SanitizerConfig(_BaseUserDict):
|
||||
|
||||
def get_filter_kind(self, *default: str) -> Callable[[str], bool]:
|
||||
""" Return a filter function for the name kind from the 'filter-kind'
|
||||
config parameter. The filter functions takes a name item and returns
|
||||
True when the item passes the filter.
|
||||
config parameter.
|
||||
|
||||
If the parameter is empty, the filter lets all items pass. If the
|
||||
parameter is a string, it is interpreted as a single regular expression
|
||||
that must match the full kind string. If the parameter is a list then
|
||||
If the 'filter-kind' parameter is empty, the filter lets all items
|
||||
pass. If the parameter is a string, it is interpreted as a single
|
||||
regular expression that must match the full kind string.
|
||||
If the parameter is a list then
|
||||
any of the regular expressions in the list must match to pass.
|
||||
|
||||
Arguments:
|
||||
default: Filters to be used, when the 'filter-kind' parameter
|
||||
is not specified. If omitted then the default is to
|
||||
let all names pass.
|
||||
|
||||
Returns:
|
||||
A filter function which takes a name string and returns
|
||||
True when the item passes the filter.
|
||||
"""
|
||||
filters = self.get_string_list('filter-kind', default)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user