mirror of
https://github.com/osm-search/Nominatim.git
synced 2026-03-08 02:54:08 +00:00
add documentation for new configuration of ICU tokenizer
This commit is contained in:
@@ -1,5 +1,9 @@
|
||||
"""
|
||||
Name processor that splits name values with multiple values into their components.
|
||||
Sanitizer that splits lists of names into their components.
|
||||
|
||||
Arguments:
|
||||
delimiters: Define the set of characters to be used for
|
||||
splitting the list. (default: `,;`)
|
||||
"""
|
||||
import re
|
||||
|
||||
@@ -7,9 +11,7 @@ from nominatim.errors import UsageError
|
||||
|
||||
def create(func):
|
||||
""" Create a name processing function that splits name values with
|
||||
multiple values into their components. The optional parameter
|
||||
'delimiters' can be used to define the characters that should be used
|
||||
for splitting. The default is ',;'.
|
||||
multiple values into their components.
|
||||
"""
|
||||
delimiter_set = set(func.get('delimiters', ',;'))
|
||||
if not delimiter_set:
|
||||
|
||||
@@ -1,11 +1,12 @@
|
||||
"""
|
||||
Sanitizer handling names with addendums in brackets.
|
||||
This sanitizer creates additional name variants for names that have
|
||||
addendums in brackets (e.g. "Halle (Saale)"). The additional variant contains
|
||||
only the main name part with the bracket part removed.
|
||||
"""
|
||||
|
||||
def create(_):
|
||||
""" Create a name processing function that creates additional name variants
|
||||
when a name has an addendum in brackets (e.g. "Halle (Saale)"). The
|
||||
additional variant only contains the main name without the bracket part.
|
||||
for bracket addendums.
|
||||
"""
|
||||
def _process(obj):
|
||||
""" Add variants for names that have a bracket extension.
|
||||
|
||||
@@ -1,5 +1,28 @@
|
||||
"""
|
||||
Name processor for tagging the language of the name
|
||||
This sanitizer sets the `analyzer` property depending on the
|
||||
language of the tag. The language is taken from the suffix of the name.
|
||||
If a name already has an analyzer tagged, then this is kept.
|
||||
|
||||
Arguments:
|
||||
|
||||
filter-kind: Restrict the names the sanitizer should be applied to
|
||||
to the given tags. The parameter expects a list of
|
||||
regular expressions which are matched against `kind`.
|
||||
Note that a match against the full string is expected.
|
||||
whitelist: Restrict the set of languages that should be tagged.
|
||||
Expects a list of acceptable suffixes. When unset,
|
||||
all 2- and 3-letter lower-case codes are accepted.
|
||||
use-defaults: Configure what happens when the name has no suffix.
|
||||
When set to 'all', a variant is created for
|
||||
each of the default languages in the country
|
||||
the feature is in. When set to 'mono', a variant is
|
||||
only created when exactly one language is spoken
|
||||
in the country. The default is to do nothing with
|
||||
the default languages of a country.
|
||||
mode: Define how the variants are created and may be 'replace' or
|
||||
'append'. When set to 'append' the original name (without
|
||||
any analyzer tagged) is retained. (default: replace)
|
||||
|
||||
"""
|
||||
import re
|
||||
|
||||
@@ -75,24 +98,6 @@ class _AnalyzerByLanguage:
|
||||
|
||||
def create(config):
|
||||
""" Create a function that sets the analyzer property depending on the
|
||||
language of the tag. The language is taken from the suffix.
|
||||
|
||||
To restrict the set of languages that should be tagged, use
|
||||
'whitelist'. A list of acceptable suffixes. When unset, all 2- and
|
||||
3-letter codes are accepted.
|
||||
|
||||
'use-defaults' configures what happens when the name has no suffix
|
||||
with a language tag. When set to 'all', a variant is created for
|
||||
each of the spoken languages in the country the feature is in. When
|
||||
set to 'mono', a variant is created when only one language is spoken
|
||||
in the country. The default is to do nothing with the default languages
|
||||
of a country.
|
||||
|
||||
'mode' may be 'replace' (the default) or 'append' and configures if
|
||||
the original name (without any analyzer tagged) is retained.
|
||||
|
||||
With 'filter-kind' the set of names the sanitizer should be applied
|
||||
to can be restricted to the given patterns of 'kind'. It expects a
|
||||
list of regular expressions to be matched against 'kind'.
|
||||
language of the tag.
|
||||
"""
|
||||
return _AnalyzerByLanguage(config)
|
||||
|
||||
Reference in New Issue
Block a user