add documentation for new configuration of ICU tokenizer

2026-03-08 02:54:08 +00:00 · 2021-10-07 11:55:53 +02:00
parent 2a94bfc703
commit 6c79a60e19
4 changed files with 159 additions and 56 deletions
--- a/nominatim/tokenizer/sanitizers/split_name_list.py
+++ b/nominatim/tokenizer/sanitizers/split_name_list.py
@@ -1,5 +1,9 @@
 """
-Name processor that splits name values with multiple values into their components.
+Sanitizer that splits lists of names into their components.
+
+Arguments:
+    delimiters: Define the set of characters to be used for
+                splitting the list. (default: `,;`)
 """
 import re

@@ -7,9 +11,7 @@ from nominatim.errors import UsageError

 def create(func):
    """ Create a name processing function that splits name values with
-        multiple values into their components. The optional parameter
-        'delimiters' can be used to define the characters that should be used
-        for splitting. The default is ',;'.
+        multiple values into their components.
    """
    delimiter_set = set(func.get('delimiters', ',;'))
    if not delimiter_set:
--- a/nominatim/tokenizer/sanitizers/strip_brace_terms.py
+++ b/nominatim/tokenizer/sanitizers/strip_brace_terms.py
@@ -1,11 +1,12 @@
 """
-Sanitizer handling names with addendums in braces.
+This sanitizer creates additional name variants for names that have
+addendums in brackets (e.g. "Halle (Saale)"). The additional variant contains
+only the main name part with the bracket part removed.
 """

 def create(_):
    """ Create a name processing function that creates additional name variants
-        when a name has an addendum in brackets (e.g. "Halle (Saale)"). The
-        additional variant only contains the main name without the bracket part.
+        for bracket addendums.
    """
    def _process(obj):
        """ Add variants for names that have a bracket extension.
--- a/nominatim/tokenizer/sanitizers/tag_analyzer_by_language.py
+++ b/nominatim/tokenizer/sanitizers/tag_analyzer_by_language.py
@@ -1,5 +1,28 @@
 """
-Name processor for tagging the langauge of the name
+This sanitizer sets the `analyzer` property depending on the
+language of the tag. The language is taken from the suffix of the name.
+If a name already has an analyzer tagged, then this is kept.
+
+Arguments:
+
+    filter-kind: Restrict the sanitizer to names with the given tags.
+                 The parameter expects a list of regular expressions
+                 which are matched against `kind`. Note that each
+                 expression must match the full string.
+    whitelist: Restrict the set of languages that should be tagged.
+               Expects a list of acceptable suffixes. When unset,
+               all 2- and 3-letter lower-case codes are accepted.
+    use-defaults:  Configure what happens when the name has no suffix.
+                   When set to 'all', a variant is created for
+                   each of the default languages in the country
+                   the feature is in. When set to 'mono', a variant is
+                   only created when exactly one language is spoken
+                   in the country. The default is to do nothing with
+                   the default languages of a country.
+    mode: Define how the variants are created. May be 'replace' or
+          'append'. When set to 'append', the original name (without
+          any analyzer tagged) is retained. (default: replace)
+
 """
 import re

@@ -75,24 +98,6 @@ class _AnalyzerByLanguage:

 def create(config):
    """ Create a function that sets the analyzer property depending on the
-        language of the tag. The language is taken from the suffix.
-
-        To restrict the set of languages that should be tagged, use
-        'whitelist'. A list of acceptable suffixes. When unset, all 2- and
-        3-letter codes are accepted.
-
-        'use-defaults' configures what happens when the name has no suffix
-        with a language tag. When set to 'all', a variant is created for
-        each on the spoken languages in the country the feature is in. When
-        set to 'mono', a variant is created, when only one language is spoken
-        in the country. The default is, to do nothing with the default languages
-        of a country.
-
-        'mode' hay be 'replace' (the default) or 'append' and configures if
-        the original name (without any analyzer tagged) is retained.
-
-        With 'filter-kind' the set of names the sanitizer should be applied
-        to can be retricted to the given patterns of 'kind'. It expects a
-        list of regular expression to be matched against 'kind'.
+        language of the tag.
    """
    return _AnalyzerByLanguage(config)