apply variants by languages

Adds a tagger for names by language so that the analyzer of that language is used. Thus variants are now only applied to names in the specific language, and only to name tags, no longer to reference-like tags.
2021-10-05 17:18:10 +02:00
parent d35400a7d7
commit 97a10ec218
8 changed files with 307 additions and 46 deletions
--- a/nominatim/tokenizer/sanitizers/tag_analyzer_by_language.py
+++ b/nominatim/tokenizer/sanitizers/tag_analyzer_by_language.py
@@ -0,0 +1,100 @@
+"""
+Name processor for tagging the language of the name
+"""
+import re
+
+from nominatim.tools import country_info
+
+class _AnalyzerByLanguage:
+    """ Processor for tagging the language of names in a place.
+
+        The language is recorded in the 'analyzer' attribute of a name.
+        It is taken from the suffix of the name or, when the name has no
+        suffix and 'use-defaults' is enabled, from the default languages
+        of the country of the place.
+    """

+    def __init__(self, config):
+        """ Read the sanitizer settings from the dictionary-like `config`.
+
+            The keys 'filter-kind', 'use-defaults', 'mode' and 'whitelist'
+            are evaluated. All of them are optional.
+        """
+        kinds = config.get('filter-kind')
+        if kinds is None:
+            self.regexes = None
+        else:
+            # A single pattern given as a plain string must not be split
+            # into one regular expression per character.
+            if isinstance(kinds, str):
+                kinds = [kinds]
+            self.regexes = [re.compile(regex) for regex in kinds]

+        # Anything but 'mono' and 'all' disables the use of default languages.
+        self.use_defaults = config.get('use-defaults', 'no')
+        if self.use_defaults not in ('mono', 'all'):
+            self.use_defaults = False

+        # Any mode other than 'append' is handled as 'replace'.
+        self.replace = config.get('mode', 'replace') != 'append'
+        self.whitelist = config.get('whitelist')

+        # Compute the languages to use when no suffix is given.
+        self.deflangs = {}
+        for ccode, prop in country_info.iterate():
+            clangs = prop['languages']
+            if len(clangs) == 1 or self.use_defaults == 'all':
+                if self.whitelist:
+                    self.deflangs[ccode] = [l for l in clangs if l in self.whitelist]
+                else:
+                    self.deflangs[ccode] = clangs



+    def _kind_matches(self, kind):
+        """ Return True when `kind` is accepted by the 'filter-kind' patterns.
+            Without any configured pattern every kind is accepted.
+        """
+        if self.regexes is None:
+            return True

+        return any(regex.search(kind) for regex in self.regexes)


+    def _suffix_matches(self, suffix):
+        """ Return True when `suffix` is accepted as a language code.
+            Without a whitelist, any lower-case suffix of two or three
+            characters is accepted.
+        """
+        if self.whitelist is None:
+            return len(suffix) in (2, 3) and suffix.islower()

+        return suffix in self.whitelist


+    def _languages_for(self, name, obj):
+        """ Return the list of languages to tag `name` with. The result
+            is None or empty when the name should be left alone.
+        """
+        if name.suffix:
+            return [name.suffix] if self._suffix_matches(name.suffix) else None

+        if not self.use_defaults:
+            return None

+        # The country may be unknown or have no default languages
+        # registered, so the lookup may well yield None.
+        langs = self.deflangs.get(obj.place.country_code)
+        if self.use_defaults == 'mono' and langs and len(langs) > 1:
+            return None

+        return langs


+    def __call__(self, obj):
+        """ Tag the names of `obj` with their language.
+
+            Names that already have an 'analyzer' attribute or whose kind
+            does not match the filter are skipped. In replace mode the
+            name itself receives the first language. In append mode the
+            name is kept unchanged and a tagged copy is added instead.
+            Every further language always results in an additional copy.
+        """
+        if not obj.names:
+            return

+        more_names = []

+        for name in (n for n in obj.names
+                     if not n.has_attr('analyzer') and self._kind_matches(n.kind)):
+            # The languages must be determined anew for each name, so that
+            # a result from a previous name is never carried over.
+            langs = self._languages_for(name, obj)
+            if not langs:
+                continue

+            if self.replace:
+                name.set_attr('analyzer', langs[0])
+            else:
+                more_names.append(name.clone(attr={'analyzer': langs[0]}))

+            more_names.extend(name.clone(attr={'analyzer': l}) for l in langs[1:])

+        # New names are only added after the loop, so that the list is not
+        # modified while it is being iterated over.
+        obj.names.extend(more_names)
+
+
+def create(config):
+    """ Create a function that sets the analyzer property of names
+        depending on their language. The language is taken from the
+        suffix of the name. Names that already have an analyzer set are
+        left untouched.

+        'whitelist' restricts the set of languages that should be tagged.
+        It is a list of acceptable suffixes. When unset, all lower-case
+        2- and 3-letter codes are accepted.

+        'use-defaults' configures what happens when the name has no suffix
+        with a language tag. When set to 'all', a variant is created for
+        each of the spoken languages in the country the feature is in. When
+        set to 'mono', a variant is created only when a single language is
+        spoken in the country. The default is to do nothing with the
+        default languages of a country.

+        'mode' may be 'replace' (the default) or 'append' and configures
+        whether the original name (without any analyzer tagged) is retained.

+        With 'filter-kind' the set of names the sanitizer should be applied
+        to can be restricted to the given patterns of 'kind'. It expects a
+        list of regular expressions to be matched against 'kind'.
+    """
+    sanitizer = _AnalyzerByLanguage(config)
+    return sanitizer