split code into submodules

2026-02-26 11:08:13 +00:00 · 2024-05-16 11:55:17 +02:00
parent 0fb4fe8e4d
commit 6e89310a92
137 changed files with 757 additions and 716 deletions
--- a/src/nominatim_db/tokenizer/token_analysis/init.py
+++ b/src/nominatim_db/tokenizer/token_analysis/init.py
--- a/src/nominatim_db/tokenizer/token_analysis/base.py
+++ b/src/nominatim_db/tokenizer/token_analysis/base.py
@@ -0,0 +1,96 @@
+# SPDX-License-Identifier: GPL-3.0-or-later
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2024 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Common data types and protocols for analysers.
+"""
+from typing import Mapping, List, Any
+
+from nominatim_core.typing import Protocol
+from ...data.place_name import PlaceName
+
+class Analyzer(Protocol):
+    """ The `create()` function of an analysis module needs to return an
+        object that implements the following functions.
+
+        The methods below only declare the interface. They consist of a
+        docstring and carry no implementation.
+    """
+
+    def get_canonical_id(self, name: PlaceName) -> str:
+        """ Return the canonical form of the given name. The canonical ID must
+            be unique (the same ID must always yield the same variants) and
+            must be a form from which the variants can be derived.
+
+            Arguments:
+                name: Extended place name description as prepared by
+                      the sanitizers.
+
+            Returns:
+                ID string with a canonical form of the name. The string may
+                    be empty, when the analyzer cannot analyze the name at all,
+                    for example because the character set in use does not match.
+        """
+
+    def compute_variants(self, canonical_id: str) -> List[str]:
+        """ Compute the transliterated spelling variants for the given
+            canonical ID.
+
+            Arguments:
+                canonical_id: ID string previously computed with
+                              `get_canonical_id()`.
+
+            Returns:
+                A list of possible spelling variants. All strings must have
+                    been transformed with the global normalizer and
+                    transliterator ICU rules. Otherwise they cannot be matched
+                    against the input by the query frontend.
+                    The list may be empty, when there are no useful
+                    spelling variants. This may happen when an analyzer
+                    usually only outputs variants in addition to the
+                    canonical spelling and there are none for the given ID.
+        """
+
+
+class AnalysisModule(Protocol):
+    """ The setup of the token analysis is split into two parts:
+        configuration and analyser factory. A token analysis module must
+        therefore implement the two functions described here.
+
+        Both functions are declarations only: they consist of a docstring
+        and carry no implementation.
+    """
+
+    def configure(self, rules: Mapping[str, Any],
+                  normalizer: Any, transliterator: Any) -> Any:
+        """ Prepare the configuration of the analysis module.
+            This function should prepare all data that can be shared
+            between instances of this analyser.
+
+            Arguments:
+                rules: A dictionary with the additional configuration options
+                       as specified in the tokenizer configuration.
+                normalizer: an ICU Transliterator with the compiled
+                            global normalization rules.
+                transliterator: an ICU Transliterator with the compiled
+                                global transliteration rules.
+
+            Returns:
+                A data object with configuration data. This will be handed
+                    as is into the `create()` function and may be
+                    used freely by the analysis module as needed.
+        """
+
+    def create(self, normalizer: Any, transliterator: Any, config: Any) -> Analyzer:
+        """ Create a new instance of the analyser.
+            A separate instance of the analyser is created for each thread
+            when used in multi-threading context.
+
+            Arguments:
+                normalizer: an ICU Transliterator with the compiled normalization
+                            rules.
+                transliterator: an ICU Transliterator with the compiled
+                                transliteration rules.
+                config: The object that was returned by the call to configure().
+
+            Returns:
+                A new analyzer instance. This must be an object that implements
+                    the Analyzer protocol.
+        """
--- a/src/nominatim_db/tokenizer/token_analysis/config_variants.py
+++ b/src/nominatim_db/tokenizer/token_analysis/config_variants.py
@@ -0,0 +1,139 @@
+# SPDX-License-Identifier: GPL-3.0-or-later
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2024 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Parser for configuration for variants.
+"""
+from typing import Any, Iterator, Tuple, List, Optional, Set, NamedTuple
+from collections import defaultdict
+import itertools
+import re
+
+from nominatim_core.config import flatten_config_list
+from nominatim_core.errors import UsageError
+
+class ICUVariant(NamedTuple):
+    """ A single replacement rule for variant creation.
+
+        Instances are hashable tuples, which allows collecting them
+        in a set to remove duplicate rules.
+    """
+    # Text that is looked for in a name.
+    source: str
+    # Text that is put in place of the source.
+    replacement: str
+
+
+def get_variant_config(in_rules: Any,
+                       normalizer: Any) -> Tuple[List[Tuple[str, List[str]]], str]:
+    """ Convert the variant definition from the configuration into
+        replacement sets.
+
+        Arguments:
+            in_rules: Variant definition as found in the configuration.
+                      When it is empty or None, no rules are produced.
+            normalizer: Transliterator object that is handed on to the
+                        variant maker for normalizing the text of the rules.
+
+        Returns:
+            A tuple containing the replacement set and the characters in use.
+                The replacement set is a list of pairs made up of a source
+                term and the list of all its replacements. The characters
+                are returned as a single string which holds each character
+                that appears in a source term once, in no particular order.
+    """
+    immediate = defaultdict(list)
+    chars: Set[str] = set()
+
+    if in_rules:
+        # Collecting into a set drops rules that are produced more than once.
+        vset: Set[ICUVariant] = set()
+        rules = flatten_config_list(in_rules, 'variants')
+
+        vmaker = _VariantMaker(normalizer)
+
+        for section in rules:
+            # A section without a 'words' entry or with an empty one
+            # contributes no rules.
+            for rule in (section.get('words') or []):
+                vset.update(vmaker.compute(rule))
+
+        # Intermediate reorder by source. Also compute required character set.
+        for variant in vset:
+            # When source and replacement both end with a blank, the blank
+            # is removed from the replacement only.
+            if variant.source[-1] == ' ' and variant.replacement[-1] == ' ':
+                replstr = variant.replacement[:-1]
+            else:
+                replstr = variant.replacement
+            immediate[variant.source].append(replstr)
+            # Only the characters of the source terms are recorded.
+            chars.update(variant.source)
+
+    return list(immediate.items()), ''.join(chars)
+
+
+class _VariantMaker:
+    """ Generator for all necessary ICUVariants from a single variant rule.
+
+        All text in rules is normalized to make sure the variants match later.
+    """
+
+    def __init__(self, normalizer: Any) -> None:
+        # Transliterator used to normalize source and replacement terms.
+        self.norm = normalizer
+
+
+    def compute(self, rule: Any) -> Iterator[ICUVariant]:
+        """ Generator for all ICUVariant tuples from a single variant rule.
+
+            A rule consists of a comma-separated list of source terms, one
+            of the operators `=>`, `->`, `|=>` or `|->` and a comma-separated
+            list of replacement terms. With `->` each source term is
+            additionally kept as a variant of itself. A leading `|` switches
+            off the additional decomposition variants for terms that are
+            marked with `~`.
+
+            Raises:
+                UsageError: when the rule does not contain exactly
+                            one operator.
+        """
+        # With two capture groups a single match results in exactly four
+        # parts: sources, optional '|', '=' or '-', replacements.
+        parts = re.split(r'(\|)?([=-])>', rule)
+        if len(parts) != 4:
+            raise UsageError(f"Syntax error in variant rule: {rule}")
+
+        # The group for '|' is None when the bar was not given.
+        decompose = parts[1] is None
+        src_terms = [self._parse_variant_word(t) for t in parts[0].split(',')]
+        # Lazily evaluated. It is consumed exactly once by the call to
+        # itertools.product() below.
+        repl_terms = (self.norm.transliterate(t).strip() for t in parts[3].split(','))
+
+        # If the source should be kept, add a 1:1 replacement
+        if parts[2] == '-':
+            for src in src_terms:
+                # Source terms that normalize to nothing are None. Skip them.
+                if src:
+                    for froms, tos in _create_variants(*src, src[0], decompose):
+                        yield ICUVariant(froms, tos)
+
+        for src, repl in itertools.product(src_terms, repl_terms):
+            # Pairs with an empty source or empty replacement are dropped.
+            if src and repl:
+                for froms, tos in _create_variants(*src, repl, decompose):
+                    yield ICUVariant(froms, tos)
+
+
+    def _parse_variant_word(self, name: str) -> Optional[Tuple[str, str, str]]:
+        """ Split a source term into its text and its leading and
+            trailing flag.
+
+            The term may start with `~` or `^` and end with `~` or `$`.
+            The text in between must not contain any of `~`, `$` and `^`.
+
+            Returns:
+                A tuple of the normalized text, the leading flag and the
+                    trailing flag, where a missing flag is an empty string.
+                    None when nothing remains of the text after normalization.
+
+            Raises:
+                UsageError: when the term does not follow the syntax above
+                            or has a `~` on both ends.
+        """
+        name = name.strip()
+        match = re.fullmatch(r'([~^]?)([^~$^]*)([~$]?)', name)
+        if match is None or (match.group(1) == '~' and match.group(3) == '~'):
+            raise UsageError(f"Invalid variant word descriptor '{name}'")
+        norm_name = self.norm.transliterate(match.group(2)).strip()
+        if not norm_name:
+            return None
+
+        return norm_name, match.group(1), match.group(3)
+
+
+# Text that is attached to a term for each of the boundary flags:
+# '^' is attached in front of the term, '$' behind it and the entry for
+# the empty string is used on either side when no flag was given.
+# There is no entry for the '~' flag.
+_FLAG_MATCH = {'^': '^ ',
+               '$': ' ^',
+               '': ' '}
+
+
+def _create_variants(src: str, preflag: str, postflag: str,
+                     repl: str, decompose: bool) -> Iterator[Tuple[str, str]]:
+    """ Generate the pairs of source and replacement string for
+        a single source term and a single replacement term.
+
+        Arguments:
+            src: Text of the source term.
+            preflag: Flag in front of the source term: '~', '^' or
+                     the empty string.
+            postflag: Flag behind the source term: '~', '$' or
+                      the empty string.
+            repl: Text of the replacement term.
+            decompose: When True, terms flagged with '~' additionally
+                       get variants where only one of source and replacement
+                       has a blank on the side of the '~'.
+
+        Yields:
+            Tuples of source string and replacement string.
+    """
+    if preflag == '~':
+        postfix = _FLAG_MATCH[postflag]
+        # suffix decomposition
+        src = src + postfix
+        repl = repl + postfix
+
+        # Once without and once with a blank in front of both strings.
+        yield src, repl
+        yield ' ' + src, ' ' + repl
+
+        if decompose:
+            # The blank in front is added or removed by the replacement.
+            yield src, ' ' + repl
+            yield ' ' + src, repl
+    elif postflag == '~':
+        # prefix decomposition
+        prefix = _FLAG_MATCH[preflag]
+        src = prefix + src
+        repl = prefix + repl
+
+        # Once without and once with a blank behind both strings.
+        yield src, repl
+        yield src + ' ', repl + ' '
+
+        if decompose:
+            # The blank behind is added or removed by the replacement.
+            yield src, repl + ' '
+            yield src + ' ', repl
+    else:
+        # No '~' on either side: a single variant framed by the flag texts.
+        prefix = _FLAG_MATCH[preflag]
+        postfix = _FLAG_MATCH[postflag]
+
+        yield prefix + src + postfix, prefix + repl + postfix
--- a/src/nominatim_db/tokenizer/token_analysis/generic.py
+++ b/src/nominatim_db/tokenizer/token_analysis/generic.py
@@ -0,0 +1,150 @@
+# SPDX-License-Identifier: GPL-3.0-or-later
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2024 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Generic processor for names that creates abbreviation variants.
+"""
+from typing import Mapping, Dict, Any, Iterable, Iterator, Optional, List, cast
+import itertools
+
+import datrie
+
+from nominatim_core.errors import UsageError
+from ...data.place_name import PlaceName
+from .config_variants import get_variant_config
+from .generic_mutation import MutationVariantGenerator
+
+### Configuration section
+
+def configure(rules: Mapping[str, Any], normalizer: Any, _: Any) -> Dict[str, Any]:
+    """ Extract and preprocess the configuration for this module.
+
+        Arguments:
+            rules: Configuration options for the module. The keys 'variants',
+                   'mode' and 'mutations' are read.
+            normalizer: Transliterator object that is used to normalize
+                        the variant rules.
+            _: Unused third argument.
+
+        Returns:
+            A dictionary with the entries 'replacements' and 'chars'
+                (the result of `get_variant_config()`), 'variant_only'
+                (True when the mode is 'variant-only') and 'mutations'
+                (a list of tuples of pattern and replacement list).
+
+        Raises:
+            UsageError: when a mutation rule lacks the field 'pattern' or
+                        'replacements' or when one of the fields has
+                        the wrong type.
+    """
+    config: Dict[str, Any] = {}
+
+    config['replacements'], config['chars'] = get_variant_config(rules.get('variants'),
+                                                                 normalizer)
+    config['variant_only'] = rules.get('mode', '') == 'variant-only'
+
+    # parse mutation rules
+    config['mutations'] = []
+    # NOTE(review): the default only applies when the key is missing. A key
+    # 'mutations' that is present with a null value would yield None here
+    # and fail on iteration - confirm whether that case needs handling.
+    for rule in rules.get('mutations', []):
+        if 'pattern' not in rule:
+            raise UsageError("Missing field 'pattern' in mutation configuration.")
+        if not isinstance(rule['pattern'], str):
+            raise UsageError("Field 'pattern' in mutation configuration "
+                             "must be a simple text field.")
+        if 'replacements' not in rule:
+            raise UsageError("Missing field 'replacements' in mutation configuration.")
+        if not isinstance(rule['replacements'], list):
+            raise UsageError("Field 'replacements' in mutation configuration "
+                             "must be a list of texts.")
+
+        config['mutations'].append((rule['pattern'], rule['replacements']))
+
+    return config
+
+
+### Analysis section
+
+def create(normalizer: Any, transliterator: Any,
+           config: Mapping[str, Any]) -> 'GenericTokenAnalysis':
+    """ Create a new token analysis instance for this module.
+
+        Arguments:
+            normalizer: Transliterator object used for normalizing names.
+            transliterator: Transliterator object used for transliterating
+                            the final variants.
+            config: Configuration with the entries 'replacements', 'chars',
+                    'variant_only' and 'mutations'.
+
+        Returns:
+            A new GenericTokenAnalysis object set up with the arguments.
+    """
+    return GenericTokenAnalysis(normalizer, transliterator, config)
+
+
+class GenericTokenAnalysis:
+    """ Collects the different transformation rules for normalisation of names
+        and provides the functions to apply the transformations.
+    """
+
+    def __init__(self, norm: Any, to_ascii: Any, config: Mapping[str, Any]) -> None:
+        """ Set up the analyser.
+
+            Arguments:
+                norm: Transliterator object used for normalizing names.
+                to_ascii: Transliterator object applied to each variant
+                          before it is returned.
+                config: Configuration with the entries 'replacements',
+                        'chars', 'variant_only' and 'mutations'.
+        """
+        self.norm = norm
+        self.to_ascii = to_ascii
+        self.variant_only = config['variant_only']
+
+        # Set up datrie
+        if config['replacements']:
+            # The trie is created from the string of characters
+            # that comes with the configuration.
+            self.replacements = datrie.Trie(config['chars'])
+            for src, repllist in config['replacements']:
+                self.replacements[src] = repllist
+        else:
+            # None marks that there are no replacement rules at all.
+            self.replacements = None
+
+        # set up mutation rules
+        self.mutations = [MutationVariantGenerator(*cfg) for cfg in config['mutations']]
+
+
+    def get_canonical_id(self, name: PlaceName) -> str:
+        """ Return the normalized form of the name. This is the standard form
+            from which possible variants for the name can be derived.
+        """
+        return cast(str, self.norm.transliterate(name.name)).strip()
+
+
+    def compute_variants(self, norm_name: str) -> List[str]:
+        """ Compute the spelling variants for the given normalized name
+            and transliterate the result.
+
+            The variants from the replacement rules are run through each
+            mutation in the configured order. Variants that are empty
+            after transliteration are left out.
+        """
+        variants = self._generate_word_variants(norm_name)
+
+        # Each mutation works on the output of the previous one.
+        for mutation in self.mutations:
+            variants = mutation.generate(variants)
+
+        return [name for name in self._transliterate_unique_list(norm_name, variants) if name]
+
+
+    def _transliterate_unique_list(self, norm_name: str,
+                                   iterable: Iterable[str]) -> Iterator[Optional[str]]:
+        """ Strip and transliterate the given variants, skipping variants
+            that have been seen before. Duplicates are detected on the
+            stripped variant before it is transliterated.
+        """
+        seen = set()
+        if self.variant_only:
+            # Marking the name itself as seen keeps it out of the result.
+            seen.add(norm_name)
+
+        for variant in map(str.strip, iterable):
+            if variant not in seen:
+                seen.add(variant)
+                yield self.to_ascii.transliterate(variant).strip()
+
+
+    def _generate_word_variants(self, norm_name: str) -> Iterable[str]:
+        """ Apply the replacement rules to the given normalized name and
+            return all resulting variants. Only the name itself is returned
+            when no rule matches or when too many variants come up.
+        """
+        # The name is framed with '^ ' and ' ^' for matching.
+        baseform = '^ ' + norm_name + ' ^'
+        baselen = len(baseform)
+        # Beginnings of the variants as far as they have been built up.
+        partials = ['']
+
+        # Position in baseform up to which the text is covered by partials.
+        startpos = 0
+        if self.replacements is not None:
+            pos = 0
+            force_space = False
+            while pos < baselen:
+                # Find the longest source term that starts at pos.
+                full, repl = self.replacements.longest_prefix_item(baseform[pos:],
+                                                                   (None, None))
+                if full is not None:
+                    # Text between the previous match and this one is
+                    # taken over as it is.
+                    done = baseform[startpos:pos]
+                    # After a match that ended in a blank, only replacements
+                    # are used that start with a blank.
+                    partials = [v + done + r
+                                for v, r in itertools.product(partials, repl)
+                                if not force_space or r.startswith(' ')]
+                    if len(partials) > 128:
+                        # If too many variants are produced, they are unlikely
+                        # to be helpful. Only use the original term.
+                        startpos = 0
+                        break
+                    startpos = pos + len(full)
+                    if full[-1] == ' ':
+                        # Step back onto the blank, so that it is available
+                        # for the next match again.
+                        startpos -= 1
+                        force_space = True
+                    pos = startpos
+                else:
+                    pos += 1
+                    force_space = False
+
+        # No variants detected? Fast return.
+        if startpos == 0:
+            return (norm_name, )
+
+        if startpos < baselen:
+            # Add the remaining text without the last character of the
+            # base form. The first character of each partial is cut off.
+            return (part[1:] + baseform[startpos:-1] for part in partials)
+
+        # The base form is covered completely: cut off the first and
+        # the last character of each partial.
+        return (part[1:-1] for part in partials)
--- a/src/nominatim_db/tokenizer/token_analysis/generic_mutation.py
+++ b/src/nominatim_db/tokenizer/token_analysis/generic_mutation.py
@@ -0,0 +1,57 @@
+# SPDX-License-Identifier: GPL-3.0-or-later
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2024 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Creator for mutation variants for the generic token analysis.
+"""
+from typing import Sequence, Iterable, Iterator, Tuple
+import itertools
+import logging
+import re
+
+from nominatim_core.errors import UsageError
+
+# getLogger() is called without a name, which returns the root logger.
+LOG = logging.getLogger()
+
+def _zigzag(outer: Iterable[str], inner: Iterable[str]) -> Iterator[str]:
+    """ Return an iterator that alternates between the elements of
+        `outer` and `inner`, starting with `outer`. When one of the two
+        runs out before the other, empty strings are used in its place.
+    """
+    return itertools.chain.from_iterable(itertools.zip_longest(outer, inner, fillvalue=''))
+
+
+class MutationVariantGenerator:
+    """ Generates name variants by applying a regular expression to the name
+        and replacing it with one or more variants. When the regular expression
+        matches more than once, each occurrence is replaced with all replacement
+        patterns.
+    """
+
+    def __init__(self, pattern: str, replacements: Sequence[str]):
+        """ Create a generator for the given pattern.
+
+            Arguments:
+                pattern: Regular expression to look for in the names.
+                         It must not contain capturing groups.
+                replacements: Strings to put in place of each match.
+
+            Raises:
+                UsageError: when the pattern contains a capturing group.
+        """
+        self.pattern = re.compile(pattern)
+        self.replacements = replacements
+
+        # generate() relies on split() returning only the text between
+        # the matches. Capturing groups would add the group content.
+        if self.pattern.groups > 0:
+            LOG.fatal("The mutation pattern %s contains a capturing group. "
+                      "This is not allowed.", pattern)
+            raise UsageError("Bad mutation pattern in configuration.")
+
+
+    def generate(self, names: Iterable[str]) -> Iterator[str]:
+        """ Generator function for the name variants. 'names' is an iterable
+            over a set of names for which the variants are to be generated.
+
+            Names without a match of the pattern are handed through
+            unchanged.
+        """
+        for name in names:
+            parts = self.pattern.split(name)
+            if len(parts) == 1:
+                # The pattern did not match anywhere.
+                yield name
+            else:
+                # Put the parts together again with every possible
+                # combination of replacements in between.
+                for seps in self._fillers(len(parts)):
+                    yield ''.join(_zigzag(parts, seps))
+
+
+    def _fillers(self, num_parts: int) -> Iterator[Tuple[str, ...]]:
+        """ Returns a generator for strings to join the given number of string
+            parts in all possible combinations.
+
+            Each item is a tuple with one replacement for each of the
+            `num_parts - 1` gaps between the parts.
+        """
+        return itertools.product(self.replacements, repeat=num_parts - 1)
--- a/src/nominatim_db/tokenizer/token_analysis/housenumbers.py
+++ b/src/nominatim_db/tokenizer/token_analysis/housenumbers.py
@@ -0,0 +1,70 @@
+# SPDX-License-Identifier: GPL-3.0-or-later
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2024 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Specialized processor for housenumbers. Analyses common housenumber patterns
+and creates variants for them.
+"""
+from typing import Any, List, cast
+import re
+
+from ...data.place_name import PlaceName
+from .generic_mutation import MutationVariantGenerator
+
+# Any character other than the digits 0 to 9.
+RE_NON_DIGIT = re.compile('[^0-9]')
+# A digit followed, after optional whitespace, by a character that is
+# neither a digit, whitespace nor the marker '␣'.
+RE_DIGIT_ALPHA = re.compile(r'(\d)\s*([^\d\s␣])')
+# The reverse case: a character that is neither whitespace, a digit nor
+# the marker '␣' followed, after optional whitespace, by a digit.
+RE_ALPHA_DIGIT = re.compile(r'([^\s\d␣])\s*(\d)')
+# Four lower-case letters from a to z in a row.
+RE_NAMED_PART = re.compile(r'[a-z]{4}')
+
+### Configuration section
+
+def configure(*_: Any) -> None:
+    """ All behaviour is currently hard-coded.
+
+        Any arguments are accepted and ignored. The function always
+        returns None.
+    """
+    return None
+
+### Analysis section
+
+def create(normalizer: Any, transliterator: Any, config: None) -> 'HousenumberTokenAnalysis': # pylint: disable=W0613
+    """ Create a new token analysis instance for this module.
+
+        Arguments:
+            normalizer: Transliterator object used for normalization.
+            transliterator: Transliterator object used for transliteration.
+            config: Not used by this module.
+
+        Returns:
+            A new HousenumberTokenAnalysis object.
+    """
+    return HousenumberTokenAnalysis(normalizer, transliterator)
+
+
+class HousenumberTokenAnalysis:
+    """ Detects common housenumber patterns and normalizes them.
+    """
+    def __init__(self, norm: Any, trans: Any) -> None:
+        """ Set up the analyser with the given transliterator objects for
+            normalization (`norm`) and transliteration (`trans`).
+        """
+        self.norm = norm
+        self.trans = trans
+
+        # Creates the variants for the optional spaces: each marker '␣'
+        # becomes either a blank or nothing.
+        self.mutator = MutationVariantGenerator('␣', (' ', ''))
+
+    def get_canonical_id(self, name: PlaceName) -> str:
+        """ Return the normalized form of the housenumber.
+
+            Housenumbers made up of the digits 0 to 9 only are returned
+            unchanged. All others are normalized and transliterated. In
+            the result, positions where a digit and a non-digit meet may be
+            marked as an optional space with '␣'.
+        """
+        # shortcut for number-only numbers, which make up 90% of the data.
+        if RE_NON_DIGIT.search(name.name) is None:
+            return name.name
+
+        norm = cast(str, self.trans.transliterate(self.norm.transliterate(name.name)))
+        # If there is a significant non-numeric part, use as is.
+        if RE_NAMED_PART.search(norm) is None:
+            # Otherwise add optional spaces between digits and letters.
+            # Whitespace that is already present there is replaced
+            # by the marker.
+            (norm_opt, cnt1) = RE_DIGIT_ALPHA.subn(r'\1␣\2', norm)
+            (norm_opt, cnt2) = RE_ALPHA_DIGIT.subn(r'\1␣\2', norm_opt)
+            # Avoid creating too many variants per number.
+            if cnt1 + cnt2 <= 4:
+                return norm_opt
+
+        # Falls back to the form without any optional spaces.
+        return norm
+
+    def compute_variants(self, norm_name: str) -> List[str]:
+        """ Compute the spelling variants for the given normalized housenumber.
+
+            Generates variants for optional spaces (marked with '␣').
+            A housenumber without the marker results in a list with
+            the housenumber itself as the only entry.
+        """
+        return list(self.mutator.generate([norm_name]))
--- a/src/nominatim_db/tokenizer/token_analysis/postcodes.py
+++ b/src/nominatim_db/tokenizer/token_analysis/postcodes.py
@@ -0,0 +1,65 @@
+# SPDX-License-Identifier: GPL-3.0-or-later
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2022 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Specialized processor for postcodes. Supports a 'lookup' variant of the
+token, which produces variants with optional spaces.
+"""
+from typing import Any, List
+
+from ...data.place_name import PlaceName
+from .generic_mutation import MutationVariantGenerator
+
+### Configuration section
+
+def configure(*_: Any) -> None:
+    """ All behaviour is currently hard-coded.
+
+        Any arguments are accepted and ignored. The function always
+        returns None.
+    """
+    return None
+
+### Analysis section
+
+def create(normalizer: Any, transliterator: Any, config: None) -> 'PostcodeTokenAnalysis': # pylint: disable=W0613
+    """ Create a new token analysis instance for this module.
+
+        Arguments:
+            normalizer: Transliterator object used for normalization.
+            transliterator: Transliterator object used for transliteration.
+            config: Not used by this module.
+
+        Returns:
+            A new PostcodeTokenAnalysis object.
+    """
+    return PostcodeTokenAnalysis(normalizer, transliterator)
+
+
+class PostcodeTokenAnalysis:
+    """ Special normalization and variant generation for postcodes.
+
+        This analyser must not be used with anything but postcodes as
+        it follows some special rules: the canonical ID is the form that
+        is used for the output. `compute_variants` then needs to ensure that
+        the generated variants once more follow the standard normalization
+        and transliteration, so that postcodes are correctly recognised by
+        the search algorithm.
+    """
+    def __init__(self, norm: Any, trans: Any) -> None:
+        """ Set up the analyser with the given transliterator objects for
+            normalization (`norm`) and transliteration (`trans`).
+        """
+        self.norm = norm
+        self.trans = trans
+
+        # Creates the variants with optional spaces: each blank is
+        # either kept or removed.
+        self.mutator = MutationVariantGenerator(' ', (' ', ''))
+
+
+    def get_canonical_id(self, name: PlaceName) -> str:
+        """ Return the standard form of the postcode.
+
+            This is the name in upper case without surrounding whitespace.
+            The transliterator objects are not used here.
+        """
+        return name.name.strip().upper()
+
+
+    def compute_variants(self, norm_name: str) -> List[str]:
+        """ Compute the spelling variants for the given normalized postcode.
+
+            Takes the canonical form of the postcode, normalizes it using the
+            standard rules and then creates variants of the result where
+            all spaces are optional.
+        """
+        # Postcodes follow their own transliteration rules.
+        # Make sure at this point, that the terms are normalized in a way
+        # that they are searchable with the standard transliteration rules.
+        # The check for empty terms is done before the transliteration.
+        return [self.trans.transliterate(term) for term in
+                self.mutator.generate([self.norm.transliterate(norm_name)]) if term]