Merge pull request #2585 from lonvia/name-mutations

Introduce character mutations to token analysis
2026-03-08 02:54:08 +00:00 · 2022-01-19 17:09:36 +01:00
parent d6b5f2f5da 3df560ea38
commit a7e048484b
7 changed files with 413 additions and 146 deletions
--- a/docs/customize/Tokenizers.md
+++ b/docs/customize/Tokenizers.md
@@ -99,6 +99,9 @@ token-analysis:
          - words:
              - road -> rd
              - bridge -> bdge,br,brdg,bri,brg
      mutations:
          - pattern: 'ä'
            replacements: ['ä', 'ae']
 ```
 The configuration file contains four sections:
@@ -205,12 +208,11 @@ the `analyzer` parameter must be set. Currently there is only one implementation
 ##### Generic token analyzer
 The generic analyzer is able to create variants from a list of given
-abbreviation and decomposition replacements. It takes one optional parameter
+abbreviation and decomposition replacements and to introduce spelling variations.
 `variants` which lists the replacements to apply. If the section is
 omitted, then the generic analyzer becomes a simple analyzer that only
 applies the transliteration.
-The variants section defines lists of replacements which create alternative
+###### Variants
 The optional 'variants' section defines lists of replacements which create alternative
 spellings of a name. To create the variants, a name is scanned from left to
 right and the longest matching replacement is applied until the end of the
 string is reached.
@@ -296,6 +298,32 @@ decomposition has an effect here on the source as well. So a rule
 means that for a word like `hauptstrasse` four variants are created:
 `hauptstrasse`, `haupt strasse`, `hauptstr` and `haupt str`.
 ###### Mutations
 The optional 'mutations' section in the configuration describes an additional set of
 replacements to be applied after the variants have been computed.
 Each mutation is described by two parameters: `pattern` and `replacements`.
 The pattern must contain a single regular expression to search for in the
 variant name. The regular expressions need to follow the syntax for
 [Python regular expressions](https://docs.python.org/3/library/re.html#regular-expression-syntax).
 Capturing groups are not permitted; use non-capturing groups `(?:...)` instead.
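 `replacements` must contain a list of strings that the pattern
 should be replaced with. Each occurrence of the pattern is replaced with
 all given replacements. To keep the original spelling as well, list the
 matched text itself among the replacements (as in `['ä', 'ae']`).
 Be mindful of combinatorial explosion of variants.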
 ###### Modes
 The generic analyser supports a special mode `variant-only`. When configured,
 it consumes the input token and emits only variants (if any exist). Enable
 the mode by adding:
 ```
  mode: variant-only
 ```
 to the analyser configuration.
 ### Reconfiguration
 Changing the configuration after the import is currently not possible, although
--- a/nominatim/tokenizer/token_analysis/config_variants.py
+++ b/nominatim/tokenizer/token_analysis/config_variants.py
@@ -0,0 +1,134 @@
 # SPDX-License-Identifier: GPL-2.0-only
 #
 # This file is part of Nominatim. (https://nominatim.org)
 #
 # Copyright (C) 2022 by the Nominatim developer community.
 # For a full list of authors see the git log.
 """
 Parser for configuration for variants.
 """
 from collections import defaultdict, namedtuple
 import itertools
 import re
 from icu import Transliterator
 from nominatim.config import flatten_config_list
 from nominatim.errors import UsageError
 # Record for one expanded variant rule: the text to match (`source`)
 # and the text to put in its place (`replacement`).
 ICUVariant = namedtuple('ICUVariant', ['source', 'replacement'])
 def get_variant_config(rules, normalization_rules):
     """ Turn the 'variants' part of the configuration into replacement sets.
         'rules' is the raw variant configuration (may be empty or None),
         'normalization_rules' are the ICU rules handed to the variant maker.
         Returns a tuple of two items: a list of (source, [replacements])
         pairs and a string made up of every character that occurs in one
         of the sources.
     """
     by_source = defaultdict(list)
     alphabet = set()
     if not rules:
         return list(by_source.items()), ''.join(alphabet)
     sections = flatten_config_list(rules, 'variants')
     maker = _VariantMaker(normalization_rules)
     unique_variants = set()
     for section in sections:
         for rule in (section.get('words') or []):
             unique_variants.update(maker.compute(rule))
     # Group the replacements by their source and collect the characters used.
     for source, replacement in unique_variants:
         # When both sides end in a space, the replacement is stored
         # without its trailing space.
         if source[-1] == ' ' and replacement[-1] == ' ':
             replacement = replacement[:-1]
         by_source[source].append(replacement)
         alphabet.update(source)
     return list(by_source.items()), ''.join(alphabet)
 class _VariantMaker:
    """ Generater for all necessary ICUVariants from a single variant rule.
        All text in rules is normalized to make sure the variants match later.
    """
    def __init__(self, norm_rules):
        self.norm = Transliterator.createFromRules("rule_loader_normalization",
                                                   norm_rules)
    def compute(self, rule):
        """ Generator for all ICUVariant tuples from a single variant rule.
        """
        parts = re.split(r'(\|)?([=-])>', rule)
        if len(parts) != 4:
            raise UsageError("Syntax error in variant rule: " + rule)
        decompose = parts[1] is None
        src_terms = [self._parse_variant_word(t) for t in parts[0].split(',')]
        repl_terms = (self.norm.transliterate(t).strip() for t in parts[3].split(','))
        # If the source should be kept, add a 1:1 replacement
        if parts[2] == '-':
            for src in src_terms:
                if src:
                    for froms, tos in _create_variants(*src, src[0], decompose):
                        yield ICUVariant(froms, tos)
        for src, repl in itertools.product(src_terms, repl_terms):
            if src and repl:
                for froms, tos in _create_variants(*src, repl, decompose):
                    yield ICUVariant(froms, tos)
    def _parse_variant_word(self, name):
        name = name.strip()
        match = re.fullmatch(r'([~^]?)([^~$^]*)([~$]?)', name)
        if match is None or (match.group(1) == '~' and match.group(3) == '~'):
            raise UsageError("Invalid variant word descriptor '{}'".format(name))
        norm_name = self.norm.transliterate(match.group(2)).strip()
        if not norm_name:
            return None
        return norm_name, match.group(1), match.group(3)
 _FLAG_MATCH = {'^': '^ ',
               '$': ' ^',
               '': ' '}
 def _create_variants(src, preflag, postflag, repl, decompose):
    if preflag == '~':
        postfix = _FLAG_MATCH[postflag]
        # suffix decomposition
        src = src + postfix
        repl = repl + postfix
        yield src, repl
        yield ' ' + src, ' ' + repl
        if decompose:
            yield src, ' ' + repl
            yield ' ' + src, repl
    elif postflag == '~':
        # prefix decomposition
        prefix = _FLAG_MATCH[preflag]
        src = prefix + src
        repl = prefix + repl
        yield src, repl
        yield src + ' ', repl + ' '
        if decompose:
            yield src, repl + ' '
            yield src + ' ', repl
    else:
        prefix = _FLAG_MATCH[preflag]
        postfix = _FLAG_MATCH[postflag]
        yield prefix + src + postfix, prefix + repl + postfix
--- a/nominatim/tokenizer/token_analysis/generic.py
+++ b/nominatim/tokenizer/token_analysis/generic.py
@@ -7,145 +7,44 @@
 """
 Generic processor for names that creates abbreviation variants.
 """
 from collections import defaultdict, namedtuple
 import itertools
 import re
 from icu import Transliterator
 import datrie
 from nominatim.config import flatten_config_list
 from nominatim.errors import UsageError
 from nominatim.tokenizer.token_analysis.config_variants import get_variant_config
 from nominatim.tokenizer.token_analysis.generic_mutation import MutationVariantGenerator
 ### Configuration section
 # Record for one expanded variant rule: the text to match (`source`)
 # and the text to put in its place (`replacement`).
 ICUVariant = namedtuple('ICUVariant', ['source', 'replacement'])
 def configure(rules, normalization_rules):
     """ Extract and preprocess the configuration for this module.
         Returns a dictionary with the keys 'replacements' and 'chars'
         (computed by get_variant_config() from the 'variants' section),
         'variant_only' (True when 'mode' is set to 'variant-only') and
         'mutations' (a list of (pattern, replacements) tuples).
         Raises UsageError when a mutation rule misses the 'pattern' or
         'replacements' field or when a field has the wrong type.
     """
     config = {}
     config['replacements'], config['chars'] = get_variant_config(rules.get('variants'),
                                                                  normalization_rules)
     config['variant_only'] = rules.get('mode', '') == 'variant-only'
     # parse mutation rules
     config['mutations'] = []
     # 'or []' also covers a 'mutations' key that is present but has no
     # value (None), which would otherwise fail when iterating.
     for rule in (rules.get('mutations') or []):
         if 'pattern' not in rule:
             raise UsageError("Missing field 'pattern' in mutation configuration.")
         if not isinstance(rule['pattern'], str):
             raise UsageError("Field 'pattern' in mutation configuration "
                              "must be a simple text field.")
         if 'replacements' not in rule:
             raise UsageError("Missing field 'replacements' in mutation configuration.")
         if not isinstance(rule['replacements'], list):
             raise UsageError("Field 'replacements' in mutation configuration "
                              "must be a list of texts.")
         config['mutations'].append((rule['pattern'], rule['replacements']))
     return config
 def _get_variant_config(rules, normalization_rules):
    """ Convert the variant definition from the configuration into
        replacement sets.
    """
    immediate = defaultdict(list)
    chars = set()
    if rules:
        vset = set()
        rules = flatten_config_list(rules, 'variants')
        vmaker = _VariantMaker(normalization_rules)
        for section in rules:
            for rule in (section.get('words') or []):
                vset.update(vmaker.compute(rule))
        # Intermediate reorder by source. Also compute required character set.
        for variant in vset:
            if variant.source[-1] == ' ' and variant.replacement[-1] == ' ':
                replstr = variant.replacement[:-1]
            else:
                replstr = variant.replacement
            immediate[variant.source].append(replstr)
            chars.update(variant.source)
    return list(immediate.items()), ''.join(chars)
 class _VariantMaker:
    """ Generater for all necessary ICUVariants from a single variant rule.
        All text in rules is normalized to make sure the variants match later.
    """
    def __init__(self, norm_rules):
        self.norm = Transliterator.createFromRules("rule_loader_normalization",
                                                   norm_rules)
    def compute(self, rule):
        """ Generator for all ICUVariant tuples from a single variant rule.
        """
        parts = re.split(r'(\|)?([=-])>', rule)
        if len(parts) != 4:
            raise UsageError("Syntax error in variant rule: " + rule)
        decompose = parts[1] is None
        src_terms = [self._parse_variant_word(t) for t in parts[0].split(',')]
        repl_terms = (self.norm.transliterate(t).strip() for t in parts[3].split(','))
        # If the source should be kept, add a 1:1 replacement
        if parts[2] == '-':
            for src in src_terms:
                if src:
                    for froms, tos in _create_variants(*src, src[0], decompose):
                        yield ICUVariant(froms, tos)
        for src, repl in itertools.product(src_terms, repl_terms):
            if src and repl:
                for froms, tos in _create_variants(*src, repl, decompose):
                    yield ICUVariant(froms, tos)
    def _parse_variant_word(self, name):
        name = name.strip()
        match = re.fullmatch(r'([~^]?)([^~$^]*)([~$]?)', name)
        if match is None or (match.group(1) == '~' and match.group(3) == '~'):
            raise UsageError("Invalid variant word descriptor '{}'".format(name))
        norm_name = self.norm.transliterate(match.group(2)).strip()
        if not norm_name:
            return None
        return norm_name, match.group(1), match.group(3)
 _FLAG_MATCH = {'^': '^ ',
               '$': ' ^',
               '': ' '}
 def _create_variants(src, preflag, postflag, repl, decompose):
    if preflag == '~':
        postfix = _FLAG_MATCH[postflag]
        # suffix decomposition
        src = src + postfix
        repl = repl + postfix
        yield src, repl
        yield ' ' + src, ' ' + repl
        if decompose:
            yield src, ' ' + repl
            yield ' ' + src, repl
    elif postflag == '~':
        # prefix decomposition
        prefix = _FLAG_MATCH[preflag]
        src = prefix + src
        repl = prefix + repl
        yield src, repl
        yield src + ' ', repl + ' '
        if decompose:
            yield src, repl + ' '
            yield src + ' ', repl
    else:
        prefix = _FLAG_MATCH[preflag]
        postfix = _FLAG_MATCH[postflag]
        yield prefix + src + postfix, prefix + repl + postfix
 ### Analysis section
 def create(transliterator, config):
@@ -171,19 +70,43 @@ class GenericTokenAnalysis:
        else:
            self.replacements = None
        # set up mutation rules
        self.mutations = [MutationVariantGenerator(*cfg) for cfg in config['mutations']]
    def get_variants_ascii(self, norm_name):
        """ Compute the spelling variants for the given normalized name
            and transliterate the result.
            The word variants are passed through every configured mutation
            in order; empty strings are dropped from the returned list.
        """
        names = self._generate_word_variants(norm_name)
        for generator in self.mutations:
            names = generator.generate(names)
        return list(filter(None, self._transliterate_unique_list(norm_name, names)))
    def _transliterate_unique_list(self, norm_name, iterable):
        seen = set()
        if self.variant_only:
            seen.add(norm_name)
        for variant in map(str.strip, iterable):
            if variant not in seen:
                seen.add(variant)
                yield self.to_ascii.transliterate(variant).strip()
    def _generate_word_variants(self, norm_name):
        baseform = '^ ' + norm_name + ' ^'
        baselen = len(baseform)
        partials = ['']
        startpos = 0
        if self.replacements is not None:
            pos = 0
            force_space = False
-            while pos < len(baseform):
+            while pos < baselen:
                full, repl = self.replacements.longest_prefix_item(baseform[pos:],
                                                                   (None, None))
                if full is not None:
@@ -207,24 +130,9 @@ class GenericTokenAnalysis:
        # No variants detected? Fast return.
        if startpos == 0:
-            if self.variant_only:
+            return (norm_name, )
                return []
-            trans_name = self.to_ascii.transliterate(norm_name).strip()
+        if startpos < baselen:
-            return [trans_name] if trans_name else []
+            return (part[1:] + baseform[startpos:-1] for part in partials)
-        return self._compute_result_set(partials, baseform[startpos:],
+        return (part[1:-1] for part in partials)
                                        norm_name if self.variant_only else '')
    def _compute_result_set(self, partials, prefix, exclude):
        results = set()
        for variant in partials:
            vname = (variant + prefix)[1:-1].strip()
            if vname != exclude:
                trans_name = self.to_ascii.transliterate(vname).strip()
                if trans_name:
                    results.add(trans_name)
        return list(results)
--- a/nominatim/tokenizer/token_analysis/generic_mutation.py
+++ b/nominatim/tokenizer/token_analysis/generic_mutation.py
@@ -0,0 +1,56 @@
 # SPDX-License-Identifier: GPL-2.0-only
 #
 # This file is part of Nominatim. (https://nominatim.org)
 #
 # Copyright (C) 2022 by the Nominatim developer community.
 # For a full list of authors see the git log.
 """
 Creator for mutation variants for the generic token analysis.
 """
 import itertools
 import logging
 import re
 from nominatim.errors import UsageError
 # No name is passed to getLogger(), so this is the root logger.
 LOG = logging.getLogger()
 def _zigzag(outer, inner):
     """ Interleave the items of 'outer' and 'inner', starting with 'outer'.
         The shorter input is padded with empty strings.
     """
     return itertools.chain.from_iterable(itertools.zip_longest(outer, inner, fillvalue=''))
 class MutationVariantGenerator:
     """ Generates name variants by applying a regular expression to the name
         and replacing it with one or more variants. When the regular expression
         matches more than once, each occurrence is replaced with all replacement
         patterns.
     """
     def __init__(self, pattern, replacements):
         """ 'pattern' is the regular expression to search for and
             'replacements' the list of strings to put in its place.
             Raises UsageError when the pattern is not a valid regular
             expression or contains a capturing group.
         """
         # Report a broken expression as a configuration error, the same
         # way as the capturing-group check below, not as a raw re.error.
         try:
             self.pattern = re.compile(pattern)
         except re.error as err:
             LOG.fatal("The mutation pattern %s is not a valid regular "
                       "expression: %s", pattern, err)
             raise UsageError("Bad mutation pattern in configuration.") from err
         self.replacements = replacements
         # Pattern.split() would return the text of capturing groups as
         # extra parts, so they cannot be allowed.
         if self.pattern.groups > 0:
             LOG.fatal("The mutation pattern %s contains a capturing group. "
                       "This is not allowed.", pattern)
             raise UsageError("Bad mutation pattern in configuration.")
     def generate(self, names):
         """ Generator function for the name variants. 'names' is an iterable
             over a set of names for which the variants are to be generated.
             A name without a match is passed on unchanged. A name with
             matches is emitted once for every combination of replacements.
         """
         for name in names:
             parts = self.pattern.split(name)
             if len(parts) == 1:
                 yield name
             else:
                 for seps in self._fillers(len(parts)):
                     yield ''.join(_zigzag(parts, seps))
     def _fillers(self, num_parts):
         """ Returns a generator for strings to join the given number of string
             parts in all possible combinations.
         """
         return itertools.product(self.replacements, repeat=num_parts - 1)
--- a/settings/icu_tokenizer.yaml
+++ b/settings/icu_tokenizer.yaml
@@ -59,6 +59,13 @@ token-analysis:
      mode: variant-only
      variants:
          - !include icu-rules/variants-de.yaml
      mutations:
          - pattern: ä
            replacements: ["ä", "ae"]
          - pattern: ö
            replacements: ["ö", "oe"]
          - pattern: ü
            replacements: ["ü", "ue"]
    - id: el
      analyzer: generic
      mode: variant-only
--- a/test/bdd/db/import/naming.feature
+++ b/test/bdd/db/import/naming.feature
@@ -58,3 +58,48 @@ Feature: Import and search of names
        | រាជធានីភ្នំពេញ |
        | 東京都 |
        | ပုဗ္ဗသီရိ |
    Scenario: German umlauts can be found when expanded
        Given the places
            | osm | class | type | name+name:de |
            | N1  | place | city | Münster      |
            | N2  | place | city | Köln         |
            | N3  | place | city | Gräfenroda   |
        When importing
        When sending search query "münster"
        Then results contain
            | osm |
            | N1  |
        When sending search query "muenster"
        Then results contain
            | osm |
            | N1  |
        When sending search query "munster"
        Then results contain
            | osm |
            | N1  |
        When sending search query "Köln"
        Then results contain
            | osm |
            | N2  |
        When sending search query "Koeln"
        Then results contain
            | osm |
            | N2  |
        When sending search query "Koln"
        Then results contain
            | osm |
            | N2  |
        When sending search query "gräfenroda"
        Then results contain
            | osm |
            | N3  |
        When sending search query "graefenroda"
        Then results contain
            | osm |
            | N3  |
        When sending search query "grafenroda"
        Then results contain
            | osm |
            | N3  |
--- a/test/python/tokenizer/token_analysis/test_generic_mutation.py
+++ b/test/python/tokenizer/token_analysis/test_generic_mutation.py
@@ -0,0 +1,89 @@
 # SPDX-License-Identifier: GPL-2.0-only
 #
 # This file is part of Nominatim. (https://nominatim.org)
 #
 # Copyright (C) 2022 by the Nominatim developer community.
 # For a full list of authors see the git log.
 """
 Tests for generic token analysis, mutation part.
 """
 import pytest
 from icu import Transliterator
 import nominatim.tokenizer.token_analysis.generic as module
 from nominatim.errors import UsageError
 DEFAULT_NORMALIZATION = """ '🜳' > ' ';
                            [[:Nonspacing Mark:] [:Cf:]] >;
                            :: lower ();
                            [[:Punctuation:][:Space:]]+ > ' '
                        """
 DEFAULT_TRANSLITERATION = """ ::  Latin ();
                              '🜵' > ' ';
                          """
 class TestMutationNoVariants:
     """ Mutation handling of the generic analyser with no variant rules
         configured.
     """
     def make_analyser(self, *mutations):
         """ Create the analyser under test from (pattern, replacements) pairs.
         """
         mutation_rules = [{'pattern': spec[0], 'replacements': spec[1]}
                           for spec in mutations]
         rules = {'analyzer': 'generic', 'mutations': mutation_rules}
         config = module.configure(rules, DEFAULT_NORMALIZATION)
         trans = Transliterator.createFromRules("test_trans", DEFAULT_TRANSLITERATION)
         self.analysis = module.create(trans, config)
     def variants(self, name):
         """ Normalize 'name' and return the analyser's variants as a set.
         """
         norm = Transliterator.createFromRules("test_norm", DEFAULT_NORMALIZATION)
         normalized = norm.transliterate(name).strip()
         return set(self.analysis.get_variants_ascii(normalized))
     @pytest.mark.parametrize('pattern', ('(capture)', ['a list']))
     def test_bad_pattern(self, pattern):
         """ Capturing groups and non-string patterns are rejected. """
         with pytest.raises(UsageError):
             self.make_analyser((pattern, ['b']))
     @pytest.mark.parametrize('replacements', (None, 'a string'))
     def test_bad_replacement(self, replacements):
         """ Replacements that are not a list are rejected. """
         with pytest.raises(UsageError):
             self.make_analyser(('a', replacements))
     def test_simple_replacement(self):
         """ Every occurrence of a single character is replaced. """
         self.make_analyser(('a', ['b']))
         assert self.variants('none') == {'none'}
         assert self.variants('abba') == {'bbbb'}
         assert self.variants('2 aar') == {'2 bbr'}
     def test_multichar_replacement(self):
         """ Patterns and replacements may span several characters. """
         self.make_analyser(('1 1', ['1 1 1']))
         assert self.variants('1 1456') == {'1 1 1456'}
         assert self.variants('1 1 1') == {'1 1 1 1'}
     def test_removement_replacement(self):
         """ An empty replacement removes the matched text. """
         self.make_analyser((' ', [' ', '']))
         assert self.variants('A 345') == {'a 345', 'a345'}
         assert self.variants('a g b') == {'a g b', 'ag b', 'a gb', 'agb'}
     def test_regex_pattern(self):
         """ The pattern is interpreted as a regular expression. """
         self.make_analyser(('[^a-z]+', ['XXX', ' ']))
         assert self.variants('a-34n12') == {'aXXXnXXX', 'aXXXn', 'a nXXX', 'a n'}
     def test_multiple_mutations(self):
         """ Several mutations are combined with each other. """
         self.make_analyser(('ä', ['ä', 'ae']), ('ö', ['ö', 'oe']))
         expected = {'längenöhr', 'laengenöhr', 'längenoehr', 'laengenoehr'}
         assert self.variants('Längenöhr') == expected