mirror of
https://github.com/osm-search/Nominatim.git
synced 2026-02-26 11:08:13 +00:00
move variant configuration reading into a separate file
This commit is contained in:
134
nominatim/tokenizer/token_analysis/config_variants.py
Normal file
134
nominatim/tokenizer/token_analysis/config_variants.py
Normal file
@@ -0,0 +1,134 @@
|
|||||||
|
# SPDX-License-Identifier: GPL-2.0-only
|
||||||
|
#
|
||||||
|
# This file is part of Nominatim. (https://nominatim.org)
|
||||||
|
#
|
||||||
|
# Copyright (C) 2022 by the Nominatim developer community.
|
||||||
|
# For a full list of authors see the git log.
|
||||||
|
"""
|
||||||
|
Parser for configuration for variants.
|
||||||
|
"""
|
||||||
|
from collections import defaultdict, namedtuple
|
||||||
|
import itertools
|
||||||
|
import re
|
||||||
|
|
||||||
|
from icu import Transliterator
|
||||||
|
|
||||||
|
from nominatim.config import flatten_config_list
|
||||||
|
from nominatim.errors import UsageError
|
||||||
|
|
||||||
|
# A single variant rule: occurrences of 'source' are replaced by 'replacement'.
ICUVariant = namedtuple('ICUVariant', ('source', 'replacement'))
|
def get_variant_config(rules, normalization_rules):
    """ Convert the variant definition from the configuration into
        replacement sets.

        Returns a tuple containing the replacement set and the list of
        characters used in the replacements.
    """
    replacements = defaultdict(list)
    used_chars = set()

    if rules:
        maker = _VariantMaker(normalization_rules)

        variants = set()
        for section in flatten_config_list(rules, 'variants'):
            for rule in (section.get('words') or []):
                variants.update(maker.compute(rule))

        # Group replacements by source term. Also collect every character
        # appearing in a source term.
        for variant in variants:
            replacement = variant.replacement
            if variant.source[-1] == ' ' and replacement[-1] == ' ':
                replacement = replacement[:-1]
            replacements[variant.source].append(replacement)
            used_chars.update(variant.source)

    return list(replacements.items()), ''.join(used_chars)
|
||||||
|
class _VariantMaker:
    """ Generator for all necessary ICUVariants from a single variant rule.

        All text in rules is normalized to make sure the variants match later.
    """

    def __init__(self, norm_rules):
        # Normalize rule text with the same transliteration rules the
        # tokenizer uses, so variants compare equal to normalized names.
        self.norm = Transliterator.createFromRules("rule_loader_normalization",
                                                   norm_rules)


    def compute(self, rule):
        """ Generator for all ICUVariant tuples from a single variant rule.
        """
        parts = re.split(r'(\|)?([=-])>', rule)
        if len(parts) != 4:
            raise UsageError("Syntax error in variant rule: " + rule)

        decompose = parts[1] is None
        src_terms = [self._parse_variant_word(term) for term in parts[0].split(',')]
        repl_terms = (self.norm.transliterate(term).strip()
                      for term in parts[3].split(','))

        # A '->' rule keeps the source term, so add a 1:1 replacement for it.
        if parts[2] == '-':
            for src in src_terms:
                if src:
                    for from_str, to_str in _create_variants(*src, src[0], decompose):
                        yield ICUVariant(from_str, to_str)

        for src, repl in itertools.product(src_terms, repl_terms):
            if src and repl:
                for from_str, to_str in _create_variants(*src, repl, decompose):
                    yield ICUVariant(from_str, to_str)


    def _parse_variant_word(self, name):
        # Split one source term into (normalized word, preflag, postflag);
        # returns None when the word normalizes to the empty string.
        name = name.strip()
        match = re.fullmatch(r'([~^]?)([^~$^]*)([~$]?)', name)
        if match is None or (match.group(1) == '~' and match.group(3) == '~'):
            raise UsageError("Invalid variant word descriptor '{}'".format(name))
        norm_name = self.norm.transliterate(match.group(2)).strip()
        if not norm_name:
            return None

        return norm_name, match.group(1), match.group(3)
|
||||||
|
_FLAG_MATCH = {'^': '^ ',
|
||||||
|
'$': ' ^',
|
||||||
|
'': ' '}
|
||||||
|
|
||||||
|
|
||||||
|
def _create_variants(src, preflag, postflag, repl, decompose):
|
||||||
|
if preflag == '~':
|
||||||
|
postfix = _FLAG_MATCH[postflag]
|
||||||
|
# suffix decomposition
|
||||||
|
src = src + postfix
|
||||||
|
repl = repl + postfix
|
||||||
|
|
||||||
|
yield src, repl
|
||||||
|
yield ' ' + src, ' ' + repl
|
||||||
|
|
||||||
|
if decompose:
|
||||||
|
yield src, ' ' + repl
|
||||||
|
yield ' ' + src, repl
|
||||||
|
elif postflag == '~':
|
||||||
|
# prefix decomposition
|
||||||
|
prefix = _FLAG_MATCH[preflag]
|
||||||
|
src = prefix + src
|
||||||
|
repl = prefix + repl
|
||||||
|
|
||||||
|
yield src, repl
|
||||||
|
yield src + ' ', repl + ' '
|
||||||
|
|
||||||
|
if decompose:
|
||||||
|
yield src, repl + ' '
|
||||||
|
yield src + ' ', repl
|
||||||
|
else:
|
||||||
|
prefix = _FLAG_MATCH[preflag]
|
||||||
|
postfix = _FLAG_MATCH[postflag]
|
||||||
|
|
||||||
|
yield prefix + src + postfix, prefix + repl + postfix
|
||||||
@@ -7,145 +7,26 @@
|
|||||||
"""
|
"""
|
||||||
Generic processor for names that creates abbreviation variants.
|
Generic processor for names that creates abbreviation variants.
|
||||||
"""
|
"""
|
||||||
from collections import defaultdict, namedtuple
|
|
||||||
import itertools
|
import itertools
|
||||||
import re
|
|
||||||
|
|
||||||
from icu import Transliterator
|
|
||||||
import datrie
|
import datrie
|
||||||
|
|
||||||
from nominatim.config import flatten_config_list
|
from nominatim.tokenizer.token_analysis.config_variants import get_variant_config
|
||||||
from nominatim.errors import UsageError
|
|
||||||
|
|
||||||
### Configuration section
|
### Configuration section
|
||||||
|
|
||||||
ICUVariant = namedtuple('ICUVariant', ['source', 'replacement'])
|
|
||||||
|
|
||||||
def configure(rules, normalization_rules):
    """ Extract and preprocess the configuration for this module.
    """
    replacements, chars = get_variant_config(rules.get('variants'),
                                             normalization_rules)

    return {'replacements': replacements,
            'chars': chars,
            'variant_only': rules.get('mode', '') == 'variant-only'}
|
|
||||||
def _get_variant_config(rules, normalization_rules):
|
|
||||||
""" Convert the variant definition from the configuration into
|
|
||||||
replacement sets.
|
|
||||||
"""
|
|
||||||
immediate = defaultdict(list)
|
|
||||||
chars = set()
|
|
||||||
|
|
||||||
if rules:
|
|
||||||
vset = set()
|
|
||||||
rules = flatten_config_list(rules, 'variants')
|
|
||||||
|
|
||||||
vmaker = _VariantMaker(normalization_rules)
|
|
||||||
|
|
||||||
for section in rules:
|
|
||||||
for rule in (section.get('words') or []):
|
|
||||||
vset.update(vmaker.compute(rule))
|
|
||||||
|
|
||||||
# Intermediate reorder by source. Also compute required character set.
|
|
||||||
for variant in vset:
|
|
||||||
if variant.source[-1] == ' ' and variant.replacement[-1] == ' ':
|
|
||||||
replstr = variant.replacement[:-1]
|
|
||||||
else:
|
|
||||||
replstr = variant.replacement
|
|
||||||
immediate[variant.source].append(replstr)
|
|
||||||
chars.update(variant.source)
|
|
||||||
|
|
||||||
return list(immediate.items()), ''.join(chars)
|
|
||||||
|
|
||||||
|
|
||||||
class _VariantMaker:
    """ Generator for all necessary ICUVariants from a single variant rule.

        All text in rules is normalized to make sure the variants match later.
    """

    def __init__(self, norm_rules):
        # Same normalization as the tokenizer, so rule text compares equal
        # to normalized names.
        self.norm = Transliterator.createFromRules("rule_loader_normalization",
                                                   norm_rules)


    def compute(self, rule):
        """ Generator for all ICUVariant tuples from a single variant rule.
        """
        parts = re.split(r'(\|)?([=-])>', rule)
        if len(parts) != 4:
            raise UsageError("Syntax error in variant rule: " + rule)

        decompose = parts[1] is None
        src_terms = [self._parse_variant_word(word) for word in parts[0].split(',')]
        repl_terms = (self.norm.transliterate(word).strip()
                      for word in parts[3].split(','))

        if parts[2] == '-':
            # '->' keeps the source term, so add a 1:1 replacement for it.
            for src in src_terms:
                if not src:
                    continue
                yield from itertools.starmap(ICUVariant,
                                             _create_variants(*src, src[0], decompose))

        for src, repl in itertools.product(src_terms, repl_terms):
            if src and repl:
                yield from itertools.starmap(ICUVariant,
                                             _create_variants(*src, repl, decompose))


    def _parse_variant_word(self, name):
        # Split a term into (normalized word, preflag, postflag);
        # None for words that normalize to the empty string.
        name = name.strip()
        match = re.fullmatch(r'([~^]?)([^~$^]*)([~$]?)', name)
        if match is None or (match.group(1) == '~' and match.group(3) == '~'):
            raise UsageError("Invalid variant word descriptor '{}'".format(name))
        norm_name = self.norm.transliterate(match.group(2)).strip()

        return (norm_name, match.group(1), match.group(3)) if norm_name else None
|
|
||||||
_FLAG_MATCH = {'^': '^ ',
|
|
||||||
'$': ' ^',
|
|
||||||
'': ' '}
|
|
||||||
|
|
||||||
|
|
||||||
def _create_variants(src, preflag, postflag, repl, decompose):
|
|
||||||
if preflag == '~':
|
|
||||||
postfix = _FLAG_MATCH[postflag]
|
|
||||||
# suffix decomposition
|
|
||||||
src = src + postfix
|
|
||||||
repl = repl + postfix
|
|
||||||
|
|
||||||
yield src, repl
|
|
||||||
yield ' ' + src, ' ' + repl
|
|
||||||
|
|
||||||
if decompose:
|
|
||||||
yield src, ' ' + repl
|
|
||||||
yield ' ' + src, repl
|
|
||||||
elif postflag == '~':
|
|
||||||
# prefix decomposition
|
|
||||||
prefix = _FLAG_MATCH[preflag]
|
|
||||||
src = prefix + src
|
|
||||||
repl = prefix + repl
|
|
||||||
|
|
||||||
yield src, repl
|
|
||||||
yield src + ' ', repl + ' '
|
|
||||||
|
|
||||||
if decompose:
|
|
||||||
yield src, repl + ' '
|
|
||||||
yield src + ' ', repl
|
|
||||||
else:
|
|
||||||
prefix = _FLAG_MATCH[preflag]
|
|
||||||
postfix = _FLAG_MATCH[postflag]
|
|
||||||
|
|
||||||
yield prefix + src + postfix, prefix + repl + postfix
|
|
||||||
|
|
||||||
|
|
||||||
### Analysis section
|
### Analysis section
|
||||||
|
|
||||||
def create(transliterator, config):
|
def create(transliterator, config):
|
||||||
|
|||||||
Reference in New Issue
Block a user