Move variant configuration reading into a separate file

This commit is contained in:
Sarah Hoffmann
2022-01-12 09:53:32 +01:00
parent 630ad38a67
commit 0192a7af96
2 changed files with 137 additions and 122 deletions

View File

@@ -7,145 +7,26 @@
"""
Generic processor for names that creates abbreviation variants.
"""
from collections import defaultdict, namedtuple
import itertools
import re
from icu import Transliterator
import datrie
from nominatim.config import flatten_config_list
from nominatim.errors import UsageError
from nominatim.tokenizer.token_analysis.config_variants import get_variant_config
### Configuration section
# Immutable (source, replacement) pair; one instance per generated variant.
ICUVariant = namedtuple(typename='ICUVariant',
                        field_names=('source', 'replacement'))
def configure(rules, normalization_rules):
    """ Extract and preprocess the configuration for this module.

        The entries 'variants' and 'mode' are read from `rules`.
        `normalization_rules` is handed on unchanged to
        get_variant_config().

        Returns a dictionary with the keys 'replacements' and 'chars'
        (the two values returned by get_variant_config()) and
        'variant_only', which is True exactly when 'mode' is set to
        'variant-only'.
    """
    config = {}

    # Parse the variant rules exactly once, using the shared reader
    # imported from config_variants.
    config['replacements'], config['chars'] = get_variant_config(rules.get('variants'),
                                                                 normalization_rules)
    config['variant_only'] = rules.get('mode', '') == 'variant-only'

    return config
def _get_variant_config(rules, normalization_rules):
    """ Turn the variant rules from the configuration into replacement sets.

        Returns a tuple of two items: a list of (source, replacements)
        pairs, where replacements is the list of all replacement strings
        collected for that source, and a string made up of every
        character that appears in any source.
        When `rules` is empty or None, the result is ([], '').
    """
    by_source = defaultdict(list)
    alphabet = set()

    if not rules:
        return list(by_source.items()), ''.join(alphabet)

    sections = flatten_config_list(rules, 'variants')
    maker = _VariantMaker(normalization_rules)

    # Collect into a set first so that identical variants produced by
    # different rules are only kept once.
    variants = set()
    for section in sections:
        for rule in (section.get('words') or []):
            variants.update(maker.compute(rule))

    # Regroup by source and gather the characters used in the sources.
    for variant in variants:
        target = variant.replacement
        # When both sides end in a blank, the replacement is stored
        # without its trailing blank.
        if variant.source[-1] == ' ' and target[-1] == ' ':
            target = target[:-1]
        by_source[variant.source].append(target)
        alphabet.update(variant.source)

    return list(by_source.items()), ''.join(alphabet)
class _VariantMaker:
    """ Generator for all necessary ICUVariants from a single variant rule.

        All text in rules is normalized to make sure the variants match later.
    """

    def __init__(self, norm_rules):
        self.norm = Transliterator.createFromRules("rule_loader_normalization",
                                                   norm_rules)

    def compute(self, rule):
        """ Yield every ICUVariant tuple described by one variant rule.

            A rule consists of comma-separated source terms, an arrow
            ('=>' or '->', optionally preceded by '|') and comma-separated
            replacement terms. Raises UsageError when the rule does not
            contain exactly one arrow.
        """
        pieces = re.split(r'(\|)?([=-])>', rule)
        if len(pieces) != 4:
            raise UsageError("Syntax error in variant rule: " + rule)

        sources, bar, arrow, replacements = pieces
        # Decomposition variants are only created when no '|' precedes the arrow.
        decompose = bar is None
        src_terms = [self._parse_variant_word(word) for word in sources.split(',')]
        repl_terms = (self.norm.transliterate(word).strip()
                      for word in replacements.split(','))

        # With '->' the source is kept: emit it as its own replacement.
        if arrow == '-':
            for term in filter(None, src_terms):
                yield from (ICUVariant(froms, tos)
                            for froms, tos
                            in _create_variants(*term, term[0], decompose))

        # Combine every usable source term with every non-empty replacement.
        for term, repl in itertools.product(src_terms, repl_terms):
            if not (term and repl):
                continue
            yield from (ICUVariant(froms, tos)
                        for froms, tos
                        in _create_variants(*term, repl, decompose))

    def _parse_variant_word(self, name):
        """ Split a source term into normalized text and boundary flags.

            Returns a tuple (normalized text, leading flag, trailing flag)
            where a missing flag is the empty string, or None when nothing
            remains of the text after normalization. Raises UsageError
            when the term is malformed or carries '~' on both sides.
        """
        word = name.strip()
        parsed = re.fullmatch(r'([~^]?)([^~$^]*)([~$]?)', word)
        if parsed is None or parsed.group(1, 3) == ('~', '~'):
            raise UsageError("Invalid variant word descriptor '{}'".format(word))

        preflag, body, postflag = parsed.groups()
        normalized = self.norm.transliterate(body).strip()

        return (normalized, preflag, postflag) if normalized else None
# Text put in place of a boundary flag around a term:
# '^' (leading) becomes '^ ', '$' (trailing) becomes ' ^',
# and an absent flag becomes a single blank.
_FLAG_MATCH = {'^': '^ ',
               '$': ' ^',
               '': ' '}


def _create_variants(src, preflag, postflag, repl, decompose):
    """ Yield the (source, replacement) string pairs for a single term.

        `preflag` and `postflag` are the flag characters found before and
        after the source term ('' when absent). A '~' flag on one side
        yields the pair both with and without a blank on that side; when
        `decompose` is set, the two pairs that have the blank on only one
        of source and replacement are yielded as well. Without a '~' flag
        exactly one pair is yielded.
    """
    if preflag == '~':
        # suffix decomposition
        tail = _FLAG_MATCH[postflag]
        source, target = src + tail, repl + tail
        pairs = [(source, target), (' ' + source, ' ' + target)]
        if decompose:
            pairs += [(source, ' ' + target), (' ' + source, target)]
        yield from pairs
        return

    head = _FLAG_MATCH[preflag]

    if postflag == '~':
        # prefix decomposition
        source, target = head + src, head + repl
        pairs = [(source, target), (source + ' ', target + ' ')]
        if decompose:
            pairs += [(source, target + ' '), (source + ' ', target)]
        yield from pairs
        return

    tail = _FLAG_MATCH[postflag]
    yield head + src + tail, head + repl + tail
### Analysis section
def create(transliterator, config):