diff --git a/docs/customize/Tokenizers.md b/docs/customize/Tokenizers.md
index 4d5fbb15..5c766f50 100644
--- a/docs/customize/Tokenizers.md
+++ b/docs/customize/Tokenizers.md
@@ -99,6 +99,9 @@ token-analysis:
           - words:
               - road -> rd
               - bridge -> bdge,br,brdg,bri,brg
+      mutations:
+          - pattern: 'ä'
+            replacements: ['ä', 'ae']
 ```
 
 The configuration file contains four sections:
@@ -205,12 +208,11 @@ the `analyzer` parameter must be set. Currently there is only one implementation
 ##### Generic token analyzer
 
 The generic analyzer is able to create variants from a list of given
-abbreviation and decomposition replacements. It takes one optional parameter
-`variants` which lists the replacements to apply. If the section is
-omitted, then the generic analyzer becomes a simple analyzer that only
-applies the transliteration.
+abbreviation and decomposition replacements and to introduce spelling variations.
 
-The variants section defines lists of replacements which create alternative
+###### Variants
+
+The optional `variants` section defines lists of replacements which create alternative
 spellings of a name. To create the variants, a name is scanned from left to
 right and the longest matching replacement is applied until the end of the
 string is reached.
@@ -296,6 +298,58 @@ decomposition has an effect here on the source as well. So a rule
 means that for a word like `hauptstrasse` four variants are created:
 `hauptstrasse`, `haupt strasse`, `hauptstr` and `haupt str`.
 
+###### Mutations
+
+The `mutations` section in the configuration describes an additional set of
+replacements to be applied after the variants have been computed.
+
+Each mutation is described by two parameters: `pattern` and `replacements`.
+The pattern must contain a single regular expression to search for in the
+variant name. The regular expressions need to follow the syntax for
+[Python regular expressions](https://docs.python.org/3/library/re.html#regular-expression-syntax).
+Capturing groups are not permitted.
+`replacements` must contain a list of strings that the pattern
+should be replaced with. Each occurrence of the pattern is replaced with
+all given replacements. Be mindful of the combinatorial explosion of variants.
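+
+For example, the following mutation (the same as in the example configuration
+above) makes each 'ä' searchable both with the original umlaut and with the
+common transcription 'ae':
+
+```
+      mutations:
+          - pattern: 'ä'
+            replacements: ['ä', 'ae']
+```
+
+A name like `bärenweg` thus yields the variants `bärenweg` and `baerenweg`.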
+
+###### Modes
+
+The generic analyzer supports a special mode `variant-only`. When this mode
+is enabled, it consumes the input name and emits only the variants (if any
+exist). Enable the mode by adding:
+
+```
+  mode: variant-only
+```
+
+to the analyzer configuration.
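+
+The standard configuration (`settings/icu_tokenizer.yaml`) uses this mode for
+its language-specific analyzers, for example for German (shown abridged):
+
+```
+    - id: de
+      analyzer: generic
+      mode: variant-only
+      variants:
+          - !include icu-rules/variants-de.yaml
+      mutations:
+          - pattern: ä
+            replacements: ["ä", "ae"]
+```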
""" -from collections import defaultdict, namedtuple import itertools -import re -from icu import Transliterator import datrie -from nominatim.config import flatten_config_list from nominatim.errors import UsageError +from nominatim.tokenizer.token_analysis.config_variants import get_variant_config +from nominatim.tokenizer.token_analysis.generic_mutation import MutationVariantGenerator ### Configuration section -ICUVariant = namedtuple('ICUVariant', ['source', 'replacement']) - def configure(rules, normalization_rules): """ Extract and preprocess the configuration for this module. """ config = {} - config['replacements'], config['chars'] = _get_variant_config(rules.get('variants'), - normalization_rules) + config['replacements'], config['chars'] = get_variant_config(rules.get('variants'), + normalization_rules) config['variant_only'] = rules.get('mode', '') == 'variant-only' + # parse mutation rules + config['mutations'] = [] + for rule in rules.get('mutations', []): + if 'pattern' not in rule: + raise UsageError("Missing field 'pattern' in mutation configuration.") + if not isinstance(rule['pattern'], str): + raise UsageError("Field 'pattern' in mutation configuration " + "must be a simple text field.") + if 'replacements' not in rule: + raise UsageError("Missing field 'replacements' in mutation configuration.") + if not isinstance(rule['replacements'], list): + raise UsageError("Field 'replacements' in mutation configuration " + "must be a list of texts.") + + config['mutations'].append((rule['pattern'], rule['replacements'])) + return config -def _get_variant_config(rules, normalization_rules): - """ Convert the variant definition from the configuration into - replacement sets. - """ - immediate = defaultdict(list) - chars = set() - - if rules: - vset = set() - rules = flatten_config_list(rules, 'variants') - - vmaker = _VariantMaker(normalization_rules) - - for section in rules: - for rule in (section.get('words') or []): - vset.update(vmaker.compute(rule)) - - # Intermediate reorder by source. Also compute required character set. - for variant in vset: - if variant.source[-1] == ' ' and variant.replacement[-1] == ' ': - replstr = variant.replacement[:-1] - else: - replstr = variant.replacement - immediate[variant.source].append(replstr) - chars.update(variant.source) - - return list(immediate.items()), ''.join(chars) - - -class _VariantMaker: - """ Generater for all necessary ICUVariants from a single variant rule. - - All text in rules is normalized to make sure the variants match later. - """ - - def __init__(self, norm_rules): - self.norm = Transliterator.createFromRules("rule_loader_normalization", - norm_rules) - - - def compute(self, rule): - """ Generator for all ICUVariant tuples from a single variant rule. 
- """ - parts = re.split(r'(\|)?([=-])>', rule) - if len(parts) != 4: - raise UsageError("Syntax error in variant rule: " + rule) - - decompose = parts[1] is None - src_terms = [self._parse_variant_word(t) for t in parts[0].split(',')] - repl_terms = (self.norm.transliterate(t).strip() for t in parts[3].split(',')) - - # If the source should be kept, add a 1:1 replacement - if parts[2] == '-': - for src in src_terms: - if src: - for froms, tos in _create_variants(*src, src[0], decompose): - yield ICUVariant(froms, tos) - - for src, repl in itertools.product(src_terms, repl_terms): - if src and repl: - for froms, tos in _create_variants(*src, repl, decompose): - yield ICUVariant(froms, tos) - - - def _parse_variant_word(self, name): - name = name.strip() - match = re.fullmatch(r'([~^]?)([^~$^]*)([~$]?)', name) - if match is None or (match.group(1) == '~' and match.group(3) == '~'): - raise UsageError("Invalid variant word descriptor '{}'".format(name)) - norm_name = self.norm.transliterate(match.group(2)).strip() - if not norm_name: - return None - - return norm_name, match.group(1), match.group(3) - - -_FLAG_MATCH = {'^': '^ ', - '$': ' ^', - '': ' '} - - -def _create_variants(src, preflag, postflag, repl, decompose): - if preflag == '~': - postfix = _FLAG_MATCH[postflag] - # suffix decomposition - src = src + postfix - repl = repl + postfix - - yield src, repl - yield ' ' + src, ' ' + repl - - if decompose: - yield src, ' ' + repl - yield ' ' + src, repl - elif postflag == '~': - # prefix decomposition - prefix = _FLAG_MATCH[preflag] - src = prefix + src - repl = prefix + repl - - yield src, repl - yield src + ' ', repl + ' ' - - if decompose: - yield src, repl + ' ' - yield src + ' ', repl - else: - prefix = _FLAG_MATCH[preflag] - postfix = _FLAG_MATCH[postflag] - - yield prefix + src + postfix, prefix + repl + postfix - - ### Analysis section def create(transliterator, config): @@ -171,19 +70,43 @@ class GenericTokenAnalysis: else: self.replacements = None + # set up mutation rules + self.mutations = [MutationVariantGenerator(*cfg) for cfg in config['mutations']] + def get_variants_ascii(self, norm_name): """ Compute the spelling variants for the given normalized name and transliterate the result. """ + variants = self._generate_word_variants(norm_name) + + for mutation in self.mutations: + variants = mutation.generate(variants) + + return [name for name in self._transliterate_unique_list(norm_name, variants) if name] + + + def _transliterate_unique_list(self, norm_name, iterable): + seen = set() + if self.variant_only: + seen.add(norm_name) + + for variant in map(str.strip, iterable): + if variant not in seen: + seen.add(variant) + yield self.to_ascii.transliterate(variant).strip() + + + def _generate_word_variants(self, norm_name): baseform = '^ ' + norm_name + ' ^' + baselen = len(baseform) partials = [''] startpos = 0 if self.replacements is not None: pos = 0 force_space = False - while pos < len(baseform): + while pos < baselen: full, repl = self.replacements.longest_prefix_item(baseform[pos:], (None, None)) if full is not None: @@ -207,24 +130,9 @@ class GenericTokenAnalysis: # No variants detected? Fast return. 
+        if self.variant_only:
+            seen.add(norm_name)
+
+        for variant in map(str.strip, iterable):
+            if variant not in seen:
+                seen.add(variant)
+                yield self.to_ascii.transliterate(variant).strip()
+
+
+    def _generate_word_variants(self, norm_name):
         baseform = '^ ' + norm_name + ' ^'
+        baselen = len(baseform)
         partials = ['']
 
         startpos = 0
         if self.replacements is not None:
             pos = 0
             force_space = False
-            while pos < len(baseform):
+            while pos < baselen:
                 full, repl = self.replacements.longest_prefix_item(baseform[pos:],
                                                                    (None, None))
                 if full is not None:
@@ -207,24 +132,9 @@
 
         # No variants detected? Fast return.
         if startpos == 0:
-            if self.variant_only:
-                return []
+            return (norm_name, )
 
-            trans_name = self.to_ascii.transliterate(norm_name).strip()
-            return [trans_name] if trans_name else []
+        if startpos < baselen:
+            return (part[1:] + baseform[startpos:-1] for part in partials)
 
-        return self._compute_result_set(partials, baseform[startpos:],
-                                        norm_name if self.variant_only else '')
-
-
-    def _compute_result_set(self, partials, prefix, exclude):
-        results = set()
-
-        for variant in partials:
-            vname = (variant + prefix)[1:-1].strip()
-            if vname != exclude:
-                trans_name = self.to_ascii.transliterate(vname).strip()
-                if trans_name:
-                    results.add(trans_name)
-
-        return list(results)
+        return (part[1:-1] for part in partials)
diff --git a/nominatim/tokenizer/token_analysis/generic_mutation.py b/nominatim/tokenizer/token_analysis/generic_mutation.py
new file mode 100644
index 00000000..d23d5cd4
--- /dev/null
+++ b/nominatim/tokenizer/token_analysis/generic_mutation.py
@@ -0,0 +1,58 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2022 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Creator for mutation variants for the generic token analysis.
+"""
+import itertools
+import logging
+import re
+
+from nominatim.errors import UsageError
+
+LOG = logging.getLogger()
+
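+# Illustration: _zigzag(['a', 'b', 'c'], ['-', '+']) yields the sequence
+# 'a', '-', 'b', '+', 'c', '' which joins to 'a-b+c'.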
+ """ + return itertools.product(self.replacements, repeat=num_parts - 1) diff --git a/settings/icu_tokenizer.yaml b/settings/icu_tokenizer.yaml index a3c62e67..c6601faf 100644 --- a/settings/icu_tokenizer.yaml +++ b/settings/icu_tokenizer.yaml @@ -59,6 +59,13 @@ token-analysis: mode: variant-only variants: - !include icu-rules/variants-de.yaml + mutations: + - pattern: ä + replacements: ["ä", "ae"] + - pattern: ö + replacements: ["ö", "oe"] + - pattern: ü + replacements: ["ü", "ue"] - id: el analyzer: generic mode: variant-only diff --git a/test/bdd/db/import/naming.feature b/test/bdd/db/import/naming.feature index bb29d2a3..b739cbae 100644 --- a/test/bdd/db/import/naming.feature +++ b/test/bdd/db/import/naming.feature @@ -58,3 +58,48 @@ Feature: Import and search of names | រាជធានីភ្នំពេញ | | 東京都 | | ပုဗ္ဗသီရိ | + + + Scenario: German umlauts can be found when expanded + Given the places + | osm | class | type | name+name:de | + | N1 | place | city | Münster | + | N2 | place | city | Köln | + | N3 | place | city | Gräfenroda | + When importing + When sending search query "münster" + Then results contain + | osm | + | N1 | + When sending search query "muenster" + Then results contain + | osm | + | N1 | + When sending search query "munster" + Then results contain + | osm | + | N1 | + When sending search query "Köln" + Then results contain + | osm | + | N2 | + When sending search query "Koeln" + Then results contain + | osm | + | N2 | + When sending search query "Koln" + Then results contain + | osm | + | N2 | + When sending search query "gräfenroda" + Then results contain + | osm | + | N3 | + When sending search query "graefenroda" + Then results contain + | osm | + | N3 | + When sending search query "grafenroda" + Then results contain + | osm | + | N3 | diff --git a/test/python/tokenizer/token_analysis/test_generic_mutation.py b/test/python/tokenizer/token_analysis/test_generic_mutation.py new file mode 100644 index 00000000..757f0311 --- /dev/null +++ b/test/python/tokenizer/token_analysis/test_generic_mutation.py @@ -0,0 +1,89 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# This file is part of Nominatim. (https://nominatim.org) +# +# Copyright (C) 2022 by the Nominatim developer community. +# For a full list of authors see the git log. +""" +Tests for generic token analysis, mutation part. 
+""" +import pytest + +from icu import Transliterator + +import nominatim.tokenizer.token_analysis.generic as module +from nominatim.errors import UsageError + +DEFAULT_NORMALIZATION = """ '🜳' > ' '; + [[:Nonspacing Mark:] [:Cf:]] >; + :: lower (); + [[:Punctuation:][:Space:]]+ > ' ' + """ + +DEFAULT_TRANSLITERATION = """ :: Latin (); + '🜵' > ' '; + """ + +class TestMutationNoVariants: + + def make_analyser(self, *mutations): + rules = { 'analyzer': 'generic', + 'mutations': [ {'pattern': m[0], 'replacements': m[1]} + for m in mutations] + } + config = module.configure(rules, DEFAULT_NORMALIZATION) + trans = Transliterator.createFromRules("test_trans", DEFAULT_TRANSLITERATION) + + self.analysis = module.create(trans, config) + + + def variants(self, name): + norm = Transliterator.createFromRules("test_norm", DEFAULT_NORMALIZATION) + return set(self.analysis.get_variants_ascii(norm.transliterate(name).strip())) + + + @pytest.mark.parametrize('pattern', ('(capture)', ['a list'])) + def test_bad_pattern(self, pattern): + with pytest.raises(UsageError): + self.make_analyser((pattern, ['b'])) + + + @pytest.mark.parametrize('replacements', (None, 'a string')) + def test_bad_replacement(self, replacements): + with pytest.raises(UsageError): + self.make_analyser(('a', replacements)) + + + def test_simple_replacement(self): + self.make_analyser(('a', ['b'])) + + assert self.variants('none') == {'none'} + assert self.variants('abba') == {'bbbb'} + assert self.variants('2 aar') == {'2 bbr'} + + + def test_multichar_replacement(self): + self.make_analyser(('1 1', ['1 1 1'])) + + assert self.variants('1 1456') == {'1 1 1456'} + assert self.variants('1 1 1') == {'1 1 1 1'} + + + def test_removement_replacement(self): + self.make_analyser((' ', [' ', ''])) + + assert self.variants('A 345') == {'a 345', 'a345'} + assert self.variants('a g b') == {'a g b', 'ag b', 'a gb', 'agb'} + + + def test_regex_pattern(self): + self.make_analyser(('[^a-z]+', ['XXX', ' '])) + + assert self.variants('a-34n12') == {'aXXXnXXX', 'aXXXn', 'a nXXX', 'a n'} + + + def test_multiple_mutations(self): + self.make_analyser(('ä', ['ä', 'ae']), ('ö', ['ö', 'oe'])) + + assert self.variants('Längenöhr') == {'längenöhr', 'laengenöhr', + 'längenoehr', 'laengenoehr'}