introduce mutation variants to generic token analyser

Mutations are regular-expression-based replacements that are applied
after variants have been computed. They are meant to be used for
variations on character level.

Add spelling variations for German umlauts.
This commit is contained in:
Sarah Hoffmann
2022-01-12 16:25:47 +01:00
parent 0192a7af96
commit b453b0ea95
5 changed files with 233 additions and 7 deletions

View File

@@ -11,7 +11,9 @@ import itertools
import datrie
from nominatim.errors import UsageError
from nominatim.tokenizer.token_analysis.config_variants import get_variant_config
from nominatim.tokenizer.token_analysis.generic_mutation import MutationVariantGenerator
### Configuration section
@@ -23,6 +25,7 @@ def configure(rules, normalization_rules):
config['replacements'], config['chars'] = get_variant_config(rules.get('variants'),
normalization_rules)
config['variant_only'] = rules.get('mode', '') == 'variant-only'
config['mutations'] = rules.get('mutations', [])
return config
@@ -52,19 +55,45 @@ class GenericTokenAnalysis:
else:
self.replacements = None
# set up mutation rules
self.mutations = []
for cfg in config['mutations']:
if 'pattern' not in cfg:
raise UsageError("Missing field 'pattern' in mutation configuration.")
if not isinstance(cfg['pattern'], str):
raise UsageError("Field 'pattern' in mutation configuration "
"must be a simple text field.")
if 'replacements' not in cfg:
raise UsageError("Missing field 'replacements' in mutation configuration.")
if not isinstance(cfg['replacements'], list):
raise UsageError("Field 'replacements' in mutation configuration "
"must be a list of texts.")
self.mutations.append(MutationVariantGenerator(cfg['pattern'],
cfg['replacements']))
def get_variants_ascii(self, norm_name):
""" Compute the spelling variants for the given normalized name
and transliterate the result.
"""
results = set()
for variant in self._generate_word_variants(norm_name):
if not self.variant_only or variant.strip() != norm_name:
trans_name = self.to_ascii.transliterate(variant).strip()
if trans_name:
results.add(trans_name)
variants = self._generate_word_variants(norm_name)
return list(results)
for mutation in self.mutations:
variants = mutation.generate(variants)
return [name for name in self._transliterate_unique_list(norm_name, variants) if name]
def _transliterate_unique_list(self, norm_name, iterable):
seen = set()
if self.variant_only:
seen.add(norm_name)
for variant in map(str.strip, iterable):
if variant not in seen:
seen.add(variant)
yield self.to_ascii.transliterate(variant).strip()
def _generate_word_variants(self, norm_name):

View File

@@ -0,0 +1,56 @@
# SPDX-License-Identifier: GPL-2.0-only
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2022 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Creator for mutation variants for the generic token analysis.
"""
import itertools
import logging
import re
from nominatim.errors import UsageError
LOG = logging.getLogger()
def _zigzag(outer, inner):
return itertools.chain.from_iterable(itertools.zip_longest(outer, inner, fillvalue=''))
class MutationVariantGenerator:
""" Generates name variants by applying a regular expression to the name
and replacing it with one or more variants. When the regular expression
matches more than once, each occurence is replaced with all replacement
patterns.
"""
def __init__(self, pattern, replacements):
self.pattern = re.compile(pattern)
self.replacements = replacements
if self.pattern.groups > 0:
LOG.fatal("The mutation pattern %s contains a capturing group. "
"This is not allowed.", pattern)
raise UsageError("Bad mutation pattern in configuration.")
def generate(self, names):
""" Generator function for the name variants. 'names' is an iterable
over a set of names for which the variants are to be generated.
"""
for name in names:
parts = self.pattern.split(name)
if len(parts) == 1:
yield name
else:
for seps in self._fillers(len(parts)):
yield ''.join(_zigzag(parts, seps))
def _fillers(self, num_parts):
""" Returns a generator for strings to join the given number of string
parts in all possible combinations.
"""
return itertools.product(self.replacements, repeat=num_parts - 1)

View File

@@ -59,6 +59,13 @@ token-analysis:
mode: variant-only
variants:
- !include icu-rules/variants-de.yaml
mutations:
- pattern: ä
replacements: ["ä", "ae"]
- pattern: ö
replacements: ["ö", "oe"]
- pattern: ü
replacements: ["ü", "ue"]
- id: el
analyzer: generic
mode: variant-only

View File

@@ -58,3 +58,48 @@ Feature: Import and search of names
| |
| |
| |
Scenario: German umlauts can be found when expanded
Given the places
| osm | class | type | name+name:de |
| N1 | place | city | Münster |
| N2 | place | city | Köln |
| N3 | place | city | Gräfenroda |
When importing
When sending search query "münster"
Then results contain
| osm |
| N1 |
When sending search query "muenster"
Then results contain
| osm |
| N1 |
When sending search query "munster"
Then results contain
| osm |
| N1 |
When sending search query "Köln"
Then results contain
| osm |
| N2 |
When sending search query "Koeln"
Then results contain
| osm |
| N2 |
When sending search query "Koln"
Then results contain
| osm |
| N2 |
When sending search query "gräfenroda"
Then results contain
| osm |
| N3 |
When sending search query "graefenroda"
Then results contain
| osm |
| N3 |
When sending search query "grafenroda"
Then results contain
| osm |
| N3 |

View File

@@ -0,0 +1,89 @@
# SPDX-License-Identifier: GPL-2.0-only
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2022 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Tests for generic token analysis, mutation part.
"""
import pytest
from icu import Transliterator
import nominatim.tokenizer.token_analysis.generic as module
from nominatim.errors import UsageError
DEFAULT_NORMALIZATION = """ '🜳' > ' ';
[[:Nonspacing Mark:] [:Cf:]] >;
:: lower ();
[[:Punctuation:][:Space:]]+ > ' '
"""
DEFAULT_TRANSLITERATION = """ :: Latin ();
'🜵' > ' ';
"""
class TestMutationNoVariants:
def make_analyser(self, *mutations):
rules = { 'analyzer': 'generic',
'mutations': [ {'pattern': m[0], 'replacements': m[1]}
for m in mutations]
}
config = module.configure(rules, DEFAULT_NORMALIZATION)
trans = Transliterator.createFromRules("test_trans", DEFAULT_TRANSLITERATION)
self.analysis = module.create(trans, config)
def variants(self, name):
norm = Transliterator.createFromRules("test_norm", DEFAULT_NORMALIZATION)
return set(self.analysis.get_variants_ascii(norm.transliterate(name).strip()))
@pytest.mark.parametrize('pattern', ('(capture)', ['a list']))
def test_bad_pattern(self, pattern):
with pytest.raises(UsageError):
self.make_analyser((pattern, ['b']))
@pytest.mark.parametrize('replacements', (None, 'a string'))
def test_bad_replacement(self, replacements):
with pytest.raises(UsageError):
self.make_analyser(('a', replacements))
def test_simple_replacement(self):
self.make_analyser(('a', ['b']))
assert self.variants('none') == {'none'}
assert self.variants('abba') == {'bbbb'}
assert self.variants('2 aar') == {'2 bbr'}
def test_multichar_replacement(self):
self.make_analyser(('1 1', ['1 1 1']))
assert self.variants('1 1456') == {'1 1 1456'}
assert self.variants('1 1 1') == {'1 1 1 1'}
def test_removement_replacement(self):
self.make_analyser((' ', [' ', '']))
assert self.variants('A 345') == {'a 345', 'a345'}
assert self.variants('a g b') == {'a g b', 'ag b', 'a gb', 'agb'}
def test_regex_pattern(self):
self.make_analyser(('[^a-z]+', ['XXX', ' ']))
assert self.variants('a-34n12') == {'aXXXnXXX', 'aXXXn', 'a nXXX', 'a n'}
def test_multiple_mutations(self):
self.make_analyser(('ä', ['ä', 'ae']), ('ö', ['ö', 'oe']))
assert self.variants('Längenöhr') == {'längenöhr', 'laengenöhr',
'längenoehr', 'laengenoehr'}