forked from hans/Nominatim
introduce mutation variants to generic token analyser
Mutations are regular-expression-based replacements that are applied after variants have been computed. They are meant to be used for variations on character level. Add spelling variations for German umlauts.
This commit is contained in:
@@ -11,7 +11,9 @@ import itertools
|
||||
|
||||
import datrie
|
||||
|
||||
from nominatim.errors import UsageError
|
||||
from nominatim.tokenizer.token_analysis.config_variants import get_variant_config
|
||||
from nominatim.tokenizer.token_analysis.generic_mutation import MutationVariantGenerator
|
||||
|
||||
### Configuration section
|
||||
|
||||
@@ -23,6 +25,7 @@ def configure(rules, normalization_rules):
|
||||
config['replacements'], config['chars'] = get_variant_config(rules.get('variants'),
|
||||
normalization_rules)
|
||||
config['variant_only'] = rules.get('mode', '') == 'variant-only'
|
||||
config['mutations'] = rules.get('mutations', [])
|
||||
|
||||
return config
|
||||
|
||||
@@ -52,19 +55,45 @@ class GenericTokenAnalysis:
|
||||
else:
|
||||
self.replacements = None
|
||||
|
||||
# set up mutation rules
|
||||
self.mutations = []
|
||||
for cfg in config['mutations']:
|
||||
if 'pattern' not in cfg:
|
||||
raise UsageError("Missing field 'pattern' in mutation configuration.")
|
||||
if not isinstance(cfg['pattern'], str):
|
||||
raise UsageError("Field 'pattern' in mutation configuration "
|
||||
"must be a simple text field.")
|
||||
if 'replacements' not in cfg:
|
||||
raise UsageError("Missing field 'replacements' in mutation configuration.")
|
||||
if not isinstance(cfg['replacements'], list):
|
||||
raise UsageError("Field 'replacements' in mutation configuration "
|
||||
"must be a list of texts.")
|
||||
|
||||
self.mutations.append(MutationVariantGenerator(cfg['pattern'],
|
||||
cfg['replacements']))
|
||||
|
||||
|
||||
def get_variants_ascii(self, norm_name):
    """ Compute the spelling variants for the given normalized name
        and transliterate the result.

        The word variants of 'norm_name' are passed through every
        configured mutation before being transliterated. Returns a list
        of the resulting names, leaving out names that are empty after
        transliteration.
    """
    variants = self._generate_word_variants(norm_name)

    # Each mutation consumes the output of the previous one, so the
    # mutations are applied in the order they appear in self.mutations.
    for mutation in self.mutations:
        variants = mutation.generate(variants)

    return [name for name in self._transliterate_unique_list(norm_name, variants) if name]
|
||||
|
||||
|
||||
def _transliterate_unique_list(self, norm_name, iterable):
|
||||
seen = set()
|
||||
if self.variant_only:
|
||||
seen.add(norm_name)
|
||||
|
||||
for variant in map(str.strip, iterable):
|
||||
if variant not in seen:
|
||||
seen.add(variant)
|
||||
yield self.to_ascii.transliterate(variant).strip()
|
||||
|
||||
|
||||
def _generate_word_variants(self, norm_name):
|
||||
|
||||
56
nominatim/tokenizer/token_analysis/generic_mutation.py
Normal file
56
nominatim/tokenizer/token_analysis/generic_mutation.py
Normal file
@@ -0,0 +1,56 @@
|
||||
# SPDX-License-Identifier: GPL-2.0-only
|
||||
#
|
||||
# This file is part of Nominatim. (https://nominatim.org)
|
||||
#
|
||||
# Copyright (C) 2022 by the Nominatim developer community.
|
||||
# For a full list of authors see the git log.
|
||||
"""
|
||||
Creator for mutation variants for the generic token analysis.
|
||||
"""
|
||||
import itertools
|
||||
import logging
|
||||
import re
|
||||
|
||||
from nominatim.errors import UsageError
|
||||
|
||||
LOG = logging.getLogger()
|
||||
|
||||
def _zigzag(outer, inner):
|
||||
return itertools.chain.from_iterable(itertools.zip_longest(outer, inner, fillvalue=''))
|
||||
|
||||
|
||||
class MutationVariantGenerator:
    """ Generates name variants by applying a regular expression to the name
        and replacing it with one or more variants. When the regular expression
        matches more than once, each occurrence is replaced with all replacement
        patterns.
    """

    def __init__(self, pattern, replacements):
        """ Set up a generator that replaces matches of the regular
            expression 'pattern' with each of the strings in 'replacements'.

            Raises a UsageError when 'pattern' is not a valid regular
            expression or when it contains a capturing group.
        """
        try:
            self.pattern = re.compile(pattern)
        except re.error as err:
            # Report a broken expression like any other configuration
            # problem instead of leaking the raw re.error to the caller.
            LOG.fatal("The mutation pattern %s is not a valid regular "
                      "expression: %s", pattern, err)
            raise UsageError("Bad mutation pattern in configuration.") from err

        self.replacements = replacements

        # re.split() adds the text of capturing groups to its result, which
        # would mix matched text in between the parts joined by generate().
        if self.pattern.groups > 0:
            LOG.fatal("The mutation pattern %s contains a capturing group. "
                      "This is not allowed.", pattern)
            raise UsageError("Bad mutation pattern in configuration.")


    def generate(self, names):
        """ Generator function for the name variants. 'names' is an iterable
            over a set of names for which the variants are to be generated.

            A name without a match of the pattern is yielded unchanged.
            Otherwise one variant is yielded for every combination of
            replacements over all matches.
        """
        for name in names:
            parts = self.pattern.split(name)
            if len(parts) == 1:
                yield name
            else:
                head, tail = parts[0], parts[1:]
                for seps in self._fillers(len(parts)):
                    # Put one replacement in front of every part after the first.
                    yield head + ''.join(sep + part for sep, part in zip(seps, tail))


    def _fillers(self, num_parts):
        """ Returns a generator for strings to join the given number of string
            parts in all possible combinations.
        """
        return itertools.product(self.replacements, repeat=num_parts - 1)
|
||||
@@ -59,6 +59,13 @@ token-analysis:
|
||||
mode: variant-only
|
||||
variants:
|
||||
- !include icu-rules/variants-de.yaml
|
||||
mutations:
|
||||
- pattern: ä
|
||||
replacements: ["ä", "ae"]
|
||||
- pattern: ö
|
||||
replacements: ["ö", "oe"]
|
||||
- pattern: ü
|
||||
replacements: ["ü", "ue"]
|
||||
- id: el
|
||||
analyzer: generic
|
||||
mode: variant-only
|
||||
|
||||
@@ -58,3 +58,48 @@ Feature: Import and search of names
|
||||
| រាជធានីភ្នំពេញ |
|
||||
| 東京都 |
|
||||
| ပုဗ္ဗသီရိ |
|
||||
|
||||
|
||||
Scenario: German umlauts can be found when expanded
|
||||
Given the places
|
||||
| osm | class | type | name+name:de |
|
||||
| N1 | place | city | Münster |
|
||||
| N2 | place | city | Köln |
|
||||
| N3 | place | city | Gräfenroda |
|
||||
When importing
|
||||
When sending search query "münster"
|
||||
Then results contain
|
||||
| osm |
|
||||
| N1 |
|
||||
When sending search query "muenster"
|
||||
Then results contain
|
||||
| osm |
|
||||
| N1 |
|
||||
When sending search query "munster"
|
||||
Then results contain
|
||||
| osm |
|
||||
| N1 |
|
||||
When sending search query "Köln"
|
||||
Then results contain
|
||||
| osm |
|
||||
| N2 |
|
||||
When sending search query "Koeln"
|
||||
Then results contain
|
||||
| osm |
|
||||
| N2 |
|
||||
When sending search query "Koln"
|
||||
Then results contain
|
||||
| osm |
|
||||
| N2 |
|
||||
When sending search query "gräfenroda"
|
||||
Then results contain
|
||||
| osm |
|
||||
| N3 |
|
||||
When sending search query "graefenroda"
|
||||
Then results contain
|
||||
| osm |
|
||||
| N3 |
|
||||
When sending search query "grafenroda"
|
||||
Then results contain
|
||||
| osm |
|
||||
| N3 |
|
||||
|
||||
@@ -0,0 +1,89 @@
|
||||
# SPDX-License-Identifier: GPL-2.0-only
|
||||
#
|
||||
# This file is part of Nominatim. (https://nominatim.org)
|
||||
#
|
||||
# Copyright (C) 2022 by the Nominatim developer community.
|
||||
# For a full list of authors see the git log.
|
||||
"""
|
||||
Tests for generic token analysis, mutation part.
|
||||
"""
|
||||
import pytest
|
||||
|
||||
from icu import Transliterator
|
||||
|
||||
import nominatim.tokenizer.token_analysis.generic as module
|
||||
from nominatim.errors import UsageError
|
||||
|
||||
DEFAULT_NORMALIZATION = """ '🜳' > ' ';
|
||||
[[:Nonspacing Mark:] [:Cf:]] >;
|
||||
:: lower ();
|
||||
[[:Punctuation:][:Space:]]+ > ' '
|
||||
"""
|
||||
|
||||
DEFAULT_TRANSLITERATION = """ :: Latin ();
|
||||
'🜵' > ' ';
|
||||
"""
|
||||
|
||||
class TestMutationNoVariants:
    """ Tests for the mutation rules of the generic analyser when the
        rule set contains mutations only.
    """

    def make_analyser(self, *mutations):
        """ Create the analyser under test. Each argument is a pair of
            mutation pattern and list of replacements.
        """
        mutation_rules = []
        for entry in mutations:
            mutation_rules.append({'pattern': entry[0], 'replacements': entry[1]})

        rules = {'analyzer': 'generic', 'mutations': mutation_rules}
        config = module.configure(rules, DEFAULT_NORMALIZATION)
        trans = Transliterator.createFromRules("test_trans", DEFAULT_TRANSLITERATION)

        self.analysis = module.create(trans, config)


    def variants(self, name):
        """ Normalize 'name' and return the set of variants the analyser
            computes for it.
        """
        normalizer = Transliterator.createFromRules("test_norm", DEFAULT_NORMALIZATION)
        norm_name = normalizer.transliterate(name).strip()

        return set(self.analysis.get_variants_ascii(norm_name))


    @pytest.mark.parametrize('pattern', ('(capture)', ['a list']))
    def test_bad_pattern(self, pattern):
        bad_mutation = (pattern, ['b'])

        with pytest.raises(UsageError):
            self.make_analyser(bad_mutation)


    @pytest.mark.parametrize('replacements', (None, 'a string'))
    def test_bad_replacement(self, replacements):
        bad_mutation = ('a', replacements)

        with pytest.raises(UsageError):
            self.make_analyser(bad_mutation)


    def test_simple_replacement(self):
        self.make_analyser(('a', ['b']))

        expected = {'none': {'none'},
                    'abba': {'bbbb'},
                    '2 aar': {'2 bbr'}}

        for name, result in expected.items():
            assert self.variants(name) == result


    def test_multichar_replacement(self):
        self.make_analyser(('1 1', ['1 1 1']))

        expected = {'1 1456': {'1 1 1456'},
                    '1 1 1': {'1 1 1 1'}}

        for name, result in expected.items():
            assert self.variants(name) == result


    def test_removement_replacement(self):
        self.make_analyser((' ', [' ', '']))

        expected = {'A 345': {'a 345', 'a345'},
                    'a g b': {'a g b', 'ag b', 'a gb', 'agb'}}

        for name, result in expected.items():
            assert self.variants(name) == result


    def test_regex_pattern(self):
        self.make_analyser(('[^a-z]+', ['XXX', ' ']))

        found = self.variants('a-34n12')

        assert found == {'aXXXnXXX', 'aXXXn', 'a nXXX', 'a n'}


    def test_multiple_mutations(self):
        umlaut_a = ('ä', ['ä', 'ae'])
        umlaut_o = ('ö', ['ö', 'oe'])
        self.make_analyser(umlaut_a, umlaut_o)

        found = self.variants('Längenöhr')

        assert found == {'längenöhr', 'laengenöhr',
                         'längenoehr', 'laengenoehr'}
|
||||
Reference in New Issue
Block a user