mirror of
https://github.com/osm-search/Nominatim.git
synced 2026-03-08 02:54:08 +00:00
Merge pull request #2585 from lonvia/name-mutations
Introduce character mutations to token analysis
This commit is contained in:
@@ -99,6 +99,9 @@ token-analysis:
|
|||||||
- words:
|
- words:
|
||||||
- road -> rd
|
- road -> rd
|
||||||
- bridge -> bdge,br,brdg,bri,brg
|
- bridge -> bdge,br,brdg,bri,brg
|
||||||
|
mutations:
|
||||||
|
- pattern: 'ä'
|
||||||
|
replacements: ['ä', 'ae']
|
||||||
```
|
```
|
||||||
|
|
||||||
The configuration file contains four sections:
|
The configuration file contains four sections:
|
||||||
@@ -205,12 +208,11 @@ the `analyzer` parameter must be set. Currently there is only one implementation
|
|||||||
##### Generic token analyzer
|
##### Generic token analyzer
|
||||||
|
|
||||||
The generic analyzer is able to create variants from a list of given
|
The generic analyzer is able to create variants from a list of given
|
||||||
abbreviation and decomposition replacements. It takes one optional parameter
|
abbreviation and decomposition replacements and introduce spelling variations.
|
||||||
`variants` which lists the replacements to apply. If the section is
|
|
||||||
omitted, then the generic analyzer becomes a simple analyzer that only
|
|
||||||
applies the transliteration.
|
|
||||||
|
|
||||||
The variants section defines lists of replacements which create alternative
|
###### Variants
|
||||||
|
|
||||||
|
The optional 'variants' section defines lists of replacements which create alternative
|
||||||
spellings of a name. To create the variants, a name is scanned from left to
|
spellings of a name. To create the variants, a name is scanned from left to
|
||||||
right and the longest matching replacement is applied until the end of the
|
right and the longest matching replacement is applied until the end of the
|
||||||
string is reached.
|
string is reached.
|
||||||
@@ -296,6 +298,32 @@ decomposition has an effect here on the source as well. So a rule
|
|||||||
means that for a word like `hauptstrasse` four variants are created:
|
means that for a word like `hauptstrasse` four variants are created:
|
||||||
`hauptstrasse`, `haupt strasse`, `hauptstr` and `haupt str`.
|
`hauptstrasse`, `haupt strasse`, `hauptstr` and `haupt str`.
|
||||||
|
|
||||||
|
###### Mutations
|
||||||
|
|
||||||
|
The 'mutations' section in the configuration describes an additional set of
|
||||||
|
replacements to be applied after the variants have been computed.
|
||||||
|
|
||||||
|
Each mutation is described by two parameters: `pattern` and `replacements`.
|
||||||
|
The pattern must contain a single regular expression to search for in the
|
||||||
|
variant name. The regular expressions need to follow the syntax for
|
||||||
|
[Python regular expressions](https://docs.python.org/3/library/re.html#regular-expression-syntax).
|
||||||
|
Capturing groups are not permitted.
|
||||||
|
`replacements` must contain a list of strings that the pattern
|
||||||
|
should be replaced with. Each occurrence of the pattern is replaced with
|
||||||
|
all given replacements. Be mindful of combinatorial explosion of variants.
|
||||||
|
|
||||||
|
###### Modes
|
||||||
|
|
||||||
|
The generic analyzer supports a special mode `variant-only`. When configured,
|
||||||
|
then it consumes the input token and emits only variants (if any exist). Enable
|
||||||
|
the mode by adding:
|
||||||
|
|
||||||
|
```
|
||||||
|
mode: variant-only
|
||||||
|
```
|
||||||
|
|
||||||
|
to the analyzer configuration.
|
||||||
|
|
||||||
### Reconfiguration
|
### Reconfiguration
|
||||||
|
|
||||||
Changing the configuration after the import is currently not possible, although
|
Changing the configuration after the import is currently not possible, although
|
||||||
|
|||||||
134
nominatim/tokenizer/token_analysis/config_variants.py
Normal file
134
nominatim/tokenizer/token_analysis/config_variants.py
Normal file
@@ -0,0 +1,134 @@
|
|||||||
|
# SPDX-License-Identifier: GPL-2.0-only
|
||||||
|
#
|
||||||
|
# This file is part of Nominatim. (https://nominatim.org)
|
||||||
|
#
|
||||||
|
# Copyright (C) 2022 by the Nominatim developer community.
|
||||||
|
# For a full list of authors see the git log.
|
||||||
|
"""
|
||||||
|
Parser for configuration for variants.
|
||||||
|
"""
|
||||||
|
from collections import defaultdict, namedtuple
|
||||||
|
import itertools
|
||||||
|
import re
|
||||||
|
|
||||||
|
from icu import Transliterator
|
||||||
|
|
||||||
|
from nominatim.config import flatten_config_list
|
||||||
|
from nominatim.errors import UsageError
|
||||||
|
|
||||||
|
# A single replacement rule: the text to look for ('source') and the text
# to put in its place ('replacement'). Instances are collected in a set by
# get_variant_config(), so identical rules are only kept once.
ICUVariant = namedtuple('ICUVariant', ['source', 'replacement'])
|
||||||
|
|
||||||
|
def get_variant_config(rules, normalization_rules):
    """ Build the replacement table from the 'variants' configuration.

        `rules` is the raw content of the variants section. When it is
        empty or None, no replacements are produced. `normalization_rules`
        are the ICU rules handed to the variant maker for normalizing the
        terms that appear in the rules.

        Returns a tuple with two items: a list of
        `(source, [replacement, ...])` pairs and a string made up of all
        characters that occur in any of the sources.
    """
    by_source = defaultdict(list)
    alphabet = set()

    if rules:
        sections = flatten_config_list(rules, 'variants')
        maker = _VariantMaker(normalization_rules)

        # Going through a set first collapses rules that are listed twice.
        unique_variants = set()
        for section in sections:
            for rule in (section.get('words') or []):
                unique_variants.update(maker.compute(rule))

        # Group the replacements by their source and gather the characters
        # used by the sources along the way.
        for source, replacement in unique_variants:
            # When both sides end in a space, the replacement is stored
            # without its trailing space.
            if source[-1] == ' ' and replacement[-1] == ' ':
                replacement = replacement[:-1]
            by_source[source].append(replacement)
            alphabet.update(source)

    return list(by_source.items()), ''.join(alphabet)
|
||||||
|
|
||||||
|
|
||||||
|
class _VariantMaker:
    """ Generator for all necessary ICUVariants from a single variant rule.

        Every term found in a rule is run through the normalization
        transliterator so that the variants match normalized names later.
    """

    def __init__(self, norm_rules):
        """ Create the normalization transliterator from the given ICU rules.
        """
        self.norm = Transliterator.createFromRules("rule_loader_normalization",
                                                   norm_rules)


    def compute(self, rule):
        """ Generator for all ICUVariant tuples from a single variant rule.

            A rule consists of a comma-separated list of source terms, an
            arrow and a comma-separated list of replacement terms. The arrow
            is either '->' (the source is kept as an additional 1:1
            replacement) or '=>'. A '|' directly in front of the arrow
            switches off the additional decomposed forms.

            Raises a UsageError when the rule does not contain exactly
            one arrow.
        """
        parts = re.split(r'(\|)?([=-])>', rule)
        if len(parts) != 4:
            raise UsageError("Syntax error in variant rule: " + rule)

        sources_text, bar, arrow, replacements_text = parts
        decompose = bar is None

        # Terms that normalize to nothing come back as None and are dropped.
        sources = [word for word
                   in map(self._parse_variant_word, sources_text.split(','))
                   if word]
        # Deliberately lazy: only evaluated when the product below is built.
        replacements = (self.norm.transliterate(text).strip()
                        for text in replacements_text.split(','))

        # If the source should be kept, add a 1:1 replacement.
        if arrow == '-':
            for word in sources:
                yield from itertools.starmap(
                    ICUVariant, _create_variants(*word, word[0], decompose))

        for word, repl in itertools.product(sources, replacements):
            if repl:
                yield from itertools.starmap(
                    ICUVariant, _create_variants(*word, repl, decompose))


    def _parse_variant_word(self, name):
        """ Split a source term into its normalized text and its flags.

            Returns a tuple `(normalized text, leading flag, trailing flag)`
            where the leading flag is one of '~', '^' or '' and the trailing
            flag one of '~', '$' or ''. Returns None when nothing is left of
            the text after normalization.

            Raises a UsageError when the term is malformed or carries a '~'
            on both ends.
        """
        name = name.strip()
        parsed = re.fullmatch(r'([~^]?)([^~$^]*)([~$]?)', name)
        if parsed is not None:
            preflag, text, postflag = parsed.groups()
        if parsed is None or (preflag == '~' and postflag == '~'):
            raise UsageError("Invalid variant word descriptor '{}'".format(name))

        normalized = self.norm.transliterate(text).strip()

        return (normalized, preflag, postflag) if normalized else None
|
||||||
|
|
||||||
|
|
||||||
|
# Text that is attached to a term for each anchor flag: '^' on the leading
# side, '$' on the trailing side and a plain space when there is no flag.
_FLAG_MATCH = {
    '': ' ',
    '^': '^ ',
    '$': ' ^',
}


def _create_variants(src, preflag, postflag, repl, decompose):
    """ Generator for the (source, replacement) string pairs of one term.

        `preflag` and `postflag` are the flags found in front of and behind
        the source term. A '~' on either side produces the pair both with
        and without a separating space on that side; when `decompose` is
        set, the two mixed combinations are produced as well. Without a
        '~' a single pair framed by the anchor texts is produced.
    """
    if preflag == '~':
        # suffix decomposition
        tail = _FLAG_MATCH[postflag]
        source, target = src + tail, repl + tail

        yield from ((source, target), (' ' + source, ' ' + target))
        if decompose:
            yield from ((source, ' ' + target), (' ' + source, target))
        return

    head = _FLAG_MATCH[preflag]

    if postflag == '~':
        # prefix decomposition
        source, target = head + src, head + repl

        yield from ((source, target), (source + ' ', target + ' '))
        if decompose:
            yield from ((source, target + ' '), (source + ' ', target))
        return

    tail = _FLAG_MATCH[postflag]

    yield head + src + tail, head + repl + tail
|
||||||
@@ -7,145 +7,44 @@
|
|||||||
"""
|
"""
|
||||||
Generic processor for names that creates abbreviation variants.
|
Generic processor for names that creates abbreviation variants.
|
||||||
"""
|
"""
|
||||||
from collections import defaultdict, namedtuple
|
|
||||||
import itertools
|
import itertools
|
||||||
import re
|
|
||||||
|
|
||||||
from icu import Transliterator
|
|
||||||
import datrie
|
import datrie
|
||||||
|
|
||||||
from nominatim.config import flatten_config_list
|
|
||||||
from nominatim.errors import UsageError
|
from nominatim.errors import UsageError
|
||||||
|
from nominatim.tokenizer.token_analysis.config_variants import get_variant_config
|
||||||
|
from nominatim.tokenizer.token_analysis.generic_mutation import MutationVariantGenerator
|
||||||
|
|
||||||
### Configuration section
|
### Configuration section
|
||||||
|
|
||||||
ICUVariant = namedtuple('ICUVariant', ['source', 'replacement'])
|
|
||||||
|
|
||||||
def configure(rules, normalization_rules):
|
def configure(rules, normalization_rules):
|
||||||
""" Extract and preprocess the configuration for this module.
|
""" Extract and preprocess the configuration for this module.
|
||||||
"""
|
"""
|
||||||
config = {}
|
config = {}
|
||||||
|
|
||||||
config['replacements'], config['chars'] = _get_variant_config(rules.get('variants'),
|
config['replacements'], config['chars'] = get_variant_config(rules.get('variants'),
|
||||||
normalization_rules)
|
normalization_rules)
|
||||||
config['variant_only'] = rules.get('mode', '') == 'variant-only'
|
config['variant_only'] = rules.get('mode', '') == 'variant-only'
|
||||||
|
|
||||||
|
# parse mutation rules
|
||||||
|
config['mutations'] = []
|
||||||
|
for rule in rules.get('mutations', []):
|
||||||
|
if 'pattern' not in rule:
|
||||||
|
raise UsageError("Missing field 'pattern' in mutation configuration.")
|
||||||
|
if not isinstance(rule['pattern'], str):
|
||||||
|
raise UsageError("Field 'pattern' in mutation configuration "
|
||||||
|
"must be a simple text field.")
|
||||||
|
if 'replacements' not in rule:
|
||||||
|
raise UsageError("Missing field 'replacements' in mutation configuration.")
|
||||||
|
if not isinstance(rule['replacements'], list):
|
||||||
|
raise UsageError("Field 'replacements' in mutation configuration "
|
||||||
|
"must be a list of texts.")
|
||||||
|
|
||||||
|
config['mutations'].append((rule['pattern'], rule['replacements']))
|
||||||
|
|
||||||
return config
|
return config
|
||||||
|
|
||||||
|
|
||||||
def _get_variant_config(rules, normalization_rules):
|
|
||||||
""" Convert the variant definition from the configuration into
|
|
||||||
replacement sets.
|
|
||||||
"""
|
|
||||||
immediate = defaultdict(list)
|
|
||||||
chars = set()
|
|
||||||
|
|
||||||
if rules:
|
|
||||||
vset = set()
|
|
||||||
rules = flatten_config_list(rules, 'variants')
|
|
||||||
|
|
||||||
vmaker = _VariantMaker(normalization_rules)
|
|
||||||
|
|
||||||
for section in rules:
|
|
||||||
for rule in (section.get('words') or []):
|
|
||||||
vset.update(vmaker.compute(rule))
|
|
||||||
|
|
||||||
# Intermediate reorder by source. Also compute required character set.
|
|
||||||
for variant in vset:
|
|
||||||
if variant.source[-1] == ' ' and variant.replacement[-1] == ' ':
|
|
||||||
replstr = variant.replacement[:-1]
|
|
||||||
else:
|
|
||||||
replstr = variant.replacement
|
|
||||||
immediate[variant.source].append(replstr)
|
|
||||||
chars.update(variant.source)
|
|
||||||
|
|
||||||
return list(immediate.items()), ''.join(chars)
|
|
||||||
|
|
||||||
|
|
||||||
class _VariantMaker:
|
|
||||||
""" Generater for all necessary ICUVariants from a single variant rule.
|
|
||||||
|
|
||||||
All text in rules is normalized to make sure the variants match later.
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(self, norm_rules):
|
|
||||||
self.norm = Transliterator.createFromRules("rule_loader_normalization",
|
|
||||||
norm_rules)
|
|
||||||
|
|
||||||
|
|
||||||
def compute(self, rule):
|
|
||||||
""" Generator for all ICUVariant tuples from a single variant rule.
|
|
||||||
"""
|
|
||||||
parts = re.split(r'(\|)?([=-])>', rule)
|
|
||||||
if len(parts) != 4:
|
|
||||||
raise UsageError("Syntax error in variant rule: " + rule)
|
|
||||||
|
|
||||||
decompose = parts[1] is None
|
|
||||||
src_terms = [self._parse_variant_word(t) for t in parts[0].split(',')]
|
|
||||||
repl_terms = (self.norm.transliterate(t).strip() for t in parts[3].split(','))
|
|
||||||
|
|
||||||
# If the source should be kept, add a 1:1 replacement
|
|
||||||
if parts[2] == '-':
|
|
||||||
for src in src_terms:
|
|
||||||
if src:
|
|
||||||
for froms, tos in _create_variants(*src, src[0], decompose):
|
|
||||||
yield ICUVariant(froms, tos)
|
|
||||||
|
|
||||||
for src, repl in itertools.product(src_terms, repl_terms):
|
|
||||||
if src and repl:
|
|
||||||
for froms, tos in _create_variants(*src, repl, decompose):
|
|
||||||
yield ICUVariant(froms, tos)
|
|
||||||
|
|
||||||
|
|
||||||
def _parse_variant_word(self, name):
|
|
||||||
name = name.strip()
|
|
||||||
match = re.fullmatch(r'([~^]?)([^~$^]*)([~$]?)', name)
|
|
||||||
if match is None or (match.group(1) == '~' and match.group(3) == '~'):
|
|
||||||
raise UsageError("Invalid variant word descriptor '{}'".format(name))
|
|
||||||
norm_name = self.norm.transliterate(match.group(2)).strip()
|
|
||||||
if not norm_name:
|
|
||||||
return None
|
|
||||||
|
|
||||||
return norm_name, match.group(1), match.group(3)
|
|
||||||
|
|
||||||
|
|
||||||
_FLAG_MATCH = {'^': '^ ',
|
|
||||||
'$': ' ^',
|
|
||||||
'': ' '}
|
|
||||||
|
|
||||||
|
|
||||||
def _create_variants(src, preflag, postflag, repl, decompose):
|
|
||||||
if preflag == '~':
|
|
||||||
postfix = _FLAG_MATCH[postflag]
|
|
||||||
# suffix decomposition
|
|
||||||
src = src + postfix
|
|
||||||
repl = repl + postfix
|
|
||||||
|
|
||||||
yield src, repl
|
|
||||||
yield ' ' + src, ' ' + repl
|
|
||||||
|
|
||||||
if decompose:
|
|
||||||
yield src, ' ' + repl
|
|
||||||
yield ' ' + src, repl
|
|
||||||
elif postflag == '~':
|
|
||||||
# prefix decomposition
|
|
||||||
prefix = _FLAG_MATCH[preflag]
|
|
||||||
src = prefix + src
|
|
||||||
repl = prefix + repl
|
|
||||||
|
|
||||||
yield src, repl
|
|
||||||
yield src + ' ', repl + ' '
|
|
||||||
|
|
||||||
if decompose:
|
|
||||||
yield src, repl + ' '
|
|
||||||
yield src + ' ', repl
|
|
||||||
else:
|
|
||||||
prefix = _FLAG_MATCH[preflag]
|
|
||||||
postfix = _FLAG_MATCH[postflag]
|
|
||||||
|
|
||||||
yield prefix + src + postfix, prefix + repl + postfix
|
|
||||||
|
|
||||||
|
|
||||||
### Analysis section
|
### Analysis section
|
||||||
|
|
||||||
def create(transliterator, config):
|
def create(transliterator, config):
|
||||||
@@ -171,19 +70,43 @@ class GenericTokenAnalysis:
|
|||||||
else:
|
else:
|
||||||
self.replacements = None
|
self.replacements = None
|
||||||
|
|
||||||
|
# set up mutation rules
|
||||||
|
self.mutations = [MutationVariantGenerator(*cfg) for cfg in config['mutations']]
|
||||||
|
|
||||||
|
|
||||||
def get_variants_ascii(self, norm_name):
|
def get_variants_ascii(self, norm_name):
|
||||||
""" Compute the spelling variants for the given normalized name
|
""" Compute the spelling variants for the given normalized name
|
||||||
and transliterate the result.
|
and transliterate the result.
|
||||||
"""
|
"""
|
||||||
|
variants = self._generate_word_variants(norm_name)
|
||||||
|
|
||||||
|
for mutation in self.mutations:
|
||||||
|
variants = mutation.generate(variants)
|
||||||
|
|
||||||
|
return [name for name in self._transliterate_unique_list(norm_name, variants) if name]
|
||||||
|
|
||||||
|
|
||||||
|
def _transliterate_unique_list(self, norm_name, iterable):
|
||||||
|
seen = set()
|
||||||
|
if self.variant_only:
|
||||||
|
seen.add(norm_name)
|
||||||
|
|
||||||
|
for variant in map(str.strip, iterable):
|
||||||
|
if variant not in seen:
|
||||||
|
seen.add(variant)
|
||||||
|
yield self.to_ascii.transliterate(variant).strip()
|
||||||
|
|
||||||
|
|
||||||
|
def _generate_word_variants(self, norm_name):
|
||||||
baseform = '^ ' + norm_name + ' ^'
|
baseform = '^ ' + norm_name + ' ^'
|
||||||
|
baselen = len(baseform)
|
||||||
partials = ['']
|
partials = ['']
|
||||||
|
|
||||||
startpos = 0
|
startpos = 0
|
||||||
if self.replacements is not None:
|
if self.replacements is not None:
|
||||||
pos = 0
|
pos = 0
|
||||||
force_space = False
|
force_space = False
|
||||||
while pos < len(baseform):
|
while pos < baselen:
|
||||||
full, repl = self.replacements.longest_prefix_item(baseform[pos:],
|
full, repl = self.replacements.longest_prefix_item(baseform[pos:],
|
||||||
(None, None))
|
(None, None))
|
||||||
if full is not None:
|
if full is not None:
|
||||||
@@ -207,24 +130,9 @@ class GenericTokenAnalysis:
|
|||||||
|
|
||||||
# No variants detected? Fast return.
|
# No variants detected? Fast return.
|
||||||
if startpos == 0:
|
if startpos == 0:
|
||||||
if self.variant_only:
|
return (norm_name, )
|
||||||
return []
|
|
||||||
|
|
||||||
trans_name = self.to_ascii.transliterate(norm_name).strip()
|
if startpos < baselen:
|
||||||
return [trans_name] if trans_name else []
|
return (part[1:] + baseform[startpos:-1] for part in partials)
|
||||||
|
|
||||||
return self._compute_result_set(partials, baseform[startpos:],
|
return (part[1:-1] for part in partials)
|
||||||
norm_name if self.variant_only else '')
|
|
||||||
|
|
||||||
|
|
||||||
def _compute_result_set(self, partials, prefix, exclude):
|
|
||||||
results = set()
|
|
||||||
|
|
||||||
for variant in partials:
|
|
||||||
vname = (variant + prefix)[1:-1].strip()
|
|
||||||
if vname != exclude:
|
|
||||||
trans_name = self.to_ascii.transliterate(vname).strip()
|
|
||||||
if trans_name:
|
|
||||||
results.add(trans_name)
|
|
||||||
|
|
||||||
return list(results)
|
|
||||||
|
|||||||
56
nominatim/tokenizer/token_analysis/generic_mutation.py
Normal file
56
nominatim/tokenizer/token_analysis/generic_mutation.py
Normal file
@@ -0,0 +1,56 @@
|
|||||||
|
# SPDX-License-Identifier: GPL-2.0-only
|
||||||
|
#
|
||||||
|
# This file is part of Nominatim. (https://nominatim.org)
|
||||||
|
#
|
||||||
|
# Copyright (C) 2022 by the Nominatim developer community.
|
||||||
|
# For a full list of authors see the git log.
|
||||||
|
"""
|
||||||
|
Creator for mutation variants for the generic token analysis.
|
||||||
|
"""
|
||||||
|
import itertools
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
|
||||||
|
from nominatim.errors import UsageError
|
||||||
|
|
||||||
|
LOG = logging.getLogger()
|
||||||
|
|
||||||
|
def _zigzag(outer, inner):
    """ Return an iterator that alternates between the items of `outer`
        and `inner`, starting with `outer`. When one of the two runs out
        first, empty strings are used in its place.
    """
    pairs = itertools.zip_longest(outer, inner, fillvalue='')
    return itertools.chain.from_iterable(pairs)
|
||||||
|
|
||||||
|
|
||||||
|
class MutationVariantGenerator:
    """ Generates name variants by applying a regular expression to the name
        and replacing it with one or more variants. When the regular expression
        matches more than once, each occurrence is replaced with all replacement
        patterns.
    """

    def __init__(self, pattern, replacements):
        """ Compile `pattern` and remember the list of `replacements`.

            Raises a UsageError when the pattern contains a capturing group.
        """
        self.pattern = re.compile(pattern)
        self.replacements = replacements

        has_capturing_group = self.pattern.groups > 0
        if has_capturing_group:
            LOG.fatal("The mutation pattern %s contains a capturing group. "
                      "This is not allowed.", pattern)
            raise UsageError("Bad mutation pattern in configuration.")


    def generate(self, names):
        """ Generator function for the name variants. 'names' is an iterable
            over a set of names for which the variants are to be generated.

            Names the pattern does not match are passed through unchanged.
        """
        for name in names:
            pieces = self.pattern.split(name)
            if len(pieces) > 1:
                # Fill the gaps between the pieces with every combination
                # of replacements.
                for fill in self._fillers(len(pieces)):
                    yield ''.join(_zigzag(pieces, fill))
            else:
                yield name


    def _fillers(self, num_parts):
        """ Returns a generator for strings to join the given number of string
            parts in all possible combinations.
        """
        num_gaps = num_parts - 1
        return itertools.product(self.replacements, repeat=num_gaps)
|
||||||
@@ -59,6 +59,13 @@ token-analysis:
|
|||||||
mode: variant-only
|
mode: variant-only
|
||||||
variants:
|
variants:
|
||||||
- !include icu-rules/variants-de.yaml
|
- !include icu-rules/variants-de.yaml
|
||||||
|
mutations:
|
||||||
|
- pattern: ä
|
||||||
|
replacements: ["ä", "ae"]
|
||||||
|
- pattern: ö
|
||||||
|
replacements: ["ö", "oe"]
|
||||||
|
- pattern: ü
|
||||||
|
replacements: ["ü", "ue"]
|
||||||
- id: el
|
- id: el
|
||||||
analyzer: generic
|
analyzer: generic
|
||||||
mode: variant-only
|
mode: variant-only
|
||||||
|
|||||||
@@ -58,3 +58,48 @@ Feature: Import and search of names
|
|||||||
| រាជធានីភ្នំពេញ |
|
| រាជធានីភ្នំពេញ |
|
||||||
| 東京都 |
|
| 東京都 |
|
||||||
| ပုဗ္ဗသီရိ |
|
| ပုဗ္ဗသီရိ |
|
||||||
|
|
||||||
|
|
||||||
|
Scenario: German umlauts can be found when expanded
|
||||||
|
Given the places
|
||||||
|
| osm | class | type | name+name:de |
|
||||||
|
| N1 | place | city | Münster |
|
||||||
|
| N2 | place | city | Köln |
|
||||||
|
| N3 | place | city | Gräfenroda |
|
||||||
|
When importing
|
||||||
|
When sending search query "münster"
|
||||||
|
Then results contain
|
||||||
|
| osm |
|
||||||
|
| N1 |
|
||||||
|
When sending search query "muenster"
|
||||||
|
Then results contain
|
||||||
|
| osm |
|
||||||
|
| N1 |
|
||||||
|
When sending search query "munster"
|
||||||
|
Then results contain
|
||||||
|
| osm |
|
||||||
|
| N1 |
|
||||||
|
When sending search query "Köln"
|
||||||
|
Then results contain
|
||||||
|
| osm |
|
||||||
|
| N2 |
|
||||||
|
When sending search query "Koeln"
|
||||||
|
Then results contain
|
||||||
|
| osm |
|
||||||
|
| N2 |
|
||||||
|
When sending search query "Koln"
|
||||||
|
Then results contain
|
||||||
|
| osm |
|
||||||
|
| N2 |
|
||||||
|
When sending search query "gräfenroda"
|
||||||
|
Then results contain
|
||||||
|
| osm |
|
||||||
|
| N3 |
|
||||||
|
When sending search query "graefenroda"
|
||||||
|
Then results contain
|
||||||
|
| osm |
|
||||||
|
| N3 |
|
||||||
|
When sending search query "grafenroda"
|
||||||
|
Then results contain
|
||||||
|
| osm |
|
||||||
|
| N3 |
|
||||||
|
|||||||
@@ -0,0 +1,89 @@
|
|||||||
|
# SPDX-License-Identifier: GPL-2.0-only
|
||||||
|
#
|
||||||
|
# This file is part of Nominatim. (https://nominatim.org)
|
||||||
|
#
|
||||||
|
# Copyright (C) 2022 by the Nominatim developer community.
|
||||||
|
# For a full list of authors see the git log.
|
||||||
|
"""
|
||||||
|
Tests for generic token analysis, mutation part.
|
||||||
|
"""
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from icu import Transliterator
|
||||||
|
|
||||||
|
import nominatim.tokenizer.token_analysis.generic as module
|
||||||
|
from nominatim.errors import UsageError
|
||||||
|
|
||||||
|
DEFAULT_NORMALIZATION = """ '🜳' > ' ';
|
||||||
|
[[:Nonspacing Mark:] [:Cf:]] >;
|
||||||
|
:: lower ();
|
||||||
|
[[:Punctuation:][:Space:]]+ > ' '
|
||||||
|
"""
|
||||||
|
|
||||||
|
DEFAULT_TRANSLITERATION = """ :: Latin ();
|
||||||
|
'🜵' > ' ';
|
||||||
|
"""
|
||||||
|
|
||||||
|
class TestMutationNoVariants:
    """ Checks for the mutation step of the generic token analysis when
        only mutations and no variant rules are configured.
    """

    def make_analyser(self, *mutations):
        """ Create `self.analysis` from the given (pattern, replacements)
            pairs.
        """
        mutation_rules = [{'pattern': pattern, 'replacements': replacements}
                          for pattern, replacements in mutations]
        rules = {'analyzer': 'generic', 'mutations': mutation_rules}
        config = module.configure(rules, DEFAULT_NORMALIZATION)
        trans = Transliterator.createFromRules("test_trans", DEFAULT_TRANSLITERATION)

        self.analysis = module.create(trans, config)


    def variants(self, name):
        """ Normalize `name` and return the set of variants the analyser
            computes for it.
        """
        norm = Transliterator.createFromRules("test_norm", DEFAULT_NORMALIZATION)
        norm_name = norm.transliterate(name).strip()
        return set(self.analysis.get_variants_ascii(norm_name))


    @pytest.mark.parametrize('pattern', ('(capture)', ['a list']))
    def test_bad_pattern(self, pattern):
        with pytest.raises(UsageError):
            self.make_analyser((pattern, ['b']))


    @pytest.mark.parametrize('replacements', (None, 'a string'))
    def test_bad_replacement(self, replacements):
        with pytest.raises(UsageError):
            self.make_analyser(('a', replacements))


    def test_simple_replacement(self):
        self.make_analyser(('a', ['b']))

        assert self.variants('none') == {'none'}
        assert self.variants('abba') == {'bbbb'}
        assert self.variants('2 aar') == {'2 bbr'}


    def test_multichar_replacement(self):
        self.make_analyser(('1 1', ['1 1 1']))

        assert self.variants('1 1456') == {'1 1 1456'}
        assert self.variants('1 1 1') == {'1 1 1 1'}


    def test_removement_replacement(self):
        self.make_analyser((' ', [' ', '']))

        assert self.variants('A 345') == {'a 345', 'a345'}
        assert self.variants('a g b') == {'a g b', 'ag b', 'a gb', 'agb'}


    def test_regex_pattern(self):
        self.make_analyser(('[^a-z]+', ['XXX', ' ']))

        expected = {'aXXXnXXX', 'aXXXn', 'a nXXX', 'a n'}
        assert self.variants('a-34n12') == expected


    def test_multiple_mutations(self):
        self.make_analyser(('ä', ['ä', 'ae']), ('ö', ['ö', 'oe']))

        expected = {'längenöhr', 'laengenöhr', 'längenoehr', 'laengenoehr'}
        assert self.variants('Längenöhr') == expected
|
||||||
Reference in New Issue
Block a user