apply variants by languages

Adds a tagger for names by language so that the analyzer of that
language is used. Thus variants are now only applied to names
in the specific language and only tag name tags, no longer to
reference-like tags.
This commit is contained in:
Sarah Hoffmann
2021-10-05 17:18:10 +02:00
parent d35400a7d7
commit 97a10ec218
8 changed files with 307 additions and 46 deletions

View File

@@ -11,6 +11,7 @@ from nominatim.db.properties import set_property, get_property
from nominatim.errors import UsageError from nominatim.errors import UsageError
from nominatim.tokenizer.place_sanitizer import PlaceSanitizer from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
from nominatim.tokenizer.icu_token_analysis import ICUTokenAnalysis from nominatim.tokenizer.icu_token_analysis import ICUTokenAnalysis
import nominatim.tools.country_info
LOG = logging.getLogger() LOG = logging.getLogger()
@@ -38,6 +39,9 @@ class ICURuleLoader:
rules = config.load_sub_configuration('icu_tokenizer.yaml', rules = config.load_sub_configuration('icu_tokenizer.yaml',
config='TOKENIZER_CONFIG') config='TOKENIZER_CONFIG')
# Make sure country information is available to analyzers and sanitizers.
nominatim.tools.country_info.setup_country_config(config)
self.normalization_rules = self._cfg_to_icu_rules(rules, 'normalization') self.normalization_rules = self._cfg_to_icu_rules(rules, 'normalization')
self.transliteration_rules = self._cfg_to_icu_rules(rules, 'transliteration') self.transliteration_rules = self._cfg_to_icu_rules(rules, 'transliteration')
self.analysis_rules = _get_section(rules, 'token-analysis') self.analysis_rules = _get_section(rules, 'token-analysis')

View File

@@ -0,0 +1,100 @@
"""
Name processor for tagging the language of the name
"""
import re
from nominatim.tools import country_info
class _AnalyzerByLanguage:
""" Processor for tagging the language of names in a place.
"""
def __init__(self, config):
if 'filter-kind' in config:
self.regexes = [re.compile(regex) for regex in config['filter-kind']]
else:
self.regexes = None
self.use_defaults = config.get('use-defaults', 'no')
if self.use_defaults not in ('mono', 'all'):
self.use_defaults = False
self.replace = config.get('mode', 'replace') != 'append'
self.whitelist = config.get('whitelist')
# Compute the languages to use when no suffix is given.
self.deflangs = {}
for ccode, prop in country_info.iterate():
clangs = prop['languages']
if len(clangs) == 1 or self.use_defaults == 'all':
if self.whitelist:
self.deflangs[ccode] = [l for l in clangs if l in self.whitelist]
else:
self.deflangs[ccode] = clangs
def _kind_matches(self, kind):
if self.regexes is None:
return True
return any(regex.search(kind) for regex in self.regexes)
def _suffix_matches(self, suffix):
if self.whitelist is None:
return len(suffix) in (2, 3) and suffix.islower()
return suffix in self.whitelist
def __call__(self, obj):
if not obj.names:
return
more_names = []
for name in (n for n in obj.names
if not n.has_attr('analyzer') and self._kind_matches(n.kind)):
if name.suffix:
langs = [name.suffix] if self._suffix_matches(name.suffix) else None
else:
if self.use_defaults:
langs = self.deflangs.get(obj.place.country_code)
if self.use_defaults == 'mono' and len(langs) > 1:
langs = None
if langs:
if self.replace:
name.set_attr('analyzer', langs[0])
else:
more_names.append(name.clone(attr={'analyzer': langs[0]}))
more_names.extend(name.clone(attr={'analyzer': l}) for l in langs[1:])
obj.names.extend(more_names)
def create(config):
    """ Create a function that sets the analyzer property depending on the
        language of the tag. The language is taken from the suffix.

        To restrict the set of languages that should be tagged, use
        'whitelist': a list of acceptable suffixes. When unset, all 2- and
        3-letter codes are accepted.

        'use-defaults' configures what happens when the name has no suffix
        with a language tag. When set to 'all', a variant is created for
        each of the spoken languages in the country the feature is in. When
        set to 'mono', a variant is created only when a single language is
        spoken in the country. The default is to do nothing with the default
        languages of a country.

        'mode' may be 'replace' (the default) or 'append' and configures if
        the original name (without any analyzer tagged) is retained.

        With 'filter-kind' the set of names the sanitizer should be applied
        to can be restricted to the given patterns of 'kind'. It expects a
        list of regular expressions to be matched against 'kind'.
    """
    return _AnalyzerByLanguage(config)

View File

@@ -18,7 +18,19 @@ ICUVariant = namedtuple('ICUVariant', ['source', 'replacement'])
def configure(rules, normalization_rules): def configure(rules, normalization_rules):
""" Extract and preprocess the configuration for this module. """ Extract and preprocess the configuration for this module.
""" """
rules = rules.get('variants') config = {}
config['replacements'], config['chars'] = _get_variant_config(rules.get('variants'),
normalization_rules)
config['variant_only'] = rules.get('mode', '') == 'variant-only'
return config
def _get_variant_config(rules, normalization_rules):
""" Convert the variant definition from the configuration into
replacement sets.
"""
immediate = defaultdict(list) immediate = defaultdict(list)
chars = set() chars = set()
@@ -41,8 +53,7 @@ def configure(rules, normalization_rules):
immediate[variant.source].append(replstr) immediate[variant.source].append(replstr)
chars.update(variant.source) chars.update(variant.source)
return {'replacements': list(immediate.items()), return list(immediate.items()), ''.join(chars)
'chars': ''.join(chars)}
class _VariantMaker: class _VariantMaker:
@@ -144,11 +155,15 @@ class GenericTokenAnalysis:
def __init__(self, to_ascii, config): def __init__(self, to_ascii, config):
self.to_ascii = to_ascii self.to_ascii = to_ascii
self.variant_only = config['variant_only']
# Set up datrie # Set up datrie
self.replacements = datrie.Trie(config['chars']) if config['replacements']:
for src, repllist in config['replacements']: self.replacements = datrie.Trie(config['chars'])
self.replacements[src] = repllist for src, repllist in config['replacements']:
self.replacements[src] = repllist
else:
self.replacements = None
def get_variants_ascii(self, norm_name): def get_variants_ascii(self, norm_name):
@@ -159,45 +174,51 @@ class GenericTokenAnalysis:
partials = [''] partials = ['']
startpos = 0 startpos = 0
pos = 0 if self.replacements is not None:
force_space = False pos = 0
while pos < len(baseform): force_space = False
full, repl = self.replacements.longest_prefix_item(baseform[pos:], while pos < len(baseform):
(None, None)) full, repl = self.replacements.longest_prefix_item(baseform[pos:],
if full is not None: (None, None))
done = baseform[startpos:pos] if full is not None:
partials = [v + done + r done = baseform[startpos:pos]
for v, r in itertools.product(partials, repl) partials = [v + done + r
if not force_space or r.startswith(' ')] for v, r in itertools.product(partials, repl)
if len(partials) > 128: if not force_space or r.startswith(' ')]
# If too many variants are produced, they are unlikely if len(partials) > 128:
# to be helpful. Only use the original term. # If too many variants are produced, they are unlikely
startpos = 0 # to be helpful. Only use the original term.
break startpos = 0
startpos = pos + len(full) break
if full[-1] == ' ': startpos = pos + len(full)
startpos -= 1 if full[-1] == ' ':
force_space = True startpos -= 1
pos = startpos force_space = True
else: pos = startpos
pos += 1 else:
force_space = False pos += 1
force_space = False
# No variants detected? Fast return. # No variants detected? Fast return.
if startpos == 0: if startpos == 0:
if self.variant_only:
return []
trans_name = self.to_ascii.transliterate(norm_name).strip() trans_name = self.to_ascii.transliterate(norm_name).strip()
return [trans_name] if trans_name else [] return [trans_name] if trans_name else []
return self._compute_result_set(partials, baseform[startpos:]) return self._compute_result_set(partials, baseform[startpos:],
norm_name if self.variant_only else '')
def _compute_result_set(self, partials, prefix): def _compute_result_set(self, partials, prefix, exclude):
results = set() results = set()
for variant in partials: for variant in partials:
vname = variant + prefix vname = (variant + prefix)[1:-1].strip()
trans_name = self.to_ascii.transliterate(vname[1:-1]).strip() if vname != exclude:
if trans_name: trans_name = self.to_ascii.transliterate(vname).strip()
results.add(trans_name) if trans_name:
results.add(trans_name)
return list(results) return list(results)

View File

@@ -13,12 +13,21 @@ class _CountryInfo:
def __init__(self): def __init__(self):
self._info = {} self._info = {}
def load(self, config): def load(self, config):
""" Load the country properties from the configuration files, """ Load the country properties from the configuration files,
if they are not loaded yet. if they are not loaded yet.
""" """
if not self._info: if not self._info:
self._info = config.load_sub_configuration('country_settings.yaml') self._info = config.load_sub_configuration('country_settings.yaml')
# Convert languages into a list for simpler handling.
for prop in self._info.values():
if 'languages' not in prop:
prop['languages'] = []
elif not isinstance(prop['languages'], list):
prop['languages'] = [x.strip()
for x in prop['languages'].split(',')]
def items(self): def items(self):
""" Return tuples of (country_code, property dict) as iterable. """ Return tuples of (country_code, property dict) as iterable.
@@ -36,6 +45,12 @@ def setup_country_config(config):
_COUNTRY_INFO.load(config) _COUNTRY_INFO.load(config)
def iterate():
""" Iterate over country code and properties.
"""
return _COUNTRY_INFO.items()
def setup_country_tables(dsn, sql_dir, ignore_partitions=False): def setup_country_tables(dsn, sql_dir, ignore_partitions=False):
""" Create and populate the tables with basic static data that provides """ Create and populate the tables with basic static data that provides
the background for geocoding. Data is assumed to not yet exist. the background for geocoding. Data is assumed to not yet exist.
@@ -50,10 +65,7 @@ def setup_country_tables(dsn, sql_dir, ignore_partitions=False):
partition = 0 partition = 0
else: else:
partition = props.get('partition') partition = props.get('partition')
if ',' in (props.get('languages', ',') or ','): lang = props['languages'][0] if len(props['languages']) == 1 else None
lang = None
else:
lang = props['languages']
params.append((ccode, partition, lang)) params.append((ccode, partition, lang))
with connect(dsn) as conn: with connect(dsn) as conn:

View File

@@ -171,7 +171,7 @@ bt:
# (Bouvet Island) # (Bouvet Island)
bv: bv:
partition: 185 partition: 185
languages: no languages: "no"
# Botswana (Botswana) # Botswana (Botswana)
bw: bw:
@@ -1006,7 +1006,7 @@ si:
# (Svalbard and Jan Mayen) # (Svalbard and Jan Mayen)
sj: sj:
partition: 197 partition: 197
languages: no languages: "no"
# Slovakia (Slovensko) # Slovakia (Slovensko)
sk: sk:

View File

@@ -27,36 +27,160 @@ transliteration:
sanitizers: sanitizers:
- step: split-name-list - step: split-name-list
- step: strip-brace-terms - step: strip-brace-terms
- step: tag-analyzer-by-language
filter-kind: [".*name.*"]
whitelist: [bg,ca,cs,da,de,el,en,es,et,eu,fi,fr,gl,hu,it,ja,mg,ms,nl,no,pl,pt,ro,ru,sk,sl,sv,tr,uk,vi]
use-defaults: all
mode: append
token-analysis: token-analysis:
- analyzer: generic - analyzer: generic
- id: bg
analyzer: generic
mode: variant-only
variants: variants:
- !include icu-rules/variants-bg.yaml - !include icu-rules/variants-bg.yaml
- id: ca
analyzer: generic
mode: variant-only
variants:
- !include icu-rules/variants-ca.yaml - !include icu-rules/variants-ca.yaml
- id: cs
analyzer: generic
mode: variant-only
variants:
- !include icu-rules/variants-cs.yaml - !include icu-rules/variants-cs.yaml
- id: da
analyzer: generic
mode: variant-only
variants:
- !include icu-rules/variants-da.yaml - !include icu-rules/variants-da.yaml
- id: de
analyzer: generic
mode: variant-only
variants:
- !include icu-rules/variants-de.yaml - !include icu-rules/variants-de.yaml
- id: el
analyzer: generic
mode: variant-only
variants:
- !include icu-rules/variants-el.yaml - !include icu-rules/variants-el.yaml
- id: en
analyzer: generic
mode: variant-only
variants:
- !include icu-rules/variants-en.yaml - !include icu-rules/variants-en.yaml
- id: es
analyzer: generic
mode: variant-only
variants:
- !include icu-rules/variants-es.yaml - !include icu-rules/variants-es.yaml
- id: et
analyzer: generic
mode: variant-only
variants:
- !include icu-rules/variants-et.yaml - !include icu-rules/variants-et.yaml
- id: eu
analyzer: generic
mode: variant-only
variants:
- !include icu-rules/variants-eu.yaml - !include icu-rules/variants-eu.yaml
- id: fi
analyzer: generic
mode: variant-only
variants:
- !include icu-rules/variants-fi.yaml - !include icu-rules/variants-fi.yaml
- id: fr
analyzer: generic
mode: variant-only
variants:
- !include icu-rules/variants-fr.yaml - !include icu-rules/variants-fr.yaml
- id: gl
analyzer: generic
mode: variant-only
variants:
- !include icu-rules/variants-gl.yaml - !include icu-rules/variants-gl.yaml
- id: hu
analyzer: generic
mode: variant-only
variants:
- !include icu-rules/variants-hu.yaml - !include icu-rules/variants-hu.yaml
- id: it
analyzer: generic
mode: variant-only
variants:
- !include icu-rules/variants-it.yaml - !include icu-rules/variants-it.yaml
- id: ja
analyzer: generic
mode: variant-only
variants:
- !include icu-rules/variants-ja.yaml - !include icu-rules/variants-ja.yaml
- id: mg
analyzer: generic
mode: variant-only
variants:
- !include icu-rules/variants-mg.yaml - !include icu-rules/variants-mg.yaml
- id: ms
analyzer: generic
mode: variant-only
variants:
- !include icu-rules/variants-ms.yaml - !include icu-rules/variants-ms.yaml
- id: nl
analyzer: generic
mode: variant-only
variants:
- !include icu-rules/variants-nl.yaml - !include icu-rules/variants-nl.yaml
- id: no
analyzer: generic
mode: variant-only
variants:
- !include icu-rules/variants-no.yaml - !include icu-rules/variants-no.yaml
- id: pl
analyzer: generic
mode: variant-only
variants:
- !include icu-rules/variants-pl.yaml - !include icu-rules/variants-pl.yaml
- id: pt
analyzer: generic
mode: variant-only
variants:
- !include icu-rules/variants-pt.yaml - !include icu-rules/variants-pt.yaml
- id: ro
analyzer: generic
mode: variant-only
variants:
- !include icu-rules/variants-ro.yaml - !include icu-rules/variants-ro.yaml
- id: ru
analyzer: generic
mode: variant-only
variants:
- !include icu-rules/variants-ru.yaml - !include icu-rules/variants-ru.yaml
- id: sk
analyzer: generic
mode: variant-only
variants:
- !include icu-rules/variants-sk.yaml - !include icu-rules/variants-sk.yaml
- id: sl
analyzer: generic
mode: variant-only
variants:
- !include icu-rules/variants-sl.yaml - !include icu-rules/variants-sl.yaml
- id: sv
analyzer: generic
mode: variant-only
variants:
- !include icu-rules/variants-sv.yaml - !include icu-rules/variants-sv.yaml
- id: tr
analyzer: generic
mode: variant-only
variants:
- !include icu-rules/variants-tr.yaml - !include icu-rules/variants-tr.yaml
- id: uk
analyzer: generic
mode: variant-only
variants:
- !include icu-rules/variants-uk.yaml - !include icu-rules/variants-uk.yaml
- id: vi
analyzer: generic
mode: variant-only
variants:
- !include icu-rules/variants-vi.yaml - !include icu-rules/variants-vi.yaml

View File

@@ -52,7 +52,7 @@ Feature: Import and search of names
Scenario: Special characters in name Scenario: Special characters in name
Given the places Given the places
| osm | class | type | name | | osm | class | type | name+name:de |
| N1 | place | locality | Jim-Knopf-Straße | | N1 | place | locality | Jim-Knopf-Straße |
| N2 | place | locality | Smith/Weston | | N2 | place | locality | Smith/Weston |
| N3 | place | locality | space mountain | | N3 | place | locality | space mountain |

View File

@@ -40,7 +40,7 @@ def cfgfile(def_config, tmp_path):
def get_normalized_variants(proc, name): def get_normalized_variants(proc, name):
return proc.get_variants_ascii(proc.get_normalized(name)) return proc.analysis[None].get_variants_ascii(proc.normalizer.transliterate(name).strip())
def test_variants_empty(cfgfile): def test_variants_empty(cfgfile):
@@ -99,6 +99,6 @@ def test_search_normalized(cfgfile):
config = cfgfile('~street => s,st', 'master => mstr') config = cfgfile('~street => s,st', 'master => mstr')
proc = ICURuleLoader(config).make_token_analysis() proc = ICURuleLoader(config).make_token_analysis()
assert proc.get_search_normalized('Master Street') == 'master street' assert proc.search.transliterate('Master Street').strip() == 'master street'
assert proc.get_search_normalized('Earnes St') == 'earnes st' assert proc.search.transliterate('Earnes St').strip() == 'earnes st'
assert proc.get_search_normalized('Nostreet') == 'nostreet' assert proc.search.transliterate('Nostreet').strip() == 'nostreet'