Mirror of https://github.com/osm-search/Nominatim.git
make token analyzers configurable modules
Adds a mandatory section 'analyzer' to the token-analysis entries, which defines which analyzer to use. Currently there is exactly one, 'generic', which implements the former ICUNameProcessor.
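For illustration, the module lookup introduced by this commit can be summarised as follows. This is a minimal sketch, not the code from the diff below; the helper name _load_analysis_module is invented for the example, while the mapping itself (module path prefix and dash-to-underscore conversion) mirrors what TokenAnalyzerRule.__init__ does in the diff.

    import importlib

    def _load_analysis_module(rules):
        # 'analyzer: generic' resolves to nominatim.tokenizer.token_analysis.generic;
        # dashes in the analyzer name become underscores in the module name.
        module_name = 'nominatim.tokenizer.token_analysis.' \
                      + rules['analyzer'].replace('-', '_')
        return importlib.import_module(module_name)

    # Each analysis module is expected to expose a factory
    #     create(norm_rules, trans_rules, config)
    # that returns the actual token analysis instance.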
@@ -1,6 +1,7 @@
 """
 Helper class to create ICU rules from a configuration file.
 """
+import importlib
 import io
 import json
 import logging
@@ -12,7 +13,6 @@ from icu import Transliterator
 from nominatim.config import flatten_config_list
 from nominatim.db.properties import set_property, get_property
 from nominatim.errors import UsageError
-from nominatim.tokenizer.icu_name_processor import ICUNameProcessor
 from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
 import nominatim.tokenizer.icu_variants as variants

@@ -23,6 +23,17 @@ DBCFG_IMPORT_TRANS_RULES = "tokenizer_import_transliteration"
 DBCFG_IMPORT_ANALYSIS_RULES = "tokenizer_import_analysis_rules"


+def _get_section(rules, section):
+    """ Get the section named 'section' from the rules. If the section does
+        not exist, raise a usage error with a meaningful message.
+    """
+    if section not in rules:
+        LOG.fatal("Section '%s' not found in tokenizer config.", section)
+        raise UsageError("Syntax error in tokenizer configuration file.")
+
+    return rules[section]
+
+
 class VariantRule:
     """ Saves a single variant expansion.

@@ -45,7 +56,7 @@ class ICURuleLoader:

         self.normalization_rules = self._cfg_to_icu_rules(rules, 'normalization')
         self.transliteration_rules = self._cfg_to_icu_rules(rules, 'transliteration')
-        self.analysis_rules = self._get_section(rules, 'token-analysis')
+        self.analysis_rules = _get_section(rules, 'token-analysis')
         self._setup_analysis()

         # Load optional sanitizer rule set.
@@ -130,25 +141,14 @@ class ICURuleLoader:


     @staticmethod
-    def _get_section(rules, section):
-        """ Get the section named 'section' from the rules. If the section does
-            not exist, raise a usage error with a meaningful message.
-        """
-        if section not in rules:
-            LOG.fatal("Section '%s' not found in tokenizer config.", section)
-            raise UsageError("Syntax error in tokenizer configuration file.")
-
-        return rules[section]
-
-
-    def _cfg_to_icu_rules(self, rules, section):
+    def _cfg_to_icu_rules(rules, section):
         """ Load an ICU ruleset from the given section. If the section is a
             simple string, it is interpreted as a file name and the rules are
             loaded verbatim from the given file. The filename is expected to be
             relative to the tokenizer rule file. If the section is a list then
             each line is assumed to be a rule. All rules are concatenated and returned.
         """
-        content = self._get_section(rules, section)
+        content = _get_section(rules, section)

         if content is None:
             return ''
@@ -162,19 +162,27 @@ class TokenAnalyzerRule:
     """

     def __init__(self, rules, normalization_rules):
+        # Find the analysis module
+        module_name = 'nominatim.tokenizer.token_analysis.' \
+                      + _get_section(rules, 'analyzer').replace('-', '_')
+        analysis_mod = importlib.import_module(module_name)
+        self._mod_create = analysis_mod.create
+
+        # Load the configuration.
+        self.config = {}
         self._parse_variant_list(rules.get('variants'), normalization_rules)


     def create(self, normalization_rules, transliteration_rules):
         """ Create an analyzer from the given rules.
         """
-        return ICUNameProcessor(normalization_rules,
+        return self._mod_create(normalization_rules,
                                 transliteration_rules,
-                                self.variants)
+                                self.config)


     def _parse_variant_list(self, rules, normalization_rules):
-        self.variants = set()
+        vset = set()

         if not rules:
             return
@@ -196,7 +204,9 @@ class TokenAnalyzerRule:
             properties.append(props)

             for rule in (section.get('words') or []):
-                self.variants.update(vmaker.compute(rule, props))
+                vset.update(vmaker.compute(rule, props))
+
+        self.config['variants'] = vset


 class _VariantMaker:
nominatim/tokenizer/token_analysis/__init__.py (new file, 0 lines changed)
@@ -1,6 +1,5 @@
 """
-Processor for names that are imported into the database based on the
-ICU library.
+Generic processor for names that creates abbreviation variants.
 """
 from collections import defaultdict
 import itertools
@@ -8,8 +7,15 @@ import itertools
 from icu import Transliterator
 import datrie

+### Analysis section

-class ICUNameProcessor:
+def create(norm_rules, trans_rules, config):
+    """ Create a new token analysis instance for this module.
+    """
+    return GenericTokenAnalysis(norm_rules, trans_rules, config['variants'])
+
+
+class GenericTokenAnalysis:
     """ Collects the different transformation rules for normalisation of names
         and provides the functions to apply the transformations.
     """
@@ -28,7 +28,8 @@ sanitizers:
     - step: split-name-list
     - step: strip-brace-terms
 token-analysis:
-    - variants:
+    - analyzer: generic
+      variants:
         - !include icu-rules/variants-bg.yaml
         - !include icu-rules/variants-ca.yaml
         - !include icu-rules/variants-cs.yaml
@@ -72,7 +72,8 @@ def analyzer(tokenizer_factory, test_config, monkeypatch,
        cfgstr = {'normalization': list(norm),
                  'sanitizers': sanitizers,
                  'transliteration': list(trans),
-                 'token-analysis': [{'variants': [{'words': list(variants)}]}]}
+                 'token-analysis': [{'analyzer': 'generic',
+                                     'variants': [{'words': list(variants)}]}]}
        (test_config.project_dir / 'icu_tokenizer.yaml').write_text(yaml.dump(cfgstr))
        tok.loader = ICURuleLoader(test_config)

@@ -34,7 +34,7 @@ def cfgrules(test_config):
            - ":: Latin ()"
            - "[[:Punctuation:][:Space:]]+ > ' '"
        """)
-        content += "token-analysis:\n - variants:\n - words:\n"
+        content += "token-analysis:\n - analyzer: generic\n   variants:\n - words:\n"
        content += '\n'.join((" - " + s for s in variants)) + '\n'
        for k, v in kwargs:
            content += " {}: {}\n".format(k, v)
@@ -50,7 +50,8 @@ def test_empty_rule_set(test_config):
        normalization:
        transliteration:
        token-analysis:
-            - variants:
+            - analyzer: generic
+              variants:
        """))

    rules = ICURuleLoader(test_config)
@@ -108,7 +109,8 @@ def test_transliteration_rules_from_file(test_config):
        - "'ax' > 'b'"
        - !include transliteration.yaml
        token-analysis:
-            - variants:
+            - analyzer: generic
+              variants:
        """))
    transpath = test_config.project_dir / ('transliteration.yaml')
    transpath.write_text('- "x > y"')
@@ -128,7 +130,7 @@ class TestGetReplacements:

    def get_replacements(self, *variants):
        loader = ICURuleLoader(self.cfgrules(*variants))
-        rules = loader.analysis[None].variants
+        rules = loader.analysis[None].config['variants']

        return set((v.source, v.replacement) for v in rules)

@@ -28,7 +28,7 @@ def cfgfile(def_config, tmp_path):
            - ":: Latin ()"
            - "'🜵' > ' '"
        """)
-        content += "token-analysis:\n - variants:\n - words:\n"
+        content += "token-analysis:\n - analyzer: generic\n   variants:\n - words:\n"
        content += '\n'.join((" - " + s for s in variants)) + '\n'
        for k, v in kwargs:
            content += " {}: {}\n".format(k, v)
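As a closing illustration of the plug-in scheme this commit enables, a further analysis module might look roughly like the sketch below. The file name my_analyzer.py, the analyzer name 'my-analyzer', and the class MyTokenAnalysis are invented for this example and are not part of the commit; only the create() factory signature is taken from the diff above.

    # Hypothetical nominatim/tokenizer/token_analysis/my_analyzer.py,
    # which would be selected with 'analyzer: my-analyzer' in the
    # token-analysis section of the tokenizer configuration.

    def create(norm_rules, trans_rules, config):
        """ Factory hook that every token-analysis module must provide.
        """
        return MyTokenAnalysis(norm_rules, trans_rules, config)


    class MyTokenAnalysis:
        """ Invented example analyser; a real module would compile the
            normalisation and transliteration rules here and compute the
            name variants from its configuration.
        """
        def __init__(self, norm_rules, trans_rules, config):
            self.config = config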