make token analyzers configurable modules

Adds a mandatory section 'analyzer' to the token-analysis entries
which defines which analyzer to use. Currently there is exactly
one, 'generic', which implements the former ICUNameProcessor.
Sarah Hoffmann
2021-10-04 17:34:30 +02:00
parent 52847b61a3
commit 7cfcbacfc7
7 changed files with 49 additions and 29 deletions

View File

@@ -1,6 +1,7 @@
 """
 Helper class to create ICU rules from a configuration file.
 """
+import importlib
 import io
 import json
 import logging
@@ -12,7 +13,6 @@ from icu import Transliterator
 from nominatim.config import flatten_config_list
 from nominatim.db.properties import set_property, get_property
 from nominatim.errors import UsageError
-from nominatim.tokenizer.icu_name_processor import ICUNameProcessor
 from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
 import nominatim.tokenizer.icu_variants as variants
@@ -23,6 +23,17 @@ DBCFG_IMPORT_TRANS_RULES = "tokenizer_import_transliteration"
 DBCFG_IMPORT_ANALYSIS_RULES = "tokenizer_import_analysis_rules"
 
+def _get_section(rules, section):
+    """ Get the section named 'section' from the rules. If the section does
+        not exist, raise a usage error with a meaningful message.
+    """
+    if section not in rules:
+        LOG.fatal("Section '%s' not found in tokenizer config.", section)
+        raise UsageError("Syntax error in tokenizer configuration file.")
+
+    return rules[section]
+
+
 class VariantRule:
     """ Saves a single variant expansion.
@@ -45,7 +56,7 @@ class ICURuleLoader:
         self.normalization_rules = self._cfg_to_icu_rules(rules, 'normalization')
         self.transliteration_rules = self._cfg_to_icu_rules(rules, 'transliteration')
-        self.analysis_rules = self._get_section(rules, 'token-analysis')
+        self.analysis_rules = _get_section(rules, 'token-analysis')
         self._setup_analysis()
 
         # Load optional sanitizer rule set.
@@ -130,25 +141,14 @@ class ICURuleLoader:
     @staticmethod
-    def _get_section(rules, section):
-        """ Get the section named 'section' from the rules. If the section does
-            not exist, raise a usage error with a meaningful message.
-        """
-        if section not in rules:
-            LOG.fatal("Section '%s' not found in tokenizer config.", section)
-            raise UsageError("Syntax error in tokenizer configuration file.")
-
-        return rules[section]
-
-
-    def _cfg_to_icu_rules(self, rules, section):
+    def _cfg_to_icu_rules(rules, section):
         """ Load an ICU ruleset from the given section. If the section is a
             simple string, it is interpreted as a file name and the rules are
             loaded verbatim from the given file. The filename is expected to be
             relative to the tokenizer rule file. If the section is a list then
            each line is assumed to be a rule. All rules are concatenated and returned.
         """
-        content = self._get_section(rules, section)
+        content = _get_section(rules, section)
 
         if content is None:
             return ''
@@ -162,19 +162,27 @@ class TokenAnalyzerRule:
     """
 
     def __init__(self, rules, normalization_rules):
+        # Find the analysis module
+        module_name = 'nominatim.tokenizer.token_analysis.' \
+                      + _get_section(rules, 'analyzer').replace('-', '_')
+        analysis_mod = importlib.import_module(module_name)
+        self._mod_create = analysis_mod.create
+
+        # Load the configuration.
+        self.config = {}
         self._parse_variant_list(rules.get('variants'), normalization_rules)
 
 
     def create(self, normalization_rules, transliteration_rules):
         """ Create an analyzer from the given rules.
         """
-        return ICUNameProcessor(normalization_rules,
-                                transliteration_rules,
-                                self.variants)
+        return self._mod_create(normalization_rules,
+                                transliteration_rules,
+                                self.config)
 
 
     def _parse_variant_list(self, rules, normalization_rules):
-        self.variants = set()
+        vset = set()
 
         if not rules:
             return
@@ -196,7 +204,9 @@ class TokenAnalyzerRule:
             properties.append(props)
 
             for rule in (section.get('words') or []):
-                self.variants.update(vmaker.compute(rule, props))
+                vset.update(vmaker.compute(rule, props))
+
+        self.config['variants'] = vset
 
 
 class _VariantMaker:
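
For context, the module lookup that TokenAnalyzerRule.__init__ now performs boils down to this standalone sketch (assuming the nominatim package from this commit is importable; the rules dict is a made-up example):

    import importlib

    rules = {'analyzer': 'generic', 'variants': []}   # hypothetical config entry

    # 'analyzer: generic' maps to nominatim.tokenizer.token_analysis.generic;
    # a dash in the analyzer name would become an underscore in the module name.
    module_name = 'nominatim.tokenizer.token_analysis.' \
                  + rules['analyzer'].replace('-', '_')
    analysis_mod = importlib.import_module(module_name)

    # Each analysis module must expose a module-level create() factory.
    analyzer_factory = analysis_mod.create
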

View File

@@ -1,6 +1,5 @@
 """
-Processor for names that are imported into the database based on the
-ICU library.
+Generic processor for names that creates abbreviation variants.
 """
 from collections import defaultdict
 import itertools
@@ -8,8 +7,15 @@ import itertools
 from icu import Transliterator
 import datrie
 
-class ICUNameProcessor:
+### Analysis section
+
+def create(norm_rules, trans_rules, config):
+    """ Create a new token analysis instance for this module.
+    """
+    return GenericTokenAnalysis(norm_rules, trans_rules, config['variants'])
+
+
+class GenericTokenAnalysis:
     """ Collects the different transformation rules for normalisation of names
         and provides the functions to apply the transformations.
     """

View File

@@ -28,7 +28,8 @@ sanitizers:
     - step: split-name-list
     - step: strip-brace-terms
 token-analysis:
-    - variants:
+    - analyzer: generic
+      variants:
           - !include icu-rules/variants-bg.yaml
           - !include icu-rules/variants-ca.yaml
           - !include icu-rules/variants-cs.yaml
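
Note that 'analyzer' is now a mandatory key for every token-analysis entry; omitting it makes the loader abort with a usage error. A small sketch of the parsed structure, assuming PyYAML:

    import yaml

    cfg = yaml.safe_load("""
    token-analysis:
        - analyzer: generic
          variants:
    """)

    entry = cfg['token-analysis'][0]
    print(entry['analyzer'])    # -> 'generic'
    # An entry without 'analyzer' would trigger UsageError in the rule loader.
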

View File

@@ -72,7 +72,8 @@ def analyzer(tokenizer_factory, test_config, monkeypatch,
     cfgstr = {'normalization': list(norm),
               'sanitizers': sanitizers,
               'transliteration': list(trans),
-              'token-analysis': [{'variants': [{'words': list(variants)}]}]}
+              'token-analysis': [{'analyzer': 'generic',
+                                  'variants': [{'words': list(variants)}]}]}
     (test_config.project_dir / 'icu_tokenizer.yaml').write_text(yaml.dump(cfgstr))
     tok.loader = ICURuleLoader(test_config)

View File

@@ -34,7 +34,7 @@ def cfgrules(test_config):
         - ":: Latin ()"
         - "[[:Punctuation:][:Space:]]+ > ' '"
     """)
-    content += "token-analysis:\n  - variants:\n     - words:\n"
+    content += "token-analysis:\n  - analyzer: generic\n    variants:\n     - words:\n"
     content += '\n'.join(("         - " + s for s in variants)) + '\n'
     for k, v in kwargs:
         content += "        {}: {}\n".format(k, v)
@@ -50,7 +50,8 @@ def test_empty_rule_set(test_config):
         normalization:
         transliteration:
         token-analysis:
-            - variants:
+            - analyzer: generic
+              variants:
         """))
 
     rules = ICURuleLoader(test_config)
@@ -108,7 +109,8 @@ def test_transliteration_rules_from_file(test_config):
             - "'ax' > 'b'"
             - !include transliteration.yaml
         token-analysis:
-            - variants:
+            - analyzer: generic
+              variants:
         """))
 
     transpath = test_config.project_dir / ('transliteration.yaml')
     transpath.write_text('- "x > y"')
@@ -128,7 +130,7 @@ class TestGetReplacements:
     def get_replacements(self, *variants):
         loader = ICURuleLoader(self.cfgrules(*variants))
-        rules = loader.analysis[None].variants
+        rules = loader.analysis[None].config['variants']
 
         return set((v.source, v.replacement) for v in rules)

View File

@@ -28,7 +28,7 @@ def cfgfile(def_config, tmp_path):
         - ":: Latin ()"
         - "'🜵' > ' '"
     """)
-    content += "token-analysis:\n  - variants:\n     - words:\n"
+    content += "token-analysis:\n  - analyzer: generic\n    variants:\n     - words:\n"
     content += '\n'.join(("         - " + s for s in variants)) + '\n'
     for k, v in kwargs:
         content += "        {}: {}\n".format(k, v)