mirror of
https://github.com/osm-search/Nominatim.git
synced 2026-02-16 15:47:58 +00:00
make token analyzers configurable modules
Adds a mandatory section 'analyzer' to each token-analysis entry, which defines which analyzer to use. Currently there is exactly one analyzer, 'generic', which implements the former ICUNameProcessor.
This commit is contained in:
@@ -1,6 +1,7 @@
|
|||||||
"""
|
"""
|
||||||
Helper class to create ICU rules from a configuration file.
|
Helper class to create ICU rules from a configuration file.
|
||||||
"""
|
"""
|
||||||
|
import importlib
|
||||||
import io
|
import io
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
@@ -12,7 +13,6 @@ from icu import Transliterator
|
|||||||
from nominatim.config import flatten_config_list
|
from nominatim.config import flatten_config_list
|
||||||
from nominatim.db.properties import set_property, get_property
|
from nominatim.db.properties import set_property, get_property
|
||||||
from nominatim.errors import UsageError
|
from nominatim.errors import UsageError
|
||||||
from nominatim.tokenizer.icu_name_processor import ICUNameProcessor
|
|
||||||
from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
|
from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
|
||||||
import nominatim.tokenizer.icu_variants as variants
|
import nominatim.tokenizer.icu_variants as variants
|
||||||
|
|
||||||
@@ -23,6 +23,17 @@ DBCFG_IMPORT_TRANS_RULES = "tokenizer_import_transliteration"
|
|||||||
DBCFG_IMPORT_ANALYSIS_RULES = "tokenizer_import_analysis_rules"
|
DBCFG_IMPORT_ANALYSIS_RULES = "tokenizer_import_analysis_rules"
|
||||||
|
|
||||||
|
|
||||||
|
def _get_section(rules, section):
|
||||||
|
""" Get the section named 'section' from the rules. If the section does
|
||||||
|
not exist, raise a usage error with a meaningful message.
|
||||||
|
"""
|
||||||
|
if section not in rules:
|
||||||
|
LOG.fatal("Section '%s' not found in tokenizer config.", section)
|
||||||
|
raise UsageError("Syntax error in tokenizer configuration file.")
|
||||||
|
|
||||||
|
return rules[section]
|
||||||
|
|
||||||
|
|
||||||
class VariantRule:
|
class VariantRule:
|
||||||
""" Saves a single variant expansion.
|
""" Saves a single variant expansion.
|
||||||
|
|
||||||
@@ -45,7 +56,7 @@ class ICURuleLoader:
|
|||||||
|
|
||||||
self.normalization_rules = self._cfg_to_icu_rules(rules, 'normalization')
|
self.normalization_rules = self._cfg_to_icu_rules(rules, 'normalization')
|
||||||
self.transliteration_rules = self._cfg_to_icu_rules(rules, 'transliteration')
|
self.transliteration_rules = self._cfg_to_icu_rules(rules, 'transliteration')
|
||||||
self.analysis_rules = self._get_section(rules, 'token-analysis')
|
self.analysis_rules = _get_section(rules, 'token-analysis')
|
||||||
self._setup_analysis()
|
self._setup_analysis()
|
||||||
|
|
||||||
# Load optional sanitizer rule set.
|
# Load optional sanitizer rule set.
|
||||||
@@ -130,25 +141,14 @@ class ICURuleLoader:
|
|||||||
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _get_section(rules, section):
|
def _cfg_to_icu_rules(rules, section):
|
||||||
""" Get the section named 'section' from the rules. If the section does
|
|
||||||
not exist, raise a usage error with a meaningful message.
|
|
||||||
"""
|
|
||||||
if section not in rules:
|
|
||||||
LOG.fatal("Section '%s' not found in tokenizer config.", section)
|
|
||||||
raise UsageError("Syntax error in tokenizer configuration file.")
|
|
||||||
|
|
||||||
return rules[section]
|
|
||||||
|
|
||||||
|
|
||||||
def _cfg_to_icu_rules(self, rules, section):
|
|
||||||
""" Load an ICU ruleset from the given section. If the section is a
|
""" Load an ICU ruleset from the given section. If the section is a
|
||||||
simple string, it is interpreted as a file name and the rules are
|
simple string, it is interpreted as a file name and the rules are
|
||||||
loaded verbatim from the given file. The filename is expected to be
|
loaded verbatim from the given file. The filename is expected to be
|
||||||
relative to the tokenizer rule file. If the section is a list then
|
relative to the tokenizer rule file. If the section is a list then
|
||||||
each line is assumed to be a rule. All rules are concatenated and returned.
|
each line is assumed to be a rule. All rules are concatenated and returned.
|
||||||
"""
|
"""
|
||||||
content = self._get_section(rules, section)
|
content = _get_section(rules, section)
|
||||||
|
|
||||||
if content is None:
|
if content is None:
|
||||||
return ''
|
return ''
|
||||||
@@ -162,19 +162,27 @@ class TokenAnalyzerRule:
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, rules, normalization_rules):
|
def __init__(self, rules, normalization_rules):
|
||||||
|
# Find the analysis module
|
||||||
|
module_name = 'nominatim.tokenizer.token_analysis.' \
|
||||||
|
+ _get_section(rules, 'analyzer').replace('-', '_')
|
||||||
|
analysis_mod = importlib.import_module(module_name)
|
||||||
|
self._mod_create = analysis_mod.create
|
||||||
|
|
||||||
|
# Load the configuration.
|
||||||
|
self.config = {}
|
||||||
self._parse_variant_list(rules.get('variants'), normalization_rules)
|
self._parse_variant_list(rules.get('variants'), normalization_rules)
|
||||||
|
|
||||||
|
|
||||||
def create(self, normalization_rules, transliteration_rules):
|
def create(self, normalization_rules, transliteration_rules):
|
||||||
""" Create an analyzer from the given rules.
|
""" Create an analyzer from the given rules.
|
||||||
"""
|
"""
|
||||||
return ICUNameProcessor(normalization_rules,
|
return self._mod_create(normalization_rules,
|
||||||
transliteration_rules,
|
transliteration_rules,
|
||||||
self.variants)
|
self.config)
|
||||||
|
|
||||||
|
|
||||||
def _parse_variant_list(self, rules, normalization_rules):
|
def _parse_variant_list(self, rules, normalization_rules):
|
||||||
self.variants = set()
|
vset = set()
|
||||||
|
|
||||||
if not rules:
|
if not rules:
|
||||||
return
|
return
|
||||||
@@ -196,7 +204,9 @@ class TokenAnalyzerRule:
|
|||||||
properties.append(props)
|
properties.append(props)
|
||||||
|
|
||||||
for rule in (section.get('words') or []):
|
for rule in (section.get('words') or []):
|
||||||
self.variants.update(vmaker.compute(rule, props))
|
vset.update(vmaker.compute(rule, props))
|
||||||
|
|
||||||
|
self.config['variants'] = vset
|
||||||
|
|
||||||
|
|
||||||
class _VariantMaker:
|
class _VariantMaker:
|
||||||
|
|||||||
0
nominatim/tokenizer/token_analysis/__init__.py
Normal file
0
nominatim/tokenizer/token_analysis/__init__.py
Normal file
@@ -1,6 +1,5 @@
|
|||||||
"""
|
"""
|
||||||
Processor for names that are imported into the database based on the
|
Generic processor for names that creates abbreviation variants.
|
||||||
ICU library.
|
|
||||||
"""
|
"""
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
import itertools
|
import itertools
|
||||||
@@ -8,8 +7,15 @@ import itertools
|
|||||||
from icu import Transliterator
|
from icu import Transliterator
|
||||||
import datrie
|
import datrie
|
||||||
|
|
||||||
|
### Analysis section
|
||||||
|
|
||||||
class ICUNameProcessor:
|
def create(norm_rules, trans_rules, config):
|
||||||
|
""" Create a new token analysis instance for this module.
|
||||||
|
"""
|
||||||
|
return GenericTokenAnalysis(norm_rules, trans_rules, config['variants'])
|
||||||
|
|
||||||
|
|
||||||
|
class GenericTokenAnalysis:
|
||||||
""" Collects the different transformation rules for normalisation of names
|
""" Collects the different transformation rules for normalisation of names
|
||||||
and provides the functions to apply the transformations.
|
and provides the functions to apply the transformations.
|
||||||
"""
|
"""
|
||||||
@@ -28,7 +28,8 @@ sanitizers:
|
|||||||
- step: split-name-list
|
- step: split-name-list
|
||||||
- step: strip-brace-terms
|
- step: strip-brace-terms
|
||||||
token-analysis:
|
token-analysis:
|
||||||
- variants:
|
- analyzer: generic
|
||||||
|
variants:
|
||||||
- !include icu-rules/variants-bg.yaml
|
- !include icu-rules/variants-bg.yaml
|
||||||
- !include icu-rules/variants-ca.yaml
|
- !include icu-rules/variants-ca.yaml
|
||||||
- !include icu-rules/variants-cs.yaml
|
- !include icu-rules/variants-cs.yaml
|
||||||
|
|||||||
@@ -72,7 +72,8 @@ def analyzer(tokenizer_factory, test_config, monkeypatch,
|
|||||||
cfgstr = {'normalization': list(norm),
|
cfgstr = {'normalization': list(norm),
|
||||||
'sanitizers': sanitizers,
|
'sanitizers': sanitizers,
|
||||||
'transliteration': list(trans),
|
'transliteration': list(trans),
|
||||||
'token-analysis': [{'variants': [{'words': list(variants)}]}]}
|
'token-analysis': [{'analyzer': 'generic',
|
||||||
|
'variants': [{'words': list(variants)}]}]}
|
||||||
(test_config.project_dir / 'icu_tokenizer.yaml').write_text(yaml.dump(cfgstr))
|
(test_config.project_dir / 'icu_tokenizer.yaml').write_text(yaml.dump(cfgstr))
|
||||||
tok.loader = ICURuleLoader(test_config)
|
tok.loader = ICURuleLoader(test_config)
|
||||||
|
|
||||||
|
|||||||
@@ -34,7 +34,7 @@ def cfgrules(test_config):
|
|||||||
- ":: Latin ()"
|
- ":: Latin ()"
|
||||||
- "[[:Punctuation:][:Space:]]+ > ' '"
|
- "[[:Punctuation:][:Space:]]+ > ' '"
|
||||||
""")
|
""")
|
||||||
content += "token-analysis:\n - variants:\n - words:\n"
|
content += "token-analysis:\n - analyzer: generic\n variants:\n - words:\n"
|
||||||
content += '\n'.join((" - " + s for s in variants)) + '\n'
|
content += '\n'.join((" - " + s for s in variants)) + '\n'
|
||||||
for k, v in kwargs:
|
for k, v in kwargs:
|
||||||
content += " {}: {}\n".format(k, v)
|
content += " {}: {}\n".format(k, v)
|
||||||
@@ -50,7 +50,8 @@ def test_empty_rule_set(test_config):
|
|||||||
normalization:
|
normalization:
|
||||||
transliteration:
|
transliteration:
|
||||||
token-analysis:
|
token-analysis:
|
||||||
- variants:
|
- analyzer: generic
|
||||||
|
variants:
|
||||||
"""))
|
"""))
|
||||||
|
|
||||||
rules = ICURuleLoader(test_config)
|
rules = ICURuleLoader(test_config)
|
||||||
@@ -108,7 +109,8 @@ def test_transliteration_rules_from_file(test_config):
|
|||||||
- "'ax' > 'b'"
|
- "'ax' > 'b'"
|
||||||
- !include transliteration.yaml
|
- !include transliteration.yaml
|
||||||
token-analysis:
|
token-analysis:
|
||||||
- variants:
|
- analyzer: generic
|
||||||
|
variants:
|
||||||
"""))
|
"""))
|
||||||
transpath = test_config.project_dir / ('transliteration.yaml')
|
transpath = test_config.project_dir / ('transliteration.yaml')
|
||||||
transpath.write_text('- "x > y"')
|
transpath.write_text('- "x > y"')
|
||||||
@@ -128,7 +130,7 @@ class TestGetReplacements:
|
|||||||
|
|
||||||
def get_replacements(self, *variants):
|
def get_replacements(self, *variants):
|
||||||
loader = ICURuleLoader(self.cfgrules(*variants))
|
loader = ICURuleLoader(self.cfgrules(*variants))
|
||||||
rules = loader.analysis[None].variants
|
rules = loader.analysis[None].config['variants']
|
||||||
|
|
||||||
return set((v.source, v.replacement) for v in rules)
|
return set((v.source, v.replacement) for v in rules)
|
||||||
|
|
||||||
|
|||||||
@@ -28,7 +28,7 @@ def cfgfile(def_config, tmp_path):
|
|||||||
- ":: Latin ()"
|
- ":: Latin ()"
|
||||||
- "'🜵' > ' '"
|
- "'🜵' > ' '"
|
||||||
""")
|
""")
|
||||||
content += "token-analysis:\n - variants:\n - words:\n"
|
content += "token-analysis:\n - analyzer: generic\n variants:\n - words:\n"
|
||||||
content += '\n'.join((" - " + s for s in variants)) + '\n'
|
content += '\n'.join((" - " + s for s in variants)) + '\n'
|
||||||
for k, v in kwargs:
|
for k, v in kwargs:
|
||||||
content += " {}: {}\n".format(k, v)
|
content += " {}: {}\n".format(k, v)
|
||||||
Reference in New Issue
Block a user