harmonize interface of token analysis module

The configure() function now receives a Transliterator object instead
of the ICU rules. This harmonizes the parameters with the create
function.
This commit is contained in:
Sarah Hoffmann
2022-07-29 10:43:07 +02:00
parent f0d640961a
commit c8873d34af
8 changed files with 34 additions and 24 deletions

View File

@@ -12,6 +12,8 @@ import io
import json import json
import logging import logging
from icu import Transliterator
from nominatim.config import flatten_config_list, Configuration from nominatim.config import flatten_config_list, Configuration
from nominatim.db.properties import set_property, get_property from nominatim.db.properties import set_property, get_property
from nominatim.db.connection import Connection from nominatim.db.connection import Connection
@@ -135,6 +137,11 @@ class ICURuleLoader:
if not isinstance(self.analysis_rules, list): if not isinstance(self.analysis_rules, list):
raise UsageError("Configuration section 'token-analysis' must be a list.") raise UsageError("Configuration section 'token-analysis' must be a list.")
norm = Transliterator.createFromRules("rule_loader_normalization",
self.normalization_rules)
trans = Transliterator.createFromRules("rule_loader_transliteration",
self.transliteration_rules)
for section in self.analysis_rules: for section in self.analysis_rules:
name = section.get('id', None) name = section.get('id', None)
if name in self.analysis: if name in self.analysis:
@@ -144,8 +151,7 @@ class ICURuleLoader:
LOG.fatal("ICU tokenizer configuration has two token " LOG.fatal("ICU tokenizer configuration has two token "
"analyzers with id '%s'.", name) "analyzers with id '%s'.", name)
raise UsageError("Syntax error in ICU tokenizer config.") raise UsageError("Syntax error in ICU tokenizer config.")
self.analysis[name] = TokenAnalyzerRule(section, self.analysis[name] = TokenAnalyzerRule(section, norm, trans,
self.normalization_rules,
self.config) self.config)
@@ -170,7 +176,8 @@ class TokenAnalyzerRule:
and creates a new token analyzer on request. and creates a new token analyzer on request.
""" """
def __init__(self, rules: Mapping[str, Any], normalization_rules: str, def __init__(self, rules: Mapping[str, Any],
normalizer: Any, transliterator: Any,
config: Configuration) -> None: config: Configuration) -> None:
analyzer_name = _get_section(rules, 'analyzer') analyzer_name = _get_section(rules, 'analyzer')
if not analyzer_name or not isinstance(analyzer_name, str): if not analyzer_name or not isinstance(analyzer_name, str):
@@ -179,7 +186,8 @@ class TokenAnalyzerRule:
self._analysis_mod: AnalysisModule = \ self._analysis_mod: AnalysisModule = \
config.load_plugin_module(analyzer_name, 'nominatim.tokenizer.token_analysis') config.load_plugin_module(analyzer_name, 'nominatim.tokenizer.token_analysis')
self.config = self._analysis_mod.configure(rules, normalization_rules) self.config = self._analysis_mod.configure(rules, normalizer,
transliterator)
def create(self, normalizer: Any, transliterator: Any) -> Analyser: def create(self, normalizer: Any, transliterator: Any) -> Analyser:

View File

@@ -30,7 +30,8 @@ class AnalysisModule(Protocol):
""" Protocol for analysis modules. """ Protocol for analysis modules.
""" """
def configure(self, rules: Mapping[str, Any], normalization_rules: str) -> Any: def configure(self, rules: Mapping[str, Any],
normalizer: Any, transliterator: Any) -> Any:
""" Prepare the configuration of the analysis module. """ Prepare the configuration of the analysis module.
This function should prepare all data that can be shared This function should prepare all data that can be shared
between instances of this analyser. between instances of this analyser.
@@ -38,8 +39,10 @@ class AnalysisModule(Protocol):
Arguments: Arguments:
rules: A dictionary with the additional configuration options rules: A dictionary with the additional configuration options
as specified in the tokenizer configuration. as specified in the tokenizer configuration.
normalization_rules: ICU rules for normalization as a string normalizer: an ICU Transliterator with the compiled normalization
that can be used with createFromRules(). rules.
transliterator: an ICU Transliterator with the compiled
transliteration rules.
Returns: Returns:
A data object with the configuration that was set up. May be A data object with the configuration that was set up. May be

View File

@@ -12,8 +12,6 @@ from collections import defaultdict
import itertools import itertools
import re import re
from icu import Transliterator
from nominatim.config import flatten_config_list from nominatim.config import flatten_config_list
from nominatim.errors import UsageError from nominatim.errors import UsageError
@@ -25,7 +23,7 @@ class ICUVariant(NamedTuple):
def get_variant_config(in_rules: Any, def get_variant_config(in_rules: Any,
normalization_rules: str) -> Tuple[List[Tuple[str, List[str]]], str]: normalizer: Any) -> Tuple[List[Tuple[str, List[str]]], str]:
""" Convert the variant definition from the configuration into """ Convert the variant definition from the configuration into
replacement sets. replacement sets.
@@ -39,7 +37,7 @@ def get_variant_config(in_rules: Any,
vset: Set[ICUVariant] = set() vset: Set[ICUVariant] = set()
rules = flatten_config_list(in_rules, 'variants') rules = flatten_config_list(in_rules, 'variants')
vmaker = _VariantMaker(normalization_rules) vmaker = _VariantMaker(normalizer)
for section in rules: for section in rules:
for rule in (section.get('words') or []): for rule in (section.get('words') or []):
@@ -63,9 +61,8 @@ class _VariantMaker:
All text in rules is normalized to make sure the variants match later. All text in rules is normalized to make sure the variants match later.
""" """
def __init__(self, norm_rules: Any) -> None: def __init__(self, normalizer: Any) -> None:
self.norm = Transliterator.createFromRules("rule_loader_normalization", self.norm = normalizer
norm_rules)
def compute(self, rule: Any) -> Iterator[ICUVariant]: def compute(self, rule: Any) -> Iterator[ICUVariant]:

View File

@@ -18,13 +18,13 @@ from nominatim.tokenizer.token_analysis.generic_mutation import MutationVariantG
### Configuration section ### Configuration section
def configure(rules: Mapping[str, Any], normalization_rules: str) -> Dict[str, Any]: def configure(rules: Mapping[str, Any], normalizer: Any, _: Any) -> Dict[str, Any]:
""" Extract and preprocess the configuration for this module. """ Extract and preprocess the configuration for this module.
""" """
config: Dict[str, Any] = {} config: Dict[str, Any] = {}
config['replacements'], config['chars'] = get_variant_config(rules.get('variants'), config['replacements'], config['chars'] = get_variant_config(rules.get('variants'),
normalization_rules) normalizer)
config['variant_only'] = rules.get('mode', '') == 'variant-only' config['variant_only'] = rules.get('mode', '') == 'variant-only'
# parse mutation rules # parse mutation rules

View File

@@ -8,7 +8,7 @@
Specialized processor for housenumbers. Analyses common housenumber patterns Specialized processor for housenumbers. Analyses common housenumber patterns
and creates variants for them. and creates variants for them.
""" """
from typing import Mapping, Any, List, cast from typing import Any, List, cast
import re import re
from nominatim.tokenizer.token_analysis.generic_mutation import MutationVariantGenerator from nominatim.tokenizer.token_analysis.generic_mutation import MutationVariantGenerator
@@ -20,7 +20,7 @@ RE_NAMED_PART = re.compile(r'[a-z]{4}')
### Configuration section ### Configuration section
def configure(rules: Mapping[str, Any], normalization_rules: str) -> None: # pylint: disable=W0613 def configure(*_: Any) -> None:
""" All behaviour is currently hard-coded. """ All behaviour is currently hard-coded.
""" """
return None return None

View File

@@ -8,13 +8,13 @@
Specialized processor for postcodes. Supports a 'lookup' variant of the Specialized processor for postcodes. Supports a 'lookup' variant of the
token, which produces variants with optional spaces. token, which produces variants with optional spaces.
""" """
from typing import Mapping, Any, List from typing import Any, List
from nominatim.tokenizer.token_analysis.generic_mutation import MutationVariantGenerator from nominatim.tokenizer.token_analysis.generic_mutation import MutationVariantGenerator
### Configuration section ### Configuration section
def configure(rules: Mapping[str, Any], normalization_rules: str) -> None: # pylint: disable=W0613 def configure(*_: Any) -> None:
""" All behaviour is currently hard-coded. """ All behaviour is currently hard-coded.
""" """
return None return None

View File

@@ -30,9 +30,9 @@ def make_analyser(*variants, variant_only=False):
rules = { 'analyzer': 'generic', 'variants': [{'words': variants}]} rules = { 'analyzer': 'generic', 'variants': [{'words': variants}]}
if variant_only: if variant_only:
rules['mode'] = 'variant-only' rules['mode'] = 'variant-only'
config = module.configure(rules, DEFAULT_NORMALIZATION)
trans = Transliterator.createFromRules("test_trans", DEFAULT_TRANSLITERATION) trans = Transliterator.createFromRules("test_trans", DEFAULT_TRANSLITERATION)
norm = Transliterator.createFromRules("test_norm", DEFAULT_NORMALIZATION) norm = Transliterator.createFromRules("test_norm", DEFAULT_NORMALIZATION)
config = module.configure(rules, norm, trans)
return module.create(norm, trans, config) return module.create(norm, trans, config)
@@ -44,9 +44,9 @@ def get_normalized_variants(proc, name):
def test_no_variants(): def test_no_variants():
rules = { 'analyzer': 'generic' } rules = { 'analyzer': 'generic' }
config = module.configure(rules, DEFAULT_NORMALIZATION)
trans = Transliterator.createFromRules("test_trans", DEFAULT_TRANSLITERATION) trans = Transliterator.createFromRules("test_trans", DEFAULT_TRANSLITERATION)
norm = Transliterator.createFromRules("test_norm", DEFAULT_NORMALIZATION) norm = Transliterator.createFromRules("test_norm", DEFAULT_NORMALIZATION)
config = module.configure(rules, norm, trans)
proc = module.create(norm, trans, config) proc = module.create(norm, trans, config)
@@ -123,7 +123,9 @@ class TestGetReplacements:
@staticmethod @staticmethod
def configure_rules(*variants): def configure_rules(*variants):
rules = { 'analyzer': 'generic', 'variants': [{'words': variants}]} rules = { 'analyzer': 'generic', 'variants': [{'words': variants}]}
return module.configure(rules, DEFAULT_NORMALIZATION) trans = Transliterator.createFromRules("test_trans", DEFAULT_TRANSLITERATION)
norm = Transliterator.createFromRules("test_norm", DEFAULT_NORMALIZATION)
return module.configure(rules, norm, trans)
def get_replacements(self, *variants): def get_replacements(self, *variants):

View File

@@ -31,9 +31,9 @@ class TestMutationNoVariants:
'mutations': [ {'pattern': m[0], 'replacements': m[1]} 'mutations': [ {'pattern': m[0], 'replacements': m[1]}
for m in mutations] for m in mutations]
} }
config = module.configure(rules, DEFAULT_NORMALIZATION)
trans = Transliterator.createFromRules("test_trans", DEFAULT_TRANSLITERATION) trans = Transliterator.createFromRules("test_trans", DEFAULT_TRANSLITERATION)
norm = Transliterator.createFromRules("test_norm", DEFAULT_NORMALIZATION) norm = Transliterator.createFromRules("test_norm", DEFAULT_NORMALIZATION)
config = module.configure(rules, norm, trans)
self.analysis = module.create(norm, trans, config) self.analysis = module.create(norm, trans, config)