forked from hans/Nominatim
harmonize interface of token analysis module
The configure() function now receives the compiled normalizer and transliterator (ICU Transliterator objects) instead of the ICU normalization rules as a string. This harmonizes its parameters with those of the create() function.
This commit is contained in:
@@ -12,6 +12,8 @@ import io
|
|||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
|
from icu import Transliterator
|
||||||
|
|
||||||
from nominatim.config import flatten_config_list, Configuration
|
from nominatim.config import flatten_config_list, Configuration
|
||||||
from nominatim.db.properties import set_property, get_property
|
from nominatim.db.properties import set_property, get_property
|
||||||
from nominatim.db.connection import Connection
|
from nominatim.db.connection import Connection
|
||||||
@@ -135,6 +137,11 @@ class ICURuleLoader:
|
|||||||
if not isinstance(self.analysis_rules, list):
|
if not isinstance(self.analysis_rules, list):
|
||||||
raise UsageError("Configuration section 'token-analysis' must be a list.")
|
raise UsageError("Configuration section 'token-analysis' must be a list.")
|
||||||
|
|
||||||
|
norm = Transliterator.createFromRules("rule_loader_normalization",
|
||||||
|
self.normalization_rules)
|
||||||
|
trans = Transliterator.createFromRules("rule_loader_transliteration",
|
||||||
|
self.transliteration_rules)
|
||||||
|
|
||||||
for section in self.analysis_rules:
|
for section in self.analysis_rules:
|
||||||
name = section.get('id', None)
|
name = section.get('id', None)
|
||||||
if name in self.analysis:
|
if name in self.analysis:
|
||||||
@@ -144,8 +151,7 @@ class ICURuleLoader:
|
|||||||
LOG.fatal("ICU tokenizer configuration has two token "
|
LOG.fatal("ICU tokenizer configuration has two token "
|
||||||
"analyzers with id '%s'.", name)
|
"analyzers with id '%s'.", name)
|
||||||
raise UsageError("Syntax error in ICU tokenizer config.")
|
raise UsageError("Syntax error in ICU tokenizer config.")
|
||||||
self.analysis[name] = TokenAnalyzerRule(section,
|
self.analysis[name] = TokenAnalyzerRule(section, norm, trans,
|
||||||
self.normalization_rules,
|
|
||||||
self.config)
|
self.config)
|
||||||
|
|
||||||
|
|
||||||
@@ -170,7 +176,8 @@ class TokenAnalyzerRule:
|
|||||||
and creates a new token analyzer on request.
|
and creates a new token analyzer on request.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, rules: Mapping[str, Any], normalization_rules: str,
|
def __init__(self, rules: Mapping[str, Any],
|
||||||
|
normalizer: Any, transliterator: Any,
|
||||||
config: Configuration) -> None:
|
config: Configuration) -> None:
|
||||||
analyzer_name = _get_section(rules, 'analyzer')
|
analyzer_name = _get_section(rules, 'analyzer')
|
||||||
if not analyzer_name or not isinstance(analyzer_name, str):
|
if not analyzer_name or not isinstance(analyzer_name, str):
|
||||||
@@ -179,7 +186,8 @@ class TokenAnalyzerRule:
|
|||||||
self._analysis_mod: AnalysisModule = \
|
self._analysis_mod: AnalysisModule = \
|
||||||
config.load_plugin_module(analyzer_name, 'nominatim.tokenizer.token_analysis')
|
config.load_plugin_module(analyzer_name, 'nominatim.tokenizer.token_analysis')
|
||||||
|
|
||||||
self.config = self._analysis_mod.configure(rules, normalization_rules)
|
self.config = self._analysis_mod.configure(rules, normalizer,
|
||||||
|
transliterator)
|
||||||
|
|
||||||
|
|
||||||
def create(self, normalizer: Any, transliterator: Any) -> Analyser:
|
def create(self, normalizer: Any, transliterator: Any) -> Analyser:
|
||||||
|
|||||||
@@ -30,7 +30,8 @@ class AnalysisModule(Protocol):
|
|||||||
""" Protocol for analysis modules.
|
""" Protocol for analysis modules.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def configure(self, rules: Mapping[str, Any], normalization_rules: str) -> Any:
|
def configure(self, rules: Mapping[str, Any],
|
||||||
|
normalizer: Any, transliterator: Any) -> Any:
|
||||||
""" Prepare the configuration of the analysis module.
|
""" Prepare the configuration of the analysis module.
|
||||||
This function should prepare all data that can be shared
|
This function should prepare all data that can be shared
|
||||||
between instances of this analyser.
|
between instances of this analyser.
|
||||||
@@ -38,8 +39,10 @@ class AnalysisModule(Protocol):
|
|||||||
Arguments:
|
Arguments:
|
||||||
rules: A dictionary with the additional configuration options
|
rules: A dictionary with the additional configuration options
|
||||||
as specified in the tokenizer configuration.
|
as specified in the tokenizer configuration.
|
||||||
normalization_rules: ICU rules for normalization as a string
|
normalizer: an ICU Transliterator with the compiled normalization
|
||||||
that can be used with createFromRules().
|
rules.
|
||||||
|
transliterator: an ICU Transliterator with the compiled
|
||||||
|
transliteration rules.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
A data object with the configuration that was set up. May be
|
A data object with the configuration that was set up. May be
|
||||||
|
|||||||
@@ -12,8 +12,6 @@ from collections import defaultdict
|
|||||||
import itertools
|
import itertools
|
||||||
import re
|
import re
|
||||||
|
|
||||||
from icu import Transliterator
|
|
||||||
|
|
||||||
from nominatim.config import flatten_config_list
|
from nominatim.config import flatten_config_list
|
||||||
from nominatim.errors import UsageError
|
from nominatim.errors import UsageError
|
||||||
|
|
||||||
@@ -25,7 +23,7 @@ class ICUVariant(NamedTuple):
|
|||||||
|
|
||||||
|
|
||||||
def get_variant_config(in_rules: Any,
|
def get_variant_config(in_rules: Any,
|
||||||
normalization_rules: str) -> Tuple[List[Tuple[str, List[str]]], str]:
|
normalizer: Any) -> Tuple[List[Tuple[str, List[str]]], str]:
|
||||||
""" Convert the variant definition from the configuration into
|
""" Convert the variant definition from the configuration into
|
||||||
replacement sets.
|
replacement sets.
|
||||||
|
|
||||||
@@ -39,7 +37,7 @@ def get_variant_config(in_rules: Any,
|
|||||||
vset: Set[ICUVariant] = set()
|
vset: Set[ICUVariant] = set()
|
||||||
rules = flatten_config_list(in_rules, 'variants')
|
rules = flatten_config_list(in_rules, 'variants')
|
||||||
|
|
||||||
vmaker = _VariantMaker(normalization_rules)
|
vmaker = _VariantMaker(normalizer)
|
||||||
|
|
||||||
for section in rules:
|
for section in rules:
|
||||||
for rule in (section.get('words') or []):
|
for rule in (section.get('words') or []):
|
||||||
@@ -63,9 +61,8 @@ class _VariantMaker:
|
|||||||
All text in rules is normalized to make sure the variants match later.
|
All text in rules is normalized to make sure the variants match later.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, norm_rules: Any) -> None:
|
def __init__(self, normalizer: Any) -> None:
|
||||||
self.norm = Transliterator.createFromRules("rule_loader_normalization",
|
self.norm = normalizer
|
||||||
norm_rules)
|
|
||||||
|
|
||||||
|
|
||||||
def compute(self, rule: Any) -> Iterator[ICUVariant]:
|
def compute(self, rule: Any) -> Iterator[ICUVariant]:
|
||||||
|
|||||||
@@ -18,13 +18,13 @@ from nominatim.tokenizer.token_analysis.generic_mutation import MutationVariantG
|
|||||||
|
|
||||||
### Configuration section
|
### Configuration section
|
||||||
|
|
||||||
def configure(rules: Mapping[str, Any], normalization_rules: str) -> Dict[str, Any]:
|
def configure(rules: Mapping[str, Any], normalizer: Any, _: Any) -> Dict[str, Any]:
|
||||||
""" Extract and preprocess the configuration for this module.
|
""" Extract and preprocess the configuration for this module.
|
||||||
"""
|
"""
|
||||||
config: Dict[str, Any] = {}
|
config: Dict[str, Any] = {}
|
||||||
|
|
||||||
config['replacements'], config['chars'] = get_variant_config(rules.get('variants'),
|
config['replacements'], config['chars'] = get_variant_config(rules.get('variants'),
|
||||||
normalization_rules)
|
normalizer)
|
||||||
config['variant_only'] = rules.get('mode', '') == 'variant-only'
|
config['variant_only'] = rules.get('mode', '') == 'variant-only'
|
||||||
|
|
||||||
# parse mutation rules
|
# parse mutation rules
|
||||||
|
|||||||
@@ -8,7 +8,7 @@
|
|||||||
Specialized processor for housenumbers. Analyses common housenumber patterns
|
Specialized processor for housenumbers. Analyses common housenumber patterns
|
||||||
and creates variants for them.
|
and creates variants for them.
|
||||||
"""
|
"""
|
||||||
from typing import Mapping, Any, List, cast
|
from typing import Any, List, cast
|
||||||
import re
|
import re
|
||||||
|
|
||||||
from nominatim.tokenizer.token_analysis.generic_mutation import MutationVariantGenerator
|
from nominatim.tokenizer.token_analysis.generic_mutation import MutationVariantGenerator
|
||||||
@@ -20,7 +20,7 @@ RE_NAMED_PART = re.compile(r'[a-z]{4}')
|
|||||||
|
|
||||||
### Configuration section
|
### Configuration section
|
||||||
|
|
||||||
def configure(rules: Mapping[str, Any], normalization_rules: str) -> None: # pylint: disable=W0613
|
def configure(*_: Any) -> None:
|
||||||
""" All behaviour is currently hard-coded.
|
""" All behaviour is currently hard-coded.
|
||||||
"""
|
"""
|
||||||
return None
|
return None
|
||||||
|
|||||||
@@ -8,13 +8,13 @@
|
|||||||
Specialized processor for postcodes. Supports a 'lookup' variant of the
|
Specialized processor for postcodes. Supports a 'lookup' variant of the
|
||||||
token, which produces variants with optional spaces.
|
token, which produces variants with optional spaces.
|
||||||
"""
|
"""
|
||||||
from typing import Mapping, Any, List
|
from typing import Any, List
|
||||||
|
|
||||||
from nominatim.tokenizer.token_analysis.generic_mutation import MutationVariantGenerator
|
from nominatim.tokenizer.token_analysis.generic_mutation import MutationVariantGenerator
|
||||||
|
|
||||||
### Configuration section
|
### Configuration section
|
||||||
|
|
||||||
def configure(rules: Mapping[str, Any], normalization_rules: str) -> None: # pylint: disable=W0613
|
def configure(*_: Any) -> None:
|
||||||
""" All behaviour is currently hard-coded.
|
""" All behaviour is currently hard-coded.
|
||||||
"""
|
"""
|
||||||
return None
|
return None
|
||||||
|
|||||||
@@ -30,9 +30,9 @@ def make_analyser(*variants, variant_only=False):
|
|||||||
rules = { 'analyzer': 'generic', 'variants': [{'words': variants}]}
|
rules = { 'analyzer': 'generic', 'variants': [{'words': variants}]}
|
||||||
if variant_only:
|
if variant_only:
|
||||||
rules['mode'] = 'variant-only'
|
rules['mode'] = 'variant-only'
|
||||||
config = module.configure(rules, DEFAULT_NORMALIZATION)
|
|
||||||
trans = Transliterator.createFromRules("test_trans", DEFAULT_TRANSLITERATION)
|
trans = Transliterator.createFromRules("test_trans", DEFAULT_TRANSLITERATION)
|
||||||
norm = Transliterator.createFromRules("test_norm", DEFAULT_NORMALIZATION)
|
norm = Transliterator.createFromRules("test_norm", DEFAULT_NORMALIZATION)
|
||||||
|
config = module.configure(rules, norm, trans)
|
||||||
|
|
||||||
return module.create(norm, trans, config)
|
return module.create(norm, trans, config)
|
||||||
|
|
||||||
@@ -44,9 +44,9 @@ def get_normalized_variants(proc, name):
|
|||||||
|
|
||||||
def test_no_variants():
|
def test_no_variants():
|
||||||
rules = { 'analyzer': 'generic' }
|
rules = { 'analyzer': 'generic' }
|
||||||
config = module.configure(rules, DEFAULT_NORMALIZATION)
|
|
||||||
trans = Transliterator.createFromRules("test_trans", DEFAULT_TRANSLITERATION)
|
trans = Transliterator.createFromRules("test_trans", DEFAULT_TRANSLITERATION)
|
||||||
norm = Transliterator.createFromRules("test_norm", DEFAULT_NORMALIZATION)
|
norm = Transliterator.createFromRules("test_norm", DEFAULT_NORMALIZATION)
|
||||||
|
config = module.configure(rules, norm, trans)
|
||||||
|
|
||||||
proc = module.create(norm, trans, config)
|
proc = module.create(norm, trans, config)
|
||||||
|
|
||||||
@@ -123,7 +123,9 @@ class TestGetReplacements:
|
|||||||
@staticmethod
|
@staticmethod
|
||||||
def configure_rules(*variants):
|
def configure_rules(*variants):
|
||||||
rules = { 'analyzer': 'generic', 'variants': [{'words': variants}]}
|
rules = { 'analyzer': 'generic', 'variants': [{'words': variants}]}
|
||||||
return module.configure(rules, DEFAULT_NORMALIZATION)
|
trans = Transliterator.createFromRules("test_trans", DEFAULT_TRANSLITERATION)
|
||||||
|
norm = Transliterator.createFromRules("test_norm", DEFAULT_NORMALIZATION)
|
||||||
|
return module.configure(rules, norm, trans)
|
||||||
|
|
||||||
|
|
||||||
def get_replacements(self, *variants):
|
def get_replacements(self, *variants):
|
||||||
|
|||||||
@@ -31,9 +31,9 @@ class TestMutationNoVariants:
|
|||||||
'mutations': [ {'pattern': m[0], 'replacements': m[1]}
|
'mutations': [ {'pattern': m[0], 'replacements': m[1]}
|
||||||
for m in mutations]
|
for m in mutations]
|
||||||
}
|
}
|
||||||
config = module.configure(rules, DEFAULT_NORMALIZATION)
|
|
||||||
trans = Transliterator.createFromRules("test_trans", DEFAULT_TRANSLITERATION)
|
trans = Transliterator.createFromRules("test_trans", DEFAULT_TRANSLITERATION)
|
||||||
norm = Transliterator.createFromRules("test_norm", DEFAULT_NORMALIZATION)
|
norm = Transliterator.createFromRules("test_norm", DEFAULT_NORMALIZATION)
|
||||||
|
config = module.configure(rules, norm, trans)
|
||||||
|
|
||||||
self.analysis = module.create(norm, trans, config)
|
self.analysis = module.create(norm, trans, config)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user