add support for external sanitizer modules

This commit is contained in:
Sarah Hoffmann
2022-07-25 16:10:19 +02:00
parent 7b7203c149
commit 6d41046b15
9 changed files with 142 additions and 94 deletions

View File

@@ -45,6 +45,7 @@ class ICURuleLoader:
"""
def __init__(self, config: Configuration) -> None:
self.config = config
rules = config.load_sub_configuration('icu_tokenizer.yaml',
config='TOKENIZER_CONFIG')
@@ -92,7 +93,7 @@ class ICURuleLoader:
def make_sanitizer(self) -> PlaceSanitizer:
""" Create a place sanitizer from the configured rules.
"""
return PlaceSanitizer(self.sanitizer_rules)
return PlaceSanitizer(self.sanitizer_rules, self.config)
def make_token_analysis(self) -> ICUTokenAnalysis:

View File

@@ -9,9 +9,9 @@ Handler for cleaning name and address tags in place information before it
is handed to the token analysis.
"""
from typing import Optional, List, Mapping, Sequence, Callable, Any, Tuple
import importlib
from nominatim.errors import UsageError
from nominatim.config import Configuration
from nominatim.tokenizer.sanitizers.config import SanitizerConfig
from nominatim.tokenizer.sanitizers.base import SanitizerHandler, ProcessInfo, PlaceName
from nominatim.data.place_info import PlaceInfo
@@ -22,16 +22,21 @@ class PlaceSanitizer:
names and address before they are used by the token analysers.
"""
def __init__(self, rules: Optional[Sequence[Mapping[str, Any]]]) -> None:
def __init__(self, rules: Optional[Sequence[Mapping[str, Any]]],
config: Configuration) -> None:
self.handlers: List[Callable[[ProcessInfo], None]] = []
if rules:
for func in rules:
if 'step' not in func:
raise UsageError("Sanitizer rule is missing the 'step' attribute.")
module_name = 'nominatim.tokenizer.sanitizers.' + func['step'].replace('-', '_')
handler_module: SanitizerHandler = importlib.import_module(module_name)
self.handlers.append(handler_module.create(SanitizerConfig(func)))
if not isinstance(func['step'], str):
raise UsageError("'step' attribute must be a simple string.")
module: SanitizerHandler = \
config.load_plugin_module(func['step'], 'nominatim.tokenizer.sanitizers')
self.handlers.append(module.create(SanitizerConfig(func)))
def process_names(self, place: PlaceInfo) -> Tuple[List[PlaceName], List[PlaceName]]: