add type annotations to ICU tokenizer helper modules

Sarah Hoffmann
2022-07-13 22:55:40 +02:00
parent 77510f4a3b
commit e37cfc64d2
5 changed files with 73 additions and 34 deletions

nominatim/db/properties.py

@@ -28,7 +28,7 @@ def set_property(conn: Connection, name: str, value: str) -> None:
def get_property(conn: Connection, name: str) -> Optional[str]:
""" Return the current value of the given propery or None if the property
""" Return the current value of the given property or None if the property
is not set.
"""
if not conn.table_exists('nominatim_properties'):
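
With the Optional[str] annotation on get_property(), callers have to check for None before using the value. A minimal usage sketch (the DSN and property name are illustrative):

from nominatim.db import properties
from nominatim.db.connection import connect

with connect('dbname=nominatim') as conn:
    properties.set_property(conn, 'database_version', '4.1.0')

    value = properties.get_property(conn, 'database_version')
    if value is None:                         # mypy enforces this narrowing
        raise RuntimeError('property not set')
    print(value.upper())                      # value is a plain str here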

nominatim/tokenizer/base.py

@@ -10,12 +10,13 @@ mainly for documentation purposes.
"""
from abc import ABC, abstractmethod
from typing import List, Tuple, Dict, Any
from pathlib import Path
from typing_extensions import Protocol
from nominatim.config import Configuration
from nominatim.data.place_info import PlaceInfo
# pylint: disable=unnecessary-pass
class AbstractAnalyzer(ABC):
""" The analyzer provides the functions for analysing names and building
the token database.
@@ -230,3 +231,13 @@ class AbstractTokenizer(ABC):
When used outside the with construct, the caller must ensure to
call the close() function before destructing the analyzer.
"""
class TokenizerModule(Protocol):
""" Interface that must be exported by modules that implement their
own tokenizer.
"""
def create(self, dsn: str, data_dir: Path) -> AbstractTokenizer:
""" Factory for new tokenizers.
"""

nominatim/tokenizer/factory.py

@@ -19,17 +19,20 @@ database.
A tokenizer usually also includes PHP code for querying. The appropriate PHP
normalizer module is installed when the tokenizer is created.
"""
from typing import Optional
import logging
import importlib
from pathlib import Path
from ..errors import UsageError
from ..db import properties
from ..db.connection import connect
from nominatim.errors import UsageError
from nominatim.db import properties
from nominatim.db.connection import connect
from nominatim.config import Configuration
from nominatim.tokenizer.base import AbstractTokenizer, TokenizerModule
LOG = logging.getLogger()
def _import_tokenizer(name):
def _import_tokenizer(name: str) -> TokenizerModule:
""" Load the tokenizer.py module from project directory.
"""
src_file = Path(__file__).parent / (name + '_tokenizer.py')
@@ -41,7 +44,8 @@ def _import_tokenizer(name):
return importlib.import_module('nominatim.tokenizer.' + name + '_tokenizer')
def create_tokenizer(config, init_db=True, module_name=None):
def create_tokenizer(config: Configuration, init_db: bool = True,
module_name: Optional[str] = None) -> AbstractTokenizer:
""" Create a new tokenizer as defined by the given configuration.
The tokenizer data and code is copied into the 'tokenizer' directory
@@ -70,7 +74,7 @@ def create_tokenizer(config, init_db=True, module_name=None):
return tokenizer
def get_tokenizer_for_db(config):
def get_tokenizer_for_db(config: Configuration) -> AbstractTokenizer:
""" Instantiate a tokenizer for an existing database.
The function looks up the appropriate tokenizer in the database
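
Taken together, the annotated entry points are used roughly like this (a sketch; the Configuration is assumed to be fully set up elsewhere):

from nominatim.config import Configuration
from nominatim.tokenizer import factory
from nominatim.tokenizer.base import AbstractTokenizer


def import_phase(config: Configuration) -> AbstractTokenizer:
    # fresh import: create the tokenizer and initialise its database tables
    return factory.create_tokenizer(config, init_db=True, module_name='icu')


def query_phase(config: Configuration) -> AbstractTokenizer:
    # existing database: instantiate the tokenizer recorded in it
    return factory.get_tokenizer_for_db(config)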

nominatim/tokenizer/icu_rule_loader.py

@@ -7,16 +7,19 @@
"""
Helper class to create ICU rules from a configuration file.
"""
from typing import Mapping, Any, Generic, Dict, Optional
import importlib
import io
import json
import logging
from nominatim.config import flatten_config_list
from nominatim.config import flatten_config_list, Configuration
from nominatim.db.properties import set_property, get_property
from nominatim.db.connection import Connection
from nominatim.errors import UsageError
from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
from nominatim.tokenizer.icu_token_analysis import ICUTokenAnalysis
from nominatim.tokenizer.token_analysis.base import AnalysisModule, Analyser, T_config
import nominatim.data.country_info
LOG = logging.getLogger()
@@ -26,7 +29,7 @@ DBCFG_IMPORT_TRANS_RULES = "tokenizer_import_transliteration"
DBCFG_IMPORT_ANALYSIS_RULES = "tokenizer_import_analysis_rules"
def _get_section(rules, section):
def _get_section(rules: Mapping[str, Any], section: str) -> Any:
""" Get the section named 'section' from the rules. If the section does
not exist, raise a usage error with a meaningful message.
"""
@@ -41,7 +44,7 @@ class ICURuleLoader:
""" Compiler for ICU rules from a tokenizer configuration file.
"""
def __init__(self, config):
def __init__(self, config: Configuration) -> None:
rules = config.load_sub_configuration('icu_tokenizer.yaml',
config='TOKENIZER_CONFIG')
@@ -57,17 +60,27 @@ class ICURuleLoader:
self.sanitizer_rules = rules.get('sanitizers', [])
def load_config_from_db(self, conn):
def load_config_from_db(self, conn: Connection) -> None:
""" Get previously saved parts of the configuration from the
database.
"""
self.normalization_rules = get_property(conn, DBCFG_IMPORT_NORM_RULES)
self.transliteration_rules = get_property(conn, DBCFG_IMPORT_TRANS_RULES)
self.analysis_rules = json.loads(get_property(conn, DBCFG_IMPORT_ANALYSIS_RULES))
rules = get_property(conn, DBCFG_IMPORT_NORM_RULES)
if rules is not None:
self.normalization_rules = rules
rules = get_property(conn, DBCFG_IMPORT_TRANS_RULES)
if rules is not None:
self.transliteration_rules = rules
rules = get_property(conn, DBCFG_IMPORT_ANALYSIS_RULES)
if rules:
self.analysis_rules = json.loads(rules)
else:
self.analysis_rules = []
self._setup_analysis()
def save_config_to_db(self, conn):
def save_config_to_db(self, conn: Connection) -> None:
""" Save the part of the configuration that cannot be changed into
the database.
"""
@@ -76,20 +89,20 @@ class ICURuleLoader:
set_property(conn, DBCFG_IMPORT_ANALYSIS_RULES, json.dumps(self.analysis_rules))
def make_sanitizer(self):
def make_sanitizer(self) -> PlaceSanitizer:
""" Create a place sanitizer from the configured rules.
"""
return PlaceSanitizer(self.sanitizer_rules)
def make_token_analysis(self):
def make_token_analysis(self) -> ICUTokenAnalysis:
""" Create a token analyser from the reviouly loaded rules.
"""
return ICUTokenAnalysis(self.normalization_rules,
self.transliteration_rules, self.analysis)
def get_search_rules(self):
def get_search_rules(self) -> str:
""" Return the ICU rules to be used during search.
The rules combine normalization and transliteration.
"""
@@ -102,22 +115,22 @@ class ICURuleLoader:
return rules.getvalue()
def get_normalization_rules(self):
def get_normalization_rules(self) -> str:
""" Return rules for normalisation of a term.
"""
return self.normalization_rules
def get_transliteration_rules(self):
def get_transliteration_rules(self) -> str:
""" Return the rules for converting a string into its asciii representation.
"""
return self.transliteration_rules
def _setup_analysis(self):
def _setup_analysis(self) -> None:
""" Process the rules used for creating the various token analyzers.
"""
self.analysis = {}
self.analysis: Dict[Optional[str], TokenAnalyzerRule[Any]] = {}
if not isinstance(self.analysis_rules, list):
raise UsageError("Configuration section 'token-analysis' must be a list.")
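
For reference, the parsed rules are a list of mappings, each naming its analysis module and an optional id. An illustrative shape (the analyzer names follow the stock configuration):

# plausible shape of self.analysis_rules from icu_tokenizer.yaml
analysis_rules = [
    {'analyzer': 'generic'},                             # default rule, id None
    {'id': '@housenumber', 'analyzer': 'housenumbers'},  # named rule
]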
@@ -135,7 +148,7 @@ class ICURuleLoader:
@staticmethod
def _cfg_to_icu_rules(rules, section):
def _cfg_to_icu_rules(rules: Mapping[str, Any], section: str) -> str:
""" Load an ICU ruleset from the given section. If the section is a
simple string, it is interpreted as a file name and the rules are
loaded verbatim from the given file. The filename is expected to be
@@ -150,17 +163,21 @@ class ICURuleLoader:
return ';'.join(flatten_config_list(content, section)) + ';'
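
The two accepted forms of a section, sketched with invented values:

# string value: taken as a file name; the ICU rules are read
# verbatim from that file
section_as_file = {'normalization': 'custom-normalization.rules'}

# list value: flattened and joined into one ';'-separated ruleset,
# here ':: lower ();:: NFC ();'
section_as_list = {'normalization': [':: lower ()', ':: NFC ()']}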
class TokenAnalyzerRule:
class TokenAnalyzerRule(Generic[T_config]):
""" Factory for a single analysis module. The class saves the configuration
and creates a new token analyzer on request.
"""
def __init__(self, rules, normalization_rules):
def __init__(self, rules: Mapping[str, Any], normalization_rules: str) -> None:
# Find the analysis module
module_name = 'nominatim.tokenizer.token_analysis.' \
+ _get_section(rules, 'analyzer').replace('-', '_')
analysis_mod = importlib.import_module(module_name)
self.create = analysis_mod.create
self._analysis_mod: AnalysisModule[T_config] = importlib.import_module(module_name)
# Load the configuration.
self.config = analysis_mod.configure(rules, normalization_rules)
self.config = self._analysis_mod.configure(rules, normalization_rules)
def create(self, normalizer: Any, transliterator: Any) -> Analyser:
""" Create a new analyser instance for the given rule.
"""
return self._analysis_mod.create(normalizer, transliterator, self.config)
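
The Generic parameter ties the configuration value produced by a module's configure() to the one later handed back to its create(). A hypothetical analysis module matching the AnalysisModule protocol (names and config shape invented; here T_config is a plain dict):

from typing import Any, Dict, Mapping

from nominatim.tokenizer.token_analysis.base import Analyser


def configure(rules: Mapping[str, Any], normalization_rules: str) -> Dict[str, Any]:
    """ Extract the static configuration once at load time.
    """
    return {'variants': rules.get('variants', [])}


def create(normalizer: Any, transliterator: Any, config: Dict[str, Any]) -> Analyser:
    """ Build an analyser from the saved configuration.
    """
    # a real module would return its Analyser implementation here
    raise NotImplementedError()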

nominatim/tokenizer/icu_token_analysis.py

@@ -8,15 +8,22 @@
Container class collecting all components required to transform an OSM name
into a Nominatim token.
"""
from typing import Mapping, Optional, TYPE_CHECKING
from icu import Transliterator
from nominatim.tokenizer.token_analysis.base import Analyser
if TYPE_CHECKING:
from typing import Any
from nominatim.tokenizer.icu_rule_loader import TokenAnalyzerRule # pylint: disable=cyclic-import
class ICUTokenAnalysis:
""" Container class collecting the transliterators and token analysis
modules for a single NameAnalyser instance.
"""
def __init__(self, norm_rules, trans_rules, analysis_rules):
def __init__(self, norm_rules: str, trans_rules: str,
analysis_rules: Mapping[Optional[str], 'TokenAnalyzerRule[Any]']):
self.normalizer = Transliterator.createFromRules("icu_normalization",
norm_rules)
trans_rules += ";[:Space:]+ > ' '"
@@ -25,11 +32,11 @@ class ICUTokenAnalysis:
self.search = Transliterator.createFromRules("icu_search",
norm_rules + trans_rules)
self.analysis = {name: arules.create(self.normalizer, self.to_ascii, arules.config)
self.analysis = {name: arules.create(self.normalizer, self.to_ascii)
for name, arules in analysis_rules.items()}
def get_analyzer(self, name):
def get_analyzer(self, name: str) -> Analyser:
""" Return the given named analyzer. If no analyzer with that
name exists, return the default analyzer.
"""