add type annotations to ICU tokenizer helper modules

Sarah Hoffmann
2022-07-13 22:55:40 +02:00
parent 77510f4a3b
commit e37cfc64d2
5 changed files with 73 additions and 34 deletions

View File

@@ -28,7 +28,7 @@ def set_property(conn: Connection, name: str, value: str) -> None:
 
 def get_property(conn: Connection, name: str) -> Optional[str]:
-    """ Return the current value of the given propery or None if the property
+    """ Return the current value of the given property or None if the property
         is not set.
     """
     if not conn.table_exists('nominatim_properties'):

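get_property() now advertises Optional[str], which is what later forces the explicit None handling in ICURuleLoader.load_config_from_db() further down. A minimal sketch of the calling pattern mypy now enforces, assuming the usual connect() context manager from nominatim.db.connection (the helper and property name are hypothetical):

    from typing import Optional

    from nominatim.db.connection import connect
    from nominatim.db.properties import get_property
    from nominatim.errors import UsageError

    def require_property(dsn: str, name: str) -> str:
        """ Hypothetical helper: fetch a property that must be present. """
        with connect(dsn) as conn:
            value: Optional[str] = get_property(conn, name)
        if value is None:
            # without this check, mypy rejects returning 'value' as str
            raise UsageError(f"Database property '{name}' is not set.")
        return value
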
View File

@@ -10,12 +10,13 @@ mainly for documentation purposes.
 """
 from abc import ABC, abstractmethod
 from typing import List, Tuple, Dict, Any
+from pathlib import Path
+
+from typing_extensions import Protocol
 
 from nominatim.config import Configuration
 from nominatim.data.place_info import PlaceInfo
 
+# pylint: disable=unnecessary-pass
 
 class AbstractAnalyzer(ABC):
     """ The analyzer provides the functions for analysing names and building
         the token database.
@@ -230,3 +231,13 @@ class AbstractTokenizer(ABC):
             When used outside the with construct, the caller must ensure to
             call the close() function before destructing the analyzer.
         """
+
+
+class TokenizerModule(Protocol):
+    """ Interface that must be exported by modules that implement their
+        own tokenizer.
+    """
+
+    def create(self, dsn: str, data_dir: Path) -> AbstractTokenizer:
+        """ Factory for new tokenizers.
+        """

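TokenizerModule is a typing_extensions Protocol, so conformance is structural: a tokenizer module does not subclass or register anything, it only has to export a create() of this shape. A sketch of a conforming module, with hypothetical names and the abstract method bodies elided:

    # mytok_tokenizer.py -- hypothetical custom tokenizer module
    from pathlib import Path

    from nominatim.tokenizer.base import AbstractTokenizer

    class MyTokenizer(AbstractTokenizer):
        def __init__(self, dsn: str, data_dir: Path) -> None:
            self.dsn = dsn
            self.data_dir = data_dir

        # ... implementations of the AbstractTokenizer methods go here ...

    def create(dsn: str, data_dir: Path) -> AbstractTokenizer:
        """ Module-level factory matching TokenizerModule.create(). """
        return MyTokenizer(dsn, data_dir)
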
View File

@@ -19,17 +19,20 @@ database.
 A tokenizer usually also includes PHP code for querying. The appropriate PHP
 normalizer module is installed, when the tokenizer is created.
 """
+from typing import Optional
 import logging
 import importlib
 from pathlib import Path
 
-from ..errors import UsageError
-from ..db import properties
-from ..db.connection import connect
+from nominatim.errors import UsageError
+from nominatim.db import properties
+from nominatim.db.connection import connect
+from nominatim.config import Configuration
+from nominatim.tokenizer.base import AbstractTokenizer, TokenizerModule
 
 LOG = logging.getLogger()
 
-def _import_tokenizer(name):
+def _import_tokenizer(name: str) -> TokenizerModule:
     """ Load the tokenizer.py module from project directory.
     """
     src_file = Path(__file__).parent / (name + '_tokenizer.py')
@@ -41,7 +44,8 @@ def _import_tokenizer(name):
     return importlib.import_module('nominatim.tokenizer.' + name + '_tokenizer')
 
 
-def create_tokenizer(config, init_db=True, module_name=None):
+def create_tokenizer(config: Configuration, init_db: bool = True,
+                     module_name: Optional[str] = None) -> AbstractTokenizer:
     """ Create a new tokenizer as defined by the given configuration.
 
         The tokenizer data and code is copied into the 'tokenizer' directory
@@ -70,7 +74,7 @@ def create_tokenizer(config, init_db=True, module_name=None):
     return tokenizer
 
 
-def get_tokenizer_for_db(config):
+def get_tokenizer_for_db(config: Configuration) -> AbstractTokenizer:
     """ Instantiate a tokenizer for an existing database.
 
         The function looks up the appropriate tokenizer in the database

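With both entry points returning AbstractTokenizer, callers are checked against the abstract interface without knowing which tokenizer the project directory provides. A minimal usage sketch, assuming a loaded Configuration:

    from nominatim.config import Configuration
    from nominatim.tokenizer.factory import create_tokenizer, get_tokenizer_for_db

    def example(config: Configuration) -> None:
        # On initial import: set up the tokenizer for the project directory.
        tokenizer = create_tokenizer(config)

        # On later runs: re-instantiate whatever tokenizer the database
        # was created with.
        tokenizer = get_tokenizer_for_db(config)

        # Both values are typed AbstractTokenizer, so the analyzer
        # context-manager use documented in base.py type-checks:
        with tokenizer.name_analyzer() as analyzer:
            pass  # use the AbstractAnalyzer API here
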
View File

@@ -7,16 +7,19 @@
 """
 Helper class to create ICU rules from a configuration file.
 """
+from typing import Mapping, Any, Generic, Dict, Optional
 import importlib
 import io
 import json
 import logging
 
-from nominatim.config import flatten_config_list
+from nominatim.config import flatten_config_list, Configuration
 from nominatim.db.properties import set_property, get_property
+from nominatim.db.connection import Connection
 from nominatim.errors import UsageError
 from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
 from nominatim.tokenizer.icu_token_analysis import ICUTokenAnalysis
+from nominatim.tokenizer.token_analysis.base import AnalysisModule, Analyser, T_config
 import nominatim.data.country_info
 
 LOG = logging.getLogger()
@@ -26,7 +29,7 @@ DBCFG_IMPORT_TRANS_RULES = "tokenizer_import_transliteration"
 DBCFG_IMPORT_ANALYSIS_RULES = "tokenizer_import_analysis_rules"
 
 
-def _get_section(rules, section):
+def _get_section(rules: Mapping[str, Any], section: str) -> Any:
     """ Get the section named 'section' from the rules. If the section does
         not exist, raise a usage error with a meaningful message.
     """
@@ -41,7 +44,7 @@ class ICURuleLoader:
     """ Compiler for ICU rules from a tokenizer configuration file.
     """
 
-    def __init__(self, config):
+    def __init__(self, config: Configuration) -> None:
         rules = config.load_sub_configuration('icu_tokenizer.yaml',
                                               config='TOKENIZER_CONFIG')
@@ -57,17 +60,27 @@ class ICURuleLoader:
 
         self.sanitizer_rules = rules.get('sanitizers', [])
 
-    def load_config_from_db(self, conn):
+    def load_config_from_db(self, conn: Connection) -> None:
         """ Get previously saved parts of the configuration from the
             database.
         """
-        self.normalization_rules = get_property(conn, DBCFG_IMPORT_NORM_RULES)
-        self.transliteration_rules = get_property(conn, DBCFG_IMPORT_TRANS_RULES)
-        self.analysis_rules = json.loads(get_property(conn, DBCFG_IMPORT_ANALYSIS_RULES))
+        rules = get_property(conn, DBCFG_IMPORT_NORM_RULES)
+        if rules is not None:
+            self.normalization_rules = rules
+
+        rules = get_property(conn, DBCFG_IMPORT_TRANS_RULES)
+        if rules is not None:
+            self.transliteration_rules = rules
+
+        rules = get_property(conn, DBCFG_IMPORT_ANALYSIS_RULES)
+        if rules:
+            self.analysis_rules = json.loads(rules)
+        else:
+            self.analysis_rules = []
+
         self._setup_analysis()
 
-    def save_config_to_db(self, conn):
+    def save_config_to_db(self, conn: Connection) -> None:
         """ Save the part of the configuration that cannot be changed into
             the database.
         """
@@ -76,20 +89,20 @@ class ICURuleLoader:
         set_property(conn, DBCFG_IMPORT_ANALYSIS_RULES, json.dumps(self.analysis_rules))
 
 
-    def make_sanitizer(self):
+    def make_sanitizer(self) -> PlaceSanitizer:
         """ Create a place sanitizer from the configured rules.
         """
         return PlaceSanitizer(self.sanitizer_rules)
 
 
-    def make_token_analysis(self):
+    def make_token_analysis(self) -> ICUTokenAnalysis:
         """ Create a token analyser from the reviouly loaded rules.
         """
         return ICUTokenAnalysis(self.normalization_rules,
                                 self.transliteration_rules, self.analysis)
 
 
-    def get_search_rules(self):
+    def get_search_rules(self) -> str:
         """ Return the ICU rules to be used during search.
             The rules combine normalization and transliteration.
         """
@@ -102,22 +115,22 @@ class ICURuleLoader:
 
         return rules.getvalue()
 
-    def get_normalization_rules(self):
+    def get_normalization_rules(self) -> str:
         """ Return rules for normalisation of a term.
         """
         return self.normalization_rules
 
 
-    def get_transliteration_rules(self):
+    def get_transliteration_rules(self) -> str:
         """ Return the rules for converting a string into its asciii representation.
         """
         return self.transliteration_rules
 
 
-    def _setup_analysis(self):
+    def _setup_analysis(self) -> None:
         """ Process the rules used for creating the various token analyzers.
         """
-        self.analysis = {}
+        self.analysis: Dict[Optional[str], TokenAnalyzerRule[Any]] = {}
 
         if not isinstance(self.analysis_rules, list):
             raise UsageError("Configuration section 'token-analysis' must be a list.")
@@ -135,7 +148,7 @@ class ICURuleLoader:
 
 
     @staticmethod
-    def _cfg_to_icu_rules(rules, section):
+    def _cfg_to_icu_rules(rules: Mapping[str, Any], section: str) -> str:
         """ Load an ICU ruleset from the given section. If the section is a
             simple string, it is interpreted as a file name and the rules are
             loaded verbatim from the given file. The filename is expected to be
@@ -150,17 +163,21 @@ class ICURuleLoader:
 
         return ';'.join(flatten_config_list(content, section)) + ';'
 
 
-class TokenAnalyzerRule:
+class TokenAnalyzerRule(Generic[T_config]):
     """ Factory for a single analysis module. The class saves the configuration
         and creates a new token analyzer on request.
     """
 
-    def __init__(self, rules, normalization_rules):
+    def __init__(self, rules: Mapping[str, Any], normalization_rules: str) -> None:
         # Find the analysis module
         module_name = 'nominatim.tokenizer.token_analysis.' \
                       + _get_section(rules, 'analyzer').replace('-', '_')
-        analysis_mod = importlib.import_module(module_name)
-        self.create = analysis_mod.create
+        self._analysis_mod: AnalysisModule[T_config] = importlib.import_module(module_name)
 
         # Load the configuration.
-        self.config = analysis_mod.configure(rules, normalization_rules)
+        self.config = self._analysis_mod.configure(rules, normalization_rules)
+
+    def create(self, normalizer: Any, transliterator: Any) -> Analyser:
+        """ Create a new analyser instance for the given rule.
+        """
+        return self._analysis_mod.create(normalizer, transliterator, self.config)

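Making create() a method keeps the per-rule config private to TokenAnalyzerRule; callers now pass only the two transliterators (see the matching ICUTokenAnalysis change in the next file). For a token-analysis module this means exporting a configure()/create() pair whose config value round-trips through the T_config type variable. A sketch of such a module; the names and the analyser method are illustrative, the real interface being the Analyser protocol in token_analysis/base.py:

    # hypothetical token-analysis module
    from typing import Any, List, Mapping

    class SimpleAnalyser:
        """ Illustrative analyser producing one lower-cased variant. """
        def __init__(self, transliterator: Any, config: Mapping[str, Any]) -> None:
            self.transliterator = transliterator
            self.config = config

        def get_variants_ascii(self, name: str) -> List[str]:  # illustrative method
            return [self.transliterator.transliterate(name).strip().lower()]

    def configure(rules: Mapping[str, Any], normalization_rules: str) -> Mapping[str, Any]:
        """ Pre-digest the rules; the result becomes TokenAnalyzerRule.config. """
        return {'mode': rules.get('mode', 'default')}

    def create(normalizer: Any, transliterator: Any,
               config: Mapping[str, Any]) -> SimpleAnalyser:
        """ The signature TokenAnalyzerRule.create() relies on. """
        return SimpleAnalyser(transliterator, config)
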
View File

@@ -8,15 +8,22 @@
 Container class collecting all components required to transform an OSM name
 into a Nominatim token.
 """
+from typing import Mapping, Optional, TYPE_CHECKING
+
 from icu import Transliterator
 
+from nominatim.tokenizer.token_analysis.base import Analyser
+
+if TYPE_CHECKING:
+    from typing import Any
+    from nominatim.tokenizer.icu_rule_loader import TokenAnalyzerRule  # pylint: disable=cyclic-import
+
 class ICUTokenAnalysis:
     """ Container class collecting the transliterators and token analysis
         modules for a single NameAnalyser instance.
     """
 
-    def __init__(self, norm_rules, trans_rules, analysis_rules):
+    def __init__(self, norm_rules: str, trans_rules: str,
+                 analysis_rules: Mapping[Optional[str], 'TokenAnalyzerRule[Any]']):
         self.normalizer = Transliterator.createFromRules("icu_normalization",
                                                          norm_rules)
         trans_rules += ";[:Space:]+ > ' '"
@@ -25,11 +32,11 @@ class ICUTokenAnalysis:
         self.search = Transliterator.createFromRules("icu_search",
                                                      norm_rules + trans_rules)
 
-        self.analysis = {name: arules.create(self.normalizer, self.to_ascii, arules.config)
+        self.analysis = {name: arules.create(self.normalizer, self.to_ascii)
                          for name, arules in analysis_rules.items()}
 
-    def get_analyzer(self, name):
+    def get_analyzer(self, name: str) -> Analyser:
        """ Return the given named analyzer. If no analyzer with that
            name exists, return the default analyzer.
        """