add type annotations to ICU tokenizer helper modules

Sarah Hoffmann
2022-07-13 22:55:40 +02:00
parent 77510f4a3b
commit e37cfc64d2
5 changed files with 73 additions and 34 deletions

View File

@@ -28,7 +28,7 @@ def set_property(conn: Connection, name: str, value: str) -> None:
 
 def get_property(conn: Connection, name: str) -> Optional[str]:
-    """ Return the current value of the given propery or None if the property
+    """ Return the current value of the given property or None if the property
         is not set.
     """
     if not conn.table_exists('nominatim_properties'):

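get_property() now advertises Optional[str], which is what later forces the explicit None handling in ICURuleLoader.load_config_from_db() further down. A minimal sketch of the calling pattern mypy now enforces, assuming the usual connect() context manager from nominatim.db.connection (the helper and property name are hypothetical):

    from typing import Optional

    from nominatim.db.connection import connect
    from nominatim.db.properties import get_property
    from nominatim.errors import UsageError

    def require_property(dsn: str, name: str) -> str:
        """ Hypothetical helper: fetch a property that must be present. """
        with connect(dsn) as conn:
            value: Optional[str] = get_property(conn, name)
        if value is None:
            # without this check, mypy rejects returning 'value' as str
            raise UsageError(f"Database property '{name}' is not set.")
        return value
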
View File

@@ -10,12 +10,13 @@ mainly for documentation purposes.
 """
 from abc import ABC, abstractmethod
 from typing import List, Tuple, Dict, Any
+from pathlib import Path
+
+from typing_extensions import Protocol
 
 from nominatim.config import Configuration
 from nominatim.data.place_info import PlaceInfo
 
+# pylint: disable=unnecessary-pass
 
 class AbstractAnalyzer(ABC):
     """ The analyzer provides the functions for analysing names and building
         the token database.
@@ -230,3 +231,13 @@ class AbstractTokenizer(ABC):
             When used outside the with construct, the caller must ensure to
             call the close() function before destructing the analyzer.
         """
+
+
+class TokenizerModule(Protocol):
+    """ Interface that must be exported by modules that implement their
+        own tokenizer.
+    """
+
+    def create(self, dsn: str, data_dir: Path) -> AbstractTokenizer:
+        """ Factory for new tokenizers.
+        """

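TokenizerModule is a typing_extensions Protocol, so conformance is structural: a tokenizer module does not subclass or register anything, it only has to export a create() of this shape. A sketch of a conforming module, with hypothetical names and the abstract method bodies elided:

    # mytok_tokenizer.py -- hypothetical custom tokenizer module
    from pathlib import Path

    from nominatim.tokenizer.base import AbstractTokenizer

    class MyTokenizer(AbstractTokenizer):
        def __init__(self, dsn: str, data_dir: Path) -> None:
            self.dsn = dsn
            self.data_dir = data_dir

        # ... implementations of the AbstractTokenizer methods go here ...

    def create(dsn: str, data_dir: Path) -> AbstractTokenizer:
        """ Module-level factory matching TokenizerModule.create(). """
        return MyTokenizer(dsn, data_dir)
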
View File

@@ -19,17 +19,20 @@ database.
 A tokenizer usually also includes PHP code for querying. The appropriate PHP
 normalizer module is installed, when the tokenizer is created.
 """
+from typing import Optional
 import logging
 import importlib
 from pathlib import Path
 
-from ..errors import UsageError
-from ..db import properties
-from ..db.connection import connect
+from nominatim.errors import UsageError
+from nominatim.db import properties
+from nominatim.db.connection import connect
+from nominatim.config import Configuration
+from nominatim.tokenizer.base import AbstractTokenizer, TokenizerModule
 
 LOG = logging.getLogger()
 
-def _import_tokenizer(name):
+def _import_tokenizer(name: str) -> TokenizerModule:
     """ Load the tokenizer.py module from project directory.
     """
     src_file = Path(__file__).parent / (name + '_tokenizer.py')
@@ -41,7 +44,8 @@ def _import_tokenizer(name):
     return importlib.import_module('nominatim.tokenizer.' + name + '_tokenizer')
 
 
-def create_tokenizer(config, init_db=True, module_name=None):
+def create_tokenizer(config: Configuration, init_db: bool = True,
+                     module_name: Optional[str] = None) -> AbstractTokenizer:
     """ Create a new tokenizer as defined by the given configuration.
 
         The tokenizer data and code is copied into the 'tokenizer' directory
@@ -70,7 +74,7 @@ def create_tokenizer(config, init_db=True, module_name=None):
     return tokenizer
 
 
-def get_tokenizer_for_db(config):
+def get_tokenizer_for_db(config: Configuration) -> AbstractTokenizer:
     """ Instantiate a tokenizer for an existing database.
 
         The function looks up the appropriate tokenizer in the database

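With both entry points returning AbstractTokenizer, callers are checked against the abstract interface without knowing which tokenizer the project directory provides. A minimal usage sketch, assuming a loaded Configuration:

    from nominatim.config import Configuration
    from nominatim.tokenizer.factory import create_tokenizer, get_tokenizer_for_db

    def example(config: Configuration) -> None:
        # On initial import: set up the tokenizer for the project directory.
        tokenizer = create_tokenizer(config)

        # On later runs: re-instantiate whatever tokenizer the database
        # was created with.
        tokenizer = get_tokenizer_for_db(config)

        # Both values are typed AbstractTokenizer, so the analyzer
        # context-manager use documented in base.py type-checks:
        with tokenizer.name_analyzer() as analyzer:
            pass  # use the AbstractAnalyzer API here
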
View File

@@ -7,16 +7,19 @@
 """
 Helper class to create ICU rules from a configuration file.
 """
+from typing import Mapping, Any, Generic, Dict, Optional
 import importlib
 import io
 import json
 import logging
 
-from nominatim.config import flatten_config_list
+from nominatim.config import flatten_config_list, Configuration
 from nominatim.db.properties import set_property, get_property
+from nominatim.db.connection import Connection
 from nominatim.errors import UsageError
 from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
 from nominatim.tokenizer.icu_token_analysis import ICUTokenAnalysis
+from nominatim.tokenizer.token_analysis.base import AnalysisModule, Analyser, T_config
 import nominatim.data.country_info
 
 LOG = logging.getLogger()
@@ -26,7 +29,7 @@ DBCFG_IMPORT_TRANS_RULES = "tokenizer_import_transliteration"
 DBCFG_IMPORT_ANALYSIS_RULES = "tokenizer_import_analysis_rules"
 
 
-def _get_section(rules, section):
+def _get_section(rules: Mapping[str, Any], section: str) -> Any:
     """ Get the section named 'section' from the rules. If the section does
         not exist, raise a usage error with a meaningful message.
     """
@@ -41,7 +44,7 @@ class ICURuleLoader:
     """ Compiler for ICU rules from a tokenizer configuration file.
     """
 
-    def __init__(self, config):
+    def __init__(self, config: Configuration) -> None:
         rules = config.load_sub_configuration('icu_tokenizer.yaml',
                                               config='TOKENIZER_CONFIG')
@@ -57,17 +60,27 @@ class ICURuleLoader:
 
         self.sanitizer_rules = rules.get('sanitizers', [])
 
-    def load_config_from_db(self, conn):
+    def load_config_from_db(self, conn: Connection) -> None:
         """ Get previously saved parts of the configuration from the
             database.
         """
-        self.normalization_rules = get_property(conn, DBCFG_IMPORT_NORM_RULES)
-        self.transliteration_rules = get_property(conn, DBCFG_IMPORT_TRANS_RULES)
-        self.analysis_rules = json.loads(get_property(conn, DBCFG_IMPORT_ANALYSIS_RULES))
+        rules = get_property(conn, DBCFG_IMPORT_NORM_RULES)
+        if rules is not None:
+            self.normalization_rules = rules
+
+        rules = get_property(conn, DBCFG_IMPORT_TRANS_RULES)
+        if rules is not None:
+            self.transliteration_rules = rules
+
+        rules = get_property(conn, DBCFG_IMPORT_ANALYSIS_RULES)
+        if rules:
+            self.analysis_rules = json.loads(rules)
+        else:
+            self.analysis_rules = []
+
         self._setup_analysis()
 
-    def save_config_to_db(self, conn):
+    def save_config_to_db(self, conn: Connection) -> None:
         """ Save the part of the configuration that cannot be changed into
             the database.
         """
@@ -76,20 +89,20 @@ class ICURuleLoader:
         set_property(conn, DBCFG_IMPORT_ANALYSIS_RULES, json.dumps(self.analysis_rules))
 
 
-    def make_sanitizer(self):
+    def make_sanitizer(self) -> PlaceSanitizer:
         """ Create a place sanitizer from the configured rules.
         """
         return PlaceSanitizer(self.sanitizer_rules)
 
 
-    def make_token_analysis(self):
+    def make_token_analysis(self) -> ICUTokenAnalysis:
         """ Create a token analyser from the reviouly loaded rules.
         """
         return ICUTokenAnalysis(self.normalization_rules,
                                 self.transliteration_rules, self.analysis)
 
 
-    def get_search_rules(self):
+    def get_search_rules(self) -> str:
         """ Return the ICU rules to be used during search.
             The rules combine normalization and transliteration.
         """
@@ -102,22 +115,22 @@ class ICURuleLoader:
 
         return rules.getvalue()
 
-    def get_normalization_rules(self):
+    def get_normalization_rules(self) -> str:
         """ Return rules for normalisation of a term.
         """
         return self.normalization_rules
 
 
-    def get_transliteration_rules(self):
+    def get_transliteration_rules(self) -> str:
         """ Return the rules for converting a string into its asciii representation.
         """
         return self.transliteration_rules
 
 
-    def _setup_analysis(self):
+    def _setup_analysis(self) -> None:
         """ Process the rules used for creating the various token analyzers.
         """
-        self.analysis = {}
+        self.analysis: Dict[Optional[str], TokenAnalyzerRule[Any]] = {}
 
         if not isinstance(self.analysis_rules, list):
             raise UsageError("Configuration section 'token-analysis' must be a list.")
@@ -135,7 +148,7 @@ class ICURuleLoader:
 
 
     @staticmethod
-    def _cfg_to_icu_rules(rules, section):
+    def _cfg_to_icu_rules(rules: Mapping[str, Any], section: str) -> str:
         """ Load an ICU ruleset from the given section. If the section is a
             simple string, it is interpreted as a file name and the rules are
             loaded verbatim from the given file. The filename is expected to be
@@ -150,17 +163,21 @@ class ICURuleLoader:
 
         return ';'.join(flatten_config_list(content, section)) + ';'
 
 
-class TokenAnalyzerRule:
+class TokenAnalyzerRule(Generic[T_config]):
     """ Factory for a single analysis module. The class saves the configuration
         and creates a new token analyzer on request.
     """
 
-    def __init__(self, rules, normalization_rules):
+    def __init__(self, rules: Mapping[str, Any], normalization_rules: str) -> None:
         # Find the analysis module
         module_name = 'nominatim.tokenizer.token_analysis.' \
                       + _get_section(rules, 'analyzer').replace('-', '_')
-        analysis_mod = importlib.import_module(module_name)
-        self.create = analysis_mod.create
+        self._analysis_mod: AnalysisModule[T_config] = importlib.import_module(module_name)
 
         # Load the configuration.
-        self.config = analysis_mod.configure(rules, normalization_rules)
+        self.config = self._analysis_mod.configure(rules, normalization_rules)
+
+    def create(self, normalizer: Any, transliterator: Any) -> Analyser:
+        """ Create a new analyser instance for the given rule.
+        """
+        return self._analysis_mod.create(normalizer, transliterator, self.config)

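Making create() a method keeps the per-rule config private to TokenAnalyzerRule; callers now pass only the two transliterators (see the matching ICUTokenAnalysis change in the next file). For a token-analysis module this means exporting a configure()/create() pair whose config value round-trips through the T_config type variable. A sketch of such a module; the names and the analyser method are illustrative, the real interface being the Analyser protocol in token_analysis/base.py:

    # hypothetical token-analysis module
    from typing import Any, List, Mapping

    class SimpleAnalyser:
        """ Illustrative analyser producing one lower-cased variant. """
        def __init__(self, transliterator: Any, config: Mapping[str, Any]) -> None:
            self.transliterator = transliterator
            self.config = config

        def get_variants_ascii(self, name: str) -> List[str]:  # illustrative method
            return [self.transliterator.transliterate(name).strip().lower()]

    def configure(rules: Mapping[str, Any], normalization_rules: str) -> Mapping[str, Any]:
        """ Pre-digest the rules; the result becomes TokenAnalyzerRule.config. """
        return {'mode': rules.get('mode', 'default')}

    def create(normalizer: Any, transliterator: Any,
               config: Mapping[str, Any]) -> SimpleAnalyser:
        """ The signature TokenAnalyzerRule.create() relies on. """
        return SimpleAnalyser(transliterator, config)
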
View File

@@ -8,15 +8,22 @@
 Container class collecting all components required to transform an OSM name
 into a Nominatim token.
 """
+from typing import Mapping, Optional, TYPE_CHECKING
+
 from icu import Transliterator
 
+from nominatim.tokenizer.token_analysis.base import Analyser
+
+if TYPE_CHECKING:
+    from typing import Any
+    from nominatim.tokenizer.icu_rule_loader import TokenAnalyzerRule  # pylint: disable=cyclic-import
+
 class ICUTokenAnalysis:
     """ Container class collecting the transliterators and token analysis
         modules for a single NameAnalyser instance.
     """
 
-    def __init__(self, norm_rules, trans_rules, analysis_rules):
+    def __init__(self, norm_rules: str, trans_rules: str,
+                 analysis_rules: Mapping[Optional[str], 'TokenAnalyzerRule[Any]']):
         self.normalizer = Transliterator.createFromRules("icu_normalization",
                                                          norm_rules)
         trans_rules += ";[:Space:]+ > ' '"
@@ -25,11 +32,11 @@ class ICUTokenAnalysis:
         self.search = Transliterator.createFromRules("icu_search",
                                                      norm_rules + trans_rules)
 
-        self.analysis = {name: arules.create(self.normalizer, self.to_ascii, arules.config)
+        self.analysis = {name: arules.create(self.normalizer, self.to_ascii)
                          for name, arules in analysis_rules.items()}
 
-    def get_analyzer(self, name):
+    def get_analyzer(self, name: str) -> Analyser:
        """ Return the given named analyzer. If no analyzer with that
            name exists, return the default analyzer.
        """