add type annotations for legacy tokenizer

This commit is contained in:
Sarah Hoffmann
2022-07-15 22:52:26 +02:00
parent e37cfc64d2
commit 18b16e06ca
3 changed files with 83 additions and 62 deletions

View File

@@ -7,7 +7,7 @@
""" """
Specialised connection and cursor functions. Specialised connection and cursor functions.
""" """
from typing import List, Optional, Any, Callable, ContextManager, Dict, cast, overload, Tuple from typing import Optional, Any, Callable, ContextManager, Dict, cast, overload, Tuple, Iterable
import contextlib import contextlib
import logging import logging
import os import os
@@ -36,7 +36,7 @@ class Cursor(psycopg2.extras.DictCursor):
super().execute(query, args) super().execute(query, args)
def execute_values(self, sql: Query, argslist: List[Any], def execute_values(self, sql: Query, argslist: Iterable[Tuple[Any, ...]],
template: Optional[str] = None) -> None: template: Optional[str] = None) -> None:
""" Wrapper for the psycopg2 convenience function to execute """ Wrapper for the psycopg2 convenience function to execute
SQL for a list of values. SQL for a list of values.

View File

@@ -9,7 +9,7 @@ Abstract class definitions for tokenizers. These base classes are here
mainly for documentation purposes. mainly for documentation purposes.
""" """
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from typing import List, Tuple, Dict, Any from typing import List, Tuple, Dict, Any, Optional
from pathlib import Path from pathlib import Path
from typing_extensions import Protocol from typing_extensions import Protocol
@@ -187,7 +187,7 @@ class AbstractTokenizer(ABC):
@abstractmethod @abstractmethod
def check_database(self, config: Configuration) -> str: def check_database(self, config: Configuration) -> Optional[str]:
""" Check that the database is set up correctly and ready for being """ Check that the database is set up correctly and ready for being
queried. queried.

View File

@@ -7,8 +7,10 @@
""" """
Tokenizer implementing normalisation as used before Nominatim 4. Tokenizer implementing normalisation as used before Nominatim 4.
""" """
from typing import Optional, Sequence, List, Tuple, Mapping, Any, Callable, cast, Dict, Set
from collections import OrderedDict from collections import OrderedDict
import logging import logging
from pathlib import Path
import re import re
import shutil import shutil
from textwrap import dedent from textwrap import dedent
@@ -17,10 +19,12 @@ from icu import Transliterator
import psycopg2 import psycopg2
import psycopg2.extras import psycopg2.extras
from nominatim.db.connection import connect from nominatim.db.connection import connect, Connection
from nominatim.config import Configuration
from nominatim.db import properties from nominatim.db import properties
from nominatim.db import utils as db_utils from nominatim.db import utils as db_utils
from nominatim.db.sql_preprocessor import SQLPreprocessor from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.data.place_info import PlaceInfo
from nominatim.errors import UsageError from nominatim.errors import UsageError
from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
@@ -29,13 +33,13 @@ DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
LOG = logging.getLogger() LOG = logging.getLogger()
def create(dsn, data_dir): def create(dsn: str, data_dir: Path) -> 'LegacyTokenizer':
""" Create a new instance of the tokenizer provided by this module. """ Create a new instance of the tokenizer provided by this module.
""" """
return LegacyTokenizer(dsn, data_dir) return LegacyTokenizer(dsn, data_dir)
def _install_module(config_module_path, src_dir, module_dir): def _install_module(config_module_path: str, src_dir: Path, module_dir: Path) -> str:
""" Copies the PostgreSQL normalisation module into the project """ Copies the PostgreSQL normalisation module into the project
directory if necessary. For historical reasons the module is directory if necessary. For historical reasons the module is
saved in the '/module' subdirectory and not with the other tokenizer saved in the '/module' subdirectory and not with the other tokenizer
@@ -52,7 +56,7 @@ def _install_module(config_module_path, src_dir, module_dir):
# Compatibility mode for builddir installations. # Compatibility mode for builddir installations.
if module_dir.exists() and src_dir.samefile(module_dir): if module_dir.exists() and src_dir.samefile(module_dir):
LOG.info('Running from build directory. Leaving database module as is.') LOG.info('Running from build directory. Leaving database module as is.')
return module_dir return str(module_dir)
# In any other case install the module in the project directory. # In any other case install the module in the project directory.
if not module_dir.exists(): if not module_dir.exists():
@@ -64,10 +68,10 @@ def _install_module(config_module_path, src_dir, module_dir):
LOG.info('Database module installed at %s', str(destfile)) LOG.info('Database module installed at %s', str(destfile))
return module_dir return str(module_dir)
def _check_module(module_dir, conn): def _check_module(module_dir: str, conn: Connection) -> None:
""" Try to use the PostgreSQL module to confirm that it is correctly """ Try to use the PostgreSQL module to confirm that it is correctly
installed and accessible from PostgreSQL. installed and accessible from PostgreSQL.
""" """
@@ -89,13 +93,13 @@ class LegacyTokenizer(AbstractTokenizer):
calls to the database. calls to the database.
""" """
def __init__(self, dsn, data_dir): def __init__(self, dsn: str, data_dir: Path) -> None:
self.dsn = dsn self.dsn = dsn
self.data_dir = data_dir self.data_dir = data_dir
self.normalization = None self.normalization: Optional[str] = None
def init_new_db(self, config, init_db=True): def init_new_db(self, config: Configuration, init_db: bool = True) -> None:
""" Set up a new tokenizer for the database. """ Set up a new tokenizer for the database.
This copies all necessary data in the project directory to make This copies all necessary data in the project directory to make
@@ -119,7 +123,7 @@ class LegacyTokenizer(AbstractTokenizer):
self._init_db_tables(config) self._init_db_tables(config)
def init_from_project(self, config): def init_from_project(self, config: Configuration) -> None:
""" Initialise the tokenizer from the project directory. """ Initialise the tokenizer from the project directory.
""" """
with connect(self.dsn) as conn: with connect(self.dsn) as conn:
@@ -132,7 +136,7 @@ class LegacyTokenizer(AbstractTokenizer):
self._install_php(config, overwrite=False) self._install_php(config, overwrite=False)
def finalize_import(self, config): def finalize_import(self, config: Configuration) -> None:
""" Do any required postprocessing to make the tokenizer data ready """ Do any required postprocessing to make the tokenizer data ready
for use. for use.
""" """
@@ -141,7 +145,7 @@ class LegacyTokenizer(AbstractTokenizer):
sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql') sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')
def update_sql_functions(self, config): def update_sql_functions(self, config: Configuration) -> None:
""" Reimport the SQL functions for this tokenizer. """ Reimport the SQL functions for this tokenizer.
""" """
with connect(self.dsn) as conn: with connect(self.dsn) as conn:
@@ -154,7 +158,7 @@ class LegacyTokenizer(AbstractTokenizer):
modulepath=modulepath) modulepath=modulepath)
def check_database(self, _): def check_database(self, _: Configuration) -> Optional[str]:
""" Check that the tokenizer is set up correctly. """ Check that the tokenizer is set up correctly.
""" """
hint = """\ hint = """\
@@ -181,7 +185,7 @@ class LegacyTokenizer(AbstractTokenizer):
return None return None
def migrate_database(self, config): def migrate_database(self, config: Configuration) -> None:
""" Initialise the project directory of an existing database for """ Initialise the project directory of an existing database for
use with this tokenizer. use with this tokenizer.
@@ -198,7 +202,7 @@ class LegacyTokenizer(AbstractTokenizer):
self._save_config(conn, config) self._save_config(conn, config)
def update_statistics(self): def update_statistics(self) -> None:
""" Recompute the frequency of full words. """ Recompute the frequency of full words.
""" """
with connect(self.dsn) as conn: with connect(self.dsn) as conn:
@@ -218,13 +222,13 @@ class LegacyTokenizer(AbstractTokenizer):
conn.commit() conn.commit()
def update_word_tokens(self): def update_word_tokens(self) -> None:
""" No house-keeping implemented for the legacy tokenizer. """ No house-keeping implemented for the legacy tokenizer.
""" """
LOG.info("No tokenizer clean-up available.") LOG.info("No tokenizer clean-up available.")
def name_analyzer(self): def name_analyzer(self) -> 'LegacyNameAnalyzer':
""" Create a new analyzer for tokenizing names and queries """ Create a new analyzer for tokenizing names and queries
using this tokenizer. Analyzers are context managers and should using this tokenizer. Analyzers are context managers and should
be used accordingly: be used accordingly:
@@ -244,7 +248,7 @@ class LegacyTokenizer(AbstractTokenizer):
return LegacyNameAnalyzer(self.dsn, normalizer) return LegacyNameAnalyzer(self.dsn, normalizer)
def _install_php(self, config, overwrite=True): def _install_php(self, config: Configuration, overwrite: bool = True) -> None:
""" Install the php script for the tokenizer. """ Install the php script for the tokenizer.
""" """
php_file = self.data_dir / "tokenizer.php" php_file = self.data_dir / "tokenizer.php"
@@ -258,7 +262,7 @@ class LegacyTokenizer(AbstractTokenizer):
"""), encoding='utf-8') """), encoding='utf-8')
def _init_db_tables(self, config): def _init_db_tables(self, config: Configuration) -> None:
""" Set up the word table and fill it with pre-computed word """ Set up the word table and fill it with pre-computed word
frequencies. frequencies.
""" """
@@ -271,10 +275,12 @@ class LegacyTokenizer(AbstractTokenizer):
db_utils.execute_file(self.dsn, config.lib_dir.data / 'words.sql') db_utils.execute_file(self.dsn, config.lib_dir.data / 'words.sql')
def _save_config(self, conn, config): def _save_config(self, conn: Connection, config: Configuration) -> None:
""" Save the configuration that needs to remain stable for the given """ Save the configuration that needs to remain stable for the given
database as database properties. database as database properties.
""" """
assert self.normalization is not None
properties.set_property(conn, DBCFG_NORMALIZATION, self.normalization) properties.set_property(conn, DBCFG_NORMALIZATION, self.normalization)
properties.set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY) properties.set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
@@ -287,8 +293,8 @@ class LegacyNameAnalyzer(AbstractAnalyzer):
normalization. normalization.
""" """
def __init__(self, dsn, normalizer): def __init__(self, dsn: str, normalizer: Any):
self.conn = connect(dsn).connection self.conn: Optional[Connection] = connect(dsn).connection
self.conn.autocommit = True self.conn.autocommit = True
self.normalizer = normalizer self.normalizer = normalizer
psycopg2.extras.register_hstore(self.conn) psycopg2.extras.register_hstore(self.conn)
@@ -296,7 +302,7 @@ class LegacyNameAnalyzer(AbstractAnalyzer):
self._cache = _TokenCache(self.conn) self._cache = _TokenCache(self.conn)
def close(self): def close(self) -> None:
""" Free all resources used by the analyzer. """ Free all resources used by the analyzer.
""" """
if self.conn: if self.conn:
@@ -304,7 +310,7 @@ class LegacyNameAnalyzer(AbstractAnalyzer):
self.conn = None self.conn = None
def get_word_token_info(self, words): def get_word_token_info(self, words: Sequence[str]) -> List[Tuple[str, str, int]]:
""" Return token information for the given list of words. """ Return token information for the given list of words.
If a word starts with # it is assumed to be a full name If a word starts with # it is assumed to be a full name
otherwise is a partial name. otherwise is a partial name.
@@ -315,6 +321,7 @@ class LegacyNameAnalyzer(AbstractAnalyzer):
The function is used for testing and debugging only The function is used for testing and debugging only
and not necessarily efficient. and not necessarily efficient.
""" """
assert self.conn is not None
with self.conn.cursor() as cur: with self.conn.cursor() as cur:
cur.execute("""SELECT t.term, word_token, word_id cur.execute("""SELECT t.term, word_token, word_id
FROM word, (SELECT unnest(%s::TEXT[]) as term) t FROM word, (SELECT unnest(%s::TEXT[]) as term) t
@@ -330,14 +337,14 @@ class LegacyNameAnalyzer(AbstractAnalyzer):
return [(r[0], r[1], r[2]) for r in cur] return [(r[0], r[1], r[2]) for r in cur]
def normalize(self, phrase): def normalize(self, phrase: str) -> str:
""" Normalize the given phrase, i.e. remove all properties that """ Normalize the given phrase, i.e. remove all properties that
are irrelevant for search. are irrelevant for search.
""" """
return self.normalizer.transliterate(phrase) return cast(str, self.normalizer.transliterate(phrase))
def normalize_postcode(self, postcode): def normalize_postcode(self, postcode: str) -> str:
""" Convert the postcode to a standardized form. """ Convert the postcode to a standardized form.
This function must yield exactly the same result as the SQL function This function must yield exactly the same result as the SQL function
@@ -346,10 +353,12 @@ class LegacyNameAnalyzer(AbstractAnalyzer):
return postcode.strip().upper() return postcode.strip().upper()
def update_postcodes_from_db(self): def update_postcodes_from_db(self) -> None:
""" Update postcode tokens in the word table from the location_postcode """ Update postcode tokens in the word table from the location_postcode
table. table.
""" """
assert self.conn is not None
with self.conn.cursor() as cur: with self.conn.cursor() as cur:
# This finds us the rows in location_postcode and word that are # This finds us the rows in location_postcode and word that are
# missing in the other table. # missing in the other table.
@@ -383,9 +392,12 @@ class LegacyNameAnalyzer(AbstractAnalyzer):
def update_special_phrases(self, phrases, should_replace): def update_special_phrases(self, phrases: Sequence[Tuple[str, str, str, str]],
should_replace: bool) -> None:
""" Replace the search index for special phrases with the new phrases. """ Replace the search index for special phrases with the new phrases.
""" """
assert self.conn is not None
norm_phrases = set(((self.normalize(p[0]), p[1], p[2], p[3]) norm_phrases = set(((self.normalize(p[0]), p[1], p[2], p[3])
for p in phrases)) for p in phrases))
@@ -422,9 +434,11 @@ class LegacyNameAnalyzer(AbstractAnalyzer):
len(norm_phrases), len(to_add), len(to_delete)) len(norm_phrases), len(to_add), len(to_delete))
def add_country_names(self, country_code, names): def add_country_names(self, country_code: str, names: Mapping[str, str]) -> None:
""" Add names for the given country to the search index. """ Add names for the given country to the search index.
""" """
assert self.conn is not None
with self.conn.cursor() as cur: with self.conn.cursor() as cur:
cur.execute( cur.execute(
"""INSERT INTO word (word_id, word_token, country_code) """INSERT INTO word (word_id, word_token, country_code)
@@ -436,12 +450,14 @@ class LegacyNameAnalyzer(AbstractAnalyzer):
""", (country_code, list(names.values()), country_code)) """, (country_code, list(names.values()), country_code))
def process_place(self, place): def process_place(self, place: PlaceInfo) -> Mapping[str, Any]:
""" Determine tokenizer information about the given place. """ Determine tokenizer information about the given place.
Returns a JSON-serialisable structure that will be handed into Returns a JSON-serialisable structure that will be handed into
the database via the token_info field. the database via the token_info field.
""" """
assert self.conn is not None
token_info = _TokenInfo(self._cache) token_info = _TokenInfo(self._cache)
names = place.name names = place.name
@@ -450,6 +466,7 @@ class LegacyNameAnalyzer(AbstractAnalyzer):
token_info.add_names(self.conn, names) token_info.add_names(self.conn, names)
if place.is_country(): if place.is_country():
assert place.country_code is not None
self.add_country_names(place.country_code, names) self.add_country_names(place.country_code, names)
address = place.address address = place.address
@@ -459,7 +476,8 @@ class LegacyNameAnalyzer(AbstractAnalyzer):
return token_info.data return token_info.data
def _process_place_address(self, token_info, address): def _process_place_address(self, token_info: '_TokenInfo', address: Mapping[str, str]) -> None:
assert self.conn is not None
hnrs = [] hnrs = []
addr_terms = [] addr_terms = []
@@ -491,12 +509,12 @@ class LegacyNameAnalyzer(AbstractAnalyzer):
class _TokenInfo: class _TokenInfo:
""" Collect token information to be sent back to the database. """ Collect token information to be sent back to the database.
""" """
def __init__(self, cache): def __init__(self, cache: '_TokenCache') -> None:
self.cache = cache self.cache = cache
self.data = {} self.data: Dict[str, Any] = {}
def add_names(self, conn, names): def add_names(self, conn: Connection, names: Mapping[str, str]) -> None:
""" Add token information for the names of the place. """ Add token information for the names of the place.
""" """
with conn.cursor() as cur: with conn.cursor() as cur:
@@ -505,7 +523,7 @@ class _TokenInfo:
(names, )) (names, ))
def add_housenumbers(self, conn, hnrs): def add_housenumbers(self, conn: Connection, hnrs: Sequence[str]) -> None:
""" Extract housenumber information from the address. """ Extract housenumber information from the address.
""" """
if len(hnrs) == 1: if len(hnrs) == 1:
@@ -516,7 +534,7 @@ class _TokenInfo:
return return
# split numbers if necessary # split numbers if necessary
simple_list = [] simple_list: List[str] = []
for hnr in hnrs: for hnr in hnrs:
simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr))) simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))
@@ -525,49 +543,53 @@ class _TokenInfo:
with conn.cursor() as cur: with conn.cursor() as cur:
cur.execute("SELECT * FROM create_housenumbers(%s)", (simple_list, )) cur.execute("SELECT * FROM create_housenumbers(%s)", (simple_list, ))
self.data['hnr_tokens'], self.data['hnr'] = cur.fetchone() self.data['hnr_tokens'], self.data['hnr'] = \
cur.fetchone() # type: ignore[no-untyped-call]
def set_postcode(self, postcode): def set_postcode(self, postcode: str) -> None:
""" Set or replace the postcode token with the given value. """ Set or replace the postcode token with the given value.
""" """
self.data['postcode'] = postcode self.data['postcode'] = postcode
def add_street(self, conn, street): def add_street(self, conn: Connection, street: str) -> None:
""" Add addr:street match terms. """ Add addr:street match terms.
""" """
def _get_street(name): def _get_street(name: str) -> List[int]:
with conn.cursor() as cur: with conn.cursor() as cur:
return cur.scalar("SELECT word_ids_from_name(%s)::text", (name, )) return cast(List[int],
cur.scalar("SELECT word_ids_from_name(%s)::text", (name, )))
tokens = self.cache.streets.get(street, _get_street) tokens = self.cache.streets.get(street, _get_street)
if tokens: if tokens:
self.data['street'] = tokens self.data['street'] = tokens
def add_place(self, conn, place): def add_place(self, conn: Connection, place: str) -> None:
""" Add addr:place search and match terms. """ Add addr:place search and match terms.
""" """
def _get_place(name): def _get_place(name: str) -> Tuple[List[int], List[int]]:
with conn.cursor() as cur: with conn.cursor() as cur:
cur.execute("""SELECT make_keywords(hstore('name' , %s))::text, cur.execute("""SELECT make_keywords(hstore('name' , %s))::text,
word_ids_from_name(%s)::text""", word_ids_from_name(%s)::text""",
(name, name)) (name, name))
return cur.fetchone() return cast(Tuple[List[int], List[int]],
cur.fetchone()) # type: ignore[no-untyped-call]
self.data['place_search'], self.data['place_match'] = \ self.data['place_search'], self.data['place_match'] = \
self.cache.places.get(place, _get_place) self.cache.places.get(place, _get_place)
def add_address_terms(self, conn, terms): def add_address_terms(self, conn: Connection, terms: Sequence[Tuple[str, str]]) -> None:
""" Add additional address terms. """ Add additional address terms.
""" """
def _get_address_term(name): def _get_address_term(name: str) -> Tuple[List[int], List[int]]:
with conn.cursor() as cur: with conn.cursor() as cur:
cur.execute("""SELECT addr_ids_from_name(%s)::text, cur.execute("""SELECT addr_ids_from_name(%s)::text,
word_ids_from_name(%s)::text""", word_ids_from_name(%s)::text""",
(name, name)) (name, name))
return cur.fetchone() return cast(Tuple[List[int], List[int]],
cur.fetchone()) # type: ignore[no-untyped-call]
tokens = {} tokens = {}
for key, value in terms: for key, value in terms:
@@ -584,13 +606,12 @@ class _LRU:
produce the item when there is a cache miss. produce the item when there is a cache miss.
""" """
def __init__(self, maxsize=128, init_data=None): def __init__(self, maxsize: int = 128):
self.data = init_data or OrderedDict() self.data: 'OrderedDict[str, Any]' = OrderedDict()
self.maxsize = maxsize self.maxsize = maxsize
if init_data is not None and len(init_data) > maxsize:
self.maxsize = len(init_data)
def get(self, key, generator):
def get(self, key: str, generator: Callable[[str], Any]) -> Any:
""" Get the item with the given key from the cache. If nothing """ Get the item with the given key from the cache. If nothing
is found in the cache, generate the value through the is found in the cache, generate the value through the
generator function and store it in the cache. generator function and store it in the cache.
@@ -613,7 +634,7 @@ class _TokenCache:
This cache is not thread-safe and needs to be instantiated per This cache is not thread-safe and needs to be instantiated per
analyzer. analyzer.
""" """
def __init__(self, conn): def __init__(self, conn: Connection):
# various LRU caches # various LRU caches
self.streets = _LRU(maxsize=256) self.streets = _LRU(maxsize=256)
self.places = _LRU(maxsize=128) self.places = _LRU(maxsize=128)
@@ -623,18 +644,18 @@ class _TokenCache:
with conn.cursor() as cur: with conn.cursor() as cur:
cur.execute("""SELECT i, ARRAY[getorcreate_housenumber_id(i::text)]::text cur.execute("""SELECT i, ARRAY[getorcreate_housenumber_id(i::text)]::text
FROM generate_series(1, 100) as i""") FROM generate_series(1, 100) as i""")
self._cached_housenumbers = {str(r[0]): r[1] for r in cur} self._cached_housenumbers: Dict[str, str] = {str(r[0]): r[1] for r in cur}
# For postcodes remember the ones that have already been added # For postcodes remember the ones that have already been added
self.postcodes = set() self.postcodes: Set[str] = set()
def get_housenumber(self, number): def get_housenumber(self, number: str) -> Optional[str]:
""" Get a housenumber token from the cache. """ Get a housenumber token from the cache.
""" """
return self._cached_housenumbers.get(number) return self._cached_housenumbers.get(number)
def add_postcode(self, conn, postcode): def add_postcode(self, conn: Connection, postcode: str) -> None:
""" Make sure the given postcode is in the database. """ Make sure the given postcode is in the database.
""" """
if postcode not in self.postcodes: if postcode not in self.postcodes: