fix style issues found by flake8

2026-02-26 11:08:13 +00:00 · 2024-11-10 22:47:14 +01:00
parent 8c14df55a6
commit 1f07967787
112 changed files with 656 additions and 1109 deletions
--- a/src/nominatim_db/tokenizer/base.py
+++ b/src/nominatim_db/tokenizer/base.py
@@ -17,6 +17,7 @@ from ..config import Configuration
 from ..db.connection import Connection
 from ..data.place_info import PlaceInfo

+
 class AbstractAnalyzer(ABC):
    """ The analyzer provides the functions for analysing names and building
        the token database.
@@ -28,17 +29,14 @@ class AbstractAnalyzer(ABC):
    def __enter__(self) -> 'AbstractAnalyzer':
        return self

-
    def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None:
        self.close()

-
    @abstractmethod
    def close(self) -> None:
        """ Free all resources used by the analyzer.
        """

-
    @abstractmethod
    def get_word_token_info(self, words: List[str]) -> List[Tuple[str, str, int]]:
        """ Return token information for the given list of words.
@@ -57,7 +55,6 @@ class AbstractAnalyzer(ABC):
                    (original word, word token, word id).
        """

-
    @abstractmethod
    def normalize_postcode(self, postcode: str) -> str:
        """ Convert the postcode to its standardized form.
@@ -72,14 +69,12 @@ class AbstractAnalyzer(ABC):
                The given postcode after normalization.
        """

-
    @abstractmethod
    def update_postcodes_from_db(self) -> None:
        """ Update the tokenizer's postcode tokens from the current content
            of the `location_postcode` table.
        """

-
    @abstractmethod
    def update_special_phrases(self,
                               phrases: Iterable[Tuple[str, str, str, str]],
@@ -95,7 +90,6 @@ class AbstractAnalyzer(ABC):
                                ones that already exist.
        """

-
    @abstractmethod
    def add_country_names(self, country_code: str, names: Dict[str, str]) -> None:
        """ Add the given names to the tokenizer's list of country tokens.
@@ -106,7 +100,6 @@ class AbstractAnalyzer(ABC):
                names: Dictionary of name type to name.
        """

-
    @abstractmethod
    def process_place(self, place: PlaceInfo) -> Any:
        """ Extract tokens for the given place and compute the
@@ -122,7 +115,6 @@ class AbstractAnalyzer(ABC):
        """


-
 class AbstractTokenizer(ABC):
    """ The tokenizer instance is the central instance of the tokenizer in
        the system. There will only be a single instance of the tokenizer
@@ -146,7 +138,6 @@ class AbstractTokenizer(ABC):
                tokenizers.
        """

-
    @abstractmethod
    def init_from_project(self, config: Configuration) -> None:
        """ Initialise the tokenizer from an existing database setup.
@@ -158,7 +149,6 @@ class AbstractTokenizer(ABC):
              config: Read-only object with configuration options.
        """

-
    @abstractmethod
    def finalize_import(self, config: Configuration) -> None:
        """ This function is called at the very end of an import when all
@@ -170,7 +160,6 @@ class AbstractTokenizer(ABC):
              config: Read-only object with configuration options.
        """

-
    @abstractmethod
    def update_sql_functions(self, config: Configuration) -> None:
        """ Update the SQL part of the tokenizer. This function is called
@@ -184,7 +173,6 @@ class AbstractTokenizer(ABC):
              config: Read-only object with configuration options.
        """

-
    @abstractmethod
    def check_database(self, config: Configuration) -> Optional[str]:
        """ Check that the database is set up correctly and ready for being
@@ -199,7 +187,6 @@ class AbstractTokenizer(ABC):
                  how to resolve the issue. If everything is okay, return `None`.
        """

-
    @abstractmethod
    def update_statistics(self, config: Configuration, threads: int = 1) -> None:
        """ Recompute any tokenizer statistics necessary for efficient lookup.
@@ -208,14 +195,12 @@ class AbstractTokenizer(ABC):
            it to be called in order to work.
        """

-
    @abstractmethod
    def update_word_tokens(self) -> None:
        """ Do house-keeping on the tokenizer's internal data structures.
            Remove unused word tokens, resort data etc.
        """

-
    @abstractmethod
    def name_analyzer(self) -> AbstractAnalyzer:
        """ Create a new analyzer for tokenizing names and queries
@@ -231,7 +216,6 @@ class AbstractTokenizer(ABC):
            call the close() function before destructing the analyzer.
        """

-
    @abstractmethod
    def most_frequent_words(self, conn: Connection, num: int) -> List[str]:
        """ Return a list of the most frequent full words in the database.
--- a/src/nominatim_db/tokenizer/factory.py
+++ b/src/nominatim_db/tokenizer/factory.py
@@ -29,6 +29,7 @@ from ..tokenizer.base import AbstractTokenizer, TokenizerModule

 LOG = logging.getLogger()

+
 def _import_tokenizer(name: str) -> TokenizerModule:
    """ Load the tokenizer.py module from project directory.
    """
--- a/src/nominatim_db/tokenizer/icu_rule_loader.py
+++ b/src/nominatim_db/tokenizer/icu_rule_loader.py
@@ -61,7 +61,6 @@ class ICURuleLoader:
        # Load optional sanitizer rule set.
        self.sanitizer_rules = rules.get('sanitizers', [])

-
    def load_config_from_db(self, conn: Connection) -> None:
        """ Get previously saved parts of the configuration from the
            database.
@@ -81,7 +80,6 @@ class ICURuleLoader:
            self.analysis_rules = []
        self._setup_analysis()

-
    def save_config_to_db(self, conn: Connection) -> None:
        """ Save the part of the configuration that cannot be changed into
            the database.
@@ -90,20 +88,17 @@ class ICURuleLoader:
        set_property(conn, DBCFG_IMPORT_TRANS_RULES, self.transliteration_rules)
        set_property(conn, DBCFG_IMPORT_ANALYSIS_RULES, json.dumps(self.analysis_rules))

-
    def make_sanitizer(self) -> PlaceSanitizer:
        """ Create a place sanitizer from the configured rules.
        """
        return PlaceSanitizer(self.sanitizer_rules, self.config)

-
    def make_token_analysis(self) -> ICUTokenAnalysis:
        """ Create a token analyser from the previously loaded rules.
        """
        return ICUTokenAnalysis(self.normalization_rules,
                                self.transliteration_rules, self.analysis)

-
    def get_search_rules(self) -> str:
        """ Return the ICU rules to be used during search.
            The rules combine normalization and transliteration.
@@ -116,23 +111,20 @@ class ICURuleLoader:
        rules.write(self.transliteration_rules)
        return rules.getvalue()

-
    def get_normalization_rules(self) -> str:
        """ Return rules for normalisation of a term.
        """
        return self.normalization_rules

-
    def get_transliteration_rules(self) -> str:
        """ Return the rules for converting a string into its ASCII representation.
        """
        return self.transliteration_rules

-
    def _setup_analysis(self) -> None:
        """ Process the rules used for creating the various token analyzers.
        """
-        self.analysis: Dict[Optional[str], TokenAnalyzerRule]  = {}
+        self.analysis: Dict[Optional[str], TokenAnalyzerRule] = {}

        if not isinstance(self.analysis_rules, list):
            raise UsageError("Configuration section 'token-analysis' must be a list.")
@@ -140,7 +132,7 @@ class ICURuleLoader:
        norm = Transliterator.createFromRules("rule_loader_normalization",
                                              self.normalization_rules)
        trans = Transliterator.createFromRules("rule_loader_transliteration",
-                                              self.transliteration_rules)
+                                               self.transliteration_rules)

        for section in self.analysis_rules:
            name = section.get('id', None)
@@ -154,7 +146,6 @@ class ICURuleLoader:
            self.analysis[name] = TokenAnalyzerRule(section, norm, trans,
                                                    self.config)

-
    @staticmethod
    def _cfg_to_icu_rules(rules: Mapping[str, Any], section: str) -> str:
        """ Load an ICU ruleset from the given section. If the section is a
@@ -189,7 +180,6 @@ class TokenAnalyzerRule:
        self.config = self._analysis_mod.configure(rules, normalizer,
                                                   transliterator)

-
    def create(self, normalizer: Any, transliterator: Any) -> Analyzer:
        """ Create a new analyser instance for the given rule.
        """
--- a/src/nominatim_db/tokenizer/icu_token_analysis.py
+++ b/src/nominatim_db/tokenizer/icu_token_analysis.py
@@ -14,8 +14,9 @@ from icu import Transliterator
 from .token_analysis.base import Analyzer

 if TYPE_CHECKING:
-    from typing import Any
-    from .icu_rule_loader import TokenAnalyzerRule # pylint: disable=cyclic-import
+    from typing import Any  # noqa
+    from .icu_rule_loader import TokenAnalyzerRule
+

 class ICUTokenAnalysis:
    """ Container class collecting the transliterators and token analysis
@@ -35,7 +36,6 @@ class ICUTokenAnalysis:
        self.analysis = {name: arules.create(self.normalizer, self.to_ascii)
                         for name, arules in analysis_rules.items()}

-
    def get_analyzer(self, name: Optional[str]) -> Analyzer:
        """ Return the given named analyzer. If no analyzer with that
            name exists, return the default analyzer.
--- a/src/nominatim_db/tokenizer/icu_tokenizer.py
+++ b/src/nominatim_db/tokenizer/icu_tokenizer.py
@@ -17,7 +17,7 @@ from pathlib import Path
 from psycopg.types.json import Jsonb
 from psycopg import sql as pysql

-from ..db.connection import connect, Connection, Cursor, server_version_tuple,\
+from ..db.connection import connect, Connection, Cursor, server_version_tuple, \
                            drop_tables, table_exists, execute_scalar
 from ..config import Configuration
 from ..db.sql_preprocessor import SQLPreprocessor
@@ -32,10 +32,11 @@ DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"

 LOG = logging.getLogger()

-WORD_TYPES =(('country_names', 'C'),
-             ('postcodes', 'P'),
-             ('full_word', 'W'),
-             ('housenumbers', 'H'))
+WORD_TYPES = (('country_names', 'C'),
+              ('postcodes', 'P'),
+              ('full_word', 'W'),
+              ('housenumbers', 'H'))
+

 def create(dsn: str, data_dir: Path) -> 'ICUTokenizer':
    """ Create a new instance of the tokenizer provided by this module.
@@ -54,7 +55,6 @@ class ICUTokenizer(AbstractTokenizer):
        self.data_dir = data_dir
        self.loader: Optional[ICURuleLoader] = None

-
    def init_new_db(self, config: Configuration, init_db: bool = True) -> None:
        """ Set up a new tokenizer for the database.

@@ -70,7 +70,6 @@ class ICUTokenizer(AbstractTokenizer):
            self._setup_db_tables(config)
            self._create_base_indices(config, 'word')

-
    def init_from_project(self, config: Configuration) -> None:
        """ Initialise the tokenizer from the project directory.
        """
@@ -79,14 +78,12 @@ class ICUTokenizer(AbstractTokenizer):
        with connect(self.dsn) as conn:
            self.loader.load_config_from_db(conn)

-
    def finalize_import(self, config: Configuration) -> None:
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """
        self._create_lookup_indices(config, 'word')

-
    def update_sql_functions(self, config: Configuration) -> None:
        """ Reimport the SQL functions for this tokenizer.
        """
@@ -94,14 +91,12 @@ class ICUTokenizer(AbstractTokenizer):
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')

-
    def check_database(self, config: Configuration) -> None:
        """ Check that the tokenizer is set up correctly.
        """
        # Will throw an error if there is an issue.
        self.init_from_project(config)

-
    def update_statistics(self, config: Configuration, threads: int = 2) -> None:
        """ Recompute frequencies for all name words.
        """
@@ -126,28 +121,29 @@ class ICUTokenizer(AbstractTokenizer):
                                     SELECT unnest(nameaddress_vector) as id, count(*)
                                     FROM search_name GROUP BY id""")
                    cur.execute('CREATE INDEX ON addressword_frequencies(id)')
-                    cur.execute("""CREATE OR REPLACE FUNCTION word_freq_update(wid INTEGER,
-                                                                               INOUT info JSONB)
-                                   AS $$
-                                   DECLARE rec RECORD;
-                                   BEGIN
-                                   IF info is null THEN
-                                     info = '{}'::jsonb;
-                                   END IF;
-                                   FOR rec IN SELECT count FROM word_frequencies WHERE id = wid
-                                   LOOP
-                                     info = info || jsonb_build_object('count', rec.count);
-                                   END LOOP;
-                                   FOR rec IN SELECT count FROM addressword_frequencies WHERE id = wid
-                                   LOOP
-                                     info = info || jsonb_build_object('addr_count', rec.count);
-                                   END LOOP;
-                                   IF info = '{}'::jsonb THEN
-                                     info = null;
-                                   END IF;
-                                   END;
-                                   $$ LANGUAGE plpgsql IMMUTABLE;
-                                """)
+                    cur.execute("""
+                        CREATE OR REPLACE FUNCTION word_freq_update(wid INTEGER,
+                                                                    INOUT info JSONB)
+                        AS $$
+                        DECLARE rec RECORD;
+                        BEGIN
+                        IF info is null THEN
+                          info = '{}'::jsonb;
+                        END IF;
+                        FOR rec IN SELECT count FROM word_frequencies WHERE id = wid
+                        LOOP
+                          info = info || jsonb_build_object('count', rec.count);
+                        END LOOP;
+                        FOR rec IN SELECT count FROM addressword_frequencies WHERE id = wid
+                        LOOP
+                          info = info || jsonb_build_object('addr_count', rec.count);
+                        END LOOP;
+                        IF info = '{}'::jsonb THEN
+                          info = null;
+                        END IF;
+                        END;
+                        $$ LANGUAGE plpgsql IMMUTABLE;
+                        """)
                    LOG.info('Update word table with recomputed frequencies')
                    drop_tables(conn, 'tmp_word')
                    cur.execute("""CREATE TABLE tmp_word AS
@@ -200,8 +196,6 @@ class ICUTokenizer(AbstractTokenizer):
        self._create_lookup_indices(config, 'tmp_word')
        self._move_temporary_word_table('tmp_word')

-
-
    def _cleanup_housenumbers(self) -> None:
        """ Remove unused house numbers.
        """
@@ -235,8 +229,6 @@ class ICUTokenizer(AbstractTokenizer):
                                (list(candidates.values()), ))
                conn.commit()

-
-
    def update_word_tokens(self) -> None:
        """ Remove unused tokens.
        """
@@ -244,7 +236,6 @@ class ICUTokenizer(AbstractTokenizer):
        self._cleanup_housenumbers()
        LOG.warning("Tokenizer house-keeping done.")

-
    def name_analyzer(self) -> 'ICUNameAnalyzer':
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
@@ -264,7 +255,6 @@ class ICUTokenizer(AbstractTokenizer):
        return ICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
                               self.loader.make_token_analysis())

-
    def most_frequent_words(self, conn: Connection, num: int) -> List[str]:
        """ Return a list of the `num` most frequent full words
            in the database.
@@ -276,7 +266,6 @@ class ICUTokenizer(AbstractTokenizer):
                             ORDER BY count DESC LIMIT %s""", (num,))
            return list(s[0].split('@')[0] for s in cur)

-
    def _save_config(self) -> None:
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
@@ -285,7 +274,6 @@ class ICUTokenizer(AbstractTokenizer):
        with connect(self.dsn) as conn:
            self.loader.save_config_to_db(conn)

-
    def _setup_db_tables(self, config: Configuration) -> None:
        """ Set up the word table and fill it with pre-computed word
            frequencies.
@@ -309,7 +297,6 @@ class ICUTokenizer(AbstractTokenizer):
            """)
            conn.commit()

-
    def _create_base_indices(self, config: Configuration, table_name: str) -> None:
        """ Set up the word table and fill it with pre-computed word
            frequencies.
@@ -330,21 +317,21 @@ class ICUTokenizer(AbstractTokenizer):
                                column_type=ctype)
            conn.commit()

-
    def _create_lookup_indices(self, config: Configuration, table_name: str) -> None:
        """ Create additional indexes used when running the API.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            # Index required for details lookup.
-            sqlp.run_string(conn, """
+            sqlp.run_string(
+                conn,
+                """
                CREATE INDEX IF NOT EXISTS idx_{{table_name}}_word_id
                  ON {{table_name}} USING BTREE (word_id) {{db.tablespace.search_index}}
-            """,
-            table_name=table_name)
+                """,
+                table_name=table_name)
            conn.commit()

-
    def _move_temporary_word_table(self, old: str) -> None:
        """ Rename all tables and indexes used by the tokenizer.
        """
@@ -361,8 +348,6 @@ class ICUTokenizer(AbstractTokenizer):
            conn.commit()


-
-
 class ICUNameAnalyzer(AbstractAnalyzer):
    """ The ICU analyzer uses the ICU library for splitting names.

@@ -379,7 +364,6 @@ class ICUNameAnalyzer(AbstractAnalyzer):

        self._cache = _TokenCache()

-
    def close(self) -> None:
        """ Free all resources used by the analyzer.
        """
@@ -387,20 +371,17 @@ class ICUNameAnalyzer(AbstractAnalyzer):
            self.conn.close()
            self.conn = None

-
    def _search_normalized(self, name: str) -> str:
        """ Return the search token transliteration of the given name.
        """
        return cast(str, self.token_analysis.search.transliterate(name)).strip()

-
    def _normalized(self, name: str) -> str:
        """ Return the normalized version of the given name with all
            non-relevant information removed.
        """
        return cast(str, self.token_analysis.normalizer.transliterate(name)).strip()

-
    def get_word_token_info(self, words: Sequence[str]) -> List[Tuple[str, str, int]]:
        """ Return token information for the given list of words.
            If a word starts with # it is assumed to be a full name
@@ -432,8 +413,7 @@ class ICUNameAnalyzer(AbstractAnalyzer):
            part_ids = {r[0]: r[1] for r in cur}

        return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
-               + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]
-
+            + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]

    def normalize_postcode(self, postcode: str) -> str:
        """ Convert the postcode to a standardized form.
@@ -443,7 +423,6 @@ class ICUNameAnalyzer(AbstractAnalyzer):
        """
        return postcode.strip().upper()

-
    def update_postcodes_from_db(self) -> None:
        """ Update postcode tokens in the word table from the location_postcode
            table.
@@ -516,9 +495,6 @@ class ICUNameAnalyzer(AbstractAnalyzer):
            with self.conn.cursor() as cur:
                cur.executemany("""SELECT create_postcode_word(%s, %s)""", terms)

-
-
-
    def update_special_phrases(self, phrases: Iterable[Tuple[str, str, str, str]],
                               should_replace: bool) -> None:
        """ Replace the search index for special phrases with the new phrases.
@@ -548,7 +524,6 @@ class ICUNameAnalyzer(AbstractAnalyzer):
        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), added, deleted)

-
    def _add_special_phrases(self, cursor: Cursor,
                             new_phrases: Set[Tuple[str, str, str, str]],
                             existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
@@ -568,10 +543,9 @@ class ICUNameAnalyzer(AbstractAnalyzer):

        return added

-
    def _remove_special_phrases(self, cursor: Cursor,
-                             new_phrases: Set[Tuple[str, str, str, str]],
-                             existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
+                                new_phrases: Set[Tuple[str, str, str, str]],
+                                existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
        """ Remove all phrases from the database that are no longer in the
            new phrase list.
        """
@@ -587,7 +561,6 @@ class ICUNameAnalyzer(AbstractAnalyzer):

        return len(to_delete)

-
    def add_country_names(self, country_code: str, names: Mapping[str, str]) -> None:
        """ Add default names for the given country to the search index.
        """
@@ -599,7 +572,6 @@ class ICUNameAnalyzer(AbstractAnalyzer):
                                     self.sanitizer.process_names(info)[0],
                                     internal=True)

-
    def _add_country_full_names(self, country_code: str, names: Sequence[PlaceName],
                                internal: bool = False) -> None:
        """ Add names for the given country from an already sanitized
@@ -651,7 +623,6 @@ class ICUNameAnalyzer(AbstractAnalyzer):
                          """
                cur.execute(sql, (country_code, list(new_tokens)))

-
    def process_place(self, place: PlaceInfo) -> Mapping[str, Any]:
        """ Determine tokenizer information about the given place.

@@ -674,7 +645,6 @@ class ICUNameAnalyzer(AbstractAnalyzer):

        return token_info.to_dict()

-
    def _process_place_address(self, token_info: '_TokenInfo',
                               address: Sequence[PlaceName]) -> None:
        for item in address:
@@ -687,12 +657,11 @@ class ICUNameAnalyzer(AbstractAnalyzer):
            elif item.kind == 'place':
                if not item.suffix:
                    token_info.add_place(itertools.chain(*self._compute_name_tokens([item])))
-            elif not item.kind.startswith('_') and not item.suffix and \
-                 item.kind not in ('country', 'full', 'inclusion'):
+            elif (not item.kind.startswith('_') and not item.suffix and
+                  item.kind not in ('country', 'full', 'inclusion')):
                token_info.add_address_term(item.kind,
                                            itertools.chain(*self._compute_name_tokens([item])))

-
    def _compute_housenumber_token(self, hnr: PlaceName) -> Tuple[Optional[int], Optional[str]]:
        """ Normalize the housenumber and return the word token and the
            canonical form.
@@ -728,7 +697,6 @@ class ICUNameAnalyzer(AbstractAnalyzer):

        return result

-
    def _retrieve_full_tokens(self, name: str) -> List[int]:
        """ Get the full name token for the given name, if it exists.
            The name is only retrieved for the standard analyser.
@@ -749,7 +717,6 @@ class ICUNameAnalyzer(AbstractAnalyzer):

        return full

-
    def _compute_name_tokens(self, names: Sequence[PlaceName]) -> Tuple[Set[int], Set[int]]:
        """ Computes the full name and partial name tokens for the given
            dictionary of names.
@@ -787,7 +754,6 @@ class ICUNameAnalyzer(AbstractAnalyzer):

        return full_tokens, partial_tokens

-
    def _add_postcode(self, item: PlaceName) -> Optional[str]:
        """ Make sure the normalized postcode is present in the word table.
        """
@@ -835,11 +801,9 @@ class _TokenInfo:
        self.address_tokens: Dict[str, str] = {}
        self.postcode: Optional[str] = None

-
    def _mk_array(self, tokens: Iterable[Any]) -> str:
        return f"{{{','.join((str(s) for s in tokens))}}}"

-
    def to_dict(self) -> Dict[str, Any]:
        """ Return the token information in database importable format.
        """
@@ -866,13 +830,11 @@ class _TokenInfo:

        return out

-
    def set_names(self, fulls: Iterable[int], partials: Iterable[int]) -> None:
        """ Adds token information for the normalised names.
        """
        self.names = self._mk_array(itertools.chain(fulls, partials))

-
    def add_housenumber(self, token: Optional[int], hnr: Optional[str]) -> None:
        """ Extract housenumber information from a list of normalised
            housenumbers.
@@ -882,7 +844,6 @@ class _TokenInfo:
            self.housenumbers.add(hnr)
            self.housenumber_tokens.add(token)

-
    def add_street(self, tokens: Iterable[int]) -> None:
        """ Add addr:street match terms.
        """
@@ -890,13 +851,11 @@ class _TokenInfo:
            self.street_tokens = set()
        self.street_tokens.update(tokens)

-
    def add_place(self, tokens: Iterable[int]) -> None:
        """ Add addr:place search and match terms.
        """
        self.place_tokens.update(tokens)

-
    def add_address_term(self, key: str, partials: Iterable[int]) -> None:
        """ Add additional address terms.
        """
--- a/src/nominatim_db/tokenizer/place_sanitizer.py
+++ b/src/nominatim_db/tokenizer/place_sanitizer.py
@@ -39,7 +39,6 @@ class PlaceSanitizer:

                self.handlers.append(module.create(SanitizerConfig(func)))

-
    def process_names(self, place: PlaceInfo) -> Tuple[List[PlaceName], List[PlaceName]]:
        """ Extract a sanitized list of names and address parts from the
            given place. The function returns a tuple
--- a/src/nominatim_db/tokenizer/sanitizers/base.py
+++ b/src/nominatim_db/tokenizer/sanitizers/base.py
@@ -27,7 +27,6 @@ class ProcessInfo:
        self.names = self._convert_name_dict(place.name)
        self.address = self._convert_name_dict(place.address)

-
    @staticmethod
    def _convert_name_dict(names: Optional[Mapping[str, str]]) -> List[PlaceName]:
        """ Convert a dictionary of names into a list of PlaceNames.
--- a/src/nominatim_db/tokenizer/sanitizers/clean_housenumbers.py
+++ b/src/nominatim_db/tokenizer/sanitizers/clean_housenumbers.py
@@ -30,6 +30,7 @@ from ...data.place_name import PlaceName
 from .base import ProcessInfo
 from .config import SanitizerConfig

+
 class _HousenumberSanitizer:

    def __init__(self, config: SanitizerConfig) -> None:
@@ -38,7 +39,6 @@ class _HousenumberSanitizer:

        self.filter_name = config.get_filter('convert-to-name', 'FAIL_ALL')

-
    def __call__(self, obj: ProcessInfo) -> None:
        if not obj.address:
            return
@@ -57,7 +57,6 @@ class _HousenumberSanitizer:

        obj.address = new_address

-
    def sanitize(self, value: str) -> Iterator[str]:
        """ Extract housenumbers in a regularized format from an OSM value.

@@ -68,7 +67,6 @@ class _HousenumberSanitizer:
            if hnr:
                yield from self._regularize(hnr)

-
    def _regularize(self, hnr: str) -> Iterator[str]:
        yield hnr

--- a/src/nominatim_db/tokenizer/sanitizers/clean_postcodes.py
+++ b/src/nominatim_db/tokenizer/sanitizers/clean_postcodes.py
@@ -26,6 +26,7 @@ from ...data.postcode_format import PostcodeFormatter
 from .base import ProcessInfo
 from .config import SanitizerConfig

+
 class _PostcodeSanitizer:

    def __init__(self, config: SanitizerConfig) -> None:
@@ -36,7 +37,6 @@ class _PostcodeSanitizer:
        if default_pattern is not None and isinstance(default_pattern, str):
            self.matcher.set_default_pattern(default_pattern)

-
    def __call__(self, obj: ProcessInfo) -> None:
        if not obj.address:
            return
@@ -55,7 +55,6 @@ class _PostcodeSanitizer:
                postcode.name = formatted[0]
                postcode.set_attr('variant', formatted[1])

-
    def scan(self, postcode: str, country: Optional[str]) -> Optional[Tuple[str, str]]:
        """ Check the postcode for correct formatting and return the
            normalized version. Returns None if the postcode does not
@@ -67,10 +66,8 @@ class _PostcodeSanitizer:

        assert country is not None

-        return self.matcher.normalize(country, match),\
-               ' '.join(filter(lambda p: p is not None, match.groups()))
-
-
+        return self.matcher.normalize(country, match), \
+            ' '.join(filter(lambda p: p is not None, match.groups()))


 def create(config: SanitizerConfig) -> Callable[[ProcessInfo], None]:
--- a/src/nominatim_db/tokenizer/sanitizers/clean_tiger_tags.py
+++ b/src/nominatim_db/tokenizer/sanitizers/clean_tiger_tags.py
@@ -19,6 +19,7 @@ from .config import SanitizerConfig

 COUNTY_MATCH = re.compile('(.*), [A-Z][A-Z]')

+
 def _clean_tiger_county(obj: ProcessInfo) -> None:
    """ Remove the state reference from tiger:county tags.

--- a/src/nominatim_db/tokenizer/sanitizers/config.py
+++ b/src/nominatim_db/tokenizer/sanitizers/config.py
@@ -20,6 +20,7 @@ if TYPE_CHECKING:
 else:
    _BaseUserDict = UserDict

+
 class SanitizerConfig(_BaseUserDict):
    """ The `SanitizerConfig` class is a read-only dictionary
        with configuration options for the sanitizer.
@@ -61,7 +62,6 @@ class SanitizerConfig(_BaseUserDict):

        return values

-
    def get_bool(self, param: str, default: Optional[bool] = None) -> bool:
        """ Extract a configuration parameter as a boolean.

@@ -82,7 +82,6 @@ class SanitizerConfig(_BaseUserDict):

        return value

-
    def get_delimiter(self, default: str = ',;') -> Pattern[str]:
        """ Return the 'delimiters' parameter in the configuration as a
            compiled regular expression that can be used to split strings on
@@ -105,7 +104,6 @@ class SanitizerConfig(_BaseUserDict):

        return re.compile('\\s*[{}]+\\s*'.format(''.join('\\' + d for d in delimiter_set)))

-
    def get_filter(self, param: str, default: Union[str, Sequence[str]] = 'PASS_ALL'
                   ) -> Callable[[str], bool]:
        """ Returns a filter function for the given parameter of the sanitizer
--- a/src/nominatim_db/tokenizer/sanitizers/delete_tags.py
+++ b/src/nominatim_db/tokenizer/sanitizers/delete_tags.py
@@ -60,6 +60,7 @@ from ...data.place_name import PlaceName
 from .base import ProcessInfo
 from .config import SanitizerConfig

+
 class _TagSanitizer:

    def __init__(self, config: SanitizerConfig) -> None:
@@ -74,7 +75,6 @@ class _TagSanitizer:

        self.has_country_code = config.get('country_code', None) is not None

-
    def __call__(self, obj: ProcessInfo) -> None:
        tags = obj.names if self.type == 'name' else obj.address

@@ -93,13 +93,11 @@ class _TagSanitizer:
               or not self.filter_name(tag.name):
                filtered_tags.append(tag)

-
        if self.type == 'name':
            obj.names = filtered_tags
        else:
            obj.address = filtered_tags

-
    def _set_allowed_ranks(self, ranks: Sequence[str]) -> Tuple[bool, ...]:
        """ Returns a tuple of 31 boolean values corresponding to the
            address ranks 0-30. Value at index 'i' is True if rank 'i'
@@ -117,7 +115,6 @@ class _TagSanitizer:
            for i in range(start, end + 1):
                allowed_ranks[i] = True

-
        return tuple(allowed_ranks)


--- a/src/nominatim_db/tokenizer/sanitizers/split_name_list.py
+++ b/src/nominatim_db/tokenizer/sanitizers/split_name_list.py
@@ -16,6 +16,7 @@ from typing import Callable
 from .base import ProcessInfo
 from .config import SanitizerConfig

+
 def create(config: SanitizerConfig) -> Callable[[ProcessInfo], None]:
    """ Create a name processing function that splits name values with
        multiple values into their components.
--- a/src/nominatim_db/tokenizer/sanitizers/tag_analyzer_by_language.py
+++ b/src/nominatim_db/tokenizer/sanitizers/tag_analyzer_by_language.py
@@ -36,6 +36,7 @@ from ...data import country_info
 from .base import ProcessInfo
 from .config import SanitizerConfig

+
 class _AnalyzerByLanguage:
    """ Processor for tagging the language of names in a place.
    """
@@ -47,7 +48,6 @@ class _AnalyzerByLanguage:

        self._compute_default_languages(config.get('use-defaults', 'no'))

-
    def _compute_default_languages(self, use_defaults: str) -> None:
        self.deflangs: Dict[Optional[str], List[str]] = {}

@@ -55,18 +55,16 @@ class _AnalyzerByLanguage:
            for ccode, clangs in country_info.iterate('languages'):
                if len(clangs) == 1 or use_defaults == 'all':
                    if self.whitelist:
-                        self.deflangs[ccode] = [l for l in clangs if l in self.whitelist]
+                        self.deflangs[ccode] = [cl for cl in clangs if cl in self.whitelist]
                    else:
                        self.deflangs[ccode] = clangs

-
    def _suffix_matches(self, suffix: str) -> bool:
        if self.whitelist is None:
            return len(suffix) in (2, 3) and suffix.islower()

        return suffix in self.whitelist

-
    def __call__(self, obj: ProcessInfo) -> None:
        if not obj.names:
            return
@@ -80,14 +78,13 @@ class _AnalyzerByLanguage:
            else:
                langs = self.deflangs.get(obj.place.country_code)

-
            if langs:
                if self.replace:
                    name.set_attr('analyzer', langs[0])
                else:
                    more_names.append(name.clone(attr={'analyzer': langs[0]}))

-                more_names.extend(name.clone(attr={'analyzer': l}) for l in langs[1:])
+                more_names.extend(name.clone(attr={'analyzer': lg}) for lg in langs[1:])

        obj.names.extend(more_names)

--- a/src/nominatim_db/tokenizer/sanitizers/tag_japanese.py
+++ b/src/nominatim_db/tokenizer/sanitizers/tag_japanese.py
@@ -18,11 +18,13 @@ from .base import ProcessInfo
 from .config import SanitizerConfig
 from ...data.place_name import PlaceName

+
 def create(_: SanitizerConfig) -> Callable[[ProcessInfo], None]:
    """Set up the sanitizer
    """
    return tag_japanese

+
 def reconbine_housenumber(
    new_address: List[PlaceName],
    tmp_housenumber: Optional[str],
@@ -56,6 +58,7 @@ def reconbine_housenumber(
        )
    return new_address

+
 def reconbine_place(
    new_address: List[PlaceName],
    tmp_neighbourhood: Optional[str],
@@ -88,6 +91,8 @@ def reconbine_place(
            )
        )
    return new_address
+
+
 def tag_japanese(obj: ProcessInfo) -> None:
    """Recombine kind of address
    """
--- a/src/nominatim_db/tokenizer/token_analysis/base.py
+++ b/src/nominatim_db/tokenizer/token_analysis/base.py
@@ -12,6 +12,7 @@ from typing import Mapping, List, Any
 from ...typing import Protocol
 from ...data.place_name import PlaceName

+
 class Analyzer(Protocol):
    """ The `create()` function of an analysis module needs to return an
        object that implements the following functions.
--- a/src/nominatim_db/tokenizer/token_analysis/config_variants.py
+++ b/src/nominatim_db/tokenizer/token_analysis/config_variants.py
@@ -15,6 +15,7 @@ import re
 from ...config import flatten_config_list
 from ...errors import UsageError

+
 class ICUVariant(NamedTuple):
    """ A single replacement rule for variant creation.
    """
@@ -64,7 +65,6 @@ class _VariantMaker:
    def __init__(self, normalizer: Any) -> None:
        self.norm = normalizer

-
    def compute(self, rule: Any) -> Iterator[ICUVariant]:
        """ Generator for all ICUVariant tuples from a single variant rule.
        """
@@ -88,7 +88,6 @@ class _VariantMaker:
                for froms, tos in _create_variants(*src, repl, decompose):
                    yield ICUVariant(froms, tos)

-
    def _parse_variant_word(self, name: str) -> Optional[Tuple[str, str, str]]:
        name = name.strip()
        match = re.fullmatch(r'([~^]?)([^~$^]*)([~$]?)', name)
--- a/src/nominatim_db/tokenizer/token_analysis/generic.py
+++ b/src/nominatim_db/tokenizer/token_analysis/generic.py
@@ -17,7 +17,8 @@ from ...data.place_name import PlaceName
 from .config_variants import get_variant_config
 from .generic_mutation import MutationVariantGenerator

-### Configuration section
+# Configuration section
+

 def configure(rules: Mapping[str, Any], normalizer: Any, _: Any) -> Dict[str, Any]:
    """ Extract and preprocess the configuration for this module.
@@ -47,7 +48,7 @@ def configure(rules: Mapping[str, Any], normalizer: Any, _: Any) -> Dict[str, An
    return config


-### Analysis section
+# Analysis section

 def create(normalizer: Any, transliterator: Any,
           config: Mapping[str, Any]) -> 'GenericTokenAnalysis':
@@ -77,14 +78,12 @@ class GenericTokenAnalysis:
        # set up mutation rules
        self.mutations = [MutationVariantGenerator(*cfg) for cfg in config['mutations']]

-
    def get_canonical_id(self, name: PlaceName) -> str:
        """ Return the normalized form of the name. This is the standard form
            from which possible variants for the name can be derived.
        """
        return cast(str, self.norm.transliterate(name.name)).strip()

-
    def compute_variants(self, norm_name: str) -> List[str]:
        """ Compute the spelling variants for the given normalized name
            and transliterate the result.
@@ -96,7 +95,6 @@ class GenericTokenAnalysis:

        return [name for name in self._transliterate_unique_list(norm_name, variants) if name]

-
    def _transliterate_unique_list(self, norm_name: str,
                                   iterable: Iterable[str]) -> Iterator[Optional[str]]:
        seen = set()
@@ -108,7 +106,6 @@ class GenericTokenAnalysis:
                seen.add(variant)
                yield self.to_ascii.transliterate(variant).strip()

-
    def _generate_word_variants(self, norm_name: str) -> Iterable[str]:
        baseform = '^ ' + norm_name + ' ^'
        baselen = len(baseform)
--- a/src/nominatim_db/tokenizer/token_analysis/generic_mutation.py
+++ b/src/nominatim_db/tokenizer/token_analysis/generic_mutation.py
@@ -16,6 +16,7 @@ from ...errors import UsageError

 LOG = logging.getLogger()

+
 def _zigzag(outer: Iterable[str], inner: Iterable[str]) -> Iterator[str]:
    return itertools.chain.from_iterable(itertools.zip_longest(outer, inner, fillvalue=''))

@@ -36,7 +37,6 @@ class MutationVariantGenerator:
                      "This is not allowed.", pattern)
            raise UsageError("Bad mutation pattern in configuration.")

-
    def generate(self, names: Iterable[str]) -> Iterator[str]:
        """ Generator function for the name variants. 'names' is an iterable
            over a set of names for which the variants are to be generated.
@@ -49,7 +49,6 @@ class MutationVariantGenerator:
                for seps in self._fillers(len(parts)):
                    yield ''.join(_zigzag(parts, seps))

-
    def _fillers(self, num_parts: int) -> Iterator[Tuple[str, ...]]:
        """ Returns a generator for strings to join the given number of string
            parts in all possible combinations.
--- a/src/nominatim_db/tokenizer/token_analysis/housenumbers.py
+++ b/src/nominatim_db/tokenizer/token_analysis/housenumbers.py
@@ -19,16 +19,18 @@ RE_DIGIT_ALPHA = re.compile(r'(\d)\s*([^\d\s␣])')
 RE_ALPHA_DIGIT = re.compile(r'([^\s\d␣])\s*(\d)')
 RE_NAMED_PART = re.compile(r'[a-z]{4}')

-### Configuration section
+# Configuration section
+

 def configure(*_: Any) -> None:
    """ All behaviour is currently hard-coded.
    """
    return None

-### Analysis section
+# Analysis section

-def create(normalizer: Any, transliterator: Any, config: None) -> 'HousenumberTokenAnalysis': # pylint: disable=W0613
+
+def create(normalizer: Any, transliterator: Any, config: None) -> 'HousenumberTokenAnalysis':
    """ Create a new token analysis instance for this module.
    """
    return HousenumberTokenAnalysis(normalizer, transliterator)
--- a/src/nominatim_db/tokenizer/token_analysis/postcodes.py
+++ b/src/nominatim_db/tokenizer/token_analysis/postcodes.py
@@ -2,7 +2,7 @@
 #
 # This file is part of Nominatim. (https://nominatim.org)
 #
-# Copyright (C) 2022 by the Nominatim developer community.
+# Copyright (C) 2024 by the Nominatim developer community.
 # For a full list of authors see the git log.
 """
 Specialized processor for postcodes. Supports a 'lookup' variant of the
@@ -13,16 +13,18 @@ from typing import Any, List
 from ...data.place_name import PlaceName
 from .generic_mutation import MutationVariantGenerator

-### Configuration section
+# Configuration section
+

 def configure(*_: Any) -> None:
    """ All behaviour is currently hard-coded.
    """
    return None

-### Analysis section
+# Analysis section

-def create(normalizer: Any, transliterator: Any, config: None) -> 'PostcodeTokenAnalysis': # pylint: disable=W0613
+
+def create(normalizer: Any, transliterator: Any, config: None) -> 'PostcodeTokenAnalysis':
    """ Create a new token analysis instance for this module.
    """
    return PostcodeTokenAnalysis(normalizer, transliterator)
@@ -44,13 +46,11 @@ class PostcodeTokenAnalysis:

        self.mutator = MutationVariantGenerator(' ', (' ', ''))

-
    def get_canonical_id(self, name: PlaceName) -> str:
        """ Return the standard form of the postcode.
        """
        return name.name.strip().upper()

-
    def compute_variants(self, norm_name: str) -> List[str]:
        """ Compute the spelling variants for the given normalized postcode.