generalize normalization step for search query

It is now possible to configure functions for changing the query input before it is analysed by the tokenizer. Code is a cleaned-up version of the implementation by @miku.
2026-02-26 11:08:13 +00:00 · 2024-12-13 11:53:10 +01:00
parent 046665f8d9
commit 2b87c016db
10 changed files with 167 additions and 9 deletions
--- a/settings/icu_tokenizer.yaml
+++ b/settings/icu_tokenizer.yaml
@@ -1,3 +1,5 @@
 query-preprocessing:
    - step: normalize
 normalization:
    - ":: lower ()"
    - ":: Hans-Hant"
--- a/src/nominatim_api/connection.py
+++ b/src/nominatim_api/connection.py
@@ -18,6 +18,7 @@ from .typing import SaFromClause
 from .sql.sqlalchemy_schema import SearchTables
 from .sql.sqlalchemy_types import Geometry
 from .logging import log
 from .config import Configuration
 T = TypeVar('T')
@@ -31,9 +32,11 @@ class SearchConnection:
    def __init__(self, conn: AsyncConnection,
                 tables: SearchTables,
-                 properties: Dict[str, Any]) -> None:
+                 properties: Dict[str, Any],
                 config: Configuration) -> None:
        self.connection = conn
        self.t = tables
        self.config = config
        self._property_cache = properties
        self._classtables: Optional[Set[str]] = None
        self.query_timeout: Optional[int] = None
--- a/src/nominatim_api/core.py
+++ b/src/nominatim_api/core.py
@@ -184,7 +184,7 @@ class NominatimAPIAsync:
        assert self._tables is not None
        async with self._engine.begin() as conn:
-            yield SearchConnection(conn, self._tables, self._property_cache)
+            yield SearchConnection(conn, self._tables, self._property_cache, self.config)
    async def status(self) -> StatusResult:
        """ Return the status of the database.
--- a/src/nominatim_api/query_preprocessing/init.py
+++ b/src/nominatim_api/query_preprocessing/init.py
--- a/src/nominatim_api/query_preprocessing/base.py
+++ b/src/nominatim_api/query_preprocessing/base.py
@@ -0,0 +1,32 @@
 # SPDX-License-Identifier: GPL-3.0-or-later
 #
 # This file is part of Nominatim. (https://nominatim.org)
 #
 # Copyright (C) 2024 by the Nominatim developer community.
 # For a full list of authors see the git log.
 """
 Common data types and protocols for preprocessing.
 """
 from typing import List, Callable
 from ..typing import Protocol
 from ..search import query as qmod
 from .config import QueryConfig
 # Signature of a preprocessing function: it receives the list of query
 # phrases and returns a list of phrases.
 QueryProcessingFunc = Callable[[List[qmod.Phrase]], List[qmod.Phrase]]
 class QueryHandler(Protocol):
    """ Protocol for query preprocessing modules.
    """
    def create(self, config: QueryConfig) -> QueryProcessingFunc:
        """
        Create the preprocessing function of the module.
        Arguments:
            config: A dictionary with the additional configuration options
                    specified in the tokenizer configuration.
        Return:
            A function that takes a list of phrases and returns the
            list of phrases as modified by the preprocessor.
        """
        ...
--- a/src/nominatim_api/query_preprocessing/config.py
+++ b/src/nominatim_api/query_preprocessing/config.py
@@ -0,0 +1,34 @@
 # SPDX-License-Identifier: GPL-3.0-or-later
 #
 # This file is part of Nominatim. (https://nominatim.org)
 #
 # Copyright (C) 2024 by the Nominatim developer community.
 # For a full list of authors see the git log.
 """
 Configuration for query preprocessing.
 """
 from typing import Any, TYPE_CHECKING
 from collections import UserDict
 # working around missing generics in Python < 3.8
 # See https://github.com/python/typing/issues/60#issuecomment-869757075
 # Type checkers get the subscripted UserDict[str, Any] as base class,
 # while at runtime the plain UserDict is used and nothing is subscripted.
 if TYPE_CHECKING:
    _BaseUserDict = UserDict[str, Any]
 else:
    _BaseUserDict = UserDict
 class QueryConfig(_BaseUserDict):
    """ Dictionary with the configuration options for a preprocessor.
        In addition to the usual dictionary functions, the class provides
        a helper for attaching the normalizer, which is kept under the
        key `_normalizer`.
    """
    def set_normalizer(self, normalizer: Any) -> 'QueryConfig':
        """ Save the normalizer to be used in the configuration and
            return the configuration itself, so that calls can be chained.
        """
        self.update(_normalizer=normalizer)
        return self
--- a/src/nominatim_api/query_preprocessing/normalize.py
+++ b/src/nominatim_api/query_preprocessing/normalize.py
@@ -0,0 +1,26 @@
 # SPDX-License-Identifier: GPL-3.0-or-later
 #
 # This file is part of Nominatim. (https://nominatim.org)
 #
 # Copyright (C) 2024 by the Nominatim developer community.
 # For a full list of authors see the git log.
 """
 Normalize query text using an ICU transliterator.
 """
 from typing import cast
 from .config import QueryConfig
 from .base import QueryProcessingFunc
 from ..search.query import Phrase
 def create(config: QueryConfig) -> QueryProcessingFunc:
    """ Create a preprocessing function that replaces the text of each
        phrase with the result of running it through the transliterator
        found under `_normalizer` in the configuration. Phrases whose
        text is empty afterwards are dropped. When the configuration has
        no normalizer, the function returns the phrases unchanged.
    """
    normalizer = config.get('_normalizer')
    if normalizer:
        return lambda phrases: [
            phrase for phrase in (
                Phrase(src.ptype, cast(str, normalizer.transliterate(src.text)))
                for src in phrases)
            if phrase.text]
    return lambda phrases: phrases
--- a/src/nominatim_api/search/icu_tokenizer.py
+++ b/src/nominatim_api/search/icu_tokenizer.py
@@ -16,12 +16,14 @@ from icu import Transliterator
 import sqlalchemy as sa
 from ..errors import UsageError
 from ..typing import SaRow
 from ..sql.sqlalchemy_types import Json
 from ..connection import SearchConnection
 from ..logging import log
-from ..search import query as qmod
+from . import query as qmod
-from ..search.query_analyzer_factory import AbstractQueryAnalyzer
+from ..query_preprocessing.config import QueryConfig
 from .query_analyzer_factory import AbstractQueryAnalyzer
 DB_TO_TOKEN_TYPE = {
@@ -151,6 +153,8 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
        self.transliterator = await self.conn.get_cached_value('ICUTOK', 'transliterator',
                                                               _make_transliterator)
        await self._setup_preprocessing()
        if 'word' not in self.conn.t.meta.tables:
            sa.Table('word', self.conn.t.meta,
                     sa.Column('word_id', sa.Integer),
@@ -159,15 +163,36 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
                     sa.Column('word', sa.Text),
                     sa.Column('info', Json))
    async def _setup_preprocessing(self) -> None:
        """ Set up the list of preprocessing functions from the
            'query-preprocessing' section of the tokenizer configuration.
            A UsageError is raised when a rule has no 'step' attribute
            or when the attribute is not a string.
        """
        rules = self.conn.config.load_sub_configuration('icu_tokenizer.yaml',
                                                        config='TOKENIZER_CONFIG')
        step_rules = rules.get('query-preprocessing', [])
        self.preprocessors = []
        for rule in step_rules:
            if 'step' not in rule:
                raise UsageError("Preprocessing rule is missing the 'step' attribute.")
            step_name = rule['step']
            if not isinstance(step_name, str):
                raise UsageError("'step' attribute must be a simple string.")
            # Each step names a module providing a create() function.
            handler = self.conn.config.load_plugin_module(
                        step_name, 'nominatim_api.query_preprocessing')
            step_config = QueryConfig(rule).set_normalizer(self.normalizer)
            self.preprocessors.append(handler.create(step_config))
    async def analyze_query(self, phrases: List[qmod.Phrase]) -> qmod.QueryStruct:
        """ Analyze the given list of phrases and return the
            tokenized query.
        """
        log().section('Analyze query (using ICU tokenizer)')
-        normalized = list(filter(lambda p: p.text,
+        for func in self.preprocessors:
-                                 (qmod.Phrase(p.ptype, self.normalize_text(p.text))
+            phrases = func(phrases)
-                                  for p in phrases)))
+        query = qmod.QueryStruct(phrases)
-        query = qmod.QueryStruct(normalized)
+
        log().var_dump('Normalized query', query.source)
        if not query.source:
            return query
--- a/src/nominatim_api/typing.py
+++ b/src/nominatim_api/typing.py
@@ -21,9 +21,11 @@ if TYPE_CHECKING:
    from typing import Any
    import sqlalchemy as sa
    import os
-    from typing_extensions import (TypeAlias as TypeAlias)
+    from typing_extensions import (TypeAlias as TypeAlias,
                                   Protocol as Protocol)
 else:
    TypeAlias = str
    Protocol = object
 StrPath = Union[str, 'os.PathLike[str]']
--- a/test/python/api/query_processing/test_normalize.py
+++ b/test/python/api/query_processing/test_normalize.py
@@ -0,0 +1,34 @@
 # SPDX-License-Identifier: GPL-3.0-or-later
 #
 # This file is part of Nominatim. (https://nominatim.org)
 #
 # Copyright (C) 2024 by the Nominatim developer community.
 # For a full list of authors see the git log.
 """
 Tests for normalizing search queries.
 """
 from pathlib import Path
 import pytest
 from icu import Transliterator
 import nominatim_api.search.query as qmod
 from nominatim_api.query_preprocessing.config import QueryConfig
 from nominatim_api.query_preprocessing import normalize
 def run_preprocessor_on(query, norm):
    """ Run the normalize preprocessor on `query`, using a transliterator
        created from the rule string `norm`.
    """
    config = QueryConfig().set_normalizer(
        Transliterator.createFromRules("normalization", norm))
    return normalize.create(config)(query)
 def test_normalize_simple():
    """ A single phrase is lower-cased by a lower-casing rule.
    """
    phrases = [qmod.Phrase(qmod.PhraseType.NONE, 'Hallo')]
    result = run_preprocessor_on(phrases, ':: lower();')
    expected = [qmod.Phrase(qmod.PhraseType.NONE, 'hallo')]
    assert len(result) == 1
    assert result == expected