generalize normalization step for search query

It is now possible to configure functions for changing the query
input before it is analysed by the tokenizer.

Code is a cleaned-up version of the implementation by @miku.
Sarah Hoffmann
2024-12-13 11:53:10 +01:00
parent 046665f8d9
commit 2b87c016db
10 changed files with 167 additions and 9 deletions
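
As a rough sketch of the new mechanism (simplified names, not the literal code of this commit): each entry configured under `query-preprocessing` in the tokenizer configuration yields a callable that rewrites the list of query phrases, and the analyzer applies these callables in order before tokenization.

    from typing import Callable, List
    from nominatim_api.search import query as qmod

    # Type of a single preprocessing step, as introduced in base.py below.
    QueryProcessingFunc = Callable[[List[qmod.Phrase]], List[qmod.Phrase]]

    def run_preprocessing(steps: List[QueryProcessingFunc],
                          phrases: List[qmod.Phrase]) -> List[qmod.Phrase]:
        # Apply every configured step in turn; a step may rewrite or drop phrases.
        for step in steps:
            phrases = step(phrases)
        return phrases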

View File

@@ -18,6 +18,7 @@ from .typing import SaFromClause
from .sql.sqlalchemy_schema import SearchTables
from .sql.sqlalchemy_types import Geometry
from .logging import log
from .config import Configuration
T = TypeVar('T')
@@ -31,9 +32,11 @@ class SearchConnection:
def __init__(self, conn: AsyncConnection,
tables: SearchTables,
properties: Dict[str, Any]) -> None:
properties: Dict[str, Any],
config: Configuration) -> None:
self.connection = conn
self.t = tables
self.config = config
self._property_cache = properties
self._classtables: Optional[Set[str]] = None
self.query_timeout: Optional[int] = None

View File

@@ -184,7 +184,7 @@ class NominatimAPIAsync:
assert self._tables is not None
async with self._engine.begin() as conn:
yield SearchConnection(conn, self._tables, self._property_cache)
yield SearchConnection(conn, self._tables, self._property_cache, self.config)
async def status(self) -> StatusResult:
""" Return the status of the database.

View File

@@ -0,0 +1,32 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Common data types and protocols for preprocessing.
"""
from typing import List, Callable
from ..typing import Protocol
from ..search import query as qmod
from .config import QueryConfig
QueryProcessingFunc = Callable[[List[qmod.Phrase]], List[qmod.Phrase]]
class QueryHandler(Protocol):
""" Protocol for query modules.
"""
def create(self, config: QueryConfig) -> QueryProcessingFunc:
"""
Create a function for preprocessing the phrases of a search query.
Arguments:
config: A read-only dictionary with the additional configuration options
specified in the tokenizer configuration. The normalizer for
transliterating text is available through this configuration.
Return:
A function that takes a list of query phrases and returns the
list as modified by the preprocessor.
"""
pass
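
For illustration, a minimal custom preprocessing module following this protocol might look as below. The whitespace-collapsing behaviour is hypothetical and only demonstrates the expected `create()` signature; the imports assume the module sits next to the built-in preprocessors.

    from typing import List
    from ..search import query as qmod
    from .config import QueryConfig
    from .base import QueryProcessingFunc

    def create(config: QueryConfig) -> QueryProcessingFunc:
        """ Return a function that collapses runs of whitespace in each phrase
            and drops phrases that become empty.
        """
        def _process(phrases: List[qmod.Phrase]) -> List[qmod.Phrase]:
            cleaned = (qmod.Phrase(p.ptype, ' '.join(p.text.split()))
                       for p in phrases)
            return [p for p in cleaned if p.text]
        return _process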

View File

@@ -0,0 +1,34 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Configuration for query preprocessors.
"""
from typing import Any, TYPE_CHECKING
from collections import UserDict
# working around missing generics in Python < 3.8
# See https://github.com/python/typing/issues/60#issuecomment-869757075
if TYPE_CHECKING:
_BaseUserDict = UserDict[str, Any]
else:
_BaseUserDict = UserDict
class QueryConfig(_BaseUserDict):
""" The `QueryConfig` class is a read-only dictionary
with configuration options for the preprocessor.
In addition to the usual dictionary functions, the class provides
accessors to standard preprocessor options that are used by many of the
preprocessors.
"""
def set_normalizer(self, normalizer: Any) -> 'QueryConfig':
""" Set the normalizer function to be used.
"""
self['_normalizer'] = normalizer
return self
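
A rough usage sketch; `my_normalizer` stands in for the ICU transliterator that the query analyzer attaches (see the icu_tokenizer.py changes below):

    # Options of one 'query-preprocessing' rule, wrapped for the preprocessor.
    config = QueryConfig({'step': 'normalize'}).set_normalizer(my_normalizer)
    assert config['step'] == 'normalize'
    normalizer = config.get('_normalizer')   # how a preprocessor retrieves it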

View File

@@ -0,0 +1,26 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Normalize query text using an ICU transliterator.
"""
from typing import cast
from .config import QueryConfig
from .base import QueryProcessingFunc
from ..search.query import Phrase
def create(config: QueryConfig) -> QueryProcessingFunc:
normalizer = config.get('_normalizer')
if not normalizer:
return lambda p: p
return lambda phrases: list(
filter(lambda p: p.text,
(Phrase(p.ptype, cast(str, normalizer.transliterate(p.text)))
for p in phrases)))
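
A brief usage sketch, assuming `PhraseType.NONE` denotes an unclassified phrase and `my_normalizer` is an ICU transliterator (both illustrative, not part of this commit). Note that without a normalizer in the config, `create()` returns the identity function.

    from nominatim_api.search.query import Phrase, PhraseType

    func = create(QueryConfig().set_normalizer(my_normalizer))
    result = func([Phrase(PhraseType.NONE, 'Main Street'),
                   Phrase(PhraseType.NONE, '')])
    # result holds one phrase with the transliterated text;
    # the phrase that normalizes to an empty string is dropped.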

View File

@@ -16,12 +16,14 @@ from icu import Transliterator
import sqlalchemy as sa
from ..errors import UsageError
from ..typing import SaRow
from ..sql.sqlalchemy_types import Json
from ..connection import SearchConnection
from ..logging import log
from ..search import query as qmod
from ..search.query_analyzer_factory import AbstractQueryAnalyzer
from . import query as qmod
from ..query_preprocessing.config import QueryConfig
from .query_analyzer_factory import AbstractQueryAnalyzer
DB_TO_TOKEN_TYPE = {
@@ -151,6 +153,8 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
self.transliterator = await self.conn.get_cached_value('ICUTOK', 'transliterator',
_make_transliterator)
await self._setup_preprocessing()
if 'word' not in self.conn.t.meta.tables:
sa.Table('word', self.conn.t.meta,
sa.Column('word_id', sa.Integer),
@@ -159,15 +163,36 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
sa.Column('word', sa.Text),
sa.Column('info', Json))
async def _setup_preprocessing(self) -> None:
""" Load the rules for preprocessing and set up the handlers.
"""
rules = self.conn.config.load_sub_configuration('icu_tokenizer.yaml',
config='TOKENIZER_CONFIG')
preprocessing_rules = rules.get('query-preprocessing', [])
self.preprocessors = []
for func in preprocessing_rules:
if 'step' not in func:
raise UsageError("Preprocessing rule is missing the 'step' attribute.")
if not isinstance(func['step'], str):
raise UsageError("'step' attribute must be a simple string.")
module = self.conn.config.load_plugin_module(
func['step'], 'nominatim_api.query_preprocessing')
self.preprocessors.append(
module.create(QueryConfig(func).set_normalizer(self.normalizer)))
async def analyze_query(self, phrases: List[qmod.Phrase]) -> qmod.QueryStruct:
""" Analyze the given list of phrases and return the
tokenized query.
"""
log().section('Analyze query (using ICU tokenizer)')
normalized = list(filter(lambda p: p.text,
(qmod.Phrase(p.ptype, self.normalize_text(p.text))
for p in phrases)))
query = qmod.QueryStruct(normalized)
for func in self.preprocessors:
phrases = func(phrases)
query = qmod.QueryStruct(phrases)
log().var_dump('Normalized query', query.source)
if not query.source:
return query
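
To make the wiring in `_setup_preprocessing()` and `analyze_query()` concrete, here is a hedged sketch of the data involved. The contents of the shipped icu_tokenizer.yaml are not part of this excerpt, so the single 'normalize' rule is illustrative only.

    # Illustrative equivalent of the 'query-preprocessing' section after loading:
    rules = {'query-preprocessing': [{'step': 'normalize'}]}

    # For every rule, 'step' names a plugin module resolved relative to
    # nominatim_api.query_preprocessing, so the entry above would presumably load
    # the normalize preprocessor shown earlier. Its create() receives the rule's
    # options wrapped in a QueryConfig plus the analyzer's ICU normalizer, and the
    # returned callables are applied to the query phrases in configuration order.
    for func in rules['query-preprocessing']:
        assert isinstance(func['step'], str)   # enforced by _setup_preprocessing()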

View File

@@ -21,9 +21,11 @@ if TYPE_CHECKING:
from typing import Any
import sqlalchemy as sa
import os
from typing_extensions import (TypeAlias as TypeAlias)
from typing_extensions import (TypeAlias as TypeAlias,
Protocol as Protocol)
else:
TypeAlias = str
Protocol = object
StrPath = Union[str, 'os.PathLike[str]']