generalize normalization step for search query

It is now possible to configure functions for changing the query
input before it is analysed by the tokenizer.

Code is a cleaned-up version of the implementation by @miku.
This commit is contained in:
Sarah Hoffmann
2024-12-13 11:53:10 +01:00
parent 046665f8d9
commit 2b87c016db
10 changed files with 167 additions and 9 deletions

View File

@@ -0,0 +1,32 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Common data types and protocols for preprocessing.
"""
from typing import List, Callable
from ..typing import Protocol
from ..search import query as qmod
from .config import QueryConfig
# Signature shared by all query preprocessing functions: the function
# receives the list of query phrases and returns a list of phrases.
QueryProcessingFunc = Callable[[List[qmod.Phrase]], List[qmod.Phrase]]
class QueryHandler(Protocol):
    """ Interface that a query preprocessing module has to provide.
    """

    def create(self, config: QueryConfig) -> QueryProcessingFunc:
        """ Build the preprocessing function of the module.

            Arguments:
                config: A dictionary with the additional configuration
                        options specified in the tokenizer configuration.

            Return:
                A function that takes the list of query phrases and
                returns the list of phrases after preprocessing.
        """

View File

@@ -0,0 +1,34 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Configuration for query preprocessors.
"""
from typing import Any, TYPE_CHECKING
from collections import UserDict
# working around missing generics in Python < 3.8
# See https://github.com/python/typing/issues/60#issuecomment-869757075
#
# Static type checkers get to see the parametrised `UserDict[str, Any]`,
# while at runtime the plain, unsubscripted `UserDict` is used as base class.
# NOTE(review): runtime subscripting of `UserDict` looks to be available
# only from Python 3.9 on - confirm the version bound stated above.
if TYPE_CHECKING:
    _BaseUserDict = UserDict[str, Any]
else:
    _BaseUserDict = UserDict
class QueryConfig(_BaseUserDict):
    """ Dictionary holding the configuration options for a query
        preprocessor.

        Besides the regular dictionary interface, the class offers
        helpers for standard options that are shared by many of the
        preprocessors.
    """

    def set_normalizer(self, normalizer: Any) -> 'QueryConfig':
        """ Store the normalizer to be used under the key `_normalizer`.

            The configuration object itself is returned, so that the
            call can be chained.
        """
        self.update(_normalizer=normalizer)
        return self

View File

@@ -0,0 +1,26 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Normalize query text using an ICU transliterator.
"""
from typing import cast
from .config import QueryConfig
from .base import QueryProcessingFunc
from ..search.query import Phrase
def create(config: QueryConfig) -> QueryProcessingFunc:
    """ Create a preprocessing function that normalizes the phrase texts.

        Arguments:
            config: Configuration of the preprocessor. The normalizer is
                    looked up under the key `_normalizer`.

        Return:
            A function that replaces each phrase by a new phrase of the
            same type with the transliterated text and drops phrases
            whose text is empty afterwards. When no normalizer is
            configured, the function returns its input unchanged.
    """
    normalizer = config.get('_normalizer')

    if normalizer:
        # Transliterate every phrase, then discard those left without text.
        return lambda phrases: [
            phrase
            for phrase in (Phrase(src.ptype,
                                  cast(str, normalizer.transliterate(src.text)))
                           for src in phrases)
            if phrase.text
        ]

    # Without a normalizer the phrases are handed back untouched.
    return lambda phrases: phrases