forked from hans/Nominatim
generalize normalization step for search query
It is now possible to configure functions for changing the query input before it is analysed by the tokenizer. Code is a cleaned-up version of the implementation by @miku.
This commit is contained in:
@@ -18,6 +18,7 @@ from .typing import SaFromClause
|
||||
from .sql.sqlalchemy_schema import SearchTables
|
||||
from .sql.sqlalchemy_types import Geometry
|
||||
from .logging import log
|
||||
from .config import Configuration
|
||||
|
||||
T = TypeVar('T')
|
||||
|
||||
@@ -31,9 +32,11 @@ class SearchConnection:
|
||||
|
||||
def __init__(self, conn: AsyncConnection,
|
||||
tables: SearchTables,
|
||||
properties: Dict[str, Any]) -> None:
|
||||
properties: Dict[str, Any],
|
||||
config: Configuration) -> None:
|
||||
self.connection = conn
|
||||
self.t = tables
|
||||
self.config = config
|
||||
self._property_cache = properties
|
||||
self._classtables: Optional[Set[str]] = None
|
||||
self.query_timeout: Optional[int] = None
|
||||
|
||||
@@ -184,7 +184,7 @@ class NominatimAPIAsync:
|
||||
assert self._tables is not None
|
||||
|
||||
async with self._engine.begin() as conn:
|
||||
yield SearchConnection(conn, self._tables, self._property_cache)
|
||||
yield SearchConnection(conn, self._tables, self._property_cache, self.config)
|
||||
|
||||
async def status(self) -> StatusResult:
|
||||
""" Return the status of the database.
|
||||
|
||||
0
src/nominatim_api/query_preprocessing/__init__.py
Normal file
0
src/nominatim_api/query_preprocessing/__init__.py
Normal file
32
src/nominatim_api/query_preprocessing/base.py
Normal file
32
src/nominatim_api/query_preprocessing/base.py
Normal file
@@ -0,0 +1,32 @@
|
||||
# SPDX-License-Identifier: GPL-3.0-or-later
|
||||
#
|
||||
# This file is part of Nominatim. (https://nominatim.org)
|
||||
#
|
||||
# Copyright (C) 2024 by the Nominatim developer community.
|
||||
# For a full list of authors see the git log.
|
||||
"""
|
||||
Common data types and protocols for preprocessing.
|
||||
"""
|
||||
from typing import List, Callable
|
||||
|
||||
from ..typing import Protocol
|
||||
from ..search import query as qmod
|
||||
from .config import QueryConfig
|
||||
|
||||
QueryProcessingFunc = Callable[[List[qmod.Phrase]], List[qmod.Phrase]]
|
||||
|
||||
|
||||
class QueryHandler(Protocol):
    """ Protocol for query preprocessing modules.

        A module that is named as a 'step' in the query preprocessing
        rules needs to provide the functions described here.
    """

    def create(self, config: QueryConfig) -> QueryProcessingFunc:
        """
        Create a function for preprocessing the phrases of a query.

        Arguments:
            config: A dictionary with the additional configuration options
                    specified in the tokenizer configuration for this
                    preprocessing step.

        Return:
            A function that takes the list of phrases of a query and
            returns the list of phrases as modified by the preprocessor.
        """
        pass
|
||||
34
src/nominatim_api/query_preprocessing/config.py
Normal file
34
src/nominatim_api/query_preprocessing/config.py
Normal file
@@ -0,0 +1,34 @@
|
||||
# SPDX-License-Identifier: GPL-3.0-or-later
|
||||
#
|
||||
# This file is part of Nominatim. (https://nominatim.org)
|
||||
#
|
||||
# Copyright (C) 2024 by the Nominatim developer community.
|
||||
# For a full list of authors see the git log.
|
||||
"""
|
||||
Configuration for query preprocessors.
|
||||
"""
|
||||
from typing import Any, TYPE_CHECKING
|
||||
from collections import UserDict
|
||||
|
||||
# working around missing generics in Python < 3.9
|
||||
# See https://github.com/python/typing/issues/60#issuecomment-869757075
|
||||
# Only the type checker gets to see the subscripted form of UserDict,
# at runtime the plain class is used as the base.
if TYPE_CHECKING:
    _BaseUserDict = UserDict[str, Any]
else:
    _BaseUserDict = UserDict


class QueryConfig(_BaseUserDict):
    """ Dictionary with the configuration options for a single
        preprocessor.

        The class offers all the usual dictionary functions. On top of
        that it has helper functions for options that are shared between
        preprocessors.
    """

    def set_normalizer(self, normalizer: Any) -> 'QueryConfig':
        """ Save the normalizer to be used under the key '_normalizer'.

            The configuration object itself is returned, so that the
            call may be chained.
        """
        self.update({'_normalizer': normalizer})
        return self
|
||||
26
src/nominatim_api/query_preprocessing/normalize.py
Normal file
26
src/nominatim_api/query_preprocessing/normalize.py
Normal file
@@ -0,0 +1,26 @@
|
||||
# SPDX-License-Identifier: GPL-3.0-or-later
|
||||
#
|
||||
# This file is part of Nominatim. (https://nominatim.org)
|
||||
#
|
||||
# Copyright (C) 2024 by the Nominatim developer community.
|
||||
# For a full list of authors see the git log.
|
||||
"""
|
||||
Normalize query text using an ICU transliterator.
|
||||
"""
|
||||
from typing import cast
|
||||
|
||||
from .config import QueryConfig
|
||||
from .base import QueryProcessingFunc
|
||||
from ..search.query import Phrase
|
||||
|
||||
|
||||
def create(config: QueryConfig) -> QueryProcessingFunc:
    """ Create the preprocessing function that normalizes phrase texts.

        The normalizer is taken from the '_normalizer' entry of the
        configuration. When there is none, the returned function hands
        back its input unchanged. Otherwise each phrase is rebuilt with
        its text run through `normalizer.transliterate()` and phrases
        whose resulting text is empty are dropped.
    """
    normalizer = config.get('_normalizer')

    if normalizer:
        return lambda phrases: [
            converted
            for converted in (
                Phrase(orig.ptype, cast(str, normalizer.transliterate(orig.text)))
                for orig in phrases)
            if converted.text]

    # Nothing to normalize with: pass the phrases through untouched.
    return lambda phrases: phrases
|
||||
@@ -16,12 +16,14 @@ from icu import Transliterator
|
||||
|
||||
import sqlalchemy as sa
|
||||
|
||||
from ..errors import UsageError
|
||||
from ..typing import SaRow
|
||||
from ..sql.sqlalchemy_types import Json
|
||||
from ..connection import SearchConnection
|
||||
from ..logging import log
|
||||
from ..search import query as qmod
|
||||
from ..search.query_analyzer_factory import AbstractQueryAnalyzer
|
||||
from . import query as qmod
|
||||
from ..query_preprocessing.config import QueryConfig
|
||||
from .query_analyzer_factory import AbstractQueryAnalyzer
|
||||
|
||||
|
||||
DB_TO_TOKEN_TYPE = {
|
||||
@@ -151,6 +153,8 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
|
||||
self.transliterator = await self.conn.get_cached_value('ICUTOK', 'transliterator',
|
||||
_make_transliterator)
|
||||
|
||||
await self._setup_preprocessing()
|
||||
|
||||
if 'word' not in self.conn.t.meta.tables:
|
||||
sa.Table('word', self.conn.t.meta,
|
||||
sa.Column('word_id', sa.Integer),
|
||||
@@ -159,15 +163,36 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
|
||||
sa.Column('word', sa.Text),
|
||||
sa.Column('info', Json))
|
||||
|
||||
async def _setup_preprocessing(self) -> None:
    """ Read the query preprocessing rules from the tokenizer
        configuration and create a handler function for each of them.

        The handlers are collected in `self.preprocessors` in the order
        in which the rules are listed. A UsageError is raised for a rule
        that has no 'step' attribute or where 'step' is not a string.
    """
    config = self.conn.config
    tokenizer_rules = config.load_sub_configuration('icu_tokenizer.yaml',
                                                    config='TOKENIZER_CONFIG')
    rule_list = tokenizer_rules.get('query-preprocessing', [])

    self.preprocessors = []

    for rule in rule_list:
        if 'step' not in rule:
            raise UsageError("Preprocessing rule is missing the 'step' attribute.")

        step = rule['step']
        if not isinstance(step, str):
            raise UsageError("'step' attribute must be a simple string.")

        # The step names the module that implements the preprocessor.
        plugin = config.load_plugin_module(step, 'nominatim_api.query_preprocessing')
        rule_config = QueryConfig(rule).set_normalizer(self.normalizer)
        self.preprocessors.append(plugin.create(rule_config))
|
||||
|
||||
async def analyze_query(self, phrases: List[qmod.Phrase]) -> qmod.QueryStruct:
|
||||
""" Analyze the given list of phrases and return the
|
||||
tokenized query.
|
||||
"""
|
||||
log().section('Analyze query (using ICU tokenizer)')
|
||||
normalized = list(filter(lambda p: p.text,
|
||||
(qmod.Phrase(p.ptype, self.normalize_text(p.text))
|
||||
for p in phrases)))
|
||||
query = qmod.QueryStruct(normalized)
|
||||
for func in self.preprocessors:
|
||||
phrases = func(phrases)
|
||||
query = qmod.QueryStruct(phrases)
|
||||
|
||||
log().var_dump('Normalized query', query.source)
|
||||
if not query.source:
|
||||
return query
|
||||
|
||||
@@ -21,9 +21,11 @@ if TYPE_CHECKING:
|
||||
from typing import Any
|
||||
import sqlalchemy as sa
|
||||
import os
|
||||
from typing_extensions import (TypeAlias as TypeAlias)
|
||||
from typing_extensions import (TypeAlias as TypeAlias,
|
||||
Protocol as Protocol)
|
||||
else:
|
||||
TypeAlias = str
|
||||
Protocol = object
|
||||
|
||||
StrPath = Union[str, 'os.PathLike[str]']
|
||||
|
||||
|
||||
Reference in New Issue
Block a user