implement search builder

2026-03-11 21:34:06 +00:00 · 2023-05-23 11:20:34 +02:00
parent 3bf489cd7c
commit c42273a4db
7 changed files with 1208 additions and 3 deletions
--- a/nominatim/api/search/db_search_fields.py
+++ b/nominatim/api/search/db_search_fields.py
@@ -0,0 +1,167 @@
+# SPDX-License-Identifier: GPL-3.0-or-later
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2023 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Data structures for more complex fields in abstract search descriptions.
+"""
+from typing import List, Tuple, cast
+import dataclasses
+
+import sqlalchemy as sa
+from sqlalchemy.dialects.postgresql import ARRAY
+
+from nominatim.typing import SaFromClause, SaColumn
+from nominatim.api.search.query import Token
+
+@dataclasses.dataclass
+class WeightedStrings:
+    """ Strings and their penalties as two lists; truthy if `values` is non-empty.
+    """
+    values: List[str]
+    penalties: List[float]
+
+    def __bool__(self) -> bool:
+        return True if self.values else False
+
+
+@dataclasses.dataclass
+class WeightedCategories:
+    """ Class/type tuples and their penalties as two lists; truthy if `values` is non-empty.
+    """
+    values: List[Tuple[str, str]]
+    penalties: List[float]
+
+    def __bool__(self) -> bool:
+        return True if self.values else False
+
+
+@dataclasses.dataclass(order=True)
+class RankedTokens:
+    """ List of token ids with the penalty of using it; orders by penalty first.
+    """
+    penalty: float
+    tokens: List[int]
+
+    def with_token(self, t: Token, transition_penalty: float) -> 'RankedTokens':
+        """ Return a new RankedTokens with the id of `t` appended; this object
+            is not modified. The new penalty is the current penalty plus the
+            token's penalty plus the given transition penalty.
+        """
+        total = self.penalty + t.penalty + transition_penalty
+        return RankedTokens(total, self.tokens + [t.token])
+
+
+@dataclasses.dataclass
+class FieldRanking:
+    """ Penalty rules for one token column. The rankings are tried in
+        order and the first one whose tokens are all contained in the
+        column determines the penalty. If none matches, the default
+        penalty is applied.
+    """
+    column: str
+    default: float
+    rankings: List[RankedTokens]
+
+    def normalize_penalty(self) -> float:
+        """ Shift the default and all ranking penalties in place so that
+            the smallest of them becomes 0. Return the amount that was
+            subtracted; values are only shifted when it is positive.
+        """
+        # With no rankings the inner min() falls back to the default itself.
+        lowest = min(self.default,
+                     min((r.penalty for r in self.rankings), default=self.default))
+        if lowest > 0.0:
+            self.default -= lowest
+            for entry in self.rankings:
+                entry.penalty -= lowest
+        return lowest
+
+    def sql_penalty(self, table: SaFromClause) -> SaColumn:
+        """ Build an SQL CASE expression over the column of `table` that
+            yields the penalty of the first ranking contained in the
+            column, or the default penalty otherwise.
+        """
+        assert self.rankings
+        col = table.c[self.column]
+        whens = [(col.contains(r.tokens), r.penalty) for r in self.rankings]
+        return sa.case(*whens, else_=self.default)
+
+
+@dataclasses.dataclass
+class FieldLookup:
+    """ A list of tokens to be searched for in the database column `column`.
+        The lookup_type selects the operator: 'lookup_all' requires all tokens
+        to match, 'lookup_any' one of them. 'restrict' (and any other value)
+        requires all tokens to match but avoids the use of indexes.
+    """
+    column: str
+    tokens: List[int]
+    lookup_type: str
+
+    def sql_condition(self, table: SaFromClause) -> SaColumn:
+        """ Build the SQL match condition for the column in the given table.
+        """
+        col = table.c[self.column]
+        if self.lookup_type == 'lookup_any':
+            return cast(SaColumn, col.overlap(self.tokens))
+        if self.lookup_type == 'lookup_all':
+            return col.contains(self.tokens)
+        # 'restrict': compare a computed array_cat() copy, not the bare column.
+        restricted = sa.func.array_cat(col, sa.text('ARRAY[]::integer[]'),
+                                       type_=ARRAY(sa.Integer()))
+        return restricted.contains(self.tokens)
+
+
+class SearchData:
+    """ Search fields derived from query and token assignment to be used
+        with the SQL queries. Assign `penalty` before calling a set_*() method.
+    """
+    penalty: float
+    lookups: List[FieldLookup] = []
+    rankings: List[FieldRanking]
+    housenumbers: WeightedStrings = WeightedStrings([], [])
+    postcodes: WeightedStrings = WeightedStrings([], [])
+    countries: WeightedStrings = WeightedStrings([], [])
+    qualifiers: WeightedCategories = WeightedCategories([], [])
+
+    def __init__(self) -> None:
+        # The class-level defaults are shared objects; give each instance its own.
+        self.lookups = []
+        self.housenumbers = WeightedStrings([], [])
+        self.postcodes = WeightedStrings([], [])
+        self.countries = WeightedStrings([], [])
+        self.qualifiers = WeightedCategories([], [])
+
+    def set_strings(self, field: str, tokens: List[Token]) -> None:
+        """ Set the WeightedStrings property named `field` from the given
+            token list. The smallest token penalty is moved to the global
+            penalty so that the field's minimum is 0. No-op for an empty list.
+        """
+        if tokens:
+            min_penalty = min(t.penalty for t in tokens)
+            self.penalty += min_penalty
+            setattr(self, field, WeightedStrings([t.lookup_word for t in tokens],
+                                                 [t.penalty - min_penalty for t in tokens]))
+
+    def set_qualifiers(self, tokens: List[Token]) -> None:
+        """ Set the qualifier field from the categories of the given tokens.
+        """
+        if tokens:
+            min_penalty = min(t.penalty for t in tokens)
+            self.penalty += min_penalty
+            self.qualifiers = WeightedCategories([t.get_category() for t in tokens],
+                                                 [t.penalty - min_penalty for t in tokens])
+
+    def set_ranking(self, rankings: List[FieldRanking]) -> None:
+        """ Set the list of rankings and normalize each of them.
+        """
+        self.rankings = []
+        for ranking in rankings:
+            if ranking.rankings:
+                self.penalty += ranking.normalize_penalty()
+                self.rankings.append(ranking)
+            else:  # no rankings to match: only the constant default applies
+                self.penalty += ranking.default