only use most infrequent tokens for search index lookup

2025-07-11 15:26:50 +02:00
parent 830307484b
commit 6d2b79870c
2 changed files with 160 additions and 42 deletions
--- a/src/nominatim_api/search/db_search_fields.py
+++ b/src/nominatim_api/search/db_search_fields.py
@@ -7,7 +7,7 @@
 """
 Data structures for more complex fields in abstract search descriptions.
 """
-from typing import List, Tuple, Iterator, Dict, Type
+from typing import List, Tuple, Iterator, Dict, Type, cast
 import dataclasses

 import sqlalchemy as sa
@@ -18,6 +18,66 @@ from .query import Token
 from . import db_search_lookups as lookups


+class CountedTokenIDs:
+    """ A list of token IDs with their respective counts, sorted
+        from least frequent to most frequent.
+
+        If a token count is one, then statistics are likely to be unavailable
+        and a relatively high count is assumed instead.
+    """
+
+    def __init__(self, tokens: Iterator[Token], count_column: str = 'count'):
+        self.tokens = list({(cast(int, getattr(t, count_column)), t.token) for t in tokens})
+        self.tokens.sort(key=lambda t: t[0] if t[0] > 1 else 100000)
+
+    def __len__(self) -> int:
+        return len(self.tokens)
+
+    def get_num_lookup_tokens(self, limit: int, fac: int) -> int:
+        """ Suggest the number of tokens to be used for an index lookup.
+            The idea here is to use as few items as possible while making
+            sure the number of rows returned stays below 'limit', beyond
+            which rechecking the returned rows is more expensive than adding
+            another item to the index lookup. 'fac' is the factor by which
+            the limit is increased every time a lookup item is added.
+
+            If the list of tokens doesn't seem suitable at all for index
+            lookup, -1 is returned.
+        """
+        length = len(self.tokens)
+        min_count = self.tokens[0][0]
+        if min_count == 1:
+            return min(length, 3)  # no statistics available, use index
+
+        for i in range(min(length, 3)):
+            if min_count < limit:
+                return i + 1
+            limit = limit * fac
+
+        return -1
+
+    def min_count(self) -> int:
+        return self.tokens[0][0]
+
+    def expected_for_all_search(self, fac: int = 5) -> int:
+        return int(self.tokens[0][0] / (fac**(len(self.tokens) - 1)))
+
+    def get_tokens(self) -> List[int]:
+        return [t[1] for t in self.tokens]
+
+    def get_head_tokens(self, num_tokens: int) -> List[int]:
+        return [t[1] for t in self.tokens[:num_tokens]]
+
+    def get_tail_tokens(self, first: int) -> List[int]:
+        return [t[1] for t in self.tokens[first:]]
+
+    def split_lookup(self, split: int, column: str) -> 'List[FieldLookup]':
+        lookup = [FieldLookup(column, self.get_head_tokens(split), lookups.LookupAll)]
+        if split < len(self.tokens):
+            lookup.append(FieldLookup(column, self.get_tail_tokens(split), lookups.Restrict))
+        return lookup
+
+
@dataclasses.dataclass
 class WeightedStrings:
    """ A list of strings together with a penalty.