Compare commits

7 Commits

Author          SHA1        Message                                                      Date
Sarah Hoffmann  8a96e4f802  Merge pull request #3781 from lonvia/partial-address-index-lookup: Reduce number of tokens used for index lookups during search  2025-07-15 10:11:12 +02:00
Sarah Hoffmann  a9cd706bb6  adapt test to new lookup limits                              2025-07-14 14:21:09 +02:00
Sarah Hoffmann  09b5ea097b  restrict pre-selection by postcode to country                2025-07-14 14:21:09 +02:00
Sarah Hoffmann  e111257644  restrict name-only address searches early by postcode       2025-07-14 14:21:09 +02:00
Sarah Hoffmann  93ac1023f7  restrict name-only search more                               2025-07-14 14:21:09 +02:00
Sarah Hoffmann  1fe2353682  restrict postcode distance computation to within country    2025-07-14 14:21:09 +02:00
Sarah Hoffmann  6d2b79870c  only use most infrequent tokens for search index lookup      2025-07-14 14:18:22 +02:00
4 changed files with 167 additions and 48 deletions

@@ -155,29 +155,36 @@ class SearchBuilder:
""" Build a simple address search for special entries where the
housenumber is the main name token.
"""
sdata.lookups = [dbf.FieldLookup('name_vector', [t.token for t in hnrs], lookups.LookupAny)]
expected_count = sum(t.count for t in hnrs)
partials = {t.token: t.addr_count for trange in address
for t in self.query.iter_partials(trange)}
partials = dbf.CountedTokenIDs((t for trange in address
for t in self.query.iter_partials(trange)),
'addr_count')
if not partials:
# can happen when none of the partials is indexed
return
if expected_count < 8000:
sdata.lookups.append(dbf.FieldLookup('nameaddress_vector',
list(partials), lookups.Restrict))
elif len(partials) != 1 or list(partials.values())[0] < 10000:
sdata.lookups.append(dbf.FieldLookup('nameaddress_vector',
list(partials), lookups.LookupAll))
expected_count = sum(t.count for t in hnrs)
hnr_tokens = [t.token for t in hnrs]
if expected_count < 10000:
sdata.lookups = [dbf.FieldLookup('name_vector', hnr_tokens, lookups.LookupAny),
dbf.FieldLookup('nameaddress_vector',
partials.get_tokens(),
lookups.Restrict)]
else:
addr_fulls = [t.token for t
in self.query.get_tokens(address[0], qmod.TOKEN_WORD)]
if len(addr_fulls) > 5:
return
sdata.lookups.append(
dbf.FieldLookup('nameaddress_vector', addr_fulls, lookups.LookupAny))
split = partials.get_num_lookup_tokens(20000, 5)
if split > 0:
sdata.lookups = partials.split_lookup(split, 'nameaddress_vector')
sdata.lookups.append(
dbf.FieldLookup('name_vector', hnr_tokens, lookups.Restrict))
else:
addr_fulls = [t.token for t in
self.query.get_tokens(address[0], qmod.TOKEN_WORD)]
if len(addr_fulls) > 5:
return
sdata.lookups = [
dbf.FieldLookup('name_vector', hnr_tokens, lookups.LookupAny),
dbf.FieldLookup('nameaddress_vector', addr_fulls, lookups.LookupAny)]
sdata.housenumbers = dbf.WeightedStrings([], [])
yield dbs.PlaceSearch(0.05, sdata, expected_count, True)
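Taken together, the rewritten body picks one of three strategies depending on how selective the tokens are. A minimal sketch of the decision tree (the helper below is hypothetical and only restates the control flow; the thresholds and lookup kinds are the ones from the diff):

    # Hypothetical distillation of the new housenumber branch; 'split' stands
    # for the result of partials.get_num_lookup_tokens(20000, 5).
    def choose_hnr_strategy(expected_count, split, num_addr_fulls):
        if expected_count < 10000:
            # few housenumber candidates: scan them, post-filter by address
            return 'LookupAny on housenumbers + Restrict on address partials'
        if split > 0:
            # rare address partials: let the address index drive the search
            return 'split lookup on address partials + Restrict on housenumbers'
        if num_addr_fulls <= 5:
            return 'LookupAny on housenumbers and on full address words'
        return 'no search: too unselective'

    print(choose_hnr_strategy(500, -1, 0))  # LookupAny on housenumbers + ...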
@@ -205,37 +212,88 @@ class SearchBuilder:
be searched for. This takes into account how frequent the terms
are and tries to find a lookup that optimizes index use.
"""
name_partials = dbf.CountedTokenIDs(self.query.iter_partials(name))
addr_partials = dbf.CountedTokenIDs((t for r in address
for t in self.query.iter_partials(r)),
'addr_count')
if not addr_partials:
yield from self.yield_name_only_lookups(name_partials, name)
else:
yield from self.yield_address_lookups(name_partials, addr_partials, name)
def yield_name_only_lookups(self, partials: dbf.CountedTokenIDs, name: qmod.TokenRange
) -> Iterator[Tuple[float, int, List[dbf.FieldLookup]]]:
""" Yield the best lookup for a name-only search.
"""
split = partials.get_num_lookup_tokens(30000, 6)
if split > 0:
yield 0.0, partials.expected_for_all_search(5), \
partials.split_lookup(split, 'name_vector')
else:
# lots of results expected: try lookup by full names first
name_fulls = list(filter(lambda t: t.count < 50000,
self.query.get_tokens(name, qmod.TOKEN_WORD)))
if name_fulls:
yield 0.0, sum(t.count for t in name_fulls), \
dbf.lookup_by_any_name([t.token for t in name_fulls], [], [])
# look the name up by its partials
exp_count = partials.expected_for_all_search(5)
if exp_count < 50000:
yield 1.0, exp_count, \
[dbf.FieldLookup('name_vector', partials.get_tokens(), lookups.LookupAll)]
def yield_address_lookups(self, name_partials: dbf.CountedTokenIDs,
addr_partials: dbf.CountedTokenIDs, name: qmod.TokenRange,
) -> Iterator[Tuple[float, int, List[dbf.FieldLookup]]]:
penalty = 0.0 # extra penalty
name_partials = {t.token: t for t in self.query.iter_partials(name)}
addr_partials = [t for r in address for t in self.query.iter_partials(r)]
addr_tokens = list({t.token for t in addr_partials})
name_split = name_partials.get_num_lookup_tokens(20000, 6)
addr_split = addr_partials.get_num_lookup_tokens(10000, 3)
exp_count = min(t.count for t in name_partials.values()) / (3**(len(name_partials) - 1))
if name_split < 0 and addr_split < 0:
# Partial term too frequent. Try looking up by rare full names first.
name_fulls = self.query.get_tokens(name, qmod.TOKEN_WORD)
if name_fulls:
fulls_count = sum(t.count for t in name_fulls)
if (len(name_partials) > 3 or exp_count < 8000):
yield penalty, exp_count, dbf.lookup_by_names(list(name_partials.keys()), addr_tokens)
return
if fulls_count < 80000:
yield 0.0, fulls_count, \
dbf.lookup_by_any_name([t.token for t in name_fulls],
addr_partials.get_tokens(),
[])
penalty += 0.2
penalty += 0.4
addr_count = min(t.addr_count for t in addr_partials) if addr_partials else 50000
# Partial term too frequent. Try looking up by rare full names first.
name_fulls = self.query.get_tokens(name, qmod.TOKEN_WORD)
if name_fulls:
fulls_count = sum(t.count for t in name_fulls)
name_split = name_partials.get_num_lookup_tokens(50000, 10)
addr_split = addr_partials.get_num_lookup_tokens(30000, 5)
if fulls_count < 80000 or addr_count < 50000:
yield penalty, fulls_count / (2**len(addr_tokens)), \
self.get_full_name_ranking(name_fulls, addr_partials,
fulls_count > 30000 / max(1, len(addr_tokens)))
if name_split > 0 \
and (addr_split < 0 or name_partials.min_count() <= addr_partials.min_count()):
# lookup by name
lookup = name_partials.split_lookup(name_split, 'name_vector')
lookup.append(dbf.FieldLookup('nameaddress_vector',
addr_partials.get_tokens(), lookups.Restrict))
yield penalty, name_partials.expected_for_all_search(5), lookup
elif addr_split > 0:
# lookup by address
lookup = addr_partials.split_lookup(addr_split, 'nameaddress_vector')
lookup.append(dbf.FieldLookup('name_vector',
name_partials.get_tokens(), lookups.Restrict))
yield penalty, addr_partials.expected_for_all_search(3), lookup
elif len(name_partials) > 1:
penalty += 0.5
# To catch remaining results, lookup by name and address
# We only do this if there is a reasonable number of results expected.
exp_count = min(name_partials.min_count(), addr_partials.min_count())
exp_count = int(exp_count / (min(3, len(name_partials)) + min(3, len(addr_partials))))
if exp_count < 50000:
lookup = name_partials.split_lookup(3, 'name_vector')
lookup.extend(addr_partials.split_lookup(3, 'nameaddress_vector'))
# To catch remaining results, lookup by name and address
# We only do this if there is a reasonable number of results expected.
exp_count /= 2**len(addr_tokens)
if exp_count < 10000 and addr_count < 20000:
penalty += 0.35 * max(1 if name_fulls else 0.1,
5 - len(name_partials) - len(addr_tokens))
yield penalty, exp_count, \
self.get_name_address_ranking(list(name_partials.keys()), addr_partials)
yield penalty, exp_count, lookup
def get_name_address_ranking(self, name_tokens: List[int],
addr_partials: List[qmod.Token]) -> List[dbf.FieldLookup]:

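Laid out on its own, the strategy order of the new method is easier to follow. This sketch distills it (illustrative only; the real code first tries rare full names when both splits are negative, then retries with relaxed limits, and accumulates penalties along the way):

    # Distilled decision order of yield_address_lookups above (hypothetical
    # helper, not part of the patch).
    def pick_strategy(name_split, addr_split, name_min, addr_min):
        if name_split > 0 and (addr_split < 0 or name_min <= addr_min):
            return 'index on rarest name partials, Restrict by address'
        if addr_split > 0:
            return 'index on rarest address partials, Restrict by name'
        return 'combined name+address split lookup, only if exp_count is low'

    # The name side is rarer than the address side, so it drives the index:
    print(pick_strategy(name_split=1, addr_split=1, name_min=300, addr_min=4000))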

@@ -7,7 +7,7 @@
"""
Data structures for more complex fields in abstract search descriptions.
"""
from typing import List, Tuple, Iterator, Dict, Type
from typing import List, Tuple, Iterator, Dict, Type, cast
import dataclasses
import sqlalchemy as sa
@@ -18,6 +18,66 @@ from .query import Token
from . import db_search_lookups as lookups
class CountedTokenIDs:
""" A list of token IDs with their respective counts, sorted
from least frequent to most frequent.
If a token count is one, then statistics are likely to be unavailable
and a relatively high count is assumed instead.
"""
def __init__(self, tokens: Iterator[Token], count_column: str = 'count'):
self.tokens = list({(cast(int, getattr(t, count_column)), t.token) for t in tokens})
self.tokens.sort(key=lambda t: t[0] if t[0] > 1 else 100000)
def __len__(self) -> int:
return len(self.tokens)
def get_num_lookup_tokens(self, limit: int, fac: int) -> int:
""" Suggest the number of tokens to be used for an index lookup.
The idea here is to use as few items as possible while keeping
the number of rows returned below 'limit'; beyond that point,
rechecking the returned rows becomes more expensive than adding
another item to the index lookup. 'fac' is the factor by which
the limit is increased every time a lookup item is added.
If the list of tokens doesn't seem suitable for an index lookup
at all, -1 is returned.
"""
length = len(self.tokens)
min_count = self.tokens[0][0]
if min_count == 1:
return min(length, 3) # no statistics available, use index
for i in range(min(length, 3)):
if min_count < limit:
return i + 1
limit = limit * fac
return -1
def min_count(self) -> int:
return self.tokens[0][0]
def expected_for_all_search(self, fac: int = 5) -> int:
return int(self.tokens[0][0] / (fac**(len(self.tokens) - 1)))
def get_tokens(self) -> List[int]:
return [t[1] for t in self.tokens]
def get_head_tokens(self, num_tokens: int) -> List[int]:
return [t[1] for t in self.tokens[:num_tokens]]
def get_tail_tokens(self, first: int) -> List[int]:
return [t[1] for t in self.tokens[first:]]
def split_lookup(self, split: int, column: str) -> 'List[FieldLookup]':
lookup = [FieldLookup(column, self.get_head_tokens(split), lookups.LookupAll)]
if split < len(self.tokens):
lookup.append(FieldLookup(column, self.get_tail_tokens(split), lookups.Restrict))
return lookup
@dataclasses.dataclass
class WeightedStrings:
""" A list of strings together with a penalty.

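The CountedTokenIDs class above is the heart of the change. A self-contained toy re-run of its selection rule (invented counts, simplified tuples) shows how 'limit' and 'fac' interact:

    # Toy version of get_num_lookup_tokens; tokens are (count, id) pairs
    # sorted by count, counts invented for illustration.
    def num_lookup_tokens(tokens, limit, fac):
        min_count = tokens[0][0]
        if min_count == 1:                   # no statistics: just use the index
            return min(len(tokens), 3)
        for i in range(min(len(tokens), 3)):
            if min_count < limit:            # cheap enough with i + 1 index terms
                return i + 1
            limit *= fac                     # every extra term raises the budget
        return -1                            # too frequent even with three terms

    print(num_lookup_tokens([(500, 1), (30000, 2), (250000, 3)], 20000, 5))  # 1
    print(num_lookup_tokens([(90000, 1), (95000, 2)], 20000, 5))             # 2

split_lookup then turns that many head tokens into a LookupAll index lookup and post-filters with the remaining tokens via Restrict.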

@@ -176,10 +176,10 @@ class AddressSearch(base.AbstractSearch):
if self.postcodes:
if self.expected_count > 10000:
# Many results expected. Restrict by postcode.
tpc = conn.t.postcode
sql = sql.where(sa.select(tpc.c.postcode)
.where(tpc.c.postcode.in_(self.postcodes.values))
.where(tpc.c.country_code == t.c.country_code)
.where(t.c.centroid.within_distance(tpc.c.geometry, 0.4))
.exists())
@@ -233,6 +233,7 @@ class AddressSearch(base.AbstractSearch):
pc_near = sa.select(sa.func.min(tpc.c.geometry.ST_Distance(t.c.centroid)
* (tpc.c.rank_search - 19)))\
.where(tpc.c.postcode.in_(pcs))\
.where(tpc.c.country_code == t.c.country_code)\
.scalar_subquery()
penalty += sa.case((t.c.postcode.in_(pcs), 0.0),
else_=sa.func.coalesce(pc_near, cast(SaColumn, 2.0)))
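Both postcode hunks add the same restriction: postcode geometry may only be matched within the row's own country. For the pre-selection above, the generated SQL should roughly take the following shape (a hand-written illustration, not output from the code; 'place' is a stand-in for the searched table t, and within_distance is assumed to compile to ST_DWithin):

    # Illustrative only: approximate SQL for the modified postcode pre-selection.
    POSTCODE_PRESELECT = """
        ... WHERE EXISTS (
            SELECT postcode FROM postcode tpc
             WHERE tpc.postcode IN (:postcodes)
               AND tpc.country_code = place.country_code  -- new: same country only
               AND ST_DWithin(place.centroid, tpc.geometry, 0.4))
    """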


@@ -396,9 +396,9 @@ def make_counted_searches(name_part, name_full, address_part, address_full,
MyToken(0, 101, name_full, 1, 'name_full'))
for i in range(num_address_parts):
q.add_token(TokenRange(i + 1, i + 2), qmod.TOKEN_PARTIAL,
MyToken(0.5, 2, address_part, 1, 'address_part'))
MyToken(0.5, 2, 1, address_part, 'address_part'))
q.add_token(TokenRange(i + 1, i + 2), qmod.TOKEN_WORD,
MyToken(0, 102, address_full, 1, 'address_full'))
MyToken(0, 102, 1, address_full, 'address_full'))
builder = SearchBuilder(q, SearchDetails())
@@ -421,7 +421,7 @@ def test_infrequent_partials_in_name():
def test_frequent_partials_in_name_and_address():
searches = make_counted_searches(9999, 1, 9999, 1)
searches = make_counted_searches(20001, 1, 20001, 1)
assert len(searches) == 2
@@ -431,11 +431,11 @@ def test_frequent_partials_in_name_and_address():
assert set((s.column, s.lookup_type.__name__) for s in searches[0].lookups) == \
{('name_vector', 'LookupAny'), ('nameaddress_vector', 'Restrict')}
assert set((s.column, s.lookup_type.__name__) for s in searches[1].lookups) == \
{('nameaddress_vector', 'LookupAll'), ('name_vector', 'LookupAll')}
{('nameaddress_vector', 'Restrict'), ('name_vector', 'LookupAll')}
def test_too_frequent_partials_in_name_and_address():
searches = make_counted_searches(20000, 1, 10000, 1)
searches = make_counted_searches(50001, 1, 30001, 1)
assert len(searches) == 1
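A note on the new constants: they appear to track the thresholds introduced above. 20001 just exceeds the 20000 limit used for name partials in yield_address_lookups, while 50001 and 30001 exceed the relaxed 50000 and 30000 limits that apply once full names have been tried, so each test forces the intended fallback path.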