Merge pull request #3781 from lonvia/partial-address-index-lookup
Reduce number of tokens used for index lookups during search
@@ -155,29 +155,36 @@ class SearchBuilder:
         """ Build a simple address search for special entries where the
             housenumber is the main name token.
         """
-        sdata.lookups = [dbf.FieldLookup('name_vector', [t.token for t in hnrs], lookups.LookupAny)]
-        expected_count = sum(t.count for t in hnrs)
-
-        partials = {t.token: t.addr_count for trange in address
-                    for t in self.query.iter_partials(trange)}
+        partials = dbf.CountedTokenIDs((t for trange in address
+                                        for t in self.query.iter_partials(trange)),
+                                       'addr_count')
 
         if not partials:
             # can happen when none of the partials is indexed
             return
 
-        if expected_count < 8000:
-            sdata.lookups.append(dbf.FieldLookup('nameaddress_vector',
-                                                 list(partials), lookups.Restrict))
-        elif len(partials) != 1 or list(partials.values())[0] < 10000:
-            sdata.lookups.append(dbf.FieldLookup('nameaddress_vector',
-                                                 list(partials), lookups.LookupAll))
-        else:
-            addr_fulls = [t.token for t
-                          in self.query.get_tokens(address[0], qmod.TOKEN_WORD)]
-            if len(addr_fulls) > 5:
-                return
-            sdata.lookups.append(
-                dbf.FieldLookup('nameaddress_vector', addr_fulls, lookups.LookupAny))
+        expected_count = sum(t.count for t in hnrs)
+        hnr_tokens = [t.token for t in hnrs]
+
+        if expected_count < 10000:
+            sdata.lookups = [dbf.FieldLookup('name_vector', hnr_tokens, lookups.LookupAny),
+                             dbf.FieldLookup('nameaddress_vector',
+                                             partials.get_tokens(),
+                                             lookups.Restrict)]
+        else:
+            split = partials.get_num_lookup_tokens(20000, 5)
+            if split > 0:
+                sdata.lookups = partials.split_lookup(split, 'nameaddress_vector')
+                sdata.lookups.append(
+                    dbf.FieldLookup('name_vector', hnr_tokens, lookups.Restrict))
+            else:
+                addr_fulls = [t.token for t in
+                              self.query.get_tokens(address[0], qmod.TOKEN_WORD)]
+                if len(addr_fulls) > 5:
+                    return
+                sdata.lookups = [
+                    dbf.FieldLookup('name_vector', hnr_tokens, lookups.LookupAny),
+                    dbf.FieldLookup('nameaddress_vector', addr_fulls, lookups.LookupAny)]
 
         sdata.housenumbers = dbf.WeightedStrings([], [])
         yield dbs.PlaceSearch(0.05, sdata, expected_count, True)
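A note on reading the new branch structure: the three lookup modes differ in
how they use the inverted index. The following stand-alone sketch models their
set semantics (an illustration of my reading of the diff, not the real
implementations in db_search_lookups):

    # Hypothetical model: whether a row matches a FieldLookup in each mode.
    def lookup_any(row_tokens: set, tokens: list) -> bool:
        # index-driven: the row contains at least one of the tokens
        return any(t in row_tokens for t in tokens)

    def lookup_all(row_tokens: set, tokens: list) -> bool:
        # index-driven: the row contains every token
        return all(t in row_tokens for t in tokens)

    def restrict(row_tokens: set, tokens: list) -> bool:
        # same predicate as lookup_all, but applied as a per-row recheck
        # on rows already found through the other lookups, without an
        # index scan of its own
        return all(t in row_tokens for t in tokens)

For cheap housenumbers (expected_count < 10000) the address partials are only
rechecked; otherwise split_lookup() drives the index scan with the rarest
address partials and rechecks the frequent rest.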
@@ -205,37 +212,88 @@ class SearchBuilder:
             be searched for. This takes into account how frequent the terms
             are and tries to find a lookup that optimizes index use.
         """
-        penalty = 0.0  # extra penalty
-        name_partials = {t.token: t for t in self.query.iter_partials(name)}
-
-        addr_partials = [t for r in address for t in self.query.iter_partials(r)]
-        addr_tokens = list({t.token for t in addr_partials})
-
-        exp_count = min(t.count for t in name_partials.values()) / (3**(len(name_partials) - 1))
-
-        if (len(name_partials) > 3 or exp_count < 8000):
-            yield penalty, exp_count, dbf.lookup_by_names(list(name_partials.keys()), addr_tokens)
-            return
-
-        addr_count = min(t.addr_count for t in addr_partials) if addr_partials else 50000
-        # Partial term too frequent. Try looking up by rare full names first.
-        name_fulls = self.query.get_tokens(name, qmod.TOKEN_WORD)
-        if name_fulls:
-            fulls_count = sum(t.count for t in name_fulls)
-
-            if fulls_count < 80000 or addr_count < 50000:
-                yield penalty, fulls_count / (2**len(addr_tokens)), \
-                    self.get_full_name_ranking(name_fulls, addr_partials,
-                                               fulls_count > 30000 / max(1, len(addr_tokens)))
-            penalty += 0.2
-        penalty += 0.4
-
-        # To catch remaining results, lookup by name and address
-        # We only do this if there is a reasonable number of results expected.
-        exp_count /= 2**len(addr_tokens)
-        if exp_count < 10000 and addr_count < 20000:
-            penalty += 0.35 * max(1 if name_fulls else 0.1,
-                                  5 - len(name_partials) - len(addr_tokens))
-            yield penalty, exp_count, \
-                self.get_name_address_ranking(list(name_partials.keys()), addr_partials)
+        name_partials = dbf.CountedTokenIDs(self.query.iter_partials(name))
+        addr_partials = dbf.CountedTokenIDs((t for r in address
+                                             for t in self.query.iter_partials(r)),
+                                            'addr_count')
+
+        if not addr_partials:
+            yield from self.yield_name_only_lookups(name_partials, name)
+        else:
+            yield from self.yield_address_lookups(name_partials, addr_partials, name)
+
+    def yield_name_only_lookups(self, partials: dbf.CountedTokenIDs, name: qmod.TokenRange
+                                ) -> Iterator[Tuple[float, int, List[dbf.FieldLookup]]]:
+        """ Yield the best lookup for a name-only search.
+        """
+        split = partials.get_num_lookup_tokens(30000, 6)
+
+        if split > 0:
+            yield 0.0, partials.expected_for_all_search(5), \
+                partials.split_lookup(split, 'name_vector')
+        else:
+            # lots of results expected: try lookup by full names first
+            name_fulls = list(filter(lambda t: t.count < 50000,
+                                     self.query.get_tokens(name, qmod.TOKEN_WORD)))
+            if name_fulls:
+                yield 0.0, sum(t.count for t in name_fulls), \
+                    dbf.lookup_by_any_name([t.token for t in name_fulls], [], [])
+
+            # look the name up by its partials
+            exp_count = partials.expected_for_all_search(5)
+            if exp_count < 50000:
+                yield 1.0, exp_count, \
+                    [dbf.FieldLookup('name_vector', partials.get_tokens(), lookups.LookupAll)]
+
+    def yield_address_lookups(self, name_partials: dbf.CountedTokenIDs,
+                              addr_partials: dbf.CountedTokenIDs, name: qmod.TokenRange,
+                              ) -> Iterator[Tuple[float, int, List[dbf.FieldLookup]]]:
+        penalty = 0.0  # extra penalty
+        name_split = name_partials.get_num_lookup_tokens(20000, 6)
+        addr_split = addr_partials.get_num_lookup_tokens(10000, 3)
+
+        if name_split < 0 and addr_split < 0:
+            # Partial term too frequent. Try looking up by rare full names first.
+            name_fulls = self.query.get_tokens(name, qmod.TOKEN_WORD)
+            if name_fulls:
+                fulls_count = sum(t.count for t in name_fulls)
+                if fulls_count < 80000:
+                    yield 0.0, fulls_count, \
+                        dbf.lookup_by_any_name([t.token for t in name_fulls],
+                                               addr_partials.get_tokens(),
+                                               [])
+                penalty += 0.2
+            penalty += 0.4
+            name_split = name_partials.get_num_lookup_tokens(50000, 10)
+            addr_split = addr_partials.get_num_lookup_tokens(30000, 5)
+
+        if name_split > 0 \
+           and (addr_split < 0 or name_partials.min_count() <= addr_partials.min_count()):
+            # lookup by name
+            lookup = name_partials.split_lookup(name_split, 'name_vector')
+            lookup.append(dbf.FieldLookup('nameaddress_vector',
+                                          addr_partials.get_tokens(), lookups.Restrict))
+            yield penalty, name_partials.expected_for_all_search(5), lookup
+        elif addr_split > 0:
+            # lookup by address
+            lookup = addr_partials.split_lookup(addr_split, 'nameaddress_vector')
+            lookup.append(dbf.FieldLookup('name_vector',
+                                          name_partials.get_tokens(), lookups.Restrict))
+            yield penalty, addr_partials.expected_for_all_search(3), lookup
+        elif len(name_partials) > 1:
+            penalty += 0.5
+            # To catch remaining results, lookup by name and address
+            # We only do this if there is a reasonable number of results expected.
+            exp_count = min(name_partials.min_count(), addr_partials.min_count())
+            exp_count = int(exp_count / (min(3, len(name_partials)) + min(3, len(addr_partials))))
+            if exp_count < 50000:
+                lookup = name_partials.split_lookup(3, 'name_vector')
+                lookup.extend(addr_partials.split_lookup(3, 'nameaddress_vector'))
+                yield penalty, exp_count, lookup
 
     def get_name_address_ranking(self, name_tokens: List[int],
                                  addr_partials: List[qmod.Token]) -> List[dbf.FieldLookup]:
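A worked example of the frequency arithmetic driving the choice above, with
made-up token counts (stand-alone Python mirroring get_num_lookup_tokens and
expected_for_all_search from the new CountedTokenIDs class further down):

    name_counts = sorted([70000, 150])   # counts of the name partials, rarest first

    # get_num_lookup_tokens(20000, 6): the rarest partial (150) already stays
    # below the 20000-row limit, so one token suffices for the index scan.
    limit, fac = 20000, 6
    assert name_counts[0] < limit        # name_split == 1

    # expected_for_all_search(5): searching with all partials is estimated
    # to return roughly 150 / 5**(2-1) = 30 rows.
    expected = name_counts[0] // 5 ** (len(name_counts) - 1)
    assert expected == 30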
@@ -7,7 +7,7 @@
 """
 Data structures for more complex fields in abstract search descriptions.
 """
-from typing import List, Tuple, Iterator, Dict, Type
+from typing import List, Tuple, Iterator, Dict, Type, cast
 import dataclasses
 
 import sqlalchemy as sa
@@ -18,6 +18,66 @@ from .query import Token
 from . import db_search_lookups as lookups
 
 
+class CountedTokenIDs:
+    """ A list of token IDs with their respective counts, sorted
+        from least frequent to most frequent.
+
+        If a token count is one, then statistics are likely to be unavailable
+        and a relatively high count is assumed instead.
+    """
+
+    def __init__(self, tokens: Iterator[Token], count_column: str = 'count'):
+        self.tokens = list({(cast(int, getattr(t, count_column)), t.token) for t in tokens})
+        self.tokens.sort(key=lambda t: t[0] if t[0] > 1 else 100000)
+
+    def __len__(self) -> int:
+        return len(self.tokens)
+
+    def get_num_lookup_tokens(self, limit: int, fac: int) -> int:
+        """ Suggest the number of tokens to be used for an index lookup.
+            The idea here is to use as few items as possible while making
+            sure the number of rows returned stays below 'limit', which
+            would make rechecking the returned rows more expensive than
+            adding another item to the index lookup. 'fac' is the factor
+            by which the limit is increased every time a lookup item is added.
+
+            If the list of tokens doesn't seem suitable at all for an index
+            lookup, -1 is returned.
+        """
+        length = len(self.tokens)
+        min_count = self.tokens[0][0]
+        if min_count == 1:
+            return min(length, 3)  # no statistics available, use index
+
+        for i in range(min(length, 3)):
+            if min_count < limit:
+                return i + 1
+            limit = limit * fac
+
+        return -1
+
+    def min_count(self) -> int:
+        return self.tokens[0][0]
+
+    def expected_for_all_search(self, fac: int = 5) -> int:
+        return int(self.tokens[0][0] / (fac**(len(self.tokens) - 1)))
+
+    def get_tokens(self) -> List[int]:
+        return [t[1] for t in self.tokens]
+
+    def get_head_tokens(self, num_tokens: int) -> List[int]:
+        return [t[1] for t in self.tokens[:num_tokens]]
+
+    def get_tail_tokens(self, first: int) -> List[int]:
+        return [t[1] for t in self.tokens[first:]]
+
+    def split_lookup(self, split: int, column: str) -> 'List[FieldLookup]':
+        lookup = [FieldLookup(column, self.get_head_tokens(split), lookups.LookupAll)]
+        if split < len(self.tokens):
+            lookup.append(FieldLookup(column, self.get_tail_tokens(split), lookups.Restrict))
+        return lookup
+
+
 @dataclasses.dataclass
 class WeightedStrings:
     """ A list of strings together with a penalty.
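A minimal usage sketch for the new class (the module path
nominatim_api.search.db_search_fields is my assumption; the stand-in token
type only needs the 'token' and 'count' attributes the constructor reads):

    from dataclasses import dataclass

    from nominatim_api.search.db_search_fields import CountedTokenIDs

    @dataclass(frozen=True)
    class FakeToken:                 # hypothetical stand-in for query.Token
        token: int
        count: int

    counted = CountedTokenIDs(iter([FakeToken(11, 200),
                                    FakeToken(12, 45000),
                                    FakeToken(13, 900)]))

    assert counted.get_tokens() == [11, 13, 12]           # rarest first
    assert counted.min_count() == 200
    assert counted.get_num_lookup_tokens(30000, 6) == 1   # 200 < 30000
    assert counted.expected_for_all_search(5) == 8        # int(200 / 5**2)

    # split_lookup(1, 'name_vector') would then scan the index for token 11
    # only (LookupAll) and recheck 13 and 12 per row (Restrict).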
@@ -176,10 +176,10 @@ class AddressSearch(base.AbstractSearch):
 
         if self.postcodes:
             if self.expected_count > 10000:
                 # Many results expected. Restrict by postcode.
                 tpc = conn.t.postcode
                 sql = sql.where(sa.select(tpc.c.postcode)
                                 .where(tpc.c.postcode.in_(self.postcodes.values))
                                 .where(tpc.c.country_code == t.c.country_code)
                                 .where(t.c.centroid.within_distance(tpc.c.geometry, 0.4))
                                 .exists())
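For orientation, the restriction above should compile to SQL roughly like the
following (the table and column names are my assumption of the schema, and
within_distance presumably maps to ST_DWithin):

    AND EXISTS (SELECT p.postcode FROM location_postcode p
                WHERE p.postcode IN (...)
                  AND p.country_code = s.country_code
                  AND ST_DWithin(s.centroid, p.geometry, 0.4))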
@@ -233,6 +233,7 @@ class AddressSearch(base.AbstractSearch):
             pc_near = sa.select(sa.func.min(tpc.c.geometry.ST_Distance(t.c.centroid)
                                             * (tpc.c.rank_search - 19)))\
                 .where(tpc.c.postcode.in_(pcs))\
                 .where(tpc.c.country_code == t.c.country_code)\
                 .scalar_subquery()
             penalty += sa.case((t.c.postcode.in_(pcs), 0.0),
                                else_=sa.func.coalesce(pc_near, cast(SaColumn, 2.0)))
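The penalty expression, restated as plain Python over sample values (a sketch
of my reading; pc_near is the scalar subquery, and rank_search presumably
grows as postcode areas get smaller):

    def postcode_penalty(matches: bool, pc_near: 'float | None') -> float:
        # 0.0 for an exact postcode match, otherwise the distance to the
        # nearest candidate postcode weighted by (rank_search - 19),
        # and 2.0 when no candidate postcode exists at all.
        if matches:
            return 0.0
        return pc_near if pc_near is not None else 2.0

    # e.g. a candidate postcode of rank_search 23 at 0.05 degrees distance:
    assert postcode_penalty(False, 0.05 * (23 - 19)) == 0.2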
@@ -396,9 +396,9 @@ def make_counted_searches(name_part, name_full, address_part, address_full,
                 MyToken(0, 101, name_full, 1, 'name_full'))
     for i in range(num_address_parts):
         q.add_token(TokenRange(i + 1, i + 2), qmod.TOKEN_PARTIAL,
-                    MyToken(0.5, 2, address_part, 1, 'address_part'))
+                    MyToken(0.5, 2, 1, address_part, 'address_part'))
         q.add_token(TokenRange(i + 1, i + 2), qmod.TOKEN_WORD,
-                    MyToken(0, 102, address_full, 1, 'address_full'))
+                    MyToken(0, 102, 1, address_full, 'address_full'))
 
     builder = SearchBuilder(q, SearchDetails())
@@ -421,7 +421,7 @@ def test_infrequent_partials_in_name():
 
 
 def test_frequent_partials_in_name_and_address():
-    searches = make_counted_searches(9999, 1, 9999, 1)
+    searches = make_counted_searches(20001, 1, 20001, 1)
 
     assert len(searches) == 2
 
 
@@ -431,11 +431,11 @@ def test_frequent_partials_in_name_and_address():
     assert set((s.column, s.lookup_type.__name__) for s in searches[0].lookups) == \
         {('name_vector', 'LookupAny'), ('nameaddress_vector', 'Restrict')}
     assert set((s.column, s.lookup_type.__name__) for s in searches[1].lookups) == \
-        {('nameaddress_vector', 'LookupAll'), ('name_vector', 'LookupAll')}
+        {('nameaddress_vector', 'Restrict'), ('name_vector', 'LookupAll')}
 
 
 def test_too_frequent_partials_in_name_and_address():
-    searches = make_counted_searches(20000, 1, 10000, 1)
+    searches = make_counted_searches(50001, 1, 30001, 1)
 
     assert len(searches) == 1
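The updated fixtures sit just past the new thresholds: 20001 exceeds the
20000-row limit that get_num_lookup_tokens(20000, 6) applies to a single name
partial, while 50001 and 30001 also exceed the relaxed 50000/30000 limits of
the retry, so the tests pin down exactly when the builder falls back to
full-name lookups. The boundary arithmetic behind the chosen values:

    assert not 20001 < 20000     # first attempt fails for the name partial
    assert 20001 < 50000         # the relaxed retry succeeds -> 2 searches
    assert not 50001 < 50000     # the retry fails too -> only 1 search left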