mirror of
https://github.com/osm-search/Nominatim.git
synced 2026-03-13 14:24:08 +00:00
only use most infrequent tokens for search index lookup
This commit is contained in:
@@ -155,29 +155,36 @@ class SearchBuilder:
|
|||||||
""" Build a simple address search for special entries where the
|
""" Build a simple address search for special entries where the
|
||||||
housenumber is the main name token.
|
housenumber is the main name token.
|
||||||
"""
|
"""
|
||||||
sdata.lookups = [dbf.FieldLookup('name_vector', [t.token for t in hnrs], lookups.LookupAny)]
|
partials = dbf.CountedTokenIDs((t for trange in address
|
||||||
expected_count = sum(t.count for t in hnrs)
|
for t in self.query.iter_partials(trange)),
|
||||||
|
'addr_count')
|
||||||
partials = {t.token: t.addr_count for trange in address
|
|
||||||
for t in self.query.iter_partials(trange)}
|
|
||||||
|
|
||||||
if not partials:
|
if not partials:
|
||||||
# can happen when none of the partials is indexed
|
# can happen when none of the partials is indexed
|
||||||
return
|
return
|
||||||
|
|
||||||
if expected_count < 8000:
|
expected_count = sum(t.count for t in hnrs)
|
||||||
sdata.lookups.append(dbf.FieldLookup('nameaddress_vector',
|
hnr_tokens = [t.token for t in hnrs]
|
||||||
list(partials), lookups.Restrict))
|
|
||||||
elif len(partials) != 1 or list(partials.values())[0] < 10000:
|
if expected_count < 10000:
|
||||||
sdata.lookups.append(dbf.FieldLookup('nameaddress_vector',
|
sdata.lookups = [dbf.FieldLookup('name_vector', hnr_tokens, lookups.LookupAny),
|
||||||
list(partials), lookups.LookupAll))
|
dbf.FieldLookup('nameaddress_vector',
|
||||||
|
partials.get_tokens(),
|
||||||
|
lookups.Restrict)]
|
||||||
else:
|
else:
|
||||||
addr_fulls = [t.token for t
|
split = partials.get_num_lookup_tokens(20000, 5)
|
||||||
in self.query.get_tokens(address[0], qmod.TOKEN_WORD)]
|
if split > 0:
|
||||||
if len(addr_fulls) > 5:
|
sdata.lookups = partials.split_lookup(split, 'nameaddress_vector')
|
||||||
return
|
sdata.lookups.append(
|
||||||
sdata.lookups.append(
|
dbf.FieldLookup('name_vector', hnr_tokens, lookups.Restrict))
|
||||||
dbf.FieldLookup('nameaddress_vector', addr_fulls, lookups.LookupAny))
|
else:
|
||||||
|
addr_fulls = [t.token for t in
|
||||||
|
self.query.get_tokens(address[0], qmod.TOKEN_WORD)]
|
||||||
|
if len(addr_fulls) > 5:
|
||||||
|
return
|
||||||
|
sdata.lookups = [
|
||||||
|
dbf.FieldLookup('name_vector', hnr_tokens, lookups.LookupAny),
|
||||||
|
dbf.FieldLookup('nameaddress_vector', addr_fulls, lookups.LookupAny)]
|
||||||
|
|
||||||
sdata.housenumbers = dbf.WeightedStrings([], [])
|
sdata.housenumbers = dbf.WeightedStrings([], [])
|
||||||
yield dbs.PlaceSearch(0.05, sdata, expected_count, True)
|
yield dbs.PlaceSearch(0.05, sdata, expected_count, True)
|
||||||
@@ -205,37 +212,88 @@ class SearchBuilder:
|
|||||||
be searched for. This takes into account how frequent the terms
|
be searched for. This takes into account how frequent the terms
|
||||||
are and tries to find a lookup that optimizes index use.
|
are and tries to find a lookup that optimizes index use.
|
||||||
"""
|
"""
|
||||||
|
name_partials = dbf.CountedTokenIDs(self.query.iter_partials(name))
|
||||||
|
addr_partials = dbf.CountedTokenIDs((t for r in address
|
||||||
|
for t in self.query.iter_partials(r)),
|
||||||
|
'addr_count')
|
||||||
|
|
||||||
|
if not addr_partials:
|
||||||
|
yield from self.yield_name_only_lookups(name_partials, name)
|
||||||
|
else:
|
||||||
|
yield from self.yield_address_lookups(name_partials, addr_partials, name)
|
||||||
|
|
||||||
|
def yield_name_only_lookups(self, partials: dbf.CountedTokenIDs, name: qmod.TokenRange
|
||||||
|
) -> Iterator[Tuple[float, int, List[dbf.FieldLookup]]]:
|
||||||
|
""" Yield the best lookup for a name-only search.
|
||||||
|
"""
|
||||||
|
split = partials.get_num_lookup_tokens(30000, 10)
|
||||||
|
|
||||||
|
if split > 0:
|
||||||
|
yield 0.0, partials.expected_for_all_search(10), \
|
||||||
|
partials.split_lookup(split, 'name_vector')
|
||||||
|
else:
|
||||||
|
# lots of results expected: try lookup by full names first
|
||||||
|
name_fulls = list(filter(lambda t: t.count < 50000,
|
||||||
|
self.query.get_tokens(name, qmod.TOKEN_WORD)))
|
||||||
|
if name_fulls:
|
||||||
|
yield 0.0, sum(t.count for t in name_fulls), \
|
||||||
|
dbf.lookup_by_any_name([t.token for t in name_fulls], [], [])
|
||||||
|
|
||||||
|
# look the name up by its partials
|
||||||
|
exp_count = partials.expected_for_all_search(10)
|
||||||
|
if exp_count < 50000:
|
||||||
|
yield 1.0, exp_count, \
|
||||||
|
[dbf.FieldLookup('name_vector', partials.get_tokens(), lookups.LookupAll)]
|
||||||
|
|
||||||
|
def yield_address_lookups(self, name_partials: dbf.CountedTokenIDs,
|
||||||
|
addr_partials: dbf.CountedTokenIDs, name: qmod.TokenRange,
|
||||||
|
) -> Iterator[Tuple[float, int, List[dbf.FieldLookup]]]:
|
||||||
penalty = 0.0 # extra penalty
|
penalty = 0.0 # extra penalty
|
||||||
name_partials = {t.token: t for t in self.query.iter_partials(name)}
|
|
||||||
|
|
||||||
addr_partials = [t for r in address for t in self.query.iter_partials(r)]
|
name_split = name_partials.get_num_lookup_tokens(20000, 6)
|
||||||
addr_tokens = list({t.token for t in addr_partials})
|
addr_split = addr_partials.get_num_lookup_tokens(10000, 3)
|
||||||
|
|
||||||
exp_count = min(t.count for t in name_partials.values()) / (3**(len(name_partials) - 1))
|
if name_split < 0 and addr_split < 0:
|
||||||
|
# Partial term too frequent. Try looking up by rare full names first.
|
||||||
|
name_fulls = self.query.get_tokens(name, qmod.TOKEN_WORD)
|
||||||
|
if name_fulls:
|
||||||
|
fulls_count = sum(t.count for t in name_fulls)
|
||||||
|
|
||||||
if (len(name_partials) > 3 or exp_count < 8000):
|
if fulls_count < 80000:
|
||||||
yield penalty, exp_count, dbf.lookup_by_names(list(name_partials.keys()), addr_tokens)
|
yield 0.0, fulls_count, \
|
||||||
return
|
dbf.lookup_by_any_name([t.token for t in name_fulls],
|
||||||
|
addr_partials.get_tokens(),
|
||||||
|
[])
|
||||||
|
penalty += 0.2
|
||||||
|
penalty += 0.4
|
||||||
|
|
||||||
addr_count = min(t.addr_count for t in addr_partials) if addr_partials else 50000
|
name_split = name_partials.get_num_lookup_tokens(50000, 10)
|
||||||
# Partial term to frequent. Try looking up by rare full names first.
|
addr_split = addr_partials.get_num_lookup_tokens(30000, 5)
|
||||||
name_fulls = self.query.get_tokens(name, qmod.TOKEN_WORD)
|
|
||||||
if name_fulls:
|
|
||||||
fulls_count = sum(t.count for t in name_fulls)
|
|
||||||
|
|
||||||
if fulls_count < 80000 or addr_count < 50000:
|
if name_split > 0 \
|
||||||
yield penalty, fulls_count / (2**len(addr_tokens)), \
|
and (addr_split < 0 or name_partials.min_count() <= addr_partials.min_count()):
|
||||||
self.get_full_name_ranking(name_fulls, addr_partials,
|
# lookup by name
|
||||||
fulls_count > 30000 / max(1, len(addr_tokens)))
|
lookup = name_partials.split_lookup(name_split, 'name_vector')
|
||||||
|
lookup.append(dbf.FieldLookup('nameaddress_vector',
|
||||||
|
addr_partials.get_tokens(), lookups.Restrict))
|
||||||
|
yield penalty, name_partials.expected_for_all_search(5), lookup
|
||||||
|
elif addr_split > 0:
|
||||||
|
# lookup by address
|
||||||
|
lookup = addr_partials.split_lookup(addr_split, 'nameaddress_vector')
|
||||||
|
lookup.append(dbf.FieldLookup('name_vector',
|
||||||
|
name_partials.get_tokens(), lookups.Restrict))
|
||||||
|
yield penalty, addr_partials.expected_for_all_search(3), lookup
|
||||||
|
elif len(name_partials) > 1:
|
||||||
|
penalty += 0.5
|
||||||
|
# To catch remaining results, lookup by name and address
|
||||||
|
# We only do this if there is a reasonable number of results expected.
|
||||||
|
exp_count = min(name_partials.min_count(), addr_partials.min_count())
|
||||||
|
exp_count = int(exp_count / (min(3, len(name_partials)) + min(3, len(addr_partials))))
|
||||||
|
if exp_count < 50000:
|
||||||
|
lookup = name_partials.split_lookup(3, 'name_vector')
|
||||||
|
lookup.extend(addr_partials.split_lookup(3, 'nameaddress_vector'))
|
||||||
|
|
||||||
# To catch remaining results, lookup by name and address
|
yield penalty, exp_count, lookup
|
||||||
# We only do this if there is a reasonable number of results expected.
|
|
||||||
exp_count /= 2**len(addr_tokens)
|
|
||||||
if exp_count < 10000 and addr_count < 20000:
|
|
||||||
penalty += 0.35 * max(1 if name_fulls else 0.1,
|
|
||||||
5 - len(name_partials) - len(addr_tokens))
|
|
||||||
yield penalty, exp_count, \
|
|
||||||
self.get_name_address_ranking(list(name_partials.keys()), addr_partials)
|
|
||||||
|
|
||||||
def get_name_address_ranking(self, name_tokens: List[int],
|
def get_name_address_ranking(self, name_tokens: List[int],
|
||||||
addr_partials: List[qmod.Token]) -> List[dbf.FieldLookup]:
|
addr_partials: List[qmod.Token]) -> List[dbf.FieldLookup]:
|
||||||
|
|||||||
@@ -7,7 +7,7 @@
|
|||||||
"""
|
"""
|
||||||
Data structures for more complex fields in abstract search descriptions.
|
Data structures for more complex fields in abstract search descriptions.
|
||||||
"""
|
"""
|
||||||
from typing import List, Tuple, Iterator, Dict, Type
|
from typing import List, Tuple, Iterator, Dict, Type, cast
|
||||||
import dataclasses
|
import dataclasses
|
||||||
|
|
||||||
import sqlalchemy as sa
|
import sqlalchemy as sa
|
||||||
@@ -18,6 +18,66 @@ from .query import Token
|
|||||||
from . import db_search_lookups as lookups
|
from . import db_search_lookups as lookups
|
||||||
|
|
||||||
|
|
||||||
|
class CountedTokenIDs:
    """ A list of token IDs with their respective counts, sorted
        from least frequent to most frequent.

        If a token count is one, then statistics are likely to be unavailable
        and a relatively high count is assumed instead.
    """

    def __init__(self, tokens: Iterator[Token], count_column: str = 'count'):
        """ Collect the distinct (count, token ID) pairs from 'tokens'.

            'count_column' names the token attribute the count is read from.
        """
        # Going through a set drops duplicate (count, token ID) pairs.
        self.tokens = list({(cast(int, getattr(t, count_column)), t.token) for t in tokens})
        # Counts of one or less are ranked as if they were 100000, so that
        # tokens without usable statistics do not look like the rarest ones.
        self.tokens.sort(key=lambda t: t[0] if t[0] > 1 else 100000)

    def __len__(self) -> int:
        """ Return the number of distinct (count, token ID) pairs.
        """
        return len(self.tokens)

    def get_num_lookup_tokens(self, limit: int, fac: int) -> int:
        """ Suggest the number of tokens to be used for an index lookup.

            The idea here is to use as few items as possible while keeping
            the number of rows returned below 'limit'. Above that limit,
            rechecking the returned rows is considered more expensive than
            adding another item to the index lookup. 'fac' is the factor
            by which the limit is increased every time a lookup item is
            added. At most three tokens are ever suggested.

            If the list of tokens doesn't seem suitable at all for index
            lookup, -1 is returned. This includes an empty list.
        """
        if not self.tokens:
            # Nothing to look up. Without this guard the indexing below
            # would fail with an IndexError.
            return -1

        length = len(self.tokens)
        min_count = self.tokens[0][0]
        if min_count == 1:
            return min(length, 3)  # no statistics available, use index

        for i in range(min(length, 3)):
            if min_count < limit:
                return i + 1
            # Every additional lookup item makes a higher count acceptable.
            limit = limit * fac

        return -1

    def min_count(self) -> int:
        """ Return the count of the first token in sort order.

            The list must not be empty, otherwise an IndexError is raised.
        """
        return self.tokens[0][0]

    def expected_for_all_search(self, fac: int = 5) -> int:
        """ Return the count of the first token in sort order, divided by
            'fac' once for every further token in the list and truncated
            to an integer.

            The list must not be empty, otherwise an IndexError is raised.
        """
        return int(self.tokens[0][0] / (fac**(len(self.tokens) - 1)))

    def get_tokens(self) -> List[int]:
        """ Return all token IDs in sort order.
        """
        return [t[1] for t in self.tokens]

    def get_head_tokens(self, num_tokens: int) -> List[int]:
        """ Return the token IDs of the first 'num_tokens' entries.
        """
        return [t[1] for t in self.tokens[:num_tokens]]

    def get_tail_tokens(self, first: int) -> List[int]:
        """ Return the token IDs of all entries from position 'first' on.
        """
        return [t[1] for t in self.tokens[first:]]

    def split_lookup(self, split: int, column: str) -> 'List[FieldLookup]':
        """ Create the field lookups on 'column' for a list split at
            position 'split'.

            The first 'split' tokens go into a LookupAll lookup. The
            remaining tokens, if there are any, go into an additional
            Restrict lookup.
        """
        lookup = [FieldLookup(column, self.get_head_tokens(split), lookups.LookupAll)]
        if split < len(self.tokens):
            lookup.append(FieldLookup(column, self.get_tail_tokens(split), lookups.Restrict))
        return lookup
|
||||||
|
|
||||||
|
|
||||||
@dataclasses.dataclass
|
@dataclasses.dataclass
|
||||||
class WeightedStrings:
|
class WeightedStrings:
|
||||||
""" A list of strings together with a penalty.
|
""" A list of strings together with a penalty.
|
||||||
|
|||||||
Reference in New Issue
Block a user