Merge pull request #3169 from lonvia/tweak-search-with-frequent-names

Further tweak detection of queries that would return a massive amount of results
This commit is contained in:
Sarah Hoffmann
2023-08-24 14:22:35 +02:00
committed by GitHub
3 changed files with 26 additions and 27 deletions

View File

@@ -212,39 +212,27 @@ class SearchBuilder:
yield penalty, exp_count, dbf.lookup_by_names(name_tokens, addr_tokens) yield penalty, exp_count, dbf.lookup_by_names(name_tokens, addr_tokens)
return return
exp_count = exp_count / (2**len(addr_partials)) if addr_partials else exp_count
# Partial term too frequent. Try looking up by rare full names first. # Partial term too frequent. Try looking up by rare full names first.
name_fulls = self.query.get_tokens(name, TokenType.WORD) name_fulls = self.query.get_tokens(name, TokenType.WORD)
rare_names = list(filter(lambda t: t.count < 10000, name_fulls)) fulls_count = sum(t.count for t in name_fulls) / (2**len(addr_partials))
# At this point drop unindexed partials from the address. # At this point drop unindexed partials from the address.
# This might yield wrong results, nothing we can do about that. # This might yield wrong results, nothing we can do about that.
if not partials_indexed: if not partials_indexed:
addr_tokens = [t.token for t in addr_partials if t.is_indexed] addr_tokens = [t.token for t in addr_partials if t.is_indexed]
penalty += 1.2 * sum(t.penalty for t in addr_partials if not t.is_indexed) penalty += 1.2 * sum(t.penalty for t in addr_partials if not t.is_indexed)
if rare_names: # Any of the full names applies with all of the partials from the address
# Any of the full names applies with all of the partials from the address yield penalty, fulls_count,\
yield penalty, sum(t.count for t in rare_names),\ dbf.lookup_by_any_name([t.token for t in name_fulls], addr_tokens,
dbf.lookup_by_any_name([t.token for t in rare_names], addr_tokens) 'restrict' if fulls_count < 10000 else 'lookup_all')
# To catch remaining results, lookup by name and address # To catch remaining results, lookup by name and address
# We only do this if there is a reasonable number of results expected. # We only do this if there is a reasonable number of results expected.
if exp_count < 10000: exp_count = exp_count / (2**len(addr_partials)) if addr_partials else exp_count
if all(t.is_indexed for t in name_partials): if exp_count < 10000 and all(t.is_indexed for t in name_partials):
lookup = [dbf.FieldLookup('name_vector', name_tokens, 'lookup_all')] lookup = [dbf.FieldLookup('name_vector', name_tokens, 'lookup_all')]
else:
# we don't have the partials, try with the non-rare names
non_rare_names = [t.token for t in name_fulls if t.count >= 10000]
if not non_rare_names:
return
lookup = [dbf.FieldLookup('name_vector', non_rare_names, 'lookup_any')]
if addr_tokens: if addr_tokens:
lookup.append(dbf.FieldLookup('nameaddress_vector', addr_tokens, 'lookup_all')) lookup.append(dbf.FieldLookup('nameaddress_vector', addr_tokens, 'lookup_all'))
penalty += 0.1 * max(0, 5 - len(name_partials) - len(addr_tokens)) penalty += 0.35 * max(0, 5 - len(name_partials) - len(addr_tokens))
if len(rare_names) == len(name_fulls):
# if there already was a search for all full tokens,
# avoid this if anything has been found
penalty += 0.25
yield penalty, exp_count, lookup yield penalty, exp_count, lookup

View File

@@ -224,14 +224,15 @@ def lookup_by_names(name_tokens: List[int], addr_tokens: List[int]) -> List[Fiel
return lookup return lookup
def lookup_by_any_name(name_tokens: List[int], addr_tokens: List[int]) -> List[FieldLookup]: def lookup_by_any_name(name_tokens: List[int], addr_tokens: List[int],
lookup_type: str) -> List[FieldLookup]:
""" Create a lookup list where name tokens are looked up via index """ Create a lookup list where name tokens are looked up via index
and only one of the name tokens must be present. and only one of the name tokens must be present.
Potential address tokens are used to restrict the search further. Potential address tokens are used to restrict the search further.
""" """
lookup = [FieldLookup('name_vector', name_tokens, 'lookup_any')] lookup = [FieldLookup('name_vector', name_tokens, 'lookup_any')]
if addr_tokens: if addr_tokens:
lookup.append(FieldLookup('nameaddress_vector', addr_tokens, 'restrict')) lookup.append(FieldLookup('nameaddress_vector', addr_tokens, lookup_type))
return lookup return lookup

View File

@@ -627,6 +627,11 @@ class PlaceSearch(AbstractSearch):
sql = sql.where(tsearch.c.centroid.intersects(VIEWBOX_PARAM)) sql = sql.where(tsearch.c.centroid.intersects(VIEWBOX_PARAM))
else: else:
sql = sql.where(tsearch.c.centroid.ST_Intersects_no_index(VIEWBOX_PARAM)) sql = sql.where(tsearch.c.centroid.ST_Intersects_no_index(VIEWBOX_PARAM))
elif self.expected_count >= 10000:
if details.viewbox.area < 0.5:
sql = sql.where(tsearch.c.centroid.intersects(VIEWBOX2_PARAM))
else:
sql = sql.where(tsearch.c.centroid.ST_Intersects_no_index(VIEWBOX2_PARAM))
else: else:
penalty += sa.case((t.c.geometry.intersects(VIEWBOX_PARAM), 0.0), penalty += sa.case((t.c.geometry.intersects(VIEWBOX_PARAM), 0.0),
(t.c.geometry.intersects(VIEWBOX2_PARAM), 1.0), (t.c.geometry.intersects(VIEWBOX2_PARAM), 1.0),
@@ -643,13 +648,18 @@ class PlaceSearch(AbstractSearch):
.label('importance')) .label('importance'))
sql = sql.order_by(sa.desc(sa.text('importance'))) sql = sql.order_by(sa.desc(sa.text('importance')))
else: else:
sql = sql.order_by(penalty - sa.case((tsearch.c.importance > 0, tsearch.c.importance), if self.expected_count < 10000\
else_=0.75001-(sa.cast(tsearch.c.search_rank, sa.Float())/40))) or (details.viewbox is not None and details.viewbox.area < 0.5):
sql = sql.order_by(
penalty - sa.case((tsearch.c.importance > 0, tsearch.c.importance),
else_=0.75001-(sa.cast(tsearch.c.search_rank, sa.Float())/40)))
sql = sql.add_columns(t.c.importance) sql = sql.add_columns(t.c.importance)
sql = sql.add_columns(penalty.label('accuracy'))\ sql = sql.add_columns(penalty.label('accuracy'))
.order_by(sa.text('accuracy'))
if self.expected_count < 10000:
sql = sql.order_by(sa.text('accuracy'))
if self.housenumbers: if self.housenumbers:
hnr_regexp = f"\\m({'|'.join(self.housenumbers.values)})\\M" hnr_regexp = f"\\m({'|'.join(self.housenumbers.values)})\\M"