Merge pull request #3117 from lonvia/fix-assorted-search-errors

More improvements to the Python search algorithm
Authored by Sarah Hoffmann on 2023-07-22 11:45:36 +02:00; committed via GitHub.
5 changed files with 242 additions and 87 deletions

nominatim/api/search/db_search_builder.py

@@ -15,7 +15,6 @@ from nominatim.api.search.query import QueryStruct, Token, TokenType, TokenRange
 from nominatim.api.search.token_assignment import TokenAssignment
 import nominatim.api.search.db_search_fields as dbf
 import nominatim.api.search.db_searches as dbs
-from nominatim.api.logging import log
 
 
 def wrap_near_search(categories: List[Tuple[str, str]],
@@ -156,13 +155,22 @@ class SearchBuilder:
         """ Build a simple address search for special entries where the
             housenumber is the main name token.
         """
-        partial_tokens: List[int] = []
-        for trange in address:
-            partial_tokens.extend(t.token for t in self.query.get_partials_list(trange))
-
-        sdata.lookups = [dbf.FieldLookup('name_vector', [t.token for t in hnrs], 'lookup_any'),
-                         dbf.FieldLookup('nameaddress_vector', partial_tokens, 'lookup_all')
-                        ]
+        sdata.lookups = [dbf.FieldLookup('name_vector', [t.token for t in hnrs], 'lookup_any')]
+
+        partials = [t for trange in address
+                      for t in self.query.get_partials_list(trange)]
+
+        if len(partials) != 1 or partials[0].count < 10000:
+            sdata.lookups.append(dbf.FieldLookup('nameaddress_vector',
+                                                 [t.token for t in partials], 'lookup_all'))
+        else:
+            sdata.lookups.append(
+                dbf.FieldLookup('nameaddress_vector',
+                                [t.token for t
+                                 in self.query.get_tokens(address[0], TokenType.WORD)],
+                                'lookup_any'))
+
         sdata.housenumbers = dbf.WeightedStrings([], [])
 
         yield dbs.PlaceSearch(0.05, sdata, sum(t.count for t in hnrs))
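
The new branch guards against a degenerate address side: a single partial token with a huge posting list would force a near-full scan of nameaddress_vector, so the lookup falls back to the rarer full-word tokens instead. A standalone sketch of just this decision rule (Tok and choose_address_lookup are illustrative stand-ins, not part of the Nominatim API):

from dataclasses import dataclass
from typing import List, Tuple

@dataclass
class Tok:                  # simplified stand-in for a query token
    token: int              # token id in the search index
    count: int              # estimated number of index entries

def choose_address_lookup(partials: List[Tok],
                          full_words: List[Tok]) -> Tuple[str, List[int]]:
    """Pick address-side tokens for a housenumber search.

    A single partial with >= 10000 index entries is too unselective,
    so fall back to full-word tokens and require only one to match.
    """
    if len(partials) != 1 or partials[0].count < 10000:
        return 'lookup_all', [t.token for t in partials]
    return 'lookup_any', [t.token for t in full_words]

# One very common partial triggers the fallback to full words:
print(choose_address_lookup([Tok(1, 250000)], [Tok(7, 300), Tok(9, 80)]))
# -> ('lookup_any', [7, 9])
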
@@ -187,69 +195,63 @@ class SearchBuilder:
             be searched for. This takes into account how frequent the terms
             are and tries to find a lookup that optimizes index use.
         """
-        penalty = 0.0 # extra penalty currently unused
+        penalty = 0.0 # extra penalty
         name_partials = self.query.get_partials_list(name)
-        exp_name_count = min(t.count for t in name_partials)
-        addr_partials = []
-        for trange in address:
-            addr_partials.extend(self.query.get_partials_list(trange))
+        name_tokens = [t.token for t in name_partials]
+
+        addr_partials = [t for r in address for t in self.query.get_partials_list(r)]
         addr_tokens = [t.token for t in addr_partials]
+
         partials_indexed = all(t.is_indexed for t in name_partials) \
                            and all(t.is_indexed for t in addr_partials)
+        exp_count = min(t.count for t in name_partials)
 
-        if (len(name_partials) > 3 or exp_name_count < 1000) and partials_indexed:
-            # Lookup by name partials, use address partials to restrict results.
-            lookup = [dbf.FieldLookup('name_vector',
-                                      [t.token for t in name_partials], 'lookup_all')]
-            if addr_tokens:
-                lookup.append(dbf.FieldLookup('nameaddress_vector', addr_tokens, 'restrict'))
-            yield penalty, exp_name_count, lookup
+        if (len(name_partials) > 3 or exp_count < 1000) and partials_indexed:
+            yield penalty, exp_count, dbf.lookup_by_names(name_tokens, addr_tokens)
             return
 
-        exp_addr_count = min(t.count for t in addr_partials) if addr_partials else exp_name_count
-        if exp_addr_count < 1000 and partials_indexed:
+        exp_count = min(exp_count, min(t.count for t in addr_partials)) \
+                    if addr_partials else exp_count
+        if exp_count < 1000 and partials_indexed:
             # Lookup by address partials and restrict results through name terms.
             # Give this a small penalty because lookups in the address index are
             # more expensive
-            yield penalty + exp_addr_count/5000, exp_addr_count,\
-                  [dbf.FieldLookup('name_vector', [t.token for t in name_partials], 'restrict'),
-                   dbf.FieldLookup('nameaddress_vector', addr_tokens, 'lookup_all')]
+            yield penalty + exp_count/5000, exp_count,\
+                  dbf.lookup_by_addr(name_tokens, addr_tokens)
             return
 
         # Partial term too frequent. Try looking up by rare full names first.
         name_fulls = self.query.get_tokens(name, TokenType.WORD)
-        rare_names = list(filter(lambda t: t.count < 1000, name_fulls))
+        rare_names = list(filter(lambda t: t.count < 10000, name_fulls))
         # At this point drop unindexed partials from the address.
         # This might yield wrong results, nothing we can do about that.
         if not partials_indexed:
             addr_tokens = [t.token for t in addr_partials if t.is_indexed]
-            log().var_dump('before', penalty)
             penalty += 1.2 * sum(t.penalty for t in addr_partials if not t.is_indexed)
-            log().var_dump('after', penalty)
 
         if rare_names:
             # Any of the full names applies with all of the partials from the address
-            lookup = [dbf.FieldLookup('name_vector', [t.token for t in rare_names], 'lookup_any')]
-            if addr_tokens:
-                lookup.append(dbf.FieldLookup('nameaddress_vector', addr_tokens, 'restrict'))
-            yield penalty, sum(t.count for t in rare_names), lookup
+            yield penalty, sum(t.count for t in rare_names),\
+                  dbf.lookup_by_any_name([t.token for t in rare_names], addr_tokens)
 
         # To catch remaining results, lookup by name and address
         # We only do this if there is a reasonable number of results expected.
-        if min(exp_name_count, exp_addr_count) < 10000:
+        if exp_count < 10000:
             if all(t.is_indexed for t in name_partials):
-                lookup = [dbf.FieldLookup('name_vector',
-                                          [t.token for t in name_partials], 'lookup_all')]
+                lookup = [dbf.FieldLookup('name_vector', name_tokens, 'lookup_all')]
             else:
                 # we don't have the partials, try with the non-rare names
-                non_rare_names = [t.token for t in name_fulls if t.count >= 1000]
+                non_rare_names = [t.token for t in name_fulls if t.count >= 10000]
                 if not non_rare_names:
                     return
                 lookup = [dbf.FieldLookup('name_vector', non_rare_names, 'lookup_any')]
             if addr_tokens:
                 lookup.append(dbf.FieldLookup('nameaddress_vector', addr_tokens, 'lookup_all'))
-            yield penalty + 0.1 * max(0, 5 - len(name_partials) - len(addr_tokens)),\
-                  min(exp_name_count, exp_addr_count), lookup
+            penalty += 0.1 * max(0, 5 - len(name_partials) - len(addr_tokens))
+            if len(rare_names) == len(name_fulls):
+                # if there already was a search for all full tokens,
+                # avoid this if anything has been found
+                penalty += 0.25
+            yield penalty, exp_count, lookup
 
     def get_name_ranking(self, trange: TokenRange) -> dbf.FieldRanking:
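
Taken together, yield_lookups now tracks a single running estimate exp_count and walks a three-step ladder: use the name index when the name partials are selective, use the address index (with a small penalty) when the address partials are, and otherwise fall back to rare full names plus a guarded catch-all that is penalised further when the rare-name search already covered every full token. A condensed sketch of that ladder (pick_strategy is illustrative, not the actual SearchBuilder code):

def pick_strategy(num_name_partials: int, name_count: int,
                  addr_count: int, partials_indexed: bool) -> str:
    """Condensed decision ladder from yield_lookups."""
    if (num_name_partials > 3 or name_count < 1000) and partials_indexed:
        return 'name index, address restricts'
    if min(name_count, addr_count) < 1000 and partials_indexed:
        return 'address index, name restricts (penalised)'
    return 'rare full names, then guarded name+address catch-all'

assert pick_strategy(2, 500, 90000, True) == 'name index, address restricts'
assert pick_strategy(2, 50000, 400, True).startswith('address index')
assert pick_strategy(2, 50000, 40000, True).startswith('rare full names')
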

nominatim/api/search/db_search_fields.py

@@ -211,3 +211,34 @@ class SearchData:
             self.rankings.append(ranking)
         else:
             self.penalty += ranking.default
+
+
+def lookup_by_names(name_tokens: List[int], addr_tokens: List[int]) -> List[FieldLookup]:
+    """ Create a lookup list where name tokens are looked up via index
+        and potential address tokens are used to restrict the search further.
+    """
+    lookup = [FieldLookup('name_vector', name_tokens, 'lookup_all')]
+    if addr_tokens:
+        lookup.append(FieldLookup('nameaddress_vector', addr_tokens, 'restrict'))
+
+    return lookup
+
+
+def lookup_by_any_name(name_tokens: List[int], addr_tokens: List[int]) -> List[FieldLookup]:
+    """ Create a lookup list where name tokens are looked up via index
+        and only one of the name tokens must be present.
+        Potential address tokens are used to restrict the search further.
+    """
+    lookup = [FieldLookup('name_vector', name_tokens, 'lookup_any')]
+    if addr_tokens:
+        lookup.append(FieldLookup('nameaddress_vector', addr_tokens, 'restrict'))
+
+    return lookup
+
+
+def lookup_by_addr(name_tokens: List[int], addr_tokens: List[int]) -> List[FieldLookup]:
+    """ Create a lookup list where address tokens are looked up via index
+        and the name tokens are only used to restrict the search further.
+    """
+    return [FieldLookup('name_vector', name_tokens, 'restrict'),
+            FieldLookup('nameaddress_vector', addr_tokens, 'lookup_all')]
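
These three helpers factor the recurring lookup combinations out of SearchBuilder. A quick usage sketch (the token ids are made up; the expected value assumes FieldLookup as defined in this module):

from nominatim.api.search.db_search_fields import lookup_by_names

lookups = lookup_by_names([101, 102], [201])
# -> [FieldLookup('name_vector', [101, 102], 'lookup_all'),
#     FieldLookup('nameaddress_vector', [201], 'restrict')]
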

nominatim/api/search/query.py

@@ -7,7 +7,7 @@
 """
 Datastructures for a tokenized query.
 """
-from typing import List, Tuple, Optional, NamedTuple, Iterator
+from typing import List, Tuple, Optional, Iterator
 from abc import ABC, abstractmethod
 import dataclasses
 import enum
@@ -107,13 +107,29 @@ class Token(ABC):
                 category objects.
         """
 
-class TokenRange(NamedTuple):
+
+@dataclasses.dataclass
+class TokenRange:
     """ Indexes of query nodes over which a token spans.
     """
     start: int
    end: int
 
+    def __lt__(self, other: 'TokenRange') -> bool:
+        return self.end <= other.start
+
+    def __le__(self, other: 'TokenRange') -> bool:
+        return NotImplemented
+
+    def __gt__(self, other: 'TokenRange') -> bool:
+        return self.start >= other.end
+
+    def __ge__(self, other: 'TokenRange') -> bool:
+        return NotImplemented
+
     def replace_start(self, new_start: int) -> 'TokenRange':
         """ Return a new token range with the new start.
         """

nominatim/api/search/token_assignment.py

@@ -257,6 +257,97 @@ class _TokenSequence:
 
         return True
 
+    def _get_assignments_postcode(self, base: TokenAssignment,
+                                  query_len: int) -> Iterator[TokenAssignment]:
+        """ Yield possible assignments of Postcode searches with an
+            address component.
+        """
+        assert base.postcode is not None
+
+        if (base.postcode.start == 0 and self.direction != -1)\
+           or (base.postcode.end == query_len and self.direction != 1):
+            log().comment('postcode search')
+            # <address>,<postcode> should give preference to address search
+            if base.postcode.start == 0:
+                penalty = self.penalty
+                self.direction = -1 # name searches are only possible backwards
+            else:
+                penalty = self.penalty + 0.1
+                self.direction = 1 # name searches are only possible forwards
+            yield dataclasses.replace(base, penalty=penalty)
+
+
+    def _get_assignments_address_forward(self, base: TokenAssignment,
+                                         query: qmod.QueryStruct) -> Iterator[TokenAssignment]:
+        """ Yield possible assignments of address searches with
+            left-to-right reading.
+        """
+        first = base.address[0]
+
+        log().comment('first word = name')
+        yield dataclasses.replace(base, penalty=self.penalty,
+                                  name=first, address=base.address[1:])
+
+        # To paraphrase, do not split the first word when:
+        #  * another name term comes after the first one and before the
+        #    housenumber
+        #  * a qualifier comes after the name
+        #  * the containing phrase is strictly typed
+        if (base.housenumber and first.end < base.housenumber.start)\
+           or (base.qualifier and base.qualifier > first)\
+           or (query.nodes[first.start].ptype != qmod.PhraseType.NONE):
+            return
+
+        penalty = self.penalty
+
+        # Penalty for:
+        #  * <name>, <street>, <housenumber>, ...
+        #  * queries that are comma-separated
+        if (base.housenumber and base.housenumber > first) or len(query.source) > 1:
+            penalty += 0.25
+
+        for i in range(first.start + 1, first.end):
+            name, addr = first.split(i)
+            log().comment(f'split first word = name ({i - first.start})')
+            yield dataclasses.replace(base, name=name, address=[addr] + base.address[1:],
+                                      penalty=penalty + PENALTY_TOKENCHANGE[query.nodes[i].btype])
+
+
+    def _get_assignments_address_backward(self, base: TokenAssignment,
+                                          query: qmod.QueryStruct) -> Iterator[TokenAssignment]:
+        """ Yield possible assignments of address searches with
+            right-to-left reading.
+        """
+        last = base.address[-1]
+
+        if self.direction == -1 or len(base.address) > 1:
+            log().comment('last word = name')
+            yield dataclasses.replace(base, penalty=self.penalty,
+                                      name=last, address=base.address[:-1])
+
+        # To paraphrase, do not split the last word when:
+        #  * another name term comes before the last one and after the
+        #    housenumber
+        #  * a qualifier comes before the name
+        #  * the containing phrase is strictly typed
+        if (base.housenumber and last.start > base.housenumber.end)\
+           or (base.qualifier and base.qualifier < last)\
+           or (query.nodes[last.start].ptype != qmod.PhraseType.NONE):
+            return
+
+        penalty = self.penalty
+        if base.housenumber and base.housenumber < last:
+            penalty += 0.4
+        if len(query.source) > 1:
+            penalty += 0.25
+
+        for i in range(last.start + 1, last.end):
+            addr, name = last.split(i)
+            log().comment(f'split last word = name ({i - last.start})')
+            yield dataclasses.replace(base, name=name, address=base.address[:-1] + [addr],
+                                      penalty=penalty + PENALTY_TOKENCHANGE[query.nodes[i].btype])
+
+
     def get_assignments(self, query: qmod.QueryStruct) -> Iterator[TokenAssignment]:
         """ Yield possible assignments for the current sequence.
@@ -265,17 +356,13 @@ class _TokenSequence:
         """
         base = TokenAssignment.from_ranges(self.seq)
 
+        num_addr_tokens = sum(t.end - t.start for t in base.address)
+        if num_addr_tokens > 50:
+            return
+
         # Postcode search (postcode-only search is covered in next case)
         if base.postcode is not None and base.address:
-            if (base.postcode.start == 0 and self.direction != -1)\
-               or (base.postcode.end == query.num_token_slots() and self.direction != 1):
-                log().comment('postcode search')
-                # <address>,<postcode> should give preference to address search
-                if base.postcode.start == 0:
-                    penalty = self.penalty
-                else:
-                    penalty = self.penalty + 0.1
-                yield dataclasses.replace(base, penalty=penalty)
+            yield from self._get_assignments_postcode(base, query.num_token_slots())
 
         # Postcode or country-only search
         if not base.address:
@@ -286,49 +373,19 @@ class _TokenSequence:
             # <postcode>,<address> should give preference to postcode search
             if base.postcode and base.postcode.start == 0:
                 self.penalty += 0.1
 
-            # Use entire first word as name
+            # Left-to-right reading of the address
             if self.direction != -1:
-                log().comment('first word = name')
-                yield dataclasses.replace(base, name=base.address[0],
-                                          penalty=self.penalty,
-                                          address=base.address[1:])
+                yield from self._get_assignments_address_forward(base, query)
 
-            # Use entire last word as name
-            if self.direction == -1 or (self.direction == 0 and len(base.address) > 1):
-                log().comment('last word = name')
-                yield dataclasses.replace(base, name=base.address[-1],
-                                          penalty=self.penalty,
-                                          address=base.address[:-1])
+            # Right-to-left reading of the address
+            if self.direction != 1:
+                yield from self._get_assignments_address_backward(base, query)
 
             # variant for special housenumber searches
             if base.housenumber:
                 yield dataclasses.replace(base, penalty=self.penalty)
-
-            # Use beginning of first word as name
-            if self.direction != -1:
-                first = base.address[0]
-                if (not base.housenumber or first.end >= base.housenumber.start)\
-                   and (not base.qualifier or first.start >= base.qualifier.end):
-                    for i in range(first.start + 1, first.end):
-                        name, addr = first.split(i)
-                        penalty = self.penalty + PENALTY_TOKENCHANGE[query.nodes[i].btype]
-                        log().comment(f'split first word = name ({i - first.start})')
-                        yield dataclasses.replace(base, name=name, penalty=penalty,
-                                                  address=[addr] + base.address[1:])
-
-            # Use end of last word as name
-            if self.direction != 1:
-                last = base.address[-1]
-                if (not base.housenumber or last.start <= base.housenumber.end)\
-                   and (not base.qualifier or last.end <= base.qualifier.start):
-                    for i in range(last.start + 1, last.end):
-                        addr, name = last.split(i)
-                        penalty = self.penalty + PENALTY_TOKENCHANGE[query.nodes[i].btype]
-                        log().comment(f'split last word = name ({i - last.start})')
-                        yield dataclasses.replace(base, name=name, penalty=penalty,
-                                                  address=base.address[:-1] + [addr])
 
 
 def yield_token_assignments(query: qmod.QueryStruct) -> Iterator[TokenAssignment]:
     """ Return possible word type assignments to word positions.

test/python/api/search/test_query.py (new file)

@@ -0,0 +1,49 @@
+# SPDX-License-Identifier: GPL-3.0-or-later
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2023 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Test data types for search queries.
+"""
+import pytest
+
+import nominatim.api.search.query as nq
+
+
+def test_token_range_equal():
+    assert nq.TokenRange(2, 3) == nq.TokenRange(2, 3)
+    assert not (nq.TokenRange(2, 3) != nq.TokenRange(2, 3))
+
+
+@pytest.mark.parametrize('lop,rop', [((1, 2), (3, 4)),
+                                     ((3, 4), (3, 5)),
+                                     ((10, 12), (11, 12))])
+def test_token_range_unequal(lop, rop):
+    assert not (nq.TokenRange(*lop) == nq.TokenRange(*rop))
+    assert nq.TokenRange(*lop) != nq.TokenRange(*rop)
+
+
+def test_token_range_lt():
+    assert nq.TokenRange(1, 3) < nq.TokenRange(10, 12)
+    assert nq.TokenRange(5, 6) < nq.TokenRange(7, 8)
+    assert nq.TokenRange(1, 4) < nq.TokenRange(4, 5)
+    assert not (nq.TokenRange(5, 6) < nq.TokenRange(5, 6))
+    assert not (nq.TokenRange(10, 11) < nq.TokenRange(4, 5))
+
+
+def test_token_range_gt():
+    assert nq.TokenRange(3, 4) > nq.TokenRange(1, 2)
+    assert nq.TokenRange(100, 200) > nq.TokenRange(10, 11)
+    assert nq.TokenRange(10, 11) > nq.TokenRange(4, 10)
+    assert not (nq.TokenRange(5, 6) > nq.TokenRange(5, 6))
+    assert not (nq.TokenRange(1, 2) > nq.TokenRange(3, 4))
+    assert not (nq.TokenRange(4, 10) > nq.TokenRange(3, 5))
+
+
+def test_token_range_unimplemented_ops():
+    with pytest.raises(TypeError):
+        nq.TokenRange(1, 3) <= nq.TokenRange(10, 12)
+    with pytest.raises(TypeError):
+        nq.TokenRange(1, 3) >= nq.TokenRange(10, 12)