Merge pull request #3321 from lonvia/remove-duplicate-partials

Improvements to query parsing
This commit is contained in:
Sarah Hoffmann
2024-01-28 20:32:58 +01:00
committed by GitHub
3 changed files with 77 additions and 31 deletions

View File

@@ -166,15 +166,15 @@ class SearchBuilder:
sdata.lookups = [dbf.FieldLookup('name_vector', [t.token for t in hnrs], lookups.LookupAny)] sdata.lookups = [dbf.FieldLookup('name_vector', [t.token for t in hnrs], lookups.LookupAny)]
expected_count = sum(t.count for t in hnrs) expected_count = sum(t.count for t in hnrs)
partials = [t for trange in address partials = {t.token: t.count for trange in address
for t in self.query.get_partials_list(trange)] for t in self.query.get_partials_list(trange)}
if expected_count < 8000: if expected_count < 8000:
sdata.lookups.append(dbf.FieldLookup('nameaddress_vector', sdata.lookups.append(dbf.FieldLookup('nameaddress_vector',
[t.token for t in partials], lookups.Restrict)) list(partials), lookups.Restrict))
elif len(partials) != 1 or partials[0].count < 10000: elif len(partials) != 1 or list(partials.values())[0] < 10000:
sdata.lookups.append(dbf.FieldLookup('nameaddress_vector', sdata.lookups.append(dbf.FieldLookup('nameaddress_vector',
[t.token for t in partials], lookups.LookupAll)) list(partials), lookups.LookupAll))
else: else:
sdata.lookups.append( sdata.lookups.append(
dbf.FieldLookup('nameaddress_vector', dbf.FieldLookup('nameaddress_vector',
@@ -208,18 +208,17 @@ class SearchBuilder:
are and tries to find a lookup that optimizes index use. are and tries to find a lookup that optimizes index use.
""" """
penalty = 0.0 # extra penalty penalty = 0.0 # extra penalty
name_partials = self.query.get_partials_list(name) name_partials = {t.token: t for t in self.query.get_partials_list(name)}
name_tokens = [t.token for t in name_partials]
addr_partials = [t for r in address for t in self.query.get_partials_list(r)] addr_partials = [t for r in address for t in self.query.get_partials_list(r)]
addr_tokens = [t.token for t in addr_partials] addr_tokens = list({t.token for t in addr_partials})
partials_indexed = all(t.is_indexed for t in name_partials) \ partials_indexed = all(t.is_indexed for t in name_partials.values()) \
and all(t.is_indexed for t in addr_partials) and all(t.is_indexed for t in addr_partials)
exp_count = min(t.count for t in name_partials) / (2**(len(name_partials) - 1)) exp_count = min(t.count for t in name_partials.values()) / (2**(len(name_partials) - 1))
if (len(name_partials) > 3 or exp_count < 8000) and partials_indexed: if (len(name_partials) > 3 or exp_count < 8000) and partials_indexed:
yield penalty, exp_count, dbf.lookup_by_names(name_tokens, addr_tokens) yield penalty, exp_count, dbf.lookup_by_names(list(name_partials.keys()), addr_tokens)
return return
# Partial term too frequent. Try looking up by rare full names first. # Partial term too frequent. Try looking up by rare full names first.
@@ -232,22 +231,25 @@ class SearchBuilder:
addr_tokens = [t.token for t in addr_partials if t.is_indexed] addr_tokens = [t.token for t in addr_partials if t.is_indexed]
penalty += 1.2 * sum(t.penalty for t in addr_partials if not t.is_indexed) penalty += 1.2 * sum(t.penalty for t in addr_partials if not t.is_indexed)
# Any of the full names applies with all of the partials from the address # Any of the full names applies with all of the partials from the address
yield penalty, fulls_count / (2**len(addr_partials)),\ yield penalty, fulls_count / (2**len(addr_tokens)),\
dbf.lookup_by_any_name([t.token for t in name_fulls], dbf.lookup_by_any_name([t.token for t in name_fulls],
addr_tokens, fulls_count > 10000) addr_tokens,
fulls_count > 30000 / max(1, len(addr_tokens)))
# To catch remaining results, lookup by name and address # To catch remaining results, lookup by name and address
# We only do this if there is a reasonable number of results expected. # We only do this if there is a reasonable number of results expected.
exp_count = exp_count / (2**len(addr_partials)) if addr_partials else exp_count exp_count = exp_count / (2**len(addr_tokens)) if addr_tokens else exp_count
if exp_count < 10000 and all(t.is_indexed for t in name_partials): if exp_count < 10000 and all(t.is_indexed for t in name_partials.values()):
lookup = [dbf.FieldLookup('name_vector', name_tokens, lookups.LookupAll)] lookup = [dbf.FieldLookup('name_vector', list(name_partials.keys()), lookups.LookupAll)]
if addr_tokens: if addr_tokens:
lookup.append(dbf.FieldLookup('nameaddress_vector', addr_tokens, lookups.LookupAll)) lookup.append(dbf.FieldLookup('nameaddress_vector', addr_tokens, lookups.LookupAll))
penalty += 0.35 * max(0, 5 - len(name_partials) - len(addr_tokens)) penalty += 0.35 * max(1 if name_fulls else 0.1,
5 - len(name_partials) - len(addr_tokens))
yield penalty, exp_count, lookup yield penalty, exp_count, lookup
def get_name_ranking(self, trange: TokenRange) -> dbf.FieldRanking: def get_name_ranking(self, trange: TokenRange,
db_field: str = 'name_vector') -> dbf.FieldRanking:
""" Create a ranking expression for a name term in the given range. """ Create a ranking expression for a name term in the given range.
""" """
name_fulls = self.query.get_tokens(trange, TokenType.WORD) name_fulls = self.query.get_tokens(trange, TokenType.WORD)
@@ -256,7 +258,7 @@ class SearchBuilder:
# Fallback, sum of penalty for partials # Fallback, sum of penalty for partials
name_partials = self.query.get_partials_list(trange) name_partials = self.query.get_partials_list(trange)
default = sum(t.penalty for t in name_partials) + 0.2 default = sum(t.penalty for t in name_partials) + 0.2
return dbf.FieldRanking('name_vector', default, ranks) return dbf.FieldRanking(db_field, default, ranks)
def get_addr_ranking(self, trange: TokenRange) -> dbf.FieldRanking: def get_addr_ranking(self, trange: TokenRange) -> dbf.FieldRanking:
@@ -314,11 +316,9 @@ class SearchBuilder:
sdata = dbf.SearchData() sdata = dbf.SearchData()
sdata.penalty = assignment.penalty sdata.penalty = assignment.penalty
if assignment.country: if assignment.country:
tokens = self.query.get_tokens(assignment.country, TokenType.COUNTRY) tokens = self.get_country_tokens(assignment.country)
if self.details.countries: if not tokens:
tokens = [t for t in tokens if t.lookup_word in self.details.countries] return None
if not tokens:
return None
sdata.set_strings('countries', tokens) sdata.set_strings('countries', tokens)
elif self.details.countries: elif self.details.countries:
sdata.countries = dbf.WeightedStrings(self.details.countries, sdata.countries = dbf.WeightedStrings(self.details.countries,
@@ -332,24 +332,54 @@ class SearchBuilder:
self.query.get_tokens(assignment.postcode, self.query.get_tokens(assignment.postcode,
TokenType.POSTCODE)) TokenType.POSTCODE))
if assignment.qualifier: if assignment.qualifier:
tokens = self.query.get_tokens(assignment.qualifier, TokenType.QUALIFIER) tokens = self.get_qualifier_tokens(assignment.qualifier)
if self.details.categories: if not tokens:
tokens = [t for t in tokens if t.get_category() in self.details.categories] return None
if not tokens:
return None
sdata.set_qualifiers(tokens) sdata.set_qualifiers(tokens)
elif self.details.categories: elif self.details.categories:
sdata.qualifiers = dbf.WeightedCategories(self.details.categories, sdata.qualifiers = dbf.WeightedCategories(self.details.categories,
[0.0] * len(self.details.categories)) [0.0] * len(self.details.categories))
if assignment.address: if assignment.address:
sdata.set_ranking([self.get_addr_ranking(r) for r in assignment.address]) if not assignment.name and assignment.housenumber:
# housenumber search: the first item needs to be handled like
# a name in ranking or penalties are not comparable with
# normal searches.
sdata.set_ranking([self.get_name_ranking(assignment.address[0],
db_field='nameaddress_vector')]
+ [self.get_addr_ranking(r) for r in assignment.address[1:]])
else:
sdata.set_ranking([self.get_addr_ranking(r) for r in assignment.address])
else: else:
sdata.rankings = [] sdata.rankings = []
return sdata return sdata
def get_country_tokens(self, trange: TokenRange) -> List[Token]:
    """ Collect the COUNTRY tokens the query holds for the given range.

        When the details parameters carry a country list, only tokens
        whose lookup word appears in that list are kept. The result
        may be empty.
    """
    candidates = self.query.get_tokens(trange, TokenType.COUNTRY)
    allowed = self.details.countries
    if not allowed:
        # No country restriction requested: hand back everything found.
        return candidates
    return [tok for tok in candidates if tok.lookup_word in allowed]
def get_qualifier_tokens(self, trange: TokenRange) -> List[Token]:
    """ Collect the QUALIFIER tokens the query holds for the given range.

        When the details parameters carry a category list, only tokens
        whose category appears in that list are kept. The result
        may be empty.
    """
    candidates = self.query.get_tokens(trange, TokenType.QUALIFIER)
    wanted = self.details.categories
    if not wanted:
        # No category restriction requested: hand back everything found.
        return candidates
    return [tok for tok in candidates if tok.get_category() in wanted]
def get_near_items(self, assignment: TokenAssignment) -> Optional[dbf.WeightedCategories]: def get_near_items(self, assignment: TokenAssignment) -> Optional[dbf.WeightedCategories]:
""" Collect tokens for near items search or use the categories """ Collect tokens for near items search or use the categories
requested per parameter. requested per parameter.

View File

@@ -132,6 +132,11 @@ class _TokenSequence:
# Name tokens are always acceptable and don't change direction # Name tokens are always acceptable and don't change direction
if ttype == qmod.TokenType.PARTIAL: if ttype == qmod.TokenType.PARTIAL:
# qualifiers cannot appear in the middle of the query. They need
# to be near the next phrase.
if self.direction == -1 \
and any(t.ttype == qmod.TokenType.QUALIFIER for t in self.seq[:-1]):
return None
return self.direction return self.direction
# Other tokens may only appear once # Other tokens may only appear once
@@ -385,7 +390,7 @@ class _TokenSequence:
yield from self._get_assignments_address_backward(base, query) yield from self._get_assignments_address_backward(base, query)
# variant for special housenumber searches # variant for special housenumber searches
if base.housenumber: if base.housenumber and not base.qualifier:
yield dataclasses.replace(base, penalty=self.penalty) yield dataclasses.replace(base, penalty=self.penalty)

View File

@@ -337,3 +337,14 @@ def test_qualifier_after_housenumber():
(BreakType.WORD, PhraseType.NONE, [(3, TokenType.PARTIAL)])) (BreakType.WORD, PhraseType.NONE, [(3, TokenType.PARTIAL)]))
check_assignments(yield_token_assignments(q)) check_assignments(yield_token_assignments(q))
def test_qualifier_in_middle_of_phrase():
    """ Run the assignment check for a query whose second phrase has a
        qualifier token between two partial tokens.

        NOTE(review): check_assignments() is called without any expected
        assignments, which presumably means none may be produced.
        Confirm against the helper's definition.
    """
    phrases = [
        (BreakType.START, PhraseType.NONE, [(1, TokenType.PARTIAL)]),
        # second phrase: partial, qualifier, partial
        (BreakType.PHRASE, PhraseType.NONE, [(2, TokenType.PARTIAL)]),
        (BreakType.WORD, PhraseType.NONE, [(3, TokenType.QUALIFIER)]),
        (BreakType.WORD, PhraseType.NONE, [(4, TokenType.PARTIAL)]),
        # third phrase
        (BreakType.PHRASE, PhraseType.NONE, [(5, TokenType.PARTIAL)]),
    ]
    query = make_query(*phrases)

    check_assignments(yield_token_assignments(query))