forked from hans/Nominatim
Merge pull request #3321 from lonvia/remove-duplicate-partials
Improvements to query parsing
This commit is contained in:
@@ -166,15 +166,15 @@ class SearchBuilder:
|
|||||||
sdata.lookups = [dbf.FieldLookup('name_vector', [t.token for t in hnrs], lookups.LookupAny)]
|
sdata.lookups = [dbf.FieldLookup('name_vector', [t.token for t in hnrs], lookups.LookupAny)]
|
||||||
expected_count = sum(t.count for t in hnrs)
|
expected_count = sum(t.count for t in hnrs)
|
||||||
|
|
||||||
partials = [t for trange in address
|
partials = {t.token: t.count for trange in address
|
||||||
for t in self.query.get_partials_list(trange)]
|
for t in self.query.get_partials_list(trange)}
|
||||||
|
|
||||||
if expected_count < 8000:
|
if expected_count < 8000:
|
||||||
sdata.lookups.append(dbf.FieldLookup('nameaddress_vector',
|
sdata.lookups.append(dbf.FieldLookup('nameaddress_vector',
|
||||||
[t.token for t in partials], lookups.Restrict))
|
list(partials), lookups.Restrict))
|
||||||
elif len(partials) != 1 or partials[0].count < 10000:
|
elif len(partials) != 1 or list(partials.values())[0] < 10000:
|
||||||
sdata.lookups.append(dbf.FieldLookup('nameaddress_vector',
|
sdata.lookups.append(dbf.FieldLookup('nameaddress_vector',
|
||||||
[t.token for t in partials], lookups.LookupAll))
|
list(partials), lookups.LookupAll))
|
||||||
else:
|
else:
|
||||||
sdata.lookups.append(
|
sdata.lookups.append(
|
||||||
dbf.FieldLookup('nameaddress_vector',
|
dbf.FieldLookup('nameaddress_vector',
|
||||||
@@ -208,18 +208,17 @@ class SearchBuilder:
|
|||||||
are and tries to find a lookup that optimizes index use.
|
are and tries to find a lookup that optimizes index use.
|
||||||
"""
|
"""
|
||||||
penalty = 0.0 # extra penalty
|
penalty = 0.0 # extra penalty
|
||||||
name_partials = self.query.get_partials_list(name)
|
name_partials = {t.token: t for t in self.query.get_partials_list(name)}
|
||||||
name_tokens = [t.token for t in name_partials]
|
|
||||||
|
|
||||||
addr_partials = [t for r in address for t in self.query.get_partials_list(r)]
|
addr_partials = [t for r in address for t in self.query.get_partials_list(r)]
|
||||||
addr_tokens = [t.token for t in addr_partials]
|
addr_tokens = list({t.token for t in addr_partials})
|
||||||
|
|
||||||
partials_indexed = all(t.is_indexed for t in name_partials) \
|
partials_indexed = all(t.is_indexed for t in name_partials.values()) \
|
||||||
and all(t.is_indexed for t in addr_partials)
|
and all(t.is_indexed for t in addr_partials)
|
||||||
exp_count = min(t.count for t in name_partials) / (2**(len(name_partials) - 1))
|
exp_count = min(t.count for t in name_partials.values()) / (2**(len(name_partials) - 1))
|
||||||
|
|
||||||
if (len(name_partials) > 3 or exp_count < 8000) and partials_indexed:
|
if (len(name_partials) > 3 or exp_count < 8000) and partials_indexed:
|
||||||
yield penalty, exp_count, dbf.lookup_by_names(name_tokens, addr_tokens)
|
yield penalty, exp_count, dbf.lookup_by_names(list(name_partials.keys()), addr_tokens)
|
||||||
return
|
return
|
||||||
|
|
||||||
# Partial term too frequent. Try looking up by rare full names first.
|
# Partial term too frequent. Try looking up by rare full names first.
|
||||||
@@ -232,22 +231,25 @@ class SearchBuilder:
|
|||||||
addr_tokens = [t.token for t in addr_partials if t.is_indexed]
|
addr_tokens = [t.token for t in addr_partials if t.is_indexed]
|
||||||
penalty += 1.2 * sum(t.penalty for t in addr_partials if not t.is_indexed)
|
penalty += 1.2 * sum(t.penalty for t in addr_partials if not t.is_indexed)
|
||||||
# Any of the full names applies with all of the partials from the address
|
# Any of the full names applies with all of the partials from the address
|
||||||
yield penalty, fulls_count / (2**len(addr_partials)),\
|
yield penalty, fulls_count / (2**len(addr_tokens)),\
|
||||||
dbf.lookup_by_any_name([t.token for t in name_fulls],
|
dbf.lookup_by_any_name([t.token for t in name_fulls],
|
||||||
addr_tokens, fulls_count > 10000)
|
addr_tokens,
|
||||||
|
fulls_count > 30000 / max(1, len(addr_tokens)))
|
||||||
|
|
||||||
# To catch remaining results, lookup by name and address
|
# To catch remaining results, lookup by name and address
|
||||||
# We only do this if there is a reasonable number of results expected.
|
# We only do this if there is a reasonable number of results expected.
|
||||||
exp_count = exp_count / (2**len(addr_partials)) if addr_partials else exp_count
|
exp_count = exp_count / (2**len(addr_tokens)) if addr_tokens else exp_count
|
||||||
if exp_count < 10000 and all(t.is_indexed for t in name_partials):
|
if exp_count < 10000 and all(t.is_indexed for t in name_partials.values()):
|
||||||
lookup = [dbf.FieldLookup('name_vector', name_tokens, lookups.LookupAll)]
|
lookup = [dbf.FieldLookup('name_vector', list(name_partials.keys()), lookups.LookupAll)]
|
||||||
if addr_tokens:
|
if addr_tokens:
|
||||||
lookup.append(dbf.FieldLookup('nameaddress_vector', addr_tokens, lookups.LookupAll))
|
lookup.append(dbf.FieldLookup('nameaddress_vector', addr_tokens, lookups.LookupAll))
|
||||||
penalty += 0.35 * max(0, 5 - len(name_partials) - len(addr_tokens))
|
penalty += 0.35 * max(1 if name_fulls else 0.1,
|
||||||
|
5 - len(name_partials) - len(addr_tokens))
|
||||||
yield penalty, exp_count, lookup
|
yield penalty, exp_count, lookup
|
||||||
|
|
||||||
|
|
||||||
def get_name_ranking(self, trange: TokenRange) -> dbf.FieldRanking:
|
def get_name_ranking(self, trange: TokenRange,
|
||||||
|
db_field: str = 'name_vector') -> dbf.FieldRanking:
|
||||||
""" Create a ranking expression for a name term in the given range.
|
""" Create a ranking expression for a name term in the given range.
|
||||||
"""
|
"""
|
||||||
name_fulls = self.query.get_tokens(trange, TokenType.WORD)
|
name_fulls = self.query.get_tokens(trange, TokenType.WORD)
|
||||||
@@ -256,7 +258,7 @@ class SearchBuilder:
|
|||||||
# Fallback, sum of penalty for partials
|
# Fallback, sum of penalty for partials
|
||||||
name_partials = self.query.get_partials_list(trange)
|
name_partials = self.query.get_partials_list(trange)
|
||||||
default = sum(t.penalty for t in name_partials) + 0.2
|
default = sum(t.penalty for t in name_partials) + 0.2
|
||||||
return dbf.FieldRanking('name_vector', default, ranks)
|
return dbf.FieldRanking(db_field, default, ranks)
|
||||||
|
|
||||||
|
|
||||||
def get_addr_ranking(self, trange: TokenRange) -> dbf.FieldRanking:
|
def get_addr_ranking(self, trange: TokenRange) -> dbf.FieldRanking:
|
||||||
@@ -314,11 +316,9 @@ class SearchBuilder:
|
|||||||
sdata = dbf.SearchData()
|
sdata = dbf.SearchData()
|
||||||
sdata.penalty = assignment.penalty
|
sdata.penalty = assignment.penalty
|
||||||
if assignment.country:
|
if assignment.country:
|
||||||
tokens = self.query.get_tokens(assignment.country, TokenType.COUNTRY)
|
tokens = self.get_country_tokens(assignment.country)
|
||||||
if self.details.countries:
|
if not tokens:
|
||||||
tokens = [t for t in tokens if t.lookup_word in self.details.countries]
|
return None
|
||||||
if not tokens:
|
|
||||||
return None
|
|
||||||
sdata.set_strings('countries', tokens)
|
sdata.set_strings('countries', tokens)
|
||||||
elif self.details.countries:
|
elif self.details.countries:
|
||||||
sdata.countries = dbf.WeightedStrings(self.details.countries,
|
sdata.countries = dbf.WeightedStrings(self.details.countries,
|
||||||
@@ -332,24 +332,54 @@ class SearchBuilder:
|
|||||||
self.query.get_tokens(assignment.postcode,
|
self.query.get_tokens(assignment.postcode,
|
||||||
TokenType.POSTCODE))
|
TokenType.POSTCODE))
|
||||||
if assignment.qualifier:
|
if assignment.qualifier:
|
||||||
tokens = self.query.get_tokens(assignment.qualifier, TokenType.QUALIFIER)
|
tokens = self.get_qualifier_tokens(assignment.qualifier)
|
||||||
if self.details.categories:
|
if not tokens:
|
||||||
tokens = [t for t in tokens if t.get_category() in self.details.categories]
|
return None
|
||||||
if not tokens:
|
|
||||||
return None
|
|
||||||
sdata.set_qualifiers(tokens)
|
sdata.set_qualifiers(tokens)
|
||||||
elif self.details.categories:
|
elif self.details.categories:
|
||||||
sdata.qualifiers = dbf.WeightedCategories(self.details.categories,
|
sdata.qualifiers = dbf.WeightedCategories(self.details.categories,
|
||||||
[0.0] * len(self.details.categories))
|
[0.0] * len(self.details.categories))
|
||||||
|
|
||||||
if assignment.address:
|
if assignment.address:
|
||||||
sdata.set_ranking([self.get_addr_ranking(r) for r in assignment.address])
|
if not assignment.name and assignment.housenumber:
|
||||||
|
# housenumber search: the first item needs to be handled like
|
||||||
|
# a name in ranking or penalties are not comparable with
|
||||||
|
# normal searches.
|
||||||
|
sdata.set_ranking([self.get_name_ranking(assignment.address[0],
|
||||||
|
db_field='nameaddress_vector')]
|
||||||
|
+ [self.get_addr_ranking(r) for r in assignment.address[1:]])
|
||||||
|
else:
|
||||||
|
sdata.set_ranking([self.get_addr_ranking(r) for r in assignment.address])
|
||||||
else:
|
else:
|
||||||
sdata.rankings = []
|
sdata.rankings = []
|
||||||
|
|
||||||
return sdata
|
return sdata
|
||||||
|
|
||||||
|
|
||||||
|
def get_country_tokens(self, trange: TokenRange) -> List[Token]:
|
||||||
|
""" Return the list of country tokens for the given range,
|
||||||
|
optionally filtered by the country list from the details
|
||||||
|
parameters.
|
||||||
|
"""
|
||||||
|
tokens = self.query.get_tokens(trange, TokenType.COUNTRY)
|
||||||
|
if self.details.countries:
|
||||||
|
tokens = [t for t in tokens if t.lookup_word in self.details.countries]
|
||||||
|
|
||||||
|
return tokens
|
||||||
|
|
||||||
|
|
||||||
|
def get_qualifier_tokens(self, trange: TokenRange) -> List[Token]:
|
||||||
|
""" Return the list of qualifier tokens for the given range,
|
||||||
|
optionally filtered by the qualifier list from the details
|
||||||
|
parameters.
|
||||||
|
"""
|
||||||
|
tokens = self.query.get_tokens(trange, TokenType.QUALIFIER)
|
||||||
|
if self.details.categories:
|
||||||
|
tokens = [t for t in tokens if t.get_category() in self.details.categories]
|
||||||
|
|
||||||
|
return tokens
|
||||||
|
|
||||||
|
|
||||||
def get_near_items(self, assignment: TokenAssignment) -> Optional[dbf.WeightedCategories]:
|
def get_near_items(self, assignment: TokenAssignment) -> Optional[dbf.WeightedCategories]:
|
||||||
""" Collect tokens for near items search or use the categories
|
""" Collect tokens for near items search or use the categories
|
||||||
requested per parameter.
|
requested per parameter.
|
||||||
|
|||||||
@@ -132,6 +132,11 @@ class _TokenSequence:
|
|||||||
|
|
||||||
# Name tokens are always acceptable and don't change direction
|
# Name tokens are always acceptable and don't change direction
|
||||||
if ttype == qmod.TokenType.PARTIAL:
|
if ttype == qmod.TokenType.PARTIAL:
|
||||||
|
# qualifiers cannot appear in the middle of the query. They need
|
||||||
|
# to be near the next phrase.
|
||||||
|
if self.direction == -1 \
|
||||||
|
and any(t.ttype == qmod.TokenType.QUALIFIER for t in self.seq[:-1]):
|
||||||
|
return None
|
||||||
return self.direction
|
return self.direction
|
||||||
|
|
||||||
# Other tokens may only appear once
|
# Other tokens may only appear once
|
||||||
@@ -385,7 +390,7 @@ class _TokenSequence:
|
|||||||
yield from self._get_assignments_address_backward(base, query)
|
yield from self._get_assignments_address_backward(base, query)
|
||||||
|
|
||||||
# variant for special housenumber searches
|
# variant for special housenumber searches
|
||||||
if base.housenumber:
|
if base.housenumber and not base.qualifier:
|
||||||
yield dataclasses.replace(base, penalty=self.penalty)
|
yield dataclasses.replace(base, penalty=self.penalty)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -337,3 +337,14 @@ def test_qualifier_after_housenumber():
|
|||||||
(BreakType.WORD, PhraseType.NONE, [(3, TokenType.PARTIAL)]))
|
(BreakType.WORD, PhraseType.NONE, [(3, TokenType.PARTIAL)]))
|
||||||
|
|
||||||
check_assignments(yield_token_assignments(q))
|
check_assignments(yield_token_assignments(q))
|
||||||
|
|
||||||
|
|
||||||
|
def test_qualifier_in_middle_of_phrase():
|
||||||
|
q = make_query((BreakType.START, PhraseType.NONE, [(1, TokenType.PARTIAL)]),
|
||||||
|
(BreakType.PHRASE, PhraseType.NONE, [(2, TokenType.PARTIAL)]),
|
||||||
|
(BreakType.WORD, PhraseType.NONE, [(3, TokenType.QUALIFIER)]),
|
||||||
|
(BreakType.WORD, PhraseType.NONE, [(4, TokenType.PARTIAL)]),
|
||||||
|
(BreakType.PHRASE, PhraseType.NONE, [(5, TokenType.PARTIAL)]))
|
||||||
|
|
||||||
|
check_assignments(yield_token_assignments(q))
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user