Merge pull request #3840 from lonvia/normalize-penalties

Improve termination condition for forward search
This commit is contained in:
Sarah Hoffmann
2025-09-12 21:59:39 +02:00
committed by GitHub
7 changed files with 36 additions and 19 deletions

View File

@@ -187,7 +187,7 @@ class SearchBuilder:
dbf.FieldLookup('nameaddress_vector', addr_fulls, lookups.LookupAny)]
sdata.housenumbers = dbf.WeightedStrings([], [])
yield dbs.PlaceSearch(0.05, sdata, expected_count, True)
yield dbs.PlaceSearch(0.0, sdata, expected_count, True)
def build_name_search(self, sdata: dbf.SearchData,
name: qmod.TokenRange, address: List[qmod.TokenRange],
@@ -342,7 +342,10 @@ class SearchBuilder:
heapq.heappush(todo, (-tlist.end, tlist.end,
rank.with_token(t, chgpenalty)))
elif tlist.end == trange.end:
ranks.extend(rank.with_token(t, 0.0) for t in tlist.tokens)
chgpenalty = self.query.get_in_word_penalty(
qmod.TokenRange(pos, tlist.end))
ranks.extend(rank.with_token(t, chgpenalty)
for t in tlist.tokens)
if len(ranks) >= 10:
# Too many variants, bail out and only add

View File

@@ -80,7 +80,7 @@ class ForwardGeocoder:
qs = self.params.query_stats
qs['search_min_penalty'] = round(searches[0].penalty, 2)
min_ranking = searches[0].penalty + 2.0
min_ranking = searches[0].penalty + 1.5
prev_penalty = 0.0
for i, search in enumerate(searches):
if search.penalty > prev_penalty and (search.penalty > min_ranking or i > 15):
@@ -103,7 +103,9 @@ class ForwardGeocoder:
qs['search_min_result_penalty'] = spenalty
qs['search_best_penalty_round'] = i
results[rhash] = result
min_ranking = min(min_ranking, result.accuracy * 1.2, 2.0)
min_ranking = min(min_ranking,
search.penalty + 0.4,
result.accuracy + 0.1)
log().result_dump('Results', ((r.accuracy, r) for r in lookup_results))
prev_penalty = search.penalty
if self.timeout.is_elapsed():

View File

@@ -202,7 +202,7 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
term = ' '.join(n.term_lookup for n in query.nodes[start + 1:end + 1])
query.add_token(qmod.TokenRange(start, end),
qmod.TOKEN_POSTCODE,
ICUToken(penalty=0.1, token=0, count=1, addr_count=1,
ICUToken(penalty=0.0, token=0, count=1, addr_count=1,
lookup_word=pc, word_token=term,
info=None))
self.rerank_tokens(query)
@@ -288,7 +288,7 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
if need_hnr and is_full_token \
and len(node.term_normalized) <= 4 and node.term_normalized.isdigit():
query.add_token(qmod.TokenRange(i-1, i), qmod.TOKEN_HOUSENUMBER,
ICUToken(penalty=0.5, token=0,
ICUToken(penalty=0.2, token=0,
count=1, addr_count=1,
lookup_word=node.term_lookup,
word_token=node.term_lookup, info=None))
@@ -309,6 +309,9 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
len(query.nodes[end].term_lookup) > 4):
for token in tokens:
token.penalty += 0.39
if (start + 1 == end):
if partial := query.nodes[start].partial:
partial.penalty += 0.39
# If it looks like a simple housenumber, prefer that.
if qmod.TOKEN_HOUSENUMBER in tlist:
@@ -319,6 +322,9 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
if ttype != qmod.TOKEN_HOUSENUMBER:
for token in tokens:
token.penalty += penalty
if (start + 1 == end):
if partial := query.nodes[start].partial:
partial.penalty += penalty
# rerank tokens against the normalized form
norm = ''.join(f"{n.term_normalized}{'' if n.btype == qmod.BREAK_TOKEN else ' '}"

View File

@@ -380,17 +380,23 @@ class _TokenSequence:
if base.postcode and base.postcode.start == 0:
self.penalty += 0.1
min_penalty = self.penalty + 2.0
# Left-to-right reading of the address
if self.direction != -1:
yield from self._get_assignments_address_forward(base, query)
for result in self._get_assignments_address_forward(base, query):
min_penalty = min(min_penalty, result.penalty)
yield result
# Right-to-left reading of the address
if self.direction != 1:
yield from self._get_assignments_address_backward(base, query)
for result in self._get_assignments_address_backward(base, query):
min_penalty = min(min_penalty, result.penalty)
yield result
# variant for special housenumber searches
if base.housenumber and not base.qualifier:
yield dataclasses.replace(base, penalty=self.penalty)
yield dataclasses.replace(base, penalty=min_penalty + 0.1)
def yield_token_assignments(query: qmod.QueryStruct) -> Iterator[TokenAssignment]:

View File

@@ -2,13 +2,13 @@
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# Copyright (C) 2025 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Wrapper around place information the indexer gets from the database and hands to
the tokenizer.
"""
from typing import Optional, Mapping, Any, Tuple
from typing import Optional, Mapping, Any, Tuple, cast
class PlaceInfo:
@@ -56,7 +56,7 @@ class PlaceInfo:
[1]: ../customize/Ranking.md#address-rank
"""
return self._info.get('rank_address', 0)
return cast(int, self._info.get('rank_address', 0))
@property
def centroid(self) -> Optional[Tuple[float, float]]:

View File

@@ -23,8 +23,8 @@ Feature: Search API geocodejson output
Scenario: Search geocodejson - Town street-level address with footway
When sending v1/search with format geocodejson
| q | addressdetails |
| burg gutenberg 6000 jahre geschichte | 1 |
| q | addressdetails |
| 6000 jahre geschichte | 1 |
Then a HTTP 200 is returned
And the result is valid geocodejson
And all results contain

View File

@@ -127,7 +127,7 @@ def test_housenumber_and_street():
check_assignments(yield_token_assignments(q),
TokenAssignment(name=TokenRange(1, 2),
housenumber=TokenRange(0, 1)),
TokenAssignment(address=[TokenRange(1, 2)],
TokenAssignment(penalty=0.1, address=[TokenRange(1, 2)],
housenumber=TokenRange(0, 1)))
@@ -138,7 +138,7 @@ def test_housenumber_and_street_backwards():
check_assignments(yield_token_assignments(q),
TokenAssignment(name=TokenRange(0, 1),
housenumber=TokenRange(1, 2)),
TokenAssignment(address=[TokenRange(0, 1)],
TokenAssignment(penalty=0.1, address=[TokenRange(0, 1)],
housenumber=TokenRange(1, 2)))
@@ -154,7 +154,7 @@ def test_housenumber_and_postcode():
housenumber=TokenRange(1, 2),
address=[TokenRange(2, 3)],
postcode=TokenRange(3, 4)),
TokenAssignment(penalty=pytest.approx(0.3),
TokenAssignment(penalty=pytest.approx(0.4),
housenumber=TokenRange(1, 2),
address=[TokenRange(0, 1), TokenRange(2, 3)],
postcode=TokenRange(3, 4)))
@@ -172,7 +172,7 @@ def test_postcode_and_housenumber():
housenumber=TokenRange(3, 4),
address=[TokenRange(0, 1)],
postcode=TokenRange(1, 2)),
TokenAssignment(penalty=pytest.approx(0.3),
TokenAssignment(penalty=pytest.approx(0.4),
housenumber=TokenRange(3, 4),
address=[TokenRange(0, 1), TokenRange(2, 3)],
postcode=TokenRange(1, 2)))
@@ -218,7 +218,7 @@ def test_housenumber_many_phrases():
housenumber=TokenRange(3, 4),
address=[TokenRange(0, 1), TokenRange(1, 2),
TokenRange(2, 3)]),
TokenAssignment(penalty=0.1,
TokenAssignment(penalty=0.2,
housenumber=TokenRange(3, 4),
address=[TokenRange(0, 1), TokenRange(1, 2),
TokenRange(2, 3), TokenRange(4, 5)]))