From 43ffceff2774dfca2a8c975683daf5f73ff1dac1 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Fri, 12 Sep 2025 09:45:57 +0200 Subject: [PATCH 1/8] remove base penalty for postcodes This is a relict from having base penalties for all terms. --- src/nominatim_api/search/icu_tokenizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nominatim_api/search/icu_tokenizer.py b/src/nominatim_api/search/icu_tokenizer.py index 4be15d67..e0b14941 100644 --- a/src/nominatim_api/search/icu_tokenizer.py +++ b/src/nominatim_api/search/icu_tokenizer.py @@ -202,7 +202,7 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer): term = ' '.join(n.term_lookup for n in query.nodes[start + 1:end + 1]) query.add_token(qmod.TokenRange(start, end), qmod.TOKEN_POSTCODE, - ICUToken(penalty=0.1, token=0, count=1, addr_count=1, + ICUToken(penalty=0.0, token=0, count=1, addr_count=1, lookup_word=pc, word_token=term, info=None)) self.rerank_tokens(query) From 42b687f545680a6b7a813f4b5c7989c8cfe67890 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Fri, 12 Sep 2025 10:01:13 +0200 Subject: [PATCH 2/8] stop searching earlier after the first result was found --- src/nominatim_api/search/geocoder.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/nominatim_api/search/geocoder.py b/src/nominatim_api/search/geocoder.py index 3fcb4101..3b039342 100644 --- a/src/nominatim_api/search/geocoder.py +++ b/src/nominatim_api/search/geocoder.py @@ -80,7 +80,7 @@ class ForwardGeocoder: qs = self.params.query_stats qs['search_min_penalty'] = round(searches[0].penalty, 2) - min_ranking = searches[0].penalty + 2.0 + min_ranking = searches[0].penalty + 1.5 prev_penalty = 0.0 for i, search in enumerate(searches): if search.penalty > prev_penalty and (search.penalty > min_ranking or i > 15): @@ -103,7 +103,9 @@ class ForwardGeocoder: qs['search_min_result_penalty'] = spenalty qs['search_best_penalty_round'] = i results[rhash] = result - min_ranking = min(min_ranking, 
result.accuracy * 1.2, 2.0) + min_ranking = min(min_ranking, + search.penalty + 0.4, + result.accuracy + 0.1) log().result_dump('Results', ((r.accuracy, r) for r in lookup_results)) prev_penalty = search.penalty if self.timeout.is_elapsed(): From 54620f9566fdb6337cc1e2c41d17ad786a87af30 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Fri, 12 Sep 2025 10:52:42 +0200 Subject: [PATCH 3/8] base penalty for housenumber searches on similar address searches --- src/nominatim_api/search/db_search_builder.py | 2 +- src/nominatim_api/search/token_assignment.py | 12 +++++++++--- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/src/nominatim_api/search/db_search_builder.py b/src/nominatim_api/search/db_search_builder.py index ae69e4ae..0ced365f 100644 --- a/src/nominatim_api/search/db_search_builder.py +++ b/src/nominatim_api/search/db_search_builder.py @@ -187,7 +187,7 @@ class SearchBuilder: dbf.FieldLookup('nameaddress_vector', addr_fulls, lookups.LookupAny)] sdata.housenumbers = dbf.WeightedStrings([], []) - yield dbs.PlaceSearch(0.05, sdata, expected_count, True) + yield dbs.PlaceSearch(0.0, sdata, expected_count, True) def build_name_search(self, sdata: dbf.SearchData, name: qmod.TokenRange, address: List[qmod.TokenRange], diff --git a/src/nominatim_api/search/token_assignment.py b/src/nominatim_api/search/token_assignment.py index 798ee546..159b36ac 100644 --- a/src/nominatim_api/search/token_assignment.py +++ b/src/nominatim_api/search/token_assignment.py @@ -380,17 +380,23 @@ class _TokenSequence: if base.postcode and base.postcode.start == 0: self.penalty += 0.1 + min_penalty = self.penalty + 2.0 + # Left-to-right reading of the address if self.direction != -1: - yield from self._get_assignments_address_forward(base, query) + for result in self._get_assignments_address_forward(base, query): + min_penalty = min(min_penalty, result.penalty) + yield result # Right-to-left reading of the address if self.direction != 1: - yield from 
self._get_assignments_address_backward(base, query) + for result in self._get_assignments_address_backward(base, query): + min_penalty = min(min_penalty, result.penalty) + yield result # variant for special housenumber searches if base.housenumber and not base.qualifier: - yield dataclasses.replace(base, penalty=self.penalty) + yield dataclasses.replace(base, penalty=min_penalty + 0.1) def yield_token_assignments(query: qmod.QueryStruct) -> Iterator[TokenAssignment]: From 4fd881bcb207586d598dba48870f4aef254f67a1 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Fri, 12 Sep 2025 11:50:01 +0200 Subject: [PATCH 4/8] housenumber and postcode cross penalties for partials --- src/nominatim_api/search/icu_tokenizer.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/nominatim_api/search/icu_tokenizer.py b/src/nominatim_api/search/icu_tokenizer.py index e0b14941..4dba5275 100644 --- a/src/nominatim_api/search/icu_tokenizer.py +++ b/src/nominatim_api/search/icu_tokenizer.py @@ -309,6 +309,9 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer): len(query.nodes[end].term_lookup) > 4): for token in tokens: token.penalty += 0.39 + if (start + 1 == end): + if partial := query.nodes[start].partial: + partial.penalty += 0.39 # If it looks like a simple housenumber, prefer that. 
if qmod.TOKEN_HOUSENUMBER in tlist: @@ -319,6 +322,9 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer): if ttype != qmod.TOKEN_HOUSENUMBER: for token in tokens: token.penalty += penalty + if (start + 1 == end): + if partial := query.nodes[start].partial: + partial.penalty += penalty # rerank tokens against the normalized form norm = ''.join(f"{n.term_normalized}{'' if n.btype == qmod.BREAK_TOKEN else ' '}" From 193d6c41737f1c7359731f87edb8708068cc80d2 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Fri, 12 Sep 2025 12:05:29 +0200 Subject: [PATCH 5/8] in-word penalty for final address token --- src/nominatim_api/search/db_search_builder.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/nominatim_api/search/db_search_builder.py b/src/nominatim_api/search/db_search_builder.py index 0ced365f..f90c6d7f 100644 --- a/src/nominatim_api/search/db_search_builder.py +++ b/src/nominatim_api/search/db_search_builder.py @@ -342,7 +342,10 @@ class SearchBuilder: heapq.heappush(todo, (-tlist.end, tlist.end, rank.with_token(t, chgpenalty))) elif tlist.end == trange.end: - ranks.extend(rank.with_token(t, 0.0) for t in tlist.tokens) + chgpenalty = self.query.get_in_word_penalty( + qmod.TokenRange(pos, tlist.end)) + ranks.extend(rank.with_token(t, chgpenalty) + for t in tlist.tokens) if len(ranks) >= 10: # Too many variants, bail out and only add From 72592da0ccbee4cd9a1e3e23708b11a53fde6a7a Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Fri, 12 Sep 2025 17:44:54 +0200 Subject: [PATCH 6/8] reduce penalty for artificial housenumbers --- src/nominatim_api/search/icu_tokenizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nominatim_api/search/icu_tokenizer.py b/src/nominatim_api/search/icu_tokenizer.py index 4dba5275..4ab85fd3 100644 --- a/src/nominatim_api/search/icu_tokenizer.py +++ b/src/nominatim_api/search/icu_tokenizer.py @@ -288,7 +288,7 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer): if need_hnr and is_full_token \ 
and len(node.term_normalized) <= 4 and node.term_normalized.isdigit(): query.add_token(qmod.TokenRange(i-1, i), qmod.TOKEN_HOUSENUMBER, - ICUToken(penalty=0.5, token=0, + ICUToken(penalty=0.2, token=0, count=1, addr_count=1, lookup_word=node.term_lookup, word_token=node.term_lookup, info=None)) From 5a8aa6cce4c8b0e8b4eaf68ed3ad71d4940988bb Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Fri, 12 Sep 2025 17:45:22 +0200 Subject: [PATCH 7/8] adapt tests to new penalties --- test/bdd/features/api/search/v1_geocodejson.feature | 4 ++-- test/python/api/search/test_token_assignment.py | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/test/bdd/features/api/search/v1_geocodejson.feature b/test/bdd/features/api/search/v1_geocodejson.feature index 99fff0e4..785b45d1 100644 --- a/test/bdd/features/api/search/v1_geocodejson.feature +++ b/test/bdd/features/api/search/v1_geocodejson.feature @@ -23,8 +23,8 @@ Feature: Search API geocodejson output Scenario: Search geocodejson - Town street-level address with footway When sending v1/search with format geocodejson - | q | addressdetails | - | burg gutenberg 6000 jahre geschichte | 1 | + | q | addressdetails | + | 6000 jahre geschichte | 1 | Then a HTTP 200 is returned And the result is valid geocodejson And all results contain diff --git a/test/python/api/search/test_token_assignment.py b/test/python/api/search/test_token_assignment.py index e45352d7..0b2d7cb9 100644 --- a/test/python/api/search/test_token_assignment.py +++ b/test/python/api/search/test_token_assignment.py @@ -127,7 +127,7 @@ def test_housenumber_and_street(): check_assignments(yield_token_assignments(q), TokenAssignment(name=TokenRange(1, 2), housenumber=TokenRange(0, 1)), - TokenAssignment(address=[TokenRange(1, 2)], + TokenAssignment(penalty=0.1, address=[TokenRange(1, 2)], housenumber=TokenRange(0, 1))) @@ -138,7 +138,7 @@ def test_housenumber_and_street_backwards(): check_assignments(yield_token_assignments(q), 
TokenAssignment(name=TokenRange(0, 1), housenumber=TokenRange(1, 2)), - TokenAssignment(address=[TokenRange(0, 1)], + TokenAssignment(penalty=0.1, address=[TokenRange(0, 1)], housenumber=TokenRange(1, 2))) @@ -154,7 +154,7 @@ def test_housenumber_and_postcode(): housenumber=TokenRange(1, 2), address=[TokenRange(2, 3)], postcode=TokenRange(3, 4)), - TokenAssignment(penalty=pytest.approx(0.3), + TokenAssignment(penalty=pytest.approx(0.4), housenumber=TokenRange(1, 2), address=[TokenRange(0, 1), TokenRange(2, 3)], postcode=TokenRange(3, 4))) @@ -172,7 +172,7 @@ def test_postcode_and_housenumber(): housenumber=TokenRange(3, 4), address=[TokenRange(0, 1)], postcode=TokenRange(1, 2)), - TokenAssignment(penalty=pytest.approx(0.3), + TokenAssignment(penalty=pytest.approx(0.4), housenumber=TokenRange(3, 4), address=[TokenRange(0, 1), TokenRange(2, 3)], postcode=TokenRange(1, 2))) @@ -218,7 +218,7 @@ def test_housenumber_many_phrases(): housenumber=TokenRange(3, 4), address=[TokenRange(0, 1), TokenRange(1, 2), TokenRange(2, 3)]), - TokenAssignment(penalty=0.1, + TokenAssignment(penalty=0.2, housenumber=TokenRange(3, 4), address=[TokenRange(0, 1), TokenRange(1, 2), TokenRange(2, 3), TokenRange(4, 5)])) From 7715a9d50056c5f5a7f1661c6a87fcce31720a06 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Fri, 12 Sep 2025 19:32:49 +0200 Subject: [PATCH 8/8] fix new mypy issue --- src/nominatim_db/data/place_info.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/nominatim_db/data/place_info.py b/src/nominatim_db/data/place_info.py index 5fc6a48a..32f86c96 100644 --- a/src/nominatim_db/data/place_info.py +++ b/src/nominatim_db/data/place_info.py @@ -2,13 +2,13 @@ # # This file is part of Nominatim. (https://nominatim.org) # -# Copyright (C) 2024 by the Nominatim developer community. +# Copyright (C) 2025 by the Nominatim developer community. # For a full list of authors see the git log. 
""" Wrapper around place information the indexer gets from the database and hands to the tokenizer. """ -from typing import Optional, Mapping, Any, Tuple +from typing import Optional, Mapping, Any, Tuple, cast class PlaceInfo: @@ -56,7 +56,7 @@ class PlaceInfo: [1]: ../customize/Ranking.md#address-rank """ - return self._info.get('rank_address', 0) + return cast(int, self._info.get('rank_address', 0)) @property def centroid(self) -> Optional[Tuple[float, float]]: