From 9c2d4f4285575e679e2bf625ba8bd175112c738f Mon Sep 17 00:00:00 2001 From: Itz-Agasta Date: Fri, 20 Feb 2026 22:27:30 +0530 Subject: [PATCH 1/2] Adds language-aware country penalty in forward geocoding MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Threads the caller's Accept-Language preference into ForwardGeocoder and uses it in rerank_by_query to check whether the query matches the localized name of a country result. If the caller's locale renders the country name differently (e.g. pt-BR gives "Brasil" ≠ "Brasilia"), the country's importance is added as an accuracy penalty, neutralising its dominance over lower-ranked places. If the locale matches (e.g. Finnish gives "Brasilia" = "Brasilia"), no penalty is applied and the country correctly wins. --- src/nominatim_api/search/geocoder.py | 17 +++++++++++++---- src/nominatim_api/types.py | 11 ++++++++++- src/nominatim_api/v1/server_glue.py | 5 +++-- 3 files changed, 26 insertions(+), 7 deletions(-) diff --git a/src/nominatim_api/search/geocoder.py b/src/nominatim_api/search/geocoder.py index 3b039342..1fb67cf1 100644 --- a/src/nominatim_api/search/geocoder.py +++ b/src/nominatim_api/search/geocoder.py @@ -170,11 +170,20 @@ class ForwardGeocoder: if qword not in words: wdist = max(difflib.SequenceMatcher(a=qword, b=w).quick_ratio() for w in words) distance += len(qword) if wdist < 0.4 else 1 - # Compensate for the fact that country names do not get a - # match penalty yet by the tokenizer. - # Temporary hack that needs to be removed! + # Countries with high importance can dominate results when matched + # via an alternate-language name. Apply a language-aware penalty + # to offset this. if result.rank_address == 4: - distance *= 2 + if self.params.locales and result.names: + loc_names = [result.names[t] for t in self.params.locales.name_tags + if t in result.names] + if loc_names: + norm_loc = self.query_analyzer.normalize_text(' '.join(loc_names)) + loc_words = set(w for w in re.split('[-,: ]+', norm_loc) if w) + if loc_words and loc_words.isdisjoint(qwords): + result.accuracy += result.calculated_importance() * 0.5 + else: + distance *= 2 result.accuracy += distance * 0.3 / sum(len(w) for w in qwords) async def lookup_pois(self, categories: List[Tuple[str, str]], diff --git a/src/nominatim_api/types.py b/src/nominatim_api/types.py index 92c2b6b9..a9fd29a4 100644 --- a/src/nominatim_api/types.py +++ b/src/nominatim_api/types.py @@ -8,7 +8,7 @@ Complex datatypes used by the Nominatim API. """ from typing import Optional, Union, Tuple, NamedTuple, TypeVar, Type, Dict, \ - Any, List, Sequence + Any, List, Sequence, TYPE_CHECKING from collections import abc import dataclasses import datetime as dt @@ -17,6 +17,8 @@ import math from struct import unpack from binascii import unhexlify +if TYPE_CHECKING: + from .localization import Locales from .errors import UsageError @@ -573,6 +575,13 @@ class SearchDetails(LookupDetails): viewbox_x2: Optional[Bbox] = None + locales: Optional['Locales'] = dataclasses.field( + default=None, metadata={'transform': lambda v: v}) + """ Locale preferences of the caller. + Used during result re-ranking to prefer results that match the + caller's locale over results that only match in an alternate language. + """ + def __post_init__(self) -> None: if self.viewbox is not None: xext = (self.viewbox.maxlon - self.viewbox.minlon)/2 diff --git a/src/nominatim_api/v1/server_glue.py b/src/nominatim_api/v1/server_glue.py index c02a1307..995da8d1 100644 --- a/src/nominatim_api/v1/server_glue.py +++ b/src/nominatim_api/v1/server_glue.py @@ -334,6 +334,8 @@ async def search_endpoint(api: NominatimAPIAsync, params: ASGIAdaptor) -> Any: details['layers'] = DataLayer.ADDRESS else: details['layers'] = get_layers(params) + details['locales'] = Locales.from_accept_languages(get_accepted_languages(params), + params.config().OUTPUT_NAMES) # unstructured query parameters query = params.get('q', None) @@ -359,8 +361,7 @@ async def search_endpoint(api: NominatimAPIAsync, params: ASGIAdaptor) -> Any: except UsageError as err: params.raise_error(str(err)) - Locales.from_accept_languages(get_accepted_languages(params), - params.config().OUTPUT_NAMES).localize_results(results) + details['locales'].localize_results(results) if details['dedupe'] and len(results) > 1: results = helpers.deduplicate_results(results, max_results) From 36a364ec25876603d477c2cf02cfdb6c39b0b824 Mon Sep 17 00:00:00 2001 From: Itz-Agasta Date: Mon, 2 Mar 2026 12:36:45 +0530 Subject: [PATCH 2/2] Adds test for locale-sensitive country name matching Introduces a scenario to verify that a country's alternate-language name does not dominate search results when the requested locale differs. Ensures correct result selection for locale-aware geocoding. Relates to #3210 --- .../features/db/query/search_simple.feature | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/test/bdd/features/db/query/search_simple.feature b/test/bdd/features/db/query/search_simple.feature index 10a26ed0..e5e771f7 100644 --- a/test/bdd/features/db/query/search_simple.feature +++ b/test/bdd/features/db/query/search_simple.feature @@ -80,3 +80,23 @@ Feature: Searching of simple objects | Chicago | Illinois | IL | | Auburn | Alabama | AL | | New Orleans | Louisiana | LA | + + # github #3210 + Scenario: Country with alternate-language name does not dominate when locale differs + Given the 1.0 grid with origin DE + | 1 | | 2 | + | | 10 | | + | 4 | | 3 | + Given the places + | osm | class | type | admin | name+name | name+name:fi | name+name:de | country | geometry | + | R1 | boundary | administrative | 2 | Turgei | Turgi | Testland | de | (1,2,3,4,1) | + Given the places + | osm | class | type | name+name | geometry | + | N10 | place | village | Turgi | 10 | + When importing + And geocoding "Turgi" + | accept-language | + | de | + Then result 0 contains + | object | + | N10 |