Adds language-aware country penalty in forward geocoding

Threads the caller's Accept-Language preference into ForwardGeocoder
and uses it in rerank_by_query to check whether the query matches the
localized name of a country result.

If the caller's locale renders the country name differently from the
query term (e.g. for the query "Brasilia", pt-BR renders Brazil as
"Brasil" ≠ "Brasilia"), half of the country's calculated importance is
added as an accuracy penalty, neutralising its dominance over
lower-ranked places.

If the localized name matches the query term (e.g. Finnish renders
Brazil as "Brasilia" = "Brasilia"), no penalty is applied and the
country correctly wins.
This commit is contained in:
Itz-Agasta
2026-02-20 22:27:30 +05:30
parent af9458a601
commit 9c2d4f4285
3 changed files with 26 additions and 7 deletions

View File

@@ -170,11 +170,20 @@ class ForwardGeocoder:
if qword not in words:
wdist = max(difflib.SequenceMatcher(a=qword, b=w).quick_ratio() for w in words)
distance += len(qword) if wdist < 0.4 else 1
# Compensate for the fact that country names do not get a
# match penalty yet by the tokenizer.
# Temporary hack that needs to be removed!
# Countries with high importance can dominate results when matched
# via an alternate-language name. Apply a language-aware penalty
# to offset this.
if result.rank_address == 4:
distance *= 2
if self.params.locales and result.names:
loc_names = [result.names[t] for t in self.params.locales.name_tags
if t in result.names]
if loc_names:
norm_loc = self.query_analyzer.normalize_text(' '.join(loc_names))
loc_words = set(w for w in re.split('[-,: ]+', norm_loc) if w)
if loc_words and loc_words.isdisjoint(qwords):
result.accuracy += result.calculated_importance() * 0.5
else:
distance *= 2
result.accuracy += distance * 0.3 / sum(len(w) for w in qwords)
async def lookup_pois(self, categories: List[Tuple[str, str]],

View File

@@ -8,7 +8,7 @@
Complex datatypes used by the Nominatim API.
"""
from typing import Optional, Union, Tuple, NamedTuple, TypeVar, Type, Dict, \
Any, List, Sequence
Any, List, Sequence, TYPE_CHECKING
from collections import abc
import dataclasses
import datetime as dt
@@ -17,6 +17,8 @@ import math
from struct import unpack
from binascii import unhexlify
if TYPE_CHECKING:
from .localization import Locales
from .errors import UsageError
@@ -573,6 +575,13 @@ class SearchDetails(LookupDetails):
viewbox_x2: Optional[Bbox] = None
locales: Optional['Locales'] = dataclasses.field(
default=None, metadata={'transform': lambda v: v})
""" Locale preferences of the caller.
Used during result re-ranking to prefer results that match the
caller's locale over results that only match in an alternate language.
"""
def __post_init__(self) -> None:
if self.viewbox is not None:
xext = (self.viewbox.maxlon - self.viewbox.minlon)/2

View File

@@ -334,6 +334,8 @@ async def search_endpoint(api: NominatimAPIAsync, params: ASGIAdaptor) -> Any:
details['layers'] = DataLayer.ADDRESS
else:
details['layers'] = get_layers(params)
details['locales'] = Locales.from_accept_languages(get_accepted_languages(params),
params.config().OUTPUT_NAMES)
# unstructured query parameters
query = params.get('q', None)
@@ -359,8 +361,7 @@ async def search_endpoint(api: NominatimAPIAsync, params: ASGIAdaptor) -> Any:
except UsageError as err:
params.raise_error(str(err))
Locales.from_accept_languages(get_accepted_languages(params),
params.config().OUTPUT_NAMES).localize_results(results)
details['locales'].localize_results(results)
if details['dedupe'] and len(results) > 1:
results = helpers.deduplicate_results(results, max_results)