rerank results by query

The algorithm is similar to the PHP reranking and uses the terms from the display name to check against the query terms. However, instead of exact matching, it uses a per-word edit distance, so that it is less strict when it comes to mismatching accents or other one-letter differences. Country names get a higher penalty because they currently don't receive a penalty during token matching. This will work badly with the legacy tokenizer. Given that the legacy tokenizer is marked for removal, it is simply not worth optimising for it.
2026-02-26 11:08:13 +00:00 · 2023-09-19 16:18:09 +02:00
parent 5762a5bc80
commit fd26310d6a
3 changed files with 64 additions and 4 deletions
--- a/nominatim/api/search/geocoder.py
+++ b/nominatim/api/search/geocoder.py
@@ -9,7 +9,9 @@ Public interface to the search code.
 """
 from typing import List, Any, Optional, Iterator, Tuple
 import itertools
+import re
 import datetime as dt
+import difflib

 from nominatim.api.connection import SearchConnection
 from nominatim.api.types import SearchDetails
@@ -92,23 +94,56 @@ class ForwardGeocoder:
            if dt.datetime.now() >= end_time:
                break

+        return results
+
+
+    def sort_and_cut_results(self, results: SearchResults) -> SearchResults:
+        """ Remove badly matching results, sort by ranking and
+            limit to the configured number of results.
+        """
        if results:
            min_ranking = min(r.ranking for r in results)
            results = SearchResults(r for r in results if r.ranking < min_ranking + 0.5)
+            results.sort(key=lambda r: r.ranking)

        if results:
-            min_rank = min(r.rank_search for r in results)
-
+            min_rank = results[0].rank_search
            results = SearchResults(r for r in results
                                    if r.ranking + 0.05 * (r.rank_search - min_rank)
                                       < min_ranking + 0.5)

-            results.sort(key=lambda r: r.accuracy - r.calculated_importance())
            results = SearchResults(results[:self.limit])

        return results


+    def rerank_by_query(self, query: QueryStruct, results: SearchResults) -> None:
+        """ Adjust the accuracy of the localized result according to how well
+            they match the original query.
+        """
+        assert self.query_analyzer is not None
+        qwords = [word for phrase in query.source
+                       for word in re.split('[, ]+', phrase.text) if word]
+        if not qwords:
+            return
+
+        for result in results:
+            if not result.display_name:
+                continue
+            distance = 0.0
+            norm = self.query_analyzer.normalize_text(result.display_name)
+            words = set((w for w in norm.split(' ') if w))
+            if not words:
+                continue
+            for qword in qwords:
+                wdist = max(difflib.SequenceMatcher(a=qword, b=w).quick_ratio() for w in words)
+                if wdist < 0.5:
+                    distance += len(qword)
+                else:
+                    distance += (1.0 - wdist) * len(qword)
+            result.accuracy += distance * 0.5 / sum(len(w) for w in qwords)
+
+
    async def lookup_pois(self, categories: List[Tuple[str, str]],
                          phrases: List[Phrase]) -> SearchResults:
        """ Look up places by category. If phrase is given, a place search
@@ -123,13 +158,16 @@ class ForwardGeocoder:
            if query:
                searches = [wrap_near_search(categories, s) for s in searches[:50]]
                results = await self.execute_searches(query, searches)
+                await add_result_details(self.conn, results, self.params)
+                log().result_dump('Preliminary Results', ((r.accuracy, r) for r in results))
+                results = self.sort_and_cut_results(results)
            else:
                results = SearchResults()
        else:
            search = build_poi_search(categories, self.params.countries)
            results = await search.lookup(self.conn, self.params)
+            await add_result_details(self.conn, results, self.params)

-        await add_result_details(self.conn, results, self.params)
        log().result_dump('Final Results', ((r.accuracy, r) for r in results))

        return results
@@ -150,6 +188,10 @@ class ForwardGeocoder:
            # Execute SQL until an appropriate result is found.
            results = await self.execute_searches(query, searches[:50])
            await add_result_details(self.conn, results, self.params)
+            log().result_dump('Preliminary Results', ((r.accuracy, r) for r in results))
+            self.rerank_by_query(query, results)
+            log().result_dump('Results after reranking', ((r.accuracy, r) for r in results))
+            results = self.sort_and_cut_results(results)
            log().result_dump('Final Results', ((r.accuracy, r) for r in results))

        return results
--- a/nominatim/api/search/legacy_tokenizer.py
+++ b/nominatim/api/search/legacy_tokenizer.py
@@ -127,6 +127,15 @@ class LegacyQueryAnalyzer(AbstractQueryAnalyzer):
        return query


+    def normalize_text(self, text: str) -> str:
+        """ Bring the given text into a normalized form.
+
+            This only removes case, so some difference with the normalization
+            in the phrase remains.
+        """
+        return text.lower()
+
+
    def split_query(self, query: qmod.QueryStruct) -> Tuple[List[str],
                                                            Dict[str, List[qmod.TokenRange]]]:
        """ Transliterate the phrases and split them into tokens.
--- a/nominatim/api/search/query_analyzer_factory.py
+++ b/nominatim/api/search/query_analyzer_factory.py
@@ -30,6 +30,15 @@ class AbstractQueryAnalyzer(ABC):
        """


+    @abstractmethod
+    def normalize_text(self, text: str) -> str:
+        """ Bring the given text into a normalized form. That is the
+            standardized form search will work with. All information removed
+            at this stage is inevitably lost.
+        """
+
+
+
 async def make_query_analyzer(conn: SearchConnection) -> AbstractQueryAnalyzer:
    """ Create a query analyzer for the tokenizer used by the database.
    """