rerank results by query

The algorithm is similar to the PHP reranking and uses the terms from
the display name to check against the query terms. However, instead of
exact matching it uses a per-word edit distance, so that it is less
strict when it comes to mismatching accents or other one-letter
differences.

Country names get a higher penalty because they don't receive a
penalty during token matching right now.

This will work badly with the legacy tokenizer. Given that it is
marked for removal, it is simply not worth optimising for it.
This commit is contained in:
Sarah Hoffmann
2023-09-19 16:18:09 +02:00
parent 5762a5bc80
commit fd26310d6a
3 changed files with 64 additions and 4 deletions

View File

@@ -9,7 +9,9 @@ Public interface to the search code.
"""
from typing import List, Any, Optional, Iterator, Tuple
import itertools
import re
import datetime as dt
import difflib
from nominatim.api.connection import SearchConnection
from nominatim.api.types import SearchDetails
@@ -92,23 +94,56 @@ class ForwardGeocoder:
if dt.datetime.now() >= end_time:
break
return results
def sort_and_cut_results(self, results: SearchResults) -> SearchResults:
""" Remove badly matching results, sort by ranking and
limit to the configured number of results.

Results whose ranking is 0.5 or more above the lowest ranking in
the list are dropped. The remainder is filtered a second time with
an extra 0.05 added per rank_search step above `min_rank`, and the
list is finally truncated to `self.limit` entries.
"""
# NOTE(review): this listing looks like a rendered diff whose +/- markers
# and indentation were stripped, so removed and added lines appear side by
# side (two `min_rank` assignments, two sorts) - confirm against the file.
if results:
min_ranking = min(r.ranking for r in results)
# First pass: keep only results within 0.5 of the lowest ranking.
results = SearchResults(r for r in results if r.ranking < min_ranking + 0.5)
results.sort(key=lambda r: r.ranking)
# The first filter may have been skipped for an empty list; check again
# before indexing into the results.
if results:
min_rank = min(r.rank_search for r in results)
# Overrides the assignment above with the rank_search of the first result.
min_rank = results[0].rank_search
# Second pass: same 0.5 window, but each rank_search step above min_rank
# adds 0.05 to the ranking being compared.
results = SearchResults(r for r in results
if r.ranking + 0.05 * (r.rank_search - min_rank)
< min_ranking + 0.5)
results.sort(key=lambda r: r.accuracy - r.calculated_importance())
# Cut the list down to the configured maximum number of results.
results = SearchResults(results[:self.limit])
return results
def rerank_by_query(self, query: QueryStruct, results: SearchResults) -> None:
    """ Adjust the accuracy of the localized result according to how well
        they match the original query.

        The query phrases are split into words on commas and spaces. Each
        query word is compared against every word of the normalized
        display name of a result using difflib's `quick_ratio()`
        similarity; the best similarity per query word is used. A query
        word whose best similarity is below 0.5 is penalised with its
        full length, otherwise with the dissimilar fraction of its length.
        The summed penalty is scaled by the total length of all query
        words, so that at most 0.5 is added to `result.accuracy`.

        Results are modified in place. Results without a display name
        or without any word in the normalized name are left untouched,
        as are all results when the query contains no words.
    """
    assert self.query_analyzer is not None
    qwords = [word for phrase in query.source
              for word in re.split('[, ]+', phrase.text) if word]
    if not qwords:
        return

    # Loop-invariant: total number of characters of all query words.
    # Never zero because empty words are filtered out above.
    total_len = sum(len(w) for w in qwords)

    for result in results:
        if not result.display_name:
            continue

        norm = self.query_analyzer.normalize_text(result.display_name)
        # Only the distinct words matter for the best-match search.
        words = {w for w in norm.split(' ') if w}
        if not words:
            continue

        distance = 0.0
        for qword in qwords:
            # Similarity of the closest display-name word (1.0 == same letters).
            wdist = max(difflib.SequenceMatcher(a=qword, b=w).quick_ratio()
                        for w in words)
            if wdist < 0.5:
                # Too dissimilar: count the word as completely unmatched.
                distance += len(qword)
            else:
                distance += (1.0 - wdist) * len(qword)

        result.accuracy += distance * 0.5 / total_len
async def lookup_pois(self, categories: List[Tuple[str, str]],
phrases: List[Phrase]) -> SearchResults:
""" Look up places by category. If phrase is given, a place search
@@ -123,13 +158,16 @@ class ForwardGeocoder:
if query:
searches = [wrap_near_search(categories, s) for s in searches[:50]]
results = await self.execute_searches(query, searches)
await add_result_details(self.conn, results, self.params)
log().result_dump('Preliminary Results', ((r.accuracy, r) for r in results))
results = self.sort_and_cut_results(results)
else:
results = SearchResults()
else:
search = build_poi_search(categories, self.params.countries)
results = await search.lookup(self.conn, self.params)
await add_result_details(self.conn, results, self.params)
await add_result_details(self.conn, results, self.params)
log().result_dump('Final Results', ((r.accuracy, r) for r in results))
return results
@@ -150,6 +188,10 @@ class ForwardGeocoder:
# Execute SQL until an appropriate result is found.
results = await self.execute_searches(query, searches[:50])
await add_result_details(self.conn, results, self.params)
log().result_dump('Preliminary Results', ((r.accuracy, r) for r in results))
self.rerank_by_query(query, results)
log().result_dump('Results after reranking', ((r.accuracy, r) for r in results))
results = self.sort_and_cut_results(results)
log().result_dump('Final Results', ((r.accuracy, r) for r in results))
return results

View File

@@ -127,6 +127,15 @@ class LegacyQueryAnalyzer(AbstractQueryAnalyzer):
return query
def normalize_text(self, text: str) -> str:
    """ Return the given text in its normalized form.

        Normalization here is limited to lower-casing the text, so the
        result may still differ from the normalization applied to the
        phrases.
    """
    lowered = text.lower()
    return lowered
def split_query(self, query: qmod.QueryStruct) -> Tuple[List[str],
Dict[str, List[qmod.TokenRange]]]:
""" Transliterate the phrases and split them into tokens.

View File

@@ -30,6 +30,15 @@ class AbstractQueryAnalyzer(ABC):
"""
@abstractmethod
def normalize_text(self, text: str) -> str:
    """ Convert the given text into its normalized form, i.e. the
        standardized form that search operates on. Any information
        dropped by this step cannot be recovered later.
    """
async def make_query_analyzer(conn: SearchConnection) -> AbstractQueryAnalyzer:
""" Create a query analyzer for the tokenizer used by the database.
"""