filter duplicate results after DB query

This commit is contained in:
Sarah Hoffmann
2023-09-20 11:39:18 +02:00
parent fd26310d6a
commit 21df87dedc

View File

@@ -7,7 +7,7 @@
""" """
Public interface to the search code. Public interface to the search code.
""" """
from typing import List, Any, Optional, Iterator, Tuple from typing import List, Any, Optional, Iterator, Tuple, Dict
import itertools import itertools
import re import re
import datetime as dt import datetime as dt
@@ -15,7 +15,7 @@ import difflib
from nominatim.api.connection import SearchConnection from nominatim.api.connection import SearchConnection
from nominatim.api.types import SearchDetails from nominatim.api.types import SearchDetails
from nominatim.api.results import SearchResults, add_result_details from nominatim.api.results import SearchResult, SearchResults, add_result_details
from nominatim.api.search.token_assignment import yield_token_assignments from nominatim.api.search.token_assignment import yield_token_assignments
from nominatim.api.search.db_search_builder import SearchBuilder, build_poi_search, wrap_near_search from nominatim.api.search.db_search_builder import SearchBuilder, build_poi_search, wrap_near_search
from nominatim.api.search.db_searches import AbstractSearch from nominatim.api.search.db_searches import AbstractSearch
@@ -75,26 +75,32 @@ class ForwardGeocoder:
is found. is found.
""" """
log().section('Execute database searches') log().section('Execute database searches')
results = SearchResults() results: Dict[Any, SearchResult] = {}
end_time = dt.datetime.now() + self.timeout end_time = dt.datetime.now() + self.timeout
num_results = 0
min_ranking = 1000.0 min_ranking = 1000.0
prev_penalty = 0.0 prev_penalty = 0.0
for i, search in enumerate(searches): for i, search in enumerate(searches):
if search.penalty > prev_penalty and (search.penalty > min_ranking or i > 20): if search.penalty > prev_penalty and (search.penalty > min_ranking or i > 20):
break break
log().table_dump(f"{i + 1}. Search", _dump_searches([search], query)) log().table_dump(f"{i + 1}. Search", _dump_searches([search], query))
for result in await search.lookup(self.conn, self.params): lookup_results = await search.lookup(self.conn, self.params)
results.append(result) for result in lookup_results:
rhash = (result.source_table, result.place_id,
result.housenumber, result.country_code)
prevresult = results.get(rhash)
if prevresult:
prevresult.accuracy = min(prevresult.accuracy, result.accuracy)
else:
results[rhash] = result
min_ranking = min(min_ranking, result.ranking + 0.5, search.penalty + 0.3) min_ranking = min(min_ranking, result.ranking + 0.5, search.penalty + 0.3)
log().result_dump('Results', ((r.accuracy, r) for r in results[num_results:])) log().result_dump('Results', ((r.accuracy, r) for r in lookup_results))
num_results = len(results)
prev_penalty = search.penalty prev_penalty = search.penalty
if dt.datetime.now() >= end_time: if dt.datetime.now() >= end_time:
break break
return results return SearchResults(results.values())
def sort_and_cut_results(self, results: SearchResults) -> SearchResults: def sort_and_cut_results(self, results: SearchResults) -> SearchResults:
@@ -141,7 +147,12 @@ class ForwardGeocoder:
distance += len(qword) distance += len(qword)
else: else:
distance += (1.0 - wdist) * len(qword) distance += (1.0 - wdist) * len(qword)
result.accuracy += distance * 0.5 / sum(len(w) for w in qwords) # Compensate for the fact that country names do not get a
# match penalty yet by the tokenizer.
# Temporary hack that needs to be removed!
if result.rank_address == 4:
distance *= 2
result.accuracy += distance * 0.4 / sum(len(w) for w in qwords)
async def lookup_pois(self, categories: List[Tuple[str, str]], async def lookup_pois(self, categories: List[Tuple[str, str]],