diff --git a/nominatim/api/results.py b/nominatim/api/results.py index 0183f5b9..1b2534e2 100644 --- a/nominatim/api/results.py +++ b/nominatim/api/results.py @@ -292,12 +292,6 @@ class SearchResults(List[SearchResult]): May be empty when no result was found. """ - def localize(self, locales: Locales) -> None: - """ Apply the given locales to all results. - """ - for result in self: - result.localize(locales) - def _filter_geometries(row: SaRow) -> Dict[str, str]: return {k[9:]: v for k, v in row._mapping.items() # pylint: disable=W0212 @@ -459,6 +453,8 @@ async def add_result_details(conn: SearchConnection, results: List[BaseResultT], log().comment('Query keywords') for result in results: await complete_keywords(conn, result) + for result in results: + result.localize(details.locales) def _result_row_to_address_row(row: SaRow) -> AddressLine: diff --git a/nominatim/api/search/geocoder.py b/nominatim/api/search/geocoder.py index f88bffbd..7ff3ed08 100644 --- a/nominatim/api/search/geocoder.py +++ b/nominatim/api/search/geocoder.py @@ -7,13 +7,15 @@ """ Public interface to the search code. """ -from typing import List, Any, Optional, Iterator, Tuple +from typing import List, Any, Optional, Iterator, Tuple, Dict import itertools +import re import datetime as dt +import difflib from nominatim.api.connection import SearchConnection from nominatim.api.types import SearchDetails -from nominatim.api.results import SearchResults, add_result_details +from nominatim.api.results import SearchResult, SearchResults, add_result_details from nominatim.api.search.token_assignment import yield_token_assignments from nominatim.api.search.db_search_builder import SearchBuilder, build_poi_search, wrap_near_search from nominatim.api.search.db_searches import AbstractSearch @@ -73,42 +75,86 @@ class ForwardGeocoder: is found. """ log().section('Execute database searches') - results = SearchResults() + results: Dict[Any, SearchResult] = {} + end_time = dt.datetime.now() + self.timeout - num_results = 0 min_ranking = 1000.0 prev_penalty = 0.0 for i, search in enumerate(searches): if search.penalty > prev_penalty and (search.penalty > min_ranking or i > 20): break log().table_dump(f"{i + 1}. Search", _dump_searches([search], query)) - for result in await search.lookup(self.conn, self.params): - results.append(result) + lookup_results = await search.lookup(self.conn, self.params) + for result in lookup_results: + rhash = (result.source_table, result.place_id, + result.housenumber, result.country_code) + prevresult = results.get(rhash) + if prevresult: + prevresult.accuracy = min(prevresult.accuracy, result.accuracy) + else: + results[rhash] = result min_ranking = min(min_ranking, result.ranking + 0.5, search.penalty + 0.3) - log().result_dump('Results', ((r.accuracy, r) for r in results[num_results:])) - num_results = len(results) + log().result_dump('Results', ((r.accuracy, r) for r in lookup_results)) prev_penalty = search.penalty if dt.datetime.now() >= end_time: break + return SearchResults(results.values()) + + + def sort_and_cut_results(self, results: SearchResults) -> SearchResults: + """ Remove badly matching results, sort by ranking and + limit to the configured number of results. + """ if results: min_ranking = min(r.ranking for r in results) results = SearchResults(r for r in results if r.ranking < min_ranking + 0.5) + results.sort(key=lambda r: r.ranking) if results: - min_rank = min(r.rank_search for r in results) - + min_rank = results[0].rank_search results = SearchResults(r for r in results if r.ranking + 0.05 * (r.rank_search - min_rank) < min_ranking + 0.5) - results.sort(key=lambda r: r.accuracy - r.calculated_importance()) results = SearchResults(results[:self.limit]) return results + def rerank_by_query(self, query: QueryStruct, results: SearchResults) -> None: + """ Adjust the accuracy of the localized result according to how well + they match the original query. + """ + assert self.query_analyzer is not None + qwords = [word for phrase in query.source + for word in re.split('[, ]+', phrase.text) if word] + if not qwords: + return + + for result in results: + if not result.display_name: + continue + distance = 0.0 + norm = self.query_analyzer.normalize_text(result.display_name) + words = set((w for w in norm.split(' ') if w)) + if not words: + continue + for qword in qwords: + wdist = max(difflib.SequenceMatcher(a=qword, b=w).quick_ratio() for w in words) + if wdist < 0.5: + distance += len(qword) + else: + distance += (1.0 - wdist) * len(qword) + # Compensate for the fact that country names do not get a + # match penalty yet by the tokenizer. + # Temporary hack that needs to be removed! + if result.rank_address == 4: + distance *= 2 + result.accuracy += distance * 0.4 / sum(len(w) for w in qwords) + + async def lookup_pois(self, categories: List[Tuple[str, str]], phrases: List[Phrase]) -> SearchResults: """ Look up places by category. If phrase is given, a place search @@ -123,13 +169,16 @@ class ForwardGeocoder: if query: searches = [wrap_near_search(categories, s) for s in searches[:50]] results = await self.execute_searches(query, searches) + await add_result_details(self.conn, results, self.params) + log().result_dump('Preliminary Results', ((r.accuracy, r) for r in results)) + results = self.sort_and_cut_results(results) else: results = SearchResults() else: search = build_poi_search(categories, self.params.countries) results = await search.lookup(self.conn, self.params) + await add_result_details(self.conn, results, self.params) - await add_result_details(self.conn, results, self.params) log().result_dump('Final Results', ((r.accuracy, r) for r in results)) return results @@ -150,6 +199,10 @@ class ForwardGeocoder: # Execute SQL until an appropriate result is found. results = await self.execute_searches(query, searches[:50]) await add_result_details(self.conn, results, self.params) + log().result_dump('Preliminary Results', ((r.accuracy, r) for r in results)) + self.rerank_by_query(query, results) + log().result_dump('Results after reranking', ((r.accuracy, r) for r in results)) + results = self.sort_and_cut_results(results) log().result_dump('Final Results', ((r.accuracy, r) for r in results)) return results diff --git a/nominatim/api/search/legacy_tokenizer.py b/nominatim/api/search/legacy_tokenizer.py index 3346584c..26e4c126 100644 --- a/nominatim/api/search/legacy_tokenizer.py +++ b/nominatim/api/search/legacy_tokenizer.py @@ -127,6 +127,15 @@ class LegacyQueryAnalyzer(AbstractQueryAnalyzer): return query + def normalize_text(self, text: str) -> str: + """ Bring the given text into a normalized form. + + This only removes case, so some difference with the normalization + in the phrase remains. + """ + return text.lower() + + def split_query(self, query: qmod.QueryStruct) -> Tuple[List[str], Dict[str, List[qmod.TokenRange]]]: """ Transliterate the phrases and split them into tokens. diff --git a/nominatim/api/search/query_analyzer_factory.py b/nominatim/api/search/query_analyzer_factory.py index 35649d0f..bbc1eb6b 100644 --- a/nominatim/api/search/query_analyzer_factory.py +++ b/nominatim/api/search/query_analyzer_factory.py @@ -30,6 +30,15 @@ class AbstractQueryAnalyzer(ABC): """ + @abstractmethod + def normalize_text(self, text: str) -> str: + """ Bring the given text into a normalized form. That is the + standardized form search will work with. All information removed + at this stage is inevitably lost. + """ + + + async def make_query_analyzer(conn: SearchConnection) -> AbstractQueryAnalyzer: """ Create a query analyzer for the tokenizer used by the database. """ diff --git a/nominatim/api/types.py b/nominatim/api/types.py index c96b3f59..3ca023e7 100644 --- a/nominatim/api/types.py +++ b/nominatim/api/types.py @@ -17,6 +17,7 @@ from struct import unpack from binascii import unhexlify from nominatim.errors import UsageError +from nominatim.api.localization import Locales # pylint: disable=no-member,too-many-boolean-expressions,too-many-instance-attributes @@ -386,7 +387,7 @@ TParam = TypeVar('TParam', bound='LookupDetails') # pylint: disable=invalid-name @dataclasses.dataclass class LookupDetails: - """ Collection of parameters that define the amount of details + """ Collection of parameters that define which kind of details are returned with a lookup or details result. """ geometry_output: GeometryFormat = GeometryFormat.NONE @@ -413,6 +414,9 @@ class LookupDetails: 0.0 means the original geometry is kept. The higher the value, the more the geometry gets simplified. """ + locales: Locales = Locales() + """ Prefered languages for localization of results. + """ @classmethod def from_kwargs(cls: Type[TParam], kwargs: Dict[str, Any]) -> TParam: diff --git a/nominatim/api/v1/server_glue.py b/nominatim/api/v1/server_glue.py index 5e8dbf4f..70f7dc40 100644 --- a/nominatim/api/v1/server_glue.py +++ b/nominatim/api/v1/server_glue.py @@ -308,7 +308,8 @@ async def details_endpoint(api: napi.NominatimAPIAsync, params: ASGIAdaptor) -> keywords=params.get_bool('keywords', False), geometry_output = napi.GeometryFormat.GEOJSON if params.get_bool('polygon_geojson', False) - else napi.GeometryFormat.NONE + else napi.GeometryFormat.NONE, + locales=locales ) if debug: @@ -317,8 +318,6 @@ async def details_endpoint(api: napi.NominatimAPIAsync, params: ASGIAdaptor) -> if result is None: params.raise_error('No place with that OSM ID found.', status=404) - result.localize(locales) - output = formatting.format_result(result, fmt, {'locales': locales, 'group_hierarchy': params.get_bool('group_hierarchy', False), @@ -337,6 +336,7 @@ async def reverse_endpoint(api: napi.NominatimAPIAsync, params: ASGIAdaptor) -> details = params.parse_geometry_details(fmt) details['max_rank'] = helpers.zoom_to_rank(params.get_int('zoom', 18)) details['layers'] = params.get_layers() + details['locales'] = napi.Locales.from_accept_languages(params.get_accepted_languages()) result = await api.reverse(coord, **details) @@ -357,9 +357,6 @@ async def reverse_endpoint(api: napi.NominatimAPIAsync, params: ASGIAdaptor) -> 'namedetails': params.get_bool('namedetails', False), 'addressdetails': params.get_bool('addressdetails', True)} - if result: - result.localize(napi.Locales.from_accept_languages(params.get_accepted_languages())) - output = formatting.format_result(napi.ReverseResults([result] if result else []), fmt, fmt_options) @@ -372,6 +369,7 @@ async def lookup_endpoint(api: napi.NominatimAPIAsync, params: ASGIAdaptor) -> A fmt = params.parse_format(napi.SearchResults, 'xml') debug = params.setup_debugging() details = params.parse_geometry_details(fmt) + details['locales'] = napi.Locales.from_accept_languages(params.get_accepted_languages()) places = [] for oid in (params.get('osm_ids') or '').split(','): @@ -394,8 +392,6 @@ async def lookup_endpoint(api: napi.NominatimAPIAsync, params: ASGIAdaptor) -> A 'namedetails': params.get_bool('namedetails', False), 'addressdetails': params.get_bool('addressdetails', True)} - results.localize(napi.Locales.from_accept_languages(params.get_accepted_languages())) - output = formatting.format_result(results, fmt, fmt_options) return params.build_response(output, num_results=len(results)) @@ -456,6 +452,8 @@ async def search_endpoint(api: napi.NominatimAPIAsync, params: ASGIAdaptor) -> A else: details['layers'] = params.get_layers() + details['locales'] = napi.Locales.from_accept_languages(params.get_accepted_languages()) + # unstructured query parameters query = params.get('q', None) # structured query parameters @@ -480,8 +478,6 @@ async def search_endpoint(api: napi.NominatimAPIAsync, params: ASGIAdaptor) -> A except UsageError as err: params.raise_error(str(err)) - results.localize(napi.Locales.from_accept_languages(params.get_accepted_languages())) - if details['dedupe'] and len(results) > 1: results = helpers.deduplicate_results(results, max_results) diff --git a/nominatim/clicmd/api.py b/nominatim/clicmd/api.py index e8f1d233..e8450e6b 100644 --- a/nominatim/clicmd/api.py +++ b/nominatim/clicmd/api.py @@ -109,7 +109,8 @@ class APISearch: 'countries': args.countrycodes, 'excluded': args.exclude_place_ids, 'viewbox': args.viewbox, - 'bounded_viewbox': args.bounded + 'bounded_viewbox': args.bounded, + 'locales': args.get_locales(api.config.DEFAULT_LANGUAGE) } if args.query: @@ -124,9 +125,6 @@ class APISearch: country=args.country, **params) - for result in results: - result.localize(args.get_locales(api.config.DEFAULT_LANGUAGE)) - if args.dedupe and len(results) > 1: results = deduplicate_results(results, args.limit) @@ -187,14 +185,14 @@ class APIReverse: layers=args.get_layers(napi.DataLayer.ADDRESS | napi.DataLayer.POI), address_details=True, # needed for display name geometry_output=args.get_geometry_output(), - geometry_simplification=args.polygon_threshold) + geometry_simplification=args.polygon_threshold, + locales=args.get_locales(api.config.DEFAULT_LANGUAGE)) if args.format == 'debug': print(loglib.get_and_disable()) return 0 if result: - result.localize(args.get_locales(api.config.DEFAULT_LANGUAGE)) output = api_output.format_result( napi.ReverseResults([result]), args.format, @@ -249,10 +247,8 @@ class APILookup: results = api.lookup(places, address_details=True, # needed for display name geometry_output=args.get_geometry_output(), - geometry_simplification=args.polygon_threshold or 0.0) - - for result in results: - result.localize(args.get_locales(api.config.DEFAULT_LANGUAGE)) + geometry_simplification=args.polygon_threshold or 0.0, + locales=args.get_locales(api.config.DEFAULT_LANGUAGE)) output = api_output.format_result( results, @@ -326,6 +322,7 @@ class APIDetails: api = napi.NominatimAPI(args.project_dir) + locales = args.get_locales(api.config.DEFAULT_LANGUAGE) result = api.details(place, address_details=args.addressdetails, linked_places=args.linkedplaces, @@ -333,13 +330,11 @@ class APIDetails: keywords=args.keywords, geometry_output=napi.GeometryFormat.GEOJSON if args.polygon_geojson - else napi.GeometryFormat.NONE) + else napi.GeometryFormat.NONE, + locales=locales) if result: - locales = args.get_locales(api.config.DEFAULT_LANGUAGE) - result.localize(locales) - output = api_output.format_result( result, 'json', diff --git a/test/bdd/api/search/structured.feature b/test/bdd/api/search/structured.feature index f8044dad..517c0edd 100644 --- a/test/bdd/api/search/structured.feature +++ b/test/bdd/api/search/structured.feature @@ -56,7 +56,7 @@ Feature: Structured search queries | Liechtenstein | And results contain | class | type | - | amenity | ^(pub)\|(bar) | + | amenity | ^(pub)\|(bar)\|(restaurant) | #176 Scenario: Structured search restricts rank diff --git a/test/python/api/test_api_details.py b/test/python/api/test_api_details.py index 101dfd13..05a7aa7f 100644 --- a/test/python/api/test_api_details.py +++ b/test/python/api/test_api_details.py @@ -150,17 +150,20 @@ def test_lookup_placex_with_address_details(apiobj): category=('highway', 'residential'), names={'name': 'Street'}, extratags={}, admin_level=15, fromarea=True, isaddress=True, - rank_address=26, distance=0.0), + rank_address=26, distance=0.0, + local_name='Street'), napi.AddressLine(place_id=1000, osm_object=('N', 3333), category=('place', 'suburb'), names={'name': 'Smallplace'}, extratags={}, admin_level=13, fromarea=False, isaddress=True, - rank_address=23, distance=0.0034), + rank_address=23, distance=0.0034, + local_name='Smallplace'), napi.AddressLine(place_id=1001, osm_object=('N', 3334), category=('place', 'city'), names={'name': 'Bigplace'}, extratags={}, admin_level=15, fromarea=True, isaddress=True, - rank_address=16, distance=0.0), + rank_address=16, distance=0.0, + local_name='Bigplace'), napi.AddressLine(place_id=None, osm_object=None, category=('place', 'country_code'), names={'ref': 'pl'}, extratags={}, @@ -341,22 +344,26 @@ def test_lookup_osmline_with_address_details(apiobj): category=('place', 'house_number'), names={'ref': '2'}, extratags={}, admin_level=None, fromarea=True, isaddress=True, - rank_address=28, distance=0.0), + rank_address=28, distance=0.0, + local_name='2'), napi.AddressLine(place_id=332, osm_object=('W', 4), category=('highway', 'residential'), names={'name': 'Street'}, extratags={}, admin_level=15, fromarea=True, isaddress=True, - rank_address=26, distance=0.0), + rank_address=26, distance=0.0, + local_name='Street'), napi.AddressLine(place_id=1000, osm_object=('N', 3333), category=('place', 'suburb'), names={'name': 'Smallplace'}, extratags={}, admin_level=13, fromarea=False, isaddress=True, - rank_address=23, distance=0.0034), + rank_address=23, distance=0.0034, + local_name='Smallplace'), napi.AddressLine(place_id=1001, osm_object=('N', 3334), category=('place', 'city'), names={'name': 'Bigplace'}, extratags={}, admin_level=15, fromarea=True, isaddress=True, - rank_address=16, distance=0.0), + rank_address=16, distance=0.0, + local_name='Bigplace'), napi.AddressLine(place_id=None, osm_object=None, category=('place', 'country_code'), names={'ref': 'pl'}, extratags={}, @@ -441,22 +448,26 @@ def test_lookup_tiger_with_address_details(apiobj): category=('place', 'house_number'), names={'ref': '2'}, extratags={}, admin_level=None, fromarea=True, isaddress=True, - rank_address=28, distance=0.0), + rank_address=28, distance=0.0, + local_name='2'), napi.AddressLine(place_id=332, osm_object=('W', 4), category=('highway', 'residential'), names={'name': 'Street'}, extratags={}, admin_level=15, fromarea=True, isaddress=True, - rank_address=26, distance=0.0), + rank_address=26, distance=0.0, + local_name='Street'), napi.AddressLine(place_id=1000, osm_object=('N', 3333), category=('place', 'suburb'), names={'name': 'Smallplace'}, extratags={}, admin_level=13, fromarea=False, isaddress=True, - rank_address=23, distance=0.0034), + rank_address=23, distance=0.0034, + local_name='Smallplace'), napi.AddressLine(place_id=1001, osm_object=('N', 3334), category=('place', 'city'), names={'name': 'Bigplace'}, extratags={}, admin_level=15, fromarea=True, isaddress=True, - rank_address=16, distance=0.0), + rank_address=16, distance=0.0, + local_name='Bigplace'), napi.AddressLine(place_id=None, osm_object=None, category=('place', 'country_code'), names={'ref': 'us'}, extratags={}, @@ -536,17 +547,20 @@ def test_lookup_postcode_with_address_details(apiobj): category=('place', 'suburb'), names={'name': 'Smallplace'}, extratags={}, admin_level=13, fromarea=True, isaddress=True, - rank_address=23, distance=0.0), + rank_address=23, distance=0.0, + local_name='Smallplace'), napi.AddressLine(place_id=1001, osm_object=('N', 3334), category=('place', 'city'), names={'name': 'Bigplace'}, extratags={}, admin_level=15, fromarea=True, isaddress=True, - rank_address=16, distance=0.0), + rank_address=16, distance=0.0, + local_name='Bigplace'), napi.AddressLine(place_id=None, osm_object=None, category=('place', 'postcode'), names={'ref': '34 425'}, extratags={}, admin_level=None, fromarea=False, isaddress=True, - rank_address=5, distance=0.0), + rank_address=5, distance=0.0, + local_name='34 425'), napi.AddressLine(place_id=None, osm_object=None, category=('place', 'country_code'), names={'ref': 'gb'}, extratags={}, diff --git a/test/python/cli/test_cmd_api.py b/test/python/cli/test_cmd_api.py index 05e3c4f0..ca160a35 100644 --- a/test/python/cli/test_cmd_api.py +++ b/test/python/cli/test_cmd_api.py @@ -67,7 +67,9 @@ class TestCliReverseCall: result = napi.ReverseResult(napi.SourceTable.PLACEX, ('place', 'thing'), napi.Point(1.0, -3.0), names={'name':'Name', 'name:fr': 'Nom'}, - extratags={'extra':'Extra'}) + extratags={'extra':'Extra'}, + locale_name='Name', + display_name='Name') monkeypatch.setattr(napi.NominatimAPI, 'reverse', lambda *args, **kwargs: result) @@ -109,16 +111,6 @@ class TestCliReverseCall: assert out['type'] == 'FeatureCollection' - def test_reverse_language(self, cli_call, tmp_path, capsys): - result = cli_call('reverse', '--project-dir', str(tmp_path), - '--lat', '34', '--lon', '34', '--lang', 'fr') - - assert result == 0 - - out = json.loads(capsys.readouterr().out) - assert out['name'] == 'Nom' - - class TestCliLookupCall: @pytest.fixture(autouse=True) @@ -126,7 +118,9 @@ class TestCliLookupCall: result = napi.SearchResult(napi.SourceTable.PLACEX, ('place', 'thing'), napi.Point(1.0, -3.0), names={'name':'Name', 'name:fr': 'Nom'}, - extratags={'extra':'Extra'}) + extratags={'extra':'Extra'}, + locale_name='Name', + display_name='Name') monkeypatch.setattr(napi.NominatimAPI, 'lookup', lambda *args, **kwargs: napi.SearchResults([result])) @@ -150,9 +144,11 @@ class TestCliLookupCall: ]) def test_search(cli_call, tmp_path, capsys, monkeypatch, endpoint, params): result = napi.SearchResult(napi.SourceTable.PLACEX, ('place', 'thing'), - napi.Point(1.0, -3.0), - names={'name':'Name', 'name:fr': 'Nom'}, - extratags={'extra':'Extra'}) + napi.Point(1.0, -3.0), + names={'name':'Name', 'name:fr': 'Nom'}, + extratags={'extra':'Extra'}, + locale_name='Name', + display_name='Name') monkeypatch.setattr(napi.NominatimAPI, endpoint, lambda *args, **kwargs: napi.SearchResults([result]))