Nominatim/nominatim/api/v1/helpers.py

# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2023 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Helper functions for parsing parameters and outputting data
specifically for the v1 version of the API.
"""
from typing import Tuple, Optional, Any, Dict, Iterable
from itertools import chain
import re

from nominatim.api.results import SearchResult, SearchResults, SourceTable
from nominatim.api.types import SearchDetails, GeometryFormat

# Maximum rank for each zoom level; the list index is the zoom level (0-18).
REVERSE_MAX_RANKS = [
    2, 2, 2,    # zoom 0-2    Continent/Sea
    4, 4,       # zoom 3-4    Country
    8,          # zoom 5      State
    10, 10,     # zoom 6-7    Region
    12, 12,     # zoom 8-9    County
    16, 17,     # zoom 10-11  City
    18,         # zoom 12     Town
    19,         # zoom 13     Village/Suburb
    22,         # zoom 14     Hamlet/Neighbourhood
    25,         # zoom 15     Localities
    26,         # zoom 16     Major Streets
    27,         # zoom 17     Minor Streets
    30,         # zoom 18     Building
]


def zoom_to_rank(zoom: int) -> int:
    """ Translate a zoom level into a rank as defined by the v1 API spec.

        Zoom levels outside of the lookup table are clamped to its
        first or last entry respectively.
    """
    highest_zoom = len(REVERSE_MAX_RANKS) - 1
    clamped_zoom = max(0, min(highest_zoom, zoom))
    return REVERSE_MAX_RANKS[clamped_zoom]


# Rank range (minimum rank, maximum rank) for each feature type
# known to this module.
FEATURE_TYPE_TO_RANK: Dict[Optional[str], Tuple[int, int]] = {
    'country': (4, 4),
    'state': (8, 8),
    'city': (14, 16),
    'settlement': (8, 20)
}


def feature_type_to_rank(feature_type: Optional[str]) -> Tuple[int, int]:
    """ Convert a feature type parameter to a tuple of
        minimum rank and maximum rank.

        Unknown feature types and None yield the full rank
        range (0, 30).
    """
    return FEATURE_TYPE_TO_RANK.get(feature_type, (0, 30))


#pylint: disable=too-many-arguments,too-many-branches
def extend_query_parts(queryparts: Dict[str, Any], details: Dict[str, Any],
                       feature_type: Optional[str],
                       namedetails: bool, extratags: bool,
                       excluded: Iterable[str]) -> None:
    """ Add parameters from details dictionary to the query parts
        dictionary which is suitable as URL parameter dictionary.

        `queryparts` is modified in place. Most parameters are only
        written when they differ from the value checked for below;
        'exclude_place_ids' is always written. `excluded` is a list
        of already formatted IDs which is put in front of the positive
        IDs found in the details. A missing 'dedupe' entry in `details`
        is treated as deduplication being enabled.
    """
    parsed = SearchDetails.from_kwargs(details)
    if parsed.geometry_output != GeometryFormat.NONE:
        # One URL flag per requested output geometry format.
        for geometry_format, param in ((GeometryFormat.GEOJSON, 'polygon_geojson'),
                                       (GeometryFormat.KML, 'polygon_kml'),
                                       (GeometryFormat.SVG, 'polygon_svg'),
                                       (GeometryFormat.TEXT, 'polygon_text')):
            if geometry_format in parsed.geometry_output:
                queryparts[param] = '1'
    if parsed.address_details:
        queryparts['addressdetails'] = '1'
    if namedetails:
        queryparts['namedetails'] = '1'
    if extratags:
        queryparts['extratags'] = '1'
    if parsed.geometry_simplification > 0.0:
        queryparts['polygon_threshold'] = f"{parsed.geometry_simplification:.6g}"
    # 10 is presumably the default limit of the API, so it is left out.
    if parsed.max_results != 10:
        queryparts['limit'] = str(parsed.max_results)
    if parsed.countries:
        queryparts['countrycodes'] = ','.join(parsed.countries)
    # Only positive IDs from the parsed details are forwarded.
    queryparts['exclude_place_ids'] = \
        ','.join(chain(excluded, map(str, (e for e in parsed.excluded if e > 0))))
    if parsed.viewbox:
        queryparts['viewbox'] = ','.join(f"{c:.7g}" for c in parsed.viewbox.coords)
    if parsed.bounded_viewbox:
        queryparts['bounded'] = '1'
    # Do not fail when the caller has not supplied the 'dedupe' setting.
    if not details.get('dedupe', True):
        queryparts['dedupe'] = '0'
    if feature_type in FEATURE_TYPE_TO_RANK:
        queryparts['featureType'] = feature_type


def deduplicate_results(results: SearchResults, max_results: int) -> SearchResults:
    """ Filter out results that appear to be duplicates of earlier ones.

        Results from the placex table count as duplicates when an earlier
        placex result had the same OSM object or the same combination of
        OSM type, category, display name and address rank. Postcode
        results are dropped when the result list also contains the
        matching postcode boundary relation. All other results are kept.

        Processing stops as soon as `max_results` results were collected.
    """
    seen_osm_objects = set()
    seen_classifications = set()
    unique = SearchResults()

    for candidate in results:
        if candidate.source_table == SourceTable.POSTCODE:
            assert candidate.names and 'ref' in candidate.names
            postcode = candidate.names['ref']
            # The boundary relation of the same postcode takes precedence.
            if any(_is_postcode_relation_for(other, postcode) for other in results):
                continue

        if candidate.source_table == SourceTable.PLACEX:
            osm_object = candidate.osm_object
            osm_type = osm_object[0] if osm_object else None
            classification = (osm_type, candidate.category,
                              candidate.display_name, candidate.rank_address)
            is_new = osm_object not in seen_osm_objects \
                     and classification not in seen_classifications
            if is_new:
                unique.append(candidate)
            # Remember the result even when it was skipped.
            seen_osm_objects.add(osm_object)
            seen_classifications.add(classification)
        else:
            unique.append(candidate)

        if len(unique) >= max_results:
            break

    return unique


def _is_postcode_relation_for(result: SearchResult, postcode: str) -> bool:
    """ Check if the result is a postal code boundary relation from the
        placex table whose 'ref' name equals the given postcode.
    """
    if result.source_table != SourceTable.PLACEX:
        return False
    # Only OSM relations qualify.
    if result.osm_object is None or result.osm_object[0] != 'R':
        return False
    if result.category != ('boundary', 'postal_code'):
        return False
    if result.names is None:
        return False
    return result.names.get('ref') == postcode


def _deg(axis: str) -> str:
    """ Pattern for decimal degrees on the given axis ('lat' or 'lon'),
        optionally followed by a degree sign.
    """
    return f"(?P<{axis}_deg>\\d+\\.\\d+)°?"


def _deg_min(axis: str) -> str:
    """ Pattern for integer degrees followed by (decimal) minutes
        on the given axis.
    """
    return f"(?P<{axis}_deg>\\d+)[°\\s]+(?P<{axis}_min>[\\d.]+)[′']*"


def _deg_min_sec(axis: str) -> str:
    """ Pattern for integer degrees and minutes followed by (decimal)
        seconds on the given axis.
    """
    return f"(?P<{axis}_deg>\\d+)[°\\s]+(?P<{axis}_min>\\d+)[′'\\s]+(?P<{axis}_sec>[\\d.]+)[\"″]*"


# Supported coordinate notations, tried in this order. Each pattern also
# captures the free text in front of ('pre') and behind ('post') the
# coordinate.
COORD_REGEX = [re.compile(r'(?:(?P<pre>.*?)\s+)??' + r + r'(?:\s+(?P<post>.*))?') for r in (
    # decimal degrees, hemisphere letter in front / behind
    r"(?P<ns>[NS])\s*" + _deg('lat') + r"[\s,]+" + r"(?P<ew>[EW])\s*" + _deg('lon'),
    _deg('lat') + r"\s*(?P<ns>[NS])[\s,]+" + _deg('lon') + r"\s*(?P<ew>[EW])",
    # degrees and minutes, hemisphere letter in front / behind
    r"(?P<ns>[NS])\s*" + _deg_min('lat') + r"[\s,]+" + r"(?P<ew>[EW])\s*" + _deg_min('lon'),
    _deg_min('lat') + r"\s*(?P<ns>[NS])[\s,]+" + _deg_min('lon') + r"\s*(?P<ew>[EW])",
    # degrees, minutes and seconds, hemisphere letter in front / behind
    r"(?P<ns>[NS])\s*" + _deg_min_sec('lat') + r"[\s,]+" + r"(?P<ew>[EW])\s*" + _deg_min_sec('lon'),
    _deg_min_sec('lat') + r"\s*(?P<ns>[NS])[\s,]+" + _deg_min_sec('lon') + r"\s*(?P<ew>[EW])",
    # signed decimal degrees, optionally in square brackets
    r"\[?(?P<lat_deg>[+-]?\d+\.\d+)[\s,]+(?P<lon_deg>[+-]?\d+\.\d+)\]?"
)]


def _to_decimal_degrees(groups: Dict[str, Any], axis: str) -> float:
    """ Combine the degree, minute and second groups of the given axis
        into decimal degrees. Raises a ValueError when one of the
        groups is not a valid number.
    """
    return float(groups[f'{axis}_deg']) \
           + float(groups.get(f'{axis}_min', 0.0)) / 60.0 \
           + float(groups.get(f'{axis}_sec', 0.0)) / 3600.0


def extract_coords_from_query(query: str) -> Tuple[str, Optional[float], Optional[float]]:
    """ Look for something that is formatted like a coordinate at the
        beginning or end of the query. If found, extract the coordinate and
        return the remaining query (or the empty string if the query
        consisted of nothing but a coordinate).

        The result is a tuple of remaining query, x (longitude) and
        y (latitude). When no coordinate is found, the unchanged query
        is returned together with None for both coordinate parts.

        Only the first match will be returned.
    """
    for regex in COORD_REGEX:
        match = regex.fullmatch(query)
        if match is None:
            continue
        groups = match.groupdict()
        # Coordinates in the middle of the query are not accepted.
        if groups['pre'] and groups['post']:
            continue
        try:
            x = _to_decimal_degrees(groups, 'lon')
            y = _to_decimal_degrees(groups, 'lat')
        except ValueError:
            # The minute and second patterns are lenient enough to
            # accept things like '1.2.3'. That is not a coordinate.
            continue
        if groups.get('ew') == 'W':
            x = -x
        if groups.get('ns') == 'S':
            y = -y
        return groups['pre'] or groups['post'] or '', x, y

    return query, None, None


# DOTALL makes sure that 'pre' and 'post' cover the whole query even
# when it contains line breaks.
CATEGORY_REGEX = re.compile(r'(?P<pre>.*?)\[(?P<cls>[a-zA-Z_]+)=(?P<typ>[a-zA-Z_]+)\](?P<post>.*)',
                            re.DOTALL)


def extract_category_from_query(query: str) -> Tuple[str, Optional[str], Optional[str]]:
    """ Extract a hidden category specification of the form '[key=value]' from
        the query. If found, extract key and value and
        return the remaining query (or the empty string if the query
        consisted of nothing but a category).

        The remaining query consists of the text in front of and behind
        the category, joined by a single space. When no category is
        found, the unchanged query is returned together with None for
        key and value.

        Only the first match will be returned.
    """
    match = CATEGORY_REGEX.search(query)
    if match is None:
        return query, None, None

    remainder = f"{match.group('pre').strip()} {match.group('post').strip()}".strip()
    return remainder, match.group('cls'), match.group('typ')