split code into submodules

This commit is contained in:
Sarah Hoffmann
2024-05-16 11:55:17 +02:00
parent 0fb4fe8e4d
commit 6e89310a92
137 changed files with 757 additions and 716 deletions

View File

@@ -0,0 +1,38 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
The public interface of the Nominatim library.
Classes and functions defined in this file are considered stable. Always
import from this file, not from the source files directly.
"""
# See also https://github.com/PyCQA/pylint/issues/6006
# pylint: disable=useless-import-alias
from .core import (NominatimAPI as NominatimAPI,
NominatimAPIAsync as NominatimAPIAsync)
from .connection import (SearchConnection as SearchConnection)
from .status import (StatusResult as StatusResult)
from .types import (PlaceID as PlaceID,
OsmID as OsmID,
PlaceRef as PlaceRef,
Point as Point,
Bbox as Bbox,
GeometryFormat as GeometryFormat,
DataLayer as DataLayer)
from .results import (SourceTable as SourceTable,
AddressLine as AddressLine,
AddressLines as AddressLines,
WordInfo as WordInfo,
WordInfos as WordInfos,
DetailedResult as DetailedResult,
ReverseResult as ReverseResult,
ReverseResults as ReverseResults,
SearchResult as SearchResult,
SearchResults as SearchResults)
from .localization import (Locales as Locales)

View File

@@ -0,0 +1,149 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Extended SQLAlchemy connection class that also includes access to the schema.
"""
from typing import cast, Any, Mapping, Sequence, Union, Dict, Optional, Set, \
Awaitable, Callable, TypeVar
import asyncio
import sqlalchemy as sa
from sqlalchemy.ext.asyncio import AsyncConnection
from nominatim_core.typing import SaFromClause
from nominatim_core.db.sqlalchemy_schema import SearchTables
from nominatim_core.db.sqlalchemy_types import Geometry
from .logging import log
T = TypeVar('T')
class SearchConnection:
""" An extended SQLAlchemy connection class, that also contains
the table definitions. The underlying asynchronous SQLAlchemy
connection can be accessed with the 'connection' property.
The 't' property is the collection of Nominatim tables.
"""
def __init__(self, conn: AsyncConnection,
tables: SearchTables,
properties: Dict[str, Any]) -> None:
self.connection = conn
self.t = tables # pylint: disable=invalid-name
self._property_cache = properties
self._classtables: Optional[Set[str]] = None
self.query_timeout: Optional[int] = None
def set_query_timeout(self, timeout: Optional[int]) -> None:
""" Set the timeout after which a query over this connection
is cancelled.
"""
self.query_timeout = timeout
async def scalar(self, sql: sa.sql.base.Executable,
params: Union[Mapping[str, Any], None] = None
) -> Any:
""" Execute a 'scalar()' query on the connection.
"""
log().sql(self.connection, sql, params)
return await asyncio.wait_for(self.connection.scalar(sql, params), self.query_timeout)
async def execute(self, sql: 'sa.Executable',
params: Union[Mapping[str, Any], Sequence[Mapping[str, Any]], None] = None
) -> 'sa.Result[Any]':
""" Execute a 'execute()' query on the connection.
"""
log().sql(self.connection, sql, params)
return await asyncio.wait_for(self.connection.execute(sql, params), self.query_timeout)
async def get_property(self, name: str, cached: bool = True) -> str:
""" Get a property from Nominatim's property table.
Property values are normally cached so that they are only
retrieved from the database when they are queried for the
first time with this function. Set 'cached' to False to force
reading the property from the database.
Raises a ValueError if the property does not exist.
"""
lookup_name = f'DBPROP:{name}'
if cached and lookup_name in self._property_cache:
return cast(str, self._property_cache[lookup_name])
sql = sa.select(self.t.properties.c.value)\
.where(self.t.properties.c.property == name)
value = await self.connection.scalar(sql)
if value is None:
raise ValueError(f"Property '{name}' not found in database.")
self._property_cache[lookup_name] = cast(str, value)
return cast(str, value)
async def get_db_property(self, name: str) -> Any:
""" Get a setting from the database. At the moment, only
'server_version', the version of the database software, can
be retrieved with this function.
Raises a ValueError if the property does not exist.
"""
if name != 'server_version':
raise ValueError(f"DB setting '{name}' not found in database.")
return self._property_cache['DB:server_version']
async def get_cached_value(self, group: str, name: str,
factory: Callable[[], Awaitable[T]]) -> T:
""" Access the cache for this Nominatim instance.
Each cache value needs to belong to a group and have a name.
This function is for internal API use only.
`factory` is an async callback function that produces
the value if it is not already cached.
Returns the cached value or the result of factory (also caching
the result).
"""
full_name = f'{group}:{name}'
if full_name in self._property_cache:
return cast(T, self._property_cache[full_name])
value = await factory()
self._property_cache[full_name] = value
return value
async def get_class_table(self, cls: str, typ: str) -> Optional[SaFromClause]:
""" Lookup up if there is a classtype table for the given category
and return a SQLAlchemy table for it, if it exists.
"""
if self._classtables is None:
res = await self.execute(sa.text("""SELECT tablename FROM pg_tables
WHERE tablename LIKE 'place_classtype_%'
"""))
self._classtables = {r[0] for r in res}
tablename = f"place_classtype_{cls}_{typ}"
if tablename not in self._classtables:
return None
if tablename in self.t.meta.tables:
return self.t.meta.tables[tablename]
return sa.Table(tablename, self.t.meta,
sa.Column('place_id', sa.BigInteger),
sa.Column('centroid', Geometry))

974
src/nominatim_api/core.py Normal file
View File

@@ -0,0 +1,974 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Implementation of classes for API access via libraries.
"""
from typing import Mapping, Optional, Any, AsyncIterator, Dict, Sequence, List, Tuple
import asyncio
import sys
import contextlib
from pathlib import Path
import sqlalchemy as sa
import sqlalchemy.ext.asyncio as sa_asyncio
from nominatim_core.errors import UsageError
from nominatim_core.db.sqlalchemy_schema import SearchTables
from nominatim_core.db.async_core_library import PGCORE_LIB, PGCORE_ERROR
from nominatim_core.config import Configuration
from .sql import sqlite_functions, sqlalchemy_functions #pylint: disable=unused-import
from .connection import SearchConnection
from .status import get_status, StatusResult
from .lookup import get_detailed_place, get_simple_place
from .reverse import ReverseGeocoder
from .search import ForwardGeocoder, Phrase, PhraseType, make_query_analyzer
from . import types as ntyp
from .results import DetailedResult, ReverseResult, SearchResults
class NominatimAPIAsync: #pylint: disable=too-many-instance-attributes
""" The main frontend to the Nominatim database implements the
functions for lookup, forward and reverse geocoding using
asynchronous functions.
This class shares most of the functions with its synchronous
version. There are some additional functions or parameters,
which are documented below.
"""
def __init__(self, project_dir: Path,
environ: Optional[Mapping[str, str]] = None,
loop: Optional[asyncio.AbstractEventLoop] = None) -> None:
""" Initiate a new frontend object with synchronous API functions.
Parameters:
project_dir: Path to the
[project directory](../admin/Import.md#creating-the-project-directory)
of the local Nominatim installation.
environ: Mapping of [configuration parameters](../customize/Settings.md).
When set, replaces any configuration via environment variables.
Settings in this mapping also have precedence over any
parameters found in the `.env` file of the project directory.
loop: The asyncio event loop that will be used when calling
functions. Only needed, when a custom event loop is used
and the Python version is 3.9 or earlier.
"""
self.config = Configuration(project_dir, environ)
self.query_timeout = self.config.get_int('QUERY_TIMEOUT') \
if self.config.QUERY_TIMEOUT else None
self.reverse_restrict_to_country_area = self.config.get_bool('SEARCH_WITHIN_COUNTRIES')
self.server_version = 0
if sys.version_info >= (3, 10):
self._engine_lock = asyncio.Lock()
else:
self._engine_lock = asyncio.Lock(loop=loop) # pylint: disable=unexpected-keyword-arg
self._engine: Optional[sa_asyncio.AsyncEngine] = None
self._tables: Optional[SearchTables] = None
self._property_cache: Dict[str, Any] = {'DB:server_version': 0}
async def setup_database(self) -> None:
""" Set up the SQL engine and connections.
This function will be implicitly called when the database is
accessed for the first time. You may also call it explicitly to
avoid that the first call is delayed by the setup.
"""
async with self._engine_lock:
if self._engine:
return
extra_args: Dict[str, Any] = {'future': True,
'echo': self.config.get_bool('DEBUG_SQL')}
if self.config.get_int('API_POOL_SIZE') == 0:
extra_args['poolclass'] = sa.pool.NullPool
else:
extra_args['poolclass'] = sa.pool.AsyncAdaptedQueuePool
extra_args['max_overflow'] = 0
extra_args['pool_size'] = self.config.get_int('API_POOL_SIZE')
is_sqlite = self.config.DATABASE_DSN.startswith('sqlite:')
if is_sqlite:
params = dict((p.split('=', 1)
for p in self.config.DATABASE_DSN[7:].split(';')))
dburl = sa.engine.URL.create('sqlite+aiosqlite',
database=params.get('dbname'))
if not ('NOMINATIM_DATABASE_RW' in self.config.environ
and self.config.get_bool('DATABASE_RW')) \
and not Path(params.get('dbname', '')).is_file():
raise UsageError(f"SQlite database '{params.get('dbname')}' does not exist.")
else:
dsn = self.config.get_database_params()
query = {k: v for k, v in dsn.items()
if k not in ('user', 'password', 'dbname', 'host', 'port')}
dburl = sa.engine.URL.create(
f'postgresql+{PGCORE_LIB}',
database=dsn.get('dbname'),
username=dsn.get('user'),
password=dsn.get('password'),
host=dsn.get('host'),
port=int(dsn['port']) if 'port' in dsn else None,
query=query)
engine = sa_asyncio.create_async_engine(dburl, **extra_args)
if is_sqlite:
server_version = 0
@sa.event.listens_for(engine.sync_engine, "connect")
def _on_sqlite_connect(dbapi_con: Any, _: Any) -> None:
dbapi_con.run_async(lambda conn: conn.enable_load_extension(True))
sqlite_functions.install_custom_functions(dbapi_con)
cursor = dbapi_con.cursor()
cursor.execute("SELECT load_extension('mod_spatialite')")
cursor.execute('SELECT SetDecimalPrecision(7)')
dbapi_con.run_async(lambda conn: conn.enable_load_extension(False))
else:
try:
async with engine.begin() as conn:
result = await conn.scalar(sa.text('SHOW server_version_num'))
server_version = int(result)
if server_version >= 110000:
await conn.execute(sa.text("SET jit_above_cost TO '-1'"))
await conn.execute(sa.text(
"SET max_parallel_workers_per_gather TO '0'"))
except (PGCORE_ERROR, sa.exc.OperationalError):
server_version = 0
if server_version >= 110000:
@sa.event.listens_for(engine.sync_engine, "connect")
def _on_connect(dbapi_con: Any, _: Any) -> None:
cursor = dbapi_con.cursor()
cursor.execute("SET jit_above_cost TO '-1'")
cursor.execute("SET max_parallel_workers_per_gather TO '0'")
self._property_cache['DB:server_version'] = server_version
self._tables = SearchTables(sa.MetaData()) # pylint: disable=no-member
self._engine = engine
async def close(self) -> None:
""" Close all active connections to the database. The NominatimAPIAsync
object remains usable after closing. If a new API functions is
called, new connections are created.
"""
if self._engine is not None:
await self._engine.dispose()
@contextlib.asynccontextmanager
async def begin(self) -> AsyncIterator[SearchConnection]:
""" Create a new connection with automatic transaction handling.
This function may be used to get low-level access to the database.
Refer to the documentation of SQLAlchemy for details how to use
the connection object.
"""
if self._engine is None:
await self.setup_database()
assert self._engine is not None
assert self._tables is not None
async with self._engine.begin() as conn:
yield SearchConnection(conn, self._tables, self._property_cache)
async def status(self) -> StatusResult:
""" Return the status of the database.
"""
try:
async with self.begin() as conn:
conn.set_query_timeout(self.query_timeout)
status = await get_status(conn)
except (PGCORE_ERROR, sa.exc.OperationalError):
return StatusResult(700, 'Database connection failed')
return status
async def details(self, place: ntyp.PlaceRef, **params: Any) -> Optional[DetailedResult]:
""" Get detailed information about a place in the database.
Returns None if there is no entry under the given ID.
"""
details = ntyp.LookupDetails.from_kwargs(params)
async with self.begin() as conn:
conn.set_query_timeout(self.query_timeout)
if details.keywords:
await make_query_analyzer(conn)
return await get_detailed_place(conn, place, details)
async def lookup(self, places: Sequence[ntyp.PlaceRef], **params: Any) -> SearchResults:
""" Get simple information about a list of places.
Returns a list of place information for all IDs that were found.
"""
details = ntyp.LookupDetails.from_kwargs(params)
async with self.begin() as conn:
conn.set_query_timeout(self.query_timeout)
if details.keywords:
await make_query_analyzer(conn)
return SearchResults(filter(None,
[await get_simple_place(conn, p, details) for p in places]))
async def reverse(self, coord: ntyp.AnyPoint, **params: Any) -> Optional[ReverseResult]:
""" Find a place by its coordinates. Also known as reverse geocoding.
Returns the closest result that can be found or None if
no place matches the given criteria.
"""
# The following negation handles NaN correctly. Don't change.
if not abs(coord[0]) <= 180 or not abs(coord[1]) <= 90:
# There are no results to be expected outside valid coordinates.
return None
details = ntyp.ReverseDetails.from_kwargs(params)
async with self.begin() as conn:
conn.set_query_timeout(self.query_timeout)
if details.keywords:
await make_query_analyzer(conn)
geocoder = ReverseGeocoder(conn, details,
self.reverse_restrict_to_country_area)
return await geocoder.lookup(coord)
async def search(self, query: str, **params: Any) -> SearchResults:
""" Find a place by free-text search. Also known as forward geocoding.
"""
query = query.strip()
if not query:
raise UsageError('Nothing to search for.')
async with self.begin() as conn:
conn.set_query_timeout(self.query_timeout)
geocoder = ForwardGeocoder(conn, ntyp.SearchDetails.from_kwargs(params),
self.config.get_int('REQUEST_TIMEOUT') \
if self.config.REQUEST_TIMEOUT else None)
phrases = [Phrase(PhraseType.NONE, p.strip()) for p in query.split(',')]
return await geocoder.lookup(phrases)
# pylint: disable=too-many-arguments,too-many-branches
async def search_address(self, amenity: Optional[str] = None,
street: Optional[str] = None,
city: Optional[str] = None,
county: Optional[str] = None,
state: Optional[str] = None,
country: Optional[str] = None,
postalcode: Optional[str] = None,
**params: Any) -> SearchResults:
""" Find an address using structured search.
"""
async with self.begin() as conn:
conn.set_query_timeout(self.query_timeout)
details = ntyp.SearchDetails.from_kwargs(params)
phrases: List[Phrase] = []
if amenity:
phrases.append(Phrase(PhraseType.AMENITY, amenity))
if street:
phrases.append(Phrase(PhraseType.STREET, street))
if city:
phrases.append(Phrase(PhraseType.CITY, city))
if county:
phrases.append(Phrase(PhraseType.COUNTY, county))
if state:
phrases.append(Phrase(PhraseType.STATE, state))
if postalcode:
phrases.append(Phrase(PhraseType.POSTCODE, postalcode))
if country:
phrases.append(Phrase(PhraseType.COUNTRY, country))
if not phrases:
raise UsageError('Nothing to search for.')
if amenity or street:
details.restrict_min_max_rank(26, 30)
elif city:
details.restrict_min_max_rank(13, 25)
elif county:
details.restrict_min_max_rank(10, 12)
elif state:
details.restrict_min_max_rank(5, 9)
elif postalcode:
details.restrict_min_max_rank(5, 11)
else:
details.restrict_min_max_rank(4, 4)
if 'layers' not in params:
details.layers = ntyp.DataLayer.ADDRESS
if amenity:
details.layers |= ntyp.DataLayer.POI
geocoder = ForwardGeocoder(conn, details,
self.config.get_int('REQUEST_TIMEOUT') \
if self.config.REQUEST_TIMEOUT else None)
return await geocoder.lookup(phrases)
async def search_category(self, categories: List[Tuple[str, str]],
near_query: Optional[str] = None,
**params: Any) -> SearchResults:
""" Find an object of a certain category near another place.
The near place may either be given as an unstructured search
query in itself or as coordinates.
"""
if not categories:
return SearchResults()
details = ntyp.SearchDetails.from_kwargs(params)
async with self.begin() as conn:
conn.set_query_timeout(self.query_timeout)
if near_query:
phrases = [Phrase(PhraseType.NONE, p) for p in near_query.split(',')]
else:
phrases = []
if details.keywords:
await make_query_analyzer(conn)
geocoder = ForwardGeocoder(conn, details,
self.config.get_int('REQUEST_TIMEOUT') \
if self.config.REQUEST_TIMEOUT else None)
return await geocoder.lookup_pois(categories, phrases)
class NominatimAPI:
""" This class provides a thin synchronous wrapper around the asynchronous
Nominatim functions. It creates its own event loop and runs each
synchronous function call to completion using that loop.
"""
def __init__(self, project_dir: Path,
environ: Optional[Mapping[str, str]] = None) -> None:
""" Initiate a new frontend object with synchronous API functions.
Parameters:
project_dir: Path to the
[project directory](../admin/Import.md#creating-the-project-directory)
of the local Nominatim installation.
environ: Mapping of [configuration parameters](../customize/Settings.md).
When set, replaces any configuration via environment variables.
Settings in this mapping also have precedence over any
parameters found in the `.env` file of the project directory.
"""
self._loop = asyncio.new_event_loop()
self._async_api = NominatimAPIAsync(project_dir, environ, loop=self._loop)
def close(self) -> None:
""" Close all active connections to the database.
This function also closes the asynchronous worker loop making
the NominatimAPI object unusable.
"""
self._loop.run_until_complete(self._async_api.close())
self._loop.close()
@property
def config(self) -> Configuration:
""" Provide read-only access to the [configuration](#Configuration)
used by the API.
"""
return self._async_api.config
def status(self) -> StatusResult:
""" Return the status of the database as a dataclass object
with the fields described below.
Returns:
status(int): A status code as described on the status page.
message(str): Either 'OK' or a human-readable message of the
problem encountered.
software_version(tuple): A tuple with the version of the
Nominatim library consisting of (major, minor, patch, db-patch)
version.
database_version(tuple): A tuple with the version of the library
which was used for the import or last migration.
Also consists of (major, minor, patch, db-patch).
data_updated(datetime): Timestamp with the age of the data.
"""
return self._loop.run_until_complete(self._async_api.status())
def details(self, place: ntyp.PlaceRef, **params: Any) -> Optional[DetailedResult]:
""" Get detailed information about a place in the database.
The result is a dataclass object with the fields described below
or `None` if the place could not be found in the database.
Parameters:
place: Description of the place to look up. See
[Place identification](Input-Parameter-Types.md#place-identification)
for the various ways to reference a place.
Other parameters:
geometry_output (enum): Add the full geometry of the place to the result.
Multiple formats may be selected. Note that geometries can become
quite large. (Default: none)
geometry_simplification (float): Simplification factor to use on
the geometries before returning them. The factor expresses
the tolerance in degrees from which the geometry may differ.
Topology is preserved. (Default: 0.0)
address_details (bool): Add detailed information about the places
that make up the address of the requested object. (Default: False)
linked_places (bool): Add detailed information about the places
that link to the result. (Default: False)
parented_places (bool): Add detailed information about all places
for which the requested object is a parent, i.e. all places for
which the object provides the address details.
Only POI places can have parents. (Default: False)
keywords (bool): Add detailed information about the search terms
used for this place.
Returns:
source_table (enum): Data source of the place. See below for possible values.
category (tuple): A tuple of two strings with the primary OSM tag
and value.
centroid (Point): Point position of the place.
place_id (Optional[int]): Internal ID of the place. This ID may differ
for the same place between different installations.
parent_place_id (Optional(int]): Internal ID of the parent of this
place. Only meaning full for POI-like objects (places with a
rank_address of 30).
linked_place_id (Optional[int]): Internal ID of the place this object
links to. When this ID is set then there is no guarantee that
the rest of the result information is complete.
admin_level (int): Value of the `admin_level` OSM tag. Only meaningful
for administrative boundary objects.
indexed_date (datetime): Timestamp when the place was last updated.
osm_object (Optional[tuple]): OSM type and ID of the place, if available.
names (Optional[dict]): Dictionary of names of the place. Keys are
usually the corresponding OSM tag keys.
address (Optional[dict]): Dictionary of address parts directly
attributed to the place. Keys are usually the corresponding
OSM tag keys with the `addr:` prefix removed.
extratags (Optional[dict]): Dictionary of additional attributes for
the place. Usually OSM tag keys and values.
housenumber (Optional[str]): House number of the place, normalised
for lookup. To get the house number in its original spelling,
use `address['housenumber']`.
postcode (Optional[str]): Computed postcode for the place. To get
directly attributed postcodes, use `address['postcode']` instead.
wikipedia (Optional[str]): Reference to a wikipedia site for the place.
The string has the format <language code>:<wikipedia title>.
rank_address (int): [Address rank](../customize/Ranking.md#address-rank).
rank_search (int): [Search rank](../customize/Ranking.md#search-rank).
importance (Optional[float]): Relative importance of the place. This is a measure
how likely the place will be searched for.
country_code (Optional[str]): Country the feature is in as
ISO 3166-1 alpha-2 country code.
address_rows (Optional[AddressLines]): List of places that make up the
computed address. `None` when `address_details` parameter was False.
linked_rows (Optional[AddressLines]): List of places that link to the object.
`None` when `linked_places` parameter was False.
parented_rows (Optional[AddressLines]): List of direct children of the place.
`None` when `parented_places` parameter was False.
name_keywords (Optional[WordInfos]): List of search words for the name of
the place. `None` when `keywords` parameter is set to False.
address_keywords (Optional[WordInfos]): List of search word for the address of
the place. `None` when `keywords` parameter is set to False.
geometry (dict): Dictionary containing the full geometry of the place
in the formats requested in the `geometry_output` parameter.
"""
return self._loop.run_until_complete(self._async_api.details(place, **params))
def lookup(self, places: Sequence[ntyp.PlaceRef], **params: Any) -> SearchResults:
""" Get simple information about a list of places.
Returns a list of place information for all IDs that were found.
Each result is a dataclass with the fields detailed below.
Parameters:
places: List of descriptions of the place to look up. See
[Place identification](Input-Parameter-Types.md#place-identification)
for the various ways to reference a place.
Other parameters:
geometry_output (enum): Add the full geometry of the place to the result.
Multiple formats may be selected. Note that geometries can become
quite large. (Default: none)
geometry_simplification (float): Simplification factor to use on
the geometries before returning them. The factor expresses
the tolerance in degrees from which the geometry may differ.
Topology is preserved. (Default: 0.0)
address_details (bool): Add detailed information about the places
that make up the address of the requested object. (Default: False)
linked_places (bool): Add detailed information about the places
that link to the result. (Default: False)
parented_places (bool): Add detailed information about all places
for which the requested object is a parent, i.e. all places for
which the object provides the address details.
Only POI places can have parents. (Default: False)
keywords (bool): Add detailed information about the search terms
used for this place.
Returns:
source_table (enum): Data source of the place. See below for possible values.
category (tuple): A tuple of two strings with the primary OSM tag
and value.
centroid (Point): Point position of the place.
place_id (Optional[int]): Internal ID of the place. This ID may differ
for the same place between different installations.
osm_object (Optional[tuple]): OSM type and ID of the place, if available.
names (Optional[dict]): Dictionary of names of the place. Keys are
usually the corresponding OSM tag keys.
address (Optional[dict]): Dictionary of address parts directly
attributed to the place. Keys are usually the corresponding
OSM tag keys with the `addr:` prefix removed.
extratags (Optional[dict]): Dictionary of additional attributes for
the place. Usually OSM tag keys and values.
housenumber (Optional[str]): House number of the place, normalised
for lookup. To get the house number in its original spelling,
use `address['housenumber']`.
postcode (Optional[str]): Computed postcode for the place. To get
directly attributed postcodes, use `address['postcode']` instead.
wikipedia (Optional[str]): Reference to a wikipedia site for the place.
The string has the format <language code>:<wikipedia title>.
rank_address (int): [Address rank](../customize/Ranking.md#address-rank).
rank_search (int): [Search rank](../customize/Ranking.md#search-rank).
importance (Optional[float]): Relative importance of the place. This is a measure
how likely the place will be searched for.
country_code (Optional[str]): Country the feature is in as
ISO 3166-1 alpha-2 country code.
address_rows (Optional[AddressLines]): List of places that make up the
computed address. `None` when `address_details` parameter was False.
linked_rows (Optional[AddressLines]): List of places that link to the object.
`None` when `linked_places` parameter was False.
parented_rows (Optional[AddressLines]): List of direct children of the place.
`None` when `parented_places` parameter was False.
name_keywords (Optional[WordInfos]): List of search words for the name of
the place. `None` when `keywords` parameter is set to False.
address_keywords (Optional[WordInfos]): List of search word for the address of
the place. `None` when `keywords` parameter is set to False.
bbox (Bbox): Bounding box of the full geometry of the place.
If the place is a single point, then the size of the bounding
box is guessed according to the type of place.
geometry (dict): Dictionary containing the full geometry of the place
in the formats requested in the `geometry_output` parameter.
"""
return self._loop.run_until_complete(self._async_api.lookup(places, **params))
def reverse(self, coord: ntyp.AnyPoint, **params: Any) -> Optional[ReverseResult]:
""" Find a place by its coordinates. Also known as reverse geocoding.
Returns the closest result that can be found or `None` if
no place matches the given criteria. The result is a dataclass
with the fields as detailed below.
Parameters:
coord: Coordinate to lookup the place for as a Point
or a tuple (x, y). Must be in WGS84 projection.
Other parameters:
max_rank (int): Highest address rank to return. Can be used to
restrict search to streets or settlements.
layers (enum): Defines the kind of data to take into account.
See description of layers below. (Default: addresses and POIs)
geometry_output (enum): Add the full geometry of the place to the result.
Multiple formats may be selected. Note that geometries can become
quite large. (Default: none)
geometry_simplification (float): Simplification factor to use on
the geometries before returning them. The factor expresses
the tolerance in degrees from which the geometry may differ.
Topology is preserved. (Default: 0.0)
address_details (bool): Add detailed information about the places
that make up the address of the requested object. (Default: False)
linked_places (bool): Add detailed information about the places
that link to the result. (Default: False)
parented_places (bool): Add detailed information about all places
for which the requested object is a parent, i.e. all places for
which the object provides the address details.
Only POI places can have parents. (Default: False)
keywords (bool): Add detailed information about the search terms
used for this place.
Returns:
source_table (enum): Data source of the place. See below for possible values.
category (tuple): A tuple of two strings with the primary OSM tag
and value.
centroid (Point): Point position of the place.
place_id (Optional[int]): Internal ID of the place. This ID may differ
for the same place between different installations.
osm_object (Optional[tuple]): OSM type and ID of the place, if available.
names (Optional[dict]): Dictionary of names of the place. Keys are
usually the corresponding OSM tag keys.
address (Optional[dict]): Dictionary of address parts directly
attributed to the place. Keys are usually the corresponding
OSM tag keys with the `addr:` prefix removed.
extratags (Optional[dict]): Dictionary of additional attributes for
the place. Usually OSM tag keys and values.
housenumber (Optional[str]): House number of the place, normalised
for lookup. To get the house number in its original spelling,
use `address['housenumber']`.
postcode (Optional[str]): Computed postcode for the place. To get
directly attributed postcodes, use `address['postcode']` instead.
wikipedia (Optional[str]): Reference to a wikipedia site for the place.
The string has the format <language code>:<wikipedia title>.
rank_address (int): [Address rank](../customize/Ranking.md#address-rank).
rank_search (int): [Search rank](../customize/Ranking.md#search-rank).
importance (Optional[float]): Relative importance of the place. This is a measure
how likely the place will be searched for.
country_code (Optional[str]): Country the feature is in as
ISO 3166-1 alpha-2 country code.
address_rows (Optional[AddressLines]): List of places that make up the
computed address. `None` when `address_details` parameter was False.
linked_rows (Optional[AddressLines]): List of places that link to the object.
`None` when `linked_places` parameter was False.
parented_rows (Optional[AddressLines]): List of direct children of the place.
`None` when `parented_places` parameter was False.
name_keywords (Optional[WordInfos]): List of search words for the name of
the place. `None` when `keywords` parameter is set to False.
address_keywords (Optional[WordInfos]): List of search word for the address of
the place. `None` when `keywords` parameter is set to False.
bbox (Bbox): Bounding box of the full geometry of the place.
If the place is a single point, then the size of the bounding
box is guessed according to the type of place.
geometry (dict): Dictionary containing the full geometry of the place
in the formats requested in the `geometry_output` parameter.
distance (Optional[float]): Distance in degree from the input point.
"""
return self._loop.run_until_complete(self._async_api.reverse(coord, **params))
def search(self, query: str, **params: Any) -> SearchResults:
""" Find a place by free-text search. Also known as forward geocoding.
Parameters:
query: Free-form text query searching for a place.
Other parameters:
max_results (int): Maximum number of results to return. The
actual number of results may be less. (Default: 10)
min_rank (int): Lowest permissible rank for the result.
For addressable places this is the minimum
[address rank](../customize/Ranking.md#address-rank). For all
other places the [search rank](../customize/Ranking.md#search-rank)
is used.
max_rank (int): Highest permissible rank for the result. See min_rank above.
layers (enum): Defines the kind of data to take into account.
See [layers section](Input-Parameter-Types.md#layers) for details.
(Default: addresses and POIs)
countries (list[str]): Restrict search to countries with the given
ISO 3166-1 alpha-2 country code. An empty list (the default)
disables this filter.
excluded (list[int]): A list of internal IDs of places to exclude
from the search.
viewbox (Optional[Bbox]): Bounding box of an area to focus search on.
bounded_viewbox (bool): Consider the bounding box given in `viewbox`
as a filter and return only results within the bounding box.
near (Optional[Point]): Focus search around the given point and
return results ordered by distance to the given point.
near_radius (Optional[float]): Restrict results to results within
the given distance in degrees of `near` point. Ignored, when
`near` is not set.
categories (list[tuple]): Restrict search to places of the given
categories. The category is the main OSM tag assigned to each
place. An empty list (the default) disables this filter.
geometry_output (enum): Add the full geometry of the place to the result.
Multiple formats may be selected. Note that geometries can become
quite large. (Default: none)
geometry_simplification (float): Simplification factor to use on
the geometries before returning them. The factor expresses
the tolerance in degrees from which the geometry may differ.
Topology is preserved. (Default: 0.0)
address_details (bool): Add detailed information about the places
that make up the address of the requested object. (Default: False)
linked_places (bool): Add detailed information about the places
that link to the result. (Default: False)
parented_places (bool): Add detailed information about all places
for which the requested object is a parent, i.e. all places for
which the object provides the address details.
Only POI places can have parents. (Default: False)
keywords (bool): Add detailed information about the search terms
used for this place.
Returns:
source_table (enum): Data source of the place. See below for possible values.
category (tuple): A tuple of two strings with the primary OSM tag
and value.
centroid (Point): Point position of the place.
place_id (Optional[int]): Internal ID of the place. This ID may differ
for the same place between different installations.
osm_object (Optional[tuple]): OSM type and ID of the place, if available.
names (Optional[dict]): Dictionary of names of the place. Keys are
usually the corresponding OSM tag keys.
address (Optional[dict]): Dictionary of address parts directly
attributed to the place. Keys are usually the corresponding
OSM tag keys with the `addr:` prefix removed.
extratags (Optional[dict]): Dictionary of additional attributes for
the place. Usually OSM tag keys and values.
housenumber (Optional[str]): House number of the place, normalised
for lookup. To get the house number in its original spelling,
use `address['housenumber']`.
postcode (Optional[str]): Computed postcode for the place. To get
directly attributed postcodes, use `address['postcode']` instead.
wikipedia (Optional[str]): Reference to a wikipedia site for the place.
The string has the format <language code>:<wikipedia title>.
rank_address (int): [Address rank](../customize/Ranking.md#address-rank).
rank_search (int): [Search rank](../customize/Ranking.md#search-rank).
importance (Optional[float]): Relative importance of the place. This is a measure
how likely the place will be searched for.
country_code (Optional[str]): Country the feature is in as
ISO 3166-1 alpha-2 country code.
address_rows (Optional[AddressLines]): List of places that make up the
computed address. `None` when `address_details` parameter was False.
linked_rows (Optional[AddressLines]): List of places that link to the object.
`None` when `linked_places` parameter was False.
parented_rows (Optional[AddressLines]): List of direct children of the place.
`None` when `parented_places` parameter was False.
name_keywords (Optional[WordInfos]): List of search words for the name of
the place. `None` when `keywords` parameter is set to False.
address_keywords (Optional[WordInfos]): List of search word for the address of
the place. `None` when `keywords` parameter is set to False.
bbox (Bbox): Bounding box of the full geometry of the place.
If the place is a single point, then the size of the bounding
box is guessed according to the type of place.
geometry (dict): Dictionary containing the full geometry of the place
in the formats requested in the `geometry_output` parameter.
"""
return self._loop.run_until_complete(
self._async_api.search(query, **params))
# pylint: disable=too-many-arguments
def search_address(self, amenity: Optional[str] = None,
street: Optional[str] = None,
city: Optional[str] = None,
county: Optional[str] = None,
state: Optional[str] = None,
country: Optional[str] = None,
postalcode: Optional[str] = None,
**params: Any) -> SearchResults:
""" Find an address using structured search.
Parameters:
amenity: Name of a POI.
street: Street and optionally housenumber of the address. If the address
does not have a street, then the place the housenumber references to.
city: Postal city of the address.
county: County equivalent of the address. Does not exist in all
jurisdictions.
state: State or province of the address.
country: Country with its full name or its ISO 3166-1 alpha-2 country code.
Do not use together with the country_code filter.
postalcode: Post code or ZIP for the place.
Other parameters:
max_results (int): Maximum number of results to return. The
actual number of results may be less. (Default: 10)
min_rank (int): Lowest permissible rank for the result.
For addressable places this is the minimum
[address rank](../customize/Ranking.md#address-rank). For all
other places the [search rank](../customize/Ranking.md#search-rank)
is used.
max_rank (int): Highest permissible rank for the result. See min_rank above.
layers (enum): Defines the kind of data to take into account.
See [layers section](Input-Parameter-Types.md#layers) for details.
(Default: addresses and POIs)
countries (list[str]): Restrict search to countries with the given
ISO 3166-1 alpha-2 country code. An empty list (the default)
disables this filter. Do not use, when the country parameter
is used.
excluded (list[int]): A list of internal IDs of places to exclude
from the search.
viewbox (Optional[Bbox]): Bounding box of an area to focus search on.
bounded_viewbox (bool): Consider the bounding box given in `viewbox`
as a filter and return only results within the bounding box.
near (Optional[Point]): Focus search around the given point and
return results ordered by distance to the given point.
near_radius (Optional[float]): Restrict results to results within
the given distance in degrees of `near` point. Ignored, when
`near` is not set.
categories (list[tuple]): Restrict search to places of the given
categories. The category is the main OSM tag assigned to each
place. An empty list (the default) disables this filter.
geometry_output (enum): Add the full geometry of the place to the result.
Multiple formats may be selected. Note that geometries can become
quite large. (Default: none)
geometry_simplification (float): Simplification factor to use on
the geometries before returning them. The factor expresses
the tolerance in degrees from which the geometry may differ.
Topology is preserved. (Default: 0.0)
address_details (bool): Add detailed information about the places
that make up the address of the requested object. (Default: False)
linked_places (bool): Add detailed information about the places
that link to the result. (Default: False)
parented_places (bool): Add detailed information about all places
for which the requested object is a parent, i.e. all places for
which the object provides the address details.
Only POI places can have parents. (Default: False)
keywords (bool): Add detailed information about the search terms
used for this place.
Returns:
source_table (enum): Data source of the place. See below for possible values.
category (tuple): A tuple of two strings with the primary OSM tag
and value.
centroid (Point): Point position of the place.
place_id (Optional[int]): Internal ID of the place. This ID may differ
for the same place between different installations.
osm_object (Optional[tuple]): OSM type and ID of the place, if available.
names (Optional[dict]): Dictionary of names of the place. Keys are
usually the corresponding OSM tag keys.
address (Optional[dict]): Dictionary of address parts directly
attributed to the place. Keys are usually the corresponding
OSM tag keys with the `addr:` prefix removed.
extratags (Optional[dict]): Dictionary of additional attributes for
the place. Usually OSM tag keys and values.
housenumber (Optional[str]): House number of the place, normalised
for lookup. To get the house number in its original spelling,
use `address['housenumber']`.
postcode (Optional[str]): Computed postcode for the place. To get
directly attributed postcodes, use `address['postcode']` instead.
wikipedia (Optional[str]): Reference to a wikipedia site for the place.
The string has the format <language code>:<wikipedia title>.
rank_address (int): [Address rank](../customize/Ranking.md#address-rank).
rank_search (int): [Search rank](../customize/Ranking.md#search-rank).
importance (Optional[float]): Relative importance of the place. This is a measure
how likely the place will be searched for.
country_code (Optional[str]): Country the feature is in as
ISO 3166-1 alpha-2 country code.
address_rows (Optional[AddressLines]): List of places that make up the
computed address. `None` when `address_details` parameter was False.
linked_rows (Optional[AddressLines]): List of places that link to the object.
`None` when `linked_places` parameter was False.
parented_rows (Optional[AddressLines]): List of direct children of the place.
`None` when `parented_places` parameter was False.
name_keywords (Optional[WordInfos]): List of search words for the name of
the place. `None` when `keywords` parameter is set to False.
address_keywords (Optional[WordInfos]): List of search word for the address of
the place. `None` when `keywords` parameter is set to False.
bbox (Bbox): Bounding box of the full geometry of the place.
If the place is a single point, then the size of the bounding
box is guessed according to the type of place.
geometry (dict): Dictionary containing the full geometry of the place
in the formats requested in the `geometry_output` parameter.
"""
return self._loop.run_until_complete(
self._async_api.search_address(amenity, street, city, county,
state, country, postalcode, **params))
def search_category(self, categories: List[Tuple[str, str]],
near_query: Optional[str] = None,
**params: Any) -> SearchResults:
""" Find an object of a certain category near another place.
The near place may either be given as an unstructured search
query in itself or as a geographic area through the
viewbox or near parameters.
Parameters:
categories: Restrict search to places of the given
categories. The category is the main OSM tag assigned to each
place.
near_query: Optional free-text query to define the are to
restrict search to.
Other parameters:
max_results (int): Maximum number of results to return. The
actual number of results may be less. (Default: 10)
min_rank (int): Lowest permissible rank for the result.
For addressable places this is the minimum
[address rank](../customize/Ranking.md#address-rank). For all
other places the [search rank](../customize/Ranking.md#search-rank)
is used.
max_rank (int): Highest permissible rank for the result. See min_rank above.
layers (enum): Defines the kind of data to take into account.
See [layers section](Input-Parameter-Types.md#layers) for details.
(Default: addresses and POIs)
countries (list[str]): Restrict search to countries with the given
ISO 3166-1 alpha-2 country code. An empty list (the default)
disables this filter.
excluded (list[int]): A list of internal IDs of places to exclude
from the search.
viewbox (Optional[Bbox]): Bounding box of an area to focus search on.
bounded_viewbox (bool): Consider the bounding box given in `viewbox`
as a filter and return only results within the bounding box.
near (Optional[Point]): Focus search around the given point and
return results ordered by distance to the given point.
near_radius (Optional[float]): Restrict results to results within
the given distance in degrees of `near` point. Ignored, when
`near` is not set.
geometry_output (enum): Add the full geometry of the place to the result.
Multiple formats may be selected. Note that geometries can become
quite large. (Default: none)
geometry_simplification (float): Simplification factor to use on
the geometries before returning them. The factor expresses
the tolerance in degrees from which the geometry may differ.
Topology is preserved. (Default: 0.0)
address_details (bool): Add detailed information about the places
that make up the address of the requested object. (Default: False)
linked_places (bool): Add detailed information about the places
that link to the result. (Default: False)
parented_places (bool): Add detailed information about all places
for which the requested object is a parent, i.e. all places for
which the object provides the address details.
Only POI places can have parents. (Default: False)
keywords (bool): Add detailed information about the search terms
used for this place.
Returns:
source_table (enum): Data source of the place. See below for possible values.
category (tuple): A tuple of two strings with the primary OSM tag
and value.
centroid (Point): Point position of the place.
place_id (Optional[int]): Internal ID of the place. This ID may differ
for the same place between different installations.
osm_object (Optional[tuple]): OSM type and ID of the place, if available.
names (Optional[dict]): Dictionary of names of the place. Keys are
usually the corresponding OSM tag keys.
address (Optional[dict]): Dictionary of address parts directly
attributed to the place. Keys are usually the corresponding
OSM tag keys with the `addr:` prefix removed.
extratags (Optional[dict]): Dictionary of additional attributes for
the place. Usually OSM tag keys and values.
housenumber (Optional[str]): House number of the place, normalised
for lookup. To get the house number in its original spelling,
use `address['housenumber']`.
postcode (Optional[str]): Computed postcode for the place. To get
directly attributed postcodes, use `address['postcode']` instead.
wikipedia (Optional[str]): Reference to a wikipedia site for the place.
The string has the format <language code>:<wikipedia title>.
rank_address (int): [Address rank](../customize/Ranking.md#address-rank).
rank_search (int): [Search rank](../customize/Ranking.md#search-rank).
importance (Optional[float]): Relative importance of the place. This is a measure
how likely the place will be searched for.
country_code (Optional[str]): Country the feature is in as
ISO 3166-1 alpha-2 country code.
address_rows (Optional[AddressLines]): List of places that make up the
computed address. `None` when `address_details` parameter was False.
linked_rows (Optional[AddressLines]): List of places that link to the object.
`None` when `linked_places` parameter was False.
parented_rows (Optional[AddressLines]): List of direct children of the place.
`None` when `parented_places` parameter was False.
name_keywords (Optional[WordInfos]): List of search words for the name of
the place. `None` when `keywords` parameter is set to False.
address_keywords (Optional[WordInfos]): List of search word for the address of
the place. `None` when `keywords` parameter is set to False.
bbox (Bbox): Bounding box of the full geometry of the place.
If the place is a single point, then the size of the bounding
box is guessed according to the type of place.
geometry (dict): Dictionary containing the full geometry of the place
in the formats requested in the `geometry_output` parameter.
"""
return self._loop.run_until_complete(
self._async_api.search_category(categories, near_query, **params))

View File

@@ -0,0 +1,97 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Helper functions for localizing names of results.
"""
from typing import Mapping, List, Optional
import re
class Locales:
""" Helper class for localization of names.
It takes a list of language prefixes in their order of preferred
usage.
"""
def __init__(self, langs: Optional[List[str]] = None):
self.languages = langs or []
self.name_tags: List[str] = []
# Build the list of supported tags. It is currently hard-coded.
self._add_lang_tags('name')
self._add_tags('name', 'brand')
self._add_lang_tags('official_name', 'short_name')
self._add_tags('official_name', 'short_name', 'ref')
def __bool__(self) -> bool:
return len(self.languages) > 0
def _add_tags(self, *tags: str) -> None:
for tag in tags:
self.name_tags.append(tag)
self.name_tags.append(f"_place_{tag}")
def _add_lang_tags(self, *tags: str) -> None:
for tag in tags:
for lang in self.languages:
self.name_tags.append(f"{tag}:{lang}")
self.name_tags.append(f"_place_{tag}:{lang}")
def display_name(self, names: Optional[Mapping[str, str]]) -> str:
""" Return the best matching name from a dictionary of names
containing different name variants.
If 'names' is null or empty, an empty string is returned. If no
appropriate localization is found, the first name is returned.
"""
if not names:
return ''
if len(names) > 1:
for tag in self.name_tags:
if tag in names:
return names[tag]
# Nothing? Return any of the other names as a default.
return next(iter(names.values()))
@staticmethod
def from_accept_languages(langstr: str) -> 'Locales':
""" Create a localization object from a language list in the
format of HTTP accept-languages header.
The functions tries to be forgiving of format errors by first splitting
the string into comma-separated parts and then parsing each
description separately. Badly formatted parts are then ignored.
"""
# split string into languages
candidates = []
for desc in langstr.split(','):
m = re.fullmatch(r'\s*([a-z_-]+)(?:;\s*q\s*=\s*([01](?:\.\d+)?))?\s*',
desc, flags=re.I)
if m:
candidates.append((m[1], float(m[2] or 1.0)))
# sort the results by the weight of each language (preserving order).
candidates.sort(reverse=True, key=lambda e: e[1])
# If a language has a region variant, also add the language without
# variant but only if it isn't already in the list to not mess up the weight.
languages = []
for lid, _ in candidates:
languages.append(lid)
parts = lid.split('-', 1)
if len(parts) > 1 and all(c[0] != parts[0] for c in candidates):
languages.append(parts[0])
return Locales(languages)

View File

@@ -0,0 +1,433 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Functions for specialised logging with HTML output.
"""
from typing import Any, Iterator, Optional, List, Tuple, cast, Union, Mapping, Sequence
from contextvars import ContextVar
import datetime as dt
import textwrap
import io
import re
import html
import sqlalchemy as sa
from sqlalchemy.ext.asyncio import AsyncConnection
try:
from pygments import highlight
from pygments.lexers import PythonLexer, PostgresLexer
from pygments.formatters import HtmlFormatter
CODE_HIGHLIGHT = True
except ModuleNotFoundError:
CODE_HIGHLIGHT = False
def _debug_name(res: Any) -> str:
if res.names:
return cast(str, res.names.get('name', next(iter(res.names.values()))))
return f"Hnr {res.housenumber}" if res.housenumber is not None else '[NONE]'
class BaseLogger:
""" Interface for logging function.
The base implementation does nothing. Overwrite the functions
in derived classes which implement logging functionality.
"""
def get_buffer(self) -> str:
""" Return the current content of the log buffer.
"""
return ''
def function(self, func: str, **kwargs: Any) -> None:
""" Start a new debug chapter for the given function and its parameters.
"""
def section(self, heading: str) -> None:
""" Start a new section with the given title.
"""
def comment(self, text: str) -> None:
""" Add a simple comment to the debug output.
"""
def var_dump(self, heading: str, var: Any) -> None:
""" Print the content of the variable to the debug output prefixed by
the given heading.
"""
def table_dump(self, heading: str, rows: Iterator[Optional[List[Any]]]) -> None:
""" Print the table generated by the generator function.
"""
def result_dump(self, heading: str, results: Iterator[Tuple[Any, Any]]) -> None:
""" Print a list of search results generated by the generator function.
"""
def sql(self, conn: AsyncConnection, statement: 'sa.Executable',
params: Union[Mapping[str, Any], Sequence[Mapping[str, Any]], None]) -> None:
""" Print the SQL for the given statement.
"""
def format_sql(self, conn: AsyncConnection, statement: 'sa.Executable',
extra_params: Union[Mapping[str, Any],
Sequence[Mapping[str, Any]], None]) -> str:
""" Return the compiled version of the statement.
"""
compiled = cast('sa.ClauseElement', statement).compile(conn.sync_engine)
params = dict(compiled.params)
if isinstance(extra_params, Mapping):
for k, v in extra_params.items():
if hasattr(v, 'to_wkt'):
params[k] = v.to_wkt()
elif isinstance(v, (int, float)):
params[k] = v
else:
params[k] = str(v)
elif isinstance(extra_params, Sequence) and extra_params:
for k in extra_params[0]:
params[k] = f':{k}'
sqlstr = str(compiled)
if conn.dialect.name == 'postgresql':
if sa.__version__.startswith('1'):
try:
sqlstr = re.sub(r'__\[POSTCOMPILE_[^]]*\]', '%s', sqlstr)
return sqlstr % tuple((repr(params.get(name, None))
for name in compiled.positiontup)) # type: ignore
except TypeError:
return sqlstr
# Fixes an odd issue with Python 3.7 where percentages are not
# quoted correctly.
sqlstr = re.sub(r'%(?!\()', '%%', sqlstr)
sqlstr = re.sub(r'__\[POSTCOMPILE_([^]]*)\]', r'%(\1)s', sqlstr)
return sqlstr % params
assert conn.dialect.name == 'sqlite'
# params in positional order
pparams = (repr(params.get(name, None)) for name in compiled.positiontup) # type: ignore
sqlstr = re.sub(r'__\[POSTCOMPILE_([^]]*)\]', '?', sqlstr)
sqlstr = re.sub(r"\?", lambda m: next(pparams), sqlstr)
return sqlstr
class HTMLLogger(BaseLogger):
""" Logger that formats messages in HTML.
"""
def __init__(self) -> None:
self.buffer = io.StringIO()
def _timestamp(self) -> None:
self._write(f'<p class="timestamp">[{dt.datetime.now()}]</p>')
def get_buffer(self) -> str:
return HTML_HEADER + self.buffer.getvalue() + HTML_FOOTER
def function(self, func: str, **kwargs: Any) -> None:
self._timestamp()
self._write(f"<h1>Debug output for {func}()</h1>\n<p>Parameters:<dl>")
for name, value in kwargs.items():
self._write(f'<dt>{name}</dt><dd>{self._python_var(value)}</dd>')
self._write('</dl></p>')
def section(self, heading: str) -> None:
self._timestamp()
self._write(f"<h2>{heading}</h2>")
def comment(self, text: str) -> None:
self._timestamp()
self._write(f"<p>{text}</p>")
def var_dump(self, heading: str, var: Any) -> None:
self._timestamp()
if callable(var):
var = var()
self._write(f'<h5>{heading}</h5>{self._python_var(var)}')
def table_dump(self, heading: str, rows: Iterator[Optional[List[Any]]]) -> None:
self._timestamp()
head = next(rows)
assert head
self._write(f'<table><thead><tr><th colspan="{len(head)}">{heading}</th></tr><tr>')
for cell in head:
self._write(f'<th>{cell}</th>')
self._write('</tr></thead><tbody>')
for row in rows:
if row is not None:
self._write('<tr>')
for cell in row:
self._write(f'<td>{cell}</td>')
self._write('</tr>')
self._write('</tbody></table>')
def result_dump(self, heading: str, results: Iterator[Tuple[Any, Any]]) -> None:
""" Print a list of search results generated by the generator function.
"""
self._timestamp()
def format_osm(osm_object: Optional[Tuple[str, int]]) -> str:
if not osm_object:
return '-'
t, i = osm_object
if t == 'N':
fullt = 'node'
elif t == 'W':
fullt = 'way'
elif t == 'R':
fullt = 'relation'
else:
return f'{t}{i}'
return f'<a href="https://www.openstreetmap.org/{fullt}/{i}">{t}{i}</a>'
self._write(f'<h5>{heading}</h5><p><dl>')
total = 0
for rank, res in results:
self._write(f'<dt>[{rank:.3f}]</dt> <dd>{res.source_table.name}(')
self._write(f"{_debug_name(res)}, type=({','.join(res.category)}), ")
self._write(f"rank={res.rank_address}, ")
self._write(f"osm={format_osm(res.osm_object)}, ")
self._write(f'cc={res.country_code}, ')
self._write(f'importance={res.importance or float("nan"):.5f})</dd>')
total += 1
self._write(f'</dl><b>TOTAL:</b> {total}</p>')
def sql(self, conn: AsyncConnection, statement: 'sa.Executable',
params: Union[Mapping[str, Any], Sequence[Mapping[str, Any]], None]) -> None:
self._timestamp()
sqlstr = self.format_sql(conn, statement, params)
if CODE_HIGHLIGHT:
sqlstr = highlight(sqlstr, PostgresLexer(),
HtmlFormatter(nowrap=True, lineseparator='<br />'))
self._write(f'<div class="highlight"><code class="lang-sql">{sqlstr}</code></div>')
else:
self._write(f'<code class="lang-sql">{html.escape(sqlstr)}</code>')
def _python_var(self, var: Any) -> str:
if CODE_HIGHLIGHT:
fmt = highlight(str(var), PythonLexer(), HtmlFormatter(nowrap=True))
return f'<div class="highlight"><code class="lang-python">{fmt}</code></div>'
return f'<code class="lang-python">{html.escape(str(var))}</code>'
def _write(self, text: str) -> None:
""" Add the raw text to the debug output.
"""
self.buffer.write(text)
class TextLogger(BaseLogger):
""" Logger creating output suitable for the console.
"""
def __init__(self) -> None:
self.buffer = io.StringIO()
def _timestamp(self) -> None:
self._write(f'[{dt.datetime.now()}]\n')
def get_buffer(self) -> str:
return self.buffer.getvalue()
def function(self, func: str, **kwargs: Any) -> None:
self._write(f"#### Debug output for {func}()\n\nParameters:\n")
for name, value in kwargs.items():
self._write(f' {name}: {self._python_var(value)}\n')
self._write('\n')
def section(self, heading: str) -> None:
self._timestamp()
self._write(f"\n# {heading}\n\n")
def comment(self, text: str) -> None:
self._write(f"{text}\n")
def var_dump(self, heading: str, var: Any) -> None:
if callable(var):
var = var()
self._write(f'{heading}:\n {self._python_var(var)}\n\n')
def table_dump(self, heading: str, rows: Iterator[Optional[List[Any]]]) -> None:
self._write(f'{heading}:\n')
data = [list(map(self._python_var, row)) if row else None for row in rows]
assert data[0] is not None
num_cols = len(data[0])
maxlens = [max(len(d[i]) for d in data if d) for i in range(num_cols)]
tablewidth = sum(maxlens) + 3 * num_cols + 1
row_format = '| ' +' | '.join(f'{{:<{l}}}' for l in maxlens) + ' |\n'
self._write('-'*tablewidth + '\n')
self._write(row_format.format(*data[0]))
self._write('-'*tablewidth + '\n')
for row in data[1:]:
if row:
self._write(row_format.format(*row))
else:
self._write('-'*tablewidth + '\n')
if data[-1]:
self._write('-'*tablewidth + '\n')
def result_dump(self, heading: str, results: Iterator[Tuple[Any, Any]]) -> None:
self._timestamp()
self._write(f'{heading}:\n')
total = 0
for rank, res in results:
self._write(f'[{rank:.3f}] {res.source_table.name}(')
self._write(f"{_debug_name(res)}, type=({','.join(res.category)}), ")
self._write(f"rank={res.rank_address}, ")
self._write(f"osm={''.join(map(str, res.osm_object or []))}, ")
self._write(f'cc={res.country_code}, ')
self._write(f'importance={res.importance or -1:.5f})\n')
total += 1
self._write(f'TOTAL: {total}\n\n')
def sql(self, conn: AsyncConnection, statement: 'sa.Executable',
params: Union[Mapping[str, Any], Sequence[Mapping[str, Any]], None]) -> None:
self._timestamp()
sqlstr = '\n| '.join(textwrap.wrap(self.format_sql(conn, statement, params), width=78))
self._write(f"| {sqlstr}\n\n")
def _python_var(self, var: Any) -> str:
return str(var)
def _write(self, text: str) -> None:
self.buffer.write(text)
logger: ContextVar[BaseLogger] = ContextVar('logger', default=BaseLogger())
def set_log_output(fmt: str) -> None:
""" Enable collecting debug information.
"""
if fmt == 'html':
logger.set(HTMLLogger())
elif fmt == 'text':
logger.set(TextLogger())
else:
logger.set(BaseLogger())
def log() -> BaseLogger:
""" Return the logger for the current context.
"""
return logger.get()
def get_and_disable() -> str:
""" Return the current content of the debug buffer and disable logging.
"""
buf = logger.get().get_buffer()
logger.set(BaseLogger())
return buf
HTML_HEADER: str = """<!DOCTYPE html>
<html>
<head>
<title>Nominatim - Debug</title>
<style>
""" + \
(HtmlFormatter(nobackground=True).get_style_defs('.highlight') if CODE_HIGHLIGHT else '') +\
"""
h2 { font-size: x-large }
dl {
padding-left: 10pt;
font-family: monospace
}
dt {
float: left;
font-weight: bold;
margin-right: 0.5em
}
dt::after { content: ": "; }
dd::after {
clear: left;
display: block
}
.lang-sql {
color: #555;
font-size: small
}
h5 {
border: solid lightgrey 0.1pt;
margin-bottom: 0;
background-color: #f7f7f7
}
h5 + .highlight {
padding: 3pt;
border: solid lightgrey 0.1pt
}
table, th, tbody {
border: thin solid;
border-collapse: collapse;
}
td {
border-right: thin solid;
padding-left: 3pt;
padding-right: 3pt;
}
.timestamp {
font-size: 0.8em;
color: darkblue;
width: calc(100% - 5pt);
text-align: right;
position: absolute;
left: 0;
margin-top: -5px;
}
</style>
</head>
<body>
"""
HTML_FOOTER: str = "</body></html>"

250
src/nominatim_api/lookup.py Normal file
View File

@@ -0,0 +1,250 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Implementation of place lookup by ID.
"""
from typing import Optional, Callable, Tuple, Type
import datetime as dt
import sqlalchemy as sa
from nominatim_core.typing import SaColumn, SaRow, SaSelect
from .connection import SearchConnection
from .logging import log
from . import types as ntyp
from . import results as nres
RowFunc = Callable[[Optional[SaRow], Type[nres.BaseResultT]], Optional[nres.BaseResultT]]
GeomFunc = Callable[[SaSelect, SaColumn], SaSelect]
async def find_in_placex(conn: SearchConnection, place: ntyp.PlaceRef,
add_geometries: GeomFunc) -> Optional[SaRow]:
""" Search for the given place in the placex table and return the
base information.
"""
log().section("Find in placex table")
t = conn.t.placex
sql = sa.select(t.c.place_id, t.c.osm_type, t.c.osm_id, t.c.name,
t.c.class_, t.c.type, t.c.admin_level,
t.c.address, t.c.extratags,
t.c.housenumber, t.c.postcode, t.c.country_code,
t.c.importance, t.c.wikipedia, t.c.indexed_date,
t.c.parent_place_id, t.c.rank_address, t.c.rank_search,
t.c.linked_place_id,
t.c.geometry.ST_Expand(0).label('bbox'),
t.c.centroid)
if isinstance(place, ntyp.PlaceID):
sql = sql.where(t.c.place_id == place.place_id)
elif isinstance(place, ntyp.OsmID):
sql = sql.where(t.c.osm_type == place.osm_type)\
.where(t.c.osm_id == place.osm_id)
if place.osm_class:
sql = sql.where(t.c.class_ == place.osm_class)
else:
sql = sql.order_by(t.c.class_)
sql = sql.limit(1)
else:
return None
return (await conn.execute(add_geometries(sql, t.c.geometry))).one_or_none()
async def find_in_osmline(conn: SearchConnection, place: ntyp.PlaceRef,
add_geometries: GeomFunc) -> Optional[SaRow]:
""" Search for the given place in the osmline table and return the
base information.
"""
log().section("Find in interpolation table")
t = conn.t.osmline
sql = sa.select(t.c.place_id, t.c.osm_id, t.c.parent_place_id,
t.c.indexed_date, t.c.startnumber, t.c.endnumber,
t.c.step, t.c.address, t.c.postcode, t.c.country_code,
t.c.linegeo.ST_Centroid().label('centroid'))
if isinstance(place, ntyp.PlaceID):
sql = sql.where(t.c.place_id == place.place_id)
elif isinstance(place, ntyp.OsmID) and place.osm_type == 'W':
# There may be multiple interpolations for a single way.
# If 'class' contains a number, return the one that belongs to that number.
sql = sql.where(t.c.osm_id == place.osm_id).limit(1)
if place.osm_class and place.osm_class.isdigit():
sql = sql.order_by(sa.func.greatest(0,
int(place.osm_class) - t.c.endnumber,
t.c.startnumber - int(place.osm_class)))
else:
return None
return (await conn.execute(add_geometries(sql, t.c.linegeo))).one_or_none()
async def find_in_tiger(conn: SearchConnection, place: ntyp.PlaceRef,
add_geometries: GeomFunc) -> Optional[SaRow]:
""" Search for the given place in the table of Tiger addresses and return
the base information. Only lookup by place ID is supported.
"""
if not isinstance(place, ntyp.PlaceID):
return None
log().section("Find in TIGER table")
t = conn.t.tiger
parent = conn.t.placex
sql = sa.select(t.c.place_id, t.c.parent_place_id,
parent.c.osm_type, parent.c.osm_id,
t.c.startnumber, t.c.endnumber, t.c.step,
t.c.postcode,
t.c.linegeo.ST_Centroid().label('centroid'))\
.where(t.c.place_id == place.place_id)\
.join(parent, t.c.parent_place_id == parent.c.place_id, isouter=True)
return (await conn.execute(add_geometries(sql, t.c.linegeo))).one_or_none()
async def find_in_postcode(conn: SearchConnection, place: ntyp.PlaceRef,
add_geometries: GeomFunc) -> Optional[SaRow]:
""" Search for the given place in the postcode table and return the
base information. Only lookup by place ID is supported.
"""
if not isinstance(place, ntyp.PlaceID):
return None
log().section("Find in postcode table")
t = conn.t.postcode
sql = sa.select(t.c.place_id, t.c.parent_place_id,
t.c.rank_search, t.c.rank_address,
t.c.indexed_date, t.c.postcode, t.c.country_code,
t.c.geometry.label('centroid')) \
.where(t.c.place_id == place.place_id)
return (await conn.execute(add_geometries(sql, t.c.geometry))).one_or_none()
async def find_in_all_tables(conn: SearchConnection, place: ntyp.PlaceRef,
add_geometries: GeomFunc
) -> Tuple[Optional[SaRow], RowFunc[nres.BaseResultT]]:
""" Search for the given place in all data tables
and return the base information.
"""
row = await find_in_placex(conn, place, add_geometries)
log().var_dump('Result (placex)', row)
if row is not None:
return row, nres.create_from_placex_row
row = await find_in_osmline(conn, place, add_geometries)
log().var_dump('Result (osmline)', row)
if row is not None:
return row, nres.create_from_osmline_row
row = await find_in_postcode(conn, place, add_geometries)
log().var_dump('Result (postcode)', row)
if row is not None:
return row, nres.create_from_postcode_row
row = await find_in_tiger(conn, place, add_geometries)
log().var_dump('Result (tiger)', row)
return row, nres.create_from_tiger_row
async def get_detailed_place(conn: SearchConnection, place: ntyp.PlaceRef,
details: ntyp.LookupDetails) -> Optional[nres.DetailedResult]:
""" Retrieve a place with additional details from the database.
"""
log().function('get_detailed_place', place=place, details=details)
if details.geometry_output and details.geometry_output != ntyp.GeometryFormat.GEOJSON:
raise ValueError("lookup only supports geojosn polygon output.")
if details.geometry_output & ntyp.GeometryFormat.GEOJSON:
def _add_geometry(sql: SaSelect, column: SaColumn) -> SaSelect:
return sql.add_columns(sa.func.ST_AsGeoJSON(
sa.case((sa.func.ST_NPoints(column) > 5000,
sa.func.ST_SimplifyPreserveTopology(column, 0.0001)),
else_=column), 7).label('geometry_geojson'))
else:
def _add_geometry(sql: SaSelect, column: SaColumn) -> SaSelect:
return sql.add_columns(sa.func.ST_GeometryType(column).label('geometry_type'))
row_func: RowFunc[nres.DetailedResult]
row, row_func = await find_in_all_tables(conn, place, _add_geometry)
if row is None:
return None
result = row_func(row, nres.DetailedResult)
assert result is not None
# add missing details
assert result is not None
if 'type' in result.geometry:
result.geometry['type'] = GEOMETRY_TYPE_MAP.get(result.geometry['type'],
result.geometry['type'])
indexed_date = getattr(row, 'indexed_date', None)
if indexed_date is not None:
result.indexed_date = indexed_date.replace(tzinfo=dt.timezone.utc)
await nres.add_result_details(conn, [result], details)
return result
async def get_simple_place(conn: SearchConnection, place: ntyp.PlaceRef,
details: ntyp.LookupDetails) -> Optional[nres.SearchResult]:
""" Retrieve a place as a simple search result from the database.
"""
log().function('get_simple_place', place=place, details=details)
def _add_geometry(sql: SaSelect, col: SaColumn) -> SaSelect:
if not details.geometry_output:
return sql
out = []
if details.geometry_simplification > 0.0:
col = sa.func.ST_SimplifyPreserveTopology(col, details.geometry_simplification)
if details.geometry_output & ntyp.GeometryFormat.GEOJSON:
out.append(sa.func.ST_AsGeoJSON(col, 7).label('geometry_geojson'))
if details.geometry_output & ntyp.GeometryFormat.TEXT:
out.append(sa.func.ST_AsText(col).label('geometry_text'))
if details.geometry_output & ntyp.GeometryFormat.KML:
out.append(sa.func.ST_AsKML(col, 7).label('geometry_kml'))
if details.geometry_output & ntyp.GeometryFormat.SVG:
out.append(sa.func.ST_AsSVG(col, 0, 7).label('geometry_svg'))
return sql.add_columns(*out)
row_func: RowFunc[nres.SearchResult]
row, row_func = await find_in_all_tables(conn, place, _add_geometry)
if row is None:
return None
result = row_func(row, nres.SearchResult)
assert result is not None
# add missing details
assert result is not None
if hasattr(row, 'bbox'):
result.bbox = ntyp.Bbox.from_wkb(row.bbox)
await nres.add_result_details(conn, [result], details)
return result
GEOMETRY_TYPE_MAP = {
'POINT': 'ST_Point',
'MULTIPOINT': 'ST_MultiPoint',
'LINESTRING': 'ST_LineString',
'MULTILINESTRING': 'ST_MultiLineString',
'POLYGON': 'ST_Polygon',
'MULTIPOLYGON': 'ST_MultiPolygon',
'GEOMETRYCOLLECTION': 'ST_GeometryCollection'
}

View File

View File

@@ -0,0 +1,56 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Helper classes and functions for formatting results into API responses.
"""
from typing import Type, TypeVar, Dict, List, Callable, Any, Mapping
from collections import defaultdict
T = TypeVar('T') # pylint: disable=invalid-name
FormatFunc = Callable[[T, Mapping[str, Any]], str]
class FormatDispatcher:
""" Helper class to conveniently create formatting functions in
a module using decorators.
"""
def __init__(self) -> None:
self.format_functions: Dict[Type[Any], Dict[str, FormatFunc[Any]]] = defaultdict(dict)
def format_func(self, result_class: Type[T],
fmt: str) -> Callable[[FormatFunc[T]], FormatFunc[T]]:
""" Decorator for a function that formats a given type of result into the
selected format.
"""
def decorator(func: FormatFunc[T]) -> FormatFunc[T]:
self.format_functions[result_class][fmt] = func
return func
return decorator
def list_formats(self, result_type: Type[Any]) -> List[str]:
""" Return a list of formats supported by this formatter.
"""
return list(self.format_functions[result_type].keys())
def supports_format(self, result_type: Type[Any], fmt: str) -> bool:
""" Check if the given format is supported by this formatter.
"""
return fmt in self.format_functions[result_type]
def format_result(self, result: Any, fmt: str, options: Mapping[str, Any]) -> str:
""" Convert the given result into a string using the given format.
The format is expected to be in the list returned by
`list_formats()`.
"""
return self.format_functions[type(result)][fmt](result, options)

View File

@@ -0,0 +1,752 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Dataclasses for search results and helper functions to fill them.
Data classes are part of the public API while the functions are for
internal use only. That's why they are implemented as free-standing functions
instead of member functions.
"""
from typing import Optional, Tuple, Dict, Sequence, TypeVar, Type, List, cast, Callable
import enum
import dataclasses
import datetime as dt
import sqlalchemy as sa
from nominatim_core.typing import SaSelect, SaRow
from nominatim_core.db.sqlalchemy_types import Geometry
from .types import Point, Bbox, LookupDetails
from .connection import SearchConnection
from .logging import log
from .localization import Locales
# This file defines complex result data classes.
# pylint: disable=too-many-instance-attributes
def _mingle_name_tags(names: Optional[Dict[str, str]]) -> Optional[Dict[str, str]]:
""" Mix-in names from linked places, so that they show up
as standard names where necessary.
"""
if not names:
return None
out = {}
for k, v in names.items():
if k.startswith('_place_'):
outkey = k[7:]
out[k if outkey in names else outkey] = v
else:
out[k] = v
return out
class SourceTable(enum.Enum):
""" The `SourceTable` type lists the possible sources a result can have.
"""
PLACEX = 1
""" The placex table is the main source for result usually containing
OSM data.
"""
OSMLINE = 2
""" The osmline table contains address interpolations from OSM data.
Interpolation addresses are always approximate. The OSM id in the
result refers to the OSM way with the interpolation line object.
"""
TIGER = 3
""" TIGER address data contains US addresses imported on the side,
see [Installing TIGER data](../customize/Tiger.md).
TIGER address are also interpolations. The addresses always refer
to a street from OSM data. The OSM id in the result refers to
that street.
"""
POSTCODE = 4
""" The postcode table contains artificial centroids for postcodes,
computed from the postcodes available with address points. Results
are always approximate.
"""
COUNTRY = 5
""" The country table provides a fallback, when country data is missing
in the OSM data.
"""
@dataclasses.dataclass
class AddressLine:
""" The `AddressLine` may contain the following fields about a related place
and its function as an address object. Most fields are optional.
Their presence depends on the kind and function of the address part.
"""
category: Tuple[str, str]
""" Main category of the place, described by a key-value pair.
"""
names: Dict[str, str]
""" All available names for the place including references, alternative
names and translations.
"""
fromarea: bool
""" If true, then the exact area of the place is known. Without area
information, Nominatim has to make an educated guess if an address
belongs to one place or another.
"""
isaddress: bool
""" If true, this place should be considered for the final address display.
Nominatim will sometimes include more than one candidate for
the address in the list when it cannot reliably determine where the
place belongs. It will consider names of all candidates when searching
but when displaying the result, only the most likely candidate should
be shown.
"""
rank_address: int
""" [Address rank](../customize/Ranking.md#address-rank) of the place.
"""
distance: float
""" Distance in degrees between the result place and this address part.
"""
place_id: Optional[int] = None
""" Internal ID of the place.
"""
osm_object: Optional[Tuple[str, int]] = None
""" OSM type and ID of the place, if such an object exists.
"""
extratags: Optional[Dict[str, str]] = None
""" Any extra information available about the place. This is a dictionary
that usually contains OSM tag key-value pairs.
"""
admin_level: Optional[int] = None
""" The administrative level of a boundary as tagged in the input data.
This field is only meaningful for places of the category
(boundary, administrative).
"""
local_name: Optional[str] = None
""" Place holder for localization of this address part. See
[Localization](#localization) below.
"""
class AddressLines(List[AddressLine]):
""" Sequence of address lines order in descending order by their rank.
"""
def localize(self, locales: Locales) -> List[str]:
""" Set the local name of address parts according to the chosen
locale. Return the list of local names without duplicates.
Only address parts that are marked as isaddress are localized
and returned.
"""
label_parts: List[str] = []
for line in self:
if line.isaddress and line.names:
line.local_name = locales.display_name(line.names)
if not label_parts or label_parts[-1] != line.local_name:
label_parts.append(line.local_name)
return label_parts
@dataclasses.dataclass
class WordInfo:
""" Each entry in the list of search terms contains the
following detailed information.
"""
word_id: int
""" Internal identifier for the word.
"""
word_token: str
""" Normalised and transliterated form of the word.
This form is used for searching.
"""
word: Optional[str] = None
""" Untransliterated form, if available.
"""
WordInfos = Sequence[WordInfo]
@dataclasses.dataclass
class BaseResult:
""" Data class collecting information common to all
types of search results.
"""
source_table: SourceTable
category: Tuple[str, str]
centroid: Point
place_id : Optional[int] = None
osm_object: Optional[Tuple[str, int]] = None
parent_place_id: Optional[int] = None
linked_place_id: Optional[int] = None
admin_level: int = 15
locale_name: Optional[str] = None
display_name: Optional[str] = None
names: Optional[Dict[str, str]] = None
address: Optional[Dict[str, str]] = None
extratags: Optional[Dict[str, str]] = None
housenumber: Optional[str] = None
postcode: Optional[str] = None
wikipedia: Optional[str] = None
rank_address: int = 30
rank_search: int = 30
importance: Optional[float] = None
country_code: Optional[str] = None
address_rows: Optional[AddressLines] = None
linked_rows: Optional[AddressLines] = None
parented_rows: Optional[AddressLines] = None
name_keywords: Optional[WordInfos] = None
address_keywords: Optional[WordInfos] = None
geometry: Dict[str, str] = dataclasses.field(default_factory=dict)
@property
def lat(self) -> float:
""" Get the latitude (or y) of the center point of the place.
"""
return self.centroid[1]
@property
def lon(self) -> float:
""" Get the longitude (or x) of the center point of the place.
"""
return self.centroid[0]
def calculated_importance(self) -> float:
""" Get a valid importance value. This is either the stored importance
of the value or an artificial value computed from the place's
search rank.
"""
return self.importance or (0.40001 - (self.rank_search/75.0))
def localize(self, locales: Locales) -> None:
""" Fill the locale_name and the display_name field for the
place and, if available, its address information.
"""
self.locale_name = locales.display_name(self.names)
if self.address_rows:
self.display_name = ', '.join(self.address_rows.localize(locales))
else:
self.display_name = self.locale_name
BaseResultT = TypeVar('BaseResultT', bound=BaseResult)
@dataclasses.dataclass
class DetailedResult(BaseResult):
""" A search result with more internal information from the database
added.
"""
indexed_date: Optional[dt.datetime] = None
@dataclasses.dataclass
class ReverseResult(BaseResult):
""" A search result for reverse geocoding.
"""
distance: Optional[float] = None
bbox: Optional[Bbox] = None
class ReverseResults(List[ReverseResult]):
""" Sequence of reverse lookup results ordered by distance.
May be empty when no result was found.
"""
@dataclasses.dataclass
class SearchResult(BaseResult):
""" A search result for forward geocoding.
"""
bbox: Optional[Bbox] = None
accuracy: float = 0.0
@property
def ranking(self) -> float:
""" Return the ranking, a combined measure of accuracy and importance.
"""
return (self.accuracy if self.accuracy is not None else 1) \
- self.calculated_importance()
class SearchResults(List[SearchResult]):
""" Sequence of forward lookup results ordered by relevance.
May be empty when no result was found.
"""
def _filter_geometries(row: SaRow) -> Dict[str, str]:
return {k[9:]: v for k, v in row._mapping.items() # pylint: disable=W0212
if k.startswith('geometry_')}
def create_from_placex_row(row: Optional[SaRow],
class_type: Type[BaseResultT]) -> Optional[BaseResultT]:
""" Construct a new result and add the data from the result row
from the placex table. 'class_type' defines the type of result
to return. Returns None if the row is None.
"""
if row is None:
return None
return class_type(source_table=SourceTable.PLACEX,
place_id=row.place_id,
osm_object=(row.osm_type, row.osm_id),
category=(row.class_, row.type),
parent_place_id = row.parent_place_id,
linked_place_id = getattr(row, 'linked_place_id', None),
admin_level = getattr(row, 'admin_level', 15),
names=_mingle_name_tags(row.name),
address=row.address,
extratags=row.extratags,
housenumber=row.housenumber,
postcode=row.postcode,
wikipedia=row.wikipedia,
rank_address=row.rank_address,
rank_search=row.rank_search,
importance=row.importance,
country_code=row.country_code,
centroid=Point.from_wkb(row.centroid),
geometry=_filter_geometries(row))
def create_from_osmline_row(row: Optional[SaRow],
class_type: Type[BaseResultT]) -> Optional[BaseResultT]:
""" Construct a new result and add the data from the result row
from the address interpolation table osmline. 'class_type' defines
the type of result to return. Returns None if the row is None.
If the row contains a housenumber, then the housenumber is filled out.
Otherwise the result contains the interpolation information in extratags.
"""
if row is None:
return None
hnr = getattr(row, 'housenumber', None)
res = class_type(source_table=SourceTable.OSMLINE,
place_id=row.place_id,
parent_place_id = row.parent_place_id,
osm_object=('W', row.osm_id),
category=('place', 'houses' if hnr is None else 'house'),
address=row.address,
postcode=row.postcode,
country_code=row.country_code,
centroid=Point.from_wkb(row.centroid),
geometry=_filter_geometries(row))
if hnr is None:
res.extratags = {'startnumber': str(row.startnumber),
'endnumber': str(row.endnumber),
'step': str(row.step)}
else:
res.housenumber = str(hnr)
return res
def create_from_tiger_row(row: Optional[SaRow],
class_type: Type[BaseResultT],
osm_type: Optional[str] = None,
osm_id: Optional[int] = None) -> Optional[BaseResultT]:
""" Construct a new result and add the data from the result row
from the Tiger data interpolation table. 'class_type' defines
the type of result to return. Returns None if the row is None.
If the row contains a housenumber, then the housenumber is filled out.
Otherwise the result contains the interpolation information in extratags.
"""
if row is None:
return None
hnr = getattr(row, 'housenumber', None)
res = class_type(source_table=SourceTable.TIGER,
place_id=row.place_id,
parent_place_id = row.parent_place_id,
osm_object=(osm_type or row.osm_type, osm_id or row.osm_id),
category=('place', 'houses' if hnr is None else 'house'),
postcode=row.postcode,
country_code='us',
centroid=Point.from_wkb(row.centroid),
geometry=_filter_geometries(row))
if hnr is None:
res.extratags = {'startnumber': str(row.startnumber),
'endnumber': str(row.endnumber),
'step': str(row.step)}
else:
res.housenumber = str(hnr)
return res
def create_from_postcode_row(row: Optional[SaRow],
class_type: Type[BaseResultT]) -> Optional[BaseResultT]:
""" Construct a new result and add the data from the result row
from the postcode table. 'class_type' defines
the type of result to return. Returns None if the row is None.
"""
if row is None:
return None
return class_type(source_table=SourceTable.POSTCODE,
place_id=row.place_id,
parent_place_id = row.parent_place_id,
category=('place', 'postcode'),
names={'ref': row.postcode},
rank_search=row.rank_search,
rank_address=row.rank_address,
country_code=row.country_code,
centroid=Point.from_wkb(row.centroid),
geometry=_filter_geometries(row))
def create_from_country_row(row: Optional[SaRow],
class_type: Type[BaseResultT]) -> Optional[BaseResultT]:
""" Construct a new result and add the data from the result row
from the fallback country tables. 'class_type' defines
the type of result to return. Returns None if the row is None.
"""
if row is None:
return None
return class_type(source_table=SourceTable.COUNTRY,
category=('place', 'country'),
centroid=Point.from_wkb(row.centroid),
names=row.name,
rank_address=4, rank_search=4,
country_code=row.country_code,
geometry=_filter_geometries(row))
async def add_result_details(conn: SearchConnection, results: List[BaseResultT],
details: LookupDetails) -> None:
""" Retrieve more details from the database according to the
parameters specified in 'details'.
"""
if results:
log().section('Query details for result')
if details.address_details:
log().comment('Query address details')
await complete_address_details(conn, results)
if details.linked_places:
log().comment('Query linked places')
for result in results:
await complete_linked_places(conn, result)
if details.parented_places:
log().comment('Query parent places')
for result in results:
await complete_parented_places(conn, result)
if details.keywords:
log().comment('Query keywords')
for result in results:
await complete_keywords(conn, result)
for result in results:
result.localize(details.locales)
def _result_row_to_address_row(row: SaRow, isaddress: Optional[bool] = None) -> AddressLine:
""" Create a new AddressLine from the results of a database query.
"""
extratags: Dict[str, str] = getattr(row, 'extratags', {}) or {}
if 'linked_place' in extratags:
extratags['place'] = extratags['linked_place']
names = _mingle_name_tags(row.name) or {}
if getattr(row, 'housenumber', None) is not None:
names['housenumber'] = row.housenumber
if isaddress is None:
isaddress = getattr(row, 'isaddress', True)
return AddressLine(place_id=row.place_id,
osm_object=None if row.osm_type is None else (row.osm_type, row.osm_id),
category=(getattr(row, 'class'), row.type),
names=names,
extratags=extratags,
admin_level=row.admin_level,
fromarea=row.fromarea,
isaddress=isaddress,
rank_address=row.rank_address,
distance=row.distance)
def _get_address_lookup_id(result: BaseResultT) -> int:
assert result.place_id
if result.source_table != SourceTable.PLACEX or result.rank_search > 27:
return result.parent_place_id or result.place_id
return result.linked_place_id or result.place_id
async def _finalize_entry(conn: SearchConnection, result: BaseResultT) -> None:
assert result.address_rows is not None
if result.category[0] not in ('boundary', 'place')\
or result.category[1] not in ('postal_code', 'postcode'):
postcode = result.postcode
if not postcode and result.address:
postcode = result.address.get('postcode')
if postcode and ',' not in postcode and ';' not in postcode:
result.address_rows.append(AddressLine(
category=('place', 'postcode'),
names={'ref': postcode},
fromarea=False, isaddress=True, rank_address=5,
distance=0.0))
if result.country_code:
async def _get_country_names() -> Optional[Dict[str, str]]:
t = conn.t.country_name
sql = sa.select(t.c.name, t.c.derived_name)\
.where(t.c.country_code == result.country_code)
for cres in await conn.execute(sql):
names = cast(Dict[str, str], cres[0])
if cres[1]:
names.update(cast(Dict[str, str], cres[1]))
return names
return None
country_names = await conn.get_cached_value('COUNTRY_NAME',
result.country_code,
_get_country_names)
if country_names:
result.address_rows.append(AddressLine(
category=('place', 'country'),
names=country_names,
fromarea=False, isaddress=True, rank_address=4,
distance=0.0))
result.address_rows.append(AddressLine(
category=('place', 'country_code'),
names={'ref': result.country_code}, extratags = {},
fromarea=True, isaddress=False, rank_address=4,
distance=0.0))
def _setup_address_details(result: BaseResultT) -> None:
""" Retrieve information about places that make up the address of the result.
"""
result.address_rows = AddressLines()
if result.names:
result.address_rows.append(AddressLine(
place_id=result.place_id,
osm_object=result.osm_object,
category=result.category,
names=result.names,
extratags=result.extratags or {},
admin_level=result.admin_level,
fromarea=True, isaddress=True,
rank_address=result.rank_address, distance=0.0))
if result.source_table == SourceTable.PLACEX and result.address:
housenumber = result.address.get('housenumber')\
or result.address.get('streetnumber')\
or result.address.get('conscriptionnumber')
elif result.housenumber:
housenumber = result.housenumber
else:
housenumber = None
if housenumber:
result.address_rows.append(AddressLine(
category=('place', 'house_number'),
names={'ref': housenumber},
fromarea=True, isaddress=True, rank_address=28, distance=0))
if result.address and '_unlisted_place' in result.address:
result.address_rows.append(AddressLine(
category=('place', 'locality'),
names={'name': result.address['_unlisted_place']},
fromarea=False, isaddress=True, rank_address=25, distance=0))
async def complete_address_details(conn: SearchConnection, results: List[BaseResultT]) -> None:
""" Retrieve information about places that make up the address of the result.
"""
for result in results:
_setup_address_details(result)
### Lookup entries from place_address line
lookup_ids = [{'pid': r.place_id,
'lid': _get_address_lookup_id(r),
'names': list(r.address.values()) if r.address else [],
'c': ('SRID=4326;' + r.centroid.to_wkt()) if r.centroid else '' }
for r in results if r.place_id]
if not lookup_ids:
return
ltab = sa.func.JsonArrayEach(sa.type_coerce(lookup_ids, sa.JSON))\
.table_valued(sa.column('value', type_=sa.JSON))
t = conn.t.placex
taddr = conn.t.addressline
sql = sa.select(ltab.c.value['pid'].as_integer().label('src_place_id'),
t.c.place_id, t.c.osm_type, t.c.osm_id, t.c.name,
t.c.class_, t.c.type, t.c.extratags,
t.c.admin_level, taddr.c.fromarea,
sa.case((t.c.rank_address == 11, 5),
else_=t.c.rank_address).label('rank_address'),
taddr.c.distance, t.c.country_code, t.c.postcode)\
.join(taddr, sa.or_(taddr.c.place_id == ltab.c.value['pid'].as_integer(),
taddr.c.place_id == ltab.c.value['lid'].as_integer()))\
.join(t, taddr.c.address_place_id == t.c.place_id)\
.order_by('src_place_id')\
.order_by(sa.column('rank_address').desc())\
.order_by((taddr.c.place_id == ltab.c.value['pid'].as_integer()).desc())\
.order_by(sa.case((sa.func.CrosscheckNames(t.c.name, ltab.c.value['names']), 2),
(taddr.c.isaddress, 0),
(sa.and_(taddr.c.fromarea,
t.c.geometry.ST_Contains(
sa.func.ST_GeomFromEWKT(
ltab.c.value['c'].as_string()))), 1),
else_=-1).desc())\
.order_by(taddr.c.fromarea.desc())\
.order_by(taddr.c.distance.desc())\
.order_by(t.c.rank_search.desc())
current_result = None
current_rank_address = -1
for row in await conn.execute(sql):
if current_result is None or row.src_place_id != current_result.place_id:
current_result = next((r for r in results if r.place_id == row.src_place_id), None)
assert current_result is not None
current_rank_address = -1
location_isaddress = row.rank_address != current_rank_address
if current_result.country_code is None and row.country_code:
current_result.country_code = row.country_code
if row.type in ('postcode', 'postal_code') and location_isaddress:
if not row.fromarea or \
(current_result.address and 'postcode' in current_result.address):
location_isaddress = False
else:
current_result.postcode = None
assert current_result.address_rows is not None
current_result.address_rows.append(_result_row_to_address_row(row, location_isaddress))
current_rank_address = row.rank_address
for result in results:
await _finalize_entry(conn, result)
### Finally add the record for the parent entry where necessary.
parent_lookup_ids = list(filter(lambda e: e['pid'] != e['lid'], lookup_ids))
if parent_lookup_ids:
ltab = sa.func.JsonArrayEach(sa.type_coerce(parent_lookup_ids, sa.JSON))\
.table_valued(sa.column('value', type_=sa.JSON))
sql = sa.select(ltab.c.value['pid'].as_integer().label('src_place_id'),
t.c.place_id, t.c.osm_type, t.c.osm_id, t.c.name,
t.c.class_, t.c.type, t.c.extratags,
t.c.admin_level,
t.c.rank_address)\
.where(t.c.place_id == ltab.c.value['lid'].as_integer())
for row in await conn.execute(sql):
current_result = next((r for r in results if r.place_id == row.src_place_id), None)
assert current_result is not None
assert current_result.address_rows is not None
current_result.address_rows.append(AddressLine(
place_id=row.place_id,
osm_object=(row.osm_type, row.osm_id),
category=(row.class_, row.type),
names=row.name, extratags=row.extratags or {},
admin_level=row.admin_level,
fromarea=True, isaddress=True,
rank_address=row.rank_address, distance=0.0))
### Now sort everything
def mk_sort_key(place_id: Optional[int]) -> Callable[[AddressLine], Tuple[bool, int, bool]]:
return lambda a: (a.place_id != place_id, -a.rank_address, a.isaddress)
for result in results:
assert result.address_rows is not None
result.address_rows.sort(key=mk_sort_key(result.place_id))
def _placex_select_address_row(conn: SearchConnection,
centroid: Point) -> SaSelect:
t = conn.t.placex
return sa.select(t.c.place_id, t.c.osm_type, t.c.osm_id, t.c.name,
t.c.class_.label('class'), t.c.type,
t.c.admin_level, t.c.housenumber,
t.c.geometry.is_area().label('fromarea'),
t.c.rank_address,
t.c.geometry.distance_spheroid(
sa.bindparam('centroid', value=centroid, type_=Geometry)).label('distance'))
async def complete_linked_places(conn: SearchConnection, result: BaseResult) -> None:
""" Retrieve information about places that link to the result.
"""
result.linked_rows = AddressLines()
if result.source_table != SourceTable.PLACEX:
return
sql = _placex_select_address_row(conn, result.centroid)\
.where(conn.t.placex.c.linked_place_id == result.place_id)
for row in await conn.execute(sql):
result.linked_rows.append(_result_row_to_address_row(row))
async def complete_keywords(conn: SearchConnection, result: BaseResult) -> None:
""" Retrieve information about the search terms used for this place.
Requires that the query analyzer was initialised to get access to
the word table.
"""
t = conn.t.search_name
sql = sa.select(t.c.name_vector, t.c.nameaddress_vector)\
.where(t.c.place_id == result.place_id)
result.name_keywords = []
result.address_keywords = []
t = conn.t.meta.tables['word']
sel = sa.select(t.c.word_id, t.c.word_token, t.c.word)
for name_tokens, address_tokens in await conn.execute(sql):
for row in await conn.execute(sel.where(t.c.word_id.in_(name_tokens))):
result.name_keywords.append(WordInfo(*row))
for row in await conn.execute(sel.where(t.c.word_id.in_(address_tokens))):
result.address_keywords.append(WordInfo(*row))
async def complete_parented_places(conn: SearchConnection, result: BaseResult) -> None:
""" Retrieve information about places that the result provides the
address for.
"""
result.parented_rows = AddressLines()
if result.source_table != SourceTable.PLACEX:
return
sql = _placex_select_address_row(conn, result.centroid)\
.where(conn.t.placex.c.parent_place_id == result.place_id)\
.where(conn.t.placex.c.rank_search == 30)
for row in await conn.execute(sql):
result.parented_rows.append(_result_row_to_address_row(row))

View File

@@ -0,0 +1,603 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Implementation of reverse geocoding.
"""
from typing import Optional, List, Callable, Type, Tuple, Dict, Any, cast, Union
import functools
import sqlalchemy as sa
from nominatim_core.typing import SaColumn, SaSelect, SaFromClause, SaLabel, SaRow,\
SaBind, SaLambdaSelect
from nominatim_core.db.sqlalchemy_types import Geometry
from .connection import SearchConnection
from . import results as nres
from .logging import log
from .types import AnyPoint, DataLayer, ReverseDetails, GeometryFormat, Bbox
# In SQLAlchemy expression which compare with NULL need to be expressed with
# the equal sign.
# pylint: disable=singleton-comparison
RowFunc = Callable[[Optional[SaRow], Type[nres.ReverseResult]], Optional[nres.ReverseResult]]
WKT_PARAM: SaBind = sa.bindparam('wkt', type_=Geometry)
MAX_RANK_PARAM: SaBind = sa.bindparam('max_rank')
def no_index(expr: SaColumn) -> SaColumn:
""" Wrap the given expression, so that the query planner will
refrain from using the expression for index lookup.
"""
return sa.func.coalesce(sa.null(), expr) # pylint: disable=not-callable
def _select_from_placex(t: SaFromClause, use_wkt: bool = True) -> SaSelect:
""" Create a select statement with the columns relevant for reverse
results.
"""
if not use_wkt:
distance = t.c.distance
centroid = t.c.centroid
else:
distance = t.c.geometry.ST_Distance(WKT_PARAM)
centroid = sa.case((t.c.geometry.is_line_like(), t.c.geometry.ST_ClosestPoint(WKT_PARAM)),
else_=t.c.centroid).label('centroid')
return sa.select(t.c.place_id, t.c.osm_type, t.c.osm_id, t.c.name,
t.c.class_, t.c.type,
t.c.address, t.c.extratags,
t.c.housenumber, t.c.postcode, t.c.country_code,
t.c.importance, t.c.wikipedia,
t.c.parent_place_id, t.c.rank_address, t.c.rank_search,
centroid,
t.c.linked_place_id, t.c.admin_level,
distance.label('distance'),
t.c.geometry.ST_Expand(0).label('bbox'))
def _interpolated_housenumber(table: SaFromClause) -> SaLabel:
return sa.cast(table.c.startnumber
+ sa.func.round(((table.c.endnumber - table.c.startnumber) * table.c.position)
/ table.c.step) * table.c.step,
sa.Integer).label('housenumber')
def _interpolated_position(table: SaFromClause) -> SaLabel:
fac = sa.cast(table.c.step, sa.Float) / (table.c.endnumber - table.c.startnumber)
rounded_pos = sa.func.round(table.c.position / fac) * fac
return sa.case(
(table.c.endnumber == table.c.startnumber, table.c.linegeo.ST_Centroid()),
else_=table.c.linegeo.ST_LineInterpolatePoint(rounded_pos)).label('centroid')
def _locate_interpolation(table: SaFromClause) -> SaLabel:
""" Given a position, locate the closest point on the line.
"""
return sa.case((table.c.linegeo.is_line_like(),
table.c.linegeo.ST_LineLocatePoint(WKT_PARAM)),
else_=0).label('position')
def _get_closest(*rows: Optional[SaRow]) -> Optional[SaRow]:
return min(rows, key=lambda row: 1000 if row is None else row.distance)
class ReverseGeocoder:
""" Class implementing the logic for looking up a place from a
coordinate.
"""
def __init__(self, conn: SearchConnection, params: ReverseDetails,
restrict_to_country_areas: bool = False) -> None:
self.conn = conn
self.params = params
self.restrict_to_country_areas = restrict_to_country_areas
self.bind_params: Dict[str, Any] = {'max_rank': params.max_rank}
@property
def max_rank(self) -> int:
""" Return the maximum configured rank.
"""
return self.params.max_rank
def has_geometries(self) -> bool:
""" Check if any geometries are requested.
"""
return bool(self.params.geometry_output)
def layer_enabled(self, *layer: DataLayer) -> bool:
""" Return true when any of the given layer types are requested.
"""
return any(self.params.layers & l for l in layer)
def layer_disabled(self, *layer: DataLayer) -> bool:
""" Return true when none of the given layer types is requested.
"""
return not any(self.params.layers & l for l in layer)
def has_feature_layers(self) -> bool:
""" Return true if any layer other than ADDRESS or POI is requested.
"""
return self.layer_enabled(DataLayer.RAILWAY, DataLayer.MANMADE, DataLayer.NATURAL)
def _add_geometry_columns(self, sql: SaLambdaSelect, col: SaColumn) -> SaSelect:
out = []
if self.params.geometry_simplification > 0.0:
col = sa.func.ST_SimplifyPreserveTopology(col, self.params.geometry_simplification)
if self.params.geometry_output & GeometryFormat.GEOJSON:
out.append(sa.func.ST_AsGeoJSON(col, 7).label('geometry_geojson'))
if self.params.geometry_output & GeometryFormat.TEXT:
out.append(sa.func.ST_AsText(col).label('geometry_text'))
if self.params.geometry_output & GeometryFormat.KML:
out.append(sa.func.ST_AsKML(col, 7).label('geometry_kml'))
if self.params.geometry_output & GeometryFormat.SVG:
out.append(sa.func.ST_AsSVG(col, 0, 7).label('geometry_svg'))
return sql.add_columns(*out)
def _filter_by_layer(self, table: SaFromClause) -> SaColumn:
if self.layer_enabled(DataLayer.MANMADE):
exclude = []
if self.layer_disabled(DataLayer.RAILWAY):
exclude.append('railway')
if self.layer_disabled(DataLayer.NATURAL):
exclude.extend(('natural', 'water', 'waterway'))
return table.c.class_.not_in(tuple(exclude))
include = []
if self.layer_enabled(DataLayer.RAILWAY):
include.append('railway')
if self.layer_enabled(DataLayer.NATURAL):
include.extend(('natural', 'water', 'waterway'))
return table.c.class_.in_(tuple(include))
async def _find_closest_street_or_poi(self, distance: float) -> Optional[SaRow]:
""" Look up the closest rank 26+ place in the database, which
is closer than the given distance.
"""
t = self.conn.t.placex
# PostgreSQL must not get the distance as a parameter because
# there is a danger it won't be able to properly estimate index use
# when used with prepared statements
diststr = sa.text(f"{distance}")
sql: SaLambdaSelect = sa.lambda_stmt(lambda: _select_from_placex(t)
.where(t.c.geometry.within_distance(WKT_PARAM, diststr))
.where(t.c.indexed_status == 0)
.where(t.c.linked_place_id == None)
.where(sa.or_(sa.not_(t.c.geometry.is_area()),
t.c.centroid.ST_Distance(WKT_PARAM) < diststr))
.order_by('distance')
.limit(2))
if self.has_geometries():
sql = self._add_geometry_columns(sql, t.c.geometry)
restrict: List[Union[SaColumn, Callable[[], SaColumn]]] = []
if self.layer_enabled(DataLayer.ADDRESS):
max_rank = min(29, self.max_rank)
restrict.append(lambda: no_index(t.c.rank_address).between(26, max_rank))
if self.max_rank == 30:
restrict.append(lambda: sa.func.IsAddressPoint(t))
if self.layer_enabled(DataLayer.POI) and self.max_rank == 30:
restrict.append(lambda: sa.and_(no_index(t.c.rank_search) == 30,
t.c.class_.not_in(('place', 'building')),
sa.not_(t.c.geometry.is_line_like())))
if self.has_feature_layers():
restrict.append(sa.and_(no_index(t.c.rank_search).between(26, MAX_RANK_PARAM),
no_index(t.c.rank_address) == 0,
self._filter_by_layer(t)))
if not restrict:
return None
sql = sql.where(sa.or_(*restrict))
# If the closest object is inside an area, then check if there is a
# POI node nearby and return that.
prev_row = None
for row in await self.conn.execute(sql, self.bind_params):
if prev_row is None:
if row.rank_search <= 27 or row.osm_type == 'N' or row.distance > 0:
return row
prev_row = row
else:
if row.rank_search > 27 and row.osm_type == 'N'\
and row.distance < 0.0001:
return row
return prev_row
async def _find_housenumber_for_street(self, parent_place_id: int) -> Optional[SaRow]:
t = self.conn.t.placex
def _base_query() -> SaSelect:
return _select_from_placex(t)\
.where(t.c.geometry.within_distance(WKT_PARAM, 0.001))\
.where(t.c.parent_place_id == parent_place_id)\
.where(sa.func.IsAddressPoint(t))\
.where(t.c.indexed_status == 0)\
.where(t.c.linked_place_id == None)\
.order_by('distance')\
.limit(1)
sql: SaLambdaSelect
if self.has_geometries():
sql = self._add_geometry_columns(_base_query(), t.c.geometry)
else:
sql = sa.lambda_stmt(_base_query)
return (await self.conn.execute(sql, self.bind_params)).one_or_none()
async def _find_interpolation_for_street(self, parent_place_id: Optional[int],
distance: float) -> Optional[SaRow]:
t = self.conn.t.osmline
sql = sa.select(t,
t.c.linegeo.ST_Distance(WKT_PARAM).label('distance'),
_locate_interpolation(t))\
.where(t.c.linegeo.within_distance(WKT_PARAM, distance))\
.where(t.c.startnumber != None)\
.order_by('distance')\
.limit(1)
if parent_place_id is not None:
sql = sql.where(t.c.parent_place_id == parent_place_id)
inner = sql.subquery('ipol')
sql = sa.select(inner.c.place_id, inner.c.osm_id,
inner.c.parent_place_id, inner.c.address,
_interpolated_housenumber(inner),
_interpolated_position(inner),
inner.c.postcode, inner.c.country_code,
inner.c.distance)
if self.has_geometries():
sub = sql.subquery('geom')
sql = self._add_geometry_columns(sa.select(sub), sub.c.centroid)
return (await self.conn.execute(sql, self.bind_params)).one_or_none()
async def _find_tiger_number_for_street(self, parent_place_id: int) -> Optional[SaRow]:
t = self.conn.t.tiger
def _base_query() -> SaSelect:
inner = sa.select(t,
t.c.linegeo.ST_Distance(WKT_PARAM).label('distance'),
_locate_interpolation(t))\
.where(t.c.linegeo.within_distance(WKT_PARAM, 0.001))\
.where(t.c.parent_place_id == parent_place_id)\
.order_by('distance')\
.limit(1)\
.subquery('tiger')
return sa.select(inner.c.place_id,
inner.c.parent_place_id,
_interpolated_housenumber(inner),
_interpolated_position(inner),
inner.c.postcode,
inner.c.distance)
sql: SaLambdaSelect
if self.has_geometries():
sub = _base_query().subquery('geom')
sql = self._add_geometry_columns(sa.select(sub), sub.c.centroid)
else:
sql = sa.lambda_stmt(_base_query)
return (await self.conn.execute(sql, self.bind_params)).one_or_none()
async def lookup_street_poi(self) -> Tuple[Optional[SaRow], RowFunc]:
""" Find a street or POI/address for the given WKT point.
"""
log().section('Reverse lookup on street/address level')
distance = 0.006
parent_place_id = None
row = await self._find_closest_street_or_poi(distance)
row_func: RowFunc = nres.create_from_placex_row
log().var_dump('Result (street/building)', row)
# If the closest result was a street, but an address was requested,
# check for a housenumber nearby which is part of the street.
if row is not None:
if self.max_rank > 27 \
and self.layer_enabled(DataLayer.ADDRESS) \
and row.rank_address <= 27:
distance = 0.001
parent_place_id = row.place_id
log().comment('Find housenumber for street')
addr_row = await self._find_housenumber_for_street(parent_place_id)
log().var_dump('Result (street housenumber)', addr_row)
if addr_row is not None:
row = addr_row
row_func = nres.create_from_placex_row
distance = addr_row.distance
elif row.country_code == 'us' and parent_place_id is not None:
log().comment('Find TIGER housenumber for street')
addr_row = await self._find_tiger_number_for_street(parent_place_id)
log().var_dump('Result (street Tiger housenumber)', addr_row)
if addr_row is not None:
row_func = cast(RowFunc,
functools.partial(nres.create_from_tiger_row,
osm_type=row.osm_type,
osm_id=row.osm_id))
row = addr_row
else:
distance = row.distance
# Check for an interpolation that is either closer than our result
# or belongs to a close street found.
if self.max_rank > 27 and self.layer_enabled(DataLayer.ADDRESS):
log().comment('Find interpolation for street')
addr_row = await self._find_interpolation_for_street(parent_place_id,
distance)
log().var_dump('Result (street interpolation)', addr_row)
if addr_row is not None:
row = addr_row
row_func = nres.create_from_osmline_row
return row, row_func
async def _lookup_area_address(self) -> Optional[SaRow]:
""" Lookup large addressable areas for the given WKT point.
"""
log().comment('Reverse lookup by larger address area features')
t = self.conn.t.placex
def _base_query() -> SaSelect:
# The inner SQL brings results in the right order, so that
# later only a minimum of results needs to be checked with ST_Contains.
inner = sa.select(t, sa.literal(0.0).label('distance'))\
.where(t.c.rank_search.between(5, MAX_RANK_PARAM))\
.where(t.c.geometry.intersects(WKT_PARAM))\
.where(sa.func.PlacexGeometryReverseLookuppolygon())\
.order_by(sa.desc(t.c.rank_search))\
.limit(50)\
.subquery('area')
return _select_from_placex(inner, False)\
.where(inner.c.geometry.ST_Contains(WKT_PARAM))\
.order_by(sa.desc(inner.c.rank_search))\
.limit(1)
sql: SaLambdaSelect = sa.lambda_stmt(_base_query)
if self.has_geometries():
sql = self._add_geometry_columns(sql, sa.literal_column('area.geometry'))
address_row = (await self.conn.execute(sql, self.bind_params)).one_or_none()
log().var_dump('Result (area)', address_row)
if address_row is not None and address_row.rank_search < self.max_rank:
log().comment('Search for better matching place nodes inside the area')
address_rank = address_row.rank_search
address_id = address_row.place_id
def _place_inside_area_query() -> SaSelect:
inner = \
sa.select(t,
t.c.geometry.ST_Distance(WKT_PARAM).label('distance'))\
.where(t.c.rank_search > address_rank)\
.where(t.c.rank_search <= MAX_RANK_PARAM)\
.where(t.c.indexed_status == 0)\
.where(sa.func.IntersectsReverseDistance(t, WKT_PARAM))\
.order_by(sa.desc(t.c.rank_search))\
.limit(50)\
.subquery('places')
touter = t.alias('outer')
return _select_from_placex(inner, False)\
.join(touter, touter.c.geometry.ST_Contains(inner.c.geometry))\
.where(touter.c.place_id == address_id)\
.where(sa.func.IsBelowReverseDistance(inner.c.distance, inner.c.rank_search))\
.order_by(sa.desc(inner.c.rank_search), inner.c.distance)\
.limit(1)
if self.has_geometries():
sql = self._add_geometry_columns(_place_inside_area_query(),
sa.literal_column('places.geometry'))
else:
sql = sa.lambda_stmt(_place_inside_area_query)
place_address_row = (await self.conn.execute(sql, self.bind_params)).one_or_none()
log().var_dump('Result (place node)', place_address_row)
if place_address_row is not None:
return place_address_row
return address_row
async def _lookup_area_others(self) -> Optional[SaRow]:
t = self.conn.t.placex
inner = sa.select(t, t.c.geometry.ST_Distance(WKT_PARAM).label('distance'))\
.where(t.c.rank_address == 0)\
.where(t.c.rank_search.between(5, MAX_RANK_PARAM))\
.where(t.c.name != None)\
.where(t.c.indexed_status == 0)\
.where(t.c.linked_place_id == None)\
.where(self._filter_by_layer(t))\
.where(t.c.geometry.intersects(sa.func.ST_Expand(WKT_PARAM, 0.007)))\
.order_by(sa.desc(t.c.rank_search))\
.order_by('distance')\
.limit(50)\
.subquery()
sql = _select_from_placex(inner, False)\
.where(sa.or_(sa.not_(inner.c.geometry.is_area()),
inner.c.geometry.ST_Contains(WKT_PARAM)))\
.order_by(sa.desc(inner.c.rank_search), inner.c.distance)\
.limit(1)
if self.has_geometries():
sql = self._add_geometry_columns(sql, inner.c.geometry)
row = (await self.conn.execute(sql, self.bind_params)).one_or_none()
log().var_dump('Result (non-address feature)', row)
return row
async def lookup_area(self) -> Optional[SaRow]:
""" Lookup large areas for the current search.
"""
log().section('Reverse lookup by larger area features')
if self.layer_enabled(DataLayer.ADDRESS):
address_row = await self._lookup_area_address()
else:
address_row = None
if self.has_feature_layers():
other_row = await self._lookup_area_others()
else:
other_row = None
return _get_closest(address_row, other_row)
async def lookup_country_codes(self) -> List[str]:
""" Lookup the country for the current search.
"""
log().section('Reverse lookup by country code')
t = self.conn.t.country_grid
sql = sa.select(t.c.country_code).distinct()\
.where(t.c.geometry.ST_Contains(WKT_PARAM))
ccodes = [cast(str, r[0]) for r in await self.conn.execute(sql, self.bind_params)]
log().var_dump('Country codes', ccodes)
return ccodes
async def lookup_country(self, ccodes: List[str]) -> Optional[SaRow]:
""" Lookup the country for the current search.
"""
if not ccodes:
ccodes = await self.lookup_country_codes()
if not ccodes:
return None
t = self.conn.t.placex
if self.max_rank > 4:
log().comment('Search for place nodes in country')
def _base_query() -> SaSelect:
inner = \
sa.select(t,
t.c.geometry.ST_Distance(WKT_PARAM).label('distance'))\
.where(t.c.rank_search > 4)\
.where(t.c.rank_search <= MAX_RANK_PARAM)\
.where(t.c.indexed_status == 0)\
.where(t.c.country_code.in_(ccodes))\
.where(sa.func.IntersectsReverseDistance(t, WKT_PARAM))\
.order_by(sa.desc(t.c.rank_search))\
.limit(50)\
.subquery('area')
return _select_from_placex(inner, False)\
.where(sa.func.IsBelowReverseDistance(inner.c.distance, inner.c.rank_search))\
.order_by(sa.desc(inner.c.rank_search), inner.c.distance)\
.limit(1)
sql: SaLambdaSelect
if self.has_geometries():
sql = self._add_geometry_columns(_base_query(),
sa.literal_column('area.geometry'))
else:
sql = sa.lambda_stmt(_base_query)
address_row = (await self.conn.execute(sql, self.bind_params)).one_or_none()
log().var_dump('Result (addressable place node)', address_row)
else:
address_row = None
if address_row is None:
# Still nothing, then return a country with the appropriate country code.
def _country_base_query() -> SaSelect:
return _select_from_placex(t)\
.where(t.c.country_code.in_(ccodes))\
.where(t.c.rank_address == 4)\
.where(t.c.rank_search == 4)\
.where(t.c.linked_place_id == None)\
.order_by('distance')\
.limit(1)
if self.has_geometries():
sql = self._add_geometry_columns(_country_base_query(), t.c.geometry)
else:
sql = sa.lambda_stmt(_country_base_query)
address_row = (await self.conn.execute(sql, self.bind_params)).one_or_none()
return address_row
async def lookup(self, coord: AnyPoint) -> Optional[nres.ReverseResult]:
""" Look up a single coordinate. Returns the place information,
if a place was found near the coordinates or None otherwise.
"""
log().function('reverse_lookup', coord=coord, params=self.params)
self.bind_params['wkt'] = f'POINT({coord[0]} {coord[1]})'
row: Optional[SaRow] = None
row_func: RowFunc = nres.create_from_placex_row
if self.max_rank >= 26:
row, tmp_row_func = await self.lookup_street_poi()
if row is not None:
row_func = tmp_row_func
if row is None:
if self.restrict_to_country_areas:
ccodes = await self.lookup_country_codes()
if not ccodes:
return None
else:
ccodes = []
if self.max_rank > 4:
row = await self.lookup_area()
if row is None and self.layer_enabled(DataLayer.ADDRESS):
row = await self.lookup_country(ccodes)
result = row_func(row, nres.ReverseResult)
if result is not None:
assert row is not None
result.distance = row.distance
if hasattr(row, 'bbox'):
result.bbox = Bbox.from_wkb(row.bbox)
await nres.add_result_details(self.conn, [result], self.params)
return result

View File

@@ -0,0 +1,15 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Module for forward search.
"""
# pylint: disable=useless-import-alias
from .geocoder import (ForwardGeocoder as ForwardGeocoder)
from .query import (Phrase as Phrase,
PhraseType as PhraseType)
from .query_analyzer_factory import (make_query_analyzer as make_query_analyzer)

View File

@@ -0,0 +1,459 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Conversion from token assignment to an abstract DB search.
"""
from typing import Optional, List, Tuple, Iterator, Dict
import heapq
from ..types import SearchDetails, DataLayer
from .query import QueryStruct, Token, TokenType, TokenRange, BreakType
from .token_assignment import TokenAssignment
from . import db_search_fields as dbf
from . import db_searches as dbs
from . import db_search_lookups as lookups
def wrap_near_search(categories: List[Tuple[str, str]],
search: dbs.AbstractSearch) -> dbs.NearSearch:
""" Create a new search that wraps the given search in a search
for near places of the given category.
"""
return dbs.NearSearch(penalty=search.penalty,
categories=dbf.WeightedCategories(categories,
[0.0] * len(categories)),
search=search)
def build_poi_search(category: List[Tuple[str, str]],
countries: Optional[List[str]]) -> dbs.PoiSearch:
""" Create a new search for places by the given category, possibly
constraint to the given countries.
"""
if countries:
ccs = dbf.WeightedStrings(countries, [0.0] * len(countries))
else:
ccs = dbf.WeightedStrings([], [])
class _PoiData(dbf.SearchData):
penalty = 0.0
qualifiers = dbf.WeightedCategories(category, [0.0] * len(category))
countries=ccs
return dbs.PoiSearch(_PoiData())
class SearchBuilder:
""" Build the abstract search queries from token assignments.
"""
def __init__(self, query: QueryStruct, details: SearchDetails) -> None:
self.query = query
self.details = details
@property
def configured_for_country(self) -> bool:
""" Return true if the search details are configured to
allow countries in the result.
"""
return self.details.min_rank <= 4 and self.details.max_rank >= 4 \
and self.details.layer_enabled(DataLayer.ADDRESS)
@property
def configured_for_postcode(self) -> bool:
""" Return true if the search details are configured to
allow postcodes in the result.
"""
return self.details.min_rank <= 5 and self.details.max_rank >= 11\
and self.details.layer_enabled(DataLayer.ADDRESS)
@property
def configured_for_housenumbers(self) -> bool:
""" Return true if the search details are configured to
allow addresses in the result.
"""
return self.details.max_rank >= 30 \
and self.details.layer_enabled(DataLayer.ADDRESS)
def build(self, assignment: TokenAssignment) -> Iterator[dbs.AbstractSearch]:
""" Yield all possible abstract searches for the given token assignment.
"""
sdata = self.get_search_data(assignment)
if sdata is None:
return
near_items = self.get_near_items(assignment)
if near_items is not None and not near_items:
return # impossible compbination of near items and category parameter
if assignment.name is None:
if near_items and not sdata.postcodes:
sdata.qualifiers = near_items
near_items = None
builder = self.build_poi_search(sdata)
elif assignment.housenumber:
hnr_tokens = self.query.get_tokens(assignment.housenumber,
TokenType.HOUSENUMBER)
builder = self.build_housenumber_search(sdata, hnr_tokens, assignment.address)
else:
builder = self.build_special_search(sdata, assignment.address,
bool(near_items))
else:
builder = self.build_name_search(sdata, assignment.name, assignment.address,
bool(near_items))
if near_items:
penalty = min(near_items.penalties)
near_items.penalties = [p - penalty for p in near_items.penalties]
for search in builder:
search_penalty = search.penalty
search.penalty = 0.0
yield dbs.NearSearch(penalty + assignment.penalty + search_penalty,
near_items, search)
else:
for search in builder:
search.penalty += assignment.penalty
yield search
def build_poi_search(self, sdata: dbf.SearchData) -> Iterator[dbs.AbstractSearch]:
""" Build abstract search query for a simple category search.
This kind of search requires an additional geographic constraint.
"""
if not sdata.housenumbers \
and ((self.details.viewbox and self.details.bounded_viewbox) or self.details.near):
yield dbs.PoiSearch(sdata)
def build_special_search(self, sdata: dbf.SearchData,
address: List[TokenRange],
is_category: bool) -> Iterator[dbs.AbstractSearch]:
""" Build abstract search queries for searches that do not involve
a named place.
"""
if sdata.qualifiers:
# No special searches over qualifiers supported.
return
if sdata.countries and not address and not sdata.postcodes \
and self.configured_for_country:
yield dbs.CountrySearch(sdata)
if sdata.postcodes and (is_category or self.configured_for_postcode):
penalty = 0.0 if sdata.countries else 0.1
if address:
sdata.lookups = [dbf.FieldLookup('nameaddress_vector',
[t.token for r in address
for t in self.query.get_partials_list(r)],
lookups.Restrict)]
penalty += 0.2
yield dbs.PostcodeSearch(penalty, sdata)
def build_housenumber_search(self, sdata: dbf.SearchData, hnrs: List[Token],
address: List[TokenRange]) -> Iterator[dbs.AbstractSearch]:
""" Build a simple address search for special entries where the
housenumber is the main name token.
"""
sdata.lookups = [dbf.FieldLookup('name_vector', [t.token for t in hnrs], lookups.LookupAny)]
expected_count = sum(t.count for t in hnrs)
partials = {t.token: t.addr_count for trange in address
for t in self.query.get_partials_list(trange)}
if expected_count < 8000:
sdata.lookups.append(dbf.FieldLookup('nameaddress_vector',
list(partials), lookups.Restrict))
elif len(partials) != 1 or list(partials.values())[0] < 10000:
sdata.lookups.append(dbf.FieldLookup('nameaddress_vector',
list(partials), lookups.LookupAll))
else:
addr_fulls = [t.token for t
in self.query.get_tokens(address[0], TokenType.WORD)]
if len(addr_fulls) > 5:
return
sdata.lookups.append(
dbf.FieldLookup('nameaddress_vector', addr_fulls, lookups.LookupAny))
sdata.housenumbers = dbf.WeightedStrings([], [])
yield dbs.PlaceSearch(0.05, sdata, expected_count)
def build_name_search(self, sdata: dbf.SearchData,
name: TokenRange, address: List[TokenRange],
is_category: bool) -> Iterator[dbs.AbstractSearch]:
""" Build abstract search queries for simple name or address searches.
"""
if is_category or not sdata.housenumbers or self.configured_for_housenumbers:
ranking = self.get_name_ranking(name)
name_penalty = ranking.normalize_penalty()
if ranking.rankings:
sdata.rankings.append(ranking)
for penalty, count, lookup in self.yield_lookups(name, address):
sdata.lookups = lookup
yield dbs.PlaceSearch(penalty + name_penalty, sdata, count)
def yield_lookups(self, name: TokenRange, address: List[TokenRange])\
-> Iterator[Tuple[float, int, List[dbf.FieldLookup]]]:
""" Yield all variants how the given name and address should best
be searched for. This takes into account how frequent the terms
are and tries to find a lookup that optimizes index use.
"""
penalty = 0.0 # extra penalty
name_partials = {t.token: t for t in self.query.get_partials_list(name)}
addr_partials = [t for r in address for t in self.query.get_partials_list(r)]
addr_tokens = list({t.token for t in addr_partials})
partials_indexed = all(t.is_indexed for t in name_partials.values()) \
and all(t.is_indexed for t in addr_partials)
exp_count = min(t.count for t in name_partials.values()) / (2**(len(name_partials) - 1))
if (len(name_partials) > 3 or exp_count < 8000) and partials_indexed:
yield penalty, exp_count, dbf.lookup_by_names(list(name_partials.keys()), addr_tokens)
return
addr_count = min(t.addr_count for t in addr_partials) if addr_partials else 30000
# Partial term to frequent. Try looking up by rare full names first.
name_fulls = self.query.get_tokens(name, TokenType.WORD)
if name_fulls:
fulls_count = sum(t.count for t in name_fulls)
if partials_indexed:
penalty += 1.2 * sum(t.penalty for t in addr_partials if not t.is_indexed)
if fulls_count < 50000 or addr_count < 30000:
yield penalty,fulls_count / (2**len(addr_tokens)), \
self.get_full_name_ranking(name_fulls, addr_partials,
fulls_count > 30000 / max(1, len(addr_tokens)))
# To catch remaining results, lookup by name and address
# We only do this if there is a reasonable number of results expected.
exp_count = exp_count / (2**len(addr_tokens)) if addr_tokens else exp_count
if exp_count < 10000 and addr_count < 20000\
and all(t.is_indexed for t in name_partials.values()):
penalty += 0.35 * max(1 if name_fulls else 0.1,
5 - len(name_partials) - len(addr_tokens))
yield penalty, exp_count,\
self.get_name_address_ranking(list(name_partials.keys()), addr_partials)
def get_name_address_ranking(self, name_tokens: List[int],
addr_partials: List[Token]) -> List[dbf.FieldLookup]:
""" Create a ranking expression looking up by name and address.
"""
lookup = [dbf.FieldLookup('name_vector', name_tokens, lookups.LookupAll)]
addr_restrict_tokens = []
addr_lookup_tokens = []
for t in addr_partials:
if t.is_indexed:
if t.addr_count > 20000:
addr_restrict_tokens.append(t.token)
else:
addr_lookup_tokens.append(t.token)
if addr_restrict_tokens:
lookup.append(dbf.FieldLookup('nameaddress_vector',
addr_restrict_tokens, lookups.Restrict))
if addr_lookup_tokens:
lookup.append(dbf.FieldLookup('nameaddress_vector',
addr_lookup_tokens, lookups.LookupAll))
return lookup
def get_full_name_ranking(self, name_fulls: List[Token], addr_partials: List[Token],
use_lookup: bool) -> List[dbf.FieldLookup]:
""" Create a ranking expression with full name terms and
additional address lookup. When 'use_lookup' is true, then
address lookups will use the index, when the occurences are not
too many.
"""
# At this point drop unindexed partials from the address.
# This might yield wrong results, nothing we can do about that.
if use_lookup:
addr_restrict_tokens = []
addr_lookup_tokens = []
for t in addr_partials:
if t.is_indexed:
if t.addr_count > 20000:
addr_restrict_tokens.append(t.token)
else:
addr_lookup_tokens.append(t.token)
else:
addr_restrict_tokens = [t.token for t in addr_partials if t.is_indexed]
addr_lookup_tokens = []
return dbf.lookup_by_any_name([t.token for t in name_fulls],
addr_restrict_tokens, addr_lookup_tokens)
def get_name_ranking(self, trange: TokenRange,
db_field: str = 'name_vector') -> dbf.FieldRanking:
""" Create a ranking expression for a name term in the given range.
"""
name_fulls = self.query.get_tokens(trange, TokenType.WORD)
ranks = [dbf.RankedTokens(t.penalty, [t.token]) for t in name_fulls]
ranks.sort(key=lambda r: r.penalty)
# Fallback, sum of penalty for partials
name_partials = self.query.get_partials_list(trange)
default = sum(t.penalty for t in name_partials) + 0.2
return dbf.FieldRanking(db_field, default, ranks)
def get_addr_ranking(self, trange: TokenRange) -> dbf.FieldRanking:
""" Create a list of ranking expressions for an address term
for the given ranges.
"""
todo: List[Tuple[int, int, dbf.RankedTokens]] = []
heapq.heappush(todo, (0, trange.start, dbf.RankedTokens(0.0, [])))
ranks: List[dbf.RankedTokens] = []
while todo: # pylint: disable=too-many-nested-blocks
neglen, pos, rank = heapq.heappop(todo)
for tlist in self.query.nodes[pos].starting:
if tlist.ttype in (TokenType.PARTIAL, TokenType.WORD):
if tlist.end < trange.end:
chgpenalty = PENALTY_WORDCHANGE[self.query.nodes[tlist.end].btype]
if tlist.ttype == TokenType.PARTIAL:
penalty = rank.penalty + chgpenalty \
+ max(t.penalty for t in tlist.tokens)
heapq.heappush(todo, (neglen - 1, tlist.end,
dbf.RankedTokens(penalty, rank.tokens)))
else:
for t in tlist.tokens:
heapq.heappush(todo, (neglen - 1, tlist.end,
rank.with_token(t, chgpenalty)))
elif tlist.end == trange.end:
if tlist.ttype == TokenType.PARTIAL:
ranks.append(dbf.RankedTokens(rank.penalty
+ max(t.penalty for t in tlist.tokens),
rank.tokens))
else:
ranks.extend(rank.with_token(t, 0.0) for t in tlist.tokens)
if len(ranks) >= 10:
# Too many variants, bail out and only add
# Worst-case Fallback: sum of penalty of partials
name_partials = self.query.get_partials_list(trange)
default = sum(t.penalty for t in name_partials) + 0.2
ranks.append(dbf.RankedTokens(rank.penalty + default, []))
# Bail out of outer loop
todo.clear()
break
ranks.sort(key=lambda r: len(r.tokens))
default = ranks[0].penalty + 0.3
del ranks[0]
ranks.sort(key=lambda r: r.penalty)
return dbf.FieldRanking('nameaddress_vector', default, ranks)
def get_search_data(self, assignment: TokenAssignment) -> Optional[dbf.SearchData]:
""" Collect the tokens for the non-name search fields in the
assignment.
"""
sdata = dbf.SearchData()
sdata.penalty = assignment.penalty
if assignment.country:
tokens = self.get_country_tokens(assignment.country)
if not tokens:
return None
sdata.set_strings('countries', tokens)
elif self.details.countries:
sdata.countries = dbf.WeightedStrings(self.details.countries,
[0.0] * len(self.details.countries))
if assignment.housenumber:
sdata.set_strings('housenumbers',
self.query.get_tokens(assignment.housenumber,
TokenType.HOUSENUMBER))
if assignment.postcode:
sdata.set_strings('postcodes',
self.query.get_tokens(assignment.postcode,
TokenType.POSTCODE))
if assignment.qualifier:
tokens = self.get_qualifier_tokens(assignment.qualifier)
if not tokens:
return None
sdata.set_qualifiers(tokens)
elif self.details.categories:
sdata.qualifiers = dbf.WeightedCategories(self.details.categories,
[0.0] * len(self.details.categories))
if assignment.address:
if not assignment.name and assignment.housenumber:
# housenumber search: the first item needs to be handled like
# a name in ranking or penalties are not comparable with
# normal searches.
sdata.set_ranking([self.get_name_ranking(assignment.address[0],
db_field='nameaddress_vector')]
+ [self.get_addr_ranking(r) for r in assignment.address[1:]])
else:
sdata.set_ranking([self.get_addr_ranking(r) for r in assignment.address])
else:
sdata.rankings = []
return sdata
def get_country_tokens(self, trange: TokenRange) -> List[Token]:
""" Return the list of country tokens for the given range,
optionally filtered by the country list from the details
parameters.
"""
tokens = self.query.get_tokens(trange, TokenType.COUNTRY)
if self.details.countries:
tokens = [t for t in tokens if t.lookup_word in self.details.countries]
return tokens
def get_qualifier_tokens(self, trange: TokenRange) -> List[Token]:
""" Return the list of qualifier tokens for the given range,
optionally filtered by the qualifier list from the details
parameters.
"""
tokens = self.query.get_tokens(trange, TokenType.QUALIFIER)
if self.details.categories:
tokens = [t for t in tokens if t.get_category() in self.details.categories]
return tokens
def get_near_items(self, assignment: TokenAssignment) -> Optional[dbf.WeightedCategories]:
""" Collect tokens for near items search or use the categories
requested per parameter.
Returns None if no category search is requested.
"""
if assignment.near_item:
tokens: Dict[Tuple[str, str], float] = {}
for t in self.query.get_tokens(assignment.near_item, TokenType.NEAR_ITEM):
cat = t.get_category()
# The category of a near search will be that of near_item.
# Thus, if search is restricted to a category parameter,
# the two sets must intersect.
if (not self.details.categories or cat in self.details.categories)\
and t.penalty < tokens.get(cat, 1000.0):
tokens[cat] = t.penalty
return dbf.WeightedCategories(list(tokens.keys()), list(tokens.values()))
return None
PENALTY_WORDCHANGE = {
BreakType.START: 0.0,
BreakType.END: 0.0,
BreakType.PHRASE: 0.0,
BreakType.WORD: 0.1,
BreakType.PART: 0.2,
BreakType.TOKEN: 0.4
}

View File

@@ -0,0 +1,254 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Data structures for more complex fields in abstract search descriptions.
"""
from typing import List, Tuple, Iterator, Dict, Type
import dataclasses
import sqlalchemy as sa
from nominatim_core.typing import SaFromClause, SaColumn, SaExpression
from .query import Token
from . import db_search_lookups as lookups
from nominatim_core.utils.json_writer import JsonWriter
@dataclasses.dataclass
class WeightedStrings:
""" A list of strings together with a penalty.
"""
values: List[str]
penalties: List[float]
def __bool__(self) -> bool:
return bool(self.values)
def __iter__(self) -> Iterator[Tuple[str, float]]:
return iter(zip(self.values, self.penalties))
def get_penalty(self, value: str, default: float = 1000.0) -> float:
""" Get the penalty for the given value. Returns the given default
if the value does not exist.
"""
try:
return self.penalties[self.values.index(value)]
except ValueError:
pass
return default
@dataclasses.dataclass
class WeightedCategories:
""" A list of class/type tuples together with a penalty.
"""
values: List[Tuple[str, str]]
penalties: List[float]
def __bool__(self) -> bool:
return bool(self.values)
def __iter__(self) -> Iterator[Tuple[Tuple[str, str], float]]:
return iter(zip(self.values, self.penalties))
def get_penalty(self, value: Tuple[str, str], default: float = 1000.0) -> float:
""" Get the penalty for the given value. Returns the given default
if the value does not exist.
"""
try:
return self.penalties[self.values.index(value)]
except ValueError:
pass
return default
def sql_restrict(self, table: SaFromClause) -> SaExpression:
""" Return an SQLAlcheny expression that restricts the
class and type columns of the given table to the values
in the list.
Must not be used with an empty list.
"""
assert self.values
if len(self.values) == 1:
return sa.and_(table.c.class_ == self.values[0][0],
table.c.type == self.values[0][1])
return sa.or_(*(sa.and_(table.c.class_ == c, table.c.type == t)
for c, t in self.values))
@dataclasses.dataclass(order=True)
class RankedTokens:
""" List of tokens together with the penalty of using it.
"""
penalty: float
tokens: List[int]
def with_token(self, t: Token, transition_penalty: float) -> 'RankedTokens':
""" Create a new RankedTokens list with the given token appended.
The tokens penalty as well as the given transition penalty
are added to the overall penalty.
"""
return RankedTokens(self.penalty + t.penalty + transition_penalty,
self.tokens + [t.token])
@dataclasses.dataclass
class FieldRanking:
""" A list of rankings to be applied sequentially until one matches.
The matched ranking determines the penalty. If none matches a
default penalty is applied.
"""
column: str
default: float
rankings: List[RankedTokens]
def normalize_penalty(self) -> float:
""" Reduce the default and ranking penalties, such that the minimum
penalty is 0. Return the penalty that was subtracted.
"""
if self.rankings:
min_penalty = min(self.default, min(r.penalty for r in self.rankings))
else:
min_penalty = self.default
if min_penalty > 0.0:
self.default -= min_penalty
for ranking in self.rankings:
ranking.penalty -= min_penalty
return min_penalty
def sql_penalty(self, table: SaFromClause) -> SaColumn:
""" Create an SQL expression for the rankings.
"""
assert self.rankings
rout = JsonWriter().start_array()
for rank in self.rankings:
rout.start_array().value(rank.penalty).next()
rout.start_array()
for token in rank.tokens:
rout.value(token).next()
rout.end_array()
rout.end_array().next()
rout.end_array()
return sa.func.weigh_search(table.c[self.column], rout(), self.default)
@dataclasses.dataclass
class FieldLookup:
""" A list of tokens to be searched for. The column names the database
column to search in and the lookup_type the operator that is applied.
'lookup_all' requires all tokens to match. 'lookup_any' requires
one of the tokens to match. 'restrict' requires to match all tokens
but avoids the use of indexes.
"""
column: str
tokens: List[int]
lookup_type: Type[lookups.LookupType]
def sql_condition(self, table: SaFromClause) -> SaColumn:
""" Create an SQL expression for the given match condition.
"""
return self.lookup_type(table, self.column, self.tokens)
class SearchData:
""" Search fields derived from query and token assignment
to be used with the SQL queries.
"""
penalty: float
lookups: List[FieldLookup] = []
rankings: List[FieldRanking]
housenumbers: WeightedStrings = WeightedStrings([], [])
postcodes: WeightedStrings = WeightedStrings([], [])
countries: WeightedStrings = WeightedStrings([], [])
qualifiers: WeightedCategories = WeightedCategories([], [])
def set_strings(self, field: str, tokens: List[Token]) -> None:
""" Set on of the WeightedStrings properties from the given
token list. Adapt the global penalty, so that the
minimum penalty is 0.
"""
if tokens:
min_penalty = min(t.penalty for t in tokens)
self.penalty += min_penalty
wstrs = WeightedStrings([t.lookup_word for t in tokens],
[t.penalty - min_penalty for t in tokens])
setattr(self, field, wstrs)
def set_qualifiers(self, tokens: List[Token]) -> None:
""" Set the qulaifier field from the given tokens.
"""
if tokens:
categories: Dict[Tuple[str, str], float] = {}
min_penalty = 1000.0
for t in tokens:
min_penalty = min(min_penalty, t.penalty)
cat = t.get_category()
if t.penalty < categories.get(cat, 1000.0):
categories[cat] = t.penalty
self.penalty += min_penalty
self.qualifiers = WeightedCategories(list(categories.keys()),
list(categories.values()))
def set_ranking(self, rankings: List[FieldRanking]) -> None:
""" Set the list of rankings and normalize the ranking.
"""
self.rankings = []
for ranking in rankings:
if ranking.rankings:
self.penalty += ranking.normalize_penalty()
self.rankings.append(ranking)
else:
self.penalty += ranking.default
def lookup_by_names(name_tokens: List[int], addr_tokens: List[int]) -> List[FieldLookup]:
""" Create a lookup list where name tokens are looked up via index
and potential address tokens are used to restrict the search further.
"""
lookup = [FieldLookup('name_vector', name_tokens, lookups.LookupAll)]
if addr_tokens:
lookup.append(FieldLookup('nameaddress_vector', addr_tokens, lookups.Restrict))
return lookup
def lookup_by_any_name(name_tokens: List[int], addr_restrict_tokens: List[int],
addr_lookup_tokens: List[int]) -> List[FieldLookup]:
""" Create a lookup list where name tokens are looked up via index
and only one of the name tokens must be present.
Potential address tokens are used to restrict the search further.
"""
lookup = [FieldLookup('name_vector', name_tokens, lookups.LookupAny)]
if addr_restrict_tokens:
lookup.append(FieldLookup('nameaddress_vector', addr_restrict_tokens, lookups.Restrict))
if addr_lookup_tokens:
lookup.append(FieldLookup('nameaddress_vector', addr_lookup_tokens, lookups.LookupAll))
return lookup
def lookup_by_addr(name_tokens: List[int], addr_tokens: List[int]) -> List[FieldLookup]:
""" Create a lookup list where address tokens are looked up via index
and the name tokens are only used to restrict the search further.
"""
return [FieldLookup('name_vector', name_tokens, lookups.Restrict),
FieldLookup('nameaddress_vector', addr_tokens, lookups.LookupAll)]

View File

@@ -0,0 +1,114 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Implementation of lookup functions for the search_name table.
"""
from typing import List, Any
import sqlalchemy as sa
from sqlalchemy.ext.compiler import compiles
from nominatim_core.typing import SaFromClause
from nominatim_core.db.sqlalchemy_types import IntArray
# pylint: disable=consider-using-f-string
LookupType = sa.sql.expression.FunctionElement[Any]
class LookupAll(LookupType):
""" Find all entries in search_name table that contain all of
a given list of tokens using an index for the search.
"""
inherit_cache = True
def __init__(self, table: SaFromClause, column: str, tokens: List[int]) -> None:
super().__init__(table.c.place_id, getattr(table.c, column), column,
sa.type_coerce(tokens, IntArray))
@compiles(LookupAll) # type: ignore[no-untyped-call, misc]
def _default_lookup_all(element: LookupAll,
compiler: 'sa.Compiled', **kw: Any) -> str:
_, col, _, tokens = list(element.clauses)
return "(%s @> %s)" % (compiler.process(col, **kw),
compiler.process(tokens, **kw))
@compiles(LookupAll, 'sqlite') # type: ignore[no-untyped-call, misc]
def _sqlite_lookup_all(element: LookupAll,
compiler: 'sa.Compiled', **kw: Any) -> str:
place, col, colname, tokens = list(element.clauses)
return "(%s IN (SELECT CAST(value as bigint) FROM"\
" (SELECT array_intersect_fuzzy(places) as p FROM"\
" (SELECT places FROM reverse_search_name"\
" WHERE word IN (SELECT value FROM json_each('[' || %s || ']'))"\
" AND column = %s"\
" ORDER BY length(places)) as x) as u,"\
" json_each('[' || u.p || ']'))"\
" AND array_contains(%s, %s))"\
% (compiler.process(place, **kw),
compiler.process(tokens, **kw),
compiler.process(colname, **kw),
compiler.process(col, **kw),
compiler.process(tokens, **kw)
)
class LookupAny(LookupType):
""" Find all entries that contain at least one of the given tokens.
Use an index for the search.
"""
inherit_cache = True
def __init__(self, table: SaFromClause, column: str, tokens: List[int]) -> None:
super().__init__(table.c.place_id, getattr(table.c, column), column,
sa.type_coerce(tokens, IntArray))
@compiles(LookupAny) # type: ignore[no-untyped-call, misc]
def _default_lookup_any(element: LookupAny,
compiler: 'sa.Compiled', **kw: Any) -> str:
_, col, _, tokens = list(element.clauses)
return "(%s && %s)" % (compiler.process(col, **kw),
compiler.process(tokens, **kw))
@compiles(LookupAny, 'sqlite') # type: ignore[no-untyped-call, misc]
def _sqlite_lookup_any(element: LookupAny,
compiler: 'sa.Compiled', **kw: Any) -> str:
place, _, colname, tokens = list(element.clauses)
return "%s IN (SELECT CAST(value as bigint) FROM"\
" (SELECT array_union(places) as p FROM reverse_search_name"\
" WHERE word IN (SELECT value FROM json_each('[' || %s || ']'))"\
" AND column = %s) as u,"\
" json_each('[' || u.p || ']'))" % (compiler.process(place, **kw),
compiler.process(tokens, **kw),
compiler.process(colname, **kw))
class Restrict(LookupType):
""" Find all entries that contain all of the given tokens.
Do not use an index for the search.
"""
inherit_cache = True
def __init__(self, table: SaFromClause, column: str, tokens: List[int]) -> None:
super().__init__(getattr(table.c, column),
sa.type_coerce(tokens, IntArray))
@compiles(Restrict) # type: ignore[no-untyped-call, misc]
def _default_restrict(element: Restrict,
compiler: 'sa.Compiled', **kw: Any) -> str:
arg1, arg2 = list(element.clauses)
return "(coalesce(null, %s) @> %s)" % (compiler.process(arg1, **kw),
compiler.process(arg2, **kw))
@compiles(Restrict, 'sqlite') # type: ignore[no-untyped-call, misc]
def _sqlite_restrict(element: Restrict,
compiler: 'sa.Compiled', **kw: Any) -> str:
return "array_contains(%s)" % compiler.process(element.clauses, **kw)

View File

@@ -0,0 +1,874 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Implementation of the actual database accesses for forward search.
"""
from typing import List, Tuple, AsyncIterator, Dict, Any, Callable, cast
import abc
import sqlalchemy as sa
from nominatim_core.typing import SaFromClause, SaScalarSelect, SaColumn, \
SaExpression, SaSelect, SaLambdaSelect, SaRow, SaBind
from nominatim_core.db.sqlalchemy_types import Geometry, IntArray
from ..connection import SearchConnection
from ..types import SearchDetails, DataLayer, GeometryFormat, Bbox
from .. import results as nres
from .db_search_fields import SearchData, WeightedCategories
#pylint: disable=singleton-comparison,not-callable
#pylint: disable=too-many-branches,too-many-arguments,too-many-locals,too-many-statements
def no_index(expr: SaColumn) -> SaColumn:
""" Wrap the given expression, so that the query planner will
refrain from using the expression for index lookup.
"""
return sa.func.coalesce(sa.null(), expr) # pylint: disable=not-callable
def _details_to_bind_params(details: SearchDetails) -> Dict[str, Any]:
""" Create a dictionary from search parameters that can be used
as bind parameter for SQL execute.
"""
return {'limit': details.max_results,
'min_rank': details.min_rank,
'max_rank': details.max_rank,
'viewbox': details.viewbox,
'viewbox2': details.viewbox_x2,
'near': details.near,
'near_radius': details.near_radius,
'excluded': details.excluded,
'countries': details.countries}
LIMIT_PARAM: SaBind = sa.bindparam('limit')
MIN_RANK_PARAM: SaBind = sa.bindparam('min_rank')
MAX_RANK_PARAM: SaBind = sa.bindparam('max_rank')
VIEWBOX_PARAM: SaBind = sa.bindparam('viewbox', type_=Geometry)
VIEWBOX2_PARAM: SaBind = sa.bindparam('viewbox2', type_=Geometry)
NEAR_PARAM: SaBind = sa.bindparam('near', type_=Geometry)
NEAR_RADIUS_PARAM: SaBind = sa.bindparam('near_radius')
COUNTRIES_PARAM: SaBind = sa.bindparam('countries')
def filter_by_area(sql: SaSelect, t: SaFromClause,
details: SearchDetails, avoid_index: bool = False) -> SaSelect:
""" Apply SQL statements for filtering by viewbox and near point,
if applicable.
"""
if details.near is not None and details.near_radius is not None:
if details.near_radius < 0.1 and not avoid_index:
sql = sql.where(t.c.geometry.within_distance(NEAR_PARAM, NEAR_RADIUS_PARAM))
else:
sql = sql.where(t.c.geometry.ST_Distance(NEAR_PARAM) <= NEAR_RADIUS_PARAM)
if details.viewbox is not None and details.bounded_viewbox:
sql = sql.where(t.c.geometry.intersects(VIEWBOX_PARAM,
use_index=not avoid_index and
details.viewbox.area < 0.2))
return sql
def _exclude_places(t: SaFromClause) -> Callable[[], SaExpression]:
return lambda: t.c.place_id.not_in(sa.bindparam('excluded'))
def _select_placex(t: SaFromClause) -> SaSelect:
return sa.select(t.c.place_id, t.c.osm_type, t.c.osm_id, t.c.name,
t.c.class_, t.c.type,
t.c.address, t.c.extratags,
t.c.housenumber, t.c.postcode, t.c.country_code,
t.c.wikipedia,
t.c.parent_place_id, t.c.rank_address, t.c.rank_search,
t.c.linked_place_id, t.c.admin_level,
t.c.centroid,
t.c.geometry.ST_Expand(0).label('bbox'))
def _add_geometry_columns(sql: SaLambdaSelect, col: SaColumn, details: SearchDetails) -> SaSelect:
out = []
if details.geometry_simplification > 0.0:
col = sa.func.ST_SimplifyPreserveTopology(col, details.geometry_simplification)
if details.geometry_output & GeometryFormat.GEOJSON:
out.append(sa.func.ST_AsGeoJSON(col, 7).label('geometry_geojson'))
if details.geometry_output & GeometryFormat.TEXT:
out.append(sa.func.ST_AsText(col).label('geometry_text'))
if details.geometry_output & GeometryFormat.KML:
out.append(sa.func.ST_AsKML(col, 7).label('geometry_kml'))
if details.geometry_output & GeometryFormat.SVG:
out.append(sa.func.ST_AsSVG(col, 0, 7).label('geometry_svg'))
return sql.add_columns(*out)
def _make_interpolation_subquery(table: SaFromClause, inner: SaFromClause,
numerals: List[int], details: SearchDetails) -> SaScalarSelect:
all_ids = sa.func.ArrayAgg(table.c.place_id)
sql = sa.select(all_ids).where(table.c.parent_place_id == inner.c.place_id)
if len(numerals) == 1:
sql = sql.where(sa.between(numerals[0], table.c.startnumber, table.c.endnumber))\
.where((numerals[0] - table.c.startnumber) % table.c.step == 0)
else:
sql = sql.where(sa.or_(
*(sa.and_(sa.between(n, table.c.startnumber, table.c.endnumber),
(n - table.c.startnumber) % table.c.step == 0)
for n in numerals)))
if details.excluded:
sql = sql.where(_exclude_places(table))
return sql.scalar_subquery()
def _filter_by_layer(table: SaFromClause, layers: DataLayer) -> SaColumn:
orexpr: List[SaExpression] = []
if layers & DataLayer.ADDRESS and layers & DataLayer.POI:
orexpr.append(no_index(table.c.rank_address).between(1, 30))
elif layers & DataLayer.ADDRESS:
orexpr.append(no_index(table.c.rank_address).between(1, 29))
orexpr.append(sa.func.IsAddressPoint(table))
elif layers & DataLayer.POI:
orexpr.append(sa.and_(no_index(table.c.rank_address) == 30,
table.c.class_.not_in(('place', 'building'))))
if layers & DataLayer.MANMADE:
exclude = []
if not layers & DataLayer.RAILWAY:
exclude.append('railway')
if not layers & DataLayer.NATURAL:
exclude.extend(('natural', 'water', 'waterway'))
orexpr.append(sa.and_(table.c.class_.not_in(tuple(exclude)),
no_index(table.c.rank_address) == 0))
else:
include = []
if layers & DataLayer.RAILWAY:
include.append('railway')
if layers & DataLayer.NATURAL:
include.extend(('natural', 'water', 'waterway'))
orexpr.append(sa.and_(table.c.class_.in_(tuple(include)),
no_index(table.c.rank_address) == 0))
if len(orexpr) == 1:
return orexpr[0]
return sa.or_(*orexpr)
def _interpolated_position(table: SaFromClause, nr: SaColumn) -> SaColumn:
pos = sa.cast(nr - table.c.startnumber, sa.Float) / (table.c.endnumber - table.c.startnumber)
return sa.case(
(table.c.endnumber == table.c.startnumber, table.c.linegeo.ST_Centroid()),
else_=table.c.linegeo.ST_LineInterpolatePoint(pos)).label('centroid')
async def _get_placex_housenumbers(conn: SearchConnection,
place_ids: List[int],
details: SearchDetails) -> AsyncIterator[nres.SearchResult]:
t = conn.t.placex
sql = _select_placex(t).add_columns(t.c.importance)\
.where(t.c.place_id.in_(place_ids))
if details.geometry_output:
sql = _add_geometry_columns(sql, t.c.geometry, details)
for row in await conn.execute(sql):
result = nres.create_from_placex_row(row, nres.SearchResult)
assert result
result.bbox = Bbox.from_wkb(row.bbox)
yield result
def _int_list_to_subquery(inp: List[int]) -> 'sa.Subquery':
""" Create a subselect that returns the given list of integers
as rows in the column 'nr'.
"""
vtab = sa.func.JsonArrayEach(sa.type_coerce(inp, sa.JSON))\
.table_valued(sa.column('value', type_=sa.JSON))
return sa.select(sa.cast(sa.cast(vtab.c.value, sa.Text), sa.Integer).label('nr')).subquery()
async def _get_osmline(conn: SearchConnection, place_ids: List[int],
numerals: List[int],
details: SearchDetails) -> AsyncIterator[nres.SearchResult]:
t = conn.t.osmline
values = _int_list_to_subquery(numerals)
sql = sa.select(t.c.place_id, t.c.osm_id,
t.c.parent_place_id, t.c.address,
values.c.nr.label('housenumber'),
_interpolated_position(t, values.c.nr),
t.c.postcode, t.c.country_code)\
.where(t.c.place_id.in_(place_ids))\
.join(values, values.c.nr.between(t.c.startnumber, t.c.endnumber))
if details.geometry_output:
sub = sql.subquery()
sql = _add_geometry_columns(sa.select(sub), sub.c.centroid, details)
for row in await conn.execute(sql):
result = nres.create_from_osmline_row(row, nres.SearchResult)
assert result
yield result
async def _get_tiger(conn: SearchConnection, place_ids: List[int],
numerals: List[int], osm_id: int,
details: SearchDetails) -> AsyncIterator[nres.SearchResult]:
t = conn.t.tiger
values = _int_list_to_subquery(numerals)
sql = sa.select(t.c.place_id, t.c.parent_place_id,
sa.literal('W').label('osm_type'),
sa.literal(osm_id).label('osm_id'),
values.c.nr.label('housenumber'),
_interpolated_position(t, values.c.nr),
t.c.postcode)\
.where(t.c.place_id.in_(place_ids))\
.join(values, values.c.nr.between(t.c.startnumber, t.c.endnumber))
if details.geometry_output:
sub = sql.subquery()
sql = _add_geometry_columns(sa.select(sub), sub.c.centroid, details)
for row in await conn.execute(sql):
result = nres.create_from_tiger_row(row, nres.SearchResult)
assert result
yield result
class AbstractSearch(abc.ABC):
""" Encapuslation of a single lookup in the database.
"""
SEARCH_PRIO: int = 2
def __init__(self, penalty: float) -> None:
self.penalty = penalty
@abc.abstractmethod
async def lookup(self, conn: SearchConnection,
details: SearchDetails) -> nres.SearchResults:
""" Find results for the search in the database.
"""
class NearSearch(AbstractSearch):
""" Category search of a place type near the result of another search.
"""
def __init__(self, penalty: float, categories: WeightedCategories,
search: AbstractSearch) -> None:
super().__init__(penalty)
self.search = search
self.categories = categories
async def lookup(self, conn: SearchConnection,
details: SearchDetails) -> nres.SearchResults:
""" Find results for the search in the database.
"""
results = nres.SearchResults()
base = await self.search.lookup(conn, details)
if not base:
return results
base.sort(key=lambda r: (r.accuracy, r.rank_search))
max_accuracy = base[0].accuracy + 0.5
if base[0].rank_address == 0:
min_rank = 0
max_rank = 0
elif base[0].rank_address < 26:
min_rank = 1
max_rank = min(25, base[0].rank_address + 4)
else:
min_rank = 26
max_rank = 30
base = nres.SearchResults(r for r in base if r.source_table == nres.SourceTable.PLACEX
and r.accuracy <= max_accuracy
and r.bbox and r.bbox.area < 20
and r.rank_address >= min_rank
and r.rank_address <= max_rank)
if base:
baseids = [b.place_id for b in base[:5] if b.place_id]
for category, penalty in self.categories:
await self.lookup_category(results, conn, baseids, category, penalty, details)
if len(results) >= details.max_results:
break
return results
async def lookup_category(self, results: nres.SearchResults,
conn: SearchConnection, ids: List[int],
category: Tuple[str, str], penalty: float,
details: SearchDetails) -> None:
""" Find places of the given category near the list of
place ids and add the results to 'results'.
"""
table = await conn.get_class_table(*category)
tgeom = conn.t.placex.alias('pgeom')
if table is None:
# No classtype table available, do a simplified lookup in placex.
table = conn.t.placex
sql = sa.select(table.c.place_id,
sa.func.min(tgeom.c.centroid.ST_Distance(table.c.centroid))
.label('dist'))\
.join(tgeom, table.c.geometry.intersects(tgeom.c.centroid.ST_Expand(0.01)))\
.where(table.c.class_ == category[0])\
.where(table.c.type == category[1])
else:
# Use classtype table. We can afford to use a larger
# radius for the lookup.
sql = sa.select(table.c.place_id,
sa.func.min(tgeom.c.centroid.ST_Distance(table.c.centroid))
.label('dist'))\
.join(tgeom,
table.c.centroid.ST_CoveredBy(
sa.case((sa.and_(tgeom.c.rank_address > 9,
tgeom.c.geometry.is_area()),
tgeom.c.geometry),
else_ = tgeom.c.centroid.ST_Expand(0.05))))
inner = sql.where(tgeom.c.place_id.in_(ids))\
.group_by(table.c.place_id).subquery()
t = conn.t.placex
sql = _select_placex(t).add_columns((-inner.c.dist).label('importance'))\
.join(inner, inner.c.place_id == t.c.place_id)\
.order_by(inner.c.dist)
sql = sql.where(no_index(t.c.rank_address).between(MIN_RANK_PARAM, MAX_RANK_PARAM))
if details.countries:
sql = sql.where(t.c.country_code.in_(COUNTRIES_PARAM))
if details.excluded:
sql = sql.where(_exclude_places(t))
if details.layers is not None:
sql = sql.where(_filter_by_layer(t, details.layers))
sql = sql.limit(LIMIT_PARAM)
for row in await conn.execute(sql, _details_to_bind_params(details)):
result = nres.create_from_placex_row(row, nres.SearchResult)
assert result
result.accuracy = self.penalty + penalty
result.bbox = Bbox.from_wkb(row.bbox)
results.append(result)
class PoiSearch(AbstractSearch):
""" Category search in a geographic area.
"""
def __init__(self, sdata: SearchData) -> None:
super().__init__(sdata.penalty)
self.qualifiers = sdata.qualifiers
self.countries = sdata.countries
async def lookup(self, conn: SearchConnection,
details: SearchDetails) -> nres.SearchResults:
""" Find results for the search in the database.
"""
bind_params = _details_to_bind_params(details)
t = conn.t.placex
rows: List[SaRow] = []
if details.near and details.near_radius is not None and details.near_radius < 0.2:
# simply search in placex table
def _base_query() -> SaSelect:
return _select_placex(t) \
.add_columns((-t.c.centroid.ST_Distance(NEAR_PARAM))
.label('importance'))\
.where(t.c.linked_place_id == None) \
.where(t.c.geometry.within_distance(NEAR_PARAM, NEAR_RADIUS_PARAM)) \
.order_by(t.c.centroid.ST_Distance(NEAR_PARAM)) \
.limit(LIMIT_PARAM)
classtype = self.qualifiers.values
if len(classtype) == 1:
cclass, ctype = classtype[0]
sql: SaLambdaSelect = sa.lambda_stmt(lambda: _base_query()
.where(t.c.class_ == cclass)
.where(t.c.type == ctype))
else:
sql = _base_query().where(sa.or_(*(sa.and_(t.c.class_ == cls, t.c.type == typ)
for cls, typ in classtype)))
if self.countries:
sql = sql.where(t.c.country_code.in_(self.countries.values))
if details.viewbox is not None and details.bounded_viewbox:
sql = sql.where(t.c.geometry.intersects(VIEWBOX_PARAM))
rows.extend(await conn.execute(sql, bind_params))
else:
# use the class type tables
for category in self.qualifiers.values:
table = await conn.get_class_table(*category)
if table is not None:
sql = _select_placex(t)\
.add_columns(t.c.importance)\
.join(table, t.c.place_id == table.c.place_id)\
.where(t.c.class_ == category[0])\
.where(t.c.type == category[1])
if details.viewbox is not None and details.bounded_viewbox:
sql = sql.where(table.c.centroid.intersects(VIEWBOX_PARAM))
if details.near and details.near_radius is not None:
sql = sql.order_by(table.c.centroid.ST_Distance(NEAR_PARAM))\
.where(table.c.centroid.within_distance(NEAR_PARAM,
NEAR_RADIUS_PARAM))
if self.countries:
sql = sql.where(t.c.country_code.in_(self.countries.values))
sql = sql.limit(LIMIT_PARAM)
rows.extend(await conn.execute(sql, bind_params))
results = nres.SearchResults()
for row in rows:
result = nres.create_from_placex_row(row, nres.SearchResult)
assert result
result.accuracy = self.penalty + self.qualifiers.get_penalty((row.class_, row.type))
result.bbox = Bbox.from_wkb(row.bbox)
results.append(result)
return results
class CountrySearch(AbstractSearch):
""" Search for a country name or country code.
"""
SEARCH_PRIO = 0
def __init__(self, sdata: SearchData) -> None:
super().__init__(sdata.penalty)
self.countries = sdata.countries
async def lookup(self, conn: SearchConnection,
details: SearchDetails) -> nres.SearchResults:
""" Find results for the search in the database.
"""
t = conn.t.placex
ccodes = self.countries.values
sql = _select_placex(t)\
.add_columns(t.c.importance)\
.where(t.c.country_code.in_(ccodes))\
.where(t.c.rank_address == 4)
if details.geometry_output:
sql = _add_geometry_columns(sql, t.c.geometry, details)
if details.excluded:
sql = sql.where(_exclude_places(t))
sql = filter_by_area(sql, t, details)
results = nres.SearchResults()
for row in await conn.execute(sql, _details_to_bind_params(details)):
result = nres.create_from_placex_row(row, nres.SearchResult)
assert result
result.accuracy = self.penalty + self.countries.get_penalty(row.country_code, 5.0)
result.bbox = Bbox.from_wkb(row.bbox)
results.append(result)
if not results:
results = await self.lookup_in_country_table(conn, details)
if results:
details.min_rank = min(5, details.max_rank)
details.max_rank = min(25, details.max_rank)
return results
async def lookup_in_country_table(self, conn: SearchConnection,
details: SearchDetails) -> nres.SearchResults:
""" Look up the country in the fallback country tables.
"""
# Avoid the fallback search when this is a more search. Country results
# usually are in the first batch of results and it is not possible
# to exclude these fallbacks.
if details.excluded:
return nres.SearchResults()
t = conn.t.country_name
tgrid = conn.t.country_grid
sql = sa.select(tgrid.c.country_code,
tgrid.c.geometry.ST_Centroid().ST_Collect().ST_Centroid()
.label('centroid'),
tgrid.c.geometry.ST_Collect().ST_Expand(0).label('bbox'))\
.where(tgrid.c.country_code.in_(self.countries.values))\
.group_by(tgrid.c.country_code)
sql = filter_by_area(sql, tgrid, details, avoid_index=True)
sub = sql.subquery('grid')
sql = sa.select(t.c.country_code,
t.c.name.merge(t.c.derived_name).label('name'),
sub.c.centroid, sub.c.bbox)\
.join(sub, t.c.country_code == sub.c.country_code)
if details.geometry_output:
sql = _add_geometry_columns(sql, sub.c.centroid, details)
results = nres.SearchResults()
for row in await conn.execute(sql, _details_to_bind_params(details)):
result = nres.create_from_country_row(row, nres.SearchResult)
assert result
result.bbox = Bbox.from_wkb(row.bbox)
result.accuracy = self.penalty + self.countries.get_penalty(row.country_code, 5.0)
results.append(result)
return results
class PostcodeSearch(AbstractSearch):
""" Search for a postcode.
"""
def __init__(self, extra_penalty: float, sdata: SearchData) -> None:
super().__init__(sdata.penalty + extra_penalty)
self.countries = sdata.countries
self.postcodes = sdata.postcodes
self.lookups = sdata.lookups
self.rankings = sdata.rankings
async def lookup(self, conn: SearchConnection,
details: SearchDetails) -> nres.SearchResults:
""" Find results for the search in the database.
"""
t = conn.t.postcode
pcs = self.postcodes.values
sql = sa.select(t.c.place_id, t.c.parent_place_id,
t.c.rank_search, t.c.rank_address,
t.c.postcode, t.c.country_code,
t.c.geometry.label('centroid'))\
.where(t.c.postcode.in_(pcs))
if details.geometry_output:
sql = _add_geometry_columns(sql, t.c.geometry, details)
penalty: SaExpression = sa.literal(self.penalty)
if details.viewbox is not None and not details.bounded_viewbox:
penalty += sa.case((t.c.geometry.intersects(VIEWBOX_PARAM), 0.0),
(t.c.geometry.intersects(VIEWBOX2_PARAM), 0.5),
else_=1.0)
if details.near is not None:
sql = sql.order_by(t.c.geometry.ST_Distance(NEAR_PARAM))
sql = filter_by_area(sql, t, details)
if self.countries:
sql = sql.where(t.c.country_code.in_(self.countries.values))
if details.excluded:
sql = sql.where(_exclude_places(t))
if self.lookups:
assert len(self.lookups) == 1
tsearch = conn.t.search_name
sql = sql.where(tsearch.c.place_id == t.c.parent_place_id)\
.where((tsearch.c.name_vector + tsearch.c.nameaddress_vector)
.contains(sa.type_coerce(self.lookups[0].tokens,
IntArray)))
for ranking in self.rankings:
penalty += ranking.sql_penalty(conn.t.search_name)
penalty += sa.case(*((t.c.postcode == v, p) for v, p in self.postcodes),
else_=1.0)
sql = sql.add_columns(penalty.label('accuracy'))
sql = sql.order_by('accuracy').limit(LIMIT_PARAM)
results = nres.SearchResults()
for row in await conn.execute(sql, _details_to_bind_params(details)):
p = conn.t.placex
placex_sql = _select_placex(p).add_columns(p.c.importance)\
.where(sa.text("""class = 'boundary'
AND type = 'postal_code'
AND osm_type = 'R'"""))\
.where(p.c.country_code == row.country_code)\
.where(p.c.postcode == row.postcode)\
.limit(1)
if details.geometry_output:
placex_sql = _add_geometry_columns(placex_sql, p.c.geometry, details)
for prow in await conn.execute(placex_sql, _details_to_bind_params(details)):
result = nres.create_from_placex_row(prow, nres.SearchResult)
break
else:
result = nres.create_from_postcode_row(row, nres.SearchResult)
assert result
if result.place_id not in details.excluded:
result.accuracy = row.accuracy
results.append(result)
return results
class PlaceSearch(AbstractSearch):
""" Generic search for an address or named place.
"""
SEARCH_PRIO = 1
def __init__(self, extra_penalty: float, sdata: SearchData, expected_count: int) -> None:
super().__init__(sdata.penalty + extra_penalty)
self.countries = sdata.countries
self.postcodes = sdata.postcodes
self.housenumbers = sdata.housenumbers
self.qualifiers = sdata.qualifiers
self.lookups = sdata.lookups
self.rankings = sdata.rankings
self.expected_count = expected_count
def _inner_search_name_cte(self, conn: SearchConnection,
details: SearchDetails) -> 'sa.CTE':
""" Create a subquery that preselects the rows in the search_name
table.
"""
t = conn.t.search_name
penalty: SaExpression = sa.literal(self.penalty)
for ranking in self.rankings:
penalty += ranking.sql_penalty(t)
sql = sa.select(t.c.place_id, t.c.search_rank, t.c.address_rank,
t.c.country_code, t.c.centroid,
t.c.name_vector, t.c.nameaddress_vector,
sa.case((t.c.importance > 0, t.c.importance),
else_=0.40001-(sa.cast(t.c.search_rank, sa.Float())/75))
.label('importance'),
penalty.label('penalty'))
for lookup in self.lookups:
sql = sql.where(lookup.sql_condition(t))
if self.countries:
sql = sql.where(t.c.country_code.in_(self.countries.values))
if self.postcodes:
# if a postcode is given, don't search for state or country level objects
sql = sql.where(t.c.address_rank > 9)
if self.expected_count > 10000:
# Many results expected. Restrict by postcode.
tpc = conn.t.postcode
sql = sql.where(sa.select(tpc.c.postcode)
.where(tpc.c.postcode.in_(self.postcodes.values))
.where(t.c.centroid.within_distance(tpc.c.geometry, 0.4))
.exists())
if details.viewbox is not None:
if details.bounded_viewbox:
sql = sql.where(t.c.centroid
.intersects(VIEWBOX_PARAM,
use_index=details.viewbox.area < 0.2))
elif not self.postcodes and not self.housenumbers and self.expected_count >= 10000:
sql = sql.where(t.c.centroid
.intersects(VIEWBOX2_PARAM,
use_index=details.viewbox.area < 0.5))
if details.near is not None and details.near_radius is not None:
if details.near_radius < 0.1:
sql = sql.where(t.c.centroid.within_distance(NEAR_PARAM,
NEAR_RADIUS_PARAM))
else:
sql = sql.where(t.c.centroid
.ST_Distance(NEAR_PARAM) < NEAR_RADIUS_PARAM)
if self.housenumbers:
sql = sql.where(t.c.address_rank.between(16, 30))
else:
if details.excluded:
sql = sql.where(_exclude_places(t))
if details.min_rank > 0:
sql = sql.where(sa.or_(t.c.address_rank >= MIN_RANK_PARAM,
t.c.search_rank >= MIN_RANK_PARAM))
if details.max_rank < 30:
sql = sql.where(sa.or_(t.c.address_rank <= MAX_RANK_PARAM,
t.c.search_rank <= MAX_RANK_PARAM))
inner = sql.limit(10000).order_by(sa.desc(sa.text('importance'))).subquery()
sql = sa.select(inner.c.place_id, inner.c.search_rank, inner.c.address_rank,
inner.c.country_code, inner.c.centroid, inner.c.importance,
inner.c.penalty)
# If the query is not an address search or has a geographic preference,
# preselect most important items to restrict the number of places
# that need to be looked up in placex.
if not self.housenumbers\
and (details.viewbox is None or details.bounded_viewbox)\
and (details.near is None or details.near_radius is not None)\
and not self.qualifiers:
sql = sql.add_columns(sa.func.first_value(inner.c.penalty - inner.c.importance)
.over(order_by=inner.c.penalty - inner.c.importance)
.label('min_penalty'))
inner = sql.subquery()
sql = sa.select(inner.c.place_id, inner.c.search_rank, inner.c.address_rank,
inner.c.country_code, inner.c.centroid, inner.c.importance,
inner.c.penalty)\
.where(inner.c.penalty - inner.c.importance < inner.c.min_penalty + 0.5)
return sql.cte('searches')
async def lookup(self, conn: SearchConnection,
details: SearchDetails) -> nres.SearchResults:
""" Find results for the search in the database.
"""
t = conn.t.placex
tsearch = self._inner_search_name_cte(conn, details)
sql = _select_placex(t).join(tsearch, t.c.place_id == tsearch.c.place_id)
if details.geometry_output:
sql = _add_geometry_columns(sql, t.c.geometry, details)
penalty: SaExpression = tsearch.c.penalty
if self.postcodes:
tpc = conn.t.postcode
pcs = self.postcodes.values
pc_near = sa.select(sa.func.min(tpc.c.geometry.ST_Distance(t.c.centroid)))\
.where(tpc.c.postcode.in_(pcs))\
.scalar_subquery()
penalty += sa.case((t.c.postcode.in_(pcs), 0.0),
else_=sa.func.coalesce(pc_near, cast(SaColumn, 2.0)))
if details.viewbox is not None and not details.bounded_viewbox:
penalty += sa.case((t.c.geometry.intersects(VIEWBOX_PARAM, use_index=False), 0.0),
(t.c.geometry.intersects(VIEWBOX2_PARAM, use_index=False), 0.5),
else_=1.0)
if details.near is not None:
sql = sql.add_columns((-tsearch.c.centroid.ST_Distance(NEAR_PARAM))
.label('importance'))
sql = sql.order_by(sa.desc(sa.text('importance')))
else:
sql = sql.order_by(penalty - tsearch.c.importance)
sql = sql.add_columns(tsearch.c.importance)
sql = sql.add_columns(penalty.label('accuracy'))\
.order_by(sa.text('accuracy'))
if self.housenumbers:
hnr_list = '|'.join(self.housenumbers.values)
inner = sql.where(sa.or_(tsearch.c.address_rank < 30,
sa.func.RegexpWord(hnr_list, t.c.housenumber)))\
.subquery()
# Housenumbers from placex
thnr = conn.t.placex.alias('hnr')
pid_list = sa.func.ArrayAgg(thnr.c.place_id)
place_sql = sa.select(pid_list)\
.where(thnr.c.parent_place_id == inner.c.place_id)\
.where(sa.func.RegexpWord(hnr_list, thnr.c.housenumber))\
.where(thnr.c.linked_place_id == None)\
.where(thnr.c.indexed_status == 0)
if details.excluded:
place_sql = place_sql.where(thnr.c.place_id.not_in(sa.bindparam('excluded')))
if self.qualifiers:
place_sql = place_sql.where(self.qualifiers.sql_restrict(thnr))
numerals = [int(n) for n in self.housenumbers.values
if n.isdigit() and len(n) < 8]
interpol_sql: SaColumn
tiger_sql: SaColumn
if numerals and \
(not self.qualifiers or ('place', 'house') in self.qualifiers.values):
# Housenumbers from interpolations
interpol_sql = _make_interpolation_subquery(conn.t.osmline, inner,
numerals, details)
# Housenumbers from Tiger
tiger_sql = sa.case((inner.c.country_code == 'us',
_make_interpolation_subquery(conn.t.tiger, inner,
numerals, details)
), else_=None)
else:
interpol_sql = sa.null()
tiger_sql = sa.null()
unsort = sa.select(inner, place_sql.scalar_subquery().label('placex_hnr'),
interpol_sql.label('interpol_hnr'),
tiger_sql.label('tiger_hnr')).subquery('unsort')
sql = sa.select(unsort)\
.order_by(sa.case((unsort.c.placex_hnr != None, 1),
(unsort.c.interpol_hnr != None, 2),
(unsort.c.tiger_hnr != None, 3),
else_=4),
unsort.c.accuracy)
else:
sql = sql.where(t.c.linked_place_id == None)\
.where(t.c.indexed_status == 0)
if self.qualifiers:
sql = sql.where(self.qualifiers.sql_restrict(t))
if details.layers is not None:
sql = sql.where(_filter_by_layer(t, details.layers))
sql = sql.limit(LIMIT_PARAM)
results = nres.SearchResults()
for row in await conn.execute(sql, _details_to_bind_params(details)):
result = nres.create_from_placex_row(row, nres.SearchResult)
assert result
result.bbox = Bbox.from_wkb(row.bbox)
result.accuracy = row.accuracy
if self.housenumbers and row.rank_address < 30:
if row.placex_hnr:
subs = _get_placex_housenumbers(conn, row.placex_hnr, details)
elif row.interpol_hnr:
subs = _get_osmline(conn, row.interpol_hnr, numerals, details)
elif row.tiger_hnr:
subs = _get_tiger(conn, row.tiger_hnr, numerals, row.osm_id, details)
else:
subs = None
if subs is not None:
async for sub in subs:
assert sub.housenumber
sub.accuracy = result.accuracy
if not any(nr in self.housenumbers.values
for nr in sub.housenumber.split(';')):
sub.accuracy += 0.6
results.append(sub)
# Only add the street as a result, if it meets all other
# filter conditions.
if (not details.excluded or result.place_id not in details.excluded)\
and (not self.qualifiers or result.category in self.qualifiers.values)\
and result.rank_address >= details.min_rank:
result.accuracy += 1.0 # penalty for missing housenumber
results.append(result)
else:
results.append(result)
return results

View File

@@ -0,0 +1,274 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Public interface to the search code.
"""
from typing import List, Any, Optional, Iterator, Tuple, Dict
import itertools
import re
import datetime as dt
import difflib
from ..connection import SearchConnection
from ..types import SearchDetails
from ..results import SearchResult, SearchResults, add_result_details
from ..logging import log
from .token_assignment import yield_token_assignments
from .db_search_builder import SearchBuilder, build_poi_search, wrap_near_search
from .db_searches import AbstractSearch
from .query_analyzer_factory import make_query_analyzer, AbstractQueryAnalyzer
from .query import Phrase, QueryStruct
class ForwardGeocoder:
""" Main class responsible for place search.
"""
def __init__(self, conn: SearchConnection,
params: SearchDetails, timeout: Optional[int]) -> None:
self.conn = conn
self.params = params
self.timeout = dt.timedelta(seconds=timeout or 1000000)
self.query_analyzer: Optional[AbstractQueryAnalyzer] = None
@property
def limit(self) -> int:
""" Return the configured maximum number of search results.
"""
return self.params.max_results
async def build_searches(self,
phrases: List[Phrase]) -> Tuple[QueryStruct, List[AbstractSearch]]:
""" Analyse the query and return the tokenized query and list of
possible searches over it.
"""
if self.query_analyzer is None:
self.query_analyzer = await make_query_analyzer(self.conn)
query = await self.query_analyzer.analyze_query(phrases)
searches: List[AbstractSearch] = []
if query.num_token_slots() > 0:
# 2. Compute all possible search interpretations
log().section('Compute abstract searches')
search_builder = SearchBuilder(query, self.params)
num_searches = 0
for assignment in yield_token_assignments(query):
searches.extend(search_builder.build(assignment))
if num_searches < len(searches):
log().table_dump('Searches for assignment',
_dump_searches(searches, query, num_searches))
num_searches = len(searches)
searches.sort(key=lambda s: (s.penalty, s.SEARCH_PRIO))
return query, searches
async def execute_searches(self, query: QueryStruct,
searches: List[AbstractSearch]) -> SearchResults:
""" Run the abstract searches against the database until a result
is found.
"""
log().section('Execute database searches')
results: Dict[Any, SearchResult] = {}
end_time = dt.datetime.now() + self.timeout
min_ranking = searches[0].penalty + 2.0
prev_penalty = 0.0
for i, search in enumerate(searches):
if search.penalty > prev_penalty and (search.penalty > min_ranking or i > 20):
break
log().table_dump(f"{i + 1}. Search", _dump_searches([search], query))
log().var_dump('Params', self.params)
lookup_results = await search.lookup(self.conn, self.params)
for result in lookup_results:
rhash = (result.source_table, result.place_id,
result.housenumber, result.country_code)
prevresult = results.get(rhash)
if prevresult:
prevresult.accuracy = min(prevresult.accuracy, result.accuracy)
else:
results[rhash] = result
min_ranking = min(min_ranking, result.accuracy * 1.2, 2.0)
log().result_dump('Results', ((r.accuracy, r) for r in lookup_results))
prev_penalty = search.penalty
if dt.datetime.now() >= end_time:
break
return SearchResults(results.values())
def pre_filter_results(self, results: SearchResults) -> SearchResults:
""" Remove results that are significantly worse than the
best match.
"""
if results:
max_ranking = min(r.ranking for r in results) + 0.5
results = SearchResults(r for r in results if r.ranking < max_ranking)
return results
def sort_and_cut_results(self, results: SearchResults) -> SearchResults:
""" Remove badly matching results, sort by ranking and
limit to the configured number of results.
"""
if results:
results.sort(key=lambda r: r.ranking)
min_rank = results[0].rank_search
min_ranking = results[0].ranking
results = SearchResults(r for r in results
if r.ranking + 0.03 * (r.rank_search - min_rank)
< min_ranking + 0.5)
results = SearchResults(results[:self.limit])
return results
def rerank_by_query(self, query: QueryStruct, results: SearchResults) -> None:
""" Adjust the accuracy of the localized result according to how well
they match the original query.
"""
assert self.query_analyzer is not None
qwords = [word for phrase in query.source
for word in re.split('[, ]+', phrase.text) if word]
if not qwords:
return
for result in results:
# Negative importance indicates ordering by distance, which is
# more important than word matching.
if not result.display_name\
or (result.importance is not None and result.importance < 0):
continue
distance = 0.0
norm = self.query_analyzer.normalize_text(' '.join((result.display_name,
result.country_code or '')))
words = set((w for w in norm.split(' ') if w))
if not words:
continue
for qword in qwords:
wdist = max(difflib.SequenceMatcher(a=qword, b=w).quick_ratio() for w in words)
if wdist < 0.5:
distance += len(qword)
else:
distance += (1.0 - wdist) * len(qword)
# Compensate for the fact that country names do not get a
# match penalty yet by the tokenizer.
# Temporary hack that needs to be removed!
if result.rank_address == 4:
distance *= 2
result.accuracy += distance * 0.4 / sum(len(w) for w in qwords)
async def lookup_pois(self, categories: List[Tuple[str, str]],
phrases: List[Phrase]) -> SearchResults:
""" Look up places by category. If phrase is given, a place search
over the phrase will be executed first and places close to the
results returned.
"""
log().function('forward_lookup_pois', categories=categories, params=self.params)
if phrases:
query, searches = await self.build_searches(phrases)
if query:
searches = [wrap_near_search(categories, s) for s in searches[:50]]
results = await self.execute_searches(query, searches)
results = self.pre_filter_results(results)
await add_result_details(self.conn, results, self.params)
log().result_dump('Preliminary Results', ((r.accuracy, r) for r in results))
results = self.sort_and_cut_results(results)
else:
results = SearchResults()
else:
search = build_poi_search(categories, self.params.countries)
results = await search.lookup(self.conn, self.params)
await add_result_details(self.conn, results, self.params)
log().result_dump('Final Results', ((r.accuracy, r) for r in results))
return results
async def lookup(self, phrases: List[Phrase]) -> SearchResults:
""" Look up a single free-text query.
"""
log().function('forward_lookup', phrases=phrases, params=self.params)
results = SearchResults()
if self.params.is_impossible():
return results
query, searches = await self.build_searches(phrases)
if searches:
# Execute SQL until an appropriate result is found.
results = await self.execute_searches(query, searches[:50])
results = self.pre_filter_results(results)
await add_result_details(self.conn, results, self.params)
log().result_dump('Preliminary Results', ((r.accuracy, r) for r in results))
self.rerank_by_query(query, results)
log().result_dump('Results after reranking', ((r.accuracy, r) for r in results))
results = self.sort_and_cut_results(results)
log().result_dump('Final Results', ((r.accuracy, r) for r in results))
return results
# pylint: disable=invalid-name,too-many-locals
def _dump_searches(searches: List[AbstractSearch], query: QueryStruct,
start: int = 0) -> Iterator[Optional[List[Any]]]:
yield ['Penalty', 'Lookups', 'Housenr', 'Postcode', 'Countries',
'Qualifier', 'Catgeory', 'Rankings']
def tk(tl: List[int]) -> str:
tstr = [f"{query.find_lookup_word_by_id(t)}({t})" for t in tl]
return f"[{','.join(tstr)}]"
def fmt_ranking(f: Any) -> str:
if not f:
return ''
ranks = ','.join((f"{tk(r.tokens)}^{r.penalty:.3g}" for r in f.rankings))
if len(ranks) > 100:
ranks = ranks[:100] + '...'
return f"{f.column}({ranks},def={f.default:.3g})"
def fmt_lookup(l: Any) -> str:
if not l:
return ''
return f"{l.lookup_type}({l.column}{tk(l.tokens)})"
def fmt_cstr(c: Any) -> str:
if not c:
return ''
return f'{c[0]}^{c[1]}'
for search in searches[start:]:
fields = ('lookups', 'rankings', 'countries', 'housenumbers',
'postcodes', 'qualifiers')
if hasattr(search, 'search'):
iters = itertools.zip_longest([f"{search.penalty:.3g}"],
*(getattr(search.search, attr, []) for attr in fields),
getattr(search, 'categories', []),
fillvalue='')
else:
iters = itertools.zip_longest([f"{search.penalty:.3g}"],
*(getattr(search, attr, []) for attr in fields),
[],
fillvalue='')
for penalty, lookup, rank, cc, hnr, pc, qual, cat in iters:
yield [penalty, fmt_lookup(lookup), fmt_cstr(hnr),
fmt_cstr(pc), fmt_cstr(cc), fmt_cstr(qual), fmt_cstr(cat), fmt_ranking(rank)]
yield None

View File

@@ -0,0 +1,314 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Implementation of query analysis for the ICU tokenizer.
"""
from typing import Tuple, Dict, List, Optional, NamedTuple, Iterator, Any, cast
from collections import defaultdict
import dataclasses
import difflib
from icu import Transliterator
import sqlalchemy as sa
from nominatim_core.typing import SaRow
from nominatim_core.db.sqlalchemy_types import Json
from ..connection import SearchConnection
from ..logging import log
from ..search import query as qmod
from ..search.query_analyzer_factory import AbstractQueryAnalyzer
DB_TO_TOKEN_TYPE = {
'W': qmod.TokenType.WORD,
'w': qmod.TokenType.PARTIAL,
'H': qmod.TokenType.HOUSENUMBER,
'P': qmod.TokenType.POSTCODE,
'C': qmod.TokenType.COUNTRY
}
class QueryPart(NamedTuple):
""" Normalized and transliterated form of a single term in the query.
When the term came out of a split during the transliteration,
the normalized string is the full word before transliteration.
The word number keeps track of the word before transliteration
and can be used to identify partial transliterated terms.
"""
token: str
normalized: str
word_number: int
QueryParts = List[QueryPart]
WordDict = Dict[str, List[qmod.TokenRange]]
def yield_words(terms: List[QueryPart], start: int) -> Iterator[Tuple[str, qmod.TokenRange]]:
""" Return all combinations of words in the terms list after the
given position.
"""
total = len(terms)
for first in range(start, total):
word = terms[first].token
yield word, qmod.TokenRange(first, first + 1)
for last in range(first + 1, min(first + 20, total)):
word = ' '.join((word, terms[last].token))
yield word, qmod.TokenRange(first, last + 1)
@dataclasses.dataclass
class ICUToken(qmod.Token):
""" Specialised token for ICU tokenizer.
"""
word_token: str
info: Optional[Dict[str, Any]]
def get_category(self) -> Tuple[str, str]:
assert self.info
return self.info.get('class', ''), self.info.get('type', '')
def rematch(self, norm: str) -> None:
""" Check how well the token matches the given normalized string
and add a penalty, if necessary.
"""
if not self.lookup_word:
return
seq = difflib.SequenceMatcher(a=self.lookup_word, b=norm)
distance = 0
for tag, afrom, ato, bfrom, bto in seq.get_opcodes():
if tag in ('delete', 'insert') and (afrom == 0 or ato == len(self.lookup_word)):
distance += 1
elif tag == 'replace':
distance += max((ato-afrom), (bto-bfrom))
elif tag != 'equal':
distance += abs((ato-afrom) - (bto-bfrom))
self.penalty += (distance/len(self.lookup_word))
@staticmethod
def from_db_row(row: SaRow) -> 'ICUToken':
""" Create a ICUToken from the row of the word table.
"""
count = 1 if row.info is None else row.info.get('count', 1)
addr_count = 1 if row.info is None else row.info.get('addr_count', 1)
penalty = 0.0
if row.type == 'w':
penalty = 0.3
elif row.type == 'W':
if len(row.word_token) == 1 and row.word_token == row.word:
penalty = 0.2 if row.word.isdigit() else 0.3
elif row.type == 'H':
penalty = sum(0.1 for c in row.word_token if c != ' ' and not c.isdigit())
if all(not c.isdigit() for c in row.word_token):
penalty += 0.2 * (len(row.word_token) - 1)
elif row.type == 'C':
if len(row.word_token) == 1:
penalty = 0.3
if row.info is None:
lookup_word = row.word
else:
lookup_word = row.info.get('lookup', row.word)
if lookup_word:
lookup_word = lookup_word.split('@', 1)[0]
else:
lookup_word = row.word_token
return ICUToken(penalty=penalty, token=row.word_id, count=max(1, count),
lookup_word=lookup_word, is_indexed=True,
word_token=row.word_token, info=row.info,
addr_count=max(1, addr_count))
class ICUQueryAnalyzer(AbstractQueryAnalyzer):
""" Converter for query strings into a tokenized query
using the tokens created by a ICU tokenizer.
"""
def __init__(self, conn: SearchConnection) -> None:
self.conn = conn
async def setup(self) -> None:
""" Set up static data structures needed for the analysis.
"""
async def _make_normalizer() -> Any:
rules = await self.conn.get_property('tokenizer_import_normalisation')
return Transliterator.createFromRules("normalization", rules)
self.normalizer = await self.conn.get_cached_value('ICUTOK', 'normalizer',
_make_normalizer)
async def _make_transliterator() -> Any:
rules = await self.conn.get_property('tokenizer_import_transliteration')
return Transliterator.createFromRules("transliteration", rules)
self.transliterator = await self.conn.get_cached_value('ICUTOK', 'transliterator',
_make_transliterator)
if 'word' not in self.conn.t.meta.tables:
sa.Table('word', self.conn.t.meta,
sa.Column('word_id', sa.Integer),
sa.Column('word_token', sa.Text, nullable=False),
sa.Column('type', sa.Text, nullable=False),
sa.Column('word', sa.Text),
sa.Column('info', Json))
async def analyze_query(self, phrases: List[qmod.Phrase]) -> qmod.QueryStruct:
""" Analyze the given list of phrases and return the
tokenized query.
"""
log().section('Analyze query (using ICU tokenizer)')
normalized = list(filter(lambda p: p.text,
(qmod.Phrase(p.ptype, self.normalize_text(p.text))
for p in phrases)))
query = qmod.QueryStruct(normalized)
log().var_dump('Normalized query', query.source)
if not query.source:
return query
parts, words = self.split_query(query)
log().var_dump('Transliterated query', lambda: _dump_transliterated(query, parts))
for row in await self.lookup_in_db(list(words.keys())):
for trange in words[row.word_token]:
token = ICUToken.from_db_row(row)
if row.type == 'S':
if row.info['op'] in ('in', 'near'):
if trange.start == 0:
query.add_token(trange, qmod.TokenType.NEAR_ITEM, token)
else:
if trange.start == 0 and trange.end == query.num_token_slots():
query.add_token(trange, qmod.TokenType.NEAR_ITEM, token)
else:
query.add_token(trange, qmod.TokenType.QUALIFIER, token)
else:
query.add_token(trange, DB_TO_TOKEN_TYPE[row.type], token)
self.add_extra_tokens(query, parts)
self.rerank_tokens(query, parts)
log().table_dump('Word tokens', _dump_word_tokens(query))
return query
def normalize_text(self, text: str) -> str:
""" Bring the given text into a normalized form. That is the
standardized form search will work with. All information removed
at this stage is inevitably lost.
"""
return cast(str, self.normalizer.transliterate(text))
def split_query(self, query: qmod.QueryStruct) -> Tuple[QueryParts, WordDict]:
""" Transliterate the phrases and split them into tokens.
Returns the list of transliterated tokens together with their
normalized form and a dictionary of words for lookup together
with their position.
"""
parts: QueryParts = []
phrase_start = 0
words = defaultdict(list)
wordnr = 0
for phrase in query.source:
query.nodes[-1].ptype = phrase.ptype
for word in phrase.text.split(' '):
trans = self.transliterator.transliterate(word)
if trans:
for term in trans.split(' '):
if term:
parts.append(QueryPart(term, word, wordnr))
query.add_node(qmod.BreakType.TOKEN, phrase.ptype)
query.nodes[-1].btype = qmod.BreakType.WORD
wordnr += 1
query.nodes[-1].btype = qmod.BreakType.PHRASE
for word, wrange in yield_words(parts, phrase_start):
words[word].append(wrange)
phrase_start = len(parts)
query.nodes[-1].btype = qmod.BreakType.END
return parts, words
async def lookup_in_db(self, words: List[str]) -> 'sa.Result[Any]':
""" Return the token information from the database for the
given word tokens.
"""
t = self.conn.t.meta.tables['word']
return await self.conn.execute(t.select().where(t.c.word_token.in_(words)))
def add_extra_tokens(self, query: qmod.QueryStruct, parts: QueryParts) -> None:
""" Add tokens to query that are not saved in the database.
"""
for part, node, i in zip(parts, query.nodes, range(1000)):
if len(part.token) <= 4 and part[0].isdigit()\
and not node.has_tokens(i+1, qmod.TokenType.HOUSENUMBER):
query.add_token(qmod.TokenRange(i, i+1), qmod.TokenType.HOUSENUMBER,
ICUToken(0.5, 0, 1, 1, part.token, True, part.token, None))
def rerank_tokens(self, query: qmod.QueryStruct, parts: QueryParts) -> None:
""" Add penalties to tokens that depend on presence of other token.
"""
for i, node, tlist in query.iter_token_lists():
if tlist.ttype == qmod.TokenType.POSTCODE:
for repl in node.starting:
if repl.end == tlist.end and repl.ttype != qmod.TokenType.POSTCODE \
and (repl.ttype != qmod.TokenType.HOUSENUMBER
or len(tlist.tokens[0].lookup_word) > 4):
repl.add_penalty(0.39)
elif tlist.ttype == qmod.TokenType.HOUSENUMBER \
and len(tlist.tokens[0].lookup_word) <= 3:
if any(c.isdigit() for c in tlist.tokens[0].lookup_word):
for repl in node.starting:
if repl.end == tlist.end and repl.ttype != qmod.TokenType.HOUSENUMBER:
repl.add_penalty(0.5 - tlist.tokens[0].penalty)
elif tlist.ttype not in (qmod.TokenType.COUNTRY, qmod.TokenType.PARTIAL):
norm = parts[i].normalized
for j in range(i + 1, tlist.end):
if parts[j - 1].word_number != parts[j].word_number:
norm += ' ' + parts[j].normalized
for token in tlist.tokens:
cast(ICUToken, token).rematch(norm)
def _dump_transliterated(query: qmod.QueryStruct, parts: QueryParts) -> str:
out = query.nodes[0].btype.value
for node, part in zip(query.nodes[1:], parts):
out += part.token + node.btype.value
return out
def _dump_word_tokens(query: qmod.QueryStruct) -> Iterator[List[Any]]:
yield ['type', 'token', 'word_token', 'lookup_word', 'penalty', 'count', 'info']
for node in query.nodes:
for tlist in node.starting:
for token in tlist.tokens:
t = cast(ICUToken, token)
yield [tlist.ttype.name, t.token, t.word_token or '',
t.lookup_word or '', t.penalty, t.count, t.info]
async def create_query_analyzer(conn: SearchConnection) -> AbstractQueryAnalyzer:
""" Create and set up a new query analyzer for a database based
on the ICU tokenizer.
"""
out = ICUQueryAnalyzer(conn)
await out.setup()
return out

View File

@@ -0,0 +1,272 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Implementation of query analysis for the legacy tokenizer.
"""
from typing import Tuple, Dict, List, Optional, Iterator, Any, cast
from copy import copy
from collections import defaultdict
import dataclasses
import sqlalchemy as sa
from nominatim_core.typing import SaRow
from ..connection import SearchConnection
from ..logging import log
from . import query as qmod
from .query_analyzer_factory import AbstractQueryAnalyzer
def yield_words(terms: List[str], start: int) -> Iterator[Tuple[str, qmod.TokenRange]]:
""" Return all combinations of words in the terms list after the
given position.
"""
total = len(terms)
for first in range(start, total):
word = terms[first]
yield word, qmod.TokenRange(first, first + 1)
for last in range(first + 1, min(first + 20, total)):
word = ' '.join((word, terms[last]))
yield word, qmod.TokenRange(first, last + 1)
@dataclasses.dataclass
class LegacyToken(qmod.Token):
""" Specialised token for legacy tokenizer.
"""
word_token: str
category: Optional[Tuple[str, str]]
country: Optional[str]
operator: Optional[str]
@property
def info(self) -> Dict[str, Any]:
""" Dictionary of additional properties of the token.
Should only be used for debugging purposes.
"""
return {'category': self.category,
'country': self.country,
'operator': self.operator}
def get_category(self) -> Tuple[str, str]:
assert self.category
return self.category
class LegacyQueryAnalyzer(AbstractQueryAnalyzer):
""" Converter for query strings into a tokenized query
using the tokens created by a legacy tokenizer.
"""
def __init__(self, conn: SearchConnection) -> None:
self.conn = conn
async def setup(self) -> None:
""" Set up static data structures needed for the analysis.
"""
self.max_word_freq = int(await self.conn.get_property('tokenizer_maxwordfreq'))
if 'word' not in self.conn.t.meta.tables:
sa.Table('word', self.conn.t.meta,
sa.Column('word_id', sa.Integer),
sa.Column('word_token', sa.Text, nullable=False),
sa.Column('word', sa.Text),
sa.Column('class', sa.Text),
sa.Column('type', sa.Text),
sa.Column('country_code', sa.Text),
sa.Column('search_name_count', sa.Integer),
sa.Column('operator', sa.Text))
async def analyze_query(self, phrases: List[qmod.Phrase]) -> qmod.QueryStruct:
""" Analyze the given list of phrases and return the
tokenized query.
"""
log().section('Analyze query (using Legacy tokenizer)')
normalized = []
if phrases:
for row in await self.conn.execute(sa.select(*(sa.func.make_standard_name(p.text)
for p in phrases))):
normalized = [qmod.Phrase(p.ptype, r) for r, p in zip(row, phrases) if r]
break
query = qmod.QueryStruct(normalized)
log().var_dump('Normalized query', query.source)
if not query.source:
return query
parts, words = self.split_query(query)
lookup_words = list(words.keys())
log().var_dump('Split query', parts)
log().var_dump('Extracted words', lookup_words)
for row in await self.lookup_in_db(lookup_words):
for trange in words[row.word_token.strip()]:
token, ttype = self.make_token(row)
if ttype == qmod.TokenType.NEAR_ITEM:
if trange.start == 0:
query.add_token(trange, qmod.TokenType.NEAR_ITEM, token)
elif ttype == qmod.TokenType.QUALIFIER:
query.add_token(trange, qmod.TokenType.QUALIFIER, token)
if trange.start == 0 or trange.end == query.num_token_slots():
token = copy(token)
token.penalty += 0.1 * (query.num_token_slots())
query.add_token(trange, qmod.TokenType.NEAR_ITEM, token)
elif ttype != qmod.TokenType.PARTIAL or trange.start + 1 == trange.end:
query.add_token(trange, ttype, token)
self.add_extra_tokens(query, parts)
self.rerank_tokens(query)
log().table_dump('Word tokens', _dump_word_tokens(query))
return query
def normalize_text(self, text: str) -> str:
""" Bring the given text into a normalized form.
This only removes case, so some difference with the normalization
in the phrase remains.
"""
return text.lower()
def split_query(self, query: qmod.QueryStruct) -> Tuple[List[str],
Dict[str, List[qmod.TokenRange]]]:
""" Transliterate the phrases and split them into tokens.
Returns a list of transliterated tokens and a dictionary
of words for lookup together with their position.
"""
parts: List[str] = []
phrase_start = 0
words = defaultdict(list)
for phrase in query.source:
query.nodes[-1].ptype = phrase.ptype
for trans in phrase.text.split(' '):
if trans:
for term in trans.split(' '):
if term:
parts.append(trans)
query.add_node(qmod.BreakType.TOKEN, phrase.ptype)
query.nodes[-1].btype = qmod.BreakType.WORD
query.nodes[-1].btype = qmod.BreakType.PHRASE
for word, wrange in yield_words(parts, phrase_start):
words[word].append(wrange)
phrase_start = len(parts)
query.nodes[-1].btype = qmod.BreakType.END
return parts, words
async def lookup_in_db(self, words: List[str]) -> 'sa.Result[Any]':
""" Return the token information from the database for the
given word tokens.
"""
t = self.conn.t.meta.tables['word']
sql = t.select().where(t.c.word_token.in_(words + [' ' + w for w in words]))
return await self.conn.execute(sql)
def make_token(self, row: SaRow) -> Tuple[LegacyToken, qmod.TokenType]:
""" Create a LegacyToken from the row of the word table.
Also determines the type of token.
"""
penalty = 0.0
is_indexed = True
rowclass = getattr(row, 'class')
if row.country_code is not None:
ttype = qmod.TokenType.COUNTRY
lookup_word = row.country_code
elif rowclass is not None:
if rowclass == 'place' and row.type == 'house':
ttype = qmod.TokenType.HOUSENUMBER
lookup_word = row.word_token[1:]
elif rowclass == 'place' and row.type == 'postcode':
ttype = qmod.TokenType.POSTCODE
lookup_word = row.word_token[1:]
else:
ttype = qmod.TokenType.NEAR_ITEM if row.operator in ('in', 'near')\
else qmod.TokenType.QUALIFIER
lookup_word = row.word
elif row.word_token.startswith(' '):
ttype = qmod.TokenType.WORD
lookup_word = row.word or row.word_token[1:]
else:
ttype = qmod.TokenType.PARTIAL
lookup_word = row.word_token
penalty = 0.21
if row.search_name_count > self.max_word_freq:
is_indexed = False
return LegacyToken(penalty=penalty, token=row.word_id,
count=max(1, row.search_name_count or 1),
addr_count=1, # not supported
lookup_word=lookup_word,
word_token=row.word_token.strip(),
category=(rowclass, row.type) if rowclass is not None else None,
country=row.country_code,
operator=row.operator,
is_indexed=is_indexed),\
ttype
def add_extra_tokens(self, query: qmod.QueryStruct, parts: List[str]) -> None:
""" Add tokens to query that are not saved in the database.
"""
for part, node, i in zip(parts, query.nodes, range(1000)):
if len(part) <= 4 and part.isdigit()\
and not node.has_tokens(i+1, qmod.TokenType.HOUSENUMBER):
query.add_token(qmod.TokenRange(i, i+1), qmod.TokenType.HOUSENUMBER,
LegacyToken(penalty=0.5, token=0, count=1, addr_count=1,
lookup_word=part, word_token=part,
category=None, country=None,
operator=None, is_indexed=True))
def rerank_tokens(self, query: qmod.QueryStruct) -> None:
""" Add penalties to tokens that depend on presence of other token.
"""
for _, node, tlist in query.iter_token_lists():
if tlist.ttype == qmod.TokenType.POSTCODE:
for repl in node.starting:
if repl.end == tlist.end and repl.ttype != qmod.TokenType.POSTCODE \
and (repl.ttype != qmod.TokenType.HOUSENUMBER
or len(tlist.tokens[0].lookup_word) > 4):
repl.add_penalty(0.39)
elif tlist.ttype == qmod.TokenType.HOUSENUMBER \
and len(tlist.tokens[0].lookup_word) <= 3:
if any(c.isdigit() for c in tlist.tokens[0].lookup_word):
for repl in node.starting:
if repl.end == tlist.end and repl.ttype != qmod.TokenType.HOUSENUMBER:
repl.add_penalty(0.5 - tlist.tokens[0].penalty)
def _dump_word_tokens(query: qmod.QueryStruct) -> Iterator[List[Any]]:
yield ['type', 'token', 'word_token', 'lookup_word', 'penalty', 'count', 'info']
for node in query.nodes:
for tlist in node.starting:
for token in tlist.tokens:
t = cast(LegacyToken, token)
yield [tlist.ttype.name, t.token, t.word_token or '',
t.lookup_word or '', t.penalty, t.count, t.info]
async def create_query_analyzer(conn: SearchConnection) -> AbstractQueryAnalyzer:
""" Create and set up a new query analyzer for a database based
on the ICU tokenizer.
"""
out = LegacyQueryAnalyzer(conn)
await out.setup()
return out

View File

@@ -0,0 +1,297 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Datastructures for a tokenized query.
"""
from typing import List, Tuple, Optional, Iterator
from abc import ABC, abstractmethod
import dataclasses
import enum
class BreakType(enum.Enum):
""" Type of break between tokens.
"""
START = '<'
""" Begin of the query. """
END = '>'
""" End of the query. """
PHRASE = ','
""" Break between two phrases. """
WORD = ' '
""" Break between words. """
PART = '-'
""" Break inside a word, for example a hyphen or apostrophe. """
TOKEN = '`'
""" Break created as a result of tokenization.
This may happen in languages without spaces between words.
"""
class TokenType(enum.Enum):
""" Type of token.
"""
WORD = enum.auto()
""" Full name of a place. """
PARTIAL = enum.auto()
""" Word term without breaks, does not necessarily represent a full name. """
HOUSENUMBER = enum.auto()
""" Housenumber term. """
POSTCODE = enum.auto()
""" Postal code term. """
COUNTRY = enum.auto()
""" Country name or reference. """
QUALIFIER = enum.auto()
""" Special term used together with name (e.g. _Hotel_ Bellevue). """
NEAR_ITEM = enum.auto()
""" Special term used as searchable object(e.g. supermarket in ...). """
class PhraseType(enum.Enum):
""" Designation of a phrase.
"""
NONE = 0
""" No specific designation (i.e. source is free-form query). """
AMENITY = enum.auto()
""" Contains name or type of a POI. """
STREET = enum.auto()
""" Contains a street name optionally with a housenumber. """
CITY = enum.auto()
""" Contains the postal city. """
COUNTY = enum.auto()
""" Contains the equivalent of a county. """
STATE = enum.auto()
""" Contains a state or province. """
POSTCODE = enum.auto()
""" Contains a postal code. """
COUNTRY = enum.auto()
""" Contains the country name or code. """
def compatible_with(self, ttype: TokenType,
is_full_phrase: bool) -> bool:
""" Check if the given token type can be used with the phrase type.
"""
if self == PhraseType.NONE:
return not is_full_phrase or ttype != TokenType.QUALIFIER
if self == PhraseType.AMENITY:
return ttype in (TokenType.WORD, TokenType.PARTIAL)\
or (is_full_phrase and ttype == TokenType.NEAR_ITEM)\
or (not is_full_phrase and ttype == TokenType.QUALIFIER)
if self == PhraseType.STREET:
return ttype in (TokenType.WORD, TokenType.PARTIAL, TokenType.HOUSENUMBER)
if self == PhraseType.POSTCODE:
return ttype == TokenType.POSTCODE
if self == PhraseType.COUNTRY:
return ttype == TokenType.COUNTRY
return ttype in (TokenType.WORD, TokenType.PARTIAL)
@dataclasses.dataclass
class Token(ABC):
""" Base type for tokens.
Specific query analyzers must implement the concrete token class.
"""
penalty: float
token: int
count: int
addr_count: int
lookup_word: str
is_indexed: bool
@abstractmethod
def get_category(self) -> Tuple[str, str]:
""" Return the category restriction for qualifier terms and
category objects.
"""
@dataclasses.dataclass
class TokenRange:
""" Indexes of query nodes over which a token spans.
"""
start: int
end: int
def __lt__(self, other: 'TokenRange') -> bool:
return self.end <= other.start
def __le__(self, other: 'TokenRange') -> bool:
return NotImplemented
def __gt__(self, other: 'TokenRange') -> bool:
return self.start >= other.end
def __ge__(self, other: 'TokenRange') -> bool:
return NotImplemented
def replace_start(self, new_start: int) -> 'TokenRange':
""" Return a new token range with the new start.
"""
return TokenRange(new_start, self.end)
def replace_end(self, new_end: int) -> 'TokenRange':
""" Return a new token range with the new end.
"""
return TokenRange(self.start, new_end)
def split(self, index: int) -> Tuple['TokenRange', 'TokenRange']:
""" Split the span into two spans at the given index.
The index must be within the span.
"""
return self.replace_end(index), self.replace_start(index)
@dataclasses.dataclass
class TokenList:
""" List of all tokens of a given type going from one breakpoint to another.
"""
end: int
ttype: TokenType
tokens: List[Token]
def add_penalty(self, penalty: float) -> None:
""" Add the given penalty to all tokens in the list.
"""
for token in self.tokens:
token.penalty += penalty
@dataclasses.dataclass
class QueryNode:
""" A node of the query representing a break between terms.
"""
btype: BreakType
ptype: PhraseType
starting: List[TokenList] = dataclasses.field(default_factory=list)
def has_tokens(self, end: int, *ttypes: TokenType) -> bool:
""" Check if there are tokens of the given types ending at the
given node.
"""
return any(tl.end == end and tl.ttype in ttypes for tl in self.starting)
def get_tokens(self, end: int, ttype: TokenType) -> Optional[List[Token]]:
""" Get the list of tokens of the given type starting at this node
and ending at the node 'end'. Returns 'None' if no such
tokens exist.
"""
for tlist in self.starting:
if tlist.end == end and tlist.ttype == ttype:
return tlist.tokens
return None
@dataclasses.dataclass
class Phrase:
""" A normalized query part. Phrases may be typed which means that
they then represent a specific part of the address.
"""
ptype: PhraseType
text: str
class QueryStruct:
""" A tokenized search query together with the normalized source
from which the tokens have been parsed.
The query contains a list of nodes that represent the breaks
between words. Tokens span between nodes, which don't necessarily
need to be direct neighbours. Thus the query is represented as a
directed acyclic graph.
When created, a query contains a single node: the start of the
query. Further nodes can be added by appending to 'nodes'.
"""
def __init__(self, source: List[Phrase]) -> None:
self.source = source
self.nodes: List[QueryNode] = \
[QueryNode(BreakType.START, source[0].ptype if source else PhraseType.NONE)]
def num_token_slots(self) -> int:
""" Return the length of the query in vertice steps.
"""
return len(self.nodes) - 1
def add_node(self, btype: BreakType, ptype: PhraseType) -> None:
""" Append a new break node with the given break type.
The phrase type denotes the type for any tokens starting
at the node.
"""
self.nodes.append(QueryNode(btype, ptype))
def add_token(self, trange: TokenRange, ttype: TokenType, token: Token) -> None:
""" Add a token to the query. 'start' and 'end' are the indexes of the
nodes from which to which the token spans. The indexes must exist
and are expected to be in the same phrase.
'ttype' denotes the type of the token and 'token' the token to
be inserted.
If the token type is not compatible with the phrase it should
be added to, then the token is silently dropped.
"""
snode = self.nodes[trange.start]
full_phrase = snode.btype in (BreakType.START, BreakType.PHRASE)\
and self.nodes[trange.end].btype in (BreakType.PHRASE, BreakType.END)
if snode.ptype.compatible_with(ttype, full_phrase):
tlist = snode.get_tokens(trange.end, ttype)
if tlist is None:
snode.starting.append(TokenList(trange.end, ttype, [token]))
else:
tlist.append(token)
def get_tokens(self, trange: TokenRange, ttype: TokenType) -> List[Token]:
""" Get the list of tokens of a given type, spanning the given
nodes. The nodes must exist. If no tokens exist, an
empty list is returned.
"""
return self.nodes[trange.start].get_tokens(trange.end, ttype) or []
def get_partials_list(self, trange: TokenRange) -> List[Token]:
""" Create a list of partial tokens between the given nodes.
The list is composed of the first token of type PARTIAL
going to the subsequent node. Such PARTIAL tokens are
assumed to exist.
"""
return [next(iter(self.get_tokens(TokenRange(i, i+1), TokenType.PARTIAL)))
for i in range(trange.start, trange.end)]
def iter_token_lists(self) -> Iterator[Tuple[int, QueryNode, TokenList]]:
""" Iterator over all token lists in the query.
"""
for i, node in enumerate(self.nodes):
for tlist in node.starting:
yield i, node, tlist
def find_lookup_word_by_id(self, token: int) -> str:
""" Find the first token with the given token ID and return
its lookup word. Returns 'None' if no such token exists.
The function is very slow and must only be used for
debugging.
"""
for node in self.nodes:
for tlist in node.starting:
for t in tlist.tokens:
if t.token == token:
return f"[{tlist.ttype.name[0]}]{t.lookup_word}"
return 'None'

View File

@@ -0,0 +1,54 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Factory for creating a query analyzer for the configured tokenizer.
"""
from typing import List, cast, TYPE_CHECKING
from abc import ABC, abstractmethod
from pathlib import Path
import importlib
from ..logging import log
from ..connection import SearchConnection
if TYPE_CHECKING:
from .query import Phrase, QueryStruct
class AbstractQueryAnalyzer(ABC):
""" Class for analysing incoming queries.
Query analyzers are tied to the tokenizer used on import.
"""
@abstractmethod
async def analyze_query(self, phrases: List['Phrase']) -> 'QueryStruct':
""" Analyze the given phrases and return the tokenized query.
"""
@abstractmethod
def normalize_text(self, text: str) -> str:
""" Bring the given text into a normalized form. That is the
standardized form search will work with. All information removed
at this stage is inevitably lost.
"""
async def make_query_analyzer(conn: SearchConnection) -> AbstractQueryAnalyzer:
""" Create a query analyzer for the tokenizer used by the database.
"""
name = await conn.get_property('tokenizer')
src_file = Path(__file__).parent / f'{name}_tokenizer.py'
if not src_file.is_file():
log().comment(f"No tokenizer named '{name}' available. Database not set up properly.")
raise RuntimeError('Tokenizer not found')
module = importlib.import_module(f'nominatim.api.search.{name}_tokenizer')
return cast(AbstractQueryAnalyzer, await module.create_query_analyzer(conn))

View File

@@ -0,0 +1,422 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Create query interpretations where each vertice in the query is assigned
a specific function (expressed as a token type).
"""
from typing import Optional, List, Iterator
import dataclasses
from ..logging import log
from . import query as qmod
# pylint: disable=too-many-return-statements,too-many-branches
@dataclasses.dataclass
class TypedRange:
""" A token range for a specific type of tokens.
"""
ttype: qmod.TokenType
trange: qmod.TokenRange
PENALTY_TOKENCHANGE = {
qmod.BreakType.START: 0.0,
qmod.BreakType.END: 0.0,
qmod.BreakType.PHRASE: 0.0,
qmod.BreakType.WORD: 0.1,
qmod.BreakType.PART: 0.2,
qmod.BreakType.TOKEN: 0.4
}
TypedRangeSeq = List[TypedRange]
@dataclasses.dataclass
class TokenAssignment: # pylint: disable=too-many-instance-attributes
""" Representation of a possible assignment of token types
to the tokens in a tokenized query.
"""
penalty: float = 0.0
name: Optional[qmod.TokenRange] = None
address: List[qmod.TokenRange] = dataclasses.field(default_factory=list)
housenumber: Optional[qmod.TokenRange] = None
postcode: Optional[qmod.TokenRange] = None
country: Optional[qmod.TokenRange] = None
near_item: Optional[qmod.TokenRange] = None
qualifier: Optional[qmod.TokenRange] = None
@staticmethod
def from_ranges(ranges: TypedRangeSeq) -> 'TokenAssignment':
""" Create a new token assignment from a sequence of typed spans.
"""
out = TokenAssignment()
for token in ranges:
if token.ttype == qmod.TokenType.PARTIAL:
out.address.append(token.trange)
elif token.ttype == qmod.TokenType.HOUSENUMBER:
out.housenumber = token.trange
elif token.ttype == qmod.TokenType.POSTCODE:
out.postcode = token.trange
elif token.ttype == qmod.TokenType.COUNTRY:
out.country = token.trange
elif token.ttype == qmod.TokenType.NEAR_ITEM:
out.near_item = token.trange
elif token.ttype == qmod.TokenType.QUALIFIER:
out.qualifier = token.trange
return out
class _TokenSequence:
""" Working state used to put together the token assignments.
Represents an intermediate state while traversing the tokenized
query.
"""
def __init__(self, seq: TypedRangeSeq,
direction: int = 0, penalty: float = 0.0) -> None:
self.seq = seq
self.direction = direction
self.penalty = penalty
def __str__(self) -> str:
seq = ''.join(f'[{r.trange.start} - {r.trange.end}: {r.ttype.name}]' for r in self.seq)
return f'{seq} (dir: {self.direction}, penalty: {self.penalty})'
@property
def end_pos(self) -> int:
""" Return the index of the global end of the current sequence.
"""
return self.seq[-1].trange.end if self.seq else 0
def has_types(self, *ttypes: qmod.TokenType) -> bool:
""" Check if the current sequence contains any typed ranges of
the given types.
"""
return any(s.ttype in ttypes for s in self.seq)
def is_final(self) -> bool:
""" Return true when the sequence cannot be extended by any
form of token anymore.
"""
# Country and category must be the final term for left-to-right
return len(self.seq) > 1 and \
self.seq[-1].ttype in (qmod.TokenType.COUNTRY, qmod.TokenType.NEAR_ITEM)
def appendable(self, ttype: qmod.TokenType) -> Optional[int]:
""" Check if the give token type is appendable to the existing sequence.
Returns None if the token type is not appendable, otherwise the
new direction of the sequence after adding such a type. The
token is not added.
"""
if ttype == qmod.TokenType.WORD:
return None
if not self.seq:
# Append unconditionally to the empty list
if ttype == qmod.TokenType.COUNTRY:
return -1
if ttype in (qmod.TokenType.HOUSENUMBER, qmod.TokenType.QUALIFIER):
return 1
return self.direction
# Name tokens are always acceptable and don't change direction
if ttype == qmod.TokenType.PARTIAL:
# qualifiers cannot appear in the middle of the query. They need
# to be near the next phrase.
if self.direction == -1 \
and any(t.ttype == qmod.TokenType.QUALIFIER for t in self.seq[:-1]):
return None
return self.direction
# Other tokens may only appear once
if self.has_types(ttype):
return None
if ttype == qmod.TokenType.HOUSENUMBER:
if self.direction == 1:
if len(self.seq) == 1 and self.seq[0].ttype == qmod.TokenType.QUALIFIER:
return None
if len(self.seq) > 2 \
or self.has_types(qmod.TokenType.POSTCODE, qmod.TokenType.COUNTRY):
return None # direction left-to-right: housenumber must come before anything
elif self.direction == -1 \
or self.has_types(qmod.TokenType.POSTCODE, qmod.TokenType.COUNTRY):
return -1 # force direction right-to-left if after other terms
return self.direction
if ttype == qmod.TokenType.POSTCODE:
if self.direction == -1:
if self.has_types(qmod.TokenType.HOUSENUMBER, qmod.TokenType.QUALIFIER):
return None
return -1
if self.direction == 1:
return None if self.has_types(qmod.TokenType.COUNTRY) else 1
if self.has_types(qmod.TokenType.HOUSENUMBER, qmod.TokenType.QUALIFIER):
return 1
return self.direction
if ttype == qmod.TokenType.COUNTRY:
return None if self.direction == -1 else 1
if ttype == qmod.TokenType.NEAR_ITEM:
return self.direction
if ttype == qmod.TokenType.QUALIFIER:
if self.direction == 1:
if (len(self.seq) == 1
and self.seq[0].ttype in (qmod.TokenType.PARTIAL, qmod.TokenType.NEAR_ITEM)) \
or (len(self.seq) == 2
and self.seq[0].ttype == qmod.TokenType.NEAR_ITEM
and self.seq[1].ttype == qmod.TokenType.PARTIAL):
return 1
return None
if self.direction == -1:
return -1
tempseq = self.seq[1:] if self.seq[0].ttype == qmod.TokenType.NEAR_ITEM else self.seq
if len(tempseq) == 0:
return 1
if len(tempseq) == 1 and self.seq[0].ttype == qmod.TokenType.HOUSENUMBER:
return None
if len(tempseq) > 1 or self.has_types(qmod.TokenType.POSTCODE, qmod.TokenType.COUNTRY):
return -1
return 0
return None
def advance(self, ttype: qmod.TokenType, end_pos: int,
btype: qmod.BreakType) -> Optional['_TokenSequence']:
""" Return a new token sequence state with the given token type
extended.
"""
newdir = self.appendable(ttype)
if newdir is None:
return None
if not self.seq:
newseq = [TypedRange(ttype, qmod.TokenRange(0, end_pos))]
new_penalty = 0.0
else:
last = self.seq[-1]
if btype != qmod.BreakType.PHRASE and last.ttype == ttype:
# extend the existing range
newseq = self.seq[:-1] + [TypedRange(ttype, last.trange.replace_end(end_pos))]
new_penalty = 0.0
else:
# start a new range
newseq = list(self.seq) + [TypedRange(ttype,
qmod.TokenRange(last.trange.end, end_pos))]
new_penalty = PENALTY_TOKENCHANGE[btype]
return _TokenSequence(newseq, newdir, self.penalty + new_penalty)
def _adapt_penalty_from_priors(self, priors: int, new_dir: int) -> bool:
if priors >= 2:
if self.direction == 0:
self.direction = new_dir
else:
if priors == 2:
self.penalty += 0.8
else:
return False
return True
def recheck_sequence(self) -> bool:
""" Check that the sequence is a fully valid token assignment
and adapt direction and penalties further if necessary.
This function catches some impossible assignments that need
forward context and can therefore not be excluded when building
the assignment.
"""
# housenumbers may not be further than 2 words from the beginning.
# If there are two words in front, give it a penalty.
hnrpos = next((i for i, tr in enumerate(self.seq)
if tr.ttype == qmod.TokenType.HOUSENUMBER),
None)
if hnrpos is not None:
if self.direction != -1:
priors = sum(1 for t in self.seq[:hnrpos] if t.ttype == qmod.TokenType.PARTIAL)
if not self._adapt_penalty_from_priors(priors, -1):
return False
if self.direction != 1:
priors = sum(1 for t in self.seq[hnrpos+1:] if t.ttype == qmod.TokenType.PARTIAL)
if not self._adapt_penalty_from_priors(priors, 1):
return False
if any(t.ttype == qmod.TokenType.NEAR_ITEM for t in self.seq):
self.penalty += 1.0
return True
def _get_assignments_postcode(self, base: TokenAssignment,
query_len: int) -> Iterator[TokenAssignment]:
""" Yield possible assignments of Postcode searches with an
address component.
"""
assert base.postcode is not None
if (base.postcode.start == 0 and self.direction != -1)\
or (base.postcode.end == query_len and self.direction != 1):
log().comment('postcode search')
# <address>,<postcode> should give preference to address search
if base.postcode.start == 0:
penalty = self.penalty
self.direction = -1 # name searches are only possible backwards
else:
penalty = self.penalty + 0.1
self.direction = 1 # name searches are only possible forwards
yield dataclasses.replace(base, penalty=penalty)
def _get_assignments_address_forward(self, base: TokenAssignment,
query: qmod.QueryStruct) -> Iterator[TokenAssignment]:
""" Yield possible assignments of address searches with
left-to-right reading.
"""
first = base.address[0]
log().comment('first word = name')
yield dataclasses.replace(base, penalty=self.penalty,
name=first, address=base.address[1:])
# To paraphrase:
# * if another name term comes after the first one and before the
# housenumber
# * a qualifier comes after the name
# * the containing phrase is strictly typed
if (base.housenumber and first.end < base.housenumber.start)\
or (base.qualifier and base.qualifier > first)\
or (query.nodes[first.start].ptype != qmod.PhraseType.NONE):
return
penalty = self.penalty
# Penalty for:
# * <name>, <street>, <housenumber> , ...
# * queries that are comma-separated
if (base.housenumber and base.housenumber > first) or len(query.source) > 1:
penalty += 0.25
for i in range(first.start + 1, first.end):
name, addr = first.split(i)
log().comment(f'split first word = name ({i - first.start})')
yield dataclasses.replace(base, name=name, address=[addr] + base.address[1:],
penalty=penalty + PENALTY_TOKENCHANGE[query.nodes[i].btype])
def _get_assignments_address_backward(self, base: TokenAssignment,
query: qmod.QueryStruct) -> Iterator[TokenAssignment]:
""" Yield possible assignments of address searches with
right-to-left reading.
"""
last = base.address[-1]
if self.direction == -1 or len(base.address) > 1:
log().comment('last word = name')
yield dataclasses.replace(base, penalty=self.penalty,
name=last, address=base.address[:-1])
# To paraphrase:
# * if another name term comes before the last one and after the
# housenumber
# * a qualifier comes before the name
# * the containing phrase is strictly typed
if (base.housenumber and last.start > base.housenumber.end)\
or (base.qualifier and base.qualifier < last)\
or (query.nodes[last.start].ptype != qmod.PhraseType.NONE):
return
penalty = self.penalty
if base.housenumber and base.housenumber < last:
penalty += 0.4
if len(query.source) > 1:
penalty += 0.25
for i in range(last.start + 1, last.end):
addr, name = last.split(i)
log().comment(f'split last word = name ({i - last.start})')
yield dataclasses.replace(base, name=name, address=base.address[:-1] + [addr],
penalty=penalty + PENALTY_TOKENCHANGE[query.nodes[i].btype])
def get_assignments(self, query: qmod.QueryStruct) -> Iterator[TokenAssignment]:
""" Yield possible assignments for the current sequence.
This function splits up general name assignments into name
and address and yields all possible variants of that.
"""
base = TokenAssignment.from_ranges(self.seq)
num_addr_tokens = sum(t.end - t.start for t in base.address)
if num_addr_tokens > 50:
return
# Postcode search (postcode-only search is covered in next case)
if base.postcode is not None and base.address:
yield from self._get_assignments_postcode(base, query.num_token_slots())
# Postcode or country-only search
if not base.address:
if not base.housenumber and (base.postcode or base.country or base.near_item):
log().comment('postcode/country search')
yield dataclasses.replace(base, penalty=self.penalty)
else:
# <postcode>,<address> should give preference to postcode search
if base.postcode and base.postcode.start == 0:
self.penalty += 0.1
# Right-to-left reading of the address
if self.direction != -1:
yield from self._get_assignments_address_forward(base, query)
# Left-to-right reading of the address
if self.direction != 1:
yield from self._get_assignments_address_backward(base, query)
# variant for special housenumber searches
if base.housenumber and not base.qualifier:
yield dataclasses.replace(base, penalty=self.penalty)
def yield_token_assignments(query: qmod.QueryStruct) -> Iterator[TokenAssignment]:
""" Return possible word type assignments to word positions.
The assignments are computed from the concrete tokens listed
in the tokenized query.
The result includes the penalty for transitions from one word type to
another. It does not include penalties for transitions within a
type.
"""
todo = [_TokenSequence([], direction=0 if query.source[0].ptype == qmod.PhraseType.NONE else 1)]
while todo:
state = todo.pop()
node = query.nodes[state.end_pos]
for tlist in node.starting:
newstate = state.advance(tlist.ttype, tlist.end, node.btype)
if newstate is not None:
if newstate.end_pos == query.num_token_slots():
if newstate.recheck_sequence():
log().var_dump('Assignment', newstate)
yield from newstate.get_assignments(query)
elif not newstate.is_final():
todo.append(newstate)

View File

View File

@@ -0,0 +1,194 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Server implementation using the falcon webserver framework.
"""
from typing import Optional, Mapping, cast, Any, List
from pathlib import Path
import datetime as dt
import asyncio
from falcon.asgi import App, Request, Response
from nominatim_core.config import Configuration
from ...core import NominatimAPIAsync
from ... import v1 as api_impl
from ... import logging as loglib
class HTTPNominatimError(Exception):
""" A special exception class for errors raised during processing.
"""
def __init__(self, msg: str, status: int, content_type: str) -> None:
self.msg = msg
self.status = status
self.content_type = content_type
async def nominatim_error_handler(req: Request, resp: Response, #pylint: disable=unused-argument
exception: HTTPNominatimError,
_: Any) -> None:
""" Special error handler that passes message and content type as
per exception info.
"""
resp.status = exception.status
resp.text = exception.msg
resp.content_type = exception.content_type
async def timeout_error_handler(req: Request, resp: Response, #pylint: disable=unused-argument
exception: TimeoutError, #pylint: disable=unused-argument
_: Any) -> None:
""" Special error handler that passes message and content type as
per exception info.
"""
resp.status = 503
loglib.log().comment('Aborted: Query took too long to process.')
logdata = loglib.get_and_disable()
if logdata:
resp.text = logdata
resp.content_type = 'text/html; charset=utf-8'
else:
resp.text = "Query took too long to process."
resp.content_type = 'text/plain; charset=utf-8'
class ParamWrapper(api_impl.ASGIAdaptor):
""" Adaptor class for server glue to Falcon framework.
"""
def __init__(self, req: Request, resp: Response,
config: Configuration) -> None:
self.request = req
self.response = resp
self._config = config
def get(self, name: str, default: Optional[str] = None) -> Optional[str]:
return cast(Optional[str], self.request.get_param(name, default=default))
def get_header(self, name: str, default: Optional[str] = None) -> Optional[str]:
return cast(Optional[str], self.request.get_header(name, default=default))
def error(self, msg: str, status: int = 400) -> HTTPNominatimError:
return HTTPNominatimError(msg, status, self.content_type)
def create_response(self, status: int, output: str, num_results: int) -> None:
self.response.context.num_results = num_results
self.response.status = status
self.response.text = output
self.response.content_type = self.content_type
def base_uri(self) -> str:
return cast (str, self.request.forwarded_prefix)
def config(self) -> Configuration:
return self._config
class EndpointWrapper:
""" Converter for server glue endpoint functions to Falcon request handlers.
"""
def __init__(self, name: str, func: api_impl.EndpointFunc, api: NominatimAPIAsync) -> None:
self.name = name
self.func = func
self.api = api
async def on_get(self, req: Request, resp: Response) -> None:
""" Implementation of the endpoint.
"""
await self.func(self.api, ParamWrapper(req, resp, self.api.config))
class FileLoggingMiddleware:
""" Middleware to log selected requests into a file.
"""
def __init__(self, file_name: str):
self.fd = open(file_name, 'a', buffering=1, encoding='utf8') # pylint: disable=R1732
async def process_request(self, req: Request, _: Response) -> None:
""" Callback before the request starts timing.
"""
req.context.start = dt.datetime.now(tz=dt.timezone.utc)
async def process_response(self, req: Request, resp: Response,
resource: Optional[EndpointWrapper],
req_succeeded: bool) -> None:
""" Callback after requests writes to the logfile. It only
writes logs for successful requests for search, reverse and lookup.
"""
if not req_succeeded or resource is None or resp.status != 200\
or resource.name not in ('reverse', 'search', 'lookup', 'details'):
return
finish = dt.datetime.now(tz=dt.timezone.utc)
duration = (finish - req.context.start).total_seconds()
params = req.scope['query_string'].decode('utf8')
start = req.context.start.replace(tzinfo=None)\
.isoformat(sep=' ', timespec='milliseconds')
self.fd.write(f"[{start}] "
f"{duration:.4f} {getattr(resp.context, 'num_results', 0)} "
f'{resource.name} "{params}"\n')
class APIShutdown:
""" Middleware that closes any open database connections.
"""
def __init__(self, api: NominatimAPIAsync) -> None:
self.api = api
async def process_shutdown(self, *_: Any) -> None:
"""Process the ASGI lifespan shutdown event.
"""
await self.api.close()
def get_application(project_dir: Path,
environ: Optional[Mapping[str, str]] = None) -> App:
""" Create a Nominatim Falcon ASGI application.
"""
api = NominatimAPIAsync(project_dir, environ)
middleware: List[object] = [APIShutdown(api)]
log_file = api.config.LOG_FILE
if log_file:
middleware.append(FileLoggingMiddleware(log_file))
app = App(cors_enable=api.config.get_bool('CORS_NOACCESSCONTROL'),
middleware=middleware)
app.add_error_handler(HTTPNominatimError, nominatim_error_handler)
app.add_error_handler(TimeoutError, timeout_error_handler)
# different from TimeoutError in Python <= 3.10
app.add_error_handler(asyncio.TimeoutError, timeout_error_handler)
legacy_urls = api.config.get_bool('SERVE_LEGACY_URLS')
for name, func in api_impl.ROUTES:
endpoint = EndpointWrapper(name, func, api)
app.add_route(f"/{name}", endpoint)
if legacy_urls:
app.add_route(f"/{name}.php", endpoint)
return app
def run_wsgi() -> App:
""" Entry point for uvicorn.
Make sure uvicorn is run from the project directory.
"""
return get_application(Path('.'))

View File

@@ -0,0 +1,174 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Server implementation using the starlette webserver framework.
"""
from typing import Any, Optional, Mapping, Callable, cast, Coroutine, Dict, Awaitable
from pathlib import Path
import datetime as dt
import asyncio
from starlette.applications import Starlette
from starlette.routing import Route
from starlette.exceptions import HTTPException
from starlette.responses import Response, PlainTextResponse, HTMLResponse
from starlette.requests import Request
from starlette.middleware import Middleware
from starlette.middleware.base import BaseHTTPMiddleware, RequestResponseEndpoint
from starlette.middleware.cors import CORSMiddleware
from nominatim_core.config import Configuration
from ...core import NominatimAPIAsync
from ... import v1 as api_impl
from ... import logging as loglib
class ParamWrapper(api_impl.ASGIAdaptor):
""" Adaptor class for server glue to Starlette framework.
"""
def __init__(self, request: Request) -> None:
self.request = request
def get(self, name: str, default: Optional[str] = None) -> Optional[str]:
return self.request.query_params.get(name, default=default)
def get_header(self, name: str, default: Optional[str] = None) -> Optional[str]:
return self.request.headers.get(name, default)
def error(self, msg: str, status: int = 400) -> HTTPException:
return HTTPException(status, detail=msg,
headers={'content-type': self.content_type})
def create_response(self, status: int, output: str, num_results: int) -> Response:
self.request.state.num_results = num_results
return Response(output, status_code=status, media_type=self.content_type)
def base_uri(self) -> str:
scheme = self.request.url.scheme
host = self.request.url.hostname
port = self.request.url.port
root = self.request.scope['root_path']
if (scheme == 'http' and port == 80) or (scheme == 'https' and port == 443):
port = None
if port is not None:
return f"{scheme}://{host}:{port}{root}"
return f"{scheme}://{host}{root}"
def config(self) -> Configuration:
return cast(Configuration, self.request.app.state.API.config)
def _wrap_endpoint(func: api_impl.EndpointFunc)\
-> Callable[[Request], Coroutine[Any, Any, Response]]:
async def _callback(request: Request) -> Response:
return cast(Response, await func(request.app.state.API, ParamWrapper(request)))
return _callback
class FileLoggingMiddleware(BaseHTTPMiddleware):
""" Middleware to log selected requests into a file.
"""
def __init__(self, app: Starlette, file_name: str = ''):
super().__init__(app)
self.fd = open(file_name, 'a', buffering=1, encoding='utf8') # pylint: disable=R1732
async def dispatch(self, request: Request,
call_next: RequestResponseEndpoint) -> Response:
start = dt.datetime.now(tz=dt.timezone.utc)
response = await call_next(request)
if response.status_code != 200:
return response
finish = dt.datetime.now(tz=dt.timezone.utc)
for endpoint in ('reverse', 'search', 'lookup', 'details'):
if request.url.path.startswith('/' + endpoint):
qtype = endpoint
break
else:
return response
duration = (finish - start).total_seconds()
params = request.scope['query_string'].decode('utf8')
self.fd.write(f"[{start.replace(tzinfo=None).isoformat(sep=' ', timespec='milliseconds')}] "
f"{duration:.4f} {getattr(request.state, 'num_results', 0)} "
f'{qtype} "{params}"\n')
return response
async def timeout_error(request: Request, #pylint: disable=unused-argument
_: Exception) -> Response:
""" Error handler for query timeouts.
"""
loglib.log().comment('Aborted: Query took too long to process.')
logdata = loglib.get_and_disable()
if logdata:
return HTMLResponse(logdata)
return PlainTextResponse("Query took too long to process.", status_code=503)
def get_application(project_dir: Path,
environ: Optional[Mapping[str, str]] = None,
debug: bool = True) -> Starlette:
""" Create a Nominatim falcon ASGI application.
"""
config = Configuration(project_dir, environ)
routes = []
legacy_urls = config.get_bool('SERVE_LEGACY_URLS')
for name, func in api_impl.ROUTES:
endpoint = _wrap_endpoint(func)
routes.append(Route(f"/{name}", endpoint=endpoint))
if legacy_urls:
routes.append(Route(f"/{name}.php", endpoint=endpoint))
middleware = []
if config.get_bool('CORS_NOACCESSCONTROL'):
middleware.append(Middleware(CORSMiddleware,
allow_origins=['*'],
allow_methods=['GET', 'OPTIONS'],
max_age=86400))
log_file = config.LOG_FILE
if log_file:
middleware.append(Middleware(FileLoggingMiddleware, file_name=log_file))
exceptions: Dict[Any, Callable[[Request, Exception], Awaitable[Response]]] = {
TimeoutError: timeout_error,
asyncio.TimeoutError: timeout_error
}
async def _shutdown() -> None:
await app.state.API.close()
app = Starlette(debug=debug, routes=routes, middleware=middleware,
exception_handlers=exceptions,
on_shutdown=[_shutdown])
app.state.API = NominatimAPIAsync(project_dir, environ)
return app
def run_wsgi() -> Starlette:
""" Entry point for uvicorn.
"""
return get_application(Path('.'), debug=False)

View File

@@ -0,0 +1,221 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Custom functions and expressions for SQLAlchemy.
"""
from __future__ import annotations
from typing import Any
import sqlalchemy as sa
from sqlalchemy.ext.compiler import compiles
from nominatim_core.typing import SaColumn
# pylint: disable=all
class PlacexGeometryReverseLookuppolygon(sa.sql.functions.GenericFunction[Any]):
""" Check for conditions that allow partial index use on
'idx_placex_geometry_reverse_lookupPolygon'.
Needs to be constant, so that the query planner picks them up correctly
in prepared statements.
"""
name = 'PlacexGeometryReverseLookuppolygon'
inherit_cache = True
@compiles(PlacexGeometryReverseLookuppolygon) # type: ignore[no-untyped-call, misc]
def _default_intersects(element: PlacexGeometryReverseLookuppolygon,
compiler: 'sa.Compiled', **kw: Any) -> str:
return ("(ST_GeometryType(placex.geometry) in ('ST_Polygon', 'ST_MultiPolygon')"
" AND placex.rank_address between 4 and 25"
" AND placex.type != 'postcode'"
" AND placex.name is not null"
" AND placex.indexed_status = 0"
" AND placex.linked_place_id is null)")
@compiles(PlacexGeometryReverseLookuppolygon, 'sqlite') # type: ignore[no-untyped-call, misc]
def _sqlite_intersects(element: PlacexGeometryReverseLookuppolygon,
compiler: 'sa.Compiled', **kw: Any) -> str:
return ("(ST_GeometryType(placex.geometry) in ('POLYGON', 'MULTIPOLYGON')"
" AND placex.rank_address between 4 and 25"
" AND placex.type != 'postcode'"
" AND placex.name is not null"
" AND placex.indexed_status = 0"
" AND placex.linked_place_id is null)")
class IntersectsReverseDistance(sa.sql.functions.GenericFunction[Any]):
name = 'IntersectsReverseDistance'
inherit_cache = True
def __init__(self, table: sa.Table, geom: SaColumn) -> None:
super().__init__(table.c.geometry,
table.c.rank_search, geom)
self.tablename = table.name
@compiles(IntersectsReverseDistance) # type: ignore[no-untyped-call, misc]
def default_reverse_place_diameter(element: IntersectsReverseDistance,
compiler: 'sa.Compiled', **kw: Any) -> str:
table = element.tablename
return f"({table}.rank_address between 4 and 25"\
f" AND {table}.type != 'postcode'"\
f" AND {table}.name is not null"\
f" AND {table}.linked_place_id is null"\
f" AND {table}.osm_type = 'N'" + \
" AND ST_Buffer(%s, reverse_place_diameter(%s)) && %s)" % \
tuple(map(lambda c: compiler.process(c, **kw), element.clauses))
@compiles(IntersectsReverseDistance, 'sqlite') # type: ignore[no-untyped-call, misc]
def sqlite_reverse_place_diameter(element: IntersectsReverseDistance,
compiler: 'sa.Compiled', **kw: Any) -> str:
geom1, rank, geom2 = list(element.clauses)
table = element.tablename
return (f"({table}.rank_address between 4 and 25"\
f" AND {table}.type != 'postcode'"\
f" AND {table}.name is not null"\
f" AND {table}.linked_place_id is null"\
f" AND {table}.osm_type = 'N'"\
" AND MbrIntersects(%s, ST_Expand(%s, 14.0 * exp(-0.2 * %s) - 0.03))"\
f" AND {table}.place_id IN"\
" (SELECT place_id FROM placex_place_node_areas"\
" WHERE ROWID IN (SELECT ROWID FROM SpatialIndex"\
" WHERE f_table_name = 'placex_place_node_areas'"\
" AND search_frame = %s)))") % (
compiler.process(geom1, **kw),
compiler.process(geom2, **kw),
compiler.process(rank, **kw),
compiler.process(geom2, **kw))
class IsBelowReverseDistance(sa.sql.functions.GenericFunction[Any]):
name = 'IsBelowReverseDistance'
inherit_cache = True
@compiles(IsBelowReverseDistance) # type: ignore[no-untyped-call, misc]
def default_is_below_reverse_distance(element: IsBelowReverseDistance,
compiler: 'sa.Compiled', **kw: Any) -> str:
dist, rank = list(element.clauses)
return "%s < reverse_place_diameter(%s)" % (compiler.process(dist, **kw),
compiler.process(rank, **kw))
@compiles(IsBelowReverseDistance, 'sqlite') # type: ignore[no-untyped-call, misc]
def sqlite_is_below_reverse_distance(element: IsBelowReverseDistance,
compiler: 'sa.Compiled', **kw: Any) -> str:
dist, rank = list(element.clauses)
return "%s < 14.0 * exp(-0.2 * %s) - 0.03" % (compiler.process(dist, **kw),
compiler.process(rank, **kw))
class IsAddressPoint(sa.sql.functions.GenericFunction[Any]):
name = 'IsAddressPoint'
inherit_cache = True
def __init__(self, table: sa.Table) -> None:
super().__init__(table.c.rank_address,
table.c.housenumber, table.c.name)
@compiles(IsAddressPoint) # type: ignore[no-untyped-call, misc]
def default_is_address_point(element: IsAddressPoint,
compiler: 'sa.Compiled', **kw: Any) -> str:
rank, hnr, name = list(element.clauses)
return "(%s = 30 AND (%s IS NOT NULL OR %s ? 'addr:housename'))" % (
compiler.process(rank, **kw),
compiler.process(hnr, **kw),
compiler.process(name, **kw))
@compiles(IsAddressPoint, 'sqlite') # type: ignore[no-untyped-call, misc]
def sqlite_is_address_point(element: IsAddressPoint,
compiler: 'sa.Compiled', **kw: Any) -> str:
rank, hnr, name = list(element.clauses)
return "(%s = 30 AND coalesce(%s, json_extract(%s, '$.addr:housename')) IS NOT NULL)" % (
compiler.process(rank, **kw),
compiler.process(hnr, **kw),
compiler.process(name, **kw))
class CrosscheckNames(sa.sql.functions.GenericFunction[Any]):
""" Check if in the given list of names in parameters 1 any of the names
from the JSON array in parameter 2 are contained.
"""
name = 'CrosscheckNames'
inherit_cache = True
@compiles(CrosscheckNames) # type: ignore[no-untyped-call, misc]
def compile_crosscheck_names(element: CrosscheckNames,
compiler: 'sa.Compiled', **kw: Any) -> str:
arg1, arg2 = list(element.clauses)
return "coalesce(avals(%s) && ARRAY(SELECT * FROM json_array_elements_text(%s)), false)" % (
compiler.process(arg1, **kw), compiler.process(arg2, **kw))
@compiles(CrosscheckNames, 'sqlite') # type: ignore[no-untyped-call, misc]
def compile_sqlite_crosscheck_names(element: CrosscheckNames,
compiler: 'sa.Compiled', **kw: Any) -> str:
arg1, arg2 = list(element.clauses)
return "EXISTS(SELECT *"\
" FROM json_each(%s) as name, json_each(%s) as match_name"\
" WHERE name.value = match_name.value)"\
% (compiler.process(arg1, **kw), compiler.process(arg2, **kw))
class JsonArrayEach(sa.sql.functions.GenericFunction[Any]):
""" Return elements of a json array as a set.
"""
name = 'JsonArrayEach'
inherit_cache = True
@compiles(JsonArrayEach) # type: ignore[no-untyped-call, misc]
def default_json_array_each(element: JsonArrayEach, compiler: 'sa.Compiled', **kw: Any) -> str:
return "json_array_elements(%s)" % compiler.process(element.clauses, **kw)
@compiles(JsonArrayEach, 'sqlite') # type: ignore[no-untyped-call, misc]
def sqlite_json_array_each(element: JsonArrayEach, compiler: 'sa.Compiled', **kw: Any) -> str:
return "json_each(%s)" % compiler.process(element.clauses, **kw)
class Greatest(sa.sql.functions.GenericFunction[Any]):
""" Function to compute maximum of all its input parameters.
"""
name = 'greatest'
inherit_cache = True
@compiles(Greatest, 'sqlite') # type: ignore[no-untyped-call, misc]
def sqlite_greatest(element: Greatest, compiler: 'sa.Compiled', **kw: Any) -> str:
return "max(%s)" % compiler.process(element.clauses, **kw)
class RegexpWord(sa.sql.functions.GenericFunction[Any]):
""" Check if a full word is in a given string.
"""
name = 'RegexpWord'
inherit_cache = True
@compiles(RegexpWord, 'postgresql') # type: ignore[no-untyped-call, misc]
def postgres_regexp_nocase(element: RegexpWord, compiler: 'sa.Compiled', **kw: Any) -> str:
arg1, arg2 = list(element.clauses)
return "%s ~* ('\\m(' || %s || ')\\M')::text" % (compiler.process(arg2, **kw), compiler.process(arg1, **kw))
@compiles(RegexpWord, 'sqlite') # type: ignore[no-untyped-call, misc]
def sqlite_regexp_nocase(element: RegexpWord, compiler: 'sa.Compiled', **kw: Any) -> str:
arg1, arg2 = list(element.clauses)
return "regexp('\\b(' || %s || ')\\b', %s)" % (compiler.process(arg1, **kw), compiler.process(arg2, **kw))

View File

@@ -0,0 +1,122 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Custom functions for SQLite.
"""
from typing import cast, Optional, Set, Any
import json
# pylint: disable=protected-access
def weigh_search(search_vector: Optional[str], rankings: str, default: float) -> float:
""" Custom weight function for search results.
"""
if search_vector is not None:
svec = [int(x) for x in search_vector.split(',')]
for rank in json.loads(rankings):
if all(r in svec for r in rank[1]):
return cast(float, rank[0])
return default
class ArrayIntersectFuzzy:
""" Compute the array of common elements of all input integer arrays.
Very large input parameters may be ignored to speed up
computation. Therefore, the result is a superset of common elements.
Input and output arrays are given as comma-separated lists.
"""
def __init__(self) -> None:
self.first = ''
self.values: Optional[Set[int]] = None
def step(self, value: Optional[str]) -> None:
""" Add the next array to the intersection.
"""
if value is not None:
if not self.first:
self.first = value
elif len(value) < 10000000:
if self.values is None:
self.values = {int(x) for x in self.first.split(',')}
self.values.intersection_update((int(x) for x in value.split(',')))
def finalize(self) -> str:
""" Return the final result.
"""
if self.values is not None:
return ','.join(map(str, self.values))
return self.first
class ArrayUnion:
""" Compute the set of all elements of the input integer arrays.
Input and output arrays are given as strings of comma-separated lists.
"""
def __init__(self) -> None:
self.values: Optional[Set[str]] = None
def step(self, value: Optional[str]) -> None:
""" Add the next array to the union.
"""
if value is not None:
if self.values is None:
self.values = set(value.split(','))
else:
self.values.update(value.split(','))
def finalize(self) -> str:
""" Return the final result.
"""
return '' if self.values is None else ','.join(self.values)
def array_contains(container: Optional[str], containee: Optional[str]) -> Optional[bool]:
""" Is the array 'containee' completely contained in array 'container'.
"""
if container is None or containee is None:
return None
vset = container.split(',')
return all(v in vset for v in containee.split(','))
def array_pair_contains(container1: Optional[str], container2: Optional[str],
containee: Optional[str]) -> Optional[bool]:
""" Is the array 'containee' completely contained in the union of
array 'container1' and array 'container2'.
"""
if container1 is None or container2 is None or containee is None:
return None
vset = container1.split(',') + container2.split(',')
return all(v in vset for v in containee.split(','))
def install_custom_functions(conn: Any) -> None:
""" Install helper functions for Nominatim into the given SQLite
database connection.
"""
conn.create_function('weigh_search', 3, weigh_search, deterministic=True)
conn.create_function('array_contains', 2, array_contains, deterministic=True)
conn.create_function('array_pair_contains', 3, array_pair_contains, deterministic=True)
_create_aggregate(conn, 'array_intersect_fuzzy', 1, ArrayIntersectFuzzy)
_create_aggregate(conn, 'array_union', 1, ArrayUnion)
async def _make_aggregate(aioconn: Any, *args: Any) -> None:
await aioconn._execute(aioconn._conn.create_aggregate, *args)
def _create_aggregate(conn: Any, name: str, nargs: int, aggregate: Any) -> None:
try:
conn.await_(_make_aggregate(conn._connection, name, nargs, aggregate))
except Exception as error: # pylint: disable=broad-exception-caught
conn._handle_exception(error)

View File

@@ -0,0 +1,51 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Classes and function related to status call.
"""
from typing import Optional
import datetime as dt
import dataclasses
import sqlalchemy as sa
from .connection import SearchConnection
from .version import NOMINATIM_API_VERSION
@dataclasses.dataclass
class StatusResult:
""" Result of a call to the status API.
"""
status: int
message: str
software_version = NOMINATIM_API_VERSION
data_updated: Optional[dt.datetime] = None
database_version: Optional[str] = None
async def get_status(conn: SearchConnection) -> StatusResult:
""" Execute a status API call.
"""
status = StatusResult(0, 'OK')
# Last update date
sql = sa.select(conn.t.import_status.c.lastimportdate).limit(1)
status.data_updated = await conn.scalar(sql)
if status.data_updated is not None:
if status.data_updated.tzinfo is None:
status.data_updated = status.data_updated.replace(tzinfo=dt.timezone.utc)
else:
status.data_updated = status.data_updated.astimezone(dt.timezone.utc)
# Database version
try:
status.database_version = await conn.get_property('database_version')
except ValueError:
pass
return status

550
src/nominatim_api/types.py Normal file
View File

@@ -0,0 +1,550 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Complex datatypes used by the Nominatim API.
"""
from typing import Optional, Union, Tuple, NamedTuple, TypeVar, Type, Dict, \
Any, List, Sequence
from collections import abc
import dataclasses
import enum
import math
from struct import unpack
from binascii import unhexlify
from nominatim_core.errors import UsageError
from .localization import Locales
# pylint: disable=no-member,too-many-boolean-expressions,too-many-instance-attributes
@dataclasses.dataclass
class PlaceID:
""" Reference a place by Nominatim's internal ID.
A PlaceID may reference place from the main table placex, from
the interpolation tables or the postcode tables. Place IDs are not
stable between installations. You may use this type theefore only
with place IDs obtained from the same database.
"""
place_id: int
"""
The internal ID of the place to reference.
"""
@dataclasses.dataclass
class OsmID:
""" Reference a place by its OSM ID and potentially the basic category.
The OSM ID may refer to places in the main table placex and OSM
interpolation lines.
"""
osm_type: str
""" OSM type of the object. Must be one of `N`(node), `W`(way) or
`R`(relation).
"""
osm_id: int
""" The OSM ID of the object.
"""
osm_class: Optional[str] = None
""" The same OSM object may appear multiple times in the database under
different categories. The optional class parameter allows to distinguish
the different categories and corresponds to the key part of the category.
If there are multiple objects in the database and `osm_class` is
left out, then one of the objects is returned at random.
"""
def __post_init__(self) -> None:
if self.osm_type not in ('N', 'W', 'R'):
raise ValueError(f"Illegal OSM type '{self.osm_type}'. Must be one of N, W, R.")
PlaceRef = Union[PlaceID, OsmID]
class Point(NamedTuple):
""" A geographic point in WGS84 projection.
"""
x: float
y: float
@property
def lat(self) -> float:
""" Return the latitude of the point.
"""
return self.y
@property
def lon(self) -> float:
""" Return the longitude of the point.
"""
return self.x
def to_geojson(self) -> str:
""" Return the point in GeoJSON format.
"""
return f'{{"type": "Point","coordinates": [{self.x}, {self.y}]}}'
@staticmethod
def from_wkb(wkb: Union[str, bytes]) -> 'Point':
""" Create a point from EWKB as returned from the database.
"""
if isinstance(wkb, str):
wkb = unhexlify(wkb)
if len(wkb) != 25:
raise ValueError(f"Point wkb has unexpected length {len(wkb)}")
if wkb[0] == 0:
gtype, srid, x, y = unpack('>iidd', wkb[1:])
elif wkb[0] == 1:
gtype, srid, x, y = unpack('<iidd', wkb[1:])
else:
raise ValueError("WKB has unknown endian value.")
if gtype != 0x20000001:
raise ValueError("WKB must be a point geometry.")
if srid != 4326:
raise ValueError("Only WGS84 WKB supported.")
return Point(x, y)
@staticmethod
def from_param(inp: Any) -> 'Point':
""" Create a point from an input parameter. The parameter
may be given as a point, a string or a sequence of
strings or floats. Raises a UsageError if the format is
not correct.
"""
if isinstance(inp, Point):
return inp
seq: Sequence[str]
if isinstance(inp, str):
seq = inp.split(',')
elif isinstance(inp, abc.Sequence):
seq = inp
if len(seq) != 2:
raise UsageError('Point parameter needs 2 coordinates.')
try:
x, y = filter(math.isfinite, map(float, seq))
except ValueError as exc:
raise UsageError('Point parameter needs to be numbers.') from exc
if x < -180.0 or x > 180.0 or y < -90.0 or y > 90.0:
raise UsageError('Point coordinates invalid.')
return Point(x, y)
def to_wkt(self) -> str:
""" Return the WKT representation of the point.
"""
return f'POINT({self.x} {self.y})'
AnyPoint = Union[Point, Tuple[float, float]]
WKB_BBOX_HEADER_LE = b'\x01\x03\x00\x00\x20\xE6\x10\x00\x00\x01\x00\x00\x00\x05\x00\x00\x00'
WKB_BBOX_HEADER_BE = b'\x00\x20\x00\x00\x03\x00\x00\x10\xe6\x00\x00\x00\x01\x00\x00\x00\x05'
class Bbox:
""" A bounding box in WGS84 projection.
The coordinates are available as an array in the 'coord'
property in the order (minx, miny, maxx, maxy).
"""
def __init__(self, minx: float, miny: float, maxx: float, maxy: float) -> None:
""" Create a new bounding box with the given coordinates in WGS84
projection.
"""
self.coords = (minx, miny, maxx, maxy)
@property
def minlat(self) -> float:
""" Southern-most latitude, corresponding to the minimum y coordinate.
"""
return self.coords[1]
@property
def maxlat(self) -> float:
""" Northern-most latitude, corresponding to the maximum y coordinate.
"""
return self.coords[3]
@property
def minlon(self) -> float:
""" Western-most longitude, corresponding to the minimum x coordinate.
"""
return self.coords[0]
@property
def maxlon(self) -> float:
""" Eastern-most longitude, corresponding to the maximum x coordinate.
"""
return self.coords[2]
@property
def area(self) -> float:
""" Return the area of the box in WGS84.
"""
return (self.coords[2] - self.coords[0]) * (self.coords[3] - self.coords[1])
def contains(self, pt: Point) -> bool:
""" Check if the point is inside or on the boundary of the box.
"""
return self.coords[0] <= pt[0] and self.coords[1] <= pt[1]\
and self.coords[2] >= pt[0] and self.coords[3] >= pt[1]
def to_wkt(self) -> str:
""" Return the WKT representation of the Bbox. This
is a simple polygon with four points.
"""
return 'POLYGON(({0} {1},{0} {3},{2} {3},{2} {1},{0} {1}))'\
.format(*self.coords) # pylint: disable=consider-using-f-string
@staticmethod
def from_wkb(wkb: Union[None, str, bytes]) -> 'Optional[Bbox]':
""" Create a Bbox from a bounding box polygon as returned by
the database. Returns `None` if the input value is None.
"""
if wkb is None:
return None
if isinstance(wkb, str):
wkb = unhexlify(wkb)
if len(wkb) != 97:
raise ValueError("WKB must be a bounding box polygon")
if wkb.startswith(WKB_BBOX_HEADER_LE):
x1, y1, _, _, x2, y2 = unpack('<dddddd', wkb[17:65])
elif wkb.startswith(WKB_BBOX_HEADER_BE):
x1, y1, _, _, x2, y2 = unpack('>dddddd', wkb[17:65])
else:
raise ValueError("WKB has wrong header")
return Bbox(min(x1, x2), min(y1, y2), max(x1, x2), max(y1, y2))
@staticmethod
def from_point(pt: Point, buffer: float) -> 'Bbox':
""" Return a Bbox around the point with the buffer added to all sides.
"""
return Bbox(pt[0] - buffer, pt[1] - buffer,
pt[0] + buffer, pt[1] + buffer)
@staticmethod
def from_param(inp: Any) -> 'Bbox':
""" Return a Bbox from an input parameter. The box may be
given as a Bbox, a string or a list or strings or integer.
Raises a UsageError if the format is incorrect.
"""
if isinstance(inp, Bbox):
return inp
seq: Sequence[str]
if isinstance(inp, str):
seq = inp.split(',')
elif isinstance(inp, abc.Sequence):
seq = inp
if len(seq) != 4:
raise UsageError('Bounding box parameter needs 4 coordinates.')
try:
x1, y1, x2, y2 = filter(math.isfinite, map(float, seq))
except ValueError as exc:
raise UsageError('Bounding box parameter needs to be numbers.') from exc
x1 = min(180, max(-180, x1))
x2 = min(180, max(-180, x2))
y1 = min(90, max(-90, y1))
y2 = min(90, max(-90, y2))
if x1 == x2 or y1 == y2:
raise UsageError('Bounding box with invalid parameters.')
return Bbox(min(x1, x2), min(y1, y2), max(x1, x2), max(y1, y2))
class GeometryFormat(enum.Flag):
""" All search functions support returning the full geometry of a place in
various formats. The internal geometry is converted by PostGIS to
the desired format and then returned as a string. It is possible to
request multiple formats at the same time.
"""
NONE = 0
""" No geometry requested. Alias for a empty flag.
"""
GEOJSON = enum.auto()
"""
[GeoJSON](https://geojson.org/) format
"""
KML = enum.auto()
"""
[KML](https://en.wikipedia.org/wiki/Keyhole_Markup_Language) format
"""
SVG = enum.auto()
"""
[SVG](http://www.w3.org/TR/SVG/paths.html) format
"""
TEXT = enum.auto()
"""
[WKT](https://en.wikipedia.org/wiki/Well-known_text_representation_of_geometry) format
"""
class DataLayer(enum.Flag):
""" The `DataLayer` flag type defines the layers that can be selected
for reverse and forward search.
"""
ADDRESS = enum.auto()
""" The address layer contains all places relevant for addresses:
fully qualified addresses with a house number (or a house name equivalent,
for some addresses) and places that can be part of an address like
roads, cities, states.
"""
POI = enum.auto()
""" Layer for points of interest like shops, restaurants but also
recycling bins or postboxes.
"""
RAILWAY = enum.auto()
""" Layer with railway features including tracks and other infrastructure.
Note that in Nominatim's standard configuration, only very few railway
features are imported into the database. Thus a custom configuration
is required to make full use of this layer.
"""
NATURAL = enum.auto()
""" Layer with natural features like rivers, lakes and mountains.
"""
MANMADE = enum.auto()
""" Layer with other human-made features and boundaries. This layer is
the catch-all and includes all features not covered by the other
layers. A typical example for this layer are national park boundaries.
"""
def format_country(cc: Any) -> List[str]:
""" Extract a list of country codes from the input which may be either
a string or list of strings. Filters out all values that are not
a two-letter string.
"""
clist: Sequence[str]
if isinstance(cc, str):
clist = cc.split(',')
elif isinstance(cc, abc.Sequence):
clist = cc
else:
raise UsageError("Parameter 'country' needs to be a comma-separated list "
"or a Python list of strings.")
return [cc.lower() for cc in clist if isinstance(cc, str) and len(cc) == 2]
def format_excluded(ids: Any) -> List[int]:
""" Extract a list of place ids from the input which may be either
a string or a list of strings or ints. Ignores empty value but
throws a UserError on anything that cannot be converted to int.
"""
plist: Sequence[str]
if isinstance(ids, str):
plist = [s.strip() for s in ids.split(',')]
elif isinstance(ids, abc.Sequence):
plist = ids
else:
raise UsageError("Parameter 'excluded' needs to be a comma-separated list "
"or a Python list of numbers.")
if not all(isinstance(i, int) or
(isinstance(i, str) and (not i or i.isdigit())) for i in plist):
raise UsageError("Parameter 'excluded' only takes place IDs.")
return [int(id) for id in plist if id] or [0]
def format_categories(categories: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
""" Extract a list of categories. Currently a noop.
"""
return categories
TParam = TypeVar('TParam', bound='LookupDetails') # pylint: disable=invalid-name
@dataclasses.dataclass
class LookupDetails:
""" Collection of parameters that define which kind of details are
returned with a lookup or details result.
"""
geometry_output: GeometryFormat = GeometryFormat.NONE
""" Add the full geometry of the place to the result. Multiple
formats may be selected. Note that geometries can become quite large.
"""
address_details: bool = False
""" Get detailed information on the places that make up the address
for the result.
"""
linked_places: bool = False
""" Get detailed information on the places that link to the result.
"""
parented_places: bool = False
""" Get detailed information on all places that this place is a parent
for, i.e. all places for which it provides the address details.
Only POI places can have parents.
"""
keywords: bool = False
""" Add information about the search terms used for this place.
"""
geometry_simplification: float = 0.0
""" Simplification factor for a geometry in degrees WGS. A factor of
0.0 means the original geometry is kept. The higher the value, the
more the geometry gets simplified.
"""
locales: Locales = Locales()
""" Preferred languages for localization of results.
"""
@classmethod
def from_kwargs(cls: Type[TParam], kwargs: Dict[str, Any]) -> TParam:
""" Load the data fields of the class from a dictionary.
Unknown entries in the dictionary are ignored, missing ones
get the default setting.
The function supports type checking and throws a UsageError
when the value does not fit.
"""
def _check_field(v: Any, field: 'dataclasses.Field[Any]') -> Any:
if v is None:
return field.default_factory() \
if field.default_factory != dataclasses.MISSING \
else field.default
if field.metadata and 'transform' in field.metadata:
return field.metadata['transform'](v)
if not isinstance(v, field.type):
raise UsageError(f"Parameter '{field.name}' needs to be of {field.type!s}.")
return v
return cls(**{f.name: _check_field(kwargs[f.name], f)
for f in dataclasses.fields(cls) if f.name in kwargs})
@dataclasses.dataclass
class ReverseDetails(LookupDetails):
""" Collection of parameters for the reverse call.
"""
max_rank: int = dataclasses.field(default=30,
metadata={'transform': lambda v: max(0, min(v, 30))}
)
""" Highest address rank to return.
"""
layers: DataLayer = DataLayer.ADDRESS | DataLayer.POI
""" Filter which kind of data to include.
"""
@dataclasses.dataclass
class SearchDetails(LookupDetails):
""" Collection of parameters for the search call.
"""
max_results: int = 10
""" Maximum number of results to be returned. The actual number of results
may be less.
"""
min_rank: int = dataclasses.field(default=0,
metadata={'transform': lambda v: max(0, min(v, 30))}
)
""" Lowest address rank to return.
"""
max_rank: int = dataclasses.field(default=30,
metadata={'transform': lambda v: max(0, min(v, 30))}
)
""" Highest address rank to return.
"""
layers: Optional[DataLayer] = dataclasses.field(default=None,
metadata={'transform': lambda r : r})
""" Filter which kind of data to include. When 'None' (the default) then
filtering by layers is disabled.
"""
countries: List[str] = dataclasses.field(default_factory=list,
metadata={'transform': format_country})
""" Restrict search results to the given countries. An empty list (the
default) will disable this filter.
"""
excluded: List[int] = dataclasses.field(default_factory=list,
metadata={'transform': format_excluded})
""" List of OSM objects to exclude from the results. Currently only
works when the internal place ID is given.
An empty list (the default) will disable this filter.
"""
viewbox: Optional[Bbox] = dataclasses.field(default=None,
metadata={'transform': Bbox.from_param})
""" Focus the search on a given map area.
"""
bounded_viewbox: bool = False
""" Use 'viewbox' as a filter and restrict results to places within the
given area.
"""
near: Optional[Point] = dataclasses.field(default=None,
metadata={'transform': Point.from_param})
""" Order results by distance to the given point.
"""
near_radius: Optional[float] = dataclasses.field(default=None,
metadata={'transform': lambda r : r})
""" Use near point as a filter and drop results outside the given
radius. Radius is given in degrees WSG84.
"""
categories: List[Tuple[str, str]] = dataclasses.field(default_factory=list,
metadata={'transform': format_categories})
""" Restrict search to places with one of the given class/type categories.
An empty list (the default) will disable this filter.
"""
viewbox_x2: Optional[Bbox] = None
def __post_init__(self) -> None:
if self.viewbox is not None:
xext = (self.viewbox.maxlon - self.viewbox.minlon)/2
yext = (self.viewbox.maxlat - self.viewbox.minlat)/2
self.viewbox_x2 = Bbox(self.viewbox.minlon - xext, self.viewbox.minlat - yext,
self.viewbox.maxlon + xext, self.viewbox.maxlat + yext)
def restrict_min_max_rank(self, new_min: int, new_max: int) -> None:
""" Change the min_rank and max_rank fields to respect the
given boundaries.
"""
assert new_min <= new_max
self.min_rank = max(self.min_rank, new_min)
self.max_rank = min(self.max_rank, new_max)
def is_impossible(self) -> bool:
""" Check if the parameter configuration is contradictionary and
cannot yield any results.
"""
return (self.min_rank > self.max_rank
or (self.bounded_viewbox
and self.viewbox is not None and self.near is not None
and self.viewbox.contains(self.near))
or (self.layers is not None and not self.layers)
or (self.max_rank <= 4 and
self.layers is not None and not self.layers & DataLayer.ADDRESS))
def layer_enabled(self, layer: DataLayer) -> bool:
""" Check if the given layer has been chosen. Also returns
true when layer restriction has been disabled completely.
"""
return self.layers is None or bool(self.layers & layer)

View File

@@ -0,0 +1,21 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Implementation of API version v1 (aka the legacy version).
"""
#pylint: disable=useless-import-alias
from .server_glue import (ASGIAdaptor as ASGIAdaptor,
EndpointFunc as EndpointFunc,
ROUTES as ROUTES)
from . import format as _format
list_formats = _format.dispatch.list_formats
supports_format = _format.dispatch.supports_format
format_result = _format.dispatch.format_result

View File

@@ -0,0 +1,201 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Hard-coded information about tag categories.
These tables have been copied verbatim from the old PHP code. For future
version a more flexible formatting is required.
"""
from typing import Tuple, Optional, Mapping, Union
from ..results import ReverseResult, SearchResult
from ..types import Bbox
def get_label_tag(category: Tuple[str, str], extratags: Optional[Mapping[str, str]],
rank: int, country: Optional[str]) -> str:
""" Create a label tag for the given place that can be used as an XML name.
"""
if rank < 26 and extratags and 'place' in extratags:
label = extratags['place']
elif rank < 26 and extratags and 'linked_place' in extratags:
label = extratags['linked_place']
elif category == ('boundary', 'administrative'):
label = ADMIN_LABELS.get((country or '', int(rank/2)))\
or ADMIN_LABELS.get(('', int(rank/2)))\
or 'Administrative'
elif category[1] == 'postal_code':
label = 'postcode'
elif rank < 26:
label = category[1] if category[1] != 'yes' else category[0]
elif rank < 28:
label = 'road'
elif category[0] == 'place'\
and category[1] in ('house_number', 'house_name', 'country_code'):
label = category[1]
else:
label = category[0]
return label.lower().replace(' ', '_')
def bbox_from_result(result: Union[ReverseResult, SearchResult]) -> Bbox:
""" Compute a bounding box for the result. For ways and relations
a given boundingbox is used. For all other object, a box is computed
around the centroid according to dimensions derived from the
search rank.
"""
if (result.osm_object and result.osm_object[0] == 'N') or result.bbox is None:
extent = NODE_EXTENT.get(result.category, 0.00005)
return Bbox.from_point(result.centroid, extent)
return result.bbox
# pylint: disable=line-too-long
OSM_ATTRIBUTION = 'Data © OpenStreetMap contributors, ODbL 1.0. http://osm.org/copyright'
OSM_TYPE_NAME = {
'N': 'node',
'W': 'way',
'R': 'relation'
}
ADMIN_LABELS = {
('', 1): 'Continent',
('', 2): 'Country',
('', 3): 'Region',
('', 4): 'State',
('', 5): 'State District',
('', 6): 'County',
('', 7): 'Municipality',
('', 8): 'City',
('', 9): 'City District',
('', 10): 'Suburb',
('', 11): 'Neighbourhood',
('', 12): 'City Block',
('no', 3): 'State',
('no', 4): 'County',
('se', 3): 'State',
('se', 4): 'County'
}
ICONS = {
('boundary', 'administrative'): 'poi_boundary_administrative',
('place', 'city'): 'poi_place_city',
('place', 'town'): 'poi_place_town',
('place', 'village'): 'poi_place_village',
('place', 'hamlet'): 'poi_place_village',
('place', 'suburb'): 'poi_place_village',
('place', 'locality'): 'poi_place_village',
('place', 'airport'): 'transport_airport2',
('aeroway', 'aerodrome'): 'transport_airport2',
('railway', 'station'): 'transport_train_station2',
('amenity', 'place_of_worship'): 'place_of_worship_unknown3',
('amenity', 'pub'): 'food_pub',
('amenity', 'bar'): 'food_bar',
('amenity', 'university'): 'education_university',
('tourism', 'museum'): 'tourist_museum',
('amenity', 'arts_centre'): 'tourist_art_gallery2',
('tourism', 'zoo'): 'tourist_zoo',
('tourism', 'theme_park'): 'poi_point_of_interest',
('tourism', 'attraction'): 'poi_point_of_interest',
('leisure', 'golf_course'): 'sport_golf',
('historic', 'castle'): 'tourist_castle',
('amenity', 'hospital'): 'health_hospital',
('amenity', 'school'): 'education_school',
('amenity', 'theatre'): 'tourist_theatre',
('amenity', 'library'): 'amenity_library',
('amenity', 'fire_station'): 'amenity_firestation3',
('amenity', 'police'): 'amenity_police2',
('amenity', 'bank'): 'money_bank2',
('amenity', 'post_office'): 'amenity_post_office',
('tourism', 'hotel'): 'accommodation_hotel2',
('amenity', 'cinema'): 'tourist_cinema',
('tourism', 'artwork'): 'tourist_art_gallery2',
('historic', 'archaeological_site'): 'tourist_archaeological2',
('amenity', 'doctors'): 'health_doctors',
('leisure', 'sports_centre'): 'sport_leisure_centre',
('leisure', 'swimming_pool'): 'sport_swimming_outdoor',
('shop', 'supermarket'): 'shopping_supermarket',
('shop', 'convenience'): 'shopping_convenience',
('amenity', 'restaurant'): 'food_restaurant',
('amenity', 'fast_food'): 'food_fastfood',
('amenity', 'cafe'): 'food_cafe',
('tourism', 'guest_house'): 'accommodation_bed_and_breakfast',
('amenity', 'pharmacy'): 'health_pharmacy_dispensing',
('amenity', 'fuel'): 'transport_fuel',
('natural', 'peak'): 'poi_peak',
('natural', 'wood'): 'landuse_coniferous_and_deciduous',
('shop', 'bicycle'): 'shopping_bicycle',
('shop', 'clothes'): 'shopping_clothes',
('shop', 'hairdresser'): 'shopping_hairdresser',
('shop', 'doityourself'): 'shopping_diy',
('shop', 'estate_agent'): 'shopping_estateagent2',
('shop', 'car'): 'shopping_car',
('shop', 'garden_centre'): 'shopping_garden_centre',
('shop', 'car_repair'): 'shopping_car_repair',
('shop', 'bakery'): 'shopping_bakery',
('shop', 'butcher'): 'shopping_butcher',
('shop', 'apparel'): 'shopping_clothes',
('shop', 'laundry'): 'shopping_laundrette',
('shop', 'beverages'): 'shopping_alcohol',
('shop', 'alcohol'): 'shopping_alcohol',
('shop', 'optician'): 'health_opticians',
('shop', 'chemist'): 'health_pharmacy',
('shop', 'gallery'): 'tourist_art_gallery2',
('shop', 'jewelry'): 'shopping_jewelry',
('tourism', 'information'): 'amenity_information',
('historic', 'ruins'): 'tourist_ruin',
('amenity', 'college'): 'education_school',
('historic', 'monument'): 'tourist_monument',
('historic', 'memorial'): 'tourist_monument',
('historic', 'mine'): 'poi_mine',
('tourism', 'caravan_site'): 'accommodation_caravan_park',
('amenity', 'bus_station'): 'transport_bus_station',
('amenity', 'atm'): 'money_atm2',
('tourism', 'viewpoint'): 'tourist_view_point',
('tourism', 'guesthouse'): 'accommodation_bed_and_breakfast',
('railway', 'tram'): 'transport_tram_stop',
('amenity', 'courthouse'): 'amenity_court',
('amenity', 'recycling'): 'amenity_recycling',
('amenity', 'dentist'): 'health_dentist',
('natural', 'beach'): 'tourist_beach',
('railway', 'tram_stop'): 'transport_tram_stop',
('amenity', 'prison'): 'amenity_prison',
('highway', 'bus_stop'): 'transport_bus_stop2'
}
NODE_EXTENT = {
('place', 'continent'): 25,
('place', 'country'): 7,
('place', 'state'): 2.6,
('place', 'province'): 2.6,
('place', 'region'): 1.0,
('place', 'county'): 0.7,
('place', 'city'): 0.16,
('place', 'municipality'): 0.16,
('place', 'island'): 0.32,
('place', 'postcode'): 0.16,
('place', 'town'): 0.04,
('place', 'village'): 0.02,
('place', 'hamlet'): 0.02,
('place', 'district'): 0.02,
('place', 'borough'): 0.02,
('place', 'suburb'): 0.02,
('place', 'locality'): 0.01,
('place', 'neighbourhood'): 0.01,
('place', 'quarter'): 0.01,
('place', 'city_block'): 0.01,
('landuse', 'farm'): 0.01,
('place', 'farm'): 0.01,
('place', 'airport'): 0.015,
('aeroway', 'aerodrome'): 0.015,
('railway', 'station'): 0.005
}

View File

@@ -0,0 +1,259 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Output formatters for API version v1.
"""
from typing import List, Dict, Mapping, Any
import collections
import datetime as dt
from nominatim_core.utils.json_writer import JsonWriter
from ..status import StatusResult
from ..results import DetailedResult, ReverseResults, SearchResults, \
AddressLines, AddressLine
from ..localization import Locales
from ..result_formatting import FormatDispatcher
from .classtypes import ICONS
from . import format_json, format_xml
class RawDataList(List[Dict[str, Any]]):
""" Data type for formatting raw data lists 'as is' in json.
"""
dispatch = FormatDispatcher()
@dispatch.format_func(StatusResult, 'text')
def _format_status_text(result: StatusResult, _: Mapping[str, Any]) -> str:
if result.status:
return f"ERROR: {result.message}"
return 'OK'
@dispatch.format_func(StatusResult, 'json')
def _format_status_json(result: StatusResult, _: Mapping[str, Any]) -> str:
out = JsonWriter()
out.start_object()\
.keyval('status', result.status)\
.keyval('message', result.message)\
.keyval_not_none('data_updated', result.data_updated,
lambda v: v.isoformat())\
.keyval('software_version', str(result.software_version))\
.keyval_not_none('database_version', result.database_version, str)\
.end_object()
return out()
def _add_address_row(writer: JsonWriter, row: AddressLine,
locales: Locales) -> None:
writer.start_object()\
.keyval('localname', locales.display_name(row.names))\
.keyval_not_none('place_id', row.place_id)
if row.osm_object is not None:
writer.keyval('osm_id', row.osm_object[1])\
.keyval('osm_type', row.osm_object[0])
if row.extratags:
writer.keyval_not_none('place_type', row.extratags.get('place_type'))
writer.keyval('class', row.category[0])\
.keyval('type', row.category[1])\
.keyval_not_none('admin_level', row.admin_level)\
.keyval('rank_address', row.rank_address)\
.keyval('distance', row.distance)\
.keyval('isaddress', row.isaddress)\
.end_object()
def _add_address_rows(writer: JsonWriter, section: str, rows: AddressLines,
locales: Locales) -> None:
writer.key(section).start_array()
for row in rows:
_add_address_row(writer, row, locales)
writer.next()
writer.end_array().next()
def _add_parent_rows_grouped(writer: JsonWriter, rows: AddressLines,
locales: Locales) -> None:
# group by category type
data = collections.defaultdict(list)
for row in rows:
sub = JsonWriter()
_add_address_row(sub, row, locales)
data[row.category[1]].append(sub())
writer.key('hierarchy').start_object()
for group, grouped in data.items():
writer.key(group).start_array()
grouped.sort() # sorts alphabetically by local name
for line in grouped:
writer.raw(line).next()
writer.end_array().next()
writer.end_object().next()
@dispatch.format_func(DetailedResult, 'json')
def _format_details_json(result: DetailedResult, options: Mapping[str, Any]) -> str:
locales = options.get('locales', Locales())
geom = result.geometry.get('geojson')
centroid = result.centroid.to_geojson()
out = JsonWriter()
out.start_object()\
.keyval_not_none('place_id', result.place_id)\
.keyval_not_none('parent_place_id', result.parent_place_id)
if result.osm_object is not None:
out.keyval('osm_type', result.osm_object[0])\
.keyval('osm_id', result.osm_object[1])
out.keyval('category', result.category[0])\
.keyval('type', result.category[1])\
.keyval('admin_level', result.admin_level)\
.keyval('localname', result.locale_name or '')\
.keyval('names', result.names or {})\
.keyval('addresstags', result.address or {})\
.keyval_not_none('housenumber', result.housenumber)\
.keyval_not_none('calculated_postcode', result.postcode)\
.keyval_not_none('country_code', result.country_code)\
.keyval_not_none('indexed_date', result.indexed_date, lambda v: v.isoformat())\
.keyval_not_none('importance', result.importance)\
.keyval('calculated_importance', result.calculated_importance())\
.keyval('extratags', result.extratags or {})\
.keyval_not_none('calculated_wikipedia', result.wikipedia)\
.keyval('rank_address', result.rank_address)\
.keyval('rank_search', result.rank_search)\
.keyval('isarea', 'Polygon' in (geom or result.geometry.get('type') or ''))\
.key('centroid').raw(centroid).next()\
.key('geometry').raw(geom or centroid).next()
if options.get('icon_base_url', None):
icon = ICONS.get(result.category)
if icon:
out.keyval('icon', f"{options['icon_base_url']}/{icon}.p.20.png")
if result.address_rows is not None:
_add_address_rows(out, 'address', result.address_rows, locales)
if result.linked_rows:
_add_address_rows(out, 'linked_places', result.linked_rows, locales)
if result.name_keywords is not None or result.address_keywords is not None:
out.key('keywords').start_object()
for sec, klist in (('name', result.name_keywords), ('address', result.address_keywords)):
out.key(sec).start_array()
for word in (klist or []):
out.start_object()\
.keyval('id', word.word_id)\
.keyval('token', word.word_token)\
.end_object().next()
out.end_array().next()
out.end_object().next()
if result.parented_rows is not None:
if options.get('group_hierarchy', False):
_add_parent_rows_grouped(out, result.parented_rows, locales)
else:
_add_address_rows(out, 'hierarchy', result.parented_rows, locales)
out.end_object()
return out()
@dispatch.format_func(ReverseResults, 'xml')
def _format_reverse_xml(results: ReverseResults, options: Mapping[str, Any]) -> str:
return format_xml.format_base_xml(results,
options, True, 'reversegeocode',
{'querystring': options.get('query', '')})
@dispatch.format_func(ReverseResults, 'geojson')
def _format_reverse_geojson(results: ReverseResults,
options: Mapping[str, Any]) -> str:
return format_json.format_base_geojson(results, options, True)
@dispatch.format_func(ReverseResults, 'geocodejson')
def _format_reverse_geocodejson(results: ReverseResults,
options: Mapping[str, Any]) -> str:
return format_json.format_base_geocodejson(results, options, True)
@dispatch.format_func(ReverseResults, 'json')
def _format_reverse_json(results: ReverseResults,
options: Mapping[str, Any]) -> str:
return format_json.format_base_json(results, options, True,
class_label='class')
@dispatch.format_func(ReverseResults, 'jsonv2')
def _format_reverse_jsonv2(results: ReverseResults,
options: Mapping[str, Any]) -> str:
return format_json.format_base_json(results, options, True,
class_label='category')
@dispatch.format_func(SearchResults, 'xml')
def _format_search_xml(results: SearchResults, options: Mapping[str, Any]) -> str:
extra = {'querystring': options.get('query', '')}
for attr in ('more_url', 'exclude_place_ids', 'viewbox'):
if options.get(attr):
extra[attr] = options[attr]
return format_xml.format_base_xml(results, options, False, 'searchresults',
extra)
@dispatch.format_func(SearchResults, 'geojson')
def _format_search_geojson(results: SearchResults,
options: Mapping[str, Any]) -> str:
return format_json.format_base_geojson(results, options, False)
@dispatch.format_func(SearchResults, 'geocodejson')
def _format_search_geocodejson(results: SearchResults,
options: Mapping[str, Any]) -> str:
return format_json.format_base_geocodejson(results, options, False)
@dispatch.format_func(SearchResults, 'json')
def _format_search_json(results: SearchResults,
options: Mapping[str, Any]) -> str:
return format_json.format_base_json(results, options, False,
class_label='class')
@dispatch.format_func(SearchResults, 'jsonv2')
def _format_search_jsonv2(results: SearchResults,
options: Mapping[str, Any]) -> str:
return format_json.format_base_json(results, options, False,
class_label='category')
@dispatch.format_func(RawDataList, 'json')
def _format_raw_data_json(results: RawDataList, _: Mapping[str, Any]) -> str:
out = JsonWriter()
out.start_array()
for res in results:
out.start_object()
for k, v in res.items():
if isinstance(v, dt.datetime):
out.keyval(k, v.isoformat(sep= ' ', timespec='seconds'))
else:
out.keyval(k, v)
out.end_object().next()
out.end_array()
return out()

View File

@@ -0,0 +1,275 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Helper functions for output of results in json formats.
"""
from typing import Mapping, Any, Optional, Tuple, Union
from nominatim_core.utils.json_writer import JsonWriter
from ..results import AddressLines, ReverseResults, SearchResults
from . import classtypes as cl
#pylint: disable=too-many-branches
def _write_osm_id(out: JsonWriter, osm_object: Optional[Tuple[str, int]]) -> None:
if osm_object is not None:
out.keyval_not_none('osm_type', cl.OSM_TYPE_NAME.get(osm_object[0], None))\
.keyval('osm_id', osm_object[1])
def _write_typed_address(out: JsonWriter, address: Optional[AddressLines],
country_code: Optional[str]) -> None:
parts = {}
for line in (address or []):
if line.isaddress:
if line.local_name:
label = cl.get_label_tag(line.category, line.extratags,
line.rank_address, country_code)
if label not in parts:
parts[label] = line.local_name
if line.names and 'ISO3166-2' in line.names and line.admin_level:
parts[f"ISO3166-2-lvl{line.admin_level}"] = line.names['ISO3166-2']
for k, v in parts.items():
out.keyval(k, v)
if country_code:
out.keyval('country_code', country_code)
def _write_geocodejson_address(out: JsonWriter,
address: Optional[AddressLines],
obj_place_id: Optional[int],
country_code: Optional[str]) -> None:
extra = {}
for line in (address or []):
if line.isaddress and line.local_name:
if line.category[1] in ('postcode', 'postal_code'):
out.keyval('postcode', line.local_name)
elif line.category[1] == 'house_number':
out.keyval('housenumber', line.local_name)
elif (obj_place_id is None or obj_place_id != line.place_id) \
and line.rank_address >= 4 and line.rank_address < 28:
rank_name = GEOCODEJSON_RANKS[line.rank_address]
if rank_name not in extra:
extra[rank_name] = line.local_name
for k, v in extra.items():
out.keyval(k, v)
if country_code:
out.keyval('country_code', country_code)
def format_base_json(results: Union[ReverseResults, SearchResults],
options: Mapping[str, Any], simple: bool,
class_label: str) -> str:
""" Return the result list as a simple json string in custom Nominatim format.
"""
out = JsonWriter()
if simple:
if not results:
return '{"error":"Unable to geocode"}'
else:
out.start_array()
for result in results:
out.start_object()\
.keyval_not_none('place_id', result.place_id)\
.keyval('licence', cl.OSM_ATTRIBUTION)\
_write_osm_id(out, result.osm_object)
out.keyval('lat', f"{result.centroid.lat}")\
.keyval('lon', f"{result.centroid.lon}")\
.keyval(class_label, result.category[0])\
.keyval('type', result.category[1])\
.keyval('place_rank', result.rank_search)\
.keyval('importance', result.calculated_importance())\
.keyval('addresstype', cl.get_label_tag(result.category, result.extratags,
result.rank_address,
result.country_code))\
.keyval('name', result.locale_name or '')\
.keyval('display_name', result.display_name or '')
if options.get('icon_base_url', None):
icon = cl.ICONS.get(result.category)
if icon:
out.keyval('icon', f"{options['icon_base_url']}/{icon}.p.20.png")
if options.get('addressdetails', False):
out.key('address').start_object()
_write_typed_address(out, result.address_rows, result.country_code)
out.end_object().next()
if options.get('extratags', False):
out.keyval('extratags', result.extratags)
if options.get('namedetails', False):
out.keyval('namedetails', result.names)
bbox = cl.bbox_from_result(result)
out.key('boundingbox').start_array()\
.value(f"{bbox.minlat:0.7f}").next()\
.value(f"{bbox.maxlat:0.7f}").next()\
.value(f"{bbox.minlon:0.7f}").next()\
.value(f"{bbox.maxlon:0.7f}").next()\
.end_array().next()
if result.geometry:
for key in ('text', 'kml'):
out.keyval_not_none('geo' + key, result.geometry.get(key))
if 'geojson' in result.geometry:
out.key('geojson').raw(result.geometry['geojson']).next()
out.keyval_not_none('svg', result.geometry.get('svg'))
out.end_object()
if simple:
return out()
out.next()
out.end_array()
return out()
def format_base_geojson(results: Union[ReverseResults, SearchResults],
options: Mapping[str, Any],
simple: bool) -> str:
""" Return the result list as a geojson string.
"""
if not results and simple:
return '{"error":"Unable to geocode"}'
out = JsonWriter()
out.start_object()\
.keyval('type', 'FeatureCollection')\
.keyval('licence', cl.OSM_ATTRIBUTION)\
.key('features').start_array()
for result in results:
out.start_object()\
.keyval('type', 'Feature')\
.key('properties').start_object()
out.keyval_not_none('place_id', result.place_id)
_write_osm_id(out, result.osm_object)
out.keyval('place_rank', result.rank_search)\
.keyval('category', result.category[0])\
.keyval('type', result.category[1])\
.keyval('importance', result.calculated_importance())\
.keyval('addresstype', cl.get_label_tag(result.category, result.extratags,
result.rank_address,
result.country_code))\
.keyval('name', result.locale_name or '')\
.keyval('display_name', result.display_name or '')
if options.get('addressdetails', False):
out.key('address').start_object()
_write_typed_address(out, result.address_rows, result.country_code)
out.end_object().next()
if options.get('extratags', False):
out.keyval('extratags', result.extratags)
if options.get('namedetails', False):
out.keyval('namedetails', result.names)
out.end_object().next() # properties
out.key('bbox').start_array()
for coord in cl.bbox_from_result(result).coords:
out.float(coord, 7).next()
out.end_array().next()
out.key('geometry').raw(result.geometry.get('geojson')
or result.centroid.to_geojson()).next()
out.end_object().next()
out.end_array().next().end_object()
return out()
def format_base_geocodejson(results: Union[ReverseResults, SearchResults],
options: Mapping[str, Any], simple: bool) -> str:
""" Return the result list as a geocodejson string.
"""
if not results and simple:
return '{"error":"Unable to geocode"}'
out = JsonWriter()
out.start_object()\
.keyval('type', 'FeatureCollection')\
.key('geocoding').start_object()\
.keyval('version', '0.1.0')\
.keyval('attribution', cl.OSM_ATTRIBUTION)\
.keyval('licence', 'ODbL')\
.keyval_not_none('query', options.get('query'))\
.end_object().next()\
.key('features').start_array()
for result in results:
out.start_object()\
.keyval('type', 'Feature')\
.key('properties').start_object()\
.key('geocoding').start_object()
out.keyval_not_none('place_id', result.place_id)
_write_osm_id(out, result.osm_object)
out.keyval('osm_key', result.category[0])\
.keyval('osm_value', result.category[1])\
.keyval('type', GEOCODEJSON_RANKS[max(3, min(28, result.rank_address))])\
.keyval_not_none('accuracy', getattr(result, 'distance', None), transform=int)\
.keyval('label', result.display_name or '')\
.keyval_not_none('name', result.locale_name or None)\
if options.get('addressdetails', False):
_write_geocodejson_address(out, result.address_rows, result.place_id,
result.country_code)
out.key('admin').start_object()
if result.address_rows:
for line in result.address_rows:
if line.isaddress and (line.admin_level or 15) < 15 and line.local_name \
and line.category[0] == 'boundary' and line.category[1] == 'administrative':
out.keyval(f"level{line.admin_level}", line.local_name)
out.end_object().next()
out.end_object().next().end_object().next()
out.key('geometry').raw(result.geometry.get('geojson')
or result.centroid.to_geojson()).next()
out.end_object().next()
out.end_array().next().end_object()
return out()
GEOCODEJSON_RANKS = {
3: 'locality',
4: 'country',
5: 'state', 6: 'state', 7: 'state', 8: 'state', 9: 'state',
10: 'county', 11: 'county', 12: 'county',
13: 'city', 14: 'city', 15: 'city', 16: 'city',
17: 'district', 18: 'district', 19: 'district', 20: 'district', 21: 'district',
22: 'locality', 23: 'locality', 24: 'locality',
25: 'street', 26: 'street', 27: 'street', 28: 'house'}

View File

@@ -0,0 +1,126 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Helper functions for output of results in XML format.
"""
from typing import Mapping, Any, Optional, Union
import datetime as dt
import xml.etree.ElementTree as ET
from ..results import AddressLines, ReverseResult, ReverseResults, \
SearchResult, SearchResults
from . import classtypes as cl
#pylint: disable=too-many-branches
def _write_xml_address(root: ET.Element, address: AddressLines,
country_code: Optional[str]) -> None:
parts = {}
for line in address:
if line.isaddress:
if line.local_name:
label = cl.get_label_tag(line.category, line.extratags,
line.rank_address, country_code)
if label not in parts:
parts[label] = line.local_name
if line.names and 'ISO3166-2' in line.names and line.admin_level:
parts[f"ISO3166-2-lvl{line.admin_level}"] = line.names['ISO3166-2']
for k,v in parts.items():
ET.SubElement(root, k).text = v
if country_code:
ET.SubElement(root, 'country_code').text = country_code
def _create_base_entry(result: Union[ReverseResult, SearchResult],
root: ET.Element, simple: bool) -> ET.Element:
place = ET.SubElement(root, 'result' if simple else 'place')
if result.place_id is not None:
place.set('place_id', str(result.place_id))
if result.osm_object:
osm_type = cl.OSM_TYPE_NAME.get(result.osm_object[0], None)
if osm_type is not None:
place.set('osm_type', osm_type)
place.set('osm_id', str(result.osm_object[1]))
if result.names and 'ref' in result.names:
place.set('ref', result.names['ref'])
elif result.locale_name:
# bug reproduced from PHP
place.set('ref', result.locale_name)
place.set('lat', f"{result.centroid.lat:.7f}")
place.set('lon', f"{result.centroid.lon:.7f}")
bbox = cl.bbox_from_result(result)
place.set('boundingbox',
f"{bbox.minlat:.7f},{bbox.maxlat:.7f},{bbox.minlon:.7f},{bbox.maxlon:.7f}")
place.set('place_rank', str(result.rank_search))
place.set('address_rank', str(result.rank_address))
if result.geometry:
for key in ('text', 'svg'):
if key in result.geometry:
place.set('geo' + key, result.geometry[key])
if 'kml' in result.geometry:
ET.SubElement(root if simple else place, 'geokml')\
.append(ET.fromstring(result.geometry['kml']))
if 'geojson' in result.geometry:
place.set('geojson', result.geometry['geojson'])
if simple:
place.text = result.display_name or ''
else:
place.set('display_name', result.display_name or '')
place.set('class', result.category[0])
place.set('type', result.category[1])
place.set('importance', str(result.calculated_importance()))
return place
def format_base_xml(results: Union[ReverseResults, SearchResults],
options: Mapping[str, Any],
simple: bool, xml_root_tag: str,
xml_extra_info: Mapping[str, str]) -> str:
""" Format the result into an XML response. With 'simple' exactly one
result will be output, otherwise a list.
"""
root = ET.Element(xml_root_tag)
root.set('timestamp', dt.datetime.utcnow().strftime('%a, %d %b %Y %H:%M:%S +00:00'))
root.set('attribution', cl.OSM_ATTRIBUTION)
for k, v in xml_extra_info.items():
root.set(k, v)
if simple and not results:
ET.SubElement(root, 'error').text = 'Unable to geocode'
for result in results:
place = _create_base_entry(result, root, simple)
if not simple and options.get('icon_base_url', None):
icon = cl.ICONS.get(result.category)
if icon:
place.set('icon', icon)
if options.get('addressdetails', False) and result.address_rows:
_write_xml_address(ET.SubElement(root, 'addressparts') if simple else place,
result.address_rows, result.country_code)
if options.get('extratags', False):
eroot = ET.SubElement(root if simple else place, 'extratags')
if result.extratags:
for k, v in result.extratags.items():
ET.SubElement(eroot, 'tag', attrib={'key': k, 'value': v})
if options.get('namedetails', False):
eroot = ET.SubElement(root if simple else place, 'namedetails')
if result.names:
for k,v in result.names.items():
ET.SubElement(eroot, 'name', attrib={'desc': k}).text = v
return '<?xml version="1.0" encoding="UTF-8" ?>\n' + ET.tostring(root, encoding='unicode')

View File

@@ -0,0 +1,201 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Helper function for parsing parameters and and outputting data
specifically for the v1 version of the API.
"""
from typing import Tuple, Optional, Any, Dict, Iterable
from itertools import chain
import re
from ..results import SearchResult, SearchResults, SourceTable
from ..types import SearchDetails, GeometryFormat
REVERSE_MAX_RANKS = [2, 2, 2, # 0-2 Continent/Sea
4, 4, # 3-4 Country
8, # 5 State
10, 10, # 6-7 Region
12, 12, # 8-9 County
16, 17, # 10-11 City
18, # 12 Town
19, # 13 Village/Suburb
22, # 14 Hamlet/Neighbourhood
25, # 15 Localities
26, # 16 Major Streets
27, # 17 Minor Streets
30 # 18 Building
]
def zoom_to_rank(zoom: int) -> int:
""" Convert a zoom parameter into a rank according to the v1 API spec.
"""
return REVERSE_MAX_RANKS[max(0, min(18, zoom))]
FEATURE_TYPE_TO_RANK: Dict[Optional[str], Tuple[int, int]] = {
'country': (4, 4),
'state': (8, 8),
'city': (14, 16),
'settlement': (8, 20)
}
def feature_type_to_rank(feature_type: Optional[str]) -> Tuple[int, int]:
""" Convert a feature type parameter to a tuple of
feature type name, minimum rank and maximum rank.
"""
return FEATURE_TYPE_TO_RANK.get(feature_type, (0, 30))
#pylint: disable=too-many-arguments,too-many-branches
def extend_query_parts(queryparts: Dict[str, Any], details: Dict[str, Any],
feature_type: Optional[str],
namedetails: bool, extratags: bool,
excluded: Iterable[str]) -> None:
""" Add parameters from details dictionary to the query parts
dictionary which is suitable as URL parameter dictionary.
"""
parsed = SearchDetails.from_kwargs(details)
if parsed.geometry_output != GeometryFormat.NONE:
if GeometryFormat.GEOJSON in parsed.geometry_output:
queryparts['polygon_geojson'] = '1'
if GeometryFormat.KML in parsed.geometry_output:
queryparts['polygon_kml'] = '1'
if GeometryFormat.SVG in parsed.geometry_output:
queryparts['polygon_svg'] = '1'
if GeometryFormat.TEXT in parsed.geometry_output:
queryparts['polygon_text'] = '1'
if parsed.address_details:
queryparts['addressdetails'] = '1'
if namedetails:
queryparts['namedetails'] = '1'
if extratags:
queryparts['extratags'] = '1'
if parsed.geometry_simplification > 0.0:
queryparts['polygon_threshold'] = f"{parsed.geometry_simplification:.6g}"
if parsed.max_results != 10:
queryparts['limit'] = str(parsed.max_results)
if parsed.countries:
queryparts['countrycodes'] = ','.join(parsed.countries)
queryparts['exclude_place_ids'] = \
','.join(chain(excluded, map(str, (e for e in parsed.excluded if e > 0))))
if parsed.viewbox:
queryparts['viewbox'] = ','.join(f"{c:.7g}" for c in parsed.viewbox.coords)
if parsed.bounded_viewbox:
queryparts['bounded'] = '1'
if not details['dedupe']:
queryparts['dedupe'] = '0'
if feature_type in FEATURE_TYPE_TO_RANK:
queryparts['featureType'] = feature_type
def deduplicate_results(results: SearchResults, max_results: int) -> SearchResults:
""" Remove results that look like duplicates.
Two results are considered the same if they have the same OSM ID
or if they have the same category, display name and rank.
"""
osm_ids_done = set()
classification_done = set()
deduped = SearchResults()
for result in results:
if result.source_table == SourceTable.POSTCODE:
assert result.names and 'ref' in result.names
if any(_is_postcode_relation_for(r, result.names['ref']) for r in results):
continue
if result.source_table == SourceTable.PLACEX:
classification = (result.osm_object[0] if result.osm_object else None,
result.category,
result.display_name,
result.rank_address)
if result.osm_object not in osm_ids_done \
and classification not in classification_done:
deduped.append(result)
osm_ids_done.add(result.osm_object)
classification_done.add(classification)
else:
deduped.append(result)
if len(deduped) >= max_results:
break
return deduped
def _is_postcode_relation_for(result: SearchResult, postcode: str) -> bool:
return result.source_table == SourceTable.PLACEX \
and result.osm_object is not None \
and result.osm_object[0] == 'R' \
and result.category == ('boundary', 'postal_code') \
and result.names is not None \
and result.names.get('ref') == postcode
def _deg(axis:str) -> str:
return f"(?P<{axis}_deg>\\d+\\.\\d+)°?"
def _deg_min(axis: str) -> str:
return f"(?P<{axis}_deg>\\d+)[°\\s]+(?P<{axis}_min>[\\d.]+)[']*"
def _deg_min_sec(axis: str) -> str:
return f"(?P<{axis}_deg>\\d+)[°\\s]+(?P<{axis}_min>\\d+)['\\s]+(?P<{axis}_sec>[\\d.]+)[\"″]*"
COORD_REGEX = [re.compile(r'(?:(?P<pre>.*?)\s+)??' + r + r'(?:\s+(?P<post>.*))?') for r in (
r"(?P<ns>[NS])\s*" + _deg('lat') + r"[\s,]+" + r"(?P<ew>[EW])\s*" + _deg('lon'),
_deg('lat') + r"\s*(?P<ns>[NS])[\s,]+" + _deg('lon') + r"\s*(?P<ew>[EW])",
r"(?P<ns>[NS])\s*" + _deg_min('lat') + r"[\s,]+" + r"(?P<ew>[EW])\s*" + _deg_min('lon'),
_deg_min('lat') + r"\s*(?P<ns>[NS])[\s,]+" + _deg_min('lon') + r"\s*(?P<ew>[EW])",
r"(?P<ns>[NS])\s*" + _deg_min_sec('lat') + r"[\s,]+" + r"(?P<ew>[EW])\s*" + _deg_min_sec('lon'),
_deg_min_sec('lat') + r"\s*(?P<ns>[NS])[\s,]+" + _deg_min_sec('lon') + r"\s*(?P<ew>[EW])",
r"\[?(?P<lat_deg>[+-]?\d+\.\d+)[\s,]+(?P<lon_deg>[+-]?\d+\.\d+)\]?"
)]
def extract_coords_from_query(query: str) -> Tuple[str, Optional[float], Optional[float]]:
""" Look for something that is formatted like a coordinate at the
beginning or end of the query. If found, extract the coordinate and
return the remaining query (or the empty string if the query
consisted of nothing but a coordinate).
Only the first match will be returned.
"""
for regex in COORD_REGEX:
match = regex.fullmatch(query)
if match is None:
continue
groups = match.groupdict()
if not groups['pre'] or not groups['post']:
x = float(groups['lon_deg']) \
+ float(groups.get('lon_min', 0.0)) / 60.0 \
+ float(groups.get('lon_sec', 0.0)) / 3600.0
if groups.get('ew') == 'W':
x = -x
y = float(groups['lat_deg']) \
+ float(groups.get('lat_min', 0.0)) / 60.0 \
+ float(groups.get('lat_sec', 0.0)) / 3600.0
if groups.get('ns') == 'S':
y = -y
return groups['pre'] or groups['post'] or '', x, y
return query, None, None
CATEGORY_REGEX = re.compile(r'(?P<pre>.*?)\[(?P<cls>[a-zA-Z_]+)=(?P<typ>[a-zA-Z_]+)\](?P<post>.*)')
def extract_category_from_query(query: str) -> Tuple[str, Optional[str], Optional[str]]:
""" Extract a hidden category specification of the form '[key=value]' from
the query. If found, extract key and value and
return the remaining query (or the empty string if the query
consisted of nothing but a category).
Only the first match will be returned.
"""
match = CATEGORY_REGEX.search(query)
if match is not None:
return (match.group('pre').strip() + ' ' + match.group('post').strip()).strip(), \
match.group('cls'), match.group('typ')
return query, None, None

View File

@@ -0,0 +1,577 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Generic part of the server implementation of the v1 API.
Combine with the scaffolding provided for the various Python ASGI frameworks.
"""
from typing import Optional, Any, Type, Callable, NoReturn, Dict, cast
from functools import reduce
import abc
import dataclasses
import math
from urllib.parse import urlencode
import sqlalchemy as sa
from nominatim_core.errors import UsageError
from nominatim_core.config import Configuration
from .. import logging as loglib
from ..core import NominatimAPIAsync
from .format import dispatch as formatting
from .format import RawDataList
from ..types import DataLayer, GeometryFormat, PlaceRef, PlaceID, OsmID, Point
from ..status import StatusResult
from ..results import DetailedResult, ReverseResults, SearchResult, SearchResults
from ..localization import Locales
from . import helpers
CONTENT_TEXT = 'text/plain; charset=utf-8'
CONTENT_XML = 'text/xml; charset=utf-8'
CONTENT_HTML = 'text/html; charset=utf-8'
CONTENT_JSON = 'application/json; charset=utf-8'
CONTENT_TYPE = {'text': CONTENT_TEXT, 'xml': CONTENT_XML, 'debug': CONTENT_HTML}
class ASGIAdaptor(abc.ABC):
""" Adapter class for the different ASGI frameworks.
Wraps functionality over concrete requests and responses.
"""
content_type: str = CONTENT_TEXT
@abc.abstractmethod
def get(self, name: str, default: Optional[str] = None) -> Optional[str]:
""" Return an input parameter as a string. If the parameter was
not provided, return the 'default' value.
"""
@abc.abstractmethod
def get_header(self, name: str, default: Optional[str] = None) -> Optional[str]:
""" Return a HTTP header parameter as a string. If the parameter was
not provided, return the 'default' value.
"""
@abc.abstractmethod
def error(self, msg: str, status: int = 400) -> Exception:
""" Construct an appropriate exception from the given error message.
The exception must result in a HTTP error with the given status.
"""
@abc.abstractmethod
def create_response(self, status: int, output: str, num_results: int) -> Any:
""" Create a response from the given parameters. The result will
be returned by the endpoint functions. The adaptor may also
return None when the response is created internally with some
different means.
The response must return the HTTP given status code 'status', set
the HTTP content-type headers to the string provided and the
body of the response to 'output'.
"""
@abc.abstractmethod
def base_uri(self) -> str:
""" Return the URI of the original request.
"""
@abc.abstractmethod
def config(self) -> Configuration:
""" Return the current configuration object.
"""
def build_response(self, output: str, status: int = 200, num_results: int = 0) -> Any:
""" Create a response from the given output. Wraps a JSONP function
around the response, if necessary.
"""
if self.content_type == CONTENT_JSON and status == 200:
jsonp = self.get('json_callback')
if jsonp is not None:
if any(not part.isidentifier() for part in jsonp.split('.')):
self.raise_error('Invalid json_callback value')
output = f"{jsonp}({output})"
self.content_type = 'application/javascript; charset=utf-8'
return self.create_response(status, output, num_results)
def raise_error(self, msg: str, status: int = 400) -> NoReturn:
""" Raise an exception resulting in the given HTTP status and
message. The message will be formatted according to the
output format chosen by the request.
"""
if self.content_type == CONTENT_XML:
msg = f"""<?xml version="1.0" encoding="UTF-8" ?>
<error>
<code>{status}</code>
<message>{msg}</message>
</error>
"""
elif self.content_type == CONTENT_JSON:
msg = f"""{{"error":{{"code":{status},"message":"{msg}"}}}}"""
elif self.content_type == CONTENT_HTML:
loglib.log().section('Execution error')
loglib.log().var_dump('Status', status)
loglib.log().var_dump('Message', msg)
msg = loglib.get_and_disable()
raise self.error(msg, status)
def get_int(self, name: str, default: Optional[int] = None) -> int:
""" Return an input parameter as an int. Raises an exception if
the parameter is given but not in an integer format.
If 'default' is given, then it will be returned when the parameter
is missing completely. When 'default' is None, an error will be
raised on a missing parameter.
"""
value = self.get(name)
if value is None:
if default is not None:
return default
self.raise_error(f"Parameter '{name}' missing.")
try:
intval = int(value)
except ValueError:
self.raise_error(f"Parameter '{name}' must be a number.")
return intval
def get_float(self, name: str, default: Optional[float] = None) -> float:
""" Return an input parameter as a flaoting-point number. Raises an
exception if the parameter is given but not in an float format.
If 'default' is given, then it will be returned when the parameter
is missing completely. When 'default' is None, an error will be
raised on a missing parameter.
"""
value = self.get(name)
if value is None:
if default is not None:
return default
self.raise_error(f"Parameter '{name}' missing.")
try:
fval = float(value)
except ValueError:
self.raise_error(f"Parameter '{name}' must be a number.")
if math.isnan(fval) or math.isinf(fval):
self.raise_error(f"Parameter '{name}' must be a number.")
return fval
def get_bool(self, name: str, default: Optional[bool] = None) -> bool:
""" Return an input parameter as bool. Only '0' is accepted as
an input for 'false' all other inputs will be interpreted as 'true'.
If 'default' is given, then it will be returned when the parameter
is missing completely. When 'default' is None, an error will be
raised on a missing parameter.
"""
value = self.get(name)
if value is None:
if default is not None:
return default
self.raise_error(f"Parameter '{name}' missing.")
return value != '0'
def get_accepted_languages(self) -> str:
""" Return the accepted languages.
"""
return self.get('accept-language')\
or self.get_header('accept-language')\
or self.config().DEFAULT_LANGUAGE
def setup_debugging(self) -> bool:
""" Set up collection of debug information if requested.
Return True when debugging was requested.
"""
if self.get_bool('debug', False):
loglib.set_log_output('html')
self.content_type = CONTENT_HTML
return True
return False
def get_layers(self) -> Optional[DataLayer]:
""" Return a parsed version of the layer parameter.
"""
param = self.get('layer', None)
if param is None:
return None
return cast(DataLayer,
reduce(DataLayer.__or__,
(getattr(DataLayer, s.upper()) for s in param.split(','))))
def parse_format(self, result_type: Type[Any], default: str) -> str:
""" Get and check the 'format' parameter and prepare the formatter.
`result_type` is the type of result to be returned by the function
and `default` the format value to assume when no parameter is present.
"""
fmt = self.get('format', default=default)
assert fmt is not None
if not formatting.supports_format(result_type, fmt):
self.raise_error("Parameter 'format' must be one of: " +
', '.join(formatting.list_formats(result_type)))
self.content_type = CONTENT_TYPE.get(fmt, CONTENT_JSON)
return fmt
def parse_geometry_details(self, fmt: str) -> Dict[str, Any]:
""" Create details structure from the supplied geometry parameters.
"""
numgeoms = 0
output = GeometryFormat.NONE
if self.get_bool('polygon_geojson', False):
output |= GeometryFormat.GEOJSON
numgeoms += 1
if fmt not in ('geojson', 'geocodejson'):
if self.get_bool('polygon_text', False):
output |= GeometryFormat.TEXT
numgeoms += 1
if self.get_bool('polygon_kml', False):
output |= GeometryFormat.KML
numgeoms += 1
if self.get_bool('polygon_svg', False):
output |= GeometryFormat.SVG
numgeoms += 1
if numgeoms > self.config().get_int('POLYGON_OUTPUT_MAX_TYPES'):
self.raise_error('Too many polygon output options selected.')
return {'address_details': True,
'geometry_simplification': self.get_float('polygon_threshold', 0.0),
'geometry_output': output
}
async def status_endpoint(api: NominatimAPIAsync, params: ASGIAdaptor) -> Any:
""" Server glue for /status endpoint. See API docs for details.
"""
result = await api.status()
fmt = params.parse_format(StatusResult, 'text')
if fmt == 'text' and result.status:
status_code = 500
else:
status_code = 200
return params.build_response(formatting.format_result(result, fmt, {}),
status=status_code)
async def details_endpoint(api: NominatimAPIAsync, params: ASGIAdaptor) -> Any:
""" Server glue for /details endpoint. See API docs for details.
"""
fmt = params.parse_format(DetailedResult, 'json')
place_id = params.get_int('place_id', 0)
place: PlaceRef
if place_id:
place = PlaceID(place_id)
else:
osmtype = params.get('osmtype')
if osmtype is None:
params.raise_error("Missing ID parameter 'place_id' or 'osmtype'.")
place = OsmID(osmtype, params.get_int('osmid'), params.get('class'))
debug = params.setup_debugging()
locales = Locales.from_accept_languages(params.get_accepted_languages())
result = await api.details(place,
address_details=params.get_bool('addressdetails', False),
linked_places=params.get_bool('linkedplaces', True),
parented_places=params.get_bool('hierarchy', False),
keywords=params.get_bool('keywords', False),
geometry_output = GeometryFormat.GEOJSON
if params.get_bool('polygon_geojson', False)
else GeometryFormat.NONE,
locales=locales
)
if debug:
return params.build_response(loglib.get_and_disable())
if result is None:
params.raise_error('No place with that OSM ID found.', status=404)
output = formatting.format_result(result, fmt,
{'locales': locales,
'group_hierarchy': params.get_bool('group_hierarchy', False),
'icon_base_url': params.config().MAPICON_URL})
return params.build_response(output, num_results=1)
async def reverse_endpoint(api: NominatimAPIAsync, params: ASGIAdaptor) -> Any:
""" Server glue for /reverse endpoint. See API docs for details.
"""
fmt = params.parse_format(ReverseResults, 'xml')
debug = params.setup_debugging()
coord = Point(params.get_float('lon'), params.get_float('lat'))
details = params.parse_geometry_details(fmt)
details['max_rank'] = helpers.zoom_to_rank(params.get_int('zoom', 18))
details['layers'] = params.get_layers()
details['locales'] = Locales.from_accept_languages(params.get_accepted_languages())
result = await api.reverse(coord, **details)
if debug:
return params.build_response(loglib.get_and_disable(), num_results=1 if result else 0)
if fmt == 'xml':
queryparts = {'lat': str(coord.lat), 'lon': str(coord.lon), 'format': 'xml'}
zoom = params.get('zoom', None)
if zoom:
queryparts['zoom'] = zoom
query = urlencode(queryparts)
else:
query = ''
fmt_options = {'query': query,
'extratags': params.get_bool('extratags', False),
'namedetails': params.get_bool('namedetails', False),
'addressdetails': params.get_bool('addressdetails', True)}
output = formatting.format_result(ReverseResults([result] if result else []),
fmt, fmt_options)
return params.build_response(output, num_results=1 if result else 0)
async def lookup_endpoint(api: NominatimAPIAsync, params: ASGIAdaptor) -> Any:
""" Server glue for /lookup endpoint. See API docs for details.
"""
fmt = params.parse_format(SearchResults, 'xml')
debug = params.setup_debugging()
details = params.parse_geometry_details(fmt)
details['locales'] = Locales.from_accept_languages(params.get_accepted_languages())
places = []
for oid in (params.get('osm_ids') or '').split(','):
oid = oid.strip()
if len(oid) > 1 and oid[0] in 'RNWrnw' and oid[1:].isdigit():
places.append(OsmID(oid[0].upper(), int(oid[1:])))
if len(places) > params.config().get_int('LOOKUP_MAX_COUNT'):
params.raise_error('Too many object IDs.')
if places:
results = await api.lookup(places, **details)
else:
results = SearchResults()
if debug:
return params.build_response(loglib.get_and_disable(), num_results=len(results))
fmt_options = {'extratags': params.get_bool('extratags', False),
'namedetails': params.get_bool('namedetails', False),
'addressdetails': params.get_bool('addressdetails', True)}
output = formatting.format_result(results, fmt, fmt_options)
return params.build_response(output, num_results=len(results))
async def _unstructured_search(query: str, api: NominatimAPIAsync,
details: Dict[str, Any]) -> SearchResults:
if not query:
return SearchResults()
# Extract special format for coordinates from query.
query, x, y = helpers.extract_coords_from_query(query)
if x is not None:
assert y is not None
details['near'] = Point(x, y)
details['near_radius'] = 0.1
# If no query is left, revert to reverse search.
if x is not None and not query:
result = await api.reverse(details['near'], **details)
if not result:
return SearchResults()
return SearchResults(
[SearchResult(**{f.name: getattr(result, f.name)
for f in dataclasses.fields(SearchResult)
if hasattr(result, f.name)})])
query, cls, typ = helpers.extract_category_from_query(query)
if cls is not None:
assert typ is not None
return await api.search_category([(cls, typ)], near_query=query, **details)
return await api.search(query, **details)
async def search_endpoint(api: NominatimAPIAsync, params: ASGIAdaptor) -> Any:
""" Server glue for /search endpoint. See API docs for details.
"""
fmt = params.parse_format(SearchResults, 'jsonv2')
debug = params.setup_debugging()
details = params.parse_geometry_details(fmt)
details['countries'] = params.get('countrycodes', None)
details['excluded'] = params.get('exclude_place_ids', None)
details['viewbox'] = params.get('viewbox', None) or params.get('viewboxlbrt', None)
details['bounded_viewbox'] = params.get_bool('bounded', False)
details['dedupe'] = params.get_bool('dedupe', True)
max_results = max(1, min(50, params.get_int('limit', 10)))
details['max_results'] = max_results + min(10, max_results) \
if details['dedupe'] else max_results
details['min_rank'], details['max_rank'] = \
helpers.feature_type_to_rank(params.get('featureType', ''))
if params.get('featureType', None) is not None:
details['layers'] = DataLayer.ADDRESS
else:
details['layers'] = params.get_layers()
details['locales'] = Locales.from_accept_languages(params.get_accepted_languages())
# unstructured query parameters
query = params.get('q', None)
# structured query parameters
queryparts = {}
for key in ('amenity', 'street', 'city', 'county', 'state', 'postalcode', 'country'):
details[key] = params.get(key, None)
if details[key]:
queryparts[key] = details[key]
try:
if query is not None:
if queryparts:
params.raise_error("Structured query parameters"
"(amenity, street, city, county, state, postalcode, country)"
" cannot be used together with 'q' parameter.")
queryparts['q'] = query
results = await _unstructured_search(query, api, details)
else:
query = ', '.join(queryparts.values())
results = await api.search_address(**details)
except UsageError as err:
params.raise_error(str(err))
if details['dedupe'] and len(results) > 1:
results = helpers.deduplicate_results(results, max_results)
if debug:
return params.build_response(loglib.get_and_disable(), num_results=len(results))
if fmt == 'xml':
helpers.extend_query_parts(queryparts, details,
params.get('featureType', ''),
params.get_bool('namedetails', False),
params.get_bool('extratags', False),
(str(r.place_id) for r in results if r.place_id))
queryparts['format'] = fmt
moreurl = params.base_uri() + '/search?' + urlencode(queryparts)
else:
moreurl = ''
fmt_options = {'query': query, 'more_url': moreurl,
'exclude_place_ids': queryparts.get('exclude_place_ids'),
'viewbox': queryparts.get('viewbox'),
'extratags': params.get_bool('extratags', False),
'namedetails': params.get_bool('namedetails', False),
'addressdetails': params.get_bool('addressdetails', False)}
output = formatting.format_result(results, fmt, fmt_options)
return params.build_response(output, num_results=len(results))
async def deletable_endpoint(api: NominatimAPIAsync, params: ASGIAdaptor) -> Any:
""" Server glue for /deletable endpoint.
This is a special endpoint that shows polygons that have been
deleted or are broken in the OSM data but are kept in the
Nominatim database to minimize disruption.
"""
fmt = params.parse_format(RawDataList, 'json')
async with api.begin() as conn:
sql = sa.text(""" SELECT p.place_id, country_code,
name->'name' as name, i.*
FROM placex p, import_polygon_delete i
WHERE p.osm_id = i.osm_id AND p.osm_type = i.osm_type
AND p.class = i.class AND p.type = i.type
""")
results = RawDataList(r._asdict() for r in await conn.execute(sql))
return params.build_response(formatting.format_result(results, fmt, {}))
async def polygons_endpoint(api: NominatimAPIAsync, params: ASGIAdaptor) -> Any:
""" Server glue for /polygons endpoint.
This is a special endpoint that shows polygons that have changed
their size but are kept in the Nominatim database with their
old area to minimize disruption.
"""
fmt = params.parse_format(RawDataList, 'json')
sql_params: Dict[str, Any] = {
'days': params.get_int('days', -1),
'cls': params.get('class')
}
reduced = params.get_bool('reduced', False)
async with api.begin() as conn:
sql = sa.select(sa.text("""osm_type, osm_id, class, type,
name->'name' as name,
country_code, errormessage, updated"""))\
.select_from(sa.text('import_polygon_error'))
if sql_params['days'] > 0:
sql = sql.where(sa.text("updated > 'now'::timestamp - make_interval(days => :days)"))
if reduced:
sql = sql.where(sa.text("errormessage like 'Area reduced%'"))
if sql_params['cls'] is not None:
sql = sql.where(sa.text("class = :cls"))
sql = sql.order_by(sa.literal_column('updated').desc()).limit(1000)
results = RawDataList(r._asdict() for r in await conn.execute(sql, sql_params))
return params.build_response(formatting.format_result(results, fmt, {}))
EndpointFunc = Callable[[NominatimAPIAsync, ASGIAdaptor], Any]
ROUTES = [
('status', status_endpoint),
('details', details_endpoint),
('reverse', reverse_endpoint),
('lookup', lookup_endpoint),
('search', search_endpoint),
('deletable', deletable_endpoint),
('polygons', polygons_endpoint),
]

View File

@@ -0,0 +1,11 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Version information for the Nominatim API.
"""
NOMINATIM_API_VERSION = '4.4.99'