split code into submodules

Sarah Hoffmann
2024-05-16 11:55:17 +02:00
parent 0fb4fe8e4d
commit 6e89310a92
137 changed files with 757 additions and 716 deletions


@@ -0,0 +1,38 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
The public interface of the Nominatim library.
Classes and functions defined in this file are considered stable. Always
import from this file, not from the source files directly.
"""
# See also https://github.com/PyCQA/pylint/issues/6006
# pylint: disable=useless-import-alias
from .core import (NominatimAPI as NominatimAPI,
NominatimAPIAsync as NominatimAPIAsync)
from .connection import (SearchConnection as SearchConnection)
from .status import (StatusResult as StatusResult)
from .types import (PlaceID as PlaceID,
OsmID as OsmID,
PlaceRef as PlaceRef,
Point as Point,
Bbox as Bbox,
GeometryFormat as GeometryFormat,
DataLayer as DataLayer)
from .results import (SourceTable as SourceTable,
AddressLine as AddressLine,
AddressLines as AddressLines,
WordInfo as WordInfo,
WordInfos as WordInfos,
DetailedResult as DetailedResult,
ReverseResult as ReverseResult,
ReverseResults as ReverseResults,
SearchResult as SearchResult,
SearchResults as SearchResults)
from .localization import (Locales as Locales)
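
The names above follow the package layout introduced in this commit; downstream code should import these stable names from the package root rather than from the individual submodules. A minimal sketch, assuming the package is installed as `nominatim_api` (the path suggested by src/nominatim_api/core.py below):

# Only names re-exported by the package root are part of the stable interface.
from nominatim_api import NominatimAPI, GeometryFormat, DataLayer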


@@ -0,0 +1,149 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Extended SQLAlchemy connection class that also includes access to the schema.
"""
from typing import cast, Any, Mapping, Sequence, Union, Dict, Optional, Set, \
Awaitable, Callable, TypeVar
import asyncio
import sqlalchemy as sa
from sqlalchemy.ext.asyncio import AsyncConnection
from nominatim_core.typing import SaFromClause
from nominatim_core.db.sqlalchemy_schema import SearchTables
from nominatim_core.db.sqlalchemy_types import Geometry
from .logging import log
T = TypeVar('T')
class SearchConnection:
""" An extended SQLAlchemy connection class, that also contains
the table definitions. The underlying asynchronous SQLAlchemy
connection can be accessed with the 'connection' property.
The 't' property is the collection of Nominatim tables.
"""
def __init__(self, conn: AsyncConnection,
tables: SearchTables,
properties: Dict[str, Any]) -> None:
self.connection = conn
self.t = tables # pylint: disable=invalid-name
self._property_cache = properties
self._classtables: Optional[Set[str]] = None
self.query_timeout: Optional[int] = None
def set_query_timeout(self, timeout: Optional[int]) -> None:
""" Set the timeout after which a query over this connection
is cancelled.
"""
self.query_timeout = timeout
async def scalar(self, sql: sa.sql.base.Executable,
params: Union[Mapping[str, Any], None] = None
) -> Any:
""" Execute a 'scalar()' query on the connection.
"""
log().sql(self.connection, sql, params)
return await asyncio.wait_for(self.connection.scalar(sql, params), self.query_timeout)
async def execute(self, sql: 'sa.Executable',
params: Union[Mapping[str, Any], Sequence[Mapping[str, Any]], None] = None
) -> 'sa.Result[Any]':
""" Execute a 'execute()' query on the connection.
"""
log().sql(self.connection, sql, params)
return await asyncio.wait_for(self.connection.execute(sql, params), self.query_timeout)
async def get_property(self, name: str, cached: bool = True) -> str:
""" Get a property from Nominatim's property table.
Property values are normally cached so that they are only
retrieved from the database when they are queried for the
first time with this function. Set 'cached' to False to force
reading the property from the database.
Raises a ValueError if the property does not exist.
"""
lookup_name = f'DBPROP:{name}'
if cached and lookup_name in self._property_cache:
return cast(str, self._property_cache[lookup_name])
sql = sa.select(self.t.properties.c.value)\
.where(self.t.properties.c.property == name)
value = await self.connection.scalar(sql)
if value is None:
raise ValueError(f"Property '{name}' not found in database.")
self._property_cache[lookup_name] = cast(str, value)
return cast(str, value)
async def get_db_property(self, name: str) -> Any:
""" Get a setting from the database. At the moment, only
'server_version', the version of the database software, can
be retrieved with this function.
Raises a ValueError if the property does not exist.
"""
if name != 'server_version':
raise ValueError(f"DB setting '{name}' not found in database.")
return self._property_cache['DB:server_version']
async def get_cached_value(self, group: str, name: str,
factory: Callable[[], Awaitable[T]]) -> T:
""" Access the cache for this Nominatim instance.
Each cache value needs to belong to a group and have a name.
This function is for internal API use only.
`factory` is an async callback function that produces
the value if it is not already cached.
Returns the cached value or the result of factory (also caching
the result).
"""
full_name = f'{group}:{name}'
if full_name in self._property_cache:
return cast(T, self._property_cache[full_name])
value = await factory()
self._property_cache[full_name] = value
return value
async def get_class_table(self, cls: str, typ: str) -> Optional[SaFromClause]:
""" Lookup up if there is a classtype table for the given category
and return a SQLAlchemy table for it, if it exists.
"""
if self._classtables is None:
res = await self.execute(sa.text("""SELECT tablename FROM pg_tables
WHERE tablename LIKE 'place_classtype_%'
"""))
self._classtables = {r[0] for r in res}
tablename = f"place_classtype_{cls}_{typ}"
if tablename not in self._classtables:
return None
if tablename in self.t.meta.tables:
return self.t.meta.tables[tablename]
return sa.Table(tablename, self.t.meta,
sa.Column('place_id', sa.BigInteger),
sa.Column('centroid', Geometry))
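
A short sketch of how the class above is meant to be used, assuming a SearchConnection obtained from NominatimAPIAsync.begin() (defined in core.py below) and that `placex` is one of the tables exposed through `conn.t`:

import sqlalchemy as sa

async def count_places(conn: SearchConnection) -> int:
    # Cancel the query if it runs longer than 10 seconds.
    conn.set_query_timeout(10)
    return await conn.scalar(sa.select(sa.func.count()).select_from(conn.t.placex))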

src/nominatim_api/core.py

@@ -0,0 +1,974 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Implementation of classes for API access via libraries.
"""
from typing import Mapping, Optional, Any, AsyncIterator, Dict, Sequence, List, Tuple
import asyncio
import sys
import contextlib
from pathlib import Path
import sqlalchemy as sa
import sqlalchemy.ext.asyncio as sa_asyncio
from nominatim_core.errors import UsageError
from nominatim_core.db.sqlalchemy_schema import SearchTables
from nominatim_core.db.async_core_library import PGCORE_LIB, PGCORE_ERROR
from nominatim_core.config import Configuration
from .sql import sqlite_functions, sqlalchemy_functions #pylint: disable=unused-import
from .connection import SearchConnection
from .status import get_status, StatusResult
from .lookup import get_detailed_place, get_simple_place
from .reverse import ReverseGeocoder
from .search import ForwardGeocoder, Phrase, PhraseType, make_query_analyzer
from . import types as ntyp
from .results import DetailedResult, ReverseResult, SearchResults
class NominatimAPIAsync: #pylint: disable=too-many-instance-attributes
""" The main frontend to the Nominatim database implements the
functions for lookup, forward and reverse geocoding using
asynchronous functions.
This class shares most of the functions with its synchronous
version. There are some additional functions or parameters,
which are documented below.
"""
def __init__(self, project_dir: Path,
environ: Optional[Mapping[str, str]] = None,
loop: Optional[asyncio.AbstractEventLoop] = None) -> None:
""" Initiate a new frontend object with synchronous API functions.
Parameters:
project_dir: Path to the
[project directory](../admin/Import.md#creating-the-project-directory)
of the local Nominatim installation.
environ: Mapping of [configuration parameters](../customize/Settings.md).
When set, replaces any configuration via environment variables.
Settings in this mapping also have precedence over any
parameters found in the `.env` file of the project directory.
loop: The asyncio event loop that will be used when calling
functions. Only needed when a custom event loop is used
and the Python version is 3.9 or earlier.
"""
self.config = Configuration(project_dir, environ)
self.query_timeout = self.config.get_int('QUERY_TIMEOUT') \
if self.config.QUERY_TIMEOUT else None
self.reverse_restrict_to_country_area = self.config.get_bool('SEARCH_WITHIN_COUNTRIES')
self.server_version = 0
if sys.version_info >= (3, 10):
self._engine_lock = asyncio.Lock()
else:
self._engine_lock = asyncio.Lock(loop=loop) # pylint: disable=unexpected-keyword-arg
self._engine: Optional[sa_asyncio.AsyncEngine] = None
self._tables: Optional[SearchTables] = None
self._property_cache: Dict[str, Any] = {'DB:server_version': 0}
async def setup_database(self) -> None:
""" Set up the SQL engine and connections.
This function will be implicitly called when the database is
accessed for the first time. You may also call it explicitly so
that the first call is not delayed by the setup.
"""
async with self._engine_lock:
if self._engine:
return
extra_args: Dict[str, Any] = {'future': True,
'echo': self.config.get_bool('DEBUG_SQL')}
if self.config.get_int('API_POOL_SIZE') == 0:
extra_args['poolclass'] = sa.pool.NullPool
else:
extra_args['poolclass'] = sa.pool.AsyncAdaptedQueuePool
extra_args['max_overflow'] = 0
extra_args['pool_size'] = self.config.get_int('API_POOL_SIZE')
is_sqlite = self.config.DATABASE_DSN.startswith('sqlite:')
if is_sqlite:
params = dict((p.split('=', 1)
for p in self.config.DATABASE_DSN[7:].split(';')))
dburl = sa.engine.URL.create('sqlite+aiosqlite',
database=params.get('dbname'))
if not ('NOMINATIM_DATABASE_RW' in self.config.environ
and self.config.get_bool('DATABASE_RW')) \
and not Path(params.get('dbname', '')).is_file():
raise UsageError(f"SQlite database '{params.get('dbname')}' does not exist.")
else:
dsn = self.config.get_database_params()
query = {k: v for k, v in dsn.items()
if k not in ('user', 'password', 'dbname', 'host', 'port')}
dburl = sa.engine.URL.create(
f'postgresql+{PGCORE_LIB}',
database=dsn.get('dbname'),
username=dsn.get('user'),
password=dsn.get('password'),
host=dsn.get('host'),
port=int(dsn['port']) if 'port' in dsn else None,
query=query)
engine = sa_asyncio.create_async_engine(dburl, **extra_args)
if is_sqlite:
server_version = 0
@sa.event.listens_for(engine.sync_engine, "connect")
def _on_sqlite_connect(dbapi_con: Any, _: Any) -> None:
dbapi_con.run_async(lambda conn: conn.enable_load_extension(True))
sqlite_functions.install_custom_functions(dbapi_con)
cursor = dbapi_con.cursor()
cursor.execute("SELECT load_extension('mod_spatialite')")
cursor.execute('SELECT SetDecimalPrecision(7)')
dbapi_con.run_async(lambda conn: conn.enable_load_extension(False))
else:
try:
async with engine.begin() as conn:
result = await conn.scalar(sa.text('SHOW server_version_num'))
server_version = int(result)
if server_version >= 110000:
await conn.execute(sa.text("SET jit_above_cost TO '-1'"))
await conn.execute(sa.text(
"SET max_parallel_workers_per_gather TO '0'"))
except (PGCORE_ERROR, sa.exc.OperationalError):
server_version = 0
if server_version >= 110000:
@sa.event.listens_for(engine.sync_engine, "connect")
def _on_connect(dbapi_con: Any, _: Any) -> None:
cursor = dbapi_con.cursor()
cursor.execute("SET jit_above_cost TO '-1'")
cursor.execute("SET max_parallel_workers_per_gather TO '0'")
self._property_cache['DB:server_version'] = server_version
self._tables = SearchTables(sa.MetaData()) # pylint: disable=no-member
self._engine = engine
async def close(self) -> None:
""" Close all active connections to the database. The NominatimAPIAsync
object remains usable after closing. If a new API function is
called, new connections are created.
"""
if self._engine is not None:
await self._engine.dispose()
@contextlib.asynccontextmanager
async def begin(self) -> AsyncIterator[SearchConnection]:
""" Create a new connection with automatic transaction handling.
This function may be used to get low-level access to the database.
Refer to the SQLAlchemy documentation for details on how to use
the connection object.
"""
if self._engine is None:
await self.setup_database()
assert self._engine is not None
assert self._tables is not None
async with self._engine.begin() as conn:
yield SearchConnection(conn, self._tables, self._property_cache)
async def status(self) -> StatusResult:
""" Return the status of the database.
"""
try:
async with self.begin() as conn:
conn.set_query_timeout(self.query_timeout)
status = await get_status(conn)
except (PGCORE_ERROR, sa.exc.OperationalError):
return StatusResult(700, 'Database connection failed')
return status
async def details(self, place: ntyp.PlaceRef, **params: Any) -> Optional[DetailedResult]:
""" Get detailed information about a place in the database.
Returns None if there is no entry under the given ID.
"""
details = ntyp.LookupDetails.from_kwargs(params)
async with self.begin() as conn:
conn.set_query_timeout(self.query_timeout)
if details.keywords:
await make_query_analyzer(conn)
return await get_detailed_place(conn, place, details)
async def lookup(self, places: Sequence[ntyp.PlaceRef], **params: Any) -> SearchResults:
""" Get simple information about a list of places.
Returns a list of place information for all IDs that were found.
"""
details = ntyp.LookupDetails.from_kwargs(params)
async with self.begin() as conn:
conn.set_query_timeout(self.query_timeout)
if details.keywords:
await make_query_analyzer(conn)
return SearchResults(filter(None,
[await get_simple_place(conn, p, details) for p in places]))
async def reverse(self, coord: ntyp.AnyPoint, **params: Any) -> Optional[ReverseResult]:
""" Find a place by its coordinates. Also known as reverse geocoding.
Returns the closest result that can be found or None if
no place matches the given criteria.
"""
# The following negation handles NaN correctly. Don't change.
if not abs(coord[0]) <= 180 or not abs(coord[1]) <= 90:
# There are no results to be expected outside valid coordinates.
return None
details = ntyp.ReverseDetails.from_kwargs(params)
async with self.begin() as conn:
conn.set_query_timeout(self.query_timeout)
if details.keywords:
await make_query_analyzer(conn)
geocoder = ReverseGeocoder(conn, details,
self.reverse_restrict_to_country_area)
return await geocoder.lookup(coord)
async def search(self, query: str, **params: Any) -> SearchResults:
""" Find a place by free-text search. Also known as forward geocoding.
"""
query = query.strip()
if not query:
raise UsageError('Nothing to search for.')
async with self.begin() as conn:
conn.set_query_timeout(self.query_timeout)
geocoder = ForwardGeocoder(conn, ntyp.SearchDetails.from_kwargs(params),
self.config.get_int('REQUEST_TIMEOUT') \
if self.config.REQUEST_TIMEOUT else None)
phrases = [Phrase(PhraseType.NONE, p.strip()) for p in query.split(',')]
return await geocoder.lookup(phrases)
# pylint: disable=too-many-arguments,too-many-branches
async def search_address(self, amenity: Optional[str] = None,
street: Optional[str] = None,
city: Optional[str] = None,
county: Optional[str] = None,
state: Optional[str] = None,
country: Optional[str] = None,
postalcode: Optional[str] = None,
**params: Any) -> SearchResults:
""" Find an address using structured search.
"""
async with self.begin() as conn:
conn.set_query_timeout(self.query_timeout)
details = ntyp.SearchDetails.from_kwargs(params)
phrases: List[Phrase] = []
if amenity:
phrases.append(Phrase(PhraseType.AMENITY, amenity))
if street:
phrases.append(Phrase(PhraseType.STREET, street))
if city:
phrases.append(Phrase(PhraseType.CITY, city))
if county:
phrases.append(Phrase(PhraseType.COUNTY, county))
if state:
phrases.append(Phrase(PhraseType.STATE, state))
if postalcode:
phrases.append(Phrase(PhraseType.POSTCODE, postalcode))
if country:
phrases.append(Phrase(PhraseType.COUNTRY, country))
if not phrases:
raise UsageError('Nothing to search for.')
if amenity or street:
details.restrict_min_max_rank(26, 30)
elif city:
details.restrict_min_max_rank(13, 25)
elif county:
details.restrict_min_max_rank(10, 12)
elif state:
details.restrict_min_max_rank(5, 9)
elif postalcode:
details.restrict_min_max_rank(5, 11)
else:
details.restrict_min_max_rank(4, 4)
if 'layers' not in params:
details.layers = ntyp.DataLayer.ADDRESS
if amenity:
details.layers |= ntyp.DataLayer.POI
geocoder = ForwardGeocoder(conn, details,
self.config.get_int('REQUEST_TIMEOUT') \
if self.config.REQUEST_TIMEOUT else None)
return await geocoder.lookup(phrases)
async def search_category(self, categories: List[Tuple[str, str]],
near_query: Optional[str] = None,
**params: Any) -> SearchResults:
""" Find an object of a certain category near another place.
The near place may either be given as an unstructured search
query in itself or as coordinates.
"""
if not categories:
return SearchResults()
details = ntyp.SearchDetails.from_kwargs(params)
async with self.begin() as conn:
conn.set_query_timeout(self.query_timeout)
if near_query:
phrases = [Phrase(PhraseType.NONE, p) for p in near_query.split(',')]
else:
phrases = []
if details.keywords:
await make_query_analyzer(conn)
geocoder = ForwardGeocoder(conn, details,
self.config.get_int('REQUEST_TIMEOUT') \
if self.config.REQUEST_TIMEOUT else None)
return await geocoder.lookup_pois(categories, phrases)
class NominatimAPI:
""" This class provides a thin synchronous wrapper around the asynchronous
Nominatim functions. It creates its own event loop and runs each
synchronous function call to completion using that loop.
"""
def __init__(self, project_dir: Path,
environ: Optional[Mapping[str, str]] = None) -> None:
""" Initiate a new frontend object with synchronous API functions.
Parameters:
project_dir: Path to the
[project directory](../admin/Import.md#creating-the-project-directory)
of the local Nominatim installation.
environ: Mapping of [configuration parameters](../customize/Settings.md).
When set, replaces any configuration via environment variables.
Settings in this mapping also have precedence over any
parameters found in the `.env` file of the project directory.
"""
self._loop = asyncio.new_event_loop()
self._async_api = NominatimAPIAsync(project_dir, environ, loop=self._loop)
def close(self) -> None:
""" Close all active connections to the database.
This function also closes the asynchronous worker loop, making
the NominatimAPI object unusable.
"""
self._loop.run_until_complete(self._async_api.close())
self._loop.close()
@property
def config(self) -> Configuration:
""" Provide read-only access to the [configuration](#Configuration)
used by the API.
"""
return self._async_api.config
def status(self) -> StatusResult:
""" Return the status of the database as a dataclass object
with the fields described below.
Returns:
status(int): A status code as described on the status page.
message(str): Either 'OK' or a human-readable message of the
problem encountered.
software_version(tuple): A tuple with the version of the
Nominatim library, consisting of (major, minor, patch, db-patch).
database_version(tuple): A tuple with the version of the library
which was used for the import or last migration.
Also consists of (major, minor, patch, db-patch).
data_updated(datetime): Timestamp with the age of the data.
"""
return self._loop.run_until_complete(self._async_api.status())
def details(self, place: ntyp.PlaceRef, **params: Any) -> Optional[DetailedResult]:
""" Get detailed information about a place in the database.
The result is a dataclass object with the fields described below
or `None` if the place could not be found in the database.
Parameters:
place: Description of the place to look up. See
[Place identification](Input-Parameter-Types.md#place-identification)
for the various ways to reference a place.
Other parameters:
geometry_output (enum): Add the full geometry of the place to the result.
Multiple formats may be selected. Note that geometries can become
quite large. (Default: none)
geometry_simplification (float): Simplification factor to use on
the geometries before returning them. The factor expresses
the tolerance in degrees from which the geometry may differ.
Topology is preserved. (Default: 0.0)
address_details (bool): Add detailed information about the places
that make up the address of the requested object. (Default: False)
linked_places (bool): Add detailed information about the places
that link to the result. (Default: False)
parented_places (bool): Add detailed information about all places
for which the requested object is a parent, i.e. all places for
which the object provides the address details.
Only POI places can have parents. (Default: False)
keywords (bool): Add detailed information about the search terms
used for this place.
Returns:
source_table (enum): Data source of the place. See below for possible values.
category (tuple): A tuple of two strings with the primary OSM tag
and value.
centroid (Point): Point position of the place.
place_id (Optional[int]): Internal ID of the place. This ID may differ
for the same place between different installations.
parent_place_id (Optional[int]): Internal ID of the parent of this
place. Only meaningful for POI-like objects (places with a
rank_address of 30).
linked_place_id (Optional[int]): Internal ID of the place this object
links to. When this ID is set then there is no guarantee that
the rest of the result information is complete.
admin_level (int): Value of the `admin_level` OSM tag. Only meaningful
for administrative boundary objects.
indexed_date (datetime): Timestamp when the place was last updated.
osm_object (Optional[tuple]): OSM type and ID of the place, if available.
names (Optional[dict]): Dictionary of names of the place. Keys are
usually the corresponding OSM tag keys.
address (Optional[dict]): Dictionary of address parts directly
attributed to the place. Keys are usually the corresponding
OSM tag keys with the `addr:` prefix removed.
extratags (Optional[dict]): Dictionary of additional attributes for
the place. Usually OSM tag keys and values.
housenumber (Optional[str]): House number of the place, normalised
for lookup. To get the house number in its original spelling,
use `address['housenumber']`.
postcode (Optional[str]): Computed postcode for the place. To get
directly attributed postcodes, use `address['postcode']` instead.
wikipedia (Optional[str]): Reference to a wikipedia site for the place.
The string has the format <language code>:<wikipedia title>.
rank_address (int): [Address rank](../customize/Ranking.md#address-rank).
rank_search (int): [Search rank](../customize/Ranking.md#search-rank).
importance (Optional[float]): Relative importance of the place. This is a measure
of how likely the place will be searched for.
country_code (Optional[str]): Country the feature is in as
ISO 3166-1 alpha-2 country code.
address_rows (Optional[AddressLines]): List of places that make up the
computed address. `None` when `address_details` parameter was False.
linked_rows (Optional[AddressLines]): List of places that link to the object.
`None` when `linked_places` parameter was False.
parented_rows (Optional[AddressLines]): List of direct children of the place.
`None` when `parented_places` parameter was False.
name_keywords (Optional[WordInfos]): List of search words for the name of
the place. `None` when `keywords` parameter is set to False.
address_keywords (Optional[WordInfos]): List of search words for the address of
the place. `None` when `keywords` parameter is set to False.
geometry (dict): Dictionary containing the full geometry of the place
in the formats requested in the `geometry_output` parameter.
"""
return self._loop.run_until_complete(self._async_api.details(place, **params))
def lookup(self, places: Sequence[ntyp.PlaceRef], **params: Any) -> SearchResults:
""" Get simple information about a list of places.
Returns a list of place information for all IDs that were found.
Each result is a dataclass with the fields detailed below.
Parameters:
places: List of descriptions of the place to look up. See
[Place identification](Input-Parameter-Types.md#place-identification)
for the various ways to reference a place.
Other parameters:
geometry_output (enum): Add the full geometry of the place to the result.
Multiple formats may be selected. Note that geometries can become
quite large. (Default: none)
geometry_simplification (float): Simplification factor to use on
the geometries before returning them. The factor expresses
the tolerance in degrees from which the geometry may differ.
Topology is preserved. (Default: 0.0)
address_details (bool): Add detailed information about the places
that make up the address of the requested object. (Default: False)
linked_places (bool): Add detailed information about the places
that link to the result. (Default: False)
parented_places (bool): Add detailed information about all places
for which the requested object is a parent, i.e. all places for
which the object provides the address details.
Only POI places can have parents. (Default: False)
keywords (bool): Add detailed information about the search terms
used for this place.
Returns:
source_table (enum): Data source of the place. See below for possible values.
category (tuple): A tuple of two strings with the primary OSM tag
and value.
centroid (Point): Point position of the place.
place_id (Optional[int]): Internal ID of the place. This ID may differ
for the same place between different installations.
osm_object (Optional[tuple]): OSM type and ID of the place, if available.
names (Optional[dict]): Dictionary of names of the place. Keys are
usually the corresponding OSM tag keys.
address (Optional[dict]): Dictionary of address parts directly
attributed to the place. Keys are usually the corresponding
OSM tag keys with the `addr:` prefix removed.
extratags (Optional[dict]): Dictionary of additional attributes for
the place. Usually OSM tag keys and values.
housenumber (Optional[str]): House number of the place, normalised
for lookup. To get the house number in its original spelling,
use `address['housenumber']`.
postcode (Optional[str]): Computed postcode for the place. To get
directly attributed postcodes, use `address['postcode']` instead.
wikipedia (Optional[str]): Reference to a wikipedia site for the place.
The string has the format <language code>:<wikipedia title>.
rank_address (int): [Address rank](../customize/Ranking.md#address-rank).
rank_search (int): [Search rank](../customize/Ranking.md#search-rank).
importance (Optional[float]): Relative importance of the place. This is a measure
of how likely the place will be searched for.
country_code (Optional[str]): Country the feature is in as
ISO 3166-1 alpha-2 country code.
address_rows (Optional[AddressLines]): List of places that make up the
computed address. `None` when `address_details` parameter was False.
linked_rows (Optional[AddressLines]): List of places that link to the object.
`None` when `linked_places` parameter was False.
parented_rows (Optional[AddressLines]): List of direct children of the place.
`None` when `parented_places` parameter was False.
name_keywords (Optional[WordInfos]): List of search words for the name of
the place. `None` when `keywords` parameter is set to False.
address_keywords (Optional[WordInfos]): List of search words for the address of
the place. `None` when `keywords` parameter is set to False.
bbox (Bbox): Bounding box of the full geometry of the place.
If the place is a single point, then the size of the bounding
box is guessed according to the type of place.
geometry (dict): Dictionary containing the full geometry of the place
in the formats requested in the `geometry_output` parameter.
"""
return self._loop.run_until_complete(self._async_api.lookup(places, **params))
def reverse(self, coord: ntyp.AnyPoint, **params: Any) -> Optional[ReverseResult]:
""" Find a place by its coordinates. Also known as reverse geocoding.
Returns the closest result that can be found or `None` if
no place matches the given criteria. The result is a dataclass
with the fields as detailed below.
Parameters:
coord: Coordinate to lookup the place for as a Point
or a tuple (x, y). Must be in WGS84 projection.
Other parameters:
max_rank (int): Highest address rank to return. Can be used to
restrict search to streets or settlements.
layers (enum): Defines the kind of data to take into account.
See description of layers below. (Default: addresses and POIs)
geometry_output (enum): Add the full geometry of the place to the result.
Multiple formats may be selected. Note that geometries can become
quite large. (Default: none)
geometry_simplification (float): Simplification factor to use on
the geometries before returning them. The factor expresses
the tolerance in degrees from which the geometry may differ.
Topology is preserved. (Default: 0.0)
address_details (bool): Add detailed information about the places
that make up the address of the requested object. (Default: False)
linked_places (bool): Add detailed information about the places
that link to the result. (Default: False)
parented_places (bool): Add detailed information about all places
for which the requested object is a parent, i.e. all places for
which the object provides the address details.
Only POI places can have parents. (Default: False)
keywords (bool): Add detailed information about the search terms
used for this place.
Returns:
source_table (enum): Data source of the place. See below for possible values.
category (tuple): A tuple of two strings with the primary OSM tag
and value.
centroid (Point): Point position of the place.
place_id (Optional[int]): Internal ID of the place. This ID may differ
for the same place between different installations.
osm_object (Optional[tuple]): OSM type and ID of the place, if available.
names (Optional[dict]): Dictionary of names of the place. Keys are
usually the corresponding OSM tag keys.
address (Optional[dict]): Dictionary of address parts directly
attributed to the place. Keys are usually the corresponding
OSM tag keys with the `addr:` prefix removed.
extratags (Optional[dict]): Dictionary of additional attributes for
the place. Usually OSM tag keys and values.
housenumber (Optional[str]): House number of the place, normalised
for lookup. To get the house number in its original spelling,
use `address['housenumber']`.
postcode (Optional[str]): Computed postcode for the place. To get
directly attributed postcodes, use `address['postcode']` instead.
wikipedia (Optional[str]): Reference to a wikipedia site for the place.
The string has the format <language code>:<wikipedia title>.
rank_address (int): [Address rank](../customize/Ranking.md#address-rank).
rank_search (int): [Search rank](../customize/Ranking.md#search-rank).
importance (Optional[float]): Relative importance of the place. This is a measure
of how likely the place will be searched for.
country_code (Optional[str]): Country the feature is in as
ISO 3166-1 alpha-2 country code.
address_rows (Optional[AddressLines]): List of places that make up the
computed address. `None` when `address_details` parameter was False.
linked_rows (Optional[AddressLines]): List of places that link to the object.
`None` when `linked_places` parameter was False.
parented_rows (Optional[AddressLines]): List of direct children of the place.
`None` when `parented_places` parameter was False.
name_keywords (Optional[WordInfos]): List of search words for the name of
the place. `None` when `keywords` parameter is set to False.
address_keywords (Optional[WordInfos]): List of search words for the address of
the place. `None` when `keywords` parameter is set to False.
bbox (Bbox): Bounding box of the full geometry of the place.
If the place is a single point, then the size of the bounding
box is guessed according to the type of place.
geometry (dict): Dictionary containing the full geometry of the place
in the formats requested in the `geometry_output` parameter.
distance (Optional[float]): Distance in degrees from the input point.
"""
return self._loop.run_until_complete(self._async_api.reverse(coord, **params))
def search(self, query: str, **params: Any) -> SearchResults:
""" Find a place by free-text search. Also known as forward geocoding.
Parameters:
query: Free-form text query searching for a place.
Other parameters:
max_results (int): Maximum number of results to return. The
actual number of results may be less. (Default: 10)
min_rank (int): Lowest permissible rank for the result.
For addressable places this is the minimum
[address rank](../customize/Ranking.md#address-rank). For all
other places the [search rank](../customize/Ranking.md#search-rank)
is used.
max_rank (int): Highest permissible rank for the result. See min_rank above.
layers (enum): Defines the kind of data to take into account.
See [layers section](Input-Parameter-Types.md#layers) for details.
(Default: addresses and POIs)
countries (list[str]): Restrict search to countries with the given
ISO 3166-1 alpha-2 country code. An empty list (the default)
disables this filter.
excluded (list[int]): A list of internal IDs of places to exclude
from the search.
viewbox (Optional[Bbox]): Bounding box of an area to focus search on.
bounded_viewbox (bool): Consider the bounding box given in `viewbox`
as a filter and return only results within the bounding box.
near (Optional[Point]): Focus search around the given point and
return results ordered by distance to the given point.
near_radius (Optional[float]): Restrict results to results within
the given distance in degrees of `near` point. Ignored, when
`near` is not set.
categories (list[tuple]): Restrict search to places of the given
categories. The category is the main OSM tag assigned to each
place. An empty list (the default) disables this filter.
geometry_output (enum): Add the full geometry of the place to the result.
Multiple formats may be selected. Note that geometries can become
quite large. (Default: none)
geometry_simplification (float): Simplification factor to use on
the geometries before returning them. The factor expresses
the tolerance in degrees from which the geometry may differ.
Topology is preserved. (Default: 0.0)
address_details (bool): Add detailed information about the places
that make up the address of the requested object. (Default: False)
linked_places (bool): Add detailed information about the places
that link to the result. (Default: False)
parented_places (bool): Add detailed information about all places
for which the requested object is a parent, i.e. all places for
which the object provides the address details.
Only POI places can have parents. (Default: False)
keywords (bool): Add detailed information about the search terms
used for this place.
Returns:
source_table (enum): Data source of the place. See below for possible values.
category (tuple): A tuple of two strings with the primary OSM tag
and value.
centroid (Point): Point position of the place.
place_id (Optional[int]): Internal ID of the place. This ID may differ
for the same place between different installations.
osm_object (Optional[tuple]): OSM type and ID of the place, if available.
names (Optional[dict]): Dictionary of names of the place. Keys are
usually the corresponding OSM tag keys.
address (Optional[dict]): Dictionary of address parts directly
attributed to the place. Keys are usually the corresponding
OSM tag keys with the `addr:` prefix removed.
extratags (Optional[dict]): Dictionary of additional attributes for
the place. Usually OSM tag keys and values.
housenumber (Optional[str]): House number of the place, normalised
for lookup. To get the house number in its original spelling,
use `address['housenumber']`.
postcode (Optional[str]): Computed postcode for the place. To get
directly attributed postcodes, use `address['postcode']` instead.
wikipedia (Optional[str]): Reference to a wikipedia site for the place.
The string has the format <language code>:<wikipedia title>.
rank_address (int): [Address rank](../customize/Ranking.md#address-rank).
rank_search (int): [Search rank](../customize/Ranking.md#search-rank).
importance (Optional[float]): Relative importance of the place. This is a measure
of how likely the place will be searched for.
country_code (Optional[str]): Country the feature is in as
ISO 3166-1 alpha-2 country code.
address_rows (Optional[AddressLines]): List of places that make up the
computed address. `None` when `address_details` parameter was False.
linked_rows (Optional[AddressLines]): List of places that link to the object.
`None` when `linked_places` parameter was False.
parented_rows (Optional[AddressLines]): List of direct children of the place.
`None` when `parented_places` parameter was False.
name_keywords (Optional[WordInfos]): List of search words for the name of
the place. `None` when `keywords` parameter is set to False.
address_keywords (Optional[WordInfos]): List of search words for the address of
the place. `None` when `keywords` parameter is set to False.
bbox (Bbox): Bounding box of the full geometry of the place.
If the place is a single point, then the size of the bounding
box is guessed according to the type of place.
geometry (dict): Dictionary containing the full geometry of the place
in the formats requested in the `geometry_output` parameter.
"""
return self._loop.run_until_complete(
self._async_api.search(query, **params))
# pylint: disable=too-many-arguments
def search_address(self, amenity: Optional[str] = None,
street: Optional[str] = None,
city: Optional[str] = None,
county: Optional[str] = None,
state: Optional[str] = None,
country: Optional[str] = None,
postalcode: Optional[str] = None,
**params: Any) -> SearchResults:
""" Find an address using structured search.
Parameters:
amenity: Name of a POI.
street: Street and optionally housenumber of the address. If the address
does not have a street, then the place that the house number refers to.
city: Postal city of the address.
county: County equivalent of the address. Does not exist in all
jurisdictions.
state: State or province of the address.
country: Country with its full name or its ISO 3166-1 alpha-2 country code.
Do not use together with the country_code filter.
postalcode: Post code or ZIP for the place.
Other parameters:
max_results (int): Maximum number of results to return. The
actual number of results may be less. (Default: 10)
min_rank (int): Lowest permissible rank for the result.
For addressable places this is the minimum
[address rank](../customize/Ranking.md#address-rank). For all
other places the [search rank](../customize/Ranking.md#search-rank)
is used.
max_rank (int): Highest permissible rank for the result. See min_rank above.
layers (enum): Defines the kind of data to take into account.
See [layers section](Input-Parameter-Types.md#layers) for details.
(Default: addresses and POIs)
countries (list[str]): Restrict search to countries with the given
ISO 3166-1 alpha-2 country code. An empty list (the default)
disables this filter. Do not use when the country parameter
is used.
excluded (list[int]): A list of internal IDs of places to exclude
from the search.
viewbox (Optional[Bbox]): Bounding box of an area to focus search on.
bounded_viewbox (bool): Consider the bounding box given in `viewbox`
as a filter and return only results within the bounding box.
near (Optional[Point]): Focus search around the given point and
return results ordered by distance to the given point.
near_radius (Optional[float]): Restrict results to results within
the given distance in degrees of `near` point. Ignored, when
`near` is not set.
categories (list[tuple]): Restrict search to places of the given
categories. The category is the main OSM tag assigned to each
place. An empty list (the default) disables this filter.
geometry_output (enum): Add the full geometry of the place to the result.
Multiple formats may be selected. Note that geometries can become
quite large. (Default: none)
geometry_simplification (float): Simplification factor to use on
the geometries before returning them. The factor expresses
the tolerance in degrees from which the geometry may differ.
Topology is preserved. (Default: 0.0)
address_details (bool): Add detailed information about the places
that make up the address of the requested object. (Default: False)
linked_places (bool): Add detailed information about the places
that link to the result. (Default: False)
parented_places (bool): Add detailed information about all places
for which the requested object is a parent, i.e. all places for
which the object provides the address details.
Only POI places can have parents. (Default: False)
keywords (bool): Add detailed information about the search terms
used for this place.
Returns:
source_table (enum): Data source of the place. See below for possible values.
category (tuple): A tuple of two strings with the primary OSM tag
and value.
centroid (Point): Point position of the place.
place_id (Optional[int]): Internal ID of the place. This ID may differ
for the same place between different installations.
osm_object (Optional[tuple]): OSM type and ID of the place, if available.
names (Optional[dict]): Dictionary of names of the place. Keys are
usually the corresponding OSM tag keys.
address (Optional[dict]): Dictionary of address parts directly
attributed to the place. Keys are usually the corresponding
OSM tag keys with the `addr:` prefix removed.
extratags (Optional[dict]): Dictionary of additional attributes for
the place. Usually OSM tag keys and values.
housenumber (Optional[str]): House number of the place, normalised
for lookup. To get the house number in its original spelling,
use `address['housenumber']`.
postcode (Optional[str]): Computed postcode for the place. To get
directly attributed postcodes, use `address['postcode']` instead.
wikipedia (Optional[str]): Reference to a wikipedia site for the place.
The string has the format <language code>:<wikipedia title>.
rank_address (int): [Address rank](../customize/Ranking.md#address-rank).
rank_search (int): [Search rank](../customize/Ranking.md#search-rank).
importance (Optional[float]): Relative importance of the place. This is a measure
of how likely the place will be searched for.
country_code (Optional[str]): Country the feature is in as
ISO 3166-1 alpha-2 country code.
address_rows (Optional[AddressLines]): List of places that make up the
computed address. `None` when `address_details` parameter was False.
linked_rows (Optional[AddressLines]): List of places that link to the object.
`None` when `linked_places` parameter was False.
parented_rows (Optional[AddressLines]): List of direct children of the place.
`None` when `parented_places` parameter was False.
name_keywords (Optional[WordInfos]): List of search words for the name of
the place. `None` when `keywords` parameter is set to False.
address_keywords (Optional[WordInfos]): List of search words for the address of
the place. `None` when `keywords` parameter is set to False.
bbox (Bbox): Bounding box of the full geometry of the place.
If the place is a single point, then the size of the bounding
box is guessed according to the type of place.
geometry (dict): Dictionary containing the full geometry of the place
in the formats requested in the `geometry_output` parameter.
"""
return self._loop.run_until_complete(
self._async_api.search_address(amenity, street, city, county,
state, country, postalcode, **params))
def search_category(self, categories: List[Tuple[str, str]],
near_query: Optional[str] = None,
**params: Any) -> SearchResults:
""" Find an object of a certain category near another place.
The near place may either be given as an unstructured search
query in itself or as a geographic area through the
viewbox or near parameters.
Parameters:
categories: Restrict search to places of the given
categories. The category is the main OSM tag assigned to each
place.
near_query: Optional free-text query to define the area to
restrict search to.
Other parameters:
max_results (int): Maximum number of results to return. The
actual number of results may be less. (Default: 10)
min_rank (int): Lowest permissible rank for the result.
For addressable places this is the minimum
[address rank](../customize/Ranking.md#address-rank). For all
other places the [search rank](../customize/Ranking.md#search-rank)
is used.
max_rank (int): Highest permissible rank for the result. See min_rank above.
layers (enum): Defines the kind of data to take into account.
See [layers section](Input-Parameter-Types.md#layers) for details.
(Default: addresses and POIs)
countries (list[str]): Restrict search to countries with the given
ISO 3166-1 alpha-2 country code. An empty list (the default)
disables this filter.
excluded (list[int]): A list of internal IDs of places to exclude
from the search.
viewbox (Optional[Bbox]): Bounding box of an area to focus search on.
bounded_viewbox (bool): Consider the bounding box given in `viewbox`
as a filter and return only results within the bounding box.
near (Optional[Point]): Focus search around the given point and
return results ordered by distance to the given point.
near_radius (Optional[float]): Restrict results to results within
the given distance in degrees of `near` point. Ignored, when
`near` is not set.
geometry_output (enum): Add the full geometry of the place to the result.
Multiple formats may be selected. Note that geometries can become
quite large. (Default: none)
geometry_simplification (float): Simplification factor to use on
the geometries before returning them. The factor expresses
the tolerance in degrees from which the geometry may differ.
Topology is preserved. (Default: 0.0)
address_details (bool): Add detailed information about the places
that make up the address of the requested object. (Default: False)
linked_places (bool): Add detailed information about the places
that link to the result. (Default: False)
parented_places (bool): Add detailed information about all places
for which the requested object is a parent, i.e. all places for
which the object provides the address details.
Only POI places can have parents. (Default: False)
keywords (bool): Add detailed information about the search terms
used for this place.
Returns:
source_table (enum): Data source of the place. See below for possible values.
category (tuple): A tuple of two strings with the primary OSM tag
and value.
centroid (Point): Point position of the place.
place_id (Optional[int]): Internal ID of the place. This ID may differ
for the same place between different installations.
osm_object (Optional[tuple]): OSM type and ID of the place, if available.
names (Optional[dict]): Dictionary of names of the place. Keys are
usually the corresponding OSM tag keys.
address (Optional[dict]): Dictionary of address parts directly
attributed to the place. Keys are usually the corresponding
OSM tag keys with the `addr:` prefix removed.
extratags (Optional[dict]): Dictionary of additional attributes for
the place. Usually OSM tag keys and values.
housenumber (Optional[str]): House number of the place, normalised
for lookup. To get the house number in its original spelling,
use `address['housenumber']`.
postcode (Optional[str]): Computed postcode for the place. To get
directly attributed postcodes, use `address['postcode']` instead.
wikipedia (Optional[str]): Reference to a wikipedia site for the place.
The string has the format <language code>:<wikipedia title>.
rank_address (int): [Address rank](../customize/Ranking.md#address-rank).
rank_search (int): [Search rank](../customize/Ranking.md#search-rank).
importance (Optional[float]): Relative importance of the place. This is a measure
of how likely the place will be searched for.
country_code (Optional[str]): Country the feature is in as
ISO 3166-1 alpha-2 country code.
address_rows (Optional[AddressLines]): List of places that make up the
computed address. `None` when `address_details` parameter was False.
linked_rows (Optional[AddressLines]): List of places that link to the object.
`None` when `linked_places` parameter was False.
parented_rows (Optional[AddressLines]): List of direct children of the place.
`None` when `parented_places` parameter was False.
name_keywords (Optional[WordInfos]): List of search words for the name of
the place. `None` when `keywords` parameter is set to False.
address_keywords (Optional[WordInfos]): List of search words for the address of
the place. `None` when `keywords` parameter is set to False.
bbox (Bbox): Bounding box of the full geometry of the place.
If the place is a single point, then the size of the bounding
box is guessed according to the type of place.
geometry (dict): Dictionary containing the full geometry of the place
in the formats requested in the `geometry_output` parameter.
"""
return self._loop.run_until_complete(
self._async_api.search_category(categories, near_query, **params))
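
A minimal usage sketch of the synchronous wrapper above. The project directory and the query strings are placeholders, and status code 0 is assumed to be the 'OK' value reported by status():

from pathlib import Path

from nominatim_api import NominatimAPI

api = NominatimAPI(Path('/path/to/project-dir'))
try:
    if api.status().status == 0:
        # Free-text (forward) search; max_results caps the number of hits.
        for place in api.search('Birkenweg 5, Hamburg', max_results=5):
            print(place.place_id, place.centroid)
        # Reverse lookup takes (longitude, latitude) in WGS84.
        spot = api.reverse((9.99, 53.55), max_rank=30)
        if spot is not None:
            print(spot.osm_object)
finally:
    api.close()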


@@ -0,0 +1,97 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Helper functions for localizing names of results.
"""
from typing import Mapping, List, Optional
import re
class Locales:
""" Helper class for localization of names.
It takes a list of language prefixes in their order of preferred
usage.
"""
def __init__(self, langs: Optional[List[str]] = None):
self.languages = langs or []
self.name_tags: List[str] = []
# Build the list of supported tags. It is currently hard-coded.
self._add_lang_tags('name')
self._add_tags('name', 'brand')
self._add_lang_tags('official_name', 'short_name')
self._add_tags('official_name', 'short_name', 'ref')
def __bool__(self) -> bool:
return len(self.languages) > 0
def _add_tags(self, *tags: str) -> None:
for tag in tags:
self.name_tags.append(tag)
self.name_tags.append(f"_place_{tag}")
def _add_lang_tags(self, *tags: str) -> None:
for tag in tags:
for lang in self.languages:
self.name_tags.append(f"{tag}:{lang}")
self.name_tags.append(f"_place_{tag}:{lang}")
def display_name(self, names: Optional[Mapping[str, str]]) -> str:
""" Return the best matching name from a dictionary of names
containing different name variants.
If 'names' is None or empty, an empty string is returned. If no
appropriate localization is found, the first name is returned.
"""
if not names:
return ''
if len(names) > 1:
for tag in self.name_tags:
if tag in names:
return names[tag]
# Nothing? Return any of the other names as a default.
return next(iter(names.values()))
@staticmethod
def from_accept_languages(langstr: str) -> 'Locales':
""" Create a localization object from a language list in the
format of HTTP accept-languages header.
The functions tries to be forgiving of format errors by first splitting
the string into comma-separated parts and then parsing each
description separately. Badly formatted parts are then ignored.
"""
# split string into languages
candidates = []
for desc in langstr.split(','):
m = re.fullmatch(r'\s*([a-z_-]+)(?:;\s*q\s*=\s*([01](?:\.\d+)?))?\s*',
desc, flags=re.I)
if m:
candidates.append((m[1], float(m[2] or 1.0)))
# sort the results by the weight of each language (preserving order).
candidates.sort(reverse=True, key=lambda e: e[1])
# If a language has a region variant, also add the language without
# variant but only if it isn't already in the list to not mess up the weight.
languages = []
for lid, _ in candidates:
languages.append(lid)
parts = lid.split('-', 1)
if len(parts) > 1 and all(c[0] != parts[0] for c in candidates):
languages.append(parts[0])
return Locales(languages)
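
A small sketch of the parsing and lookup logic above, using an Accept-Language style string; the names dictionary mimics the 'names' attribute of a result:

locales = Locales.from_accept_languages('de;q=0.8, en-GB')
# Sorted by weight this yields ['en-GB', 'en', 'de'] ('en' is added as a
# fallback for the regional variant).
names = {'name': 'München', 'name:en': 'Munich', 'name:de': 'München'}
assert locales.display_name(names) == 'Munich'   # 'name:en' beats plain 'name'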


@@ -0,0 +1,433 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Functions for specialised logging with HTML output.
"""
from typing import Any, Iterator, Optional, List, Tuple, cast, Union, Mapping, Sequence
from contextvars import ContextVar
import datetime as dt
import textwrap
import io
import re
import html
import sqlalchemy as sa
from sqlalchemy.ext.asyncio import AsyncConnection
try:
from pygments import highlight
from pygments.lexers import PythonLexer, PostgresLexer
from pygments.formatters import HtmlFormatter
CODE_HIGHLIGHT = True
except ModuleNotFoundError:
CODE_HIGHLIGHT = False
def _debug_name(res: Any) -> str:
if res.names:
return cast(str, res.names.get('name', next(iter(res.names.values()))))
return f"Hnr {res.housenumber}" if res.housenumber is not None else '[NONE]'
class BaseLogger:
""" Interface for logging function.
The base implementation does nothing. Overwrite the functions
in derived classes which implement logging functionality.
"""
def get_buffer(self) -> str:
""" Return the current content of the log buffer.
"""
return ''
def function(self, func: str, **kwargs: Any) -> None:
""" Start a new debug chapter for the given function and its parameters.
"""
def section(self, heading: str) -> None:
""" Start a new section with the given title.
"""
def comment(self, text: str) -> None:
""" Add a simple comment to the debug output.
"""
def var_dump(self, heading: str, var: Any) -> None:
""" Print the content of the variable to the debug output prefixed by
the given heading.
"""
def table_dump(self, heading: str, rows: Iterator[Optional[List[Any]]]) -> None:
""" Print the table generated by the generator function.
"""
def result_dump(self, heading: str, results: Iterator[Tuple[Any, Any]]) -> None:
""" Print a list of search results generated by the generator function.
"""
def sql(self, conn: AsyncConnection, statement: 'sa.Executable',
params: Union[Mapping[str, Any], Sequence[Mapping[str, Any]], None]) -> None:
""" Print the SQL for the given statement.
"""
def format_sql(self, conn: AsyncConnection, statement: 'sa.Executable',
extra_params: Union[Mapping[str, Any],
Sequence[Mapping[str, Any]], None]) -> str:
""" Return the compiled version of the statement.
"""
compiled = cast('sa.ClauseElement', statement).compile(conn.sync_engine)
params = dict(compiled.params)
if isinstance(extra_params, Mapping):
for k, v in extra_params.items():
if hasattr(v, 'to_wkt'):
params[k] = v.to_wkt()
elif isinstance(v, (int, float)):
params[k] = v
else:
params[k] = str(v)
elif isinstance(extra_params, Sequence) and extra_params:
for k in extra_params[0]:
params[k] = f':{k}'
sqlstr = str(compiled)
if conn.dialect.name == 'postgresql':
if sa.__version__.startswith('1'):
try:
sqlstr = re.sub(r'__\[POSTCOMPILE_[^]]*\]', '%s', sqlstr)
return sqlstr % tuple((repr(params.get(name, None))
for name in compiled.positiontup)) # type: ignore
except TypeError:
return sqlstr
# Fixes an odd issue with Python 3.7 where percent signs are not
# quoted correctly.
sqlstr = re.sub(r'%(?!\()', '%%', sqlstr)
sqlstr = re.sub(r'__\[POSTCOMPILE_([^]]*)\]', r'%(\1)s', sqlstr)
return sqlstr % params
assert conn.dialect.name == 'sqlite'
# params in positional order
pparams = (repr(params.get(name, None)) for name in compiled.positiontup) # type: ignore
sqlstr = re.sub(r'__\[POSTCOMPILE_([^]]*)\]', '?', sqlstr)
sqlstr = re.sub(r"\?", lambda m: next(pparams), sqlstr)
return sqlstr
class HTMLLogger(BaseLogger):
""" Logger that formats messages in HTML.
"""
def __init__(self) -> None:
self.buffer = io.StringIO()
def _timestamp(self) -> None:
self._write(f'<p class="timestamp">[{dt.datetime.now()}]</p>')
def get_buffer(self) -> str:
return HTML_HEADER + self.buffer.getvalue() + HTML_FOOTER
def function(self, func: str, **kwargs: Any) -> None:
self._timestamp()
self._write(f"<h1>Debug output for {func}()</h1>\n<p>Parameters:<dl>")
for name, value in kwargs.items():
self._write(f'<dt>{name}</dt><dd>{self._python_var(value)}</dd>')
self._write('</dl></p>')
def section(self, heading: str) -> None:
self._timestamp()
self._write(f"<h2>{heading}</h2>")
def comment(self, text: str) -> None:
self._timestamp()
self._write(f"<p>{text}</p>")
def var_dump(self, heading: str, var: Any) -> None:
self._timestamp()
if callable(var):
var = var()
self._write(f'<h5>{heading}</h5>{self._python_var(var)}')
def table_dump(self, heading: str, rows: Iterator[Optional[List[Any]]]) -> None:
self._timestamp()
head = next(rows)
assert head
self._write(f'<table><thead><tr><th colspan="{len(head)}">{heading}</th></tr><tr>')
for cell in head:
self._write(f'<th>{cell}</th>')
self._write('</tr></thead><tbody>')
for row in rows:
if row is not None:
self._write('<tr>')
for cell in row:
self._write(f'<td>{cell}</td>')
self._write('</tr>')
self._write('</tbody></table>')
def result_dump(self, heading: str, results: Iterator[Tuple[Any, Any]]) -> None:
""" Print a list of search results generated by the generator function.
"""
self._timestamp()
def format_osm(osm_object: Optional[Tuple[str, int]]) -> str:
if not osm_object:
return '-'
t, i = osm_object
if t == 'N':
fullt = 'node'
elif t == 'W':
fullt = 'way'
elif t == 'R':
fullt = 'relation'
else:
return f'{t}{i}'
return f'<a href="https://www.openstreetmap.org/{fullt}/{i}">{t}{i}</a>'
self._write(f'<h5>{heading}</h5><p><dl>')
total = 0
for rank, res in results:
self._write(f'<dt>[{rank:.3f}]</dt> <dd>{res.source_table.name}(')
self._write(f"{_debug_name(res)}, type=({','.join(res.category)}), ")
self._write(f"rank={res.rank_address}, ")
self._write(f"osm={format_osm(res.osm_object)}, ")
self._write(f'cc={res.country_code}, ')
self._write(f'importance={res.importance or float("nan"):.5f})</dd>')
total += 1
self._write(f'</dl><b>TOTAL:</b> {total}</p>')
def sql(self, conn: AsyncConnection, statement: 'sa.Executable',
params: Union[Mapping[str, Any], Sequence[Mapping[str, Any]], None]) -> None:
self._timestamp()
sqlstr = self.format_sql(conn, statement, params)
if CODE_HIGHLIGHT:
sqlstr = highlight(sqlstr, PostgresLexer(),
HtmlFormatter(nowrap=True, lineseparator='<br />'))
self._write(f'<div class="highlight"><code class="lang-sql">{sqlstr}</code></div>')
else:
self._write(f'<code class="lang-sql">{html.escape(sqlstr)}</code>')
def _python_var(self, var: Any) -> str:
if CODE_HIGHLIGHT:
fmt = highlight(str(var), PythonLexer(), HtmlFormatter(nowrap=True))
return f'<div class="highlight"><code class="lang-python">{fmt}</code></div>'
return f'<code class="lang-python">{html.escape(str(var))}</code>'
def _write(self, text: str) -> None:
""" Add the raw text to the debug output.
"""
self.buffer.write(text)
class TextLogger(BaseLogger):
""" Logger creating output suitable for the console.
"""
def __init__(self) -> None:
self.buffer = io.StringIO()
def _timestamp(self) -> None:
self._write(f'[{dt.datetime.now()}]\n')
def get_buffer(self) -> str:
return self.buffer.getvalue()
def function(self, func: str, **kwargs: Any) -> None:
self._write(f"#### Debug output for {func}()\n\nParameters:\n")
for name, value in kwargs.items():
self._write(f' {name}: {self._python_var(value)}\n')
self._write('\n')
def section(self, heading: str) -> None:
self._timestamp()
self._write(f"\n# {heading}\n\n")
def comment(self, text: str) -> None:
self._write(f"{text}\n")
def var_dump(self, heading: str, var: Any) -> None:
if callable(var):
var = var()
self._write(f'{heading}:\n {self._python_var(var)}\n\n')
def table_dump(self, heading: str, rows: Iterator[Optional[List[Any]]]) -> None:
self._write(f'{heading}:\n')
data = [list(map(self._python_var, row)) if row else None for row in rows]
assert data[0] is not None
num_cols = len(data[0])
maxlens = [max(len(d[i]) for d in data if d) for i in range(num_cols)]
tablewidth = sum(maxlens) + 3 * num_cols + 1
row_format = '| ' +' | '.join(f'{{:<{l}}}' for l in maxlens) + ' |\n'
self._write('-'*tablewidth + '\n')
self._write(row_format.format(*data[0]))
self._write('-'*tablewidth + '\n')
for row in data[1:]:
if row:
self._write(row_format.format(*row))
else:
self._write('-'*tablewidth + '\n')
if data[-1]:
self._write('-'*tablewidth + '\n')
def result_dump(self, heading: str, results: Iterator[Tuple[Any, Any]]) -> None:
self._timestamp()
self._write(f'{heading}:\n')
total = 0
for rank, res in results:
self._write(f'[{rank:.3f}] {res.source_table.name}(')
self._write(f"{_debug_name(res)}, type=({','.join(res.category)}), ")
self._write(f"rank={res.rank_address}, ")
self._write(f"osm={''.join(map(str, res.osm_object or []))}, ")
self._write(f'cc={res.country_code}, ')
self._write(f'importance={res.importance or -1:.5f})\n')
total += 1
self._write(f'TOTAL: {total}\n\n')
def sql(self, conn: AsyncConnection, statement: 'sa.Executable',
params: Union[Mapping[str, Any], Sequence[Mapping[str, Any]], None]) -> None:
self._timestamp()
sqlstr = '\n| '.join(textwrap.wrap(self.format_sql(conn, statement, params), width=78))
self._write(f"| {sqlstr}\n\n")
def _python_var(self, var: Any) -> str:
return str(var)
def _write(self, text: str) -> None:
self.buffer.write(text)
logger: ContextVar[BaseLogger] = ContextVar('logger', default=BaseLogger())
def set_log_output(fmt: str) -> None:
""" Enable collecting debug information.
"""
if fmt == 'html':
logger.set(HTMLLogger())
elif fmt == 'text':
logger.set(TextLogger())
else:
logger.set(BaseLogger())
def log() -> BaseLogger:
""" Return the logger for the current context.
"""
return logger.get()
def get_and_disable() -> str:
""" Return the current content of the debug buffer and disable logging.
"""
buf = logger.get().get_buffer()
logger.set(BaseLogger())
return buf
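# A minimal usage sketch of this module's public helpers (illustrative only;
# the calling context below is an assumption, not part of this file):
#
#   set_log_output('text')            # start collecting plain-text debug output
#   log().section('Example step')     # code under test logs via log()
#   debug_text = get_and_disable()    # fetch the buffer and reset to the no-op logger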
HTML_HEADER: str = """<!DOCTYPE html>
<html>
<head>
<title>Nominatim - Debug</title>
<style>
""" + \
(HtmlFormatter(nobackground=True).get_style_defs('.highlight') if CODE_HIGHLIGHT else '') +\
"""
h2 { font-size: x-large }
dl {
padding-left: 10pt;
font-family: monospace
}
dt {
float: left;
font-weight: bold;
margin-right: 0.5em
}
dt::after { content: ": "; }
dd::after {
clear: left;
display: block
}
.lang-sql {
color: #555;
font-size: small
}
h5 {
border: solid lightgrey 0.1pt;
margin-bottom: 0;
background-color: #f7f7f7
}
h5 + .highlight {
padding: 3pt;
border: solid lightgrey 0.1pt
}
table, th, tbody {
border: thin solid;
border-collapse: collapse;
}
td {
border-right: thin solid;
padding-left: 3pt;
padding-right: 3pt;
}
.timestamp {
font-size: 0.8em;
color: darkblue;
width: calc(100% - 5pt);
text-align: right;
position: absolute;
left: 0;
margin-top: -5px;
}
</style>
</head>
<body>
"""
HTML_FOOTER: str = "</body></html>"

src/nominatim_api/lookup.py
@@ -0,0 +1,250 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Implementation of place lookup by ID.
"""
from typing import Optional, Callable, Tuple, Type
import datetime as dt
import sqlalchemy as sa
from nominatim_core.typing import SaColumn, SaRow, SaSelect
from .connection import SearchConnection
from .logging import log
from . import types as ntyp
from . import results as nres
RowFunc = Callable[[Optional[SaRow], Type[nres.BaseResultT]], Optional[nres.BaseResultT]]
GeomFunc = Callable[[SaSelect, SaColumn], SaSelect]
async def find_in_placex(conn: SearchConnection, place: ntyp.PlaceRef,
add_geometries: GeomFunc) -> Optional[SaRow]:
""" Search for the given place in the placex table and return the
base information.
"""
log().section("Find in placex table")
t = conn.t.placex
sql = sa.select(t.c.place_id, t.c.osm_type, t.c.osm_id, t.c.name,
t.c.class_, t.c.type, t.c.admin_level,
t.c.address, t.c.extratags,
t.c.housenumber, t.c.postcode, t.c.country_code,
t.c.importance, t.c.wikipedia, t.c.indexed_date,
t.c.parent_place_id, t.c.rank_address, t.c.rank_search,
t.c.linked_place_id,
t.c.geometry.ST_Expand(0).label('bbox'),
t.c.centroid)
if isinstance(place, ntyp.PlaceID):
sql = sql.where(t.c.place_id == place.place_id)
elif isinstance(place, ntyp.OsmID):
sql = sql.where(t.c.osm_type == place.osm_type)\
.where(t.c.osm_id == place.osm_id)
if place.osm_class:
sql = sql.where(t.c.class_ == place.osm_class)
else:
sql = sql.order_by(t.c.class_)
sql = sql.limit(1)
else:
return None
return (await conn.execute(add_geometries(sql, t.c.geometry))).one_or_none()
async def find_in_osmline(conn: SearchConnection, place: ntyp.PlaceRef,
add_geometries: GeomFunc) -> Optional[SaRow]:
""" Search for the given place in the osmline table and return the
base information.
"""
log().section("Find in interpolation table")
t = conn.t.osmline
sql = sa.select(t.c.place_id, t.c.osm_id, t.c.parent_place_id,
t.c.indexed_date, t.c.startnumber, t.c.endnumber,
t.c.step, t.c.address, t.c.postcode, t.c.country_code,
t.c.linegeo.ST_Centroid().label('centroid'))
if isinstance(place, ntyp.PlaceID):
sql = sql.where(t.c.place_id == place.place_id)
elif isinstance(place, ntyp.OsmID) and place.osm_type == 'W':
# There may be multiple interpolations for a single way.
# If 'class' contains a number, return the one that belongs to that number.
sql = sql.where(t.c.osm_id == place.osm_id).limit(1)
if place.osm_class and place.osm_class.isdigit():
sql = sql.order_by(sa.func.greatest(0,
int(place.osm_class) - t.c.endnumber,
t.c.startnumber - int(place.osm_class)))
else:
return None
return (await conn.execute(add_geometries(sql, t.c.linegeo))).one_or_none()
async def find_in_tiger(conn: SearchConnection, place: ntyp.PlaceRef,
add_geometries: GeomFunc) -> Optional[SaRow]:
""" Search for the given place in the table of Tiger addresses and return
the base information. Only lookup by place ID is supported.
"""
if not isinstance(place, ntyp.PlaceID):
return None
log().section("Find in TIGER table")
t = conn.t.tiger
parent = conn.t.placex
sql = sa.select(t.c.place_id, t.c.parent_place_id,
parent.c.osm_type, parent.c.osm_id,
t.c.startnumber, t.c.endnumber, t.c.step,
t.c.postcode,
t.c.linegeo.ST_Centroid().label('centroid'))\
.where(t.c.place_id == place.place_id)\
.join(parent, t.c.parent_place_id == parent.c.place_id, isouter=True)
return (await conn.execute(add_geometries(sql, t.c.linegeo))).one_or_none()
async def find_in_postcode(conn: SearchConnection, place: ntyp.PlaceRef,
add_geometries: GeomFunc) -> Optional[SaRow]:
""" Search for the given place in the postcode table and return the
base information. Only lookup by place ID is supported.
"""
if not isinstance(place, ntyp.PlaceID):
return None
log().section("Find in postcode table")
t = conn.t.postcode
sql = sa.select(t.c.place_id, t.c.parent_place_id,
t.c.rank_search, t.c.rank_address,
t.c.indexed_date, t.c.postcode, t.c.country_code,
t.c.geometry.label('centroid')) \
.where(t.c.place_id == place.place_id)
return (await conn.execute(add_geometries(sql, t.c.geometry))).one_or_none()
async def find_in_all_tables(conn: SearchConnection, place: ntyp.PlaceRef,
add_geometries: GeomFunc
) -> Tuple[Optional[SaRow], RowFunc[nres.BaseResultT]]:
""" Search for the given place in all data tables
and return the base information.
"""
row = await find_in_placex(conn, place, add_geometries)
log().var_dump('Result (placex)', row)
if row is not None:
return row, nres.create_from_placex_row
row = await find_in_osmline(conn, place, add_geometries)
log().var_dump('Result (osmline)', row)
if row is not None:
return row, nres.create_from_osmline_row
row = await find_in_postcode(conn, place, add_geometries)
log().var_dump('Result (postcode)', row)
if row is not None:
return row, nres.create_from_postcode_row
row = await find_in_tiger(conn, place, add_geometries)
log().var_dump('Result (tiger)', row)
return row, nres.create_from_tiger_row
async def get_detailed_place(conn: SearchConnection, place: ntyp.PlaceRef,
details: ntyp.LookupDetails) -> Optional[nres.DetailedResult]:
""" Retrieve a place with additional details from the database.
"""
log().function('get_detailed_place', place=place, details=details)
if details.geometry_output and details.geometry_output != ntyp.GeometryFormat.GEOJSON:
raise ValueError("lookup only supports geojosn polygon output.")
if details.geometry_output & ntyp.GeometryFormat.GEOJSON:
def _add_geometry(sql: SaSelect, column: SaColumn) -> SaSelect:
return sql.add_columns(sa.func.ST_AsGeoJSON(
sa.case((sa.func.ST_NPoints(column) > 5000,
sa.func.ST_SimplifyPreserveTopology(column, 0.0001)),
else_=column), 7).label('geometry_geojson'))
else:
def _add_geometry(sql: SaSelect, column: SaColumn) -> SaSelect:
return sql.add_columns(sa.func.ST_GeometryType(column).label('geometry_type'))
row_func: RowFunc[nres.DetailedResult]
row, row_func = await find_in_all_tables(conn, place, _add_geometry)
if row is None:
return None
result = row_func(row, nres.DetailedResult)
    assert result is not None
    # add missing details
if 'type' in result.geometry:
result.geometry['type'] = GEOMETRY_TYPE_MAP.get(result.geometry['type'],
result.geometry['type'])
indexed_date = getattr(row, 'indexed_date', None)
if indexed_date is not None:
result.indexed_date = indexed_date.replace(tzinfo=dt.timezone.utc)
await nres.add_result_details(conn, [result], details)
return result
async def get_simple_place(conn: SearchConnection, place: ntyp.PlaceRef,
details: ntyp.LookupDetails) -> Optional[nres.SearchResult]:
""" Retrieve a place as a simple search result from the database.
"""
log().function('get_simple_place', place=place, details=details)
def _add_geometry(sql: SaSelect, col: SaColumn) -> SaSelect:
if not details.geometry_output:
return sql
out = []
if details.geometry_simplification > 0.0:
col = sa.func.ST_SimplifyPreserveTopology(col, details.geometry_simplification)
if details.geometry_output & ntyp.GeometryFormat.GEOJSON:
out.append(sa.func.ST_AsGeoJSON(col, 7).label('geometry_geojson'))
if details.geometry_output & ntyp.GeometryFormat.TEXT:
out.append(sa.func.ST_AsText(col).label('geometry_text'))
if details.geometry_output & ntyp.GeometryFormat.KML:
out.append(sa.func.ST_AsKML(col, 7).label('geometry_kml'))
if details.geometry_output & ntyp.GeometryFormat.SVG:
out.append(sa.func.ST_AsSVG(col, 0, 7).label('geometry_svg'))
return sql.add_columns(*out)
row_func: RowFunc[nres.SearchResult]
row, row_func = await find_in_all_tables(conn, place, _add_geometry)
if row is None:
return None
result = row_func(row, nres.SearchResult)
    assert result is not None
    # add missing details
if hasattr(row, 'bbox'):
result.bbox = ntyp.Bbox.from_wkb(row.bbox)
await nres.add_result_details(conn, [result], details)
return result
GEOMETRY_TYPE_MAP = {
'POINT': 'ST_Point',
'MULTIPOINT': 'ST_MultiPoint',
'LINESTRING': 'ST_LineString',
'MULTILINESTRING': 'ST_MultiLineString',
'POLYGON': 'ST_Polygon',
'MULTIPOLYGON': 'ST_MultiPolygon',
'GEOMETRYCOLLECTION': 'ST_GeometryCollection'
}
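# Usage sketch for the lookup functions above (illustrative only; the
# connection handling and the chosen OSM object are assumptions):
#
#   place = ntyp.OsmID('N', 240109189)
#   details = ntyp.LookupDetails(address_details=True)
#   result = await get_detailed_place(conn, place, details)  # conn: SearchConnection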

@@ -0,0 +1,56 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Helper classes and functions for formatting results into API responses.
"""
from typing import Type, TypeVar, Dict, List, Callable, Any, Mapping
from collections import defaultdict
T = TypeVar('T') # pylint: disable=invalid-name
FormatFunc = Callable[[T, Mapping[str, Any]], str]
class FormatDispatcher:
""" Helper class to conveniently create formatting functions in
a module using decorators.
"""
def __init__(self) -> None:
self.format_functions: Dict[Type[Any], Dict[str, FormatFunc[Any]]] = defaultdict(dict)
def format_func(self, result_class: Type[T],
fmt: str) -> Callable[[FormatFunc[T]], FormatFunc[T]]:
""" Decorator for a function that formats a given type of result into the
selected format.
"""
def decorator(func: FormatFunc[T]) -> FormatFunc[T]:
self.format_functions[result_class][fmt] = func
return func
return decorator
def list_formats(self, result_type: Type[Any]) -> List[str]:
""" Return a list of formats supported by this formatter.
"""
return list(self.format_functions[result_type].keys())
def supports_format(self, result_type: Type[Any], fmt: str) -> bool:
""" Check if the given format is supported by this formatter.
"""
return fmt in self.format_functions[result_type]
def format_result(self, result: Any, fmt: str, options: Mapping[str, Any]) -> str:
""" Convert the given result into a string using the given format.
The format is expected to be in the list returned by
`list_formats()`.
"""
return self.format_functions[type(result)][fmt](result, options)
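# Usage sketch (illustrative; the result class and formatting logic below are
# made-up examples, not part of this module):
#
#   dispatcher = FormatDispatcher()
#
#   @dispatcher.format_func(dict, 'json')
#   def _format_dict_json(result, options):
#       return json.dumps(result)
#
#   dispatcher.supports_format(dict, 'json')        # -> True
#   dispatcher.format_result({'x': 1}, 'json', {})  # -> '{"x": 1}'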

@@ -0,0 +1,752 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Dataclasses for search results and helper functions to fill them.
Data classes are part of the public API while the functions are for
internal use only. That's why they are implemented as free-standing functions
instead of member functions.
"""
from typing import Optional, Tuple, Dict, Sequence, TypeVar, Type, List, cast, Callable
import enum
import dataclasses
import datetime as dt
import sqlalchemy as sa
from nominatim_core.typing import SaSelect, SaRow
from nominatim_core.db.sqlalchemy_types import Geometry
from .types import Point, Bbox, LookupDetails
from .connection import SearchConnection
from .logging import log
from .localization import Locales
# This file defines complex result data classes.
# pylint: disable=too-many-instance-attributes
def _mingle_name_tags(names: Optional[Dict[str, str]]) -> Optional[Dict[str, str]]:
""" Mix-in names from linked places, so that they show up
as standard names where necessary.
"""
if not names:
return None
out = {}
for k, v in names.items():
if k.startswith('_place_'):
outkey = k[7:]
out[k if outkey in names else outkey] = v
else:
out[k] = v
return out
class SourceTable(enum.Enum):
""" The `SourceTable` type lists the possible sources a result can have.
"""
PLACEX = 1
""" The placex table is the main source for result usually containing
OSM data.
"""
OSMLINE = 2
""" The osmline table contains address interpolations from OSM data.
Interpolation addresses are always approximate. The OSM id in the
result refers to the OSM way with the interpolation line object.
"""
TIGER = 3
""" TIGER address data contains US addresses imported on the side,
see [Installing TIGER data](../customize/Tiger.md).
        TIGER addresses are also interpolations. The addresses always refer
to a street from OSM data. The OSM id in the result refers to
that street.
"""
POSTCODE = 4
""" The postcode table contains artificial centroids for postcodes,
computed from the postcodes available with address points. Results
are always approximate.
"""
COUNTRY = 5
""" The country table provides a fallback, when country data is missing
in the OSM data.
"""
@dataclasses.dataclass
class AddressLine:
""" The `AddressLine` may contain the following fields about a related place
and its function as an address object. Most fields are optional.
Their presence depends on the kind and function of the address part.
"""
category: Tuple[str, str]
""" Main category of the place, described by a key-value pair.
"""
names: Dict[str, str]
""" All available names for the place including references, alternative
names and translations.
"""
fromarea: bool
""" If true, then the exact area of the place is known. Without area
information, Nominatim has to make an educated guess if an address
belongs to one place or another.
"""
isaddress: bool
""" If true, this place should be considered for the final address display.
Nominatim will sometimes include more than one candidate for
the address in the list when it cannot reliably determine where the
place belongs. It will consider names of all candidates when searching
but when displaying the result, only the most likely candidate should
be shown.
"""
rank_address: int
""" [Address rank](../customize/Ranking.md#address-rank) of the place.
"""
distance: float
""" Distance in degrees between the result place and this address part.
"""
place_id: Optional[int] = None
""" Internal ID of the place.
"""
osm_object: Optional[Tuple[str, int]] = None
""" OSM type and ID of the place, if such an object exists.
"""
extratags: Optional[Dict[str, str]] = None
""" Any extra information available about the place. This is a dictionary
that usually contains OSM tag key-value pairs.
"""
admin_level: Optional[int] = None
""" The administrative level of a boundary as tagged in the input data.
This field is only meaningful for places of the category
(boundary, administrative).
"""
local_name: Optional[str] = None
""" Place holder for localization of this address part. See
[Localization](#localization) below.
"""
class AddressLines(List[AddressLine]):
""" Sequence of address lines order in descending order by their rank.
"""
def localize(self, locales: Locales) -> List[str]:
""" Set the local name of address parts according to the chosen
locale. Return the list of local names without duplicates.
Only address parts that are marked as isaddress are localized
and returned.
"""
label_parts: List[str] = []
for line in self:
if line.isaddress and line.names:
line.local_name = locales.display_name(line.names)
if not label_parts or label_parts[-1] != line.local_name:
label_parts.append(line.local_name)
return label_parts
@dataclasses.dataclass
class WordInfo:
""" Each entry in the list of search terms contains the
following detailed information.
"""
word_id: int
""" Internal identifier for the word.
"""
word_token: str
""" Normalised and transliterated form of the word.
This form is used for searching.
"""
word: Optional[str] = None
""" Untransliterated form, if available.
"""
WordInfos = Sequence[WordInfo]
@dataclasses.dataclass
class BaseResult:
""" Data class collecting information common to all
types of search results.
"""
source_table: SourceTable
category: Tuple[str, str]
centroid: Point
place_id : Optional[int] = None
osm_object: Optional[Tuple[str, int]] = None
parent_place_id: Optional[int] = None
linked_place_id: Optional[int] = None
admin_level: int = 15
locale_name: Optional[str] = None
display_name: Optional[str] = None
names: Optional[Dict[str, str]] = None
address: Optional[Dict[str, str]] = None
extratags: Optional[Dict[str, str]] = None
housenumber: Optional[str] = None
postcode: Optional[str] = None
wikipedia: Optional[str] = None
rank_address: int = 30
rank_search: int = 30
importance: Optional[float] = None
country_code: Optional[str] = None
address_rows: Optional[AddressLines] = None
linked_rows: Optional[AddressLines] = None
parented_rows: Optional[AddressLines] = None
name_keywords: Optional[WordInfos] = None
address_keywords: Optional[WordInfos] = None
geometry: Dict[str, str] = dataclasses.field(default_factory=dict)
@property
def lat(self) -> float:
""" Get the latitude (or y) of the center point of the place.
"""
return self.centroid[1]
@property
def lon(self) -> float:
""" Get the longitude (or x) of the center point of the place.
"""
return self.centroid[0]
def calculated_importance(self) -> float:
""" Get a valid importance value. This is either the stored importance
of the value or an artificial value computed from the place's
search rank.
"""
return self.importance or (0.40001 - (self.rank_search/75.0))
def localize(self, locales: Locales) -> None:
""" Fill the locale_name and the display_name field for the
place and, if available, its address information.
"""
self.locale_name = locales.display_name(self.names)
if self.address_rows:
self.display_name = ', '.join(self.address_rows.localize(locales))
else:
self.display_name = self.locale_name
BaseResultT = TypeVar('BaseResultT', bound=BaseResult)
@dataclasses.dataclass
class DetailedResult(BaseResult):
""" A search result with more internal information from the database
added.
"""
indexed_date: Optional[dt.datetime] = None
@dataclasses.dataclass
class ReverseResult(BaseResult):
""" A search result for reverse geocoding.
"""
distance: Optional[float] = None
bbox: Optional[Bbox] = None
class ReverseResults(List[ReverseResult]):
""" Sequence of reverse lookup results ordered by distance.
May be empty when no result was found.
"""
@dataclasses.dataclass
class SearchResult(BaseResult):
""" A search result for forward geocoding.
"""
bbox: Optional[Bbox] = None
accuracy: float = 0.0
@property
def ranking(self) -> float:
""" Return the ranking, a combined measure of accuracy and importance.
"""
return (self.accuracy if self.accuracy is not None else 1) \
- self.calculated_importance()
class SearchResults(List[SearchResult]):
""" Sequence of forward lookup results ordered by relevance.
May be empty when no result was found.
"""
def _filter_geometries(row: SaRow) -> Dict[str, str]:
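    # For illustration: a row column labelled 'geometry_geojson' ends up
    # under the key 'geojson' in the returned dictionary.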
return {k[9:]: v for k, v in row._mapping.items() # pylint: disable=W0212
if k.startswith('geometry_')}
def create_from_placex_row(row: Optional[SaRow],
class_type: Type[BaseResultT]) -> Optional[BaseResultT]:
""" Construct a new result and add the data from the result row
from the placex table. 'class_type' defines the type of result
to return. Returns None if the row is None.
"""
if row is None:
return None
return class_type(source_table=SourceTable.PLACEX,
place_id=row.place_id,
osm_object=(row.osm_type, row.osm_id),
category=(row.class_, row.type),
parent_place_id = row.parent_place_id,
linked_place_id = getattr(row, 'linked_place_id', None),
admin_level = getattr(row, 'admin_level', 15),
names=_mingle_name_tags(row.name),
address=row.address,
extratags=row.extratags,
housenumber=row.housenumber,
postcode=row.postcode,
wikipedia=row.wikipedia,
rank_address=row.rank_address,
rank_search=row.rank_search,
importance=row.importance,
country_code=row.country_code,
centroid=Point.from_wkb(row.centroid),
geometry=_filter_geometries(row))
def create_from_osmline_row(row: Optional[SaRow],
class_type: Type[BaseResultT]) -> Optional[BaseResultT]:
""" Construct a new result and add the data from the result row
from the address interpolation table osmline. 'class_type' defines
the type of result to return. Returns None if the row is None.
If the row contains a housenumber, then the housenumber is filled out.
Otherwise the result contains the interpolation information in extratags.
"""
if row is None:
return None
hnr = getattr(row, 'housenumber', None)
res = class_type(source_table=SourceTable.OSMLINE,
place_id=row.place_id,
parent_place_id = row.parent_place_id,
osm_object=('W', row.osm_id),
category=('place', 'houses' if hnr is None else 'house'),
address=row.address,
postcode=row.postcode,
country_code=row.country_code,
centroid=Point.from_wkb(row.centroid),
geometry=_filter_geometries(row))
if hnr is None:
res.extratags = {'startnumber': str(row.startnumber),
'endnumber': str(row.endnumber),
'step': str(row.step)}
else:
res.housenumber = str(hnr)
return res
def create_from_tiger_row(row: Optional[SaRow],
class_type: Type[BaseResultT],
osm_type: Optional[str] = None,
osm_id: Optional[int] = None) -> Optional[BaseResultT]:
""" Construct a new result and add the data from the result row
from the Tiger data interpolation table. 'class_type' defines
the type of result to return. Returns None if the row is None.
If the row contains a housenumber, then the housenumber is filled out.
Otherwise the result contains the interpolation information in extratags.
"""
if row is None:
return None
hnr = getattr(row, 'housenumber', None)
res = class_type(source_table=SourceTable.TIGER,
place_id=row.place_id,
parent_place_id = row.parent_place_id,
osm_object=(osm_type or row.osm_type, osm_id or row.osm_id),
category=('place', 'houses' if hnr is None else 'house'),
postcode=row.postcode,
country_code='us',
centroid=Point.from_wkb(row.centroid),
geometry=_filter_geometries(row))
if hnr is None:
res.extratags = {'startnumber': str(row.startnumber),
'endnumber': str(row.endnumber),
'step': str(row.step)}
else:
res.housenumber = str(hnr)
return res
def create_from_postcode_row(row: Optional[SaRow],
class_type: Type[BaseResultT]) -> Optional[BaseResultT]:
""" Construct a new result and add the data from the result row
from the postcode table. 'class_type' defines
the type of result to return. Returns None if the row is None.
"""
if row is None:
return None
return class_type(source_table=SourceTable.POSTCODE,
place_id=row.place_id,
parent_place_id = row.parent_place_id,
category=('place', 'postcode'),
names={'ref': row.postcode},
rank_search=row.rank_search,
rank_address=row.rank_address,
country_code=row.country_code,
centroid=Point.from_wkb(row.centroid),
geometry=_filter_geometries(row))
def create_from_country_row(row: Optional[SaRow],
class_type: Type[BaseResultT]) -> Optional[BaseResultT]:
""" Construct a new result and add the data from the result row
from the fallback country tables. 'class_type' defines
the type of result to return. Returns None if the row is None.
"""
if row is None:
return None
return class_type(source_table=SourceTable.COUNTRY,
category=('place', 'country'),
centroid=Point.from_wkb(row.centroid),
names=row.name,
rank_address=4, rank_search=4,
country_code=row.country_code,
geometry=_filter_geometries(row))
async def add_result_details(conn: SearchConnection, results: List[BaseResultT],
details: LookupDetails) -> None:
""" Retrieve more details from the database according to the
parameters specified in 'details'.
"""
if results:
log().section('Query details for result')
if details.address_details:
log().comment('Query address details')
await complete_address_details(conn, results)
if details.linked_places:
log().comment('Query linked places')
for result in results:
await complete_linked_places(conn, result)
if details.parented_places:
log().comment('Query parent places')
for result in results:
await complete_parented_places(conn, result)
if details.keywords:
log().comment('Query keywords')
for result in results:
await complete_keywords(conn, result)
for result in results:
result.localize(details.locales)
def _result_row_to_address_row(row: SaRow, isaddress: Optional[bool] = None) -> AddressLine:
""" Create a new AddressLine from the results of a database query.
"""
extratags: Dict[str, str] = getattr(row, 'extratags', {}) or {}
if 'linked_place' in extratags:
extratags['place'] = extratags['linked_place']
names = _mingle_name_tags(row.name) or {}
if getattr(row, 'housenumber', None) is not None:
names['housenumber'] = row.housenumber
if isaddress is None:
isaddress = getattr(row, 'isaddress', True)
return AddressLine(place_id=row.place_id,
osm_object=None if row.osm_type is None else (row.osm_type, row.osm_id),
category=(getattr(row, 'class'), row.type),
names=names,
extratags=extratags,
admin_level=row.admin_level,
fromarea=row.fromarea,
isaddress=isaddress,
rank_address=row.rank_address,
distance=row.distance)
def _get_address_lookup_id(result: BaseResultT) -> int:
assert result.place_id
if result.source_table != SourceTable.PLACEX or result.rank_search > 27:
return result.parent_place_id or result.place_id
return result.linked_place_id or result.place_id
async def _finalize_entry(conn: SearchConnection, result: BaseResultT) -> None:
assert result.address_rows is not None
if result.category[0] not in ('boundary', 'place')\
or result.category[1] not in ('postal_code', 'postcode'):
postcode = result.postcode
if not postcode and result.address:
postcode = result.address.get('postcode')
if postcode and ',' not in postcode and ';' not in postcode:
result.address_rows.append(AddressLine(
category=('place', 'postcode'),
names={'ref': postcode},
fromarea=False, isaddress=True, rank_address=5,
distance=0.0))
if result.country_code:
async def _get_country_names() -> Optional[Dict[str, str]]:
t = conn.t.country_name
sql = sa.select(t.c.name, t.c.derived_name)\
.where(t.c.country_code == result.country_code)
for cres in await conn.execute(sql):
names = cast(Dict[str, str], cres[0])
if cres[1]:
names.update(cast(Dict[str, str], cres[1]))
return names
return None
country_names = await conn.get_cached_value('COUNTRY_NAME',
result.country_code,
_get_country_names)
if country_names:
result.address_rows.append(AddressLine(
category=('place', 'country'),
names=country_names,
fromarea=False, isaddress=True, rank_address=4,
distance=0.0))
result.address_rows.append(AddressLine(
category=('place', 'country_code'),
names={'ref': result.country_code}, extratags = {},
fromarea=True, isaddress=False, rank_address=4,
distance=0.0))
def _setup_address_details(result: BaseResultT) -> None:
""" Retrieve information about places that make up the address of the result.
"""
result.address_rows = AddressLines()
if result.names:
result.address_rows.append(AddressLine(
place_id=result.place_id,
osm_object=result.osm_object,
category=result.category,
names=result.names,
extratags=result.extratags or {},
admin_level=result.admin_level,
fromarea=True, isaddress=True,
rank_address=result.rank_address, distance=0.0))
if result.source_table == SourceTable.PLACEX and result.address:
housenumber = result.address.get('housenumber')\
or result.address.get('streetnumber')\
or result.address.get('conscriptionnumber')
elif result.housenumber:
housenumber = result.housenumber
else:
housenumber = None
if housenumber:
result.address_rows.append(AddressLine(
category=('place', 'house_number'),
names={'ref': housenumber},
fromarea=True, isaddress=True, rank_address=28, distance=0))
if result.address and '_unlisted_place' in result.address:
result.address_rows.append(AddressLine(
category=('place', 'locality'),
names={'name': result.address['_unlisted_place']},
fromarea=False, isaddress=True, rank_address=25, distance=0))
async def complete_address_details(conn: SearchConnection, results: List[BaseResultT]) -> None:
""" Retrieve information about places that make up the address of the result.
"""
for result in results:
_setup_address_details(result)
    ### Lookup entries from place_addressline
lookup_ids = [{'pid': r.place_id,
'lid': _get_address_lookup_id(r),
'names': list(r.address.values()) if r.address else [],
'c': ('SRID=4326;' + r.centroid.to_wkt()) if r.centroid else '' }
for r in results if r.place_id]
if not lookup_ids:
return
ltab = sa.func.JsonArrayEach(sa.type_coerce(lookup_ids, sa.JSON))\
.table_valued(sa.column('value', type_=sa.JSON))
t = conn.t.placex
taddr = conn.t.addressline
sql = sa.select(ltab.c.value['pid'].as_integer().label('src_place_id'),
t.c.place_id, t.c.osm_type, t.c.osm_id, t.c.name,
t.c.class_, t.c.type, t.c.extratags,
t.c.admin_level, taddr.c.fromarea,
sa.case((t.c.rank_address == 11, 5),
else_=t.c.rank_address).label('rank_address'),
taddr.c.distance, t.c.country_code, t.c.postcode)\
.join(taddr, sa.or_(taddr.c.place_id == ltab.c.value['pid'].as_integer(),
taddr.c.place_id == ltab.c.value['lid'].as_integer()))\
.join(t, taddr.c.address_place_id == t.c.place_id)\
.order_by('src_place_id')\
.order_by(sa.column('rank_address').desc())\
.order_by((taddr.c.place_id == ltab.c.value['pid'].as_integer()).desc())\
.order_by(sa.case((sa.func.CrosscheckNames(t.c.name, ltab.c.value['names']), 2),
(taddr.c.isaddress, 0),
(sa.and_(taddr.c.fromarea,
t.c.geometry.ST_Contains(
sa.func.ST_GeomFromEWKT(
ltab.c.value['c'].as_string()))), 1),
else_=-1).desc())\
.order_by(taddr.c.fromarea.desc())\
.order_by(taddr.c.distance.desc())\
.order_by(t.c.rank_search.desc())
current_result = None
current_rank_address = -1
for row in await conn.execute(sql):
if current_result is None or row.src_place_id != current_result.place_id:
current_result = next((r for r in results if r.place_id == row.src_place_id), None)
assert current_result is not None
current_rank_address = -1
location_isaddress = row.rank_address != current_rank_address
if current_result.country_code is None and row.country_code:
current_result.country_code = row.country_code
if row.type in ('postcode', 'postal_code') and location_isaddress:
if not row.fromarea or \
(current_result.address and 'postcode' in current_result.address):
location_isaddress = False
else:
current_result.postcode = None
assert current_result.address_rows is not None
current_result.address_rows.append(_result_row_to_address_row(row, location_isaddress))
current_rank_address = row.rank_address
for result in results:
await _finalize_entry(conn, result)
### Finally add the record for the parent entry where necessary.
parent_lookup_ids = list(filter(lambda e: e['pid'] != e['lid'], lookup_ids))
if parent_lookup_ids:
ltab = sa.func.JsonArrayEach(sa.type_coerce(parent_lookup_ids, sa.JSON))\
.table_valued(sa.column('value', type_=sa.JSON))
sql = sa.select(ltab.c.value['pid'].as_integer().label('src_place_id'),
t.c.place_id, t.c.osm_type, t.c.osm_id, t.c.name,
t.c.class_, t.c.type, t.c.extratags,
t.c.admin_level,
t.c.rank_address)\
.where(t.c.place_id == ltab.c.value['lid'].as_integer())
for row in await conn.execute(sql):
current_result = next((r for r in results if r.place_id == row.src_place_id), None)
assert current_result is not None
assert current_result.address_rows is not None
current_result.address_rows.append(AddressLine(
place_id=row.place_id,
osm_object=(row.osm_type, row.osm_id),
category=(row.class_, row.type),
names=row.name, extratags=row.extratags or {},
admin_level=row.admin_level,
fromarea=True, isaddress=True,
rank_address=row.rank_address, distance=0.0))
### Now sort everything
def mk_sort_key(place_id: Optional[int]) -> Callable[[AddressLine], Tuple[bool, int, bool]]:
return lambda a: (a.place_id != place_id, -a.rank_address, a.isaddress)
for result in results:
assert result.address_rows is not None
result.address_rows.sort(key=mk_sort_key(result.place_id))
def _placex_select_address_row(conn: SearchConnection,
centroid: Point) -> SaSelect:
t = conn.t.placex
return sa.select(t.c.place_id, t.c.osm_type, t.c.osm_id, t.c.name,
t.c.class_.label('class'), t.c.type,
t.c.admin_level, t.c.housenumber,
t.c.geometry.is_area().label('fromarea'),
t.c.rank_address,
t.c.geometry.distance_spheroid(
sa.bindparam('centroid', value=centroid, type_=Geometry)).label('distance'))
async def complete_linked_places(conn: SearchConnection, result: BaseResult) -> None:
""" Retrieve information about places that link to the result.
"""
result.linked_rows = AddressLines()
if result.source_table != SourceTable.PLACEX:
return
sql = _placex_select_address_row(conn, result.centroid)\
.where(conn.t.placex.c.linked_place_id == result.place_id)
for row in await conn.execute(sql):
result.linked_rows.append(_result_row_to_address_row(row))
async def complete_keywords(conn: SearchConnection, result: BaseResult) -> None:
""" Retrieve information about the search terms used for this place.
Requires that the query analyzer was initialised to get access to
the word table.
"""
t = conn.t.search_name
sql = sa.select(t.c.name_vector, t.c.nameaddress_vector)\
.where(t.c.place_id == result.place_id)
result.name_keywords = []
result.address_keywords = []
t = conn.t.meta.tables['word']
sel = sa.select(t.c.word_id, t.c.word_token, t.c.word)
for name_tokens, address_tokens in await conn.execute(sql):
for row in await conn.execute(sel.where(t.c.word_id.in_(name_tokens))):
result.name_keywords.append(WordInfo(*row))
for row in await conn.execute(sel.where(t.c.word_id.in_(address_tokens))):
result.address_keywords.append(WordInfo(*row))
async def complete_parented_places(conn: SearchConnection, result: BaseResult) -> None:
""" Retrieve information about places that the result provides the
address for.
"""
result.parented_rows = AddressLines()
if result.source_table != SourceTable.PLACEX:
return
sql = _placex_select_address_row(conn, result.centroid)\
.where(conn.t.placex.c.parent_place_id == result.place_id)\
.where(conn.t.placex.c.rank_search == 30)
for row in await conn.execute(sql):
result.parented_rows.append(_result_row_to_address_row(row))

@@ -0,0 +1,603 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Implementation of reverse geocoding.
"""
from typing import Optional, List, Callable, Type, Tuple, Dict, Any, cast, Union
import functools
import sqlalchemy as sa
from nominatim_core.typing import SaColumn, SaSelect, SaFromClause, SaLabel, SaRow,\
SaBind, SaLambdaSelect
from nominatim_core.db.sqlalchemy_types import Geometry
from .connection import SearchConnection
from . import results as nres
from .logging import log
from .types import AnyPoint, DataLayer, ReverseDetails, GeometryFormat, Bbox
# In SQLAlchemy, expressions which compare with NULL need to be expressed with
# the equal sign.
# pylint: disable=singleton-comparison
RowFunc = Callable[[Optional[SaRow], Type[nres.ReverseResult]], Optional[nres.ReverseResult]]
WKT_PARAM: SaBind = sa.bindparam('wkt', type_=Geometry)
MAX_RANK_PARAM: SaBind = sa.bindparam('max_rank')
def no_index(expr: SaColumn) -> SaColumn:
""" Wrap the given expression, so that the query planner will
refrain from using the expression for index lookup.
"""
return sa.func.coalesce(sa.null(), expr) # pylint: disable=not-callable
def _select_from_placex(t: SaFromClause, use_wkt: bool = True) -> SaSelect:
""" Create a select statement with the columns relevant for reverse
results.
"""
if not use_wkt:
distance = t.c.distance
centroid = t.c.centroid
else:
distance = t.c.geometry.ST_Distance(WKT_PARAM)
centroid = sa.case((t.c.geometry.is_line_like(), t.c.geometry.ST_ClosestPoint(WKT_PARAM)),
else_=t.c.centroid).label('centroid')
return sa.select(t.c.place_id, t.c.osm_type, t.c.osm_id, t.c.name,
t.c.class_, t.c.type,
t.c.address, t.c.extratags,
t.c.housenumber, t.c.postcode, t.c.country_code,
t.c.importance, t.c.wikipedia,
t.c.parent_place_id, t.c.rank_address, t.c.rank_search,
centroid,
t.c.linked_place_id, t.c.admin_level,
distance.label('distance'),
t.c.geometry.ST_Expand(0).label('bbox'))
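# Worked example for the interpolation helpers below (illustrative values):
# with startnumber=2, endnumber=10, step=2 and a position of 0.3 along the
# line, the interpolated housenumber is 2 + round((10 - 2) * 0.3 / 2) * 2 = 4.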
def _interpolated_housenumber(table: SaFromClause) -> SaLabel:
return sa.cast(table.c.startnumber
+ sa.func.round(((table.c.endnumber - table.c.startnumber) * table.c.position)
/ table.c.step) * table.c.step,
sa.Integer).label('housenumber')
def _interpolated_position(table: SaFromClause) -> SaLabel:
fac = sa.cast(table.c.step, sa.Float) / (table.c.endnumber - table.c.startnumber)
rounded_pos = sa.func.round(table.c.position / fac) * fac
return sa.case(
(table.c.endnumber == table.c.startnumber, table.c.linegeo.ST_Centroid()),
else_=table.c.linegeo.ST_LineInterpolatePoint(rounded_pos)).label('centroid')
def _locate_interpolation(table: SaFromClause) -> SaLabel:
""" Given a position, locate the closest point on the line.
"""
return sa.case((table.c.linegeo.is_line_like(),
table.c.linegeo.ST_LineLocatePoint(WKT_PARAM)),
else_=0).label('position')
def _get_closest(*rows: Optional[SaRow]) -> Optional[SaRow]:
return min(rows, key=lambda row: 1000 if row is None else row.distance)
class ReverseGeocoder:
""" Class implementing the logic for looking up a place from a
coordinate.
"""
def __init__(self, conn: SearchConnection, params: ReverseDetails,
restrict_to_country_areas: bool = False) -> None:
self.conn = conn
self.params = params
self.restrict_to_country_areas = restrict_to_country_areas
self.bind_params: Dict[str, Any] = {'max_rank': params.max_rank}
@property
def max_rank(self) -> int:
""" Return the maximum configured rank.
"""
return self.params.max_rank
def has_geometries(self) -> bool:
""" Check if any geometries are requested.
"""
return bool(self.params.geometry_output)
def layer_enabled(self, *layer: DataLayer) -> bool:
""" Return true when any of the given layer types are requested.
"""
return any(self.params.layers & l for l in layer)
def layer_disabled(self, *layer: DataLayer) -> bool:
""" Return true when none of the given layer types is requested.
"""
return not any(self.params.layers & l for l in layer)
def has_feature_layers(self) -> bool:
""" Return true if any layer other than ADDRESS or POI is requested.
"""
return self.layer_enabled(DataLayer.RAILWAY, DataLayer.MANMADE, DataLayer.NATURAL)
def _add_geometry_columns(self, sql: SaLambdaSelect, col: SaColumn) -> SaSelect:
out = []
if self.params.geometry_simplification > 0.0:
col = sa.func.ST_SimplifyPreserveTopology(col, self.params.geometry_simplification)
if self.params.geometry_output & GeometryFormat.GEOJSON:
out.append(sa.func.ST_AsGeoJSON(col, 7).label('geometry_geojson'))
if self.params.geometry_output & GeometryFormat.TEXT:
out.append(sa.func.ST_AsText(col).label('geometry_text'))
if self.params.geometry_output & GeometryFormat.KML:
out.append(sa.func.ST_AsKML(col, 7).label('geometry_kml'))
if self.params.geometry_output & GeometryFormat.SVG:
out.append(sa.func.ST_AsSVG(col, 0, 7).label('geometry_svg'))
return sql.add_columns(*out)
def _filter_by_layer(self, table: SaFromClause) -> SaColumn:
if self.layer_enabled(DataLayer.MANMADE):
exclude = []
if self.layer_disabled(DataLayer.RAILWAY):
exclude.append('railway')
if self.layer_disabled(DataLayer.NATURAL):
exclude.extend(('natural', 'water', 'waterway'))
return table.c.class_.not_in(tuple(exclude))
include = []
if self.layer_enabled(DataLayer.RAILWAY):
include.append('railway')
if self.layer_enabled(DataLayer.NATURAL):
include.extend(('natural', 'water', 'waterway'))
return table.c.class_.in_(tuple(include))
async def _find_closest_street_or_poi(self, distance: float) -> Optional[SaRow]:
""" Look up the closest rank 26+ place in the database, which
is closer than the given distance.
"""
t = self.conn.t.placex
# PostgreSQL must not get the distance as a parameter because
# there is a danger it won't be able to properly estimate index use
# when used with prepared statements
diststr = sa.text(f"{distance}")
sql: SaLambdaSelect = sa.lambda_stmt(lambda: _select_from_placex(t)
.where(t.c.geometry.within_distance(WKT_PARAM, diststr))
.where(t.c.indexed_status == 0)
.where(t.c.linked_place_id == None)
.where(sa.or_(sa.not_(t.c.geometry.is_area()),
t.c.centroid.ST_Distance(WKT_PARAM) < diststr))
.order_by('distance')
.limit(2))
if self.has_geometries():
sql = self._add_geometry_columns(sql, t.c.geometry)
restrict: List[Union[SaColumn, Callable[[], SaColumn]]] = []
if self.layer_enabled(DataLayer.ADDRESS):
max_rank = min(29, self.max_rank)
restrict.append(lambda: no_index(t.c.rank_address).between(26, max_rank))
if self.max_rank == 30:
restrict.append(lambda: sa.func.IsAddressPoint(t))
if self.layer_enabled(DataLayer.POI) and self.max_rank == 30:
restrict.append(lambda: sa.and_(no_index(t.c.rank_search) == 30,
t.c.class_.not_in(('place', 'building')),
sa.not_(t.c.geometry.is_line_like())))
if self.has_feature_layers():
restrict.append(sa.and_(no_index(t.c.rank_search).between(26, MAX_RANK_PARAM),
no_index(t.c.rank_address) == 0,
self._filter_by_layer(t)))
if not restrict:
return None
sql = sql.where(sa.or_(*restrict))
# If the closest object is inside an area, then check if there is a
# POI node nearby and return that.
prev_row = None
for row in await self.conn.execute(sql, self.bind_params):
if prev_row is None:
if row.rank_search <= 27 or row.osm_type == 'N' or row.distance > 0:
return row
prev_row = row
else:
if row.rank_search > 27 and row.osm_type == 'N'\
and row.distance < 0.0001:
return row
return prev_row
async def _find_housenumber_for_street(self, parent_place_id: int) -> Optional[SaRow]:
t = self.conn.t.placex
def _base_query() -> SaSelect:
return _select_from_placex(t)\
.where(t.c.geometry.within_distance(WKT_PARAM, 0.001))\
.where(t.c.parent_place_id == parent_place_id)\
.where(sa.func.IsAddressPoint(t))\
.where(t.c.indexed_status == 0)\
.where(t.c.linked_place_id == None)\
.order_by('distance')\
.limit(1)
sql: SaLambdaSelect
if self.has_geometries():
sql = self._add_geometry_columns(_base_query(), t.c.geometry)
else:
sql = sa.lambda_stmt(_base_query)
return (await self.conn.execute(sql, self.bind_params)).one_or_none()
async def _find_interpolation_for_street(self, parent_place_id: Optional[int],
distance: float) -> Optional[SaRow]:
t = self.conn.t.osmline
sql = sa.select(t,
t.c.linegeo.ST_Distance(WKT_PARAM).label('distance'),
_locate_interpolation(t))\
.where(t.c.linegeo.within_distance(WKT_PARAM, distance))\
.where(t.c.startnumber != None)\
.order_by('distance')\
.limit(1)
if parent_place_id is not None:
sql = sql.where(t.c.parent_place_id == parent_place_id)
inner = sql.subquery('ipol')
sql = sa.select(inner.c.place_id, inner.c.osm_id,
inner.c.parent_place_id, inner.c.address,
_interpolated_housenumber(inner),
_interpolated_position(inner),
inner.c.postcode, inner.c.country_code,
inner.c.distance)
if self.has_geometries():
sub = sql.subquery('geom')
sql = self._add_geometry_columns(sa.select(sub), sub.c.centroid)
return (await self.conn.execute(sql, self.bind_params)).one_or_none()
async def _find_tiger_number_for_street(self, parent_place_id: int) -> Optional[SaRow]:
t = self.conn.t.tiger
def _base_query() -> SaSelect:
inner = sa.select(t,
t.c.linegeo.ST_Distance(WKT_PARAM).label('distance'),
_locate_interpolation(t))\
.where(t.c.linegeo.within_distance(WKT_PARAM, 0.001))\
.where(t.c.parent_place_id == parent_place_id)\
.order_by('distance')\
.limit(1)\
.subquery('tiger')
return sa.select(inner.c.place_id,
inner.c.parent_place_id,
_interpolated_housenumber(inner),
_interpolated_position(inner),
inner.c.postcode,
inner.c.distance)
sql: SaLambdaSelect
if self.has_geometries():
sub = _base_query().subquery('geom')
sql = self._add_geometry_columns(sa.select(sub), sub.c.centroid)
else:
sql = sa.lambda_stmt(_base_query)
return (await self.conn.execute(sql, self.bind_params)).one_or_none()
async def lookup_street_poi(self) -> Tuple[Optional[SaRow], RowFunc]:
""" Find a street or POI/address for the given WKT point.
"""
log().section('Reverse lookup on street/address level')
distance = 0.006
parent_place_id = None
row = await self._find_closest_street_or_poi(distance)
row_func: RowFunc = nres.create_from_placex_row
log().var_dump('Result (street/building)', row)
# If the closest result was a street, but an address was requested,
# check for a housenumber nearby which is part of the street.
if row is not None:
if self.max_rank > 27 \
and self.layer_enabled(DataLayer.ADDRESS) \
and row.rank_address <= 27:
distance = 0.001
parent_place_id = row.place_id
log().comment('Find housenumber for street')
addr_row = await self._find_housenumber_for_street(parent_place_id)
log().var_dump('Result (street housenumber)', addr_row)
if addr_row is not None:
row = addr_row
row_func = nres.create_from_placex_row
distance = addr_row.distance
elif row.country_code == 'us' and parent_place_id is not None:
log().comment('Find TIGER housenumber for street')
addr_row = await self._find_tiger_number_for_street(parent_place_id)
log().var_dump('Result (street Tiger housenumber)', addr_row)
if addr_row is not None:
row_func = cast(RowFunc,
functools.partial(nres.create_from_tiger_row,
osm_type=row.osm_type,
osm_id=row.osm_id))
row = addr_row
else:
distance = row.distance
# Check for an interpolation that is either closer than our result
        # or belongs to the street found close by.
if self.max_rank > 27 and self.layer_enabled(DataLayer.ADDRESS):
log().comment('Find interpolation for street')
addr_row = await self._find_interpolation_for_street(parent_place_id,
distance)
log().var_dump('Result (street interpolation)', addr_row)
if addr_row is not None:
row = addr_row
row_func = nres.create_from_osmline_row
return row, row_func
async def _lookup_area_address(self) -> Optional[SaRow]:
""" Lookup large addressable areas for the given WKT point.
"""
log().comment('Reverse lookup by larger address area features')
t = self.conn.t.placex
def _base_query() -> SaSelect:
# The inner SQL brings results in the right order, so that
# later only a minimum of results needs to be checked with ST_Contains.
inner = sa.select(t, sa.literal(0.0).label('distance'))\
.where(t.c.rank_search.between(5, MAX_RANK_PARAM))\
.where(t.c.geometry.intersects(WKT_PARAM))\
.where(sa.func.PlacexGeometryReverseLookuppolygon())\
.order_by(sa.desc(t.c.rank_search))\
.limit(50)\
.subquery('area')
return _select_from_placex(inner, False)\
.where(inner.c.geometry.ST_Contains(WKT_PARAM))\
.order_by(sa.desc(inner.c.rank_search))\
.limit(1)
sql: SaLambdaSelect = sa.lambda_stmt(_base_query)
if self.has_geometries():
sql = self._add_geometry_columns(sql, sa.literal_column('area.geometry'))
address_row = (await self.conn.execute(sql, self.bind_params)).one_or_none()
log().var_dump('Result (area)', address_row)
if address_row is not None and address_row.rank_search < self.max_rank:
log().comment('Search for better matching place nodes inside the area')
address_rank = address_row.rank_search
address_id = address_row.place_id
def _place_inside_area_query() -> SaSelect:
inner = \
sa.select(t,
t.c.geometry.ST_Distance(WKT_PARAM).label('distance'))\
.where(t.c.rank_search > address_rank)\
.where(t.c.rank_search <= MAX_RANK_PARAM)\
.where(t.c.indexed_status == 0)\
.where(sa.func.IntersectsReverseDistance(t, WKT_PARAM))\
.order_by(sa.desc(t.c.rank_search))\
.limit(50)\
.subquery('places')
touter = t.alias('outer')
return _select_from_placex(inner, False)\
.join(touter, touter.c.geometry.ST_Contains(inner.c.geometry))\
.where(touter.c.place_id == address_id)\
.where(sa.func.IsBelowReverseDistance(inner.c.distance, inner.c.rank_search))\
.order_by(sa.desc(inner.c.rank_search), inner.c.distance)\
.limit(1)
if self.has_geometries():
sql = self._add_geometry_columns(_place_inside_area_query(),
sa.literal_column('places.geometry'))
else:
sql = sa.lambda_stmt(_place_inside_area_query)
place_address_row = (await self.conn.execute(sql, self.bind_params)).one_or_none()
log().var_dump('Result (place node)', place_address_row)
if place_address_row is not None:
return place_address_row
return address_row
async def _lookup_area_others(self) -> Optional[SaRow]:
t = self.conn.t.placex
inner = sa.select(t, t.c.geometry.ST_Distance(WKT_PARAM).label('distance'))\
.where(t.c.rank_address == 0)\
.where(t.c.rank_search.between(5, MAX_RANK_PARAM))\
.where(t.c.name != None)\
.where(t.c.indexed_status == 0)\
.where(t.c.linked_place_id == None)\
.where(self._filter_by_layer(t))\
.where(t.c.geometry.intersects(sa.func.ST_Expand(WKT_PARAM, 0.007)))\
.order_by(sa.desc(t.c.rank_search))\
.order_by('distance')\
.limit(50)\
.subquery()
sql = _select_from_placex(inner, False)\
.where(sa.or_(sa.not_(inner.c.geometry.is_area()),
inner.c.geometry.ST_Contains(WKT_PARAM)))\
.order_by(sa.desc(inner.c.rank_search), inner.c.distance)\
.limit(1)
if self.has_geometries():
sql = self._add_geometry_columns(sql, inner.c.geometry)
row = (await self.conn.execute(sql, self.bind_params)).one_or_none()
log().var_dump('Result (non-address feature)', row)
return row
async def lookup_area(self) -> Optional[SaRow]:
""" Lookup large areas for the current search.
"""
log().section('Reverse lookup by larger area features')
if self.layer_enabled(DataLayer.ADDRESS):
address_row = await self._lookup_area_address()
else:
address_row = None
if self.has_feature_layers():
other_row = await self._lookup_area_others()
else:
other_row = None
return _get_closest(address_row, other_row)
async def lookup_country_codes(self) -> List[str]:
""" Lookup the country for the current search.
"""
log().section('Reverse lookup by country code')
t = self.conn.t.country_grid
sql = sa.select(t.c.country_code).distinct()\
.where(t.c.geometry.ST_Contains(WKT_PARAM))
ccodes = [cast(str, r[0]) for r in await self.conn.execute(sql, self.bind_params)]
log().var_dump('Country codes', ccodes)
return ccodes
async def lookup_country(self, ccodes: List[str]) -> Optional[SaRow]:
""" Lookup the country for the current search.
"""
if not ccodes:
ccodes = await self.lookup_country_codes()
if not ccodes:
return None
t = self.conn.t.placex
if self.max_rank > 4:
log().comment('Search for place nodes in country')
def _base_query() -> SaSelect:
inner = \
sa.select(t,
t.c.geometry.ST_Distance(WKT_PARAM).label('distance'))\
.where(t.c.rank_search > 4)\
.where(t.c.rank_search <= MAX_RANK_PARAM)\
.where(t.c.indexed_status == 0)\
.where(t.c.country_code.in_(ccodes))\
.where(sa.func.IntersectsReverseDistance(t, WKT_PARAM))\
.order_by(sa.desc(t.c.rank_search))\
.limit(50)\
.subquery('area')
return _select_from_placex(inner, False)\
.where(sa.func.IsBelowReverseDistance(inner.c.distance, inner.c.rank_search))\
.order_by(sa.desc(inner.c.rank_search), inner.c.distance)\
.limit(1)
sql: SaLambdaSelect
if self.has_geometries():
sql = self._add_geometry_columns(_base_query(),
sa.literal_column('area.geometry'))
else:
sql = sa.lambda_stmt(_base_query)
address_row = (await self.conn.execute(sql, self.bind_params)).one_or_none()
log().var_dump('Result (addressable place node)', address_row)
else:
address_row = None
if address_row is None:
# Still nothing, then return a country with the appropriate country code.
def _country_base_query() -> SaSelect:
return _select_from_placex(t)\
.where(t.c.country_code.in_(ccodes))\
.where(t.c.rank_address == 4)\
.where(t.c.rank_search == 4)\
.where(t.c.linked_place_id == None)\
.order_by('distance')\
.limit(1)
if self.has_geometries():
sql = self._add_geometry_columns(_country_base_query(), t.c.geometry)
else:
sql = sa.lambda_stmt(_country_base_query)
address_row = (await self.conn.execute(sql, self.bind_params)).one_or_none()
return address_row
async def lookup(self, coord: AnyPoint) -> Optional[nres.ReverseResult]:
""" Look up a single coordinate. Returns the place information,
if a place was found near the coordinates or None otherwise.
"""
log().function('reverse_lookup', coord=coord, params=self.params)
self.bind_params['wkt'] = f'POINT({coord[0]} {coord[1]})'
row: Optional[SaRow] = None
row_func: RowFunc = nres.create_from_placex_row
if self.max_rank >= 26:
row, tmp_row_func = await self.lookup_street_poi()
if row is not None:
row_func = tmp_row_func
if row is None:
if self.restrict_to_country_areas:
ccodes = await self.lookup_country_codes()
if not ccodes:
return None
else:
ccodes = []
if self.max_rank > 4:
row = await self.lookup_area()
if row is None and self.layer_enabled(DataLayer.ADDRESS):
row = await self.lookup_country(ccodes)
result = row_func(row, nres.ReverseResult)
if result is not None:
assert row is not None
result.distance = row.distance
if hasattr(row, 'bbox'):
result.bbox = Bbox.from_wkb(row.bbox)
await nres.add_result_details(self.conn, [result], self.params)
return result

View File

@@ -0,0 +1,15 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Module for forward search.
"""
# pylint: disable=useless-import-alias
from .geocoder import (ForwardGeocoder as ForwardGeocoder)
from .query import (Phrase as Phrase,
PhraseType as PhraseType)
from .query_analyzer_factory import (make_query_analyzer as make_query_analyzer)

View File

@@ -0,0 +1,459 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Conversion from token assignment to an abstract DB search.
"""
from typing import Optional, List, Tuple, Iterator, Dict
import heapq
from ..types import SearchDetails, DataLayer
from .query import QueryStruct, Token, TokenType, TokenRange, BreakType
from .token_assignment import TokenAssignment
from . import db_search_fields as dbf
from . import db_searches as dbs
from . import db_search_lookups as lookups
def wrap_near_search(categories: List[Tuple[str, str]],
search: dbs.AbstractSearch) -> dbs.NearSearch:
""" Create a new search that wraps the given search in a search
for near places of the given category.
"""
return dbs.NearSearch(penalty=search.penalty,
categories=dbf.WeightedCategories(categories,
[0.0] * len(categories)),
search=search)
def build_poi_search(category: List[Tuple[str, str]],
countries: Optional[List[str]]) -> dbs.PoiSearch:
""" Create a new search for places by the given category, possibly
constrained to the given countries.
"""
if countries:
ccs = dbf.WeightedStrings(countries, [0.0] * len(countries))
else:
ccs = dbf.WeightedStrings([], [])
class _PoiData(dbf.SearchData):
penalty = 0.0
qualifiers = dbf.WeightedCategories(category, [0.0] * len(category))
countries = ccs
return dbs.PoiSearch(_PoiData())
class SearchBuilder:
""" Build the abstract search queries from token assignments.
"""
def __init__(self, query: QueryStruct, details: SearchDetails) -> None:
self.query = query
self.details = details
@property
def configured_for_country(self) -> bool:
""" Return true if the search details are configured to
allow countries in the result.
"""
return self.details.min_rank <= 4 and self.details.max_rank >= 4 \
and self.details.layer_enabled(DataLayer.ADDRESS)
@property
def configured_for_postcode(self) -> bool:
""" Return true if the search details are configured to
allow postcodes in the result.
"""
return self.details.min_rank <= 5 and self.details.max_rank >= 11\
and self.details.layer_enabled(DataLayer.ADDRESS)
@property
def configured_for_housenumbers(self) -> bool:
""" Return true if the search details are configured to
allow addresses in the result.
"""
return self.details.max_rank >= 30 \
and self.details.layer_enabled(DataLayer.ADDRESS)
def build(self, assignment: TokenAssignment) -> Iterator[dbs.AbstractSearch]:
""" Yield all possible abstract searches for the given token assignment.
"""
sdata = self.get_search_data(assignment)
if sdata is None:
return
near_items = self.get_near_items(assignment)
if near_items is not None and not near_items:
return  # impossible combination of near items and category parameter
if assignment.name is None:
if near_items and not sdata.postcodes:
sdata.qualifiers = near_items
near_items = None
builder = self.build_poi_search(sdata)
elif assignment.housenumber:
hnr_tokens = self.query.get_tokens(assignment.housenumber,
TokenType.HOUSENUMBER)
builder = self.build_housenumber_search(sdata, hnr_tokens, assignment.address)
else:
builder = self.build_special_search(sdata, assignment.address,
bool(near_items))
else:
builder = self.build_name_search(sdata, assignment.name, assignment.address,
bool(near_items))
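# If a near-item (category) term remains, wrap every generated search in a
# NearSearch and rebase the near-item penalties so the cheapest variant
# starts at zero.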
if near_items:
penalty = min(near_items.penalties)
near_items.penalties = [p - penalty for p in near_items.penalties]
for search in builder:
search_penalty = search.penalty
search.penalty = 0.0
yield dbs.NearSearch(penalty + assignment.penalty + search_penalty,
near_items, search)
else:
for search in builder:
search.penalty += assignment.penalty
yield search
def build_poi_search(self, sdata: dbf.SearchData) -> Iterator[dbs.AbstractSearch]:
""" Build abstract search query for a simple category search.
This kind of search requires an additional geographic constraint.
"""
if not sdata.housenumbers \
and ((self.details.viewbox and self.details.bounded_viewbox) or self.details.near):
yield dbs.PoiSearch(sdata)
def build_special_search(self, sdata: dbf.SearchData,
address: List[TokenRange],
is_category: bool) -> Iterator[dbs.AbstractSearch]:
""" Build abstract search queries for searches that do not involve
a named place.
"""
if sdata.qualifiers:
# No special searches over qualifiers supported.
return
if sdata.countries and not address and not sdata.postcodes \
and self.configured_for_country:
yield dbs.CountrySearch(sdata)
if sdata.postcodes and (is_category or self.configured_for_postcode):
penalty = 0.0 if sdata.countries else 0.1
if address:
sdata.lookups = [dbf.FieldLookup('nameaddress_vector',
[t.token for r in address
for t in self.query.get_partials_list(r)],
lookups.Restrict)]
penalty += 0.2
yield dbs.PostcodeSearch(penalty, sdata)
def build_housenumber_search(self, sdata: dbf.SearchData, hnrs: List[Token],
address: List[TokenRange]) -> Iterator[dbs.AbstractSearch]:
""" Build a simple address search for special entries where the
housenumber is the main name token.
"""
sdata.lookups = [dbf.FieldLookup('name_vector', [t.token for t in hnrs], lookups.LookupAny)]
expected_count = sum(t.count for t in hnrs)
partials = {t.token: t.addr_count for trange in address
for t in self.query.get_partials_list(trange)}
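# Choose the address strategy by selectivity: restrict by address partials
# when the housenumber tokens are selective, use an indexed lookup for
# moderately frequent partials and otherwise fall back to full-word
# address tokens (bailing out if there are too many of them).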
if expected_count < 8000:
sdata.lookups.append(dbf.FieldLookup('nameaddress_vector',
list(partials), lookups.Restrict))
elif len(partials) != 1 or list(partials.values())[0] < 10000:
sdata.lookups.append(dbf.FieldLookup('nameaddress_vector',
list(partials), lookups.LookupAll))
else:
addr_fulls = [t.token for t
in self.query.get_tokens(address[0], TokenType.WORD)]
if len(addr_fulls) > 5:
return
sdata.lookups.append(
dbf.FieldLookup('nameaddress_vector', addr_fulls, lookups.LookupAny))
sdata.housenumbers = dbf.WeightedStrings([], [])
yield dbs.PlaceSearch(0.05, sdata, expected_count)
def build_name_search(self, sdata: dbf.SearchData,
name: TokenRange, address: List[TokenRange],
is_category: bool) -> Iterator[dbs.AbstractSearch]:
""" Build abstract search queries for simple name or address searches.
"""
if is_category or not sdata.housenumbers or self.configured_for_housenumbers:
ranking = self.get_name_ranking(name)
name_penalty = ranking.normalize_penalty()
if ranking.rankings:
sdata.rankings.append(ranking)
for penalty, count, lookup in self.yield_lookups(name, address):
sdata.lookups = lookup
yield dbs.PlaceSearch(penalty + name_penalty, sdata, count)
def yield_lookups(self, name: TokenRange, address: List[TokenRange])\
-> Iterator[Tuple[float, int, List[dbf.FieldLookup]]]:
""" Yield all variants how the given name and address should best
be searched for. This takes into account how frequent the terms
are and tries to find a lookup that optimizes index use.
"""
penalty = 0.0 # extra penalty
name_partials = {t.token: t for t in self.query.get_partials_list(name)}
addr_partials = [t for r in address for t in self.query.get_partials_list(r)]
addr_tokens = list({t.token for t in addr_partials})
partials_indexed = all(t.is_indexed for t in name_partials.values()) \
and all(t.is_indexed for t in addr_partials)
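# Estimated number of results for a lookup by name partials alone; every
# additional partial roughly halves the candidate set.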
exp_count = min(t.count for t in name_partials.values()) / (2**(len(name_partials) - 1))
if (len(name_partials) > 3 or exp_count < 8000) and partials_indexed:
yield penalty, exp_count, dbf.lookup_by_names(list(name_partials.keys()), addr_tokens)
return
addr_count = min(t.addr_count for t in addr_partials) if addr_partials else 30000
# Partial terms are too frequent. Try looking up by rare full names first.
name_fulls = self.query.get_tokens(name, TokenType.WORD)
if name_fulls:
fulls_count = sum(t.count for t in name_fulls)
if partials_indexed:
penalty += 1.2 * sum(t.penalty for t in addr_partials if not t.is_indexed)
if fulls_count < 50000 or addr_count < 30000:
yield penalty, fulls_count / (2**len(addr_tokens)), \
self.get_full_name_ranking(name_fulls, addr_partials,
fulls_count > 30000 / max(1, len(addr_tokens)))
# To catch the remaining results, look up by name and address.
# We only do this if a reasonable number of results is expected.
exp_count = exp_count / (2**len(addr_tokens)) if addr_tokens else exp_count
if exp_count < 10000 and addr_count < 20000\
and all(t.is_indexed for t in name_partials.values()):
penalty += 0.35 * max(1 if name_fulls else 0.1,
5 - len(name_partials) - len(addr_tokens))
yield penalty, exp_count,\
self.get_name_address_ranking(list(name_partials.keys()), addr_partials)
def get_name_address_ranking(self, name_tokens: List[int],
addr_partials: List[Token]) -> List[dbf.FieldLookup]:
""" Create a ranking expression looking up by name and address.
"""
lookup = [dbf.FieldLookup('name_vector', name_tokens, lookups.LookupAll)]
addr_restrict_tokens = []
addr_lookup_tokens = []
for t in addr_partials:
if t.is_indexed:
if t.addr_count > 20000:
addr_restrict_tokens.append(t.token)
else:
addr_lookup_tokens.append(t.token)
if addr_restrict_tokens:
lookup.append(dbf.FieldLookup('nameaddress_vector',
addr_restrict_tokens, lookups.Restrict))
if addr_lookup_tokens:
lookup.append(dbf.FieldLookup('nameaddress_vector',
addr_lookup_tokens, lookups.LookupAll))
return lookup
def get_full_name_ranking(self, name_fulls: List[Token], addr_partials: List[Token],
use_lookup: bool) -> List[dbf.FieldLookup]:
""" Create a ranking expression with full name terms and
additional address lookup. When 'use_lookup' is true, then
address lookups will use the index, as long as the occurrences are not
too many.
"""
# At this point drop unindexed partials from the address.
# This might yield wrong results, nothing we can do about that.
if use_lookup:
addr_restrict_tokens = []
addr_lookup_tokens = []
for t in addr_partials:
if t.is_indexed:
if t.addr_count > 20000:
addr_restrict_tokens.append(t.token)
else:
addr_lookup_tokens.append(t.token)
else:
addr_restrict_tokens = [t.token for t in addr_partials if t.is_indexed]
addr_lookup_tokens = []
return dbf.lookup_by_any_name([t.token for t in name_fulls],
addr_restrict_tokens, addr_lookup_tokens)
def get_name_ranking(self, trange: TokenRange,
db_field: str = 'name_vector') -> dbf.FieldRanking:
""" Create a ranking expression for a name term in the given range.
"""
name_fulls = self.query.get_tokens(trange, TokenType.WORD)
ranks = [dbf.RankedTokens(t.penalty, [t.token]) for t in name_fulls]
ranks.sort(key=lambda r: r.penalty)
# Fallback, sum of penalty for partials
name_partials = self.query.get_partials_list(trange)
default = sum(t.penalty for t in name_partials) + 0.2
return dbf.FieldRanking(db_field, default, ranks)
def get_addr_ranking(self, trange: TokenRange) -> dbf.FieldRanking:
""" Create a list of ranking expressions for an address term
for the given ranges.
"""
todo: List[Tuple[int, int, dbf.RankedTokens]] = []
heapq.heappush(todo, (0, trange.start, dbf.RankedTokens(0.0, [])))
ranks: List[dbf.RankedTokens] = []
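# Best-first expansion over the query nodes: heap entries are
# (-tokens consumed, position, partial ranking), so paths that already
# cover more of the range are expanded first.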
while todo: # pylint: disable=too-many-nested-blocks
neglen, pos, rank = heapq.heappop(todo)
for tlist in self.query.nodes[pos].starting:
if tlist.ttype in (TokenType.PARTIAL, TokenType.WORD):
if tlist.end < trange.end:
chgpenalty = PENALTY_WORDCHANGE[self.query.nodes[tlist.end].btype]
if tlist.ttype == TokenType.PARTIAL:
penalty = rank.penalty + chgpenalty \
+ max(t.penalty for t in tlist.tokens)
heapq.heappush(todo, (neglen - 1, tlist.end,
dbf.RankedTokens(penalty, rank.tokens)))
else:
for t in tlist.tokens:
heapq.heappush(todo, (neglen - 1, tlist.end,
rank.with_token(t, chgpenalty)))
elif tlist.end == trange.end:
if tlist.ttype == TokenType.PARTIAL:
ranks.append(dbf.RankedTokens(rank.penalty
+ max(t.penalty for t in tlist.tokens),
rank.tokens))
else:
ranks.extend(rank.with_token(t, 0.0) for t in tlist.tokens)
if len(ranks) >= 10:
# Too many variants, bail out and only add the worst-case
# fallback: sum of the penalties of the partials.
name_partials = self.query.get_partials_list(trange)
default = sum(t.penalty for t in name_partials) + 0.2
ranks.append(dbf.RankedTokens(rank.penalty + default, []))
# Bail out of outer loop
todo.clear()
break
ranks.sort(key=lambda r: len(r.tokens))
default = ranks[0].penalty + 0.3
del ranks[0]
ranks.sort(key=lambda r: r.penalty)
return dbf.FieldRanking('nameaddress_vector', default, ranks)
def get_search_data(self, assignment: TokenAssignment) -> Optional[dbf.SearchData]:
""" Collect the tokens for the non-name search fields in the
assignment.
"""
sdata = dbf.SearchData()
sdata.penalty = assignment.penalty
if assignment.country:
tokens = self.get_country_tokens(assignment.country)
if not tokens:
return None
sdata.set_strings('countries', tokens)
elif self.details.countries:
sdata.countries = dbf.WeightedStrings(self.details.countries,
[0.0] * len(self.details.countries))
if assignment.housenumber:
sdata.set_strings('housenumbers',
self.query.get_tokens(assignment.housenumber,
TokenType.HOUSENUMBER))
if assignment.postcode:
sdata.set_strings('postcodes',
self.query.get_tokens(assignment.postcode,
TokenType.POSTCODE))
if assignment.qualifier:
tokens = self.get_qualifier_tokens(assignment.qualifier)
if not tokens:
return None
sdata.set_qualifiers(tokens)
elif self.details.categories:
sdata.qualifiers = dbf.WeightedCategories(self.details.categories,
[0.0] * len(self.details.categories))
if assignment.address:
if not assignment.name and assignment.housenumber:
# housenumber search: the first item needs to be handled like
# a name in ranking or penalties are not comparable with
# normal searches.
sdata.set_ranking([self.get_name_ranking(assignment.address[0],
db_field='nameaddress_vector')]
+ [self.get_addr_ranking(r) for r in assignment.address[1:]])
else:
sdata.set_ranking([self.get_addr_ranking(r) for r in assignment.address])
else:
sdata.rankings = []
return sdata
def get_country_tokens(self, trange: TokenRange) -> List[Token]:
""" Return the list of country tokens for the given range,
optionally filtered by the country list from the details
parameters.
"""
tokens = self.query.get_tokens(trange, TokenType.COUNTRY)
if self.details.countries:
tokens = [t for t in tokens if t.lookup_word in self.details.countries]
return tokens
def get_qualifier_tokens(self, trange: TokenRange) -> List[Token]:
""" Return the list of qualifier tokens for the given range,
optionally filtered by the qualifier list from the details
parameters.
"""
tokens = self.query.get_tokens(trange, TokenType.QUALIFIER)
if self.details.categories:
tokens = [t for t in tokens if t.get_category() in self.details.categories]
return tokens
def get_near_items(self, assignment: TokenAssignment) -> Optional[dbf.WeightedCategories]:
""" Collect tokens for near items search or use the categories
requested per parameter.
Returns None if no category search is requested.
"""
if assignment.near_item:
tokens: Dict[Tuple[str, str], float] = {}
for t in self.query.get_tokens(assignment.near_item, TokenType.NEAR_ITEM):
cat = t.get_category()
# The category of a near search will be that of near_item.
# Thus, if the search is restricted by a category parameter,
# the two sets must intersect.
if (not self.details.categories or cat in self.details.categories)\
and t.penalty < tokens.get(cat, 1000.0):
tokens[cat] = t.penalty
return dbf.WeightedCategories(list(tokens.keys()), list(tokens.values()))
return None
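# Penalty applied when an address term continues across a break of the
# given type (used by get_addr_ranking above).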
PENALTY_WORDCHANGE = {
BreakType.START: 0.0,
BreakType.END: 0.0,
BreakType.PHRASE: 0.0,
BreakType.WORD: 0.1,
BreakType.PART: 0.2,
BreakType.TOKEN: 0.4
}

View File

@@ -0,0 +1,254 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Data structures for more complex fields in abstract search descriptions.
"""
from typing import List, Tuple, Iterator, Dict, Type
import dataclasses
import sqlalchemy as sa
from nominatim_core.typing import SaFromClause, SaColumn, SaExpression
from .query import Token
from . import db_search_lookups as lookups
from nominatim_core.utils.json_writer import JsonWriter
@dataclasses.dataclass
class WeightedStrings:
""" A list of strings together with a penalty.
"""
values: List[str]
penalties: List[float]
def __bool__(self) -> bool:
return bool(self.values)
def __iter__(self) -> Iterator[Tuple[str, float]]:
return iter(zip(self.values, self.penalties))
def get_penalty(self, value: str, default: float = 1000.0) -> float:
""" Get the penalty for the given value. Returns the given default
if the value does not exist.
"""
try:
return self.penalties[self.values.index(value)]
except ValueError:
pass
return default
@dataclasses.dataclass
class WeightedCategories:
""" A list of class/type tuples together with a penalty.
"""
values: List[Tuple[str, str]]
penalties: List[float]
def __bool__(self) -> bool:
return bool(self.values)
def __iter__(self) -> Iterator[Tuple[Tuple[str, str], float]]:
return iter(zip(self.values, self.penalties))
def get_penalty(self, value: Tuple[str, str], default: float = 1000.0) -> float:
""" Get the penalty for the given value. Returns the given default
if the value does not exist.
"""
try:
return self.penalties[self.values.index(value)]
except ValueError:
pass
return default
def sql_restrict(self, table: SaFromClause) -> SaExpression:
""" Return an SQLAlcheny expression that restricts the
class and type columns of the given table to the values
in the list.
Must not be used with an empty list.
"""
assert self.values
if len(self.values) == 1:
return sa.and_(table.c.class_ == self.values[0][0],
table.c.type == self.values[0][1])
return sa.or_(*(sa.and_(table.c.class_ == c, table.c.type == t)
for c, t in self.values))
@dataclasses.dataclass(order=True)
class RankedTokens:
""" List of tokens together with the penalty of using it.
"""
penalty: float
tokens: List[int]
def with_token(self, t: Token, transition_penalty: float) -> 'RankedTokens':
""" Create a new RankedTokens list with the given token appended.
The token's penalty as well as the given transition penalty
are added to the overall penalty.
"""
return RankedTokens(self.penalty + t.penalty + transition_penalty,
self.tokens + [t.token])
@dataclasses.dataclass
class FieldRanking:
""" A list of rankings to be applied sequentially until one matches.
The matched ranking determines the penalty. If none matches, a
default penalty is applied.
"""
column: str
default: float
rankings: List[RankedTokens]
def normalize_penalty(self) -> float:
""" Reduce the default and ranking penalties, such that the minimum
penalty is 0. Return the penalty that was subtracted.
"""
if self.rankings:
min_penalty = min(self.default, min(r.penalty for r in self.rankings))
else:
min_penalty = self.default
if min_penalty > 0.0:
self.default -= min_penalty
for ranking in self.rankings:
ranking.penalty -= min_penalty
return min_penalty
def sql_penalty(self, table: SaFromClause) -> SaColumn:
""" Create an SQL expression for the rankings.
"""
assert self.rankings
rout = JsonWriter().start_array()
for rank in self.rankings:
rout.start_array().value(rank.penalty).next()
rout.start_array()
for token in rank.tokens:
rout.value(token).next()
rout.end_array()
rout.end_array().next()
rout.end_array()
return sa.func.weigh_search(table.c[self.column], rout(), self.default)
@dataclasses.dataclass
class FieldLookup:
""" A list of tokens to be searched for. The column names the database
column to search in and the lookup_type the operator that is applied.
'lookup_all' requires all tokens to match. 'lookup_any' requires
one of the tokens to match. 'restrict' requires all tokens to match
but avoids the use of indexes.
"""
column: str
tokens: List[int]
lookup_type: Type[lookups.LookupType]
def sql_condition(self, table: SaFromClause) -> SaColumn:
""" Create an SQL expression for the given match condition.
"""
return self.lookup_type(table, self.column, self.tokens)
class SearchData:
""" Search fields derived from query and token assignment
to be used with the SQL queries.
"""
penalty: float
lookups: List[FieldLookup] = []
rankings: List[FieldRanking]
housenumbers: WeightedStrings = WeightedStrings([], [])
postcodes: WeightedStrings = WeightedStrings([], [])
countries: WeightedStrings = WeightedStrings([], [])
qualifiers: WeightedCategories = WeightedCategories([], [])
def set_strings(self, field: str, tokens: List[Token]) -> None:
""" Set on of the WeightedStrings properties from the given
token list. Adapt the global penalty, so that the
minimum penalty is 0.
"""
if tokens:
min_penalty = min(t.penalty for t in tokens)
self.penalty += min_penalty
wstrs = WeightedStrings([t.lookup_word for t in tokens],
[t.penalty - min_penalty for t in tokens])
setattr(self, field, wstrs)
def set_qualifiers(self, tokens: List[Token]) -> None:
""" Set the qulaifier field from the given tokens.
"""
if tokens:
categories: Dict[Tuple[str, str], float] = {}
min_penalty = 1000.0
for t in tokens:
min_penalty = min(min_penalty, t.penalty)
cat = t.get_category()
if t.penalty < categories.get(cat, 1000.0):
categories[cat] = t.penalty
self.penalty += min_penalty
self.qualifiers = WeightedCategories(list(categories.keys()),
list(categories.values()))
def set_ranking(self, rankings: List[FieldRanking]) -> None:
""" Set the list of rankings and normalize the ranking.
"""
self.rankings = []
for ranking in rankings:
if ranking.rankings:
self.penalty += ranking.normalize_penalty()
self.rankings.append(ranking)
else:
self.penalty += ranking.default
def lookup_by_names(name_tokens: List[int], addr_tokens: List[int]) -> List[FieldLookup]:
""" Create a lookup list where name tokens are looked up via index
and potential address tokens are used to restrict the search further.
"""
lookup = [FieldLookup('name_vector', name_tokens, lookups.LookupAll)]
if addr_tokens:
lookup.append(FieldLookup('nameaddress_vector', addr_tokens, lookups.Restrict))
return lookup
def lookup_by_any_name(name_tokens: List[int], addr_restrict_tokens: List[int],
addr_lookup_tokens: List[int]) -> List[FieldLookup]:
""" Create a lookup list where name tokens are looked up via index
and only one of the name tokens must be present.
Potential address tokens are used to restrict the search further.
"""
lookup = [FieldLookup('name_vector', name_tokens, lookups.LookupAny)]
if addr_restrict_tokens:
lookup.append(FieldLookup('nameaddress_vector', addr_restrict_tokens, lookups.Restrict))
if addr_lookup_tokens:
lookup.append(FieldLookup('nameaddress_vector', addr_lookup_tokens, lookups.LookupAll))
return lookup
def lookup_by_addr(name_tokens: List[int], addr_tokens: List[int]) -> List[FieldLookup]:
""" Create a lookup list where address tokens are looked up via index
and the name tokens are only used to restrict the search further.
"""
return [FieldLookup('name_vector', name_tokens, lookups.Restrict),
FieldLookup('nameaddress_vector', addr_tokens, lookups.LookupAll)]
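# Illustrative sketch only (the token IDs are made up): lookup_by_names([1, 2], [7])
# returns a LookupAll on 'name_vector' plus a Restrict on 'nameaddress_vector',
# i.e. both name tokens must match via the index while the address token
# merely filters the result.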

View File

@@ -0,0 +1,114 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Implementation of lookup functions for the search_name table.
"""
from typing import List, Any
import sqlalchemy as sa
from sqlalchemy.ext.compiler import compiles
from nominatim_core.typing import SaFromClause
from nominatim_core.db.sqlalchemy_types import IntArray
# pylint: disable=consider-using-f-string
LookupType = sa.sql.expression.FunctionElement[Any]
class LookupAll(LookupType):
""" Find all entries in search_name table that contain all of
a given list of tokens using an index for the search.
"""
inherit_cache = True
def __init__(self, table: SaFromClause, column: str, tokens: List[int]) -> None:
super().__init__(table.c.place_id, getattr(table.c, column), column,
sa.type_coerce(tokens, IntArray))
@compiles(LookupAll) # type: ignore[no-untyped-call, misc]
def _default_lookup_all(element: LookupAll,
compiler: 'sa.Compiled', **kw: Any) -> str:
_, col, _, tokens = list(element.clauses)
return "(%s @> %s)" % (compiler.process(col, **kw),
compiler.process(tokens, **kw))
@compiles(LookupAll, 'sqlite') # type: ignore[no-untyped-call, misc]
def _sqlite_lookup_all(element: LookupAll,
compiler: 'sa.Compiled', **kw: Any) -> str:
place, col, colname, tokens = list(element.clauses)
return "(%s IN (SELECT CAST(value as bigint) FROM"\
" (SELECT array_intersect_fuzzy(places) as p FROM"\
" (SELECT places FROM reverse_search_name"\
" WHERE word IN (SELECT value FROM json_each('[' || %s || ']'))"\
" AND column = %s"\
" ORDER BY length(places)) as x) as u,"\
" json_each('[' || u.p || ']'))"\
" AND array_contains(%s, %s))"\
% (compiler.process(place, **kw),
compiler.process(tokens, **kw),
compiler.process(colname, **kw),
compiler.process(col, **kw),
compiler.process(tokens, **kw)
)
class LookupAny(LookupType):
""" Find all entries that contain at least one of the given tokens.
Use an index for the search.
"""
inherit_cache = True
def __init__(self, table: SaFromClause, column: str, tokens: List[int]) -> None:
super().__init__(table.c.place_id, getattr(table.c, column), column,
sa.type_coerce(tokens, IntArray))
@compiles(LookupAny) # type: ignore[no-untyped-call, misc]
def _default_lookup_any(element: LookupAny,
compiler: 'sa.Compiled', **kw: Any) -> str:
_, col, _, tokens = list(element.clauses)
return "(%s && %s)" % (compiler.process(col, **kw),
compiler.process(tokens, **kw))
@compiles(LookupAny, 'sqlite') # type: ignore[no-untyped-call, misc]
def _sqlite_lookup_any(element: LookupAny,
compiler: 'sa.Compiled', **kw: Any) -> str:
place, _, colname, tokens = list(element.clauses)
return "%s IN (SELECT CAST(value as bigint) FROM"\
" (SELECT array_union(places) as p FROM reverse_search_name"\
" WHERE word IN (SELECT value FROM json_each('[' || %s || ']'))"\
" AND column = %s) as u,"\
" json_each('[' || u.p || ']'))" % (compiler.process(place, **kw),
compiler.process(tokens, **kw),
compiler.process(colname, **kw))
class Restrict(LookupType):
""" Find all entries that contain all of the given tokens.
Do not use an index for the search.
"""
inherit_cache = True
def __init__(self, table: SaFromClause, column: str, tokens: List[int]) -> None:
super().__init__(getattr(table.c, column),
sa.type_coerce(tokens, IntArray))
@compiles(Restrict) # type: ignore[no-untyped-call, misc]
def _default_restrict(element: Restrict,
compiler: 'sa.Compiled', **kw: Any) -> str:
arg1, arg2 = list(element.clauses)
return "(coalesce(null, %s) @> %s)" % (compiler.process(arg1, **kw),
compiler.process(arg2, **kw))
@compiles(Restrict, 'sqlite') # type: ignore[no-untyped-call, misc]
def _sqlite_restrict(element: Restrict,
compiler: 'sa.Compiled', **kw: Any) -> str:
return "array_contains(%s)" % compiler.process(element.clauses, **kw)

View File

@@ -0,0 +1,874 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Implementation of the actual database accesses for forward search.
"""
from typing import List, Tuple, AsyncIterator, Dict, Any, Callable, cast
import abc
import sqlalchemy as sa
from nominatim_core.typing import SaFromClause, SaScalarSelect, SaColumn, \
SaExpression, SaSelect, SaLambdaSelect, SaRow, SaBind
from nominatim_core.db.sqlalchemy_types import Geometry, IntArray
from ..connection import SearchConnection
from ..types import SearchDetails, DataLayer, GeometryFormat, Bbox
from .. import results as nres
from .db_search_fields import SearchData, WeightedCategories
#pylint: disable=singleton-comparison,not-callable
#pylint: disable=too-many-branches,too-many-arguments,too-many-locals,too-many-statements
def no_index(expr: SaColumn) -> SaColumn:
""" Wrap the given expression, so that the query planner will
refrain from using the expression for index lookup.
"""
return sa.func.coalesce(sa.null(), expr) # pylint: disable=not-callable
def _details_to_bind_params(details: SearchDetails) -> Dict[str, Any]:
""" Create a dictionary from search parameters that can be used
as bind parameter for SQL execute.
"""
return {'limit': details.max_results,
'min_rank': details.min_rank,
'max_rank': details.max_rank,
'viewbox': details.viewbox,
'viewbox2': details.viewbox_x2,
'near': details.near,
'near_radius': details.near_radius,
'excluded': details.excluded,
'countries': details.countries}
LIMIT_PARAM: SaBind = sa.bindparam('limit')
MIN_RANK_PARAM: SaBind = sa.bindparam('min_rank')
MAX_RANK_PARAM: SaBind = sa.bindparam('max_rank')
VIEWBOX_PARAM: SaBind = sa.bindparam('viewbox', type_=Geometry)
VIEWBOX2_PARAM: SaBind = sa.bindparam('viewbox2', type_=Geometry)
NEAR_PARAM: SaBind = sa.bindparam('near', type_=Geometry)
NEAR_RADIUS_PARAM: SaBind = sa.bindparam('near_radius')
COUNTRIES_PARAM: SaBind = sa.bindparam('countries')
def filter_by_area(sql: SaSelect, t: SaFromClause,
details: SearchDetails, avoid_index: bool = False) -> SaSelect:
""" Apply SQL statements for filtering by viewbox and near point,
if applicable.
"""
if details.near is not None and details.near_radius is not None:
if details.near_radius < 0.1 and not avoid_index:
sql = sql.where(t.c.geometry.within_distance(NEAR_PARAM, NEAR_RADIUS_PARAM))
else:
sql = sql.where(t.c.geometry.ST_Distance(NEAR_PARAM) <= NEAR_RADIUS_PARAM)
if details.viewbox is not None and details.bounded_viewbox:
sql = sql.where(t.c.geometry.intersects(VIEWBOX_PARAM,
use_index=not avoid_index and
details.viewbox.area < 0.2))
return sql
def _exclude_places(t: SaFromClause) -> Callable[[], SaExpression]:
return lambda: t.c.place_id.not_in(sa.bindparam('excluded'))
def _select_placex(t: SaFromClause) -> SaSelect:
return sa.select(t.c.place_id, t.c.osm_type, t.c.osm_id, t.c.name,
t.c.class_, t.c.type,
t.c.address, t.c.extratags,
t.c.housenumber, t.c.postcode, t.c.country_code,
t.c.wikipedia,
t.c.parent_place_id, t.c.rank_address, t.c.rank_search,
t.c.linked_place_id, t.c.admin_level,
t.c.centroid,
t.c.geometry.ST_Expand(0).label('bbox'))
def _add_geometry_columns(sql: SaLambdaSelect, col: SaColumn, details: SearchDetails) -> SaSelect:
out = []
if details.geometry_simplification > 0.0:
col = sa.func.ST_SimplifyPreserveTopology(col, details.geometry_simplification)
if details.geometry_output & GeometryFormat.GEOJSON:
out.append(sa.func.ST_AsGeoJSON(col, 7).label('geometry_geojson'))
if details.geometry_output & GeometryFormat.TEXT:
out.append(sa.func.ST_AsText(col).label('geometry_text'))
if details.geometry_output & GeometryFormat.KML:
out.append(sa.func.ST_AsKML(col, 7).label('geometry_kml'))
if details.geometry_output & GeometryFormat.SVG:
out.append(sa.func.ST_AsSVG(col, 0, 7).label('geometry_svg'))
return sql.add_columns(*out)
def _make_interpolation_subquery(table: SaFromClause, inner: SaFromClause,
numerals: List[int], details: SearchDetails) -> SaScalarSelect:
all_ids = sa.func.ArrayAgg(table.c.place_id)
sql = sa.select(all_ids).where(table.c.parent_place_id == inner.c.place_id)
if len(numerals) == 1:
sql = sql.where(sa.between(numerals[0], table.c.startnumber, table.c.endnumber))\
.where((numerals[0] - table.c.startnumber) % table.c.step == 0)
else:
sql = sql.where(sa.or_(
*(sa.and_(sa.between(n, table.c.startnumber, table.c.endnumber),
(n - table.c.startnumber) % table.c.step == 0)
for n in numerals)))
if details.excluded:
sql = sql.where(_exclude_places(table))
return sql.scalar_subquery()
def _filter_by_layer(table: SaFromClause, layers: DataLayer) -> SaColumn:
orexpr: List[SaExpression] = []
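# Build one OR condition per requested layer: address and POI layers are
# distinguished by rank_address, the remaining layers by the object class.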
if layers & DataLayer.ADDRESS and layers & DataLayer.POI:
orexpr.append(no_index(table.c.rank_address).between(1, 30))
elif layers & DataLayer.ADDRESS:
orexpr.append(no_index(table.c.rank_address).between(1, 29))
orexpr.append(sa.func.IsAddressPoint(table))
elif layers & DataLayer.POI:
orexpr.append(sa.and_(no_index(table.c.rank_address) == 30,
table.c.class_.not_in(('place', 'building'))))
if layers & DataLayer.MANMADE:
exclude = []
if not layers & DataLayer.RAILWAY:
exclude.append('railway')
if not layers & DataLayer.NATURAL:
exclude.extend(('natural', 'water', 'waterway'))
orexpr.append(sa.and_(table.c.class_.not_in(tuple(exclude)),
no_index(table.c.rank_address) == 0))
else:
include = []
if layers & DataLayer.RAILWAY:
include.append('railway')
if layers & DataLayer.NATURAL:
include.extend(('natural', 'water', 'waterway'))
orexpr.append(sa.and_(table.c.class_.in_(tuple(include)),
no_index(table.c.rank_address) == 0))
if len(orexpr) == 1:
return orexpr[0]
return sa.or_(*orexpr)
def _interpolated_position(table: SaFromClause, nr: SaColumn) -> SaColumn:
pos = sa.cast(nr - table.c.startnumber, sa.Float) / (table.c.endnumber - table.c.startnumber)
return sa.case(
(table.c.endnumber == table.c.startnumber, table.c.linegeo.ST_Centroid()),
else_=table.c.linegeo.ST_LineInterpolatePoint(pos)).label('centroid')
async def _get_placex_housenumbers(conn: SearchConnection,
place_ids: List[int],
details: SearchDetails) -> AsyncIterator[nres.SearchResult]:
t = conn.t.placex
sql = _select_placex(t).add_columns(t.c.importance)\
.where(t.c.place_id.in_(place_ids))
if details.geometry_output:
sql = _add_geometry_columns(sql, t.c.geometry, details)
for row in await conn.execute(sql):
result = nres.create_from_placex_row(row, nres.SearchResult)
assert result
result.bbox = Bbox.from_wkb(row.bbox)
yield result
def _int_list_to_subquery(inp: List[int]) -> 'sa.Subquery':
""" Create a subselect that returns the given list of integers
as rows in the column 'nr'.
"""
vtab = sa.func.JsonArrayEach(sa.type_coerce(inp, sa.JSON))\
.table_valued(sa.column('value', type_=sa.JSON))
return sa.select(sa.cast(sa.cast(vtab.c.value, sa.Text), sa.Integer).label('nr')).subquery()
async def _get_osmline(conn: SearchConnection, place_ids: List[int],
numerals: List[int],
details: SearchDetails) -> AsyncIterator[nres.SearchResult]:
t = conn.t.osmline
values = _int_list_to_subquery(numerals)
sql = sa.select(t.c.place_id, t.c.osm_id,
t.c.parent_place_id, t.c.address,
values.c.nr.label('housenumber'),
_interpolated_position(t, values.c.nr),
t.c.postcode, t.c.country_code)\
.where(t.c.place_id.in_(place_ids))\
.join(values, values.c.nr.between(t.c.startnumber, t.c.endnumber))
if details.geometry_output:
sub = sql.subquery()
sql = _add_geometry_columns(sa.select(sub), sub.c.centroid, details)
for row in await conn.execute(sql):
result = nres.create_from_osmline_row(row, nres.SearchResult)
assert result
yield result
async def _get_tiger(conn: SearchConnection, place_ids: List[int],
numerals: List[int], osm_id: int,
details: SearchDetails) -> AsyncIterator[nres.SearchResult]:
t = conn.t.tiger
values = _int_list_to_subquery(numerals)
sql = sa.select(t.c.place_id, t.c.parent_place_id,
sa.literal('W').label('osm_type'),
sa.literal(osm_id).label('osm_id'),
values.c.nr.label('housenumber'),
_interpolated_position(t, values.c.nr),
t.c.postcode)\
.where(t.c.place_id.in_(place_ids))\
.join(values, values.c.nr.between(t.c.startnumber, t.c.endnumber))
if details.geometry_output:
sub = sql.subquery()
sql = _add_geometry_columns(sa.select(sub), sub.c.centroid, details)
for row in await conn.execute(sql):
result = nres.create_from_tiger_row(row, nres.SearchResult)
assert result
yield result
class AbstractSearch(abc.ABC):
""" Encapuslation of a single lookup in the database.
"""
SEARCH_PRIO: int = 2
def __init__(self, penalty: float) -> None:
self.penalty = penalty
@abc.abstractmethod
async def lookup(self, conn: SearchConnection,
details: SearchDetails) -> nres.SearchResults:
""" Find results for the search in the database.
"""
class NearSearch(AbstractSearch):
""" Category search of a place type near the result of another search.
"""
def __init__(self, penalty: float, categories: WeightedCategories,
search: AbstractSearch) -> None:
super().__init__(penalty)
self.search = search
self.categories = categories
async def lookup(self, conn: SearchConnection,
details: SearchDetails) -> nres.SearchResults:
""" Find results for the search in the database.
"""
results = nres.SearchResults()
base = await self.search.lookup(conn, details)
if not base:
return results
base.sort(key=lambda r: (r.accuracy, r.rank_search))
max_accuracy = base[0].accuracy + 0.5
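# Derive the allowed address-rank window for the near places from the rank
# of the best base result.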
if base[0].rank_address == 0:
min_rank = 0
max_rank = 0
elif base[0].rank_address < 26:
min_rank = 1
max_rank = min(25, base[0].rank_address + 4)
else:
min_rank = 26
max_rank = 30
base = nres.SearchResults(r for r in base if r.source_table == nres.SourceTable.PLACEX
and r.accuracy <= max_accuracy
and r.bbox and r.bbox.area < 20
and r.rank_address >= min_rank
and r.rank_address <= max_rank)
if base:
baseids = [b.place_id for b in base[:5] if b.place_id]
for category, penalty in self.categories:
await self.lookup_category(results, conn, baseids, category, penalty, details)
if len(results) >= details.max_results:
break
return results
async def lookup_category(self, results: nres.SearchResults,
conn: SearchConnection, ids: List[int],
category: Tuple[str, str], penalty: float,
details: SearchDetails) -> None:
""" Find places of the given category near the list of
place ids and add the results to 'results'.
"""
table = await conn.get_class_table(*category)
tgeom = conn.t.placex.alias('pgeom')
if table is None:
# No classtype table available, do a simplified lookup in placex.
table = conn.t.placex
sql = sa.select(table.c.place_id,
sa.func.min(tgeom.c.centroid.ST_Distance(table.c.centroid))
.label('dist'))\
.join(tgeom, table.c.geometry.intersects(tgeom.c.centroid.ST_Expand(0.01)))\
.where(table.c.class_ == category[0])\
.where(table.c.type == category[1])
else:
# Use the classtype table. We can afford to use a larger
# radius for the lookup.
sql = sa.select(table.c.place_id,
sa.func.min(tgeom.c.centroid.ST_Distance(table.c.centroid))
.label('dist'))\
.join(tgeom,
table.c.centroid.ST_CoveredBy(
sa.case((sa.and_(tgeom.c.rank_address > 9,
tgeom.c.geometry.is_area()),
tgeom.c.geometry),
else_ = tgeom.c.centroid.ST_Expand(0.05))))
inner = sql.where(tgeom.c.place_id.in_(ids))\
.group_by(table.c.place_id).subquery()
t = conn.t.placex
sql = _select_placex(t).add_columns((-inner.c.dist).label('importance'))\
.join(inner, inner.c.place_id == t.c.place_id)\
.order_by(inner.c.dist)
sql = sql.where(no_index(t.c.rank_address).between(MIN_RANK_PARAM, MAX_RANK_PARAM))
if details.countries:
sql = sql.where(t.c.country_code.in_(COUNTRIES_PARAM))
if details.excluded:
sql = sql.where(_exclude_places(t))
if details.layers is not None:
sql = sql.where(_filter_by_layer(t, details.layers))
sql = sql.limit(LIMIT_PARAM)
for row in await conn.execute(sql, _details_to_bind_params(details)):
result = nres.create_from_placex_row(row, nres.SearchResult)
assert result
result.accuracy = self.penalty + penalty
result.bbox = Bbox.from_wkb(row.bbox)
results.append(result)
class PoiSearch(AbstractSearch):
""" Category search in a geographic area.
"""
def __init__(self, sdata: SearchData) -> None:
super().__init__(sdata.penalty)
self.qualifiers = sdata.qualifiers
self.countries = sdata.countries
async def lookup(self, conn: SearchConnection,
details: SearchDetails) -> nres.SearchResults:
""" Find results for the search in the database.
"""
bind_params = _details_to_bind_params(details)
t = conn.t.placex
rows: List[SaRow] = []
if details.near and details.near_radius is not None and details.near_radius < 0.2:
# simply search in placex table
def _base_query() -> SaSelect:
return _select_placex(t) \
.add_columns((-t.c.centroid.ST_Distance(NEAR_PARAM))
.label('importance'))\
.where(t.c.linked_place_id == None) \
.where(t.c.geometry.within_distance(NEAR_PARAM, NEAR_RADIUS_PARAM)) \
.order_by(t.c.centroid.ST_Distance(NEAR_PARAM)) \
.limit(LIMIT_PARAM)
classtype = self.qualifiers.values
if len(classtype) == 1:
cclass, ctype = classtype[0]
sql: SaLambdaSelect = sa.lambda_stmt(lambda: _base_query()
.where(t.c.class_ == cclass)
.where(t.c.type == ctype))
else:
sql = _base_query().where(sa.or_(*(sa.and_(t.c.class_ == cls, t.c.type == typ)
for cls, typ in classtype)))
if self.countries:
sql = sql.where(t.c.country_code.in_(self.countries.values))
if details.viewbox is not None and details.bounded_viewbox:
sql = sql.where(t.c.geometry.intersects(VIEWBOX_PARAM))
rows.extend(await conn.execute(sql, bind_params))
else:
# use the class type tables
for category in self.qualifiers.values:
table = await conn.get_class_table(*category)
if table is not None:
sql = _select_placex(t)\
.add_columns(t.c.importance)\
.join(table, t.c.place_id == table.c.place_id)\
.where(t.c.class_ == category[0])\
.where(t.c.type == category[1])
if details.viewbox is not None and details.bounded_viewbox:
sql = sql.where(table.c.centroid.intersects(VIEWBOX_PARAM))
if details.near and details.near_radius is not None:
sql = sql.order_by(table.c.centroid.ST_Distance(NEAR_PARAM))\
.where(table.c.centroid.within_distance(NEAR_PARAM,
NEAR_RADIUS_PARAM))
if self.countries:
sql = sql.where(t.c.country_code.in_(self.countries.values))
sql = sql.limit(LIMIT_PARAM)
rows.extend(await conn.execute(sql, bind_params))
results = nres.SearchResults()
for row in rows:
result = nres.create_from_placex_row(row, nres.SearchResult)
assert result
result.accuracy = self.penalty + self.qualifiers.get_penalty((row.class_, row.type))
result.bbox = Bbox.from_wkb(row.bbox)
results.append(result)
return results
class CountrySearch(AbstractSearch):
""" Search for a country name or country code.
"""
SEARCH_PRIO = 0
def __init__(self, sdata: SearchData) -> None:
super().__init__(sdata.penalty)
self.countries = sdata.countries
async def lookup(self, conn: SearchConnection,
details: SearchDetails) -> nres.SearchResults:
""" Find results for the search in the database.
"""
t = conn.t.placex
ccodes = self.countries.values
sql = _select_placex(t)\
.add_columns(t.c.importance)\
.where(t.c.country_code.in_(ccodes))\
.where(t.c.rank_address == 4)
if details.geometry_output:
sql = _add_geometry_columns(sql, t.c.geometry, details)
if details.excluded:
sql = sql.where(_exclude_places(t))
sql = filter_by_area(sql, t, details)
results = nres.SearchResults()
for row in await conn.execute(sql, _details_to_bind_params(details)):
result = nres.create_from_placex_row(row, nres.SearchResult)
assert result
result.accuracy = self.penalty + self.countries.get_penalty(row.country_code, 5.0)
result.bbox = Bbox.from_wkb(row.bbox)
results.append(result)
if not results:
results = await self.lookup_in_country_table(conn, details)
if results:
details.min_rank = min(5, details.max_rank)
details.max_rank = min(25, details.max_rank)
return results
async def lookup_in_country_table(self, conn: SearchConnection,
details: SearchDetails) -> nres.SearchResults:
""" Look up the country in the fallback country tables.
"""
# Avoid the fallback search when this is a 'more results' search
# (i.e. places are excluded). Country results usually appear in the
# first batch of results and these fallbacks cannot be excluded.
if details.excluded:
return nres.SearchResults()
t = conn.t.country_name
tgrid = conn.t.country_grid
sql = sa.select(tgrid.c.country_code,
tgrid.c.geometry.ST_Centroid().ST_Collect().ST_Centroid()
.label('centroid'),
tgrid.c.geometry.ST_Collect().ST_Expand(0).label('bbox'))\
.where(tgrid.c.country_code.in_(self.countries.values))\
.group_by(tgrid.c.country_code)
sql = filter_by_area(sql, tgrid, details, avoid_index=True)
sub = sql.subquery('grid')
sql = sa.select(t.c.country_code,
t.c.name.merge(t.c.derived_name).label('name'),
sub.c.centroid, sub.c.bbox)\
.join(sub, t.c.country_code == sub.c.country_code)
if details.geometry_output:
sql = _add_geometry_columns(sql, sub.c.centroid, details)
results = nres.SearchResults()
for row in await conn.execute(sql, _details_to_bind_params(details)):
result = nres.create_from_country_row(row, nres.SearchResult)
assert result
result.bbox = Bbox.from_wkb(row.bbox)
result.accuracy = self.penalty + self.countries.get_penalty(row.country_code, 5.0)
results.append(result)
return results
class PostcodeSearch(AbstractSearch):
""" Search for a postcode.
"""
def __init__(self, extra_penalty: float, sdata: SearchData) -> None:
super().__init__(sdata.penalty + extra_penalty)
self.countries = sdata.countries
self.postcodes = sdata.postcodes
self.lookups = sdata.lookups
self.rankings = sdata.rankings
async def lookup(self, conn: SearchConnection,
details: SearchDetails) -> nres.SearchResults:
""" Find results for the search in the database.
"""
t = conn.t.postcode
pcs = self.postcodes.values
sql = sa.select(t.c.place_id, t.c.parent_place_id,
t.c.rank_search, t.c.rank_address,
t.c.postcode, t.c.country_code,
t.c.geometry.label('centroid'))\
.where(t.c.postcode.in_(pcs))
if details.geometry_output:
sql = _add_geometry_columns(sql, t.c.geometry, details)
penalty: SaExpression = sa.literal(self.penalty)
if details.viewbox is not None and not details.bounded_viewbox:
penalty += sa.case((t.c.geometry.intersects(VIEWBOX_PARAM), 0.0),
(t.c.geometry.intersects(VIEWBOX2_PARAM), 0.5),
else_=1.0)
if details.near is not None:
sql = sql.order_by(t.c.geometry.ST_Distance(NEAR_PARAM))
sql = filter_by_area(sql, t, details)
if self.countries:
sql = sql.where(t.c.country_code.in_(self.countries.values))
if details.excluded:
sql = sql.where(_exclude_places(t))
if self.lookups:
assert len(self.lookups) == 1
tsearch = conn.t.search_name
sql = sql.where(tsearch.c.place_id == t.c.parent_place_id)\
.where((tsearch.c.name_vector + tsearch.c.nameaddress_vector)
.contains(sa.type_coerce(self.lookups[0].tokens,
IntArray)))
for ranking in self.rankings:
penalty += ranking.sql_penalty(conn.t.search_name)
penalty += sa.case(*((t.c.postcode == v, p) for v, p in self.postcodes),
else_=1.0)
sql = sql.add_columns(penalty.label('accuracy'))
sql = sql.order_by('accuracy').limit(LIMIT_PARAM)
results = nres.SearchResults()
for row in await conn.execute(sql, _details_to_bind_params(details)):
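# Prefer a real postal_code boundary from placex when one exists for the
# postcode; otherwise build the result from the postcode table row.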
p = conn.t.placex
placex_sql = _select_placex(p).add_columns(p.c.importance)\
.where(sa.text("""class = 'boundary'
AND type = 'postal_code'
AND osm_type = 'R'"""))\
.where(p.c.country_code == row.country_code)\
.where(p.c.postcode == row.postcode)\
.limit(1)
if details.geometry_output:
placex_sql = _add_geometry_columns(placex_sql, p.c.geometry, details)
for prow in await conn.execute(placex_sql, _details_to_bind_params(details)):
result = nres.create_from_placex_row(prow, nres.SearchResult)
break
else:
result = nres.create_from_postcode_row(row, nres.SearchResult)
assert result
if result.place_id not in details.excluded:
result.accuracy = row.accuracy
results.append(result)
return results
class PlaceSearch(AbstractSearch):
""" Generic search for an address or named place.
"""
SEARCH_PRIO = 1
def __init__(self, extra_penalty: float, sdata: SearchData, expected_count: int) -> None:
super().__init__(sdata.penalty + extra_penalty)
self.countries = sdata.countries
self.postcodes = sdata.postcodes
self.housenumbers = sdata.housenumbers
self.qualifiers = sdata.qualifiers
self.lookups = sdata.lookups
self.rankings = sdata.rankings
self.expected_count = expected_count
def _inner_search_name_cte(self, conn: SearchConnection,
details: SearchDetails) -> 'sa.CTE':
""" Create a subquery that preselects the rows in the search_name
table.
"""
t = conn.t.search_name
penalty: SaExpression = sa.literal(self.penalty)
for ranking in self.rankings:
penalty += ranking.sql_penalty(t)
sql = sa.select(t.c.place_id, t.c.search_rank, t.c.address_rank,
t.c.country_code, t.c.centroid,
t.c.name_vector, t.c.nameaddress_vector,
sa.case((t.c.importance > 0, t.c.importance),
else_=0.40001-(sa.cast(t.c.search_rank, sa.Float())/75))
.label('importance'),
penalty.label('penalty'))
for lookup in self.lookups:
sql = sql.where(lookup.sql_condition(t))
if self.countries:
sql = sql.where(t.c.country_code.in_(self.countries.values))
if self.postcodes:
# if a postcode is given, don't search for state or country level objects
sql = sql.where(t.c.address_rank > 9)
if self.expected_count > 10000:
# Many results expected. Restrict by postcode.
tpc = conn.t.postcode
sql = sql.where(sa.select(tpc.c.postcode)
.where(tpc.c.postcode.in_(self.postcodes.values))
.where(t.c.centroid.within_distance(tpc.c.geometry, 0.4))
.exists())
if details.viewbox is not None:
if details.bounded_viewbox:
sql = sql.where(t.c.centroid
.intersects(VIEWBOX_PARAM,
use_index=details.viewbox.area < 0.2))
elif not self.postcodes and not self.housenumbers and self.expected_count >= 10000:
sql = sql.where(t.c.centroid
.intersects(VIEWBOX2_PARAM,
use_index=details.viewbox.area < 0.5))
if details.near is not None and details.near_radius is not None:
if details.near_radius < 0.1:
sql = sql.where(t.c.centroid.within_distance(NEAR_PARAM,
NEAR_RADIUS_PARAM))
else:
sql = sql.where(t.c.centroid
.ST_Distance(NEAR_PARAM) < NEAR_RADIUS_PARAM)
if self.housenumbers:
sql = sql.where(t.c.address_rank.between(16, 30))
else:
if details.excluded:
sql = sql.where(_exclude_places(t))
if details.min_rank > 0:
sql = sql.where(sa.or_(t.c.address_rank >= MIN_RANK_PARAM,
t.c.search_rank >= MIN_RANK_PARAM))
if details.max_rank < 30:
sql = sql.where(sa.or_(t.c.address_rank <= MAX_RANK_PARAM,
t.c.search_rank <= MAX_RANK_PARAM))
inner = sql.limit(10000).order_by(sa.desc(sa.text('importance'))).subquery()
sql = sa.select(inner.c.place_id, inner.c.search_rank, inner.c.address_rank,
inner.c.country_code, inner.c.centroid, inner.c.importance,
inner.c.penalty)
# If the query is not an address search or has a geographic preference,
# preselect most important items to restrict the number of places
# that need to be looked up in placex.
if not self.housenumbers\
and (details.viewbox is None or details.bounded_viewbox)\
and (details.near is None or details.near_radius is not None)\
and not self.qualifiers:
sql = sql.add_columns(sa.func.first_value(inner.c.penalty - inner.c.importance)
.over(order_by=inner.c.penalty - inner.c.importance)
.label('min_penalty'))
inner = sql.subquery()
sql = sa.select(inner.c.place_id, inner.c.search_rank, inner.c.address_rank,
inner.c.country_code, inner.c.centroid, inner.c.importance,
inner.c.penalty)\
.where(inner.c.penalty - inner.c.importance < inner.c.min_penalty + 0.5)
return sql.cte('searches')
async def lookup(self, conn: SearchConnection,
details: SearchDetails) -> nres.SearchResults:
""" Find results for the search in the database.
"""
t = conn.t.placex
tsearch = self._inner_search_name_cte(conn, details)
sql = _select_placex(t).join(tsearch, t.c.place_id == tsearch.c.place_id)
if details.geometry_output:
sql = _add_geometry_columns(sql, t.c.geometry, details)
penalty: SaExpression = tsearch.c.penalty
if self.postcodes:
tpc = conn.t.postcode
pcs = self.postcodes.values
pc_near = sa.select(sa.func.min(tpc.c.geometry.ST_Distance(t.c.centroid)))\
.where(tpc.c.postcode.in_(pcs))\
.scalar_subquery()
penalty += sa.case((t.c.postcode.in_(pcs), 0.0),
else_=sa.func.coalesce(pc_near, cast(SaColumn, 2.0)))
if details.viewbox is not None and not details.bounded_viewbox:
penalty += sa.case((t.c.geometry.intersects(VIEWBOX_PARAM, use_index=False), 0.0),
(t.c.geometry.intersects(VIEWBOX2_PARAM, use_index=False), 0.5),
else_=1.0)
if details.near is not None:
sql = sql.add_columns((-tsearch.c.centroid.ST_Distance(NEAR_PARAM))
.label('importance'))
sql = sql.order_by(sa.desc(sa.text('importance')))
else:
sql = sql.order_by(penalty - tsearch.c.importance)
sql = sql.add_columns(tsearch.c.importance)
sql = sql.add_columns(penalty.label('accuracy'))\
.order_by(sa.text('accuracy'))
if self.housenumbers:
hnr_list = '|'.join(self.housenumbers.values)
inner = sql.where(sa.or_(tsearch.c.address_rank < 30,
sa.func.RegexpWord(hnr_list, t.c.housenumber)))\
.subquery()
# Housenumbers from placex
thnr = conn.t.placex.alias('hnr')
pid_list = sa.func.ArrayAgg(thnr.c.place_id)
place_sql = sa.select(pid_list)\
.where(thnr.c.parent_place_id == inner.c.place_id)\
.where(sa.func.RegexpWord(hnr_list, thnr.c.housenumber))\
.where(thnr.c.linked_place_id == None)\
.where(thnr.c.indexed_status == 0)
if details.excluded:
place_sql = place_sql.where(thnr.c.place_id.not_in(sa.bindparam('excluded')))
if self.qualifiers:
place_sql = place_sql.where(self.qualifiers.sql_restrict(thnr))
numerals = [int(n) for n in self.housenumbers.values
if n.isdigit() and len(n) < 8]
interpol_sql: SaColumn
tiger_sql: SaColumn
if numerals and \
(not self.qualifiers or ('place', 'house') in self.qualifiers.values):
# Housenumbers from interpolations
interpol_sql = _make_interpolation_subquery(conn.t.osmline, inner,
numerals, details)
# Housenumbers from Tiger
tiger_sql = sa.case((inner.c.country_code == 'us',
_make_interpolation_subquery(conn.t.tiger, inner,
numerals, details)
), else_=None)
else:
interpol_sql = sa.null()
tiger_sql = sa.null()
unsort = sa.select(inner, place_sql.scalar_subquery().label('placex_hnr'),
interpol_sql.label('interpol_hnr'),
tiger_sql.label('tiger_hnr')).subquery('unsort')
sql = sa.select(unsort)\
.order_by(sa.case((unsort.c.placex_hnr != None, 1),
(unsort.c.interpol_hnr != None, 2),
(unsort.c.tiger_hnr != None, 3),
else_=4),
unsort.c.accuracy)
else:
sql = sql.where(t.c.linked_place_id == None)\
.where(t.c.indexed_status == 0)
if self.qualifiers:
sql = sql.where(self.qualifiers.sql_restrict(t))
if details.layers is not None:
sql = sql.where(_filter_by_layer(t, details.layers))
sql = sql.limit(LIMIT_PARAM)
results = nres.SearchResults()
for row in await conn.execute(sql, _details_to_bind_params(details)):
result = nres.create_from_placex_row(row, nres.SearchResult)
assert result
result.bbox = Bbox.from_wkb(row.bbox)
result.accuracy = row.accuracy
if self.housenumbers and row.rank_address < 30:
if row.placex_hnr:
subs = _get_placex_housenumbers(conn, row.placex_hnr, details)
elif row.interpol_hnr:
subs = _get_osmline(conn, row.interpol_hnr, numerals, details)
elif row.tiger_hnr:
subs = _get_tiger(conn, row.tiger_hnr, numerals, row.osm_id, details)
else:
subs = None
if subs is not None:
async for sub in subs:
assert sub.housenumber
sub.accuracy = result.accuracy
if not any(nr in self.housenumbers.values
for nr in sub.housenumber.split(';')):
sub.accuracy += 0.6
results.append(sub)
# Only add the street as a result if it meets all other
# filter conditions.
if (not details.excluded or result.place_id not in details.excluded)\
and (not self.qualifiers or result.category in self.qualifiers.values)\
and result.rank_address >= details.min_rank:
result.accuracy += 1.0 # penalty for missing housenumber
results.append(result)
else:
results.append(result)
return results

View File

@@ -0,0 +1,274 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Public interface to the search code.
"""
from typing import List, Any, Optional, Iterator, Tuple, Dict
import itertools
import re
import datetime as dt
import difflib
from ..connection import SearchConnection
from ..types import SearchDetails
from ..results import SearchResult, SearchResults, add_result_details
from ..logging import log
from .token_assignment import yield_token_assignments
from .db_search_builder import SearchBuilder, build_poi_search, wrap_near_search
from .db_searches import AbstractSearch
from .query_analyzer_factory import make_query_analyzer, AbstractQueryAnalyzer
from .query import Phrase, QueryStruct
class ForwardGeocoder:
""" Main class responsible for place search.
"""
def __init__(self, conn: SearchConnection,
params: SearchDetails, timeout: Optional[int]) -> None:
self.conn = conn
self.params = params
self.timeout = dt.timedelta(seconds=timeout or 1000000)
self.query_analyzer: Optional[AbstractQueryAnalyzer] = None
@property
def limit(self) -> int:
""" Return the configured maximum number of search results.
"""
return self.params.max_results
async def build_searches(self,
phrases: List[Phrase]) -> Tuple[QueryStruct, List[AbstractSearch]]:
""" Analyse the query and return the tokenized query and list of
possible searches over it.
"""
if self.query_analyzer is None:
self.query_analyzer = await make_query_analyzer(self.conn)
query = await self.query_analyzer.analyze_query(phrases)
searches: List[AbstractSearch] = []
if query.num_token_slots() > 0:
# 2. Compute all possible search interpretations
log().section('Compute abstract searches')
search_builder = SearchBuilder(query, self.params)
num_searches = 0
for assignment in yield_token_assignments(query):
searches.extend(search_builder.build(assignment))
if num_searches < len(searches):
log().table_dump('Searches for assignment',
_dump_searches(searches, query, num_searches))
num_searches = len(searches)
searches.sort(key=lambda s: (s.penalty, s.SEARCH_PRIO))
return query, searches
async def execute_searches(self, query: QueryStruct,
searches: List[AbstractSearch]) -> SearchResults:
""" Run the abstract searches against the database until a result
is found.
"""
log().section('Execute database searches')
results: Dict[Any, SearchResult] = {}
end_time = dt.datetime.now() + self.timeout
min_ranking = searches[0].penalty + 2.0
prev_penalty = 0.0
for i, search in enumerate(searches):
if search.penalty > prev_penalty and (search.penalty > min_ranking or i > 20):
break
log().table_dump(f"{i + 1}. Search", _dump_searches([search], query))
log().var_dump('Params', self.params)
lookup_results = await search.lookup(self.conn, self.params)
for result in lookup_results:
rhash = (result.source_table, result.place_id,
result.housenumber, result.country_code)
prevresult = results.get(rhash)
if prevresult:
prevresult.accuracy = min(prevresult.accuracy, result.accuracy)
else:
results[rhash] = result
min_ranking = min(min_ranking, result.accuracy * 1.2, 2.0)
log().result_dump('Results', ((r.accuracy, r) for r in lookup_results))
prev_penalty = search.penalty
if dt.datetime.now() >= end_time:
break
return SearchResults(results.values())
def pre_filter_results(self, results: SearchResults) -> SearchResults:
""" Remove results that are significantly worse than the
best match.
"""
if results:
max_ranking = min(r.ranking for r in results) + 0.5
results = SearchResults(r for r in results if r.ranking < max_ranking)
return results
def sort_and_cut_results(self, results: SearchResults) -> SearchResults:
""" Remove badly matching results, sort by ranking and
limit to the configured number of results.
"""
if results:
results.sort(key=lambda r: r.ranking)
min_rank = results[0].rank_search
min_ranking = results[0].ranking
results = SearchResults(r for r in results
if r.ranking + 0.03 * (r.rank_search - min_rank)
< min_ranking + 0.5)
results = SearchResults(results[:self.limit])
return results
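# Worked example for the rank-based cutoff above (illustrative numbers only):
# with min_ranking = 0.2 and min_rank = 26, a result with rank_search = 30 is
# kept only if its ranking stays below 0.2 + 0.5 - 0.03 * (30 - 26) = 0.58.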
def rerank_by_query(self, query: QueryStruct, results: SearchResults) -> None:
""" Adjust the accuracy of the localized result according to how well
they match the original query.
"""
assert self.query_analyzer is not None
qwords = [word for phrase in query.source
for word in re.split('[, ]+', phrase.text) if word]
if not qwords:
return
for result in results:
# Negative importance indicates ordering by distance, which is
# more important than word matching.
if not result.display_name\
or (result.importance is not None and result.importance < 0):
continue
distance = 0.0
norm = self.query_analyzer.normalize_text(' '.join((result.display_name,
result.country_code or '')))
words = set((w for w in norm.split(' ') if w))
if not words:
continue
for qword in qwords:
wdist = max(difflib.SequenceMatcher(a=qword, b=w).quick_ratio() for w in words)
if wdist < 0.5:
distance += len(qword)
else:
distance += (1.0 - wdist) * len(qword)
# Compensate for the fact that country names do not get a
# match penalty yet by the tokenizer.
# Temporary hack that needs to be removed!
if result.rank_address == 4:
distance *= 2
result.accuracy += distance * 0.4 / sum(len(w) for w in qwords)
async def lookup_pois(self, categories: List[Tuple[str, str]],
phrases: List[Phrase]) -> SearchResults:
""" Look up places by category. If phrase is given, a place search
over the phrase will be executed first and places close to the
results returned.
"""
log().function('forward_lookup_pois', categories=categories, params=self.params)
if phrases:
query, searches = await self.build_searches(phrases)
if query:
searches = [wrap_near_search(categories, s) for s in searches[:50]]
results = await self.execute_searches(query, searches)
results = self.pre_filter_results(results)
await add_result_details(self.conn, results, self.params)
log().result_dump('Preliminary Results', ((r.accuracy, r) for r in results))
results = self.sort_and_cut_results(results)
else:
results = SearchResults()
else:
search = build_poi_search(categories, self.params.countries)
results = await search.lookup(self.conn, self.params)
await add_result_details(self.conn, results, self.params)
log().result_dump('Final Results', ((r.accuracy, r) for r in results))
return results
async def lookup(self, phrases: List[Phrase]) -> SearchResults:
""" Look up a single free-text query.
"""
log().function('forward_lookup', phrases=phrases, params=self.params)
results = SearchResults()
if self.params.is_impossible():
return results
query, searches = await self.build_searches(phrases)
if searches:
# Execute SQL until an appropriate result is found.
results = await self.execute_searches(query, searches[:50])
results = self.pre_filter_results(results)
await add_result_details(self.conn, results, self.params)
log().result_dump('Preliminary Results', ((r.accuracy, r) for r in results))
self.rerank_by_query(query, results)
log().result_dump('Results after reranking', ((r.accuracy, r) for r in results))
results = self.sort_and_cut_results(results)
log().result_dump('Final Results', ((r.accuracy, r) for r in results))
return results
# pylint: disable=invalid-name,too-many-locals
def _dump_searches(searches: List[AbstractSearch], query: QueryStruct,
start: int = 0) -> Iterator[Optional[List[Any]]]:
yield ['Penalty', 'Lookups', 'Housenr', 'Postcode', 'Countries',
'Qualifier', 'Category', 'Rankings']
def tk(tl: List[int]) -> str:
tstr = [f"{query.find_lookup_word_by_id(t)}({t})" for t in tl]
return f"[{','.join(tstr)}]"
def fmt_ranking(f: Any) -> str:
if not f:
return ''
ranks = ','.join((f"{tk(r.tokens)}^{r.penalty:.3g}" for r in f.rankings))
if len(ranks) > 100:
ranks = ranks[:100] + '...'
return f"{f.column}({ranks},def={f.default:.3g})"
def fmt_lookup(l: Any) -> str:
if not l:
return ''
return f"{l.lookup_type}({l.column}{tk(l.tokens)})"
def fmt_cstr(c: Any) -> str:
if not c:
return ''
return f'{c[0]}^{c[1]}'
for search in searches[start:]:
fields = ('lookups', 'rankings', 'countries', 'housenumbers',
'postcodes', 'qualifiers')
if hasattr(search, 'search'):
iters = itertools.zip_longest([f"{search.penalty:.3g}"],
*(getattr(search.search, attr, []) for attr in fields),
getattr(search, 'categories', []),
fillvalue='')
else:
iters = itertools.zip_longest([f"{search.penalty:.3g}"],
*(getattr(search, attr, []) for attr in fields),
[],
fillvalue='')
for penalty, lookup, rank, cc, hnr, pc, qual, cat in iters:
yield [penalty, fmt_lookup(lookup), fmt_cstr(hnr),
fmt_cstr(pc), fmt_cstr(cc), fmt_cstr(qual), fmt_cstr(cat), fmt_ranking(rank)]
yield None
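# Rough usage sketch for the geocoder above (assumes an open SearchConnection
# 'conn', prepared SearchDetails 'params' and a PhraseType from the query
# module; the query text is purely illustrative):
#   geocoder = ForwardGeocoder(conn, params, timeout=30)
#   results = await geocoder.lookup([Phrase(PhraseType.NONE, 'Birkenweg 5, Bonn')])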

View File

@@ -0,0 +1,314 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Implementation of query analysis for the ICU tokenizer.
"""
from typing import Tuple, Dict, List, Optional, NamedTuple, Iterator, Any, cast
from collections import defaultdict
import dataclasses
import difflib
from icu import Transliterator
import sqlalchemy as sa
from nominatim_core.typing import SaRow
from nominatim_core.db.sqlalchemy_types import Json
from ..connection import SearchConnection
from ..logging import log
from ..search import query as qmod
from ..search.query_analyzer_factory import AbstractQueryAnalyzer
DB_TO_TOKEN_TYPE = {
'W': qmod.TokenType.WORD,
'w': qmod.TokenType.PARTIAL,
'H': qmod.TokenType.HOUSENUMBER,
'P': qmod.TokenType.POSTCODE,
'C': qmod.TokenType.COUNTRY
}
class QueryPart(NamedTuple):
""" Normalized and transliterated form of a single term in the query.
When the term came out of a split during the transliteration,
the normalized string is the full word before transliteration.
The word number keeps track of the word before transliteration
and can be used to identify partially transliterated terms.
"""
token: str
normalized: str
word_number: int
QueryParts = List[QueryPart]
WordDict = Dict[str, List[qmod.TokenRange]]
def yield_words(terms: List[QueryPart], start: int) -> Iterator[Tuple[str, qmod.TokenRange]]:
""" Return all combinations of words in the terms list after the
given position.
"""
total = len(terms)
for first in range(start, total):
word = terms[first].token
yield word, qmod.TokenRange(first, first + 1)
for last in range(first + 1, min(first + 20, total)):
word = ' '.join((word, terms[last].token))
yield word, qmod.TokenRange(first, last + 1)
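# Illustration: for parts with the tokens ['main', 'st', '12'] and start=0 this
# yields ('main', [0,1)), ('main st', [0,2)), ('main st 12', [0,3)),
# ('st', [1,2)), ('st 12', [1,3)) and ('12', [2,3)).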
@dataclasses.dataclass
class ICUToken(qmod.Token):
""" Specialised token for ICU tokenizer.
"""
word_token: str
info: Optional[Dict[str, Any]]
def get_category(self) -> Tuple[str, str]:
assert self.info
return self.info.get('class', ''), self.info.get('type', '')
def rematch(self, norm: str) -> None:
""" Check how well the token matches the given normalized string
and add a penalty, if necessary.
"""
if not self.lookup_word:
return
seq = difflib.SequenceMatcher(a=self.lookup_word, b=norm)
distance = 0
for tag, afrom, ato, bfrom, bto in seq.get_opcodes():
if tag in ('delete', 'insert') and (afrom == 0 or ato == len(self.lookup_word)):
distance += 1
elif tag == 'replace':
distance += max((ato-afrom), (bto-bfrom))
elif tag != 'equal':
distance += abs((ato-afrom) - (bto-bfrom))
self.penalty += (distance/len(self.lookup_word))
@staticmethod
def from_db_row(row: SaRow) -> 'ICUToken':
""" Create a ICUToken from the row of the word table.
"""
count = 1 if row.info is None else row.info.get('count', 1)
addr_count = 1 if row.info is None else row.info.get('addr_count', 1)
penalty = 0.0
if row.type == 'w':
penalty = 0.3
elif row.type == 'W':
if len(row.word_token) == 1 and row.word_token == row.word:
penalty = 0.2 if row.word.isdigit() else 0.3
elif row.type == 'H':
penalty = sum(0.1 for c in row.word_token if c != ' ' and not c.isdigit())
if all(not c.isdigit() for c in row.word_token):
penalty += 0.2 * (len(row.word_token) - 1)
elif row.type == 'C':
if len(row.word_token) == 1:
penalty = 0.3
if row.info is None:
lookup_word = row.word
else:
lookup_word = row.info.get('lookup', row.word)
if lookup_word:
lookup_word = lookup_word.split('@', 1)[0]
else:
lookup_word = row.word_token
return ICUToken(penalty=penalty, token=row.word_id, count=max(1, count),
lookup_word=lookup_word, is_indexed=True,
word_token=row.word_token, info=row.info,
addr_count=max(1, addr_count))
class ICUQueryAnalyzer(AbstractQueryAnalyzer):
""" Converter for query strings into a tokenized query
using the tokens created by an ICU tokenizer.
"""
def __init__(self, conn: SearchConnection) -> None:
self.conn = conn
async def setup(self) -> None:
""" Set up static data structures needed for the analysis.
"""
async def _make_normalizer() -> Any:
rules = await self.conn.get_property('tokenizer_import_normalisation')
return Transliterator.createFromRules("normalization", rules)
self.normalizer = await self.conn.get_cached_value('ICUTOK', 'normalizer',
_make_normalizer)
async def _make_transliterator() -> Any:
rules = await self.conn.get_property('tokenizer_import_transliteration')
return Transliterator.createFromRules("transliteration", rules)
self.transliterator = await self.conn.get_cached_value('ICUTOK', 'transliterator',
_make_transliterator)
if 'word' not in self.conn.t.meta.tables:
sa.Table('word', self.conn.t.meta,
sa.Column('word_id', sa.Integer),
sa.Column('word_token', sa.Text, nullable=False),
sa.Column('type', sa.Text, nullable=False),
sa.Column('word', sa.Text),
sa.Column('info', Json))
async def analyze_query(self, phrases: List[qmod.Phrase]) -> qmod.QueryStruct:
""" Analyze the given list of phrases and return the
tokenized query.
"""
log().section('Analyze query (using ICU tokenizer)')
normalized = list(filter(lambda p: p.text,
(qmod.Phrase(p.ptype, self.normalize_text(p.text))
for p in phrases)))
query = qmod.QueryStruct(normalized)
log().var_dump('Normalized query', query.source)
if not query.source:
return query
parts, words = self.split_query(query)
log().var_dump('Transliterated query', lambda: _dump_transliterated(query, parts))
for row in await self.lookup_in_db(list(words.keys())):
for trange in words[row.word_token]:
token = ICUToken.from_db_row(row)
if row.type == 'S':
if row.info['op'] in ('in', 'near'):
if trange.start == 0:
query.add_token(trange, qmod.TokenType.NEAR_ITEM, token)
else:
if trange.start == 0 and trange.end == query.num_token_slots():
query.add_token(trange, qmod.TokenType.NEAR_ITEM, token)
else:
query.add_token(trange, qmod.TokenType.QUALIFIER, token)
else:
query.add_token(trange, DB_TO_TOKEN_TYPE[row.type], token)
self.add_extra_tokens(query, parts)
self.rerank_tokens(query, parts)
log().table_dump('Word tokens', _dump_word_tokens(query))
return query
def normalize_text(self, text: str) -> str:
""" Bring the given text into a normalized form. That is the
standardized form search will work with. All information removed
at this stage is inevitably lost.
"""
return cast(str, self.normalizer.transliterate(text))
def split_query(self, query: qmod.QueryStruct) -> Tuple[QueryParts, WordDict]:
""" Transliterate the phrases and split them into tokens.
Returns the list of transliterated tokens together with their
normalized form and a dictionary of words for lookup together
with their position.
"""
parts: QueryParts = []
phrase_start = 0
words = defaultdict(list)
wordnr = 0
for phrase in query.source:
query.nodes[-1].ptype = phrase.ptype
for word in phrase.text.split(' '):
trans = self.transliterator.transliterate(word)
if trans:
for term in trans.split(' '):
if term:
parts.append(QueryPart(term, word, wordnr))
query.add_node(qmod.BreakType.TOKEN, phrase.ptype)
query.nodes[-1].btype = qmod.BreakType.WORD
wordnr += 1
query.nodes[-1].btype = qmod.BreakType.PHRASE
for word, wrange in yield_words(parts, phrase_start):
words[word].append(wrange)
phrase_start = len(parts)
query.nodes[-1].btype = qmod.BreakType.END
return parts, words
async def lookup_in_db(self, words: List[str]) -> 'sa.Result[Any]':
""" Return the token information from the database for the
given word tokens.
"""
t = self.conn.t.meta.tables['word']
return await self.conn.execute(t.select().where(t.c.word_token.in_(words)))
def add_extra_tokens(self, query: qmod.QueryStruct, parts: QueryParts) -> None:
""" Add tokens to query that are not saved in the database.
"""
for part, node, i in zip(parts, query.nodes, range(1000)):
if len(part.token) <= 4 and part[0].isdigit()\
and not node.has_tokens(i+1, qmod.TokenType.HOUSENUMBER):
query.add_token(qmod.TokenRange(i, i+1), qmod.TokenType.HOUSENUMBER,
ICUToken(0.5, 0, 1, 1, part.token, True, part.token, None))
def rerank_tokens(self, query: qmod.QueryStruct, parts: QueryParts) -> None:
""" Add penalties to tokens that depend on presence of other token.
"""
for i, node, tlist in query.iter_token_lists():
if tlist.ttype == qmod.TokenType.POSTCODE:
for repl in node.starting:
if repl.end == tlist.end and repl.ttype != qmod.TokenType.POSTCODE \
and (repl.ttype != qmod.TokenType.HOUSENUMBER
or len(tlist.tokens[0].lookup_word) > 4):
repl.add_penalty(0.39)
elif tlist.ttype == qmod.TokenType.HOUSENUMBER \
and len(tlist.tokens[0].lookup_word) <= 3:
if any(c.isdigit() for c in tlist.tokens[0].lookup_word):
for repl in node.starting:
if repl.end == tlist.end and repl.ttype != qmod.TokenType.HOUSENUMBER:
repl.add_penalty(0.5 - tlist.tokens[0].penalty)
elif tlist.ttype not in (qmod.TokenType.COUNTRY, qmod.TokenType.PARTIAL):
norm = parts[i].normalized
for j in range(i + 1, tlist.end):
if parts[j - 1].word_number != parts[j].word_number:
norm += ' ' + parts[j].normalized
for token in tlist.tokens:
cast(ICUToken, token).rematch(norm)
def _dump_transliterated(query: qmod.QueryStruct, parts: QueryParts) -> str:
out = query.nodes[0].btype.value
for node, part in zip(query.nodes[1:], parts):
out += part.token + node.btype.value
return out
def _dump_word_tokens(query: qmod.QueryStruct) -> Iterator[List[Any]]:
yield ['type', 'token', 'word_token', 'lookup_word', 'penalty', 'count', 'info']
for node in query.nodes:
for tlist in node.starting:
for token in tlist.tokens:
t = cast(ICUToken, token)
yield [tlist.ttype.name, t.token, t.word_token or '',
t.lookup_word or '', t.penalty, t.count, t.info]
async def create_query_analyzer(conn: SearchConnection) -> AbstractQueryAnalyzer:
""" Create and set up a new query analyzer for a database based
on the ICU tokenizer.
"""
out = ICUQueryAnalyzer(conn)
await out.setup()
return out

View File

@@ -0,0 +1,272 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Implementation of query analysis for the legacy tokenizer.
"""
from typing import Tuple, Dict, List, Optional, Iterator, Any, cast
from copy import copy
from collections import defaultdict
import dataclasses
import sqlalchemy as sa
from nominatim_core.typing import SaRow
from ..connection import SearchConnection
from ..logging import log
from . import query as qmod
from .query_analyzer_factory import AbstractQueryAnalyzer
def yield_words(terms: List[str], start: int) -> Iterator[Tuple[str, qmod.TokenRange]]:
""" Return all combinations of words in the terms list after the
given position.
"""
total = len(terms)
for first in range(start, total):
word = terms[first]
yield word, qmod.TokenRange(first, first + 1)
for last in range(first + 1, min(first + 20, total)):
word = ' '.join((word, terms[last]))
yield word, qmod.TokenRange(first, last + 1)
@dataclasses.dataclass
class LegacyToken(qmod.Token):
""" Specialised token for legacy tokenizer.
"""
word_token: str
category: Optional[Tuple[str, str]]
country: Optional[str]
operator: Optional[str]
@property
def info(self) -> Dict[str, Any]:
""" Dictionary of additional properties of the token.
Should only be used for debugging purposes.
"""
return {'category': self.category,
'country': self.country,
'operator': self.operator}
def get_category(self) -> Tuple[str, str]:
assert self.category
return self.category
class LegacyQueryAnalyzer(AbstractQueryAnalyzer):
""" Converter for query strings into a tokenized query
using the tokens created by a legacy tokenizer.
"""
def __init__(self, conn: SearchConnection) -> None:
self.conn = conn
async def setup(self) -> None:
""" Set up static data structures needed for the analysis.
"""
self.max_word_freq = int(await self.conn.get_property('tokenizer_maxwordfreq'))
if 'word' not in self.conn.t.meta.tables:
sa.Table('word', self.conn.t.meta,
sa.Column('word_id', sa.Integer),
sa.Column('word_token', sa.Text, nullable=False),
sa.Column('word', sa.Text),
sa.Column('class', sa.Text),
sa.Column('type', sa.Text),
sa.Column('country_code', sa.Text),
sa.Column('search_name_count', sa.Integer),
sa.Column('operator', sa.Text))
async def analyze_query(self, phrases: List[qmod.Phrase]) -> qmod.QueryStruct:
""" Analyze the given list of phrases and return the
tokenized query.
"""
log().section('Analyze query (using Legacy tokenizer)')
normalized = []
if phrases:
for row in await self.conn.execute(sa.select(*(sa.func.make_standard_name(p.text)
for p in phrases))):
normalized = [qmod.Phrase(p.ptype, r) for r, p in zip(row, phrases) if r]
break
query = qmod.QueryStruct(normalized)
log().var_dump('Normalized query', query.source)
if not query.source:
return query
parts, words = self.split_query(query)
lookup_words = list(words.keys())
log().var_dump('Split query', parts)
log().var_dump('Extracted words', lookup_words)
for row in await self.lookup_in_db(lookup_words):
for trange in words[row.word_token.strip()]:
token, ttype = self.make_token(row)
if ttype == qmod.TokenType.NEAR_ITEM:
if trange.start == 0:
query.add_token(trange, qmod.TokenType.NEAR_ITEM, token)
elif ttype == qmod.TokenType.QUALIFIER:
query.add_token(trange, qmod.TokenType.QUALIFIER, token)
if trange.start == 0 or trange.end == query.num_token_slots():
token = copy(token)
token.penalty += 0.1 * (query.num_token_slots())
query.add_token(trange, qmod.TokenType.NEAR_ITEM, token)
elif ttype != qmod.TokenType.PARTIAL or trange.start + 1 == trange.end:
query.add_token(trange, ttype, token)
self.add_extra_tokens(query, parts)
self.rerank_tokens(query)
log().table_dump('Word tokens', _dump_word_tokens(query))
return query
def normalize_text(self, text: str) -> str:
""" Bring the given text into a normalized form.
This only removes case, so some difference with the normalization
in the phrase remains.
"""
return text.lower()
def split_query(self, query: qmod.QueryStruct) -> Tuple[List[str],
Dict[str, List[qmod.TokenRange]]]:
""" Transliterate the phrases and split them into tokens.
Returns a list of transliterated tokens and a dictionary
of words for lookup together with their position.
"""
parts: List[str] = []
phrase_start = 0
words = defaultdict(list)
for phrase in query.source:
query.nodes[-1].ptype = phrase.ptype
for trans in phrase.text.split(' '):
if trans:
for term in trans.split(' '):
if term:
parts.append(trans)
query.add_node(qmod.BreakType.TOKEN, phrase.ptype)
query.nodes[-1].btype = qmod.BreakType.WORD
query.nodes[-1].btype = qmod.BreakType.PHRASE
for word, wrange in yield_words(parts, phrase_start):
words[word].append(wrange)
phrase_start = len(parts)
query.nodes[-1].btype = qmod.BreakType.END
return parts, words
async def lookup_in_db(self, words: List[str]) -> 'sa.Result[Any]':
""" Return the token information from the database for the
given word tokens.
"""
t = self.conn.t.meta.tables['word']
sql = t.select().where(t.c.word_token.in_(words + [' ' + w for w in words]))
return await self.conn.execute(sql)
def make_token(self, row: SaRow) -> Tuple[LegacyToken, qmod.TokenType]:
""" Create a LegacyToken from the row of the word table.
Also determines the type of token.
"""
penalty = 0.0
is_indexed = True
rowclass = getattr(row, 'class')
if row.country_code is not None:
ttype = qmod.TokenType.COUNTRY
lookup_word = row.country_code
elif rowclass is not None:
if rowclass == 'place' and row.type == 'house':
ttype = qmod.TokenType.HOUSENUMBER
lookup_word = row.word_token[1:]
elif rowclass == 'place' and row.type == 'postcode':
ttype = qmod.TokenType.POSTCODE
lookup_word = row.word_token[1:]
else:
ttype = qmod.TokenType.NEAR_ITEM if row.operator in ('in', 'near')\
else qmod.TokenType.QUALIFIER
lookup_word = row.word
elif row.word_token.startswith(' '):
ttype = qmod.TokenType.WORD
lookup_word = row.word or row.word_token[1:]
else:
ttype = qmod.TokenType.PARTIAL
lookup_word = row.word_token
penalty = 0.21
if row.search_name_count > self.max_word_freq:
is_indexed = False
return LegacyToken(penalty=penalty, token=row.word_id,
count=max(1, row.search_name_count or 1),
addr_count=1, # not supported
lookup_word=lookup_word,
word_token=row.word_token.strip(),
category=(rowclass, row.type) if rowclass is not None else None,
country=row.country_code,
operator=row.operator,
is_indexed=is_indexed),\
ttype
def add_extra_tokens(self, query: qmod.QueryStruct, parts: List[str]) -> None:
""" Add tokens to query that are not saved in the database.
"""
for part, node, i in zip(parts, query.nodes, range(1000)):
if len(part) <= 4 and part.isdigit()\
and not node.has_tokens(i+1, qmod.TokenType.HOUSENUMBER):
query.add_token(qmod.TokenRange(i, i+1), qmod.TokenType.HOUSENUMBER,
LegacyToken(penalty=0.5, token=0, count=1, addr_count=1,
lookup_word=part, word_token=part,
category=None, country=None,
operator=None, is_indexed=True))
def rerank_tokens(self, query: qmod.QueryStruct) -> None:
""" Add penalties to tokens that depend on presence of other token.
"""
for _, node, tlist in query.iter_token_lists():
if tlist.ttype == qmod.TokenType.POSTCODE:
for repl in node.starting:
if repl.end == tlist.end and repl.ttype != qmod.TokenType.POSTCODE \
and (repl.ttype != qmod.TokenType.HOUSENUMBER
or len(tlist.tokens[0].lookup_word) > 4):
repl.add_penalty(0.39)
elif tlist.ttype == qmod.TokenType.HOUSENUMBER \
and len(tlist.tokens[0].lookup_word) <= 3:
if any(c.isdigit() for c in tlist.tokens[0].lookup_word):
for repl in node.starting:
if repl.end == tlist.end and repl.ttype != qmod.TokenType.HOUSENUMBER:
repl.add_penalty(0.5 - tlist.tokens[0].penalty)
def _dump_word_tokens(query: qmod.QueryStruct) -> Iterator[List[Any]]:
yield ['type', 'token', 'word_token', 'lookup_word', 'penalty', 'count', 'info']
for node in query.nodes:
for tlist in node.starting:
for token in tlist.tokens:
t = cast(LegacyToken, token)
yield [tlist.ttype.name, t.token, t.word_token or '',
t.lookup_word or '', t.penalty, t.count, t.info]
async def create_query_analyzer(conn: SearchConnection) -> AbstractQueryAnalyzer:
""" Create and set up a new query analyzer for a database based
on the legacy tokenizer.
"""
out = LegacyQueryAnalyzer(conn)
await out.setup()
return out

View File

@@ -0,0 +1,297 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Datastructures for a tokenized query.
"""
from typing import List, Tuple, Optional, Iterator
from abc import ABC, abstractmethod
import dataclasses
import enum
class BreakType(enum.Enum):
""" Type of break between tokens.
"""
START = '<'
""" Begin of the query. """
END = '>'
""" End of the query. """
PHRASE = ','
""" Break between two phrases. """
WORD = ' '
""" Break between words. """
PART = '-'
""" Break inside a word, for example a hyphen or apostrophe. """
TOKEN = '`'
""" Break created as a result of tokenization.
This may happen in languages without spaces between words.
"""
class TokenType(enum.Enum):
""" Type of token.
"""
WORD = enum.auto()
""" Full name of a place. """
PARTIAL = enum.auto()
""" Word term without breaks, does not necessarily represent a full name. """
HOUSENUMBER = enum.auto()
""" Housenumber term. """
POSTCODE = enum.auto()
""" Postal code term. """
COUNTRY = enum.auto()
""" Country name or reference. """
QUALIFIER = enum.auto()
""" Special term used together with name (e.g. _Hotel_ Bellevue). """
NEAR_ITEM = enum.auto()
""" Special term used as searchable object(e.g. supermarket in ...). """
class PhraseType(enum.Enum):
""" Designation of a phrase.
"""
NONE = 0
""" No specific designation (i.e. source is free-form query). """
AMENITY = enum.auto()
""" Contains name or type of a POI. """
STREET = enum.auto()
""" Contains a street name optionally with a housenumber. """
CITY = enum.auto()
""" Contains the postal city. """
COUNTY = enum.auto()
""" Contains the equivalent of a county. """
STATE = enum.auto()
""" Contains a state or province. """
POSTCODE = enum.auto()
""" Contains a postal code. """
COUNTRY = enum.auto()
""" Contains the country name or code. """
def compatible_with(self, ttype: TokenType,
is_full_phrase: bool) -> bool:
""" Check if the given token type can be used with the phrase type.
"""
if self == PhraseType.NONE:
return not is_full_phrase or ttype != TokenType.QUALIFIER
if self == PhraseType.AMENITY:
return ttype in (TokenType.WORD, TokenType.PARTIAL)\
or (is_full_phrase and ttype == TokenType.NEAR_ITEM)\
or (not is_full_phrase and ttype == TokenType.QUALIFIER)
if self == PhraseType.STREET:
return ttype in (TokenType.WORD, TokenType.PARTIAL, TokenType.HOUSENUMBER)
if self == PhraseType.POSTCODE:
return ttype == TokenType.POSTCODE
if self == PhraseType.COUNTRY:
return ttype == TokenType.COUNTRY
return ttype in (TokenType.WORD, TokenType.PARTIAL)
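# For example, PhraseType.STREET.compatible_with(TokenType.HOUSENUMBER, False)
# is True, while PhraseType.POSTCODE accepts nothing but TokenType.POSTCODE.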
@dataclasses.dataclass
class Token(ABC):
""" Base type for tokens.
Specific query analyzers must implement the concrete token class.
"""
penalty: float
token: int
count: int
addr_count: int
lookup_word: str
is_indexed: bool
@abstractmethod
def get_category(self) -> Tuple[str, str]:
""" Return the category restriction for qualifier terms and
category objects.
"""
@dataclasses.dataclass
class TokenRange:
""" Indexes of query nodes over which a token spans.
"""
start: int
end: int
def __lt__(self, other: 'TokenRange') -> bool:
return self.end <= other.start
def __le__(self, other: 'TokenRange') -> bool:
return NotImplemented
def __gt__(self, other: 'TokenRange') -> bool:
return self.start >= other.end
def __ge__(self, other: 'TokenRange') -> bool:
return NotImplemented
def replace_start(self, new_start: int) -> 'TokenRange':
""" Return a new token range with the new start.
"""
return TokenRange(new_start, self.end)
def replace_end(self, new_end: int) -> 'TokenRange':
""" Return a new token range with the new end.
"""
return TokenRange(self.start, new_end)
def split(self, index: int) -> Tuple['TokenRange', 'TokenRange']:
""" Split the span into two spans at the given index.
The index must be within the span.
"""
return self.replace_end(index), self.replace_start(index)
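# Example: TokenRange(2, 5).split(3) returns (TokenRange(2, 3), TokenRange(3, 5)).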
@dataclasses.dataclass
class TokenList:
""" List of all tokens of a given type going from one breakpoint to another.
"""
end: int
ttype: TokenType
tokens: List[Token]
def add_penalty(self, penalty: float) -> None:
""" Add the given penalty to all tokens in the list.
"""
for token in self.tokens:
token.penalty += penalty
@dataclasses.dataclass
class QueryNode:
""" A node of the query representing a break between terms.
"""
btype: BreakType
ptype: PhraseType
starting: List[TokenList] = dataclasses.field(default_factory=list)
def has_tokens(self, end: int, *ttypes: TokenType) -> bool:
""" Check if there are tokens of the given types ending at the
given node.
"""
return any(tl.end == end and tl.ttype in ttypes for tl in self.starting)
def get_tokens(self, end: int, ttype: TokenType) -> Optional[List[Token]]:
""" Get the list of tokens of the given type starting at this node
and ending at the node 'end'. Returns 'None' if no such
tokens exist.
"""
for tlist in self.starting:
if tlist.end == end and tlist.ttype == ttype:
return tlist.tokens
return None
@dataclasses.dataclass
class Phrase:
""" A normalized query part. Phrases may be typed which means that
they then represent a specific part of the address.
"""
ptype: PhraseType
text: str
class QueryStruct:
""" A tokenized search query together with the normalized source
from which the tokens have been parsed.
The query contains a list of nodes that represent the breaks
between words. Tokens span between nodes, which don't necessarily
need to be direct neighbours. Thus the query is represented as a
directed acyclic graph.
When created, a query contains a single node: the start of the
query. Further nodes can be added by appending to 'nodes'.
"""
def __init__(self, source: List[Phrase]) -> None:
self.source = source
self.nodes: List[QueryNode] = \
[QueryNode(BreakType.START, source[0].ptype if source else PhraseType.NONE)]
def num_token_slots(self) -> int:
""" Return the length of the query in vertice steps.
"""
return len(self.nodes) - 1
def add_node(self, btype: BreakType, ptype: PhraseType) -> None:
""" Append a new break node with the given break type.
The phrase type denotes the type for any tokens starting
at the node.
"""
self.nodes.append(QueryNode(btype, ptype))
def add_token(self, trange: TokenRange, ttype: TokenType, token: Token) -> None:
""" Add a token to the query. 'start' and 'end' are the indexes of the
nodes from which to which the token spans. The indexes must exist
and are expected to be in the same phrase.
'ttype' denotes the type of the token and 'token' the token to
be inserted.
If the token type is not compatible with the phrase it should
be added to, then the token is silently dropped.
"""
snode = self.nodes[trange.start]
full_phrase = snode.btype in (BreakType.START, BreakType.PHRASE)\
and self.nodes[trange.end].btype in (BreakType.PHRASE, BreakType.END)
if snode.ptype.compatible_with(ttype, full_phrase):
tlist = snode.get_tokens(trange.end, ttype)
if tlist is None:
snode.starting.append(TokenList(trange.end, ttype, [token]))
else:
tlist.append(token)
def get_tokens(self, trange: TokenRange, ttype: TokenType) -> List[Token]:
""" Get the list of tokens of a given type, spanning the given
nodes. The nodes must exist. If no tokens exist, an
empty list is returned.
"""
return self.nodes[trange.start].get_tokens(trange.end, ttype) or []
def get_partials_list(self, trange: TokenRange) -> List[Token]:
""" Create a list of partial tokens between the given nodes.
The list is composed of the first token of type PARTIAL
going to the subsequent node. Such PARTIAL tokens are
assumed to exist.
"""
return [next(iter(self.get_tokens(TokenRange(i, i+1), TokenType.PARTIAL)))
for i in range(trange.start, trange.end)]
def iter_token_lists(self) -> Iterator[Tuple[int, QueryNode, TokenList]]:
""" Iterator over all token lists in the query.
"""
for i, node in enumerate(self.nodes):
for tlist in node.starting:
yield i, node, tlist
def find_lookup_word_by_id(self, token: int) -> str:
""" Find the first token with the given token ID and return
its lookup word. Returns 'None' if no such token exists.
The function is very slow and must only be used for
debugging.
"""
for node in self.nodes:
for tlist in node.starting:
for t in tlist.tokens:
if t.token == token:
return f"[{tlist.ttype.name[0]}]{t.lookup_word}"
return 'None'
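# Rough sketch of how the pieces above fit together (token objects are
# tokenizer-specific, so 'some_token' is only a placeholder):
#   q = QueryStruct([Phrase(PhraseType.NONE, 'main st 12')])
#   q.add_node(BreakType.WORD, PhraseType.NONE)   # node after 'main'
#   q.add_node(BreakType.WORD, PhraseType.NONE)   # node after 'st'
#   q.add_node(BreakType.END, PhraseType.NONE)    # node after '12'
#   q.add_token(TokenRange(0, 1), TokenType.PARTIAL, some_token)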

View File

@@ -0,0 +1,54 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Factory for creating a query analyzer for the configured tokenizer.
"""
from typing import List, cast, TYPE_CHECKING
from abc import ABC, abstractmethod
from pathlib import Path
import importlib
from ..logging import log
from ..connection import SearchConnection
if TYPE_CHECKING:
from .query import Phrase, QueryStruct
class AbstractQueryAnalyzer(ABC):
""" Class for analysing incoming queries.
Query analyzers are tied to the tokenizer used on import.
"""
@abstractmethod
async def analyze_query(self, phrases: List['Phrase']) -> 'QueryStruct':
""" Analyze the given phrases and return the tokenized query.
"""
@abstractmethod
def normalize_text(self, text: str) -> str:
""" Bring the given text into a normalized form. That is the
standardized form search will work with. All information removed
at this stage is inevitably lost.
"""
async def make_query_analyzer(conn: SearchConnection) -> AbstractQueryAnalyzer:
""" Create a query analyzer for the tokenizer used by the database.
"""
name = await conn.get_property('tokenizer')
src_file = Path(__file__).parent / f'{name}_tokenizer.py'
if not src_file.is_file():
log().comment(f"No tokenizer named '{name}' available. Database not set up properly.")
raise RuntimeError('Tokenizer not found')
module = importlib.import_module(f'nominatim.api.search.{name}_tokenizer')
return cast(AbstractQueryAnalyzer, await module.create_query_analyzer(conn))
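# The imported module is expected to expose an asynchronous
# 'create_query_analyzer(conn)' factory, as the ICU and legacy tokenizer
# modules above do.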

View File

@@ -0,0 +1,422 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Create query interpretations where each vertex in the query is assigned
a specific function (expressed as a token type).
"""
from typing import Optional, List, Iterator
import dataclasses
from ..logging import log
from . import query as qmod
# pylint: disable=too-many-return-statements,too-many-branches
@dataclasses.dataclass
class TypedRange:
""" A token range for a specific type of tokens.
"""
ttype: qmod.TokenType
trange: qmod.TokenRange
PENALTY_TOKENCHANGE = {
qmod.BreakType.START: 0.0,
qmod.BreakType.END: 0.0,
qmod.BreakType.PHRASE: 0.0,
qmod.BreakType.WORD: 0.1,
qmod.BreakType.PART: 0.2,
qmod.BreakType.TOKEN: 0.4
}
TypedRangeSeq = List[TypedRange]
@dataclasses.dataclass
class TokenAssignment: # pylint: disable=too-many-instance-attributes
""" Representation of a possible assignment of token types
to the tokens in a tokenized query.
"""
penalty: float = 0.0
name: Optional[qmod.TokenRange] = None
address: List[qmod.TokenRange] = dataclasses.field(default_factory=list)
housenumber: Optional[qmod.TokenRange] = None
postcode: Optional[qmod.TokenRange] = None
country: Optional[qmod.TokenRange] = None
near_item: Optional[qmod.TokenRange] = None
qualifier: Optional[qmod.TokenRange] = None
@staticmethod
def from_ranges(ranges: TypedRangeSeq) -> 'TokenAssignment':
""" Create a new token assignment from a sequence of typed spans.
"""
out = TokenAssignment()
for token in ranges:
if token.ttype == qmod.TokenType.PARTIAL:
out.address.append(token.trange)
elif token.ttype == qmod.TokenType.HOUSENUMBER:
out.housenumber = token.trange
elif token.ttype == qmod.TokenType.POSTCODE:
out.postcode = token.trange
elif token.ttype == qmod.TokenType.COUNTRY:
out.country = token.trange
elif token.ttype == qmod.TokenType.NEAR_ITEM:
out.near_item = token.trange
elif token.ttype == qmod.TokenType.QUALIFIER:
out.qualifier = token.trange
return out
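# Example: ranges [PARTIAL(0, 2), HOUSENUMBER(2, 3), PARTIAL(3, 5)] produce an
# assignment with address=[TokenRange(0, 2), TokenRange(3, 5)] and
# housenumber=TokenRange(2, 3); name stays None until get_assignments() below
# splits it out of the address ranges.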
class _TokenSequence:
""" Working state used to put together the token assignments.
Represents an intermediate state while traversing the tokenized
query.
"""
def __init__(self, seq: TypedRangeSeq,
direction: int = 0, penalty: float = 0.0) -> None:
self.seq = seq
self.direction = direction
self.penalty = penalty
def __str__(self) -> str:
seq = ''.join(f'[{r.trange.start} - {r.trange.end}: {r.ttype.name}]' for r in self.seq)
return f'{seq} (dir: {self.direction}, penalty: {self.penalty})'
@property
def end_pos(self) -> int:
""" Return the index of the global end of the current sequence.
"""
return self.seq[-1].trange.end if self.seq else 0
def has_types(self, *ttypes: qmod.TokenType) -> bool:
""" Check if the current sequence contains any typed ranges of
the given types.
"""
return any(s.ttype in ttypes for s in self.seq)
def is_final(self) -> bool:
""" Return true when the sequence cannot be extended by any
form of token anymore.
"""
# Country and category must be the final term for left-to-right
return len(self.seq) > 1 and \
self.seq[-1].ttype in (qmod.TokenType.COUNTRY, qmod.TokenType.NEAR_ITEM)
def appendable(self, ttype: qmod.TokenType) -> Optional[int]:
""" Check if the give token type is appendable to the existing sequence.
Returns None if the token type is not appendable, otherwise the
new direction of the sequence after adding such a type. The
token is not added.
"""
if ttype == qmod.TokenType.WORD:
return None
if not self.seq:
# Append unconditionally to the empty list
if ttype == qmod.TokenType.COUNTRY:
return -1
if ttype in (qmod.TokenType.HOUSENUMBER, qmod.TokenType.QUALIFIER):
return 1
return self.direction
# Name tokens are always acceptable and don't change direction
if ttype == qmod.TokenType.PARTIAL:
# qualifiers cannot appear in the middle of the query. They need
# to be near the next phrase.
if self.direction == -1 \
and any(t.ttype == qmod.TokenType.QUALIFIER for t in self.seq[:-1]):
return None
return self.direction
# Other tokens may only appear once
if self.has_types(ttype):
return None
if ttype == qmod.TokenType.HOUSENUMBER:
if self.direction == 1:
if len(self.seq) == 1 and self.seq[0].ttype == qmod.TokenType.QUALIFIER:
return None
if len(self.seq) > 2 \
or self.has_types(qmod.TokenType.POSTCODE, qmod.TokenType.COUNTRY):
return None # direction left-to-right: housenumber must come before anything
elif self.direction == -1 \
or self.has_types(qmod.TokenType.POSTCODE, qmod.TokenType.COUNTRY):
return -1 # force direction right-to-left if after other terms
return self.direction
if ttype == qmod.TokenType.POSTCODE:
if self.direction == -1:
if self.has_types(qmod.TokenType.HOUSENUMBER, qmod.TokenType.QUALIFIER):
return None
return -1
if self.direction == 1:
return None if self.has_types(qmod.TokenType.COUNTRY) else 1
if self.has_types(qmod.TokenType.HOUSENUMBER, qmod.TokenType.QUALIFIER):
return 1
return self.direction
if ttype == qmod.TokenType.COUNTRY:
return None if self.direction == -1 else 1
if ttype == qmod.TokenType.NEAR_ITEM:
return self.direction
if ttype == qmod.TokenType.QUALIFIER:
if self.direction == 1:
if (len(self.seq) == 1
and self.seq[0].ttype in (qmod.TokenType.PARTIAL, qmod.TokenType.NEAR_ITEM)) \
or (len(self.seq) == 2
and self.seq[0].ttype == qmod.TokenType.NEAR_ITEM
and self.seq[1].ttype == qmod.TokenType.PARTIAL):
return 1
return None
if self.direction == -1:
return -1
tempseq = self.seq[1:] if self.seq[0].ttype == qmod.TokenType.NEAR_ITEM else self.seq
if len(tempseq) == 0:
return 1
if len(tempseq) == 1 and self.seq[0].ttype == qmod.TokenType.HOUSENUMBER:
return None
if len(tempseq) > 1 or self.has_types(qmod.TokenType.POSTCODE, qmod.TokenType.COUNTRY):
return -1
return 0
return None
def advance(self, ttype: qmod.TokenType, end_pos: int,
btype: qmod.BreakType) -> Optional['_TokenSequence']:
""" Return a new token sequence state with the given token type
extended.
"""
newdir = self.appendable(ttype)
if newdir is None:
return None
if not self.seq:
newseq = [TypedRange(ttype, qmod.TokenRange(0, end_pos))]
new_penalty = 0.0
else:
last = self.seq[-1]
if btype != qmod.BreakType.PHRASE and last.ttype == ttype:
# extend the existing range
newseq = self.seq[:-1] + [TypedRange(ttype, last.trange.replace_end(end_pos))]
new_penalty = 0.0
else:
# start a new range
newseq = list(self.seq) + [TypedRange(ttype,
qmod.TokenRange(last.trange.end, end_pos))]
new_penalty = PENALTY_TOKENCHANGE[btype]
return _TokenSequence(newseq, newdir, self.penalty + new_penalty)
def _adapt_penalty_from_priors(self, priors: int, new_dir: int) -> bool:
if priors >= 2:
if self.direction == 0:
self.direction = new_dir
else:
if priors == 2:
self.penalty += 0.8
else:
return False
return True
def recheck_sequence(self) -> bool:
""" Check that the sequence is a fully valid token assignment
and adapt direction and penalties further if necessary.
This function catches some impossible assignments that need
forward context and can therefore not be excluded when building
the assignment.
"""
# housenumbers may not be further than 2 words from the beginning.
# If there are two words in front, give it a penalty.
hnrpos = next((i for i, tr in enumerate(self.seq)
if tr.ttype == qmod.TokenType.HOUSENUMBER),
None)
if hnrpos is not None:
if self.direction != -1:
priors = sum(1 for t in self.seq[:hnrpos] if t.ttype == qmod.TokenType.PARTIAL)
if not self._adapt_penalty_from_priors(priors, -1):
return False
if self.direction != 1:
priors = sum(1 for t in self.seq[hnrpos+1:] if t.ttype == qmod.TokenType.PARTIAL)
if not self._adapt_penalty_from_priors(priors, 1):
return False
if any(t.ttype == qmod.TokenType.NEAR_ITEM for t in self.seq):
self.penalty += 1.0
return True
def _get_assignments_postcode(self, base: TokenAssignment,
query_len: int) -> Iterator[TokenAssignment]:
""" Yield possible assignments of Postcode searches with an
address component.
"""
assert base.postcode is not None
if (base.postcode.start == 0 and self.direction != -1)\
or (base.postcode.end == query_len and self.direction != 1):
log().comment('postcode search')
# <address>,<postcode> should give preference to address search
if base.postcode.start == 0:
penalty = self.penalty
self.direction = -1 # name searches are only possible backwards
else:
penalty = self.penalty + 0.1
self.direction = 1 # name searches are only possible forwards
yield dataclasses.replace(base, penalty=penalty)
def _get_assignments_address_forward(self, base: TokenAssignment,
query: qmod.QueryStruct) -> Iterator[TokenAssignment]:
""" Yield possible assignments of address searches with
left-to-right reading.
"""
first = base.address[0]
log().comment('first word = name')
yield dataclasses.replace(base, penalty=self.penalty,
name=first, address=base.address[1:])
# To paraphrase:
# * if another name term comes after the first one and before the
# housenumber
# * a qualifier comes after the name
# * the containing phrase is strictly typed
if (base.housenumber and first.end < base.housenumber.start)\
or (base.qualifier and base.qualifier > first)\
or (query.nodes[first.start].ptype != qmod.PhraseType.NONE):
return
penalty = self.penalty
# Penalty for:
# * <name>, <street>, <housenumber> , ...
# * queries that are comma-separated
if (base.housenumber and base.housenumber > first) or len(query.source) > 1:
penalty += 0.25
for i in range(first.start + 1, first.end):
name, addr = first.split(i)
log().comment(f'split first word = name ({i - first.start})')
yield dataclasses.replace(base, name=name, address=[addr] + base.address[1:],
penalty=penalty + PENALTY_TOKENCHANGE[query.nodes[i].btype])
def _get_assignments_address_backward(self, base: TokenAssignment,
query: qmod.QueryStruct) -> Iterator[TokenAssignment]:
""" Yield possible assignments of address searches with
right-to-left reading.
"""
last = base.address[-1]
if self.direction == -1 or len(base.address) > 1:
log().comment('last word = name')
yield dataclasses.replace(base, penalty=self.penalty,
name=last, address=base.address[:-1])
# To paraphrase:
# * if another name term comes before the last one and after the
# housenumber
# * a qualifier comes before the name
# * the containing phrase is strictly typed
if (base.housenumber and last.start > base.housenumber.end)\
or (base.qualifier and base.qualifier < last)\
or (query.nodes[last.start].ptype != qmod.PhraseType.NONE):
return
penalty = self.penalty
if base.housenumber and base.housenumber < last:
penalty += 0.4
if len(query.source) > 1:
penalty += 0.25
for i in range(last.start + 1, last.end):
addr, name = last.split(i)
log().comment(f'split last word = name ({i - last.start})')
yield dataclasses.replace(base, name=name, address=base.address[:-1] + [addr],
penalty=penalty + PENALTY_TOKENCHANGE[query.nodes[i].btype])
def get_assignments(self, query: qmod.QueryStruct) -> Iterator[TokenAssignment]:
""" Yield possible assignments for the current sequence.
This function splits up general name assignments into name
and address and yields all possible variants of that.
"""
base = TokenAssignment.from_ranges(self.seq)
num_addr_tokens = sum(t.end - t.start for t in base.address)
if num_addr_tokens > 50:
return
# Postcode search (postcode-only search is covered in next case)
if base.postcode is not None and base.address:
yield from self._get_assignments_postcode(base, query.num_token_slots())
# Postcode or country-only search
if not base.address:
if not base.housenumber and (base.postcode or base.country or base.near_item):
log().comment('postcode/country search')
yield dataclasses.replace(base, penalty=self.penalty)
else:
# <postcode>,<address> should give preference to postcode search
if base.postcode and base.postcode.start == 0:
self.penalty += 0.1
# Right-to-left reading of the address
if self.direction != -1:
yield from self._get_assignments_address_forward(base, query)
# Left-to-right reading of the address
if self.direction != 1:
yield from self._get_assignments_address_backward(base, query)
# variant for special housenumber searches
if base.housenumber and not base.qualifier:
yield dataclasses.replace(base, penalty=self.penalty)
def yield_token_assignments(query: qmod.QueryStruct) -> Iterator[TokenAssignment]:
""" Return possible word type assignments to word positions.
The assignments are computed from the concrete tokens listed
in the tokenized query.
The result includes the penalty for transitions from one word type to
another. It does not include penalties for transitions within a
type.
"""
todo = [_TokenSequence([], direction=0 if query.source[0].ptype == qmod.PhraseType.NONE else 1)]
while todo:
state = todo.pop()
node = query.nodes[state.end_pos]
for tlist in node.starting:
newstate = state.advance(tlist.ttype, tlist.end, node.btype)
if newstate is not None:
if newstate.end_pos == query.num_token_slots():
if newstate.recheck_sequence():
log().var_dump('Assignment', newstate)
yield from newstate.get_assignments(query)
elif not newstate.is_final():
todo.append(newstate)
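# Note: 'todo' is used as a stack (todo.pop() removes the most recently added
# state), so the token graph is traversed depth-first until every candidate
# sequence either reaches the last node or cannot be extended any further.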

View File

@@ -0,0 +1,194 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Server implementation using the falcon webserver framework.
"""
from typing import Optional, Mapping, cast, Any, List
from pathlib import Path
import datetime as dt
import asyncio
from falcon.asgi import App, Request, Response
from nominatim_core.config import Configuration
from ...core import NominatimAPIAsync
from ... import v1 as api_impl
from ... import logging as loglib
class HTTPNominatimError(Exception):
""" A special exception class for errors raised during processing.
"""
def __init__(self, msg: str, status: int, content_type: str) -> None:
self.msg = msg
self.status = status
self.content_type = content_type
async def nominatim_error_handler(req: Request, resp: Response, #pylint: disable=unused-argument
exception: HTTPNominatimError,
_: Any) -> None:
""" Special error handler that passes message and content type as
per exception info.
"""
resp.status = exception.status
resp.text = exception.msg
resp.content_type = exception.content_type
async def timeout_error_handler(req: Request, resp: Response, #pylint: disable=unused-argument
exception: TimeoutError, #pylint: disable=unused-argument
_: Any) -> None:
""" Special error handler that passes message and content type as
per exception info.
"""
resp.status = 503
loglib.log().comment('Aborted: Query took too long to process.')
logdata = loglib.get_and_disable()
if logdata:
resp.text = logdata
resp.content_type = 'text/html; charset=utf-8'
else:
resp.text = "Query took too long to process."
resp.content_type = 'text/plain; charset=utf-8'
class ParamWrapper(api_impl.ASGIAdaptor):
""" Adaptor class for server glue to Falcon framework.
"""
def __init__(self, req: Request, resp: Response,
config: Configuration) -> None:
self.request = req
self.response = resp
self._config = config
def get(self, name: str, default: Optional[str] = None) -> Optional[str]:
return cast(Optional[str], self.request.get_param(name, default=default))
def get_header(self, name: str, default: Optional[str] = None) -> Optional[str]:
return cast(Optional[str], self.request.get_header(name, default=default))
def error(self, msg: str, status: int = 400) -> HTTPNominatimError:
return HTTPNominatimError(msg, status, self.content_type)
def create_response(self, status: int, output: str, num_results: int) -> None:
self.response.context.num_results = num_results
self.response.status = status
self.response.text = output
self.response.content_type = self.content_type
def base_uri(self) -> str:
return cast(str, self.request.forwarded_prefix)
def config(self) -> Configuration:
return self._config
class EndpointWrapper:
""" Converter for server glue endpoint functions to Falcon request handlers.
"""
def __init__(self, name: str, func: api_impl.EndpointFunc, api: NominatimAPIAsync) -> None:
self.name = name
self.func = func
self.api = api
async def on_get(self, req: Request, resp: Response) -> None:
""" Implementation of the endpoint.
"""
await self.func(self.api, ParamWrapper(req, resp, self.api.config))
class FileLoggingMiddleware:
""" Middleware to log selected requests into a file.
"""
def __init__(self, file_name: str):
self.fd = open(file_name, 'a', buffering=1, encoding='utf8') # pylint: disable=R1732
async def process_request(self, req: Request, _: Response) -> None:
""" Callback before the request starts timing.
"""
req.context.start = dt.datetime.now(tz=dt.timezone.utc)
async def process_response(self, req: Request, resp: Response,
resource: Optional[EndpointWrapper],
req_succeeded: bool) -> None:
""" Callback after requests writes to the logfile. It only
writes logs for successful requests for search, reverse and lookup.
"""
if not req_succeeded or resource is None or resp.status != 200\
or resource.name not in ('reverse', 'search', 'lookup', 'details'):
return
finish = dt.datetime.now(tz=dt.timezone.utc)
duration = (finish - req.context.start).total_seconds()
params = req.scope['query_string'].decode('utf8')
start = req.context.start.replace(tzinfo=None)\
.isoformat(sep=' ', timespec='milliseconds')
self.fd.write(f"[{start}] "
f"{duration:.4f} {getattr(resp.context, 'num_results', 0)} "
f'{resource.name} "{params}"\n')
class APIShutdown:
""" Middleware that closes any open database connections.
"""
def __init__(self, api: NominatimAPIAsync) -> None:
self.api = api
async def process_shutdown(self, *_: Any) -> None:
"""Process the ASGI lifespan shutdown event.
"""
await self.api.close()
def get_application(project_dir: Path,
environ: Optional[Mapping[str, str]] = None) -> App:
""" Create a Nominatim Falcon ASGI application.
"""
api = NominatimAPIAsync(project_dir, environ)
middleware: List[object] = [APIShutdown(api)]
log_file = api.config.LOG_FILE
if log_file:
middleware.append(FileLoggingMiddleware(log_file))
app = App(cors_enable=api.config.get_bool('CORS_NOACCESSCONTROL'),
middleware=middleware)
app.add_error_handler(HTTPNominatimError, nominatim_error_handler)
app.add_error_handler(TimeoutError, timeout_error_handler)
# different from TimeoutError in Python <= 3.10
app.add_error_handler(asyncio.TimeoutError, timeout_error_handler)
legacy_urls = api.config.get_bool('SERVE_LEGACY_URLS')
for name, func in api_impl.ROUTES:
endpoint = EndpointWrapper(name, func, api)
app.add_route(f"/{name}", endpoint)
if legacy_urls:
app.add_route(f"/{name}.php", endpoint)
return app
def run_wsgi() -> App:
""" Entry point for uvicorn.
Make sure uvicorn is run from the project directory.
"""
return get_application(Path('.'))
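# A possible way to serve this application (the module path depends on how the
# package is installed and is only illustrative here):
#   uvicorn --factory some.package.path.server:run_wsgi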

View File

@@ -0,0 +1,174 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Server implementation using the starlette webserver framework.
"""
from typing import Any, Optional, Mapping, Callable, cast, Coroutine, Dict, Awaitable
from pathlib import Path
import datetime as dt
import asyncio
from starlette.applications import Starlette
from starlette.routing import Route
from starlette.exceptions import HTTPException
from starlette.responses import Response, PlainTextResponse, HTMLResponse
from starlette.requests import Request
from starlette.middleware import Middleware
from starlette.middleware.base import BaseHTTPMiddleware, RequestResponseEndpoint
from starlette.middleware.cors import CORSMiddleware
from nominatim_core.config import Configuration
from ...core import NominatimAPIAsync
from ... import v1 as api_impl
from ... import logging as loglib
class ParamWrapper(api_impl.ASGIAdaptor):
""" Adaptor class for server glue to Starlette framework.
"""
def __init__(self, request: Request) -> None:
self.request = request
def get(self, name: str, default: Optional[str] = None) -> Optional[str]:
return self.request.query_params.get(name, default=default)
def get_header(self, name: str, default: Optional[str] = None) -> Optional[str]:
return self.request.headers.get(name, default)
def error(self, msg: str, status: int = 400) -> HTTPException:
return HTTPException(status, detail=msg,
headers={'content-type': self.content_type})
def create_response(self, status: int, output: str, num_results: int) -> Response:
self.request.state.num_results = num_results
return Response(output, status_code=status, media_type=self.content_type)
def base_uri(self) -> str:
scheme = self.request.url.scheme
host = self.request.url.hostname
port = self.request.url.port
root = self.request.scope['root_path']
if (scheme == 'http' and port == 80) or (scheme == 'https' and port == 443):
port = None
if port is not None:
return f"{scheme}://{host}:{port}{root}"
return f"{scheme}://{host}{root}"
def config(self) -> Configuration:
return cast(Configuration, self.request.app.state.API.config)
def _wrap_endpoint(func: api_impl.EndpointFunc)\
-> Callable[[Request], Coroutine[Any, Any, Response]]:
async def _callback(request: Request) -> Response:
return cast(Response, await func(request.app.state.API, ParamWrapper(request)))
return _callback
class FileLoggingMiddleware(BaseHTTPMiddleware):
""" Middleware to log selected requests into a file.
"""
def __init__(self, app: Starlette, file_name: str = ''):
super().__init__(app)
self.fd = open(file_name, 'a', buffering=1, encoding='utf8') # pylint: disable=R1732
async def dispatch(self, request: Request,
call_next: RequestResponseEndpoint) -> Response:
start = dt.datetime.now(tz=dt.timezone.utc)
response = await call_next(request)
if response.status_code != 200:
return response
finish = dt.datetime.now(tz=dt.timezone.utc)
for endpoint in ('reverse', 'search', 'lookup', 'details'):
if request.url.path.startswith('/' + endpoint):
qtype = endpoint
break
else:
return response
duration = (finish - start).total_seconds()
params = request.scope['query_string'].decode('utf8')
self.fd.write(f"[{start.replace(tzinfo=None).isoformat(sep=' ', timespec='milliseconds')}] "
f"{duration:.4f} {getattr(request.state, 'num_results', 0)} "
f'{qtype} "{params}"\n')
return response
async def timeout_error(request: Request, #pylint: disable=unused-argument
_: Exception) -> Response:
""" Error handler for query timeouts.
"""
loglib.log().comment('Aborted: Query took too long to process.')
logdata = loglib.get_and_disable()
if logdata:
return HTMLResponse(logdata)
return PlainTextResponse("Query took too long to process.", status_code=503)
def get_application(project_dir: Path,
environ: Optional[Mapping[str, str]] = None,
debug: bool = True) -> Starlette:
""" Create a Nominatim falcon ASGI application.
"""
config = Configuration(project_dir, environ)
routes = []
legacy_urls = config.get_bool('SERVE_LEGACY_URLS')
for name, func in api_impl.ROUTES:
endpoint = _wrap_endpoint(func)
routes.append(Route(f"/{name}", endpoint=endpoint))
if legacy_urls:
routes.append(Route(f"/{name}.php", endpoint=endpoint))
middleware = []
if config.get_bool('CORS_NOACCESSCONTROL'):
middleware.append(Middleware(CORSMiddleware,
allow_origins=['*'],
allow_methods=['GET', 'OPTIONS'],
max_age=86400))
log_file = config.LOG_FILE
if log_file:
middleware.append(Middleware(FileLoggingMiddleware, file_name=log_file))
exceptions: Dict[Any, Callable[[Request, Exception], Awaitable[Response]]] = {
TimeoutError: timeout_error,
asyncio.TimeoutError: timeout_error
}
async def _shutdown() -> None:
await app.state.API.close()
app = Starlette(debug=debug, routes=routes, middleware=middleware,
exception_handlers=exceptions,
on_shutdown=[_shutdown])
app.state.API = NominatimAPIAsync(project_dir, environ)
return app
def run_wsgi() -> Starlette:
""" Entry point for uvicorn.
"""
return get_application(Path('.'), debug=False)
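As a rough usage sketch (not part of this commit), the factory above can be handed directly to uvicorn; the dotted module path is an assumption based on this commit's package layout.
# Hedged sketch: serve the Starlette application with uvicorn.
import uvicorn

if __name__ == '__main__':
    # 'nominatim_api.server.starlette.server' is an assumed module path.
    uvicorn.run('nominatim_api.server.starlette.server:run_wsgi',
                factory=True, host='127.0.0.1', port=8088)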

View File

@@ -0,0 +1,221 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Custom functions and expressions for SQLAlchemy.
"""
from __future__ import annotations
from typing import Any
import sqlalchemy as sa
from sqlalchemy.ext.compiler import compiles
from nominatim_core.typing import SaColumn
# pylint: disable=all
class PlacexGeometryReverseLookuppolygon(sa.sql.functions.GenericFunction[Any]):
""" Check for conditions that allow partial index use on
'idx_placex_geometry_reverse_lookupPolygon'.
The conditions need to be constant so that the query planner picks them up correctly
in prepared statements.
"""
name = 'PlacexGeometryReverseLookuppolygon'
inherit_cache = True
@compiles(PlacexGeometryReverseLookuppolygon) # type: ignore[no-untyped-call, misc]
def _default_intersects(element: PlacexGeometryReverseLookuppolygon,
compiler: 'sa.Compiled', **kw: Any) -> str:
return ("(ST_GeometryType(placex.geometry) in ('ST_Polygon', 'ST_MultiPolygon')"
" AND placex.rank_address between 4 and 25"
" AND placex.type != 'postcode'"
" AND placex.name is not null"
" AND placex.indexed_status = 0"
" AND placex.linked_place_id is null)")
@compiles(PlacexGeometryReverseLookuppolygon, 'sqlite') # type: ignore[no-untyped-call, misc]
def _sqlite_intersects(element: PlacexGeometryReverseLookuppolygon,
compiler: 'sa.Compiled', **kw: Any) -> str:
return ("(ST_GeometryType(placex.geometry) in ('POLYGON', 'MULTIPOLYGON')"
" AND placex.rank_address between 4 and 25"
" AND placex.type != 'postcode'"
" AND placex.name is not null"
" AND placex.indexed_status = 0"
" AND placex.linked_place_id is null)")
class IntersectsReverseDistance(sa.sql.functions.GenericFunction[Any]):
name = 'IntersectsReverseDistance'
inherit_cache = True
def __init__(self, table: sa.Table, geom: SaColumn) -> None:
super().__init__(table.c.geometry,
table.c.rank_search, geom)
self.tablename = table.name
@compiles(IntersectsReverseDistance) # type: ignore[no-untyped-call, misc]
def default_reverse_place_diameter(element: IntersectsReverseDistance,
compiler: 'sa.Compiled', **kw: Any) -> str:
table = element.tablename
return f"({table}.rank_address between 4 and 25"\
f" AND {table}.type != 'postcode'"\
f" AND {table}.name is not null"\
f" AND {table}.linked_place_id is null"\
f" AND {table}.osm_type = 'N'" + \
" AND ST_Buffer(%s, reverse_place_diameter(%s)) && %s)" % \
tuple(map(lambda c: compiler.process(c, **kw), element.clauses))
@compiles(IntersectsReverseDistance, 'sqlite') # type: ignore[no-untyped-call, misc]
def sqlite_reverse_place_diameter(element: IntersectsReverseDistance,
compiler: 'sa.Compiled', **kw: Any) -> str:
geom1, rank, geom2 = list(element.clauses)
table = element.tablename
return (f"({table}.rank_address between 4 and 25"\
f" AND {table}.type != 'postcode'"\
f" AND {table}.name is not null"\
f" AND {table}.linked_place_id is null"\
f" AND {table}.osm_type = 'N'"\
" AND MbrIntersects(%s, ST_Expand(%s, 14.0 * exp(-0.2 * %s) - 0.03))"\
f" AND {table}.place_id IN"\
" (SELECT place_id FROM placex_place_node_areas"\
" WHERE ROWID IN (SELECT ROWID FROM SpatialIndex"\
" WHERE f_table_name = 'placex_place_node_areas'"\
" AND search_frame = %s)))") % (
compiler.process(geom1, **kw),
compiler.process(geom2, **kw),
compiler.process(rank, **kw),
compiler.process(geom2, **kw))
class IsBelowReverseDistance(sa.sql.functions.GenericFunction[Any]):
name = 'IsBelowReverseDistance'
inherit_cache = True
@compiles(IsBelowReverseDistance) # type: ignore[no-untyped-call, misc]
def default_is_below_reverse_distance(element: IsBelowReverseDistance,
compiler: 'sa.Compiled', **kw: Any) -> str:
dist, rank = list(element.clauses)
return "%s < reverse_place_diameter(%s)" % (compiler.process(dist, **kw),
compiler.process(rank, **kw))
@compiles(IsBelowReverseDistance, 'sqlite') # type: ignore[no-untyped-call, misc]
def sqlite_is_below_reverse_distance(element: IsBelowReverseDistance,
compiler: 'sa.Compiled', **kw: Any) -> str:
dist, rank = list(element.clauses)
return "%s < 14.0 * exp(-0.2 * %s) - 0.03" % (compiler.process(dist, **kw),
compiler.process(rank, **kw))
class IsAddressPoint(sa.sql.functions.GenericFunction[Any]):
name = 'IsAddressPoint'
inherit_cache = True
def __init__(self, table: sa.Table) -> None:
super().__init__(table.c.rank_address,
table.c.housenumber, table.c.name)
@compiles(IsAddressPoint) # type: ignore[no-untyped-call, misc]
def default_is_address_point(element: IsAddressPoint,
compiler: 'sa.Compiled', **kw: Any) -> str:
rank, hnr, name = list(element.clauses)
return "(%s = 30 AND (%s IS NOT NULL OR %s ? 'addr:housename'))" % (
compiler.process(rank, **kw),
compiler.process(hnr, **kw),
compiler.process(name, **kw))
@compiles(IsAddressPoint, 'sqlite') # type: ignore[no-untyped-call, misc]
def sqlite_is_address_point(element: IsAddressPoint,
compiler: 'sa.Compiled', **kw: Any) -> str:
rank, hnr, name = list(element.clauses)
return "(%s = 30 AND coalesce(%s, json_extract(%s, '$.addr:housename')) IS NOT NULL)" % (
compiler.process(rank, **kw),
compiler.process(hnr, **kw),
compiler.process(name, **kw))
class CrosscheckNames(sa.sql.functions.GenericFunction[Any]):
""" Check if in the given list of names in parameters 1 any of the names
from the JSON array in parameter 2 are contained.
"""
name = 'CrosscheckNames'
inherit_cache = True
@compiles(CrosscheckNames) # type: ignore[no-untyped-call, misc]
def compile_crosscheck_names(element: CrosscheckNames,
compiler: 'sa.Compiled', **kw: Any) -> str:
arg1, arg2 = list(element.clauses)
return "coalesce(avals(%s) && ARRAY(SELECT * FROM json_array_elements_text(%s)), false)" % (
compiler.process(arg1, **kw), compiler.process(arg2, **kw))
@compiles(CrosscheckNames, 'sqlite') # type: ignore[no-untyped-call, misc]
def compile_sqlite_crosscheck_names(element: CrosscheckNames,
compiler: 'sa.Compiled', **kw: Any) -> str:
arg1, arg2 = list(element.clauses)
return "EXISTS(SELECT *"\
" FROM json_each(%s) as name, json_each(%s) as match_name"\
" WHERE name.value = match_name.value)"\
% (compiler.process(arg1, **kw), compiler.process(arg2, **kw))
class JsonArrayEach(sa.sql.functions.GenericFunction[Any]):
""" Return elements of a json array as a set.
"""
name = 'JsonArrayEach'
inherit_cache = True
@compiles(JsonArrayEach) # type: ignore[no-untyped-call, misc]
def default_json_array_each(element: JsonArrayEach, compiler: 'sa.Compiled', **kw: Any) -> str:
return "json_array_elements(%s)" % compiler.process(element.clauses, **kw)
@compiles(JsonArrayEach, 'sqlite') # type: ignore[no-untyped-call, misc]
def sqlite_json_array_each(element: JsonArrayEach, compiler: 'sa.Compiled', **kw: Any) -> str:
return "json_each(%s)" % compiler.process(element.clauses, **kw)
class Greatest(sa.sql.functions.GenericFunction[Any]):
""" Function to compute maximum of all its input parameters.
"""
name = 'greatest'
inherit_cache = True
@compiles(Greatest, 'sqlite') # type: ignore[no-untyped-call, misc]
def sqlite_greatest(element: Greatest, compiler: 'sa.Compiled', **kw: Any) -> str:
return "max(%s)" % compiler.process(element.clauses, **kw)
class RegexpWord(sa.sql.functions.GenericFunction[Any]):
""" Check if a full word is in a given string.
"""
name = 'RegexpWord'
inherit_cache = True
@compiles(RegexpWord, 'postgresql') # type: ignore[no-untyped-call, misc]
def postgres_regexp_nocase(element: RegexpWord, compiler: 'sa.Compiled', **kw: Any) -> str:
arg1, arg2 = list(element.clauses)
return "%s ~* ('\\m(' || %s || ')\\M')::text" % (compiler.process(arg2, **kw), compiler.process(arg1, **kw))
@compiles(RegexpWord, 'sqlite') # type: ignore[no-untyped-call, misc]
def sqlite_regexp_nocase(element: RegexpWord, compiler: 'sa.Compiled', **kw: Any) -> str:
arg1, arg2 = list(element.clauses)
return "regexp('\\b(' || %s || ')\\b', %s)" % (compiler.process(arg1, **kw), compiler.process(arg2, **kw))

View File

@@ -0,0 +1,122 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Custom functions for SQLite.
"""
from typing import cast, Optional, Set, Any
import json
# pylint: disable=protected-access
def weigh_search(search_vector: Optional[str], rankings: str, default: float) -> float:
""" Custom weight function for search results.
"""
if search_vector is not None:
svec = [int(x) for x in search_vector.split(',')]
for rank in json.loads(rankings):
if all(r in svec for r in rank[1]):
return cast(float, rank[0])
return default
class ArrayIntersectFuzzy:
""" Compute the array of common elements of all input integer arrays.
Very large input parameters may be ignored to speed up
computation. Therefore, the result is a superset of common elements.
Input and output arrays are given as comma-separated lists.
"""
def __init__(self) -> None:
self.first = ''
self.values: Optional[Set[int]] = None
def step(self, value: Optional[str]) -> None:
""" Add the next array to the intersection.
"""
if value is not None:
if not self.first:
self.first = value
elif len(value) < 10000000:
if self.values is None:
self.values = {int(x) for x in self.first.split(',')}
self.values.intersection_update((int(x) for x in value.split(',')))
def finalize(self) -> str:
""" Return the final result.
"""
if self.values is not None:
return ','.join(map(str, self.values))
return self.first
class ArrayUnion:
""" Compute the set of all elements of the input integer arrays.
Input and output arrays are given as strings of comma-separated lists.
"""
def __init__(self) -> None:
self.values: Optional[Set[str]] = None
def step(self, value: Optional[str]) -> None:
""" Add the next array to the union.
"""
if value is not None:
if self.values is None:
self.values = set(value.split(','))
else:
self.values.update(value.split(','))
def finalize(self) -> str:
""" Return the final result.
"""
return '' if self.values is None else ','.join(self.values)
def array_contains(container: Optional[str], containee: Optional[str]) -> Optional[bool]:
""" Is the array 'containee' completely contained in array 'container'.
"""
if container is None or containee is None:
return None
vset = container.split(',')
return all(v in vset for v in containee.split(','))
def array_pair_contains(container1: Optional[str], container2: Optional[str],
containee: Optional[str]) -> Optional[bool]:
""" Is the array 'containee' completely contained in the union of
array 'container1' and array 'container2'.
"""
if container1 is None or container2 is None or containee is None:
return None
vset = container1.split(',') + container2.split(',')
return all(v in vset for v in containee.split(','))
def install_custom_functions(conn: Any) -> None:
""" Install helper functions for Nominatim into the given SQLite
database connection.
"""
conn.create_function('weigh_search', 3, weigh_search, deterministic=True)
conn.create_function('array_contains', 2, array_contains, deterministic=True)
conn.create_function('array_pair_contains', 3, array_pair_contains, deterministic=True)
_create_aggregate(conn, 'array_intersect_fuzzy', 1, ArrayIntersectFuzzy)
_create_aggregate(conn, 'array_union', 1, ArrayUnion)
async def _make_aggregate(aioconn: Any, *args: Any) -> None:
await aioconn._execute(aioconn._conn.create_aggregate, *args)
def _create_aggregate(conn: Any, name: str, nargs: int, aggregate: Any) -> None:
try:
conn.await_(_make_aggregate(conn._connection, name, nargs, aggregate))
except Exception as error: # pylint: disable=broad-exception-caught
conn._handle_exception(error)
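The scalar helpers above are plain Python and can be exercised directly on comma-separated strings; a minimal, illustrative check:
print(array_contains('1,2,3,4', '2,4'))          # True
print(array_pair_contains('1,2', '3,4', '2,3'))  # True

agg = ArrayIntersectFuzzy()
for arr in ('1,2,3', '2,3,4'):
    agg.step(arr)
print(agg.finalize())                            # '2,3' (element order not guaranteed)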

View File

@@ -0,0 +1,51 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Classes and functions related to the status call.
"""
from typing import Optional
import datetime as dt
import dataclasses
import sqlalchemy as sa
from .connection import SearchConnection
from .version import NOMINATIM_API_VERSION
@dataclasses.dataclass
class StatusResult:
""" Result of a call to the status API.
"""
status: int
message: str
software_version = NOMINATIM_API_VERSION
data_updated: Optional[dt.datetime] = None
database_version: Optional[str] = None
async def get_status(conn: SearchConnection) -> StatusResult:
""" Execute a status API call.
"""
status = StatusResult(0, 'OK')
# Last update date
sql = sa.select(conn.t.import_status.c.lastimportdate).limit(1)
status.data_updated = await conn.scalar(sql)
if status.data_updated is not None:
if status.data_updated.tzinfo is None:
status.data_updated = status.data_updated.replace(tzinfo=dt.timezone.utc)
else:
status.data_updated = status.data_updated.astimezone(dt.timezone.utc)
# Database version
try:
status.database_version = await conn.get_property('database_version')
except ValueError:
pass
return status
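A hedged sketch of querying the status through the public asynchronous API; the status() convenience method and the import path are assumptions based on this commit's public interface.
import asyncio
from pathlib import Path

import nominatim_api as napi  # assumed import path

async def main() -> None:
    api = napi.NominatimAPIAsync(Path('.'))  # project directory with the configuration
    try:
        result = await api.status()
        print(result.status, result.message, result.data_updated)
    finally:
        await api.close()

asyncio.run(main())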

550
src/nominatim_api/types.py Normal file
View File

@@ -0,0 +1,550 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Complex datatypes used by the Nominatim API.
"""
from typing import Optional, Union, Tuple, NamedTuple, TypeVar, Type, Dict, \
Any, List, Sequence
from collections import abc
import dataclasses
import enum
import math
from struct import unpack
from binascii import unhexlify
from nominatim_core.errors import UsageError
from .localization import Locales
# pylint: disable=no-member,too-many-boolean-expressions,too-many-instance-attributes
@dataclasses.dataclass
class PlaceID:
""" Reference a place by Nominatim's internal ID.
A PlaceID may reference a place from the main table placex, from
the interpolation tables or the postcode tables. Place IDs are not
stable between installations. Therefore, this type may only be used
with place IDs obtained from the same database.
"""
place_id: int
"""
The internal ID of the place to reference.
"""
@dataclasses.dataclass
class OsmID:
""" Reference a place by its OSM ID and potentially the basic category.
The OSM ID may refer to places in the main table placex and OSM
interpolation lines.
"""
osm_type: str
""" OSM type of the object. Must be one of `N`(node), `W`(way) or
`R`(relation).
"""
osm_id: int
""" The OSM ID of the object.
"""
osm_class: Optional[str] = None
""" The same OSM object may appear multiple times in the database under
different categories. The optional class parameter distinguishes between
these categories and corresponds to the key part of the category.
If there are multiple objects in the database and `osm_class` is
left out, then one of the objects is returned at random.
"""
def __post_init__(self) -> None:
if self.osm_type not in ('N', 'W', 'R'):
raise ValueError(f"Illegal OSM type '{self.osm_type}'. Must be one of N, W, R.")
PlaceRef = Union[PlaceID, OsmID]
class Point(NamedTuple):
""" A geographic point in WGS84 projection.
"""
x: float
y: float
@property
def lat(self) -> float:
""" Return the latitude of the point.
"""
return self.y
@property
def lon(self) -> float:
""" Return the longitude of the point.
"""
return self.x
def to_geojson(self) -> str:
""" Return the point in GeoJSON format.
"""
return f'{{"type": "Point","coordinates": [{self.x}, {self.y}]}}'
@staticmethod
def from_wkb(wkb: Union[str, bytes]) -> 'Point':
""" Create a point from EWKB as returned from the database.
"""
if isinstance(wkb, str):
wkb = unhexlify(wkb)
if len(wkb) != 25:
raise ValueError(f"Point wkb has unexpected length {len(wkb)}")
if wkb[0] == 0:
gtype, srid, x, y = unpack('>iidd', wkb[1:])
elif wkb[0] == 1:
gtype, srid, x, y = unpack('<iidd', wkb[1:])
else:
raise ValueError("WKB has unknown endian value.")
if gtype != 0x20000001:
raise ValueError("WKB must be a point geometry.")
if srid != 4326:
raise ValueError("Only WGS84 WKB supported.")
return Point(x, y)
@staticmethod
def from_param(inp: Any) -> 'Point':
""" Create a point from an input parameter. The parameter
may be given as a point, a string or a sequence of
strings or floats. Raises a UsageError if the format is
not correct.
"""
if isinstance(inp, Point):
return inp
seq: Sequence[str]
if isinstance(inp, str):
seq = inp.split(',')
elif isinstance(inp, abc.Sequence):
seq = inp
if len(seq) != 2:
raise UsageError('Point parameter needs 2 coordinates.')
try:
x, y = filter(math.isfinite, map(float, seq))
except ValueError as exc:
raise UsageError('Point parameter needs to be numbers.') from exc
if x < -180.0 or x > 180.0 or y < -90.0 or y > 90.0:
raise UsageError('Point coordinates invalid.')
return Point(x, y)
def to_wkt(self) -> str:
""" Return the WKT representation of the point.
"""
return f'POINT({self.x} {self.y})'
AnyPoint = Union[Point, Tuple[float, float]]
WKB_BBOX_HEADER_LE = b'\x01\x03\x00\x00\x20\xE6\x10\x00\x00\x01\x00\x00\x00\x05\x00\x00\x00'
WKB_BBOX_HEADER_BE = b'\x00\x20\x00\x00\x03\x00\x00\x10\xe6\x00\x00\x00\x01\x00\x00\x00\x05'
class Bbox:
""" A bounding box in WGS84 projection.
The coordinates are available as an array in the 'coord'
property in the order (minx, miny, maxx, maxy).
"""
def __init__(self, minx: float, miny: float, maxx: float, maxy: float) -> None:
""" Create a new bounding box with the given coordinates in WGS84
projection.
"""
self.coords = (minx, miny, maxx, maxy)
@property
def minlat(self) -> float:
""" Southern-most latitude, corresponding to the minimum y coordinate.
"""
return self.coords[1]
@property
def maxlat(self) -> float:
""" Northern-most latitude, corresponding to the maximum y coordinate.
"""
return self.coords[3]
@property
def minlon(self) -> float:
""" Western-most longitude, corresponding to the minimum x coordinate.
"""
return self.coords[0]
@property
def maxlon(self) -> float:
""" Eastern-most longitude, corresponding to the maximum x coordinate.
"""
return self.coords[2]
@property
def area(self) -> float:
""" Return the area of the box in WGS84.
"""
return (self.coords[2] - self.coords[0]) * (self.coords[3] - self.coords[1])
def contains(self, pt: Point) -> bool:
""" Check if the point is inside or on the boundary of the box.
"""
return self.coords[0] <= pt[0] and self.coords[1] <= pt[1]\
and self.coords[2] >= pt[0] and self.coords[3] >= pt[1]
def to_wkt(self) -> str:
""" Return the WKT representation of the Bbox. This
is a simple polygon with four points.
"""
return 'POLYGON(({0} {1},{0} {3},{2} {3},{2} {1},{0} {1}))'\
.format(*self.coords) # pylint: disable=consider-using-f-string
@staticmethod
def from_wkb(wkb: Union[None, str, bytes]) -> 'Optional[Bbox]':
""" Create a Bbox from a bounding box polygon as returned by
the database. Returns `None` if the input value is None.
"""
if wkb is None:
return None
if isinstance(wkb, str):
wkb = unhexlify(wkb)
if len(wkb) != 97:
raise ValueError("WKB must be a bounding box polygon")
if wkb.startswith(WKB_BBOX_HEADER_LE):
x1, y1, _, _, x2, y2 = unpack('<dddddd', wkb[17:65])
elif wkb.startswith(WKB_BBOX_HEADER_BE):
x1, y1, _, _, x2, y2 = unpack('>dddddd', wkb[17:65])
else:
raise ValueError("WKB has wrong header")
return Bbox(min(x1, x2), min(y1, y2), max(x1, x2), max(y1, y2))
@staticmethod
def from_point(pt: Point, buffer: float) -> 'Bbox':
""" Return a Bbox around the point with the buffer added to all sides.
"""
return Bbox(pt[0] - buffer, pt[1] - buffer,
pt[0] + buffer, pt[1] + buffer)
@staticmethod
def from_param(inp: Any) -> 'Bbox':
""" Return a Bbox from an input parameter. The box may be
given as a Bbox, a string or a list of strings or numbers.
Raises a UsageError if the format is incorrect.
"""
if isinstance(inp, Bbox):
return inp
seq: Sequence[str]
if isinstance(inp, str):
seq = inp.split(',')
elif isinstance(inp, abc.Sequence):
seq = inp
if len(seq) != 4:
raise UsageError('Bounding box parameter needs 4 coordinates.')
try:
x1, y1, x2, y2 = filter(math.isfinite, map(float, seq))
except ValueError as exc:
raise UsageError('Bounding box parameter needs to be numbers.') from exc
x1 = min(180, max(-180, x1))
x2 = min(180, max(-180, x2))
y1 = min(90, max(-90, y1))
y2 = min(90, max(-90, y2))
if x1 == x2 or y1 == y2:
raise UsageError('Bounding box with invalid parameters.')
return Bbox(min(x1, x2), min(y1, y2), max(x1, x2), max(y1, y2))
class GeometryFormat(enum.Flag):
""" All search functions support returning the full geometry of a place in
various formats. The internal geometry is converted by PostGIS to
the desired format and then returned as a string. It is possible to
request multiple formats at the same time.
"""
NONE = 0
""" No geometry requested. Alias for a empty flag.
"""
GEOJSON = enum.auto()
"""
[GeoJSON](https://geojson.org/) format
"""
KML = enum.auto()
"""
[KML](https://en.wikipedia.org/wiki/Keyhole_Markup_Language) format
"""
SVG = enum.auto()
"""
[SVG](http://www.w3.org/TR/SVG/paths.html) format
"""
TEXT = enum.auto()
"""
[WKT](https://en.wikipedia.org/wiki/Well-known_text_representation_of_geometry) format
"""
class DataLayer(enum.Flag):
""" The `DataLayer` flag type defines the layers that can be selected
for reverse and forward search.
"""
ADDRESS = enum.auto()
""" The address layer contains all places relevant for addresses:
fully qualified addresses with a house number (or a house name equivalent,
for some addresses) and places that can be part of an address like
roads, cities, states.
"""
POI = enum.auto()
""" Layer for points of interest like shops, restaurants but also
recycling bins or postboxes.
"""
RAILWAY = enum.auto()
""" Layer with railway features including tracks and other infrastructure.
Note that in Nominatim's standard configuration, only very few railway
features are imported into the database. Thus a custom configuration
is required to make full use of this layer.
"""
NATURAL = enum.auto()
""" Layer with natural features like rivers, lakes and mountains.
"""
MANMADE = enum.auto()
""" Layer with other human-made features and boundaries. This layer is
the catch-all and includes all features not covered by the other
layers. A typical example for this layer are national park boundaries.
"""
def format_country(cc: Any) -> List[str]:
""" Extract a list of country codes from the input which may be either
a string or list of strings. Filters out all values that are not
a two-letter string.
"""
clist: Sequence[str]
if isinstance(cc, str):
clist = cc.split(',')
elif isinstance(cc, abc.Sequence):
clist = cc
else:
raise UsageError("Parameter 'country' needs to be a comma-separated list "
"or a Python list of strings.")
return [cc.lower() for cc in clist if isinstance(cc, str) and len(cc) == 2]
def format_excluded(ids: Any) -> List[int]:
""" Extract a list of place ids from the input which may be either
a string or a list of strings or ints. Ignores empty values but
throws a UsageError on anything that cannot be converted to int.
"""
plist: Sequence[str]
if isinstance(ids, str):
plist = [s.strip() for s in ids.split(',')]
elif isinstance(ids, abc.Sequence):
plist = ids
else:
raise UsageError("Parameter 'excluded' needs to be a comma-separated list "
"or a Python list of numbers.")
if not all(isinstance(i, int) or
(isinstance(i, str) and (not i or i.isdigit())) for i in plist):
raise UsageError("Parameter 'excluded' only takes place IDs.")
return [int(id) for id in plist if id] or [0]
def format_categories(categories: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
""" Extract a list of categories. Currently a noop.
"""
return categories
TParam = TypeVar('TParam', bound='LookupDetails') # pylint: disable=invalid-name
@dataclasses.dataclass
class LookupDetails:
""" Collection of parameters that define which kind of details are
returned with a lookup or details result.
"""
geometry_output: GeometryFormat = GeometryFormat.NONE
""" Add the full geometry of the place to the result. Multiple
formats may be selected. Note that geometries can become quite large.
"""
address_details: bool = False
""" Get detailed information on the places that make up the address
for the result.
"""
linked_places: bool = False
""" Get detailed information on the places that link to the result.
"""
parented_places: bool = False
""" Get detailed information on all places that this place is a parent
for, i.e. all places for which it provides the address details.
Only POI places can have parents.
"""
keywords: bool = False
""" Add information about the search terms used for this place.
"""
geometry_simplification: float = 0.0
""" Simplification factor for a geometry in degrees WGS. A factor of
0.0 means the original geometry is kept. The higher the value, the
more the geometry gets simplified.
"""
locales: Locales = Locales()
""" Preferred languages for localization of results.
"""
@classmethod
def from_kwargs(cls: Type[TParam], kwargs: Dict[str, Any]) -> TParam:
""" Load the data fields of the class from a dictionary.
Unknown entries in the dictionary are ignored, missing ones
get the default setting.
The function supports type checking and throws a UsageError
when the value does not fit.
"""
def _check_field(v: Any, field: 'dataclasses.Field[Any]') -> Any:
if v is None:
return field.default_factory() \
if field.default_factory != dataclasses.MISSING \
else field.default
if field.metadata and 'transform' in field.metadata:
return field.metadata['transform'](v)
if not isinstance(v, field.type):
raise UsageError(f"Parameter '{field.name}' needs to be of {field.type!s}.")
return v
return cls(**{f.name: _check_field(kwargs[f.name], f)
for f in dataclasses.fields(cls) if f.name in kwargs})
@dataclasses.dataclass
class ReverseDetails(LookupDetails):
""" Collection of parameters for the reverse call.
"""
max_rank: int = dataclasses.field(default=30,
metadata={'transform': lambda v: max(0, min(v, 30))}
)
""" Highest address rank to return.
"""
layers: DataLayer = DataLayer.ADDRESS | DataLayer.POI
""" Filter which kind of data to include.
"""
@dataclasses.dataclass
class SearchDetails(LookupDetails):
""" Collection of parameters for the search call.
"""
max_results: int = 10
""" Maximum number of results to be returned. The actual number of results
may be less.
"""
min_rank: int = dataclasses.field(default=0,
metadata={'transform': lambda v: max(0, min(v, 30))}
)
""" Lowest address rank to return.
"""
max_rank: int = dataclasses.field(default=30,
metadata={'transform': lambda v: max(0, min(v, 30))}
)
""" Highest address rank to return.
"""
layers: Optional[DataLayer] = dataclasses.field(default=None,
metadata={'transform': lambda r : r})
""" Filter which kind of data to include. When 'None' (the default) then
filtering by layers is disabled.
"""
countries: List[str] = dataclasses.field(default_factory=list,
metadata={'transform': format_country})
""" Restrict search results to the given countries. An empty list (the
default) will disable this filter.
"""
excluded: List[int] = dataclasses.field(default_factory=list,
metadata={'transform': format_excluded})
""" List of OSM objects to exclude from the results. Currently only
works when the internal place ID is given.
An empty list (the default) will disable this filter.
"""
viewbox: Optional[Bbox] = dataclasses.field(default=None,
metadata={'transform': Bbox.from_param})
""" Focus the search on a given map area.
"""
bounded_viewbox: bool = False
""" Use 'viewbox' as a filter and restrict results to places within the
given area.
"""
near: Optional[Point] = dataclasses.field(default=None,
metadata={'transform': Point.from_param})
""" Order results by distance to the given point.
"""
near_radius: Optional[float] = dataclasses.field(default=None,
metadata={'transform': lambda r : r})
""" Use near point as a filter and drop results outside the given
radius. The radius is given in WGS84 degrees.
"""
categories: List[Tuple[str, str]] = dataclasses.field(default_factory=list,
metadata={'transform': format_categories})
""" Restrict search to places with one of the given class/type categories.
An empty list (the default) will disable this filter.
"""
viewbox_x2: Optional[Bbox] = None
def __post_init__(self) -> None:
if self.viewbox is not None:
xext = (self.viewbox.maxlon - self.viewbox.minlon)/2
yext = (self.viewbox.maxlat - self.viewbox.minlat)/2
self.viewbox_x2 = Bbox(self.viewbox.minlon - xext, self.viewbox.minlat - yext,
self.viewbox.maxlon + xext, self.viewbox.maxlat + yext)
def restrict_min_max_rank(self, new_min: int, new_max: int) -> None:
""" Change the min_rank and max_rank fields to respect the
given boundaries.
"""
assert new_min <= new_max
self.min_rank = max(self.min_rank, new_min)
self.max_rank = min(self.max_rank, new_max)
def is_impossible(self) -> bool:
""" Check if the parameter configuration is contradictionary and
cannot yield any results.
"""
return (self.min_rank > self.max_rank
or (self.bounded_viewbox
and self.viewbox is not None and self.near is not None
and self.viewbox.contains(self.near))
or (self.layers is not None and not self.layers)
or (self.max_rank <= 4 and
self.layers is not None and not self.layers & DataLayer.ADDRESS))
def layer_enabled(self, layer: DataLayer) -> bool:
""" Check if the given layer has been chosen. Also returns
true when layer restriction has been disabled completely.
"""
return self.layers is None or bool(self.layers & layer)
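A few illustrative calls to the parameter parsers defined above (coordinate values are arbitrary):
pt = Point.from_param('8.55,47.36')             # longitude, latitude
print(pt.lon, pt.lat, pt.to_wkt())              # 8.55 47.36 POINT(8.55 47.36)

box = Bbox.from_param('8.4,47.3,8.7,47.5')
print(box.contains(pt), round(box.area, 3))     # True 0.06

details = SearchDetails.from_kwargs({'viewbox': '8.4,47.3,8.7,47.5',
                                     'max_results': 5})
print(details.max_results, details.viewbox_x2 is not None)  # 5 True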

View File

@@ -0,0 +1,21 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Implementation of API version v1 (aka the legacy version).
"""
#pylint: disable=useless-import-alias
from .server_glue import (ASGIAdaptor as ASGIAdaptor,
EndpointFunc as EndpointFunc,
ROUTES as ROUTES)
from . import format as _format
list_formats = _format.dispatch.list_formats
supports_format = _format.dispatch.supports_format
format_result = _format.dispatch.format_result

View File

@@ -0,0 +1,201 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Hard-coded information about tag categories.
These tables have been copied verbatim from the old PHP code. For future
versions, a more flexible format is required.
"""
from typing import Tuple, Optional, Mapping, Union
from ..results import ReverseResult, SearchResult
from ..types import Bbox
def get_label_tag(category: Tuple[str, str], extratags: Optional[Mapping[str, str]],
rank: int, country: Optional[str]) -> str:
""" Create a label tag for the given place that can be used as an XML name.
"""
if rank < 26 and extratags and 'place' in extratags:
label = extratags['place']
elif rank < 26 and extratags and 'linked_place' in extratags:
label = extratags['linked_place']
elif category == ('boundary', 'administrative'):
label = ADMIN_LABELS.get((country or '', int(rank/2)))\
or ADMIN_LABELS.get(('', int(rank/2)))\
or 'Administrative'
elif category[1] == 'postal_code':
label = 'postcode'
elif rank < 26:
label = category[1] if category[1] != 'yes' else category[0]
elif rank < 28:
label = 'road'
elif category[0] == 'place'\
and category[1] in ('house_number', 'house_name', 'country_code'):
label = category[1]
else:
label = category[0]
return label.lower().replace(' ', '_')
def bbox_from_result(result: Union[ReverseResult, SearchResult]) -> Bbox:
""" Compute a bounding box for the result. For ways and relations
a given boundingbox is used. For all other object, a box is computed
around the centroid according to dimensions derived from the
search rank.
"""
if (result.osm_object and result.osm_object[0] == 'N') or result.bbox is None:
extent = NODE_EXTENT.get(result.category, 0.00005)
return Bbox.from_point(result.centroid, extent)
return result.bbox
# pylint: disable=line-too-long
OSM_ATTRIBUTION = 'Data © OpenStreetMap contributors, ODbL 1.0. http://osm.org/copyright'
OSM_TYPE_NAME = {
'N': 'node',
'W': 'way',
'R': 'relation'
}
ADMIN_LABELS = {
('', 1): 'Continent',
('', 2): 'Country',
('', 3): 'Region',
('', 4): 'State',
('', 5): 'State District',
('', 6): 'County',
('', 7): 'Municipality',
('', 8): 'City',
('', 9): 'City District',
('', 10): 'Suburb',
('', 11): 'Neighbourhood',
('', 12): 'City Block',
('no', 3): 'State',
('no', 4): 'County',
('se', 3): 'State',
('se', 4): 'County'
}
ICONS = {
('boundary', 'administrative'): 'poi_boundary_administrative',
('place', 'city'): 'poi_place_city',
('place', 'town'): 'poi_place_town',
('place', 'village'): 'poi_place_village',
('place', 'hamlet'): 'poi_place_village',
('place', 'suburb'): 'poi_place_village',
('place', 'locality'): 'poi_place_village',
('place', 'airport'): 'transport_airport2',
('aeroway', 'aerodrome'): 'transport_airport2',
('railway', 'station'): 'transport_train_station2',
('amenity', 'place_of_worship'): 'place_of_worship_unknown3',
('amenity', 'pub'): 'food_pub',
('amenity', 'bar'): 'food_bar',
('amenity', 'university'): 'education_university',
('tourism', 'museum'): 'tourist_museum',
('amenity', 'arts_centre'): 'tourist_art_gallery2',
('tourism', 'zoo'): 'tourist_zoo',
('tourism', 'theme_park'): 'poi_point_of_interest',
('tourism', 'attraction'): 'poi_point_of_interest',
('leisure', 'golf_course'): 'sport_golf',
('historic', 'castle'): 'tourist_castle',
('amenity', 'hospital'): 'health_hospital',
('amenity', 'school'): 'education_school',
('amenity', 'theatre'): 'tourist_theatre',
('amenity', 'library'): 'amenity_library',
('amenity', 'fire_station'): 'amenity_firestation3',
('amenity', 'police'): 'amenity_police2',
('amenity', 'bank'): 'money_bank2',
('amenity', 'post_office'): 'amenity_post_office',
('tourism', 'hotel'): 'accommodation_hotel2',
('amenity', 'cinema'): 'tourist_cinema',
('tourism', 'artwork'): 'tourist_art_gallery2',
('historic', 'archaeological_site'): 'tourist_archaeological2',
('amenity', 'doctors'): 'health_doctors',
('leisure', 'sports_centre'): 'sport_leisure_centre',
('leisure', 'swimming_pool'): 'sport_swimming_outdoor',
('shop', 'supermarket'): 'shopping_supermarket',
('shop', 'convenience'): 'shopping_convenience',
('amenity', 'restaurant'): 'food_restaurant',
('amenity', 'fast_food'): 'food_fastfood',
('amenity', 'cafe'): 'food_cafe',
('tourism', 'guest_house'): 'accommodation_bed_and_breakfast',
('amenity', 'pharmacy'): 'health_pharmacy_dispensing',
('amenity', 'fuel'): 'transport_fuel',
('natural', 'peak'): 'poi_peak',
('natural', 'wood'): 'landuse_coniferous_and_deciduous',
('shop', 'bicycle'): 'shopping_bicycle',
('shop', 'clothes'): 'shopping_clothes',
('shop', 'hairdresser'): 'shopping_hairdresser',
('shop', 'doityourself'): 'shopping_diy',
('shop', 'estate_agent'): 'shopping_estateagent2',
('shop', 'car'): 'shopping_car',
('shop', 'garden_centre'): 'shopping_garden_centre',
('shop', 'car_repair'): 'shopping_car_repair',
('shop', 'bakery'): 'shopping_bakery',
('shop', 'butcher'): 'shopping_butcher',
('shop', 'apparel'): 'shopping_clothes',
('shop', 'laundry'): 'shopping_laundrette',
('shop', 'beverages'): 'shopping_alcohol',
('shop', 'alcohol'): 'shopping_alcohol',
('shop', 'optician'): 'health_opticians',
('shop', 'chemist'): 'health_pharmacy',
('shop', 'gallery'): 'tourist_art_gallery2',
('shop', 'jewelry'): 'shopping_jewelry',
('tourism', 'information'): 'amenity_information',
('historic', 'ruins'): 'tourist_ruin',
('amenity', 'college'): 'education_school',
('historic', 'monument'): 'tourist_monument',
('historic', 'memorial'): 'tourist_monument',
('historic', 'mine'): 'poi_mine',
('tourism', 'caravan_site'): 'accommodation_caravan_park',
('amenity', 'bus_station'): 'transport_bus_station',
('amenity', 'atm'): 'money_atm2',
('tourism', 'viewpoint'): 'tourist_view_point',
('tourism', 'guesthouse'): 'accommodation_bed_and_breakfast',
('railway', 'tram'): 'transport_tram_stop',
('amenity', 'courthouse'): 'amenity_court',
('amenity', 'recycling'): 'amenity_recycling',
('amenity', 'dentist'): 'health_dentist',
('natural', 'beach'): 'tourist_beach',
('railway', 'tram_stop'): 'transport_tram_stop',
('amenity', 'prison'): 'amenity_prison',
('highway', 'bus_stop'): 'transport_bus_stop2'
}
NODE_EXTENT = {
('place', 'continent'): 25,
('place', 'country'): 7,
('place', 'state'): 2.6,
('place', 'province'): 2.6,
('place', 'region'): 1.0,
('place', 'county'): 0.7,
('place', 'city'): 0.16,
('place', 'municipality'): 0.16,
('place', 'island'): 0.32,
('place', 'postcode'): 0.16,
('place', 'town'): 0.04,
('place', 'village'): 0.02,
('place', 'hamlet'): 0.02,
('place', 'district'): 0.02,
('place', 'borough'): 0.02,
('place', 'suburb'): 0.02,
('place', 'locality'): 0.01,
('place', 'neighbourhood'): 0.01,
('place', 'quarter'): 0.01,
('place', 'city_block'): 0.01,
('landuse', 'farm'): 0.01,
('place', 'farm'): 0.01,
('place', 'airport'): 0.015,
('aeroway', 'aerodrome'): 0.015,
('railway', 'station'): 0.005
}
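Illustrative calls to get_label_tag showing how category, rank and country feed into the label (values chosen only for demonstration):
print(get_label_tag(('boundary', 'administrative'), None, 16, 'de'))  # city
print(get_label_tag(('boundary', 'administrative'), None, 8, 'se'))   # county
print(get_label_tag(('highway', 'residential'), None, 26, None))      # road
print(get_label_tag(('amenity', 'cafe'), None, 30, None))             # amenity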

View File

@@ -0,0 +1,259 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Output formatters for API version v1.
"""
from typing import List, Dict, Mapping, Any
import collections
import datetime as dt
from nominatim_core.utils.json_writer import JsonWriter
from ..status import StatusResult
from ..results import DetailedResult, ReverseResults, SearchResults, \
AddressLines, AddressLine
from ..localization import Locales
from ..result_formatting import FormatDispatcher
from .classtypes import ICONS
from . import format_json, format_xml
class RawDataList(List[Dict[str, Any]]):
""" Data type for formatting raw data lists 'as is' in json.
"""
dispatch = FormatDispatcher()
@dispatch.format_func(StatusResult, 'text')
def _format_status_text(result: StatusResult, _: Mapping[str, Any]) -> str:
if result.status:
return f"ERROR: {result.message}"
return 'OK'
@dispatch.format_func(StatusResult, 'json')
def _format_status_json(result: StatusResult, _: Mapping[str, Any]) -> str:
out = JsonWriter()
out.start_object()\
.keyval('status', result.status)\
.keyval('message', result.message)\
.keyval_not_none('data_updated', result.data_updated,
lambda v: v.isoformat())\
.keyval('software_version', str(result.software_version))\
.keyval_not_none('database_version', result.database_version, str)\
.end_object()
return out()
def _add_address_row(writer: JsonWriter, row: AddressLine,
locales: Locales) -> None:
writer.start_object()\
.keyval('localname', locales.display_name(row.names))\
.keyval_not_none('place_id', row.place_id)
if row.osm_object is not None:
writer.keyval('osm_id', row.osm_object[1])\
.keyval('osm_type', row.osm_object[0])
if row.extratags:
writer.keyval_not_none('place_type', row.extratags.get('place_type'))
writer.keyval('class', row.category[0])\
.keyval('type', row.category[1])\
.keyval_not_none('admin_level', row.admin_level)\
.keyval('rank_address', row.rank_address)\
.keyval('distance', row.distance)\
.keyval('isaddress', row.isaddress)\
.end_object()
def _add_address_rows(writer: JsonWriter, section: str, rows: AddressLines,
locales: Locales) -> None:
writer.key(section).start_array()
for row in rows:
_add_address_row(writer, row, locales)
writer.next()
writer.end_array().next()
def _add_parent_rows_grouped(writer: JsonWriter, rows: AddressLines,
locales: Locales) -> None:
# group by category type
data = collections.defaultdict(list)
for row in rows:
sub = JsonWriter()
_add_address_row(sub, row, locales)
data[row.category[1]].append(sub())
writer.key('hierarchy').start_object()
for group, grouped in data.items():
writer.key(group).start_array()
grouped.sort() # sorts alphabetically by local name
for line in grouped:
writer.raw(line).next()
writer.end_array().next()
writer.end_object().next()
@dispatch.format_func(DetailedResult, 'json')
def _format_details_json(result: DetailedResult, options: Mapping[str, Any]) -> str:
locales = options.get('locales', Locales())
geom = result.geometry.get('geojson')
centroid = result.centroid.to_geojson()
out = JsonWriter()
out.start_object()\
.keyval_not_none('place_id', result.place_id)\
.keyval_not_none('parent_place_id', result.parent_place_id)
if result.osm_object is not None:
out.keyval('osm_type', result.osm_object[0])\
.keyval('osm_id', result.osm_object[1])
out.keyval('category', result.category[0])\
.keyval('type', result.category[1])\
.keyval('admin_level', result.admin_level)\
.keyval('localname', result.locale_name or '')\
.keyval('names', result.names or {})\
.keyval('addresstags', result.address or {})\
.keyval_not_none('housenumber', result.housenumber)\
.keyval_not_none('calculated_postcode', result.postcode)\
.keyval_not_none('country_code', result.country_code)\
.keyval_not_none('indexed_date', result.indexed_date, lambda v: v.isoformat())\
.keyval_not_none('importance', result.importance)\
.keyval('calculated_importance', result.calculated_importance())\
.keyval('extratags', result.extratags or {})\
.keyval_not_none('calculated_wikipedia', result.wikipedia)\
.keyval('rank_address', result.rank_address)\
.keyval('rank_search', result.rank_search)\
.keyval('isarea', 'Polygon' in (geom or result.geometry.get('type') or ''))\
.key('centroid').raw(centroid).next()\
.key('geometry').raw(geom or centroid).next()
if options.get('icon_base_url', None):
icon = ICONS.get(result.category)
if icon:
out.keyval('icon', f"{options['icon_base_url']}/{icon}.p.20.png")
if result.address_rows is not None:
_add_address_rows(out, 'address', result.address_rows, locales)
if result.linked_rows:
_add_address_rows(out, 'linked_places', result.linked_rows, locales)
if result.name_keywords is not None or result.address_keywords is not None:
out.key('keywords').start_object()
for sec, klist in (('name', result.name_keywords), ('address', result.address_keywords)):
out.key(sec).start_array()
for word in (klist or []):
out.start_object()\
.keyval('id', word.word_id)\
.keyval('token', word.word_token)\
.end_object().next()
out.end_array().next()
out.end_object().next()
if result.parented_rows is not None:
if options.get('group_hierarchy', False):
_add_parent_rows_grouped(out, result.parented_rows, locales)
else:
_add_address_rows(out, 'hierarchy', result.parented_rows, locales)
out.end_object()
return out()
@dispatch.format_func(ReverseResults, 'xml')
def _format_reverse_xml(results: ReverseResults, options: Mapping[str, Any]) -> str:
return format_xml.format_base_xml(results,
options, True, 'reversegeocode',
{'querystring': options.get('query', '')})
@dispatch.format_func(ReverseResults, 'geojson')
def _format_reverse_geojson(results: ReverseResults,
options: Mapping[str, Any]) -> str:
return format_json.format_base_geojson(results, options, True)
@dispatch.format_func(ReverseResults, 'geocodejson')
def _format_reverse_geocodejson(results: ReverseResults,
options: Mapping[str, Any]) -> str:
return format_json.format_base_geocodejson(results, options, True)
@dispatch.format_func(ReverseResults, 'json')
def _format_reverse_json(results: ReverseResults,
options: Mapping[str, Any]) -> str:
return format_json.format_base_json(results, options, True,
class_label='class')
@dispatch.format_func(ReverseResults, 'jsonv2')
def _format_reverse_jsonv2(results: ReverseResults,
options: Mapping[str, Any]) -> str:
return format_json.format_base_json(results, options, True,
class_label='category')
@dispatch.format_func(SearchResults, 'xml')
def _format_search_xml(results: SearchResults, options: Mapping[str, Any]) -> str:
extra = {'querystring': options.get('query', '')}
for attr in ('more_url', 'exclude_place_ids', 'viewbox'):
if options.get(attr):
extra[attr] = options[attr]
return format_xml.format_base_xml(results, options, False, 'searchresults',
extra)
@dispatch.format_func(SearchResults, 'geojson')
def _format_search_geojson(results: SearchResults,
options: Mapping[str, Any]) -> str:
return format_json.format_base_geojson(results, options, False)
@dispatch.format_func(SearchResults, 'geocodejson')
def _format_search_geocodejson(results: SearchResults,
options: Mapping[str, Any]) -> str:
return format_json.format_base_geocodejson(results, options, False)
@dispatch.format_func(SearchResults, 'json')
def _format_search_json(results: SearchResults,
options: Mapping[str, Any]) -> str:
return format_json.format_base_json(results, options, False,
class_label='class')
@dispatch.format_func(SearchResults, 'jsonv2')
def _format_search_jsonv2(results: SearchResults,
options: Mapping[str, Any]) -> str:
return format_json.format_base_json(results, options, False,
class_label='category')
@dispatch.format_func(RawDataList, 'json')
def _format_raw_data_json(results: RawDataList, _: Mapping[str, Any]) -> str:
out = JsonWriter()
out.start_array()
for res in results:
out.start_object()
for k, v in res.items():
if isinstance(v, dt.datetime):
out.keyval(k, v.isoformat(sep= ' ', timespec='seconds'))
else:
out.keyval(k, v)
out.end_object().next()
out.end_array()
return out()
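A hedged sketch of driving the dispatcher directly; the method names follow the aliases exported by the v1 package, and the exact import paths and signatures are assumptions.
from nominatim_api import StatusResult             # assumed public import
from nominatim_api.v1 import format as v1_format   # assumed module path

print(v1_format.dispatch.supports_format(StatusResult, 'json'))             # True
print(v1_format.dispatch.format_result(StatusResult(0, 'OK'), 'text', {}))  # OK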

View File

@@ -0,0 +1,275 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Helper functions for output of results in json formats.
"""
from typing import Mapping, Any, Optional, Tuple, Union
from nominatim_core.utils.json_writer import JsonWriter
from ..results import AddressLines, ReverseResults, SearchResults
from . import classtypes as cl
#pylint: disable=too-many-branches
def _write_osm_id(out: JsonWriter, osm_object: Optional[Tuple[str, int]]) -> None:
if osm_object is not None:
out.keyval_not_none('osm_type', cl.OSM_TYPE_NAME.get(osm_object[0], None))\
.keyval('osm_id', osm_object[1])
def _write_typed_address(out: JsonWriter, address: Optional[AddressLines],
country_code: Optional[str]) -> None:
parts = {}
for line in (address or []):
if line.isaddress:
if line.local_name:
label = cl.get_label_tag(line.category, line.extratags,
line.rank_address, country_code)
if label not in parts:
parts[label] = line.local_name
if line.names and 'ISO3166-2' in line.names and line.admin_level:
parts[f"ISO3166-2-lvl{line.admin_level}"] = line.names['ISO3166-2']
for k, v in parts.items():
out.keyval(k, v)
if country_code:
out.keyval('country_code', country_code)
def _write_geocodejson_address(out: JsonWriter,
address: Optional[AddressLines],
obj_place_id: Optional[int],
country_code: Optional[str]) -> None:
extra = {}
for line in (address or []):
if line.isaddress and line.local_name:
if line.category[1] in ('postcode', 'postal_code'):
out.keyval('postcode', line.local_name)
elif line.category[1] == 'house_number':
out.keyval('housenumber', line.local_name)
elif (obj_place_id is None or obj_place_id != line.place_id) \
and line.rank_address >= 4 and line.rank_address < 28:
rank_name = GEOCODEJSON_RANKS[line.rank_address]
if rank_name not in extra:
extra[rank_name] = line.local_name
for k, v in extra.items():
out.keyval(k, v)
if country_code:
out.keyval('country_code', country_code)
def format_base_json(results: Union[ReverseResults, SearchResults],
options: Mapping[str, Any], simple: bool,
class_label: str) -> str:
""" Return the result list as a simple json string in custom Nominatim format.
"""
out = JsonWriter()
if simple:
if not results:
return '{"error":"Unable to geocode"}'
else:
out.start_array()
for result in results:
out.start_object()\
.keyval_not_none('place_id', result.place_id)\
.keyval('licence', cl.OSM_ATTRIBUTION)
_write_osm_id(out, result.osm_object)
out.keyval('lat', f"{result.centroid.lat}")\
.keyval('lon', f"{result.centroid.lon}")\
.keyval(class_label, result.category[0])\
.keyval('type', result.category[1])\
.keyval('place_rank', result.rank_search)\
.keyval('importance', result.calculated_importance())\
.keyval('addresstype', cl.get_label_tag(result.category, result.extratags,
result.rank_address,
result.country_code))\
.keyval('name', result.locale_name or '')\
.keyval('display_name', result.display_name or '')
if options.get('icon_base_url', None):
icon = cl.ICONS.get(result.category)
if icon:
out.keyval('icon', f"{options['icon_base_url']}/{icon}.p.20.png")
if options.get('addressdetails', False):
out.key('address').start_object()
_write_typed_address(out, result.address_rows, result.country_code)
out.end_object().next()
if options.get('extratags', False):
out.keyval('extratags', result.extratags)
if options.get('namedetails', False):
out.keyval('namedetails', result.names)
bbox = cl.bbox_from_result(result)
out.key('boundingbox').start_array()\
.value(f"{bbox.minlat:0.7f}").next()\
.value(f"{bbox.maxlat:0.7f}").next()\
.value(f"{bbox.minlon:0.7f}").next()\
.value(f"{bbox.maxlon:0.7f}").next()\
.end_array().next()
if result.geometry:
for key in ('text', 'kml'):
out.keyval_not_none('geo' + key, result.geometry.get(key))
if 'geojson' in result.geometry:
out.key('geojson').raw(result.geometry['geojson']).next()
out.keyval_not_none('svg', result.geometry.get('svg'))
out.end_object()
if simple:
return out()
out.next()
out.end_array()
return out()
def format_base_geojson(results: Union[ReverseResults, SearchResults],
options: Mapping[str, Any],
simple: bool) -> str:
""" Return the result list as a geojson string.
"""
if not results and simple:
return '{"error":"Unable to geocode"}'
out = JsonWriter()
out.start_object()\
.keyval('type', 'FeatureCollection')\
.keyval('licence', cl.OSM_ATTRIBUTION)\
.key('features').start_array()
for result in results:
out.start_object()\
.keyval('type', 'Feature')\
.key('properties').start_object()
out.keyval_not_none('place_id', result.place_id)
_write_osm_id(out, result.osm_object)
out.keyval('place_rank', result.rank_search)\
.keyval('category', result.category[0])\
.keyval('type', result.category[1])\
.keyval('importance', result.calculated_importance())\
.keyval('addresstype', cl.get_label_tag(result.category, result.extratags,
result.rank_address,
result.country_code))\
.keyval('name', result.locale_name or '')\
.keyval('display_name', result.display_name or '')
if options.get('addressdetails', False):
out.key('address').start_object()
_write_typed_address(out, result.address_rows, result.country_code)
out.end_object().next()
if options.get('extratags', False):
out.keyval('extratags', result.extratags)
if options.get('namedetails', False):
out.keyval('namedetails', result.names)
out.end_object().next() # properties
out.key('bbox').start_array()
for coord in cl.bbox_from_result(result).coords:
out.float(coord, 7).next()
out.end_array().next()
out.key('geometry').raw(result.geometry.get('geojson')
or result.centroid.to_geojson()).next()
out.end_object().next()
out.end_array().next().end_object()
return out()
def format_base_geocodejson(results: Union[ReverseResults, SearchResults],
options: Mapping[str, Any], simple: bool) -> str:
""" Return the result list as a geocodejson string.
"""
if not results and simple:
return '{"error":"Unable to geocode"}'
out = JsonWriter()
out.start_object()\
.keyval('type', 'FeatureCollection')\
.key('geocoding').start_object()\
.keyval('version', '0.1.0')\
.keyval('attribution', cl.OSM_ATTRIBUTION)\
.keyval('licence', 'ODbL')\
.keyval_not_none('query', options.get('query'))\
.end_object().next()\
.key('features').start_array()
for result in results:
out.start_object()\
.keyval('type', 'Feature')\
.key('properties').start_object()\
.key('geocoding').start_object()
out.keyval_not_none('place_id', result.place_id)
_write_osm_id(out, result.osm_object)
out.keyval('osm_key', result.category[0])\
.keyval('osm_value', result.category[1])\
.keyval('type', GEOCODEJSON_RANKS[max(3, min(28, result.rank_address))])\
.keyval_not_none('accuracy', getattr(result, 'distance', None), transform=int)\
.keyval('label', result.display_name or '')\
.keyval_not_none('name', result.locale_name or None)
if options.get('addressdetails', False):
_write_geocodejson_address(out, result.address_rows, result.place_id,
result.country_code)
out.key('admin').start_object()
if result.address_rows:
for line in result.address_rows:
if line.isaddress and (line.admin_level or 15) < 15 and line.local_name \
and line.category[0] == 'boundary' and line.category[1] == 'administrative':
out.keyval(f"level{line.admin_level}", line.local_name)
out.end_object().next()
out.end_object().next().end_object().next()
out.key('geometry').raw(result.geometry.get('geojson')
or result.centroid.to_geojson()).next()
out.end_object().next()
out.end_array().next().end_object()
return out()
GEOCODEJSON_RANKS = {
3: 'locality',
4: 'country',
5: 'state', 6: 'state', 7: 'state', 8: 'state', 9: 'state',
10: 'county', 11: 'county', 12: 'county',
13: 'city', 14: 'city', 15: 'city', 16: 'city',
17: 'district', 18: 'district', 19: 'district', 20: 'district', 21: 'district',
22: 'locality', 23: 'locality', 24: 'locality',
25: 'street', 26: 'street', 27: 'street', 28: 'house'}
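The rank clamping used in format_base_geocodejson, worked through for a few address ranks:
for rank in (2, 12, 26, 30):
    print(rank, GEOCODEJSON_RANKS[max(3, min(28, rank))])
# 2 locality, 12 county, 26 street, 30 house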

View File

@@ -0,0 +1,126 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Helper functions for output of results in XML format.
"""
from typing import Mapping, Any, Optional, Union
import datetime as dt
import xml.etree.ElementTree as ET
from ..results import AddressLines, ReverseResult, ReverseResults, \
SearchResult, SearchResults
from . import classtypes as cl
#pylint: disable=too-many-branches
def _write_xml_address(root: ET.Element, address: AddressLines,
country_code: Optional[str]) -> None:
parts = {}
for line in address:
if line.isaddress:
if line.local_name:
label = cl.get_label_tag(line.category, line.extratags,
line.rank_address, country_code)
if label not in parts:
parts[label] = line.local_name
if line.names and 'ISO3166-2' in line.names and line.admin_level:
parts[f"ISO3166-2-lvl{line.admin_level}"] = line.names['ISO3166-2']
for k,v in parts.items():
ET.SubElement(root, k).text = v
if country_code:
ET.SubElement(root, 'country_code').text = country_code
def _create_base_entry(result: Union[ReverseResult, SearchResult],
root: ET.Element, simple: bool) -> ET.Element:
place = ET.SubElement(root, 'result' if simple else 'place')
if result.place_id is not None:
place.set('place_id', str(result.place_id))
if result.osm_object:
osm_type = cl.OSM_TYPE_NAME.get(result.osm_object[0], None)
if osm_type is not None:
place.set('osm_type', osm_type)
place.set('osm_id', str(result.osm_object[1]))
if result.names and 'ref' in result.names:
place.set('ref', result.names['ref'])
elif result.locale_name:
# bug reproduced from PHP
place.set('ref', result.locale_name)
place.set('lat', f"{result.centroid.lat:.7f}")
place.set('lon', f"{result.centroid.lon:.7f}")
bbox = cl.bbox_from_result(result)
place.set('boundingbox',
f"{bbox.minlat:.7f},{bbox.maxlat:.7f},{bbox.minlon:.7f},{bbox.maxlon:.7f}")
place.set('place_rank', str(result.rank_search))
place.set('address_rank', str(result.rank_address))
if result.geometry:
for key in ('text', 'svg'):
if key in result.geometry:
place.set('geo' + key, result.geometry[key])
if 'kml' in result.geometry:
ET.SubElement(root if simple else place, 'geokml')\
.append(ET.fromstring(result.geometry['kml']))
if 'geojson' in result.geometry:
place.set('geojson', result.geometry['geojson'])
if simple:
place.text = result.display_name or ''
else:
place.set('display_name', result.display_name or '')
place.set('class', result.category[0])
place.set('type', result.category[1])
place.set('importance', str(result.calculated_importance()))
return place
def format_base_xml(results: Union[ReverseResults, SearchResults],
options: Mapping[str, Any],
simple: bool, xml_root_tag: str,
xml_extra_info: Mapping[str, str]) -> str:
""" Format the result into an XML response. With 'simple' exactly one
result will be output, otherwise a list.
"""
root = ET.Element(xml_root_tag)
root.set('timestamp', dt.datetime.utcnow().strftime('%a, %d %b %Y %H:%M:%S +00:00'))
root.set('attribution', cl.OSM_ATTRIBUTION)
for k, v in xml_extra_info.items():
root.set(k, v)
if simple and not results:
ET.SubElement(root, 'error').text = 'Unable to geocode'
for result in results:
place = _create_base_entry(result, root, simple)
if not simple and options.get('icon_base_url', None):
icon = cl.ICONS.get(result.category)
if icon:
place.set('icon', icon)
if options.get('addressdetails', False) and result.address_rows:
_write_xml_address(ET.SubElement(root, 'addressparts') if simple else place,
result.address_rows, result.country_code)
if options.get('extratags', False):
eroot = ET.SubElement(root if simple else place, 'extratags')
if result.extratags:
for k, v in result.extratags.items():
ET.SubElement(eroot, 'tag', attrib={'key': k, 'value': v})
if options.get('namedetails', False):
eroot = ET.SubElement(root if simple else place, 'namedetails')
if result.names:
for k,v in result.names.items():
ET.SubElement(eroot, 'name', attrib={'desc': k}).text = v
return '<?xml version="1.0" encoding="UTF-8" ?>\n' + ET.tostring(root, encoding='unicode')

View File

@@ -0,0 +1,201 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Helper functions for parsing parameters and outputting data
specifically for the v1 version of the API.
"""
from typing import Tuple, Optional, Any, Dict, Iterable
from itertools import chain
import re
from ..results import SearchResult, SearchResults, SourceTable
from ..types import SearchDetails, GeometryFormat
REVERSE_MAX_RANKS = [2, 2, 2, # 0-2 Continent/Sea
4, 4, # 3-4 Country
8, # 5 State
10, 10, # 6-7 Region
12, 12, # 8-9 County
16, 17, # 10-11 City
18, # 12 Town
19, # 13 Village/Suburb
22, # 14 Hamlet/Neighbourhood
25, # 15 Localities
26, # 16 Major Streets
27, # 17 Minor Streets
30 # 18 Building
]
def zoom_to_rank(zoom: int) -> int:
""" Convert a zoom parameter into a rank according to the v1 API spec.
"""
return REVERSE_MAX_RANKS[max(0, min(18, zoom))]
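# Illustrative values derived from the table above:
#   >>> zoom_to_rank(3)     # country level
#   4
#   >>> zoom_to_rank(13)    # village/suburb
#   19
#   >>> zoom_to_rank(30)    # out-of-range zoom values are clamped to 18
#   30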
FEATURE_TYPE_TO_RANK: Dict[Optional[str], Tuple[int, int]] = {
'country': (4, 4),
'state': (8, 8),
'city': (14, 16),
'settlement': (8, 20)
}
def feature_type_to_rank(feature_type: Optional[str]) -> Tuple[int, int]:
""" Convert a feature type parameter to a tuple of
feature type name, minimum rank and maximum rank.
"""
return FEATURE_TYPE_TO_RANK.get(feature_type, (0, 30))
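# Illustrative values:
#   >>> feature_type_to_rank('city')
#   (14, 16)
#   >>> feature_type_to_rank(None)     # unknown or missing types allow all ranks
#   (0, 30)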
#pylint: disable=too-many-arguments,too-many-branches
def extend_query_parts(queryparts: Dict[str, Any], details: Dict[str, Any],
feature_type: Optional[str],
namedetails: bool, extratags: bool,
excluded: Iterable[str]) -> None:
""" Add parameters from details dictionary to the query parts
dictionary which is suitable as URL parameter dictionary.
"""
parsed = SearchDetails.from_kwargs(details)
if parsed.geometry_output != GeometryFormat.NONE:
if GeometryFormat.GEOJSON in parsed.geometry_output:
queryparts['polygon_geojson'] = '1'
if GeometryFormat.KML in parsed.geometry_output:
queryparts['polygon_kml'] = '1'
if GeometryFormat.SVG in parsed.geometry_output:
queryparts['polygon_svg'] = '1'
if GeometryFormat.TEXT in parsed.geometry_output:
queryparts['polygon_text'] = '1'
if parsed.address_details:
queryparts['addressdetails'] = '1'
if namedetails:
queryparts['namedetails'] = '1'
if extratags:
queryparts['extratags'] = '1'
if parsed.geometry_simplification > 0.0:
queryparts['polygon_threshold'] = f"{parsed.geometry_simplification:.6g}"
if parsed.max_results != 10:
queryparts['limit'] = str(parsed.max_results)
if parsed.countries:
queryparts['countrycodes'] = ','.join(parsed.countries)
queryparts['exclude_place_ids'] = \
','.join(chain(excluded, map(str, (e for e in parsed.excluded if e > 0))))
if parsed.viewbox:
queryparts['viewbox'] = ','.join(f"{c:.7g}" for c in parsed.viewbox.coords)
if parsed.bounded_viewbox:
queryparts['bounded'] = '1'
if not details['dedupe']:
queryparts['dedupe'] = '0'
if feature_type in FEATURE_TYPE_TO_RANK:
queryparts['featureType'] = feature_type
def deduplicate_results(results: SearchResults, max_results: int) -> SearchResults:
""" Remove results that look like duplicates.
Two results are considered the same if they have the same OSM ID
or if they have the same category, display name and rank.
"""
osm_ids_done = set()
classification_done = set()
deduped = SearchResults()
for result in results:
if result.source_table == SourceTable.POSTCODE:
assert result.names and 'ref' in result.names
if any(_is_postcode_relation_for(r, result.names['ref']) for r in results):
continue
if result.source_table == SourceTable.PLACEX:
classification = (result.osm_object[0] if result.osm_object else None,
result.category,
result.display_name,
result.rank_address)
if result.osm_object not in osm_ids_done \
and classification not in classification_done:
deduped.append(result)
osm_ids_done.add(result.osm_object)
classification_done.add(classification)
else:
deduped.append(result)
if len(deduped) >= max_results:
break
return deduped
def _is_postcode_relation_for(result: SearchResult, postcode: str) -> bool:
return result.source_table == SourceTable.PLACEX \
and result.osm_object is not None \
and result.osm_object[0] == 'R' \
and result.category == ('boundary', 'postal_code') \
and result.names is not None \
and result.names.get('ref') == postcode
def _deg(axis:str) -> str:
return f"(?P<{axis}_deg>\\d+\\.\\d+)°?"
def _deg_min(axis: str) -> str:
return f"(?P<{axis}_deg>\\d+)[°\\s]+(?P<{axis}_min>[\\d.]+)[']*"
def _deg_min_sec(axis: str) -> str:
return f"(?P<{axis}_deg>\\d+)[°\\s]+(?P<{axis}_min>\\d+)['\\s]+(?P<{axis}_sec>[\\d.]+)[\"″]*"
COORD_REGEX = [re.compile(r'(?:(?P<pre>.*?)\s+)??' + r + r'(?:\s+(?P<post>.*))?') for r in (
r"(?P<ns>[NS])\s*" + _deg('lat') + r"[\s,]+" + r"(?P<ew>[EW])\s*" + _deg('lon'),
_deg('lat') + r"\s*(?P<ns>[NS])[\s,]+" + _deg('lon') + r"\s*(?P<ew>[EW])",
r"(?P<ns>[NS])\s*" + _deg_min('lat') + r"[\s,]+" + r"(?P<ew>[EW])\s*" + _deg_min('lon'),
_deg_min('lat') + r"\s*(?P<ns>[NS])[\s,]+" + _deg_min('lon') + r"\s*(?P<ew>[EW])",
r"(?P<ns>[NS])\s*" + _deg_min_sec('lat') + r"[\s,]+" + r"(?P<ew>[EW])\s*" + _deg_min_sec('lon'),
_deg_min_sec('lat') + r"\s*(?P<ns>[NS])[\s,]+" + _deg_min_sec('lon') + r"\s*(?P<ew>[EW])",
r"\[?(?P<lat_deg>[+-]?\d+\.\d+)[\s,]+(?P<lon_deg>[+-]?\d+\.\d+)\]?"
)]
def extract_coords_from_query(query: str) -> Tuple[str, Optional[float], Optional[float]]:
""" Look for something that is formatted like a coordinate at the
beginning or end of the query. If found, extract the coordinate and
return the remaining query (or the empty string if the query
consisted of nothing but a coordinate).
Only the first match will be returned.
"""
for regex in COORD_REGEX:
match = regex.fullmatch(query)
if match is None:
continue
groups = match.groupdict()
if not groups['pre'] or not groups['post']:
x = float(groups['lon_deg']) \
+ float(groups.get('lon_min', 0.0)) / 60.0 \
+ float(groups.get('lon_sec', 0.0)) / 3600.0
if groups.get('ew') == 'W':
x = -x
y = float(groups['lat_deg']) \
+ float(groups.get('lat_min', 0.0)) / 60.0 \
+ float(groups.get('lat_sec', 0.0)) / 3600.0
if groups.get('ns') == 'S':
y = -y
return groups['pre'] or groups['post'] or '', x, y
return query, None, None
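# Illustrative behaviour (query strings are made up); the return value is
# (remaining query, longitude, latitude):
#   >>> extract_coords_from_query('52.5163, 13.3777')
#   ('', 13.3777, 52.5163)
#   >>> extract_coords_from_query('Main Street 52.5163 13.3777')
#   ('Main Street', 13.3777, 52.5163)
#   >>> extract_coords_from_query('Main Street 1')
#   ('Main Street 1', None, None)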
CATEGORY_REGEX = re.compile(r'(?P<pre>.*?)\[(?P<cls>[a-zA-Z_]+)=(?P<typ>[a-zA-Z_]+)\](?P<post>.*)')
def extract_category_from_query(query: str) -> Tuple[str, Optional[str], Optional[str]]:
""" Extract a hidden category specification of the form '[key=value]' from
the query. If found, extract key and value and
return the remaining query (or the empty string if the query
consisted of nothing but a category).
Only the first match will be returned.
"""
match = CATEGORY_REGEX.search(query)
if match is not None:
return (match.group('pre').strip() + ' ' + match.group('post').strip()).strip(), \
match.group('cls'), match.group('typ')
return query, None, None
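# Illustrative behaviour:
#   >>> extract_category_from_query('[amenity=restaurant] Berlin')
#   ('Berlin', 'amenity', 'restaurant')
#   >>> extract_category_from_query('Berlin')
#   ('Berlin', None, None)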

View File

@@ -0,0 +1,577 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Generic part of the server implementation of the v1 API.
Combine with the scaffolding provided for the various Python ASGI frameworks.
"""
from typing import Optional, Any, Type, Callable, NoReturn, Dict, cast
from functools import reduce
import abc
import dataclasses
import math
from urllib.parse import urlencode
import sqlalchemy as sa
from nominatim_core.errors import UsageError
from nominatim_core.config import Configuration
from .. import logging as loglib
from ..core import NominatimAPIAsync
from .format import dispatch as formatting
from .format import RawDataList
from ..types import DataLayer, GeometryFormat, PlaceRef, PlaceID, OsmID, Point
from ..status import StatusResult
from ..results import DetailedResult, ReverseResults, SearchResult, SearchResults
from ..localization import Locales
from . import helpers
CONTENT_TEXT = 'text/plain; charset=utf-8'
CONTENT_XML = 'text/xml; charset=utf-8'
CONTENT_HTML = 'text/html; charset=utf-8'
CONTENT_JSON = 'application/json; charset=utf-8'
CONTENT_TYPE = {'text': CONTENT_TEXT, 'xml': CONTENT_XML, 'debug': CONTENT_HTML}
class ASGIAdaptor(abc.ABC):
""" Adapter class for the different ASGI frameworks.
Wraps functionality over concrete requests and responses.
"""
content_type: str = CONTENT_TEXT
@abc.abstractmethod
def get(self, name: str, default: Optional[str] = None) -> Optional[str]:
""" Return an input parameter as a string. If the parameter was
not provided, return the 'default' value.
"""
@abc.abstractmethod
def get_header(self, name: str, default: Optional[str] = None) -> Optional[str]:
""" Return a HTTP header parameter as a string. If the parameter was
not provided, return the 'default' value.
"""
@abc.abstractmethod
def error(self, msg: str, status: int = 400) -> Exception:
""" Construct an appropriate exception from the given error message.
The exception must result in an HTTP error with the given status.
"""
@abc.abstractmethod
def create_response(self, status: int, output: str, num_results: int) -> Any:
""" Create a response from the given parameters. The result will
be returned by the endpoint functions. The adaptor may also
return None when the response is created internally with some
different means.
The response must return the given HTTP status code 'status', set
the HTTP content-type headers to the string provided and the
body of the response to 'output'.
"""
@abc.abstractmethod
def base_uri(self) -> str:
""" Return the URI of the original request.
"""
@abc.abstractmethod
def config(self) -> Configuration:
""" Return the current configuration object.
"""
def build_response(self, output: str, status: int = 200, num_results: int = 0) -> Any:
""" Create a response from the given output. Wraps a JSONP function
around the response, if necessary.
"""
if self.content_type == CONTENT_JSON and status == 200:
jsonp = self.get('json_callback')
if jsonp is not None:
if any(not part.isidentifier() for part in jsonp.split('.')):
self.raise_error('Invalid json_callback value')
output = f"{jsonp}({output})"
self.content_type = 'application/javascript; charset=utf-8'
return self.create_response(status, output, num_results)
def raise_error(self, msg: str, status: int = 400) -> NoReturn:
""" Raise an exception resulting in the given HTTP status and
message. The message will be formatted according to the
output format chosen by the request.
"""
if self.content_type == CONTENT_XML:
msg = f"""<?xml version="1.0" encoding="UTF-8" ?>
<error>
<code>{status}</code>
<message>{msg}</message>
</error>
"""
elif self.content_type == CONTENT_JSON:
msg = f"""{{"error":{{"code":{status},"message":"{msg}"}}}}"""
elif self.content_type == CONTENT_HTML:
loglib.log().section('Execution error')
loglib.log().var_dump('Status', status)
loglib.log().var_dump('Message', msg)
msg = loglib.get_and_disable()
raise self.error(msg, status)
def get_int(self, name: str, default: Optional[int] = None) -> int:
""" Return an input parameter as an int. Raises an exception if
the parameter is given but not in an integer format.
If 'default' is given, then it will be returned when the parameter
is missing completely. When 'default' is None, an error will be
raised on a missing parameter.
"""
value = self.get(name)
if value is None:
if default is not None:
return default
self.raise_error(f"Parameter '{name}' missing.")
try:
intval = int(value)
except ValueError:
self.raise_error(f"Parameter '{name}' must be a number.")
return intval
def get_float(self, name: str, default: Optional[float] = None) -> float:
""" Return an input parameter as a flaoting-point number. Raises an
exception if the parameter is given but not in an float format.
If 'default' is given, then it will be returned when the parameter
is missing completely. When 'default' is None, an error will be
raised on a missing parameter.
"""
value = self.get(name)
if value is None:
if default is not None:
return default
self.raise_error(f"Parameter '{name}' missing.")
try:
fval = float(value)
except ValueError:
self.raise_error(f"Parameter '{name}' must be a number.")
if math.isnan(fval) or math.isinf(fval):
self.raise_error(f"Parameter '{name}' must be a number.")
return fval
def get_bool(self, name: str, default: Optional[bool] = None) -> bool:
""" Return an input parameter as bool. Only '0' is accepted as
an input for 'false'; all other inputs will be interpreted as 'true'.
If 'default' is given, then it will be returned when the parameter
is missing completely. When 'default' is None, an error will be
raised on a missing parameter.
"""
value = self.get(name)
if value is None:
if default is not None:
return default
self.raise_error(f"Parameter '{name}' missing.")
return value != '0'
def get_accepted_languages(self) -> str:
""" Return the accepted languages.
"""
return self.get('accept-language')\
or self.get_header('accept-language')\
or self.config().DEFAULT_LANGUAGE
def setup_debugging(self) -> bool:
""" Set up collection of debug information if requested.
Return True when debugging was requested.
"""
if self.get_bool('debug', False):
loglib.set_log_output('html')
self.content_type = CONTENT_HTML
return True
return False
def get_layers(self) -> Optional[DataLayer]:
""" Return a parsed version of the layer parameter.
"""
param = self.get('layer', None)
if param is None:
return None
return cast(DataLayer,
reduce(DataLayer.__or__,
(getattr(DataLayer, s.upper()) for s in param.split(','))))
def parse_format(self, result_type: Type[Any], default: str) -> str:
""" Get and check the 'format' parameter and prepare the formatter.
`result_type` is the type of result to be returned by the function
and `default` the format value to assume when no parameter is present.
"""
fmt = self.get('format', default=default)
assert fmt is not None
if not formatting.supports_format(result_type, fmt):
self.raise_error("Parameter 'format' must be one of: " +
', '.join(formatting.list_formats(result_type)))
self.content_type = CONTENT_TYPE.get(fmt, CONTENT_JSON)
return fmt
def parse_geometry_details(self, fmt: str) -> Dict[str, Any]:
""" Create details structure from the supplied geometry parameters.
"""
numgeoms = 0
output = GeometryFormat.NONE
if self.get_bool('polygon_geojson', False):
output |= GeometryFormat.GEOJSON
numgeoms += 1
if fmt not in ('geojson', 'geocodejson'):
if self.get_bool('polygon_text', False):
output |= GeometryFormat.TEXT
numgeoms += 1
if self.get_bool('polygon_kml', False):
output |= GeometryFormat.KML
numgeoms += 1
if self.get_bool('polygon_svg', False):
output |= GeometryFormat.SVG
numgeoms += 1
if numgeoms > self.config().get_int('POLYGON_OUTPUT_MAX_TYPES'):
self.raise_error('Too many polygon output options selected.')
return {'address_details': True,
'geometry_simplification': self.get_float('polygon_threshold', 0.0),
'geometry_output': output
}
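# A minimal sketch of a concrete adaptor (hypothetical, for illustration only;
# the real framework bindings are not part of this module). An implementation
# only needs to map the abstract methods onto the framework's request and
# response objects, for example:
#
#   class DictAdaptor(ASGIAdaptor):
#       def __init__(self, params, headers, cfg):
#           self.params, self.headers, self.cfg = params, headers, cfg
#       def get(self, name, default=None):
#           return self.params.get(name, default)
#       def get_header(self, name, default=None):
#           return self.headers.get(name, default)
#       def error(self, msg, status=400):
#           return RuntimeError(f"{status}: {msg}")
#       def create_response(self, status, output, num_results):
#           return status, self.content_type, output
#       def base_uri(self):
#           return ''
#       def config(self):
#           return self.cfg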
async def status_endpoint(api: NominatimAPIAsync, params: ASGIAdaptor) -> Any:
""" Server glue for /status endpoint. See API docs for details.
"""
result = await api.status()
fmt = params.parse_format(StatusResult, 'text')
if fmt == 'text' and result.status:
status_code = 500
else:
status_code = 200
return params.build_response(formatting.format_result(result, fmt, {}),
status=status_code)
async def details_endpoint(api: NominatimAPIAsync, params: ASGIAdaptor) -> Any:
""" Server glue for /details endpoint. See API docs for details.
"""
fmt = params.parse_format(DetailedResult, 'json')
place_id = params.get_int('place_id', 0)
place: PlaceRef
if place_id:
place = PlaceID(place_id)
else:
osmtype = params.get('osmtype')
if osmtype is None:
params.raise_error("Missing ID parameter 'place_id' or 'osmtype'.")
place = OsmID(osmtype, params.get_int('osmid'), params.get('class'))
debug = params.setup_debugging()
locales = Locales.from_accept_languages(params.get_accepted_languages())
result = await api.details(place,
address_details=params.get_bool('addressdetails', False),
linked_places=params.get_bool('linkedplaces', True),
parented_places=params.get_bool('hierarchy', False),
keywords=params.get_bool('keywords', False),
geometry_output = GeometryFormat.GEOJSON
if params.get_bool('polygon_geojson', False)
else GeometryFormat.NONE,
locales=locales
)
if debug:
return params.build_response(loglib.get_and_disable())
if result is None:
params.raise_error('No place with that OSM ID found.', status=404)
output = formatting.format_result(result, fmt,
{'locales': locales,
'group_hierarchy': params.get_bool('group_hierarchy', False),
'icon_base_url': params.config().MAPICON_URL})
return params.build_response(output, num_results=1)
async def reverse_endpoint(api: NominatimAPIAsync, params: ASGIAdaptor) -> Any:
""" Server glue for /reverse endpoint. See API docs for details.
"""
fmt = params.parse_format(ReverseResults, 'xml')
debug = params.setup_debugging()
coord = Point(params.get_float('lon'), params.get_float('lat'))
details = params.parse_geometry_details(fmt)
details['max_rank'] = helpers.zoom_to_rank(params.get_int('zoom', 18))
details['layers'] = params.get_layers()
details['locales'] = Locales.from_accept_languages(params.get_accepted_languages())
result = await api.reverse(coord, **details)
if debug:
return params.build_response(loglib.get_and_disable(), num_results=1 if result else 0)
if fmt == 'xml':
queryparts = {'lat': str(coord.lat), 'lon': str(coord.lon), 'format': 'xml'}
zoom = params.get('zoom', None)
if zoom:
queryparts['zoom'] = zoom
query = urlencode(queryparts)
else:
query = ''
fmt_options = {'query': query,
'extratags': params.get_bool('extratags', False),
'namedetails': params.get_bool('namedetails', False),
'addressdetails': params.get_bool('addressdetails', True)}
output = formatting.format_result(ReverseResults([result] if result else []),
fmt, fmt_options)
return params.build_response(output, num_results=1 if result else 0)
async def lookup_endpoint(api: NominatimAPIAsync, params: ASGIAdaptor) -> Any:
""" Server glue for /lookup endpoint. See API docs for details.
"""
fmt = params.parse_format(SearchResults, 'xml')
debug = params.setup_debugging()
details = params.parse_geometry_details(fmt)
details['locales'] = Locales.from_accept_languages(params.get_accepted_languages())
places = []
for oid in (params.get('osm_ids') or '').split(','):
oid = oid.strip()
if len(oid) > 1 and oid[0] in 'RNWrnw' and oid[1:].isdigit():
places.append(OsmID(oid[0].upper(), int(oid[1:])))
if len(places) > params.config().get_int('LOOKUP_MAX_COUNT'):
params.raise_error('Too many object IDs.')
if places:
results = await api.lookup(places, **details)
else:
results = SearchResults()
if debug:
return params.build_response(loglib.get_and_disable(), num_results=len(results))
fmt_options = {'extratags': params.get_bool('extratags', False),
'namedetails': params.get_bool('namedetails', False),
'addressdetails': params.get_bool('addressdetails', True)}
output = formatting.format_result(results, fmt, fmt_options)
return params.build_response(output, num_results=len(results))
async def _unstructured_search(query: str, api: NominatimAPIAsync,
details: Dict[str, Any]) -> SearchResults:
if not query:
return SearchResults()
# Extract special format for coordinates from query.
query, x, y = helpers.extract_coords_from_query(query)
if x is not None:
assert y is not None
details['near'] = Point(x, y)
details['near_radius'] = 0.1
# If no query is left, revert to reverse search.
if x is not None and not query:
result = await api.reverse(details['near'], **details)
if not result:
return SearchResults()
return SearchResults(
[SearchResult(**{f.name: getattr(result, f.name)
for f in dataclasses.fields(SearchResult)
if hasattr(result, f.name)})])
query, cls, typ = helpers.extract_category_from_query(query)
if cls is not None:
assert typ is not None
return await api.search_category([(cls, typ)], near_query=query, **details)
return await api.search(query, **details)
async def search_endpoint(api: NominatimAPIAsync, params: ASGIAdaptor) -> Any:
""" Server glue for /search endpoint. See API docs for details.
"""
fmt = params.parse_format(SearchResults, 'jsonv2')
debug = params.setup_debugging()
details = params.parse_geometry_details(fmt)
details['countries'] = params.get('countrycodes', None)
details['excluded'] = params.get('exclude_place_ids', None)
details['viewbox'] = params.get('viewbox', None) or params.get('viewboxlbrt', None)
details['bounded_viewbox'] = params.get_bool('bounded', False)
details['dedupe'] = params.get_bool('dedupe', True)
max_results = max(1, min(50, params.get_int('limit', 10)))
details['max_results'] = max_results + min(10, max_results) \
if details['dedupe'] else max_results
details['min_rank'], details['max_rank'] = \
helpers.feature_type_to_rank(params.get('featureType', ''))
if params.get('featureType', None) is not None:
details['layers'] = DataLayer.ADDRESS
else:
details['layers'] = params.get_layers()
details['locales'] = Locales.from_accept_languages(params.get_accepted_languages())
# unstructured query parameters
query = params.get('q', None)
# structured query parameters
queryparts = {}
for key in ('amenity', 'street', 'city', 'county', 'state', 'postalcode', 'country'):
details[key] = params.get(key, None)
if details[key]:
queryparts[key] = details[key]
try:
if query is not None:
if queryparts:
params.raise_error("Structured query parameters"
"(amenity, street, city, county, state, postalcode, country)"
" cannot be used together with 'q' parameter.")
queryparts['q'] = query
results = await _unstructured_search(query, api, details)
else:
query = ', '.join(queryparts.values())
results = await api.search_address(**details)
except UsageError as err:
params.raise_error(str(err))
if details['dedupe'] and len(results) > 1:
results = helpers.deduplicate_results(results, max_results)
if debug:
return params.build_response(loglib.get_and_disable(), num_results=len(results))
if fmt == 'xml':
helpers.extend_query_parts(queryparts, details,
params.get('featureType', ''),
params.get_bool('namedetails', False),
params.get_bool('extratags', False),
(str(r.place_id) for r in results if r.place_id))
queryparts['format'] = fmt
moreurl = params.base_uri() + '/search?' + urlencode(queryparts)
else:
moreurl = ''
fmt_options = {'query': query, 'more_url': moreurl,
'exclude_place_ids': queryparts.get('exclude_place_ids'),
'viewbox': queryparts.get('viewbox'),
'extratags': params.get_bool('extratags', False),
'namedetails': params.get_bool('namedetails', False),
'addressdetails': params.get_bool('addressdetails', False)}
output = formatting.format_result(results, fmt, fmt_options)
return params.build_response(output, num_results=len(results))
async def deletable_endpoint(api: NominatimAPIAsync, params: ASGIAdaptor) -> Any:
""" Server glue for /deletable endpoint.
This is a special endpoint that shows polygons that have been
deleted or are broken in the OSM data but are kept in the
Nominatim database to minimize disruption.
"""
fmt = params.parse_format(RawDataList, 'json')
async with api.begin() as conn:
sql = sa.text(""" SELECT p.place_id, country_code,
name->'name' as name, i.*
FROM placex p, import_polygon_delete i
WHERE p.osm_id = i.osm_id AND p.osm_type = i.osm_type
AND p.class = i.class AND p.type = i.type
""")
results = RawDataList(r._asdict() for r in await conn.execute(sql))
return params.build_response(formatting.format_result(results, fmt, {}))
async def polygons_endpoint(api: NominatimAPIAsync, params: ASGIAdaptor) -> Any:
""" Server glue for /polygons endpoint.
This is a special endpoint that shows polygons that have changed
their size but are kept in the Nominatim database with their
old area to minimize disruption.
"""
fmt = params.parse_format(RawDataList, 'json')
sql_params: Dict[str, Any] = {
'days': params.get_int('days', -1),
'cls': params.get('class')
}
reduced = params.get_bool('reduced', False)
async with api.begin() as conn:
sql = sa.select(sa.text("""osm_type, osm_id, class, type,
name->'name' as name,
country_code, errormessage, updated"""))\
.select_from(sa.text('import_polygon_error'))
if sql_params['days'] > 0:
sql = sql.where(sa.text("updated > 'now'::timestamp - make_interval(days => :days)"))
if reduced:
sql = sql.where(sa.text("errormessage like 'Area reduced%'"))
if sql_params['cls'] is not None:
sql = sql.where(sa.text("class = :cls"))
sql = sql.order_by(sa.literal_column('updated').desc()).limit(1000)
results = RawDataList(r._asdict() for r in await conn.execute(sql, sql_params))
return params.build_response(formatting.format_result(results, fmt, {}))
EndpointFunc = Callable[[NominatimAPIAsync, ASGIAdaptor], Any]
ROUTES = [
('status', status_endpoint),
('details', details_endpoint),
('reverse', reverse_endpoint),
('lookup', lookup_endpoint),
('search', search_endpoint),
('deletable', deletable_endpoint),
('polygons', polygons_endpoint),
]

View File

@@ -0,0 +1,11 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Version information for the Nominatim API.
"""
NOMINATIM_API_VERSION = '4.4.99'

View File

View File

@@ -0,0 +1,374 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2022 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Nominatim configuration accessor.
"""
from typing import Dict, Any, List, Mapping, Optional
import importlib.util
import logging
import os
import sys
from pathlib import Path
import json
import yaml
from dotenv import dotenv_values
from psycopg2.extensions import parse_dsn
from .typing import StrPath
from .errors import UsageError
from . import paths
LOG = logging.getLogger()
CONFIG_CACHE : Dict[str, Any] = {}
def flatten_config_list(content: Any, section: str = '') -> List[Any]:
""" Flatten YAML configuration lists that contain include sections
which are lists themselves.
"""
if not content:
return []
if not isinstance(content, list):
raise UsageError(f"List expected in section '{section}'.")
output = []
for ele in content:
if isinstance(ele, list):
output.extend(flatten_config_list(ele, section))
else:
output.append(ele)
return output
class Configuration:
""" This class wraps access to the configuration settings
for the Nominatim instance in use.
All Nominatim configuration options are prefixed with 'NOMINATIM_' to
avoid conflicts with other environment variables. All settings can
be accessed as properties of the class under the same name as the
setting but with the `NOMINATIM_` prefix removed. In addition, there
are accessor functions that convert the setting values to types
other than string.
"""
def __init__(self, project_dir: Optional[Path],
environ: Optional[Mapping[str, str]] = None) -> None:
self.environ = environ or os.environ
self.project_dir = project_dir
self.config_dir = paths.CONFIG_DIR
self._config = dotenv_values(str(self.config_dir / 'env.defaults'))
if self.project_dir is not None and (self.project_dir / '.env').is_file():
self.project_dir = self.project_dir.resolve()
self._config.update(dotenv_values(str(self.project_dir / '.env')))
class _LibDirs:
module: Path
osm2pgsql: Path
php = paths.PHPLIB_DIR
sql = paths.SQLLIB_DIR
data = paths.DATA_DIR
self.lib_dir = _LibDirs()
self._private_plugins: Dict[str, object] = {}
def set_libdirs(self, **kwargs: StrPath) -> None:
""" Set paths to library functions and data.
"""
for key, value in kwargs.items():
setattr(self.lib_dir, key, None if value is None else Path(value))
def __getattr__(self, name: str) -> str:
name = 'NOMINATIM_' + name
if name in self.environ:
return self.environ[name]
return self._config[name] or ''
def get_bool(self, name: str) -> bool:
""" Return the given configuration parameter as a boolean.
Parameters:
name: Name of the configuration parameter with the NOMINATIM_
prefix removed.
Returns:
`True` for values of '1', 'yes' and 'true', `False` otherwise.
"""
return getattr(self, name).lower() in ('1', 'yes', 'true')
def get_int(self, name: str) -> int:
""" Return the given configuration parameter as an int.
Parameters:
name: Name of the configuration parameter with the NOMINATIM_
prefix removed.
Returns:
The configuration value converted to int.
Raises:
ValueError: when the value is not a number.
"""
try:
return int(getattr(self, name))
except ValueError as exp:
LOG.fatal("Invalid setting NOMINATIM_%s. Needs to be a number.", name)
raise UsageError("Configuration error.") from exp
def get_str_list(self, name: str) -> Optional[List[str]]:
""" Return the given configuration parameter as a list of strings.
The values are assumed to be given as a comma-separated list and
will be stripped before returning them.
Parameters:
name: Name of the configuration parameter with the NOMINATIM_
prefix removed.
Returns:
(List[str]): The comma-split parameter as a list. The
elements are stripped of leading and final spaces before
being returned.
(None): The configuration parameter was unset or empty.
"""
raw = getattr(self, name)
return [v.strip() for v in raw.split(',')] if raw else None
def get_path(self, name: str) -> Optional[Path]:
""" Return the given configuration parameter as a Path.
Parameters:
name: Name of the configuration parameter with the NOMINATIM_
prefix removed.
Returns:
(Path): A Path object of the parameter value.
If a relative path is configured, then the function converts this
into an absolute path with the project directory as root path.
(None): The configuration parameter was unset or empty.
"""
value = getattr(self, name)
if not value:
return None
cfgpath = Path(value)
if not cfgpath.is_absolute():
assert self.project_dir is not None
cfgpath = self.project_dir / cfgpath
return cfgpath.resolve()
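# Illustrative use of the accessors (the setting names below are assumptions
# used purely as examples):
#   config = Configuration(Path('/srv/nominatim-project'))
#   config.DATABASE_DSN                      # raw string, 'NOMINATIM_' prefix dropped
#   config.get_bool('USE_US_TIGER_DATA')     # '1'/'yes'/'true' -> True
#   config.get_str_list('LANGUAGES')         # 'de,en' -> ['de', 'en']
#   config.get_path('FLATNODE_FILE')         # relative paths resolved against the project dir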
def get_libpq_dsn(self) -> str:
""" Get configured database DSN converted into the key/value format
understood by libpq and psycopg.
"""
dsn = self.DATABASE_DSN
def quote_param(param: str) -> str:
key, val = param.split('=')
val = val.replace('\\', '\\\\').replace("'", "\\'")
if ' ' in val:
val = "'" + val + "'"
return key + '=' + val
if dsn.startswith('pgsql:'):
# Old PHP DSN format. Convert before returning.
return ' '.join([quote_param(p) for p in dsn[6:].split(';')])
return dsn
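# For example, a legacy PHP-style setting
#   NOMINATIM_DATABASE_DSN="pgsql:dbname=nominatim;host=localhost;password=secret word"
# is converted to the libpq form
#   "dbname=nominatim host=localhost password='secret word'"
# while key/value DSNs are passed through unchanged.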
def get_database_params(self) -> Mapping[str, str]:
""" Get the configured parameters for the database connection
as a mapping.
"""
dsn = self.DATABASE_DSN
if dsn.startswith('pgsql:'):
return dict((p.split('=', 1) for p in dsn[6:].split(';')))
return parse_dsn(dsn)
def get_import_style_file(self) -> Path:
""" Return the import style file as a path object. Translates the
name of the standard styles automatically into the corresponding
file in the configuration directory.
"""
style = getattr(self, 'IMPORT_STYLE')
if style in ('admin', 'street', 'address', 'full', 'extratags'):
return self.config_dir / f'import-{style}.lua'
return self.find_config_file('', 'IMPORT_STYLE')
def get_os_env(self) -> Dict[str, str]:
""" Return a copy of the OS environment with the Nominatim configuration
merged in.
"""
env = {k: v for k, v in self._config.items() if v is not None}
env.update(self.environ)
return env
def load_sub_configuration(self, filename: StrPath,
config: Optional[str] = None) -> Any:
""" Load additional configuration from a file. `filename` is the name
of the configuration file. The file is first searched in the
project directory and then in the global settings directory.
If `config` is set, then the name of the configuration file can
be additionally given through a .env configuration option. When
the option is set, the file is loaded exclusively from the configured location:
if the name is an absolute path, the file name is taken as is,
if the name is relative, it is taken to be relative to the
project directory.
The format of the file is determined from the filename suffix.
Currently only files with extension '.yaml' are supported.
YAML files support a special '!include' construct. When the
directive is given, the value is taken to be a filename; the file
is loaded using this function and inserted at that position in the
configuration tree.
"""
configfile = self.find_config_file(filename, config)
if str(configfile) in CONFIG_CACHE:
return CONFIG_CACHE[str(configfile)]
if configfile.suffix in ('.yaml', '.yml'):
result = self._load_from_yaml(configfile)
elif configfile.suffix == '.json':
with configfile.open('r', encoding='utf-8') as cfg:
result = json.load(cfg)
else:
raise UsageError(f"Config file '{configfile}' has unknown format.")
CONFIG_CACHE[str(configfile)] = result
return result
def load_plugin_module(self, module_name: str, internal_path: str) -> Any:
""" Load a Python module as a plugin.
The module_name may have three variants:
* A name without any '.' is assumed to be an internal module
and will be searched relative to `internal_path`.
* If the name ends in `.py`, module_name is assumed to be a
file name relative to the project directory.
* Any other name is assumed to be an absolute module name.
In all variants the module name must start with a letter.
"""
if not module_name or not module_name[0].isidentifier():
raise UsageError(f'Invalid module name {module_name}')
if '.' not in module_name:
module_name = module_name.replace('-', '_')
full_module = f'{internal_path}.{module_name}'
return sys.modules.get(full_module) or importlib.import_module(full_module)
if module_name.endswith('.py'):
if self.project_dir is None or not (self.project_dir / module_name).exists():
raise UsageError(f"Cannot find module '{module_name}' in project directory.")
if module_name in self._private_plugins:
return self._private_plugins[module_name]
file_path = str(self.project_dir / module_name)
spec = importlib.util.spec_from_file_location(module_name, file_path)
if spec:
module = importlib.util.module_from_spec(spec)
# Do not add to global modules because there is no standard
# module name that Python can resolve.
self._private_plugins[module_name] = module
assert spec.loader is not None
spec.loader.exec_module(module)
return module
return sys.modules.get(module_name) or importlib.import_module(module_name)
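# Illustrative calls covering the three variants (all module and path names
# are assumptions):
#   config.load_plugin_module('fancy', 'nominatim_db.tokenizer')             # internal module
#   config.load_plugin_module('plugins/fancy.py', 'nominatim_db.tokenizer')  # file in project dir
#   config.load_plugin_module('mypackage.fancy', 'nominatim_db.tokenizer')   # absolute module name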
def find_config_file(self, filename: StrPath,
config: Optional[str] = None) -> Path:
""" Resolve the location of a configuration file given a filename and
an optional configuration option with the file name.
Raises a UsageError when the file cannot be found or is not
a regular file.
"""
if config is not None:
cfg_value = getattr(self, config)
if cfg_value:
cfg_filename = Path(cfg_value)
if cfg_filename.is_absolute():
cfg_filename = cfg_filename.resolve()
if not cfg_filename.is_file():
LOG.fatal("Cannot find config file '%s'.", cfg_filename)
raise UsageError("Config file not found.")
return cfg_filename
filename = cfg_filename
search_paths = [self.project_dir, self.config_dir]
for path in search_paths:
if path is not None and (path / filename).is_file():
return path / filename
LOG.fatal("Configuration file '%s' not found.\nDirectories searched: %s",
filename, search_paths)
raise UsageError("Config file not found.")
def _load_from_yaml(self, cfgfile: Path) -> Any:
""" Load a YAML configuration file. This installs a special handler that
allows other YAML files to be included using the '!include' operator.
"""
yaml.add_constructor('!include', self._yaml_include_representer,
Loader=yaml.SafeLoader)
return yaml.safe_load(cfgfile.read_text(encoding='utf-8'))
def _yaml_include_representer(self, loader: Any, node: yaml.Node) -> Any:
""" Handler for the '!include' operator in YAML files.
When the filename is relative, then the file is first searched in the
project directory and then in the global settings directory.
"""
fname = loader.construct_scalar(node)
if Path(fname).is_absolute():
configfile = Path(fname)
else:
configfile = self.find_config_file(loader.construct_scalar(node))
if configfile.suffix != '.yaml':
LOG.fatal("Format error while reading '%s': only YAML format supported.",
configfile)
raise UsageError("Cannot handle config file format.")
return yaml.safe_load(configfile.read_text(encoding='utf-8'))

View File

View File

@@ -0,0 +1,236 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
""" Non-blocking database connections.
"""
from typing import Callable, Any, Optional, Iterator, Sequence
import logging
import select
import time
import psycopg2
from psycopg2.extras import wait_select
# psycopg2 emits different exceptions pre and post 2.8. Detect if the new error
# module is available and adapt the error handling accordingly.
try:
import psycopg2.errors # pylint: disable=no-name-in-module,import-error
__has_psycopg2_errors__ = True
except ImportError:
__has_psycopg2_errors__ = False
from ..typing import T_cursor, Query
LOG = logging.getLogger()
class DeadlockHandler:
""" Context manager that catches deadlock exceptions and calls
the given handler function. All other exceptions are passed on
normally.
"""
def __init__(self, handler: Callable[[], None], ignore_sql_errors: bool = False) -> None:
self.handler = handler
self.ignore_sql_errors = ignore_sql_errors
def __enter__(self) -> 'DeadlockHandler':
return self
def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> bool:
if __has_psycopg2_errors__:
if exc_type == psycopg2.errors.DeadlockDetected: # pylint: disable=E1101
self.handler()
return True
elif exc_type == psycopg2.extensions.TransactionRollbackError \
and exc_value.pgcode == '40P01':
self.handler()
return True
if self.ignore_sql_errors and isinstance(exc_value, psycopg2.Error):
LOG.info("SQL error ignored: %s", exc_value)
return True
return False
class DBConnection:
""" A single non-blocking database connection.
"""
def __init__(self, dsn: str,
cursor_factory: Optional[Callable[..., T_cursor]] = None,
ignore_sql_errors: bool = False) -> None:
self.dsn = dsn
self.current_query: Optional[Query] = None
self.current_params: Optional[Sequence[Any]] = None
self.ignore_sql_errors = ignore_sql_errors
self.conn: Optional['psycopg2._psycopg.connection'] = None
self.cursor: Optional['psycopg2._psycopg.cursor'] = None
self.connect(cursor_factory=cursor_factory)
def close(self) -> None:
""" Close all open connections. Does not wait for pending requests.
"""
if self.conn is not None:
if self.cursor is not None:
self.cursor.close()
self.cursor = None
self.conn.close()
self.conn = None
def connect(self, cursor_factory: Optional[Callable[..., T_cursor]] = None) -> None:
""" (Re)connect to the database. Creates an asynchronous connection
with JIT and parallel processing disabled. If a connection was
already open, it is closed and a new connection established.
The caller must ensure that no query is pending before reconnecting.
"""
self.close()
# Use a dict to hand in the parameters because async is a reserved
# word in Python3.
self.conn = psycopg2.connect(**{'dsn': self.dsn, 'async': True}) # type: ignore
assert self.conn
self.wait()
if cursor_factory is not None:
self.cursor = self.conn.cursor(cursor_factory=cursor_factory)
else:
self.cursor = self.conn.cursor()
# Disable JIT and parallel workers as they are known to cause problems.
# Update pg_settings instead of using SET because it does not yield
# errors on older versions of Postgres where the settings are not
# implemented.
self.perform(
""" UPDATE pg_settings SET setting = -1 WHERE name = 'jit_above_cost';
UPDATE pg_settings SET setting = 0
WHERE name = 'max_parallel_workers_per_gather';""")
self.wait()
def _deadlock_handler(self) -> None:
LOG.info("Deadlock detected (params = %s), retry.", str(self.current_params))
assert self.cursor is not None
assert self.current_query is not None
assert self.current_params is not None
self.cursor.execute(self.current_query, self.current_params)
def wait(self) -> None:
""" Block until any pending operation is done.
"""
while True:
with DeadlockHandler(self._deadlock_handler, self.ignore_sql_errors):
wait_select(self.conn)
self.current_query = None
return
def perform(self, sql: Query, args: Optional[Sequence[Any]] = None) -> None:
""" Send SQL query to the server. Returns immediately without
blocking.
"""
assert self.cursor is not None
self.current_query = sql
self.current_params = args
self.cursor.execute(sql, args)
def fileno(self) -> int:
""" File descriptor to wait for. (Makes this class select()able.)
"""
assert self.conn is not None
return self.conn.fileno()
def is_done(self) -> bool:
""" Check if the connection is available for a new query.
Also checks if the previous query has run into a deadlock.
If so, then the previous query is repeated.
"""
assert self.conn is not None
if self.current_query is None:
return True
with DeadlockHandler(self._deadlock_handler, self.ignore_sql_errors):
if self.conn.poll() == psycopg2.extensions.POLL_OK:
self.current_query = None
return True
return False
class WorkerPool:
""" A pool of asynchronous database connections.
The pool may be used as a context manager.
"""
REOPEN_CONNECTIONS_AFTER = 100000
def __init__(self, dsn: str, pool_size: int, ignore_sql_errors: bool = False) -> None:
self.threads = [DBConnection(dsn, ignore_sql_errors=ignore_sql_errors)
for _ in range(pool_size)]
self.free_workers = self._yield_free_worker()
self.wait_time = 0.0
def finish_all(self) -> None:
""" Wait for all connection to finish.
"""
for thread in self.threads:
while not thread.is_done():
thread.wait()
self.free_workers = self._yield_free_worker()
def close(self) -> None:
""" Close all connections and clear the pool.
"""
for thread in self.threads:
thread.close()
self.threads = []
self.free_workers = iter([])
def next_free_worker(self) -> DBConnection:
""" Get the next free connection.
"""
return next(self.free_workers)
def _yield_free_worker(self) -> Iterator[DBConnection]:
ready = self.threads
command_stat = 0
while True:
for thread in ready:
if thread.is_done():
command_stat += 1
yield thread
if command_stat > self.REOPEN_CONNECTIONS_AFTER:
self._reconnect_threads()
ready = self.threads
command_stat = 0
else:
tstart = time.time()
_, ready, _ = select.select([], self.threads, [])
self.wait_time += time.time() - tstart
def _reconnect_threads(self) -> None:
for thread in self.threads:
while not thread.is_done():
thread.wait()
thread.connect()
def __enter__(self) -> 'WorkerPool':
return self
def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None:
self.finish_all()
self.close()
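# A minimal usage sketch (the SQL and batching below are assumptions, shown
# for illustration only): distribute independent statements over the pool.
#
#   with WorkerPool(dsn, pool_size=4) as pool:
#       for batch in batches:
#           pool.next_free_worker().perform(
#               'UPDATE placex SET indexed_status = 2 WHERE place_id = ANY(%s)',
#               (batch, ))
#   # leaving the block waits for all pending queries and closes the connections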

View File

@@ -0,0 +1,21 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Import the base library to use with asynchronous SQLAlchemy.
"""
# pylint: disable=invalid-name
from typing import Any
try:
import psycopg
PGCORE_LIB = 'psycopg'
PGCORE_ERROR: Any = psycopg.Error
except ModuleNotFoundError:
import asyncpg
PGCORE_LIB = 'asyncpg'
PGCORE_ERROR = asyncpg.PostgresError

View File

@@ -0,0 +1,254 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Specialised connection and cursor functions.
"""
from typing import Optional, Any, Callable, ContextManager, Dict, cast, overload, Tuple, Iterable
import contextlib
import logging
import os
import psycopg2
import psycopg2.extensions
import psycopg2.extras
from psycopg2 import sql as pysql
from ..typing import SysEnv, Query, T_cursor
from ..errors import UsageError
LOG = logging.getLogger()
class Cursor(psycopg2.extras.DictCursor):
""" A cursor returning dict-like objects and providing specialised
execution functions.
"""
# pylint: disable=arguments-renamed,arguments-differ
def execute(self, query: Query, args: Any = None) -> None:
""" Query execution that logs the SQL query when debugging is enabled.
"""
if LOG.isEnabledFor(logging.DEBUG):
LOG.debug(self.mogrify(query, args).decode('utf-8'))
super().execute(query, args)
def execute_values(self, sql: Query, argslist: Iterable[Tuple[Any, ...]],
template: Optional[Query] = None) -> None:
""" Wrapper for the psycopg2 convenience function to execute
SQL for a list of values.
"""
LOG.debug("SQL execute_values(%s, %s)", sql, argslist)
psycopg2.extras.execute_values(self, sql, argslist, template=template)
def scalar(self, sql: Query, args: Any = None) -> Any:
""" Execute query that returns a single value. The value is returned.
If the query yields more than one row, a ValueError is raised.
"""
self.execute(sql, args)
if self.rowcount != 1:
raise RuntimeError("Query did not return a single row.")
result = self.fetchone()
assert result is not None
return result[0]
def drop_table(self, name: str, if_exists: bool = True, cascade: bool = False) -> None:
""" Drop the table with the given name.
Set `if_exists` to False if a non-existent table should raise
an exception instead of just being ignored. If 'cascade' is set
to True then all dependent tables are deleted as well.
"""
sql = 'DROP TABLE '
if if_exists:
sql += 'IF EXISTS '
sql += '{}'
if cascade:
sql += ' CASCADE'
self.execute(pysql.SQL(sql).format(pysql.Identifier(name)))
class Connection(psycopg2.extensions.connection):
""" A connection that provides the specialised cursor by default and
adds convenience functions for administering the database.
"""
@overload # type: ignore[override]
def cursor(self) -> Cursor:
...
@overload
def cursor(self, name: str) -> Cursor:
...
@overload
def cursor(self, cursor_factory: Callable[..., T_cursor]) -> T_cursor:
...
def cursor(self, cursor_factory = Cursor, **kwargs): # type: ignore
""" Return a new cursor. By default the specialised cursor is returned.
"""
return super().cursor(cursor_factory=cursor_factory, **kwargs)
def table_exists(self, table: str) -> bool:
""" Check that a table with the given name exists in the database.
"""
with self.cursor() as cur:
num = cur.scalar("""SELECT count(*) FROM pg_tables
WHERE tablename = %s and schemaname = 'public'""", (table, ))
return num == 1 if isinstance(num, int) else False
def table_has_column(self, table: str, column: str) -> bool:
""" Check if the table 'table' exists and has a column with name 'column'.
"""
with self.cursor() as cur:
has_column = cur.scalar("""SELECT count(*) FROM information_schema.columns
WHERE table_name = %s
and column_name = %s""",
(table, column))
return has_column > 0 if isinstance(has_column, int) else False
def index_exists(self, index: str, table: Optional[str] = None) -> bool:
""" Check that an index with the given name exists in the database.
If table is not None then the index must relate to the given
table.
"""
with self.cursor() as cur:
cur.execute("""SELECT tablename FROM pg_indexes
WHERE indexname = %s and schemaname = 'public'""", (index, ))
if cur.rowcount == 0:
return False
if table is not None:
row = cur.fetchone()
if row is None or not isinstance(row[0], str):
return False
return row[0] == table
return True
def drop_table(self, name: str, if_exists: bool = True, cascade: bool = False) -> None:
""" Drop the table with the given name.
Set `if_exists` to False if a non-existent table should raise
an exception instead of just being ignored.
"""
with self.cursor() as cur:
cur.drop_table(name, if_exists, cascade)
self.commit()
def server_version_tuple(self) -> Tuple[int, int]:
""" Return the server version as a tuple of (major, minor).
Converts correctly for pre-10 and post-10 PostgreSQL versions.
"""
version = self.server_version
if version < 100000:
return (int(version / 10000), int((version % 10000) / 100))
return (int(version / 10000), version % 10000)
def postgis_version_tuple(self) -> Tuple[int, int]:
""" Return the postgis version installed in the database as a
tuple of (major, minor). Assumes that the PostGIS extension
has been installed already.
"""
with self.cursor() as cur:
version = cur.scalar('SELECT postgis_lib_version()')
version_parts = version.split('.')
if len(version_parts) < 2:
raise UsageError(f"Error fetching Postgis version. Bad format: {version}")
return (int(version_parts[0]), int(version_parts[1]))
def extension_loaded(self, extension_name: str) -> bool:
""" Return True if the hstore extension is loaded in the database.
"""
with self.cursor() as cur:
cur.execute('SELECT extname FROM pg_extension WHERE extname = %s', (extension_name, ))
return cur.rowcount > 0
class ConnectionContext(ContextManager[Connection]):
""" Context manager of the connection that also provides direct access
to the underlying connection.
"""
connection: Connection
def connect(dsn: str) -> ConnectionContext:
""" Open a connection to the database using the specialised connection
factory. The returned object may be used in conjunction with 'with'.
When used outside a context manager, use the `connection` attribute
to get the connection.
"""
try:
conn = psycopg2.connect(dsn, connection_factory=Connection)
ctxmgr = cast(ConnectionContext, contextlib.closing(conn))
ctxmgr.connection = conn
return ctxmgr
except psycopg2.OperationalError as err:
raise UsageError(f"Cannot connect to database: {err}") from err
# Translation from PG connection string parameters to PG environment variables.
# Derived from https://www.postgresql.org/docs/current/libpq-envars.html.
_PG_CONNECTION_STRINGS = {
'host': 'PGHOST',
'hostaddr': 'PGHOSTADDR',
'port': 'PGPORT',
'dbname': 'PGDATABASE',
'user': 'PGUSER',
'password': 'PGPASSWORD',
'passfile': 'PGPASSFILE',
'channel_binding': 'PGCHANNELBINDING',
'service': 'PGSERVICE',
'options': 'PGOPTIONS',
'application_name': 'PGAPPNAME',
'sslmode': 'PGSSLMODE',
'requiressl': 'PGREQUIRESSL',
'sslcompression': 'PGSSLCOMPRESSION',
'sslcert': 'PGSSLCERT',
'sslkey': 'PGSSLKEY',
'sslrootcert': 'PGSSLROOTCERT',
'sslcrl': 'PGSSLCRL',
'requirepeer': 'PGREQUIREPEER',
'ssl_min_protocol_version': 'PGSSLMINPROTOCOLVERSION',
'ssl_max_protocol_version': 'PGSSLMAXPROTOCOLVERSION',
'gssencmode': 'PGGSSENCMODE',
'krbsrvname': 'PGKRBSRVNAME',
'gsslib': 'PGGSSLIB',
'connect_timeout': 'PGCONNECT_TIMEOUT',
'target_session_attrs': 'PGTARGETSESSIONATTRS',
}
def get_pg_env(dsn: str,
base_env: Optional[SysEnv] = None) -> Dict[str, str]:
""" Return a copy of `base_env` with the environment variables for
PostgreSQL set up from the given database connection string.
If `base_env` is None, then the OS environment is used as a base
environment.
"""
env = dict(base_env if base_env is not None else os.environ)
for param, value in psycopg2.extensions.parse_dsn(dsn).items():
if param in _PG_CONNECTION_STRINGS:
env[_PG_CONNECTION_STRINGS[param]] = value
else:
LOG.error("Unknown connection parameter '%s' ignored.", param)
return env
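def _example_child_env() -> Dict[str, str]:
    """ A minimal usage sketch (illustrative only; the connection string is
        an assumption): build an environment for a child process such as
        psql from a libpq connection string.
    """
    env = get_pg_env('dbname=nominatim user=www-data host=localhost')
    assert env['PGDATABASE'] == 'nominatim'
    return env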

View File

@@ -0,0 +1,47 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Query and access functions for the in-database property table.
"""
from typing import Optional, cast
from .connection import Connection
def set_property(conn: Connection, name: str, value: str) -> None:
""" Add or replace the property with the given name.
"""
with conn.cursor() as cur:
cur.execute('SELECT value FROM nominatim_properties WHERE property = %s',
(name, ))
if cur.rowcount == 0:
sql = 'INSERT INTO nominatim_properties (value, property) VALUES (%s, %s)'
else:
sql = 'UPDATE nominatim_properties SET value = %s WHERE property = %s'
cur.execute(sql, (value, name))
conn.commit()
def get_property(conn: Connection, name: str) -> Optional[str]:
""" Return the current value of the given property or None if the property
is not set.
"""
if not conn.table_exists('nominatim_properties'):
return None
with conn.cursor() as cur:
cur.execute('SELECT value FROM nominatim_properties WHERE property = %s',
(name, ))
if cur.rowcount == 0:
return None
result = cur.fetchone()
assert result is not None
return cast(Optional[str], result[0])
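def _example_property_roundtrip(conn: Connection) -> None:
    """ A minimal usage sketch (illustrative only; the property name is an
        assumption): store a value and read it back.
    """
    set_property(conn, 'database_version', '4.4.99')
    assert get_property(conn, 'database_version') == '4.4.99'
    assert get_property(conn, 'no_such_property') is None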

View File

@@ -0,0 +1,143 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Preprocessing of SQL files.
"""
from typing import Set, Dict, Any, cast
import jinja2
from .connection import Connection
from .async_connection import WorkerPool
from ..config import Configuration
def _get_partitions(conn: Connection) -> Set[int]:
""" Get the set of partitions currently in use.
"""
with conn.cursor() as cur:
cur.execute('SELECT DISTINCT partition FROM country_name')
partitions = set([0])
for row in cur:
partitions.add(row[0])
return partitions
def _get_tables(conn: Connection) -> Set[str]:
""" Return the set of tables currently in use.
"""
with conn.cursor() as cur:
cur.execute("SELECT tablename FROM pg_tables WHERE schemaname = 'public'")
return set((row[0] for row in list(cur)))
def _get_middle_db_format(conn: Connection, tables: Set[str]) -> str:
""" Returns the version of the slim middle tables.
"""
if 'osm2pgsql_properties' not in tables:
return '1'
with conn.cursor() as cur:
cur.execute("SELECT value FROM osm2pgsql_properties WHERE property = 'db_format'")
row = cur.fetchone()
return cast(str, row[0]) if row is not None else '1'
def _setup_tablespace_sql(config: Configuration) -> Dict[str, str]:
""" Returns a dict with tablespace expressions for the different tablespace
kinds depending on whether a tablespace is configured or not.
"""
out = {}
for subset in ('ADDRESS', 'SEARCH', 'AUX'):
for kind in ('DATA', 'INDEX'):
tspace = getattr(config, f'TABLESPACE_{subset}_{kind}')
if tspace:
tspace = f'TABLESPACE "{tspace}"'
out[f'{subset.lower()}_{kind.lower()}'] = tspace
return out
def _setup_postgresql_features(conn: Connection) -> Dict[str, Any]:
""" Set up a dictionary with various optional Postgresql/Postgis features that
depend on the database version.
"""
pg_version = conn.server_version_tuple()
postgis_version = conn.postgis_version_tuple()
pg11plus = pg_version >= (11, 0)
ps3 = postgis_version >= (3, 0)
return {
'has_index_non_key_column': pg11plus,
'spgist_geom' : 'SPGIST' if pg11plus and ps3 else 'GIST'
}
class SQLPreprocessor:
""" A environment for preprocessing SQL files from the
lib-sql directory.
The preprocessor provides a number of default filters and variables.
The variables may be overwritten when rendering an SQL file.
The preprocessing is currently based on the jinja2 templating library
and follows its syntax.
"""
def __init__(self, conn: Connection, config: Configuration) -> None:
self.env = jinja2.Environment(autoescape=False,
loader=jinja2.FileSystemLoader(str(config.lib_dir.sql)))
db_info: Dict[str, Any] = {}
db_info['partitions'] = _get_partitions(conn)
db_info['tables'] = _get_tables(conn)
db_info['reverse_only'] = 'search_name' not in db_info['tables']
db_info['tablespace'] = _setup_tablespace_sql(config)
db_info['middle_db_format'] = _get_middle_db_format(conn, db_info['tables'])
self.env.globals['config'] = config
self.env.globals['db'] = db_info
self.env.globals['postgres'] = _setup_postgresql_features(conn)
def run_string(self, conn: Connection, template: str, **kwargs: Any) -> None:
""" Execute the given SQL template string on the connection.
The keyword arguments may supply additional parameters
for preprocessing.
"""
sql = self.env.from_string(template).render(**kwargs)
with conn.cursor() as cur:
cur.execute(sql)
conn.commit()
def run_sql_file(self, conn: Connection, name: str, **kwargs: Any) -> None:
""" Execute the given SQL file on the connection. The keyword arguments
may supply additional parameters for preprocessing.
"""
sql = self.env.get_template(name).render(**kwargs)
with conn.cursor() as cur:
cur.execute(sql)
conn.commit()
def run_parallel_sql_file(self, dsn: str, name: str, num_threads: int = 1,
**kwargs: Any) -> None:
""" Execute the given SQL files using parallel asynchronous connections.
The keyword arguments may supply additional parameters for
preprocessing.
After preprocessing the SQL code is cut at lines containing only
'---'. Each chunk is sent to one of the `num_threads` workers.
"""
sql = self.env.get_template(name).render(**kwargs)
parts = sql.split('\n---\n')
with WorkerPool(dsn, num_threads) as pool:
for part in parts:
pool.next_free_worker().perform(part)
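A usage sketch, assuming the connection and configuration objects are set up elsewhere and the module path follows the new layout:

from nominatim_core.db.sql_preprocessor import SQLPreprocessor   # path assumed

proc = SQLPreprocessor(conn, config)

# Templates have access to the 'config', 'db' and 'postgres' globals, so
# SQL can be rendered conditionally on the state of the database:
proc.run_string(conn, """
    {% if 'search_name' in db.tables %}
    ANALYZE search_name;
    {% endif %}
    """)

# Files from lib-sql may contain lines consisting only of '---'; each chunk
# between them is handed to one of the asynchronous worker connections.
# Extra keyword arguments become template variables:
proc.run_parallel_sql_file(config.get_libpq_dsn(), 'indices.sql',
                           num_threads=4, drop=False)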

View File

@@ -0,0 +1,119 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
SQLAlchemy definitions for all tables used by the frontend.
"""
import sqlalchemy as sa
from .sqlalchemy_types import Geometry, KeyValueStore, IntArray
#pylint: disable=too-many-instance-attributes
class SearchTables:
""" Data class that holds the tables of the Nominatim database.
This schema strictly reflects the read-access view of the database.
Any data used for updates only will not be visible.
"""
def __init__(self, meta: sa.MetaData) -> None:
self.meta = meta
self.import_status = sa.Table('import_status', meta,
sa.Column('lastimportdate', sa.DateTime(True), nullable=False),
sa.Column('sequence_id', sa.Integer),
sa.Column('indexed', sa.Boolean))
self.properties = sa.Table('nominatim_properties', meta,
sa.Column('property', sa.Text, nullable=False),
sa.Column('value', sa.Text))
self.placex = sa.Table('placex', meta,
sa.Column('place_id', sa.BigInteger, nullable=False),
sa.Column('parent_place_id', sa.BigInteger),
sa.Column('linked_place_id', sa.BigInteger),
sa.Column('importance', sa.Float),
sa.Column('indexed_date', sa.DateTime),
sa.Column('rank_address', sa.SmallInteger),
sa.Column('rank_search', sa.SmallInteger),
sa.Column('indexed_status', sa.SmallInteger),
sa.Column('osm_type', sa.String(1), nullable=False),
sa.Column('osm_id', sa.BigInteger, nullable=False),
sa.Column('class', sa.Text, nullable=False, key='class_'),
sa.Column('type', sa.Text, nullable=False),
sa.Column('admin_level', sa.SmallInteger),
sa.Column('name', KeyValueStore),
sa.Column('address', KeyValueStore),
sa.Column('extratags', KeyValueStore),
sa.Column('geometry', Geometry, nullable=False),
sa.Column('wikipedia', sa.Text),
sa.Column('country_code', sa.String(2)),
sa.Column('housenumber', sa.Text),
sa.Column('postcode', sa.Text),
sa.Column('centroid', Geometry))
self.addressline = sa.Table('place_addressline', meta,
sa.Column('place_id', sa.BigInteger),
sa.Column('address_place_id', sa.BigInteger),
sa.Column('distance', sa.Float),
sa.Column('fromarea', sa.Boolean),
sa.Column('isaddress', sa.Boolean))
self.postcode = sa.Table('location_postcode', meta,
sa.Column('place_id', sa.BigInteger),
sa.Column('parent_place_id', sa.BigInteger),
sa.Column('rank_search', sa.SmallInteger),
sa.Column('rank_address', sa.SmallInteger),
sa.Column('indexed_status', sa.SmallInteger),
sa.Column('indexed_date', sa.DateTime),
sa.Column('country_code', sa.String(2)),
sa.Column('postcode', sa.Text),
sa.Column('geometry', Geometry))
self.osmline = sa.Table('location_property_osmline', meta,
sa.Column('place_id', sa.BigInteger, nullable=False),
sa.Column('osm_id', sa.BigInteger),
sa.Column('parent_place_id', sa.BigInteger),
sa.Column('indexed_date', sa.DateTime),
sa.Column('startnumber', sa.Integer),
sa.Column('endnumber', sa.Integer),
sa.Column('step', sa.SmallInteger),
sa.Column('indexed_status', sa.SmallInteger),
sa.Column('linegeo', Geometry),
sa.Column('address', KeyValueStore),
sa.Column('postcode', sa.Text),
sa.Column('country_code', sa.String(2)))
self.country_name = sa.Table('country_name', meta,
sa.Column('country_code', sa.String(2)),
sa.Column('name', KeyValueStore),
sa.Column('derived_name', KeyValueStore),
sa.Column('partition', sa.Integer))
self.country_grid = sa.Table('country_osm_grid', meta,
sa.Column('country_code', sa.String(2)),
sa.Column('area', sa.Float),
sa.Column('geometry', Geometry))
# The following tables are not necessarily present.
self.search_name = sa.Table('search_name', meta,
sa.Column('place_id', sa.BigInteger),
sa.Column('importance', sa.Float),
sa.Column('search_rank', sa.SmallInteger),
sa.Column('address_rank', sa.SmallInteger),
sa.Column('name_vector', IntArray),
sa.Column('nameaddress_vector', IntArray),
sa.Column('country_code', sa.String(2)),
sa.Column('centroid', Geometry))
self.tiger = sa.Table('location_property_tiger', meta,
sa.Column('place_id', sa.BigInteger),
sa.Column('parent_place_id', sa.BigInteger),
sa.Column('startnumber', sa.Integer),
sa.Column('endnumber', sa.Integer),
sa.Column('step', sa.SmallInteger),
sa.Column('linegeo', Geometry),
sa.Column('postcode', sa.Text))
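For illustration (a sketch; the import path is assumed), the schema is bound to a MetaData object and then used to build read-only queries:

import sqlalchemy as sa
from nominatim_core.db.sqlalchemy_schema import SearchTables   # path assumed

t = SearchTables(sa.MetaData())

# The SQL column 'class' is reached through the Python attribute 'class_'
# because of the key= parameter above.
stmt = sa.select(t.placex.c.place_id, t.placex.c.name)\
         .where(t.placex.c.osm_type == 'N')\
         .where(t.placex.c.class_ == 'place')\
         .limit(10)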

View File

@@ -0,0 +1,17 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Module with custom types for SQLAlchemy
"""
# See also https://github.com/PyCQA/pylint/issues/6006
# pylint: disable=useless-import-alias
from .geometry import (Geometry as Geometry)
from .int_array import (IntArray as IntArray)
from .key_value import (KeyValueStore as KeyValueStore)
from .json import (Json as Json)

View File

@@ -0,0 +1,308 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Custom types for SQLAlchemy.
"""
from __future__ import annotations
from typing import Callable, Any, cast
import sys
import sqlalchemy as sa
from sqlalchemy.ext.compiler import compiles
from sqlalchemy import types
from ...typing import SaColumn, SaBind
#pylint: disable=all
class Geometry_DistanceSpheroid(sa.sql.expression.FunctionElement[float]):
""" Function to compute the spherical distance in meters.
"""
type = sa.Float()
name = 'Geometry_DistanceSpheroid'
inherit_cache = True
@compiles(Geometry_DistanceSpheroid) # type: ignore[no-untyped-call, misc]
def _default_distance_spheroid(element: Geometry_DistanceSpheroid,
compiler: 'sa.Compiled', **kw: Any) -> str:
return "ST_DistanceSpheroid(%s,"\
" 'SPHEROID[\"WGS 84\",6378137,298.257223563, AUTHORITY[\"EPSG\",\"7030\"]]')"\
% compiler.process(element.clauses, **kw)
@compiles(Geometry_DistanceSpheroid, 'sqlite') # type: ignore[no-untyped-call, misc]
def _spatialite_distance_spheroid(element: Geometry_DistanceSpheroid,
compiler: 'sa.Compiled', **kw: Any) -> str:
return "COALESCE(Distance(%s, true), 0.0)" % compiler.process(element.clauses, **kw)
class Geometry_IsLineLike(sa.sql.expression.FunctionElement[Any]):
""" Check if the geometry is a line or multiline.
"""
name = 'Geometry_IsLineLike'
inherit_cache = True
@compiles(Geometry_IsLineLike) # type: ignore[no-untyped-call, misc]
def _default_is_line_like(element: Geometry_IsLineLike,
compiler: 'sa.Compiled', **kw: Any) -> str:
return "ST_GeometryType(%s) IN ('ST_LineString', 'ST_MultiLineString')" % \
compiler.process(element.clauses, **kw)
@compiles(Geometry_IsLineLike, 'sqlite') # type: ignore[no-untyped-call, misc]
def _sqlite_is_line_like(element: Geometry_IsLineLike,
compiler: 'sa.Compiled', **kw: Any) -> str:
return "ST_GeometryType(%s) IN ('LINESTRING', 'MULTILINESTRING')" % \
compiler.process(element.clauses, **kw)
class Geometry_IsAreaLike(sa.sql.expression.FunctionElement[Any]):
""" Check if the geometry is a polygon or multipolygon.
"""
name = 'Geometry_IsAreaLike'
inherit_cache = True
@compiles(Geometry_IsAreaLike) # type: ignore[no-untyped-call, misc]
def _default_is_area_like(element: Geometry_IsAreaLike,
compiler: 'sa.Compiled', **kw: Any) -> str:
return "ST_GeometryType(%s) IN ('ST_Polygon', 'ST_MultiPolygon')" % \
compiler.process(element.clauses, **kw)
@compiles(Geometry_IsAreaLike, 'sqlite') # type: ignore[no-untyped-call, misc]
def _sqlite_is_area_like(element: Geometry_IsAreaLike,
compiler: 'sa.Compiled', **kw: Any) -> str:
return "ST_GeometryType(%s) IN ('POLYGON', 'MULTIPOLYGON')" % \
compiler.process(element.clauses, **kw)
class Geometry_IntersectsBbox(sa.sql.expression.FunctionElement[Any]):
""" Check if the bounding boxes of the given geometries intersect.
"""
name = 'Geometry_IntersectsBbox'
inherit_cache = True
@compiles(Geometry_IntersectsBbox) # type: ignore[no-untyped-call, misc]
def _default_intersects(element: Geometry_IntersectsBbox,
compiler: 'sa.Compiled', **kw: Any) -> str:
arg1, arg2 = list(element.clauses)
return "%s && %s" % (compiler.process(arg1, **kw), compiler.process(arg2, **kw))
@compiles(Geometry_IntersectsBbox, 'sqlite') # type: ignore[no-untyped-call, misc]
def _sqlite_intersects(element: Geometry_IntersectsBbox,
compiler: 'sa.Compiled', **kw: Any) -> str:
return "MbrIntersects(%s) = 1" % compiler.process(element.clauses, **kw)
class Geometry_ColumnIntersectsBbox(sa.sql.expression.FunctionElement[Any]):
""" Check if the bounding box of the geometry intersects with the
given table column, using the spatial index for the column.
The index must exist or the query may return nothing.
"""
name = 'Geometry_ColumnIntersectsBbox'
inherit_cache = True
@compiles(Geometry_ColumnIntersectsBbox) # type: ignore[no-untyped-call, misc]
def default_intersects_column(element: Geometry_ColumnIntersectsBbox,
compiler: 'sa.Compiled', **kw: Any) -> str:
arg1, arg2 = list(element.clauses)
return "%s && %s" % (compiler.process(arg1, **kw), compiler.process(arg2, **kw))
@compiles(Geometry_ColumnIntersectsBbox, 'sqlite') # type: ignore[no-untyped-call, misc]
def spatialite_intersects_column(element: Geometry_ColumnIntersectsBbox,
compiler: 'sa.Compiled', **kw: Any) -> str:
arg1, arg2 = list(element.clauses)
return "MbrIntersects(%s, %s) = 1 and "\
"%s.ROWID IN (SELECT ROWID FROM SpatialIndex "\
"WHERE f_table_name = '%s' AND f_geometry_column = '%s' "\
"AND search_frame = %s)" %(
compiler.process(arg1, **kw),
compiler.process(arg2, **kw),
arg1.table.name, arg1.table.name, arg1.name,
compiler.process(arg2, **kw))
class Geometry_ColumnDWithin(sa.sql.expression.FunctionElement[Any]):
""" Check if the geometry is within the distance of the
given table column, using the spatial index for the column.
The index must exist or the query may return nothing.
"""
name = 'Geometry_ColumnDWithin'
inherit_cache = True
@compiles(Geometry_ColumnDWithin) # type: ignore[no-untyped-call, misc]
def default_dwithin_column(element: Geometry_ColumnDWithin,
compiler: 'sa.Compiled', **kw: Any) -> str:
return "ST_DWithin(%s)" % compiler.process(element.clauses, **kw)
@compiles(Geometry_ColumnDWithin, 'sqlite') # type: ignore[no-untyped-call, misc]
def spatialite_dwithin_column(element: Geometry_ColumnDWithin,
compiler: 'sa.Compiled', **kw: Any) -> str:
geom1, geom2, dist = list(element.clauses)
return "ST_Distance(%s, %s) < %s and "\
"%s.ROWID IN (SELECT ROWID FROM SpatialIndex "\
"WHERE f_table_name = '%s' AND f_geometry_column = '%s' "\
"AND search_frame = ST_Expand(%s, %s))" %(
compiler.process(geom1, **kw),
compiler.process(geom2, **kw),
compiler.process(dist, **kw),
geom1.table.name, geom1.table.name, geom1.name,
compiler.process(geom2, **kw),
compiler.process(dist, **kw))
class Geometry(types.UserDefinedType): # type: ignore[type-arg]
""" Simplified type decorator for PostGIS geometry. This type
only supports geometries in 4326 projection.
"""
cache_ok = True
def __init__(self, subtype: str = 'Geometry'):
self.subtype = subtype
def get_col_spec(self) -> str:
return f'GEOMETRY({self.subtype}, 4326)'
def bind_processor(self, dialect: 'sa.Dialect') -> Callable[[Any], str]:
def process(value: Any) -> str:
if isinstance(value, str):
return value
return cast(str, value.to_wkt())
return process
def result_processor(self, dialect: 'sa.Dialect', coltype: object) -> Callable[[Any], str]:
def process(value: Any) -> str:
assert isinstance(value, str)
return value
return process
def column_expression(self, col: SaColumn) -> SaColumn:
return sa.func.ST_AsEWKB(col)
def bind_expression(self, bindvalue: SaBind) -> SaColumn:
return sa.func.ST_GeomFromText(bindvalue, sa.text('4326'), type_=self)
class comparator_factory(types.UserDefinedType.Comparator): # type: ignore[type-arg]
def intersects(self, other: SaColumn, use_index: bool = True) -> 'sa.Operators':
if not use_index:
return Geometry_IntersectsBbox(sa.func.coalesce(sa.null(), self.expr), other)
if isinstance(self.expr, sa.Column):
return Geometry_ColumnIntersectsBbox(self.expr, other)
return Geometry_IntersectsBbox(self.expr, other)
def is_line_like(self) -> SaColumn:
return Geometry_IsLineLike(self)
def is_area(self) -> SaColumn:
return Geometry_IsAreaLike(self)
def within_distance(self, other: SaColumn, distance: SaColumn) -> SaColumn:
if isinstance(self.expr, sa.Column):
return Geometry_ColumnDWithin(self.expr, other, distance)
return self.ST_Distance(other) < distance
def ST_Distance(self, other: SaColumn) -> SaColumn:
return sa.func.ST_Distance(self, other, type_=sa.Float)
def ST_Contains(self, other: SaColumn) -> SaColumn:
return sa.func.ST_Contains(self, other, type_=sa.Boolean)
def ST_CoveredBy(self, other: SaColumn) -> SaColumn:
return sa.func.ST_CoveredBy(self, other, type_=sa.Boolean)
def ST_ClosestPoint(self, other: SaColumn) -> SaColumn:
return sa.func.coalesce(sa.func.ST_ClosestPoint(self, other, type_=Geometry),
other)
def ST_Buffer(self, other: SaColumn) -> SaColumn:
return sa.func.ST_Buffer(self, other, type_=Geometry)
def ST_Expand(self, other: SaColumn) -> SaColumn:
return sa.func.ST_Expand(self, other, type_=Geometry)
def ST_Collect(self) -> SaColumn:
return sa.func.ST_Collect(self, type_=Geometry)
def ST_Centroid(self) -> SaColumn:
return sa.func.ST_Centroid(self, type_=Geometry)
def ST_LineInterpolatePoint(self, other: SaColumn) -> SaColumn:
return sa.func.ST_LineInterpolatePoint(self, other, type_=Geometry)
def ST_LineLocatePoint(self, other: SaColumn) -> SaColumn:
return sa.func.ST_LineLocatePoint(self, other, type_=sa.Float)
def distance_spheroid(self, other: SaColumn) -> SaColumn:
return Geometry_DistanceSpheroid(self, other)
@compiles(Geometry, 'sqlite') # type: ignore[no-untyped-call]
def get_col_spec(self, *args, **kwargs): # type: ignore[no-untyped-def]
return 'GEOMETRY'
SQLITE_FUNCTION_ALIAS = (
('ST_AsEWKB', sa.Text, 'AsEWKB'),
('ST_GeomFromEWKT', Geometry, 'GeomFromEWKT'),
('ST_AsGeoJSON', sa.Text, 'AsGeoJSON'),
('ST_AsKML', sa.Text, 'AsKML'),
('ST_AsSVG', sa.Text, 'AsSVG'),
('ST_LineLocatePoint', sa.Float, 'ST_Line_Locate_Point'),
('ST_LineInterpolatePoint', sa.Float, 'ST_Line_Interpolate_Point'),
)
def _add_function_alias(func: str, ftype: type, alias: str) -> None:
_FuncDef = type(func, (sa.sql.functions.GenericFunction, ), {
"type": ftype(),
"name": func,
"identifier": func,
"inherit_cache": True})
func_templ = f"{alias}(%s)"
def _sqlite_impl(element: Any, compiler: Any, **kw: Any) -> Any:
return func_templ % compiler.process(element.clauses, **kw)
compiles(_FuncDef, 'sqlite')(_sqlite_impl) # type: ignore[no-untyped-call]
for alias in SQLITE_FUNCTION_ALIAS:
_add_function_alias(*alias)
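A sketch of how the comparator functions translate into queries (schema import path assumed; the WKT point is just an example value):

import sqlalchemy as sa
from nominatim_core.db.sqlalchemy_schema import SearchTables   # path assumed

t = SearchTables(sa.MetaData())
pt = sa.bindparam('pt', 'POINT(13.4 52.5)', type_=Geometry())

# intersects() picks the index-aware '&&' form when the left side is a
# plain table column:
near = sa.select(t.placex.c.place_id)\
         .where(t.placex.c.geometry.intersects(pt))

# within_distance() becomes ST_DWithin() on PostgreSQL and an explicit
# SpatialIndex lookup on SQLite:
close_by = sa.select(t.placex.c.place_id)\
             .where(t.placex.c.centroid.within_distance(pt, 0.001))\
             .order_by(t.placex.c.centroid.ST_Distance(pt))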

View File

@@ -0,0 +1,123 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Custom type for an array of integers.
"""
from typing import Any, List, cast, Optional
import sqlalchemy as sa
from sqlalchemy.ext.compiler import compiles
from sqlalchemy.dialects.postgresql import ARRAY
from ...typing import SaDialect, SaColumn
# pylint: disable=all
class IntList(sa.types.TypeDecorator[Any]):
""" A list of integers saved as a text of comma-separated numbers.
"""
impl = sa.types.Unicode
cache_ok = True
def process_bind_param(self, value: Optional[Any], dialect: 'sa.Dialect') -> Optional[str]:
if value is None:
return None
assert isinstance(value, list)
return ','.join(map(str, value))
def process_result_value(self, value: Optional[Any],
dialect: SaDialect) -> Optional[List[int]]:
return [int(v) for v in value.split(',')] if value is not None else None
def copy(self, **kw: Any) -> 'IntList':
return IntList(self.impl.length)
class IntArray(sa.types.TypeDecorator[Any]):
""" Dialect-independent list of integers.
"""
impl = IntList
cache_ok = True
def load_dialect_impl(self, dialect: SaDialect) -> sa.types.TypeEngine[Any]:
if dialect.name == 'postgresql':
return ARRAY(sa.Integer()) #pylint: disable=invalid-name
return IntList()
class comparator_factory(sa.types.UserDefinedType.Comparator): # type: ignore[type-arg]
def __add__(self, other: SaColumn) -> 'sa.ColumnOperators':
""" Concate the array with the given array. If one of the
operants is null, the value of the other will be returned.
"""
return ArrayCat(self.expr, other)
def contains(self, other: SaColumn, **kwargs: Any) -> 'sa.ColumnOperators':
""" Return true if the array contains all the value of the argument
array.
"""
return ArrayContains(self.expr, other)
class ArrayAgg(sa.sql.functions.GenericFunction[Any]):
""" Aggregate function to collect elements in an array.
"""
type = IntArray()
identifier = 'ArrayAgg'
name = 'array_agg'
inherit_cache = True
@compiles(ArrayAgg, 'sqlite') # type: ignore[no-untyped-call, misc]
def sqlite_array_agg(element: ArrayAgg, compiler: 'sa.Compiled', **kw: Any) -> str:
return "group_concat(%s, ',')" % compiler.process(element.clauses, **kw)
class ArrayContains(sa.sql.expression.FunctionElement[Any]):
""" Function to check if an array is fully contained in another.
"""
name = 'ArrayContains'
inherit_cache = True
@compiles(ArrayContains) # type: ignore[no-untyped-call, misc]
def generic_array_contains(element: ArrayContains, compiler: 'sa.Compiled', **kw: Any) -> str:
arg1, arg2 = list(element.clauses)
return "(%s @> %s)" % (compiler.process(arg1, **kw),
compiler.process(arg2, **kw))
@compiles(ArrayContains, 'sqlite') # type: ignore[no-untyped-call, misc]
def sqlite_array_contains(element: ArrayContains, compiler: 'sa.Compiled', **kw: Any) -> str:
return "array_contains(%s)" % compiler.process(element.clauses, **kw)
class ArrayCat(sa.sql.expression.FunctionElement[Any]):
""" Function to check if an array is fully contained in another.
"""
type = IntArray()
identifier = 'ArrayCat'
inherit_cache = True
@compiles(ArrayCat) # type: ignore[no-untyped-call, misc]
def generic_array_cat(element: ArrayCat, compiler: 'sa.Compiled', **kw: Any) -> str:
return "array_cat(%s)" % compiler.process(element.clauses, **kw)
@compiles(ArrayCat, 'sqlite') # type: ignore[no-untyped-call, misc]
def sqlite_array_cat(element: ArrayCat, compiler: 'sa.Compiled', **kw: Any) -> str:
arg1, arg2 = list(element.clauses)
return "(%s || ',' || %s)" % (compiler.process(arg1, **kw), compiler.process(arg2, **kw))

View File

@@ -0,0 +1,30 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Common json type for different dialects.
"""
from typing import Any
import sqlalchemy as sa
from sqlalchemy.dialects.postgresql import JSONB
from sqlalchemy.dialects.sqlite import JSON as sqlite_json
from ...typing import SaDialect
# pylint: disable=all
class Json(sa.types.TypeDecorator[Any]):
""" Dialect-independent type for JSON.
"""
impl = sa.types.JSON
cache_ok = True
def load_dialect_impl(self, dialect: SaDialect) -> sa.types.TypeEngine[Any]:
if dialect.name == 'postgresql':
return JSONB(none_as_null=True) # type: ignore[no-untyped-call]
return sqlite_json(none_as_null=True)

View File

@@ -0,0 +1,62 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
A custom type that implements a simple key-value store of strings.
"""
from typing import Any
import sqlalchemy as sa
from sqlalchemy.ext.compiler import compiles
from sqlalchemy.dialects.postgresql import HSTORE
from sqlalchemy.dialects.sqlite import JSON as sqlite_json
from ...typing import SaDialect, SaColumn
# pylint: disable=all
class KeyValueStore(sa.types.TypeDecorator[Any]):
""" Dialect-independent type of a simple key-value store of strings.
"""
impl = HSTORE
cache_ok = True
def load_dialect_impl(self, dialect: SaDialect) -> sa.types.TypeEngine[Any]:
if dialect.name == 'postgresql':
return HSTORE() # type: ignore[no-untyped-call]
return sqlite_json(none_as_null=True)
class comparator_factory(sa.types.UserDefinedType.Comparator): # type: ignore[type-arg]
def merge(self, other: SaColumn) -> 'sa.Operators':
""" Merge the values from the given KeyValueStore into this
one, overwriting values where necessary. When the argument
is null, nothing happens.
"""
return KeyValueConcat(self.expr, other)
class KeyValueConcat(sa.sql.expression.FunctionElement[Any]):
""" Return the merged key-value store from the input parameters.
"""
type = KeyValueStore()
name = 'JsonConcat'
inherit_cache = True
@compiles(KeyValueConcat) # type: ignore[no-untyped-call, misc]
def default_json_concat(element: KeyValueConcat, compiler: 'sa.Compiled', **kw: Any) -> str:
arg1, arg2 = list(element.clauses)
return "(%s || coalesce(%s, ''::hstore))" % (compiler.process(arg1, **kw), compiler.process(arg2, **kw))
@compiles(KeyValueConcat, 'sqlite') # type: ignore[no-untyped-call, misc]
def sqlite_json_concat(element: KeyValueConcat, compiler: 'sa.Compiled', **kw: Any) -> str:
arg1, arg2 = list(element.clauses)
return "json_patch(%s, coalesce(%s, '{}'))" % (compiler.process(arg1, **kw), compiler.process(arg2, **kw))

View File

@@ -0,0 +1,127 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Access and helper functions for the status and status log table.
"""
from typing import Optional, Tuple, cast
import datetime as dt
import logging
import re
from .connection import Connection
from ..utils.url_utils import get_url
from ..errors import UsageError
from ..typing import TypedDict
LOG = logging.getLogger()
ISODATE_FORMAT = '%Y-%m-%dT%H:%M:%S'
class StatusRow(TypedDict):
""" Dictionary of columns of the import_status table.
"""
lastimportdate: dt.datetime
sequence_id: Optional[int]
indexed: Optional[bool]
def compute_database_date(conn: Connection, offline: bool = False) -> dt.datetime:
""" Determine the date of the database from the newest object in the
database.
"""
# If there is a date from osm2pgsql available, use that.
if conn.table_exists('osm2pgsql_properties'):
with conn.cursor() as cur:
cur.execute(""" SELECT value FROM osm2pgsql_properties
WHERE property = 'current_timestamp' """)
row = cur.fetchone()
if row is not None:
return dt.datetime.strptime(row[0], "%Y-%m-%dT%H:%M:%SZ")\
.replace(tzinfo=dt.timezone.utc)
if offline:
raise UsageError("Cannot determine database date from data in offline mode.")
# Else, find the node with the highest ID in the database
with conn.cursor() as cur:
if conn.table_exists('place'):
osmid = cur.scalar("SELECT max(osm_id) FROM place WHERE osm_type='N'")
else:
osmid = cur.scalar("SELECT max(osm_id) FROM placex WHERE osm_type='N'")
if osmid is None:
LOG.fatal("No data found in the database.")
raise UsageError("No data found in the database.")
LOG.info("Using node id %d for timestamp lookup", osmid)
# Get the node from the API to find the timestamp when it was created.
node_url = f'https://www.openstreetmap.org/api/0.6/node/{osmid}/1'
data = get_url(node_url)
match = re.search(r'timestamp="((\d{4})-(\d{2})-(\d{2})T(\d{2}):(\d{2}):(\d{2}))Z"', data)
if match is None:
LOG.fatal("The node data downloaded from the API does not contain valid data.\n"
"URL used: %s", node_url)
raise UsageError("Bad API data.")
LOG.debug("Found timestamp %s", match.group(1))
return dt.datetime.strptime(match.group(1), ISODATE_FORMAT).replace(tzinfo=dt.timezone.utc)
def set_status(conn: Connection, date: Optional[dt.datetime],
seq: Optional[int] = None, indexed: bool = True) -> None:
""" Replace the current status with the given status. If date is `None`
then only sequence and indexed will be updated as given. Otherwise
the whole status is replaced.
The change will be committed to the database.
"""
assert date is None or date.tzinfo == dt.timezone.utc
with conn.cursor() as cur:
if date is None:
cur.execute("UPDATE import_status set sequence_id = %s, indexed = %s",
(seq, indexed))
else:
cur.execute("TRUNCATE TABLE import_status")
cur.execute("""INSERT INTO import_status (lastimportdate, sequence_id, indexed)
VALUES (%s, %s, %s)""", (date, seq, indexed))
conn.commit()
def get_status(conn: Connection) -> Tuple[Optional[dt.datetime], Optional[int], Optional[bool]]:
""" Return the current status as a triple of (date, sequence, indexed).
If status has not been set up yet, a triple of None is returned.
"""
with conn.cursor() as cur:
cur.execute("SELECT * FROM import_status LIMIT 1")
if cur.rowcount < 1:
return None, None, None
row = cast(StatusRow, cur.fetchone())
return row['lastimportdate'], row['sequence_id'], row['indexed']
def set_indexed(conn: Connection, state: bool) -> None:
""" Set the indexed flag in the status table to the given state.
"""
with conn.cursor() as cur:
cur.execute("UPDATE import_status SET indexed = %s", (state, ))
conn.commit()
def log_status(conn: Connection, start: dt.datetime,
event: str, batchsize: Optional[int] = None) -> None:
""" Write a new status line to the `import_osmosis_log` table.
"""
with conn.cursor() as cur:
cur.execute("""INSERT INTO import_osmosis_log
(batchend, batchseq, batchsize, starttime, endtime, event)
SELECT lastimportdate, sequence_id, %s, %s, now(), %s FROM import_status""",
(batchsize, start, event))
conn.commit()
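A sketch of the typical round trip during import and replication (DSN and module paths assumed):

from nominatim_core.db.connection import connect
from nominatim_core.db import status              # module path assumed

with connect('dbname=nominatim') as conn:
    # Determine the data date from the osm2pgsql properties or, failing that,
    # from the newest node via the OSM API, then store it.
    import_date = status.compute_database_date(conn)
    status.set_status(conn, date=import_date, seq=None, indexed=True)

    date, seq, indexed = status.get_status(conn)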

View File

@@ -0,0 +1,129 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Helper functions for handling DB accesses.
"""
from typing import IO, Optional, Union, Any, Iterable
import subprocess
import logging
import gzip
import io
from pathlib import Path
from .connection import get_pg_env, Cursor
from ..errors import UsageError
LOG = logging.getLogger()
def _pipe_to_proc(proc: 'subprocess.Popen[bytes]',
fdesc: Union[IO[bytes], gzip.GzipFile]) -> int:
assert proc.stdin is not None
chunk = fdesc.read(2048)
while chunk and proc.poll() is None:
try:
proc.stdin.write(chunk)
except BrokenPipeError as exc:
raise UsageError("Failed to execute SQL file.") from exc
chunk = fdesc.read(2048)
return len(chunk)
def execute_file(dsn: str, fname: Path,
ignore_errors: bool = False,
pre_code: Optional[str] = None,
post_code: Optional[str] = None) -> None:
""" Read an SQL file and run its contents against the given database
using psql. Use `pre_code` and `post_code` to run extra commands
before or after executing the file. The commands are run within the
same session, so they may be used to wrap the file execution in a
transaction.
"""
cmd = ['psql']
if not ignore_errors:
cmd.extend(('-v', 'ON_ERROR_STOP=1'))
if not LOG.isEnabledFor(logging.INFO):
cmd.append('--quiet')
with subprocess.Popen(cmd, env=get_pg_env(dsn), stdin=subprocess.PIPE) as proc:
assert proc.stdin is not None
try:
if not LOG.isEnabledFor(logging.INFO):
proc.stdin.write('set client_min_messages to WARNING;'.encode('utf-8'))
if pre_code:
proc.stdin.write((pre_code + ';').encode('utf-8'))
if fname.suffix == '.gz':
with gzip.open(str(fname), 'rb') as fdesc:
remain = _pipe_to_proc(proc, fdesc)
else:
with fname.open('rb') as fdesc:
remain = _pipe_to_proc(proc, fdesc)
if remain == 0 and post_code:
proc.stdin.write((';' + post_code).encode('utf-8'))
finally:
proc.stdin.close()
ret = proc.wait()
if ret != 0 or remain > 0:
raise UsageError("Failed to execute SQL file.")
# List of characters that need to be quoted for the copy command.
_SQL_TRANSLATION = {ord('\\'): '\\\\',
ord('\t'): '\\t',
ord('\n'): '\\n'}
class CopyBuffer:
""" Data collector for the copy_from command.
"""
def __init__(self) -> None:
self.buffer = io.StringIO()
def __enter__(self) -> 'CopyBuffer':
return self
def size(self) -> int:
""" Return the number of bytes the buffer currently contains.
"""
return self.buffer.tell()
def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None:
if self.buffer is not None:
self.buffer.close()
def add(self, *data: Any) -> None:
""" Add another row of data to the copy buffer.
"""
first = True
for column in data:
if first:
first = False
else:
self.buffer.write('\t')
if column is None:
self.buffer.write('\\N')
else:
self.buffer.write(str(column).translate(_SQL_TRANSLATION))
self.buffer.write('\n')
def copy_out(self, cur: Cursor, table: str, columns: Optional[Iterable[str]] = None) -> None:
""" Copy all collected data into the given table.
The buffer is empty and reusable after this operation.
"""
if self.buffer.tell() > 0:
self.buffer.seek(0)
cur.copy_from(self.buffer, table, columns=columns)
self.buffer = io.StringIO()
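Usage sketch for CopyBuffer (the connection, table and column names are made up):

with conn.cursor() as cur, CopyBuffer() as buf:
    buf.add(1, 'Main Street', None)      # None is written as \N
    buf.add(2, 'Broad\tWay', 'note')     # tab, newline and backslash get escaped
    if buf.size() > 0:
        buf.copy_out(cur, 'example_table', columns=('id', 'name', 'comment'))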

View File

@@ -0,0 +1,14 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Custom exception and error classes for Nominatim.
"""
class UsageError(Exception):
""" An error raised because of bad user input. This error will usually
not cause a stack trace to be printed unless debugging is enabled.
"""

View File

@@ -0,0 +1,15 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Path settings for extra data used by Nominatim.
"""
from pathlib import Path
PHPLIB_DIR = (Path(__file__) / '..' / '..' / '..' / 'lib-php').resolve()
SQLLIB_DIR = (Path(__file__) / '..' / '..' / '..' / 'lib-sql').resolve()
DATA_DIR = (Path(__file__) / '..' / '..' / '..' / 'data').resolve()
CONFIG_DIR = (Path(__file__) / '..' / '..' / '..' / 'settings').resolve()

View File

View File

@@ -0,0 +1,75 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Type definitions for typing annotations.
Complex type definitions are moved here, to keep the source files readable.
"""
from typing import Any, Union, Mapping, TypeVar, Sequence, TYPE_CHECKING
# Generic variable names do not conform to naming styles, ignore globally here.
# pylint: disable=invalid-name,abstract-method,multiple-statements
# pylint: disable=missing-class-docstring,useless-import-alias
if TYPE_CHECKING:
import psycopg2.sql
import psycopg2.extensions
import psycopg2.extras
import os
StrPath = Union[str, 'os.PathLike[str]']
SysEnv = Mapping[str, str]
# psycopg2-related types
Query = Union[str, bytes, 'psycopg2.sql.Composable']
T_ResultKey = TypeVar('T_ResultKey', int, str)
class DictCursorResult(Mapping[str, Any]):
def __getitem__(self, x: Union[int, str]) -> Any: ...
DictCursorResults = Sequence[DictCursorResult]
T_cursor = TypeVar('T_cursor', bound='psycopg2.extensions.cursor')
# The following typing features require typing_extensions to work
# on all supported Python versions.
# Only require this for type checking but not for normal operations.
if TYPE_CHECKING:
from typing_extensions import (Protocol as Protocol,
Final as Final,
TypedDict as TypedDict)
else:
Protocol = object
Final = 'Final'
TypedDict = dict
# SQLAlchemy introduced generic types in version 2.0 making typing
# incompatible with older versions. Add wrappers here so we don't have
# to litter the code with bare-string types.
if TYPE_CHECKING:
import sqlalchemy as sa
from typing_extensions import (TypeAlias as TypeAlias)
else:
TypeAlias = str
SaLambdaSelect: TypeAlias = 'Union[sa.Select[Any], sa.StatementLambdaElement]'
SaSelect: TypeAlias = 'sa.Select[Any]'
SaScalarSelect: TypeAlias = 'sa.ScalarSelect[Any]'
SaRow: TypeAlias = 'sa.Row[Any]'
SaColumn: TypeAlias = 'sa.ColumnElement[Any]'
SaExpression: TypeAlias = 'sa.ColumnElement[bool]'
SaLabel: TypeAlias = 'sa.Label[Any]'
SaFromClause: TypeAlias = 'sa.FromClause'
SaSelectable: TypeAlias = 'sa.Selectable'
SaBind: TypeAlias = 'sa.BindParameter[Any]'
SaDialect: TypeAlias = 'sa.Dialect'

View File

View File

@@ -0,0 +1,49 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Functions for computation of centroids.
"""
from typing import Tuple, Any
from collections.abc import Collection
class PointsCentroid:
""" Centroid computation from single points using an online algorithm.
More points may be added at any time.
Coordinates are internally stored as fixed-point numbers with seven
decimal digits (i.e. in OSM style).
"""
def __init__(self) -> None:
self.sum_x = 0
self.sum_y = 0
self.count = 0
def centroid(self) -> Tuple[float, float]:
""" Return the centroid of all points collected so far.
"""
if self.count == 0:
raise ValueError("No points available for centroid.")
return (float(self.sum_x/self.count)/10000000,
float(self.sum_y/self.count)/10000000)
def __len__(self) -> int:
return self.count
def __iadd__(self, other: Any) -> 'PointsCentroid':
if isinstance(other, Collection) and len(other) == 2:
if all(isinstance(p, (float, int)) for p in other):
x, y = other
self.sum_x += int(x * 10000000)
self.sum_y += int(y * 10000000)
self.count += 1
return self
raise ValueError("Can only add 2-element tuples to centroid.")
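Usage sketch:

centroid = PointsCentroid()
centroid += (13.5, 52.5)        # (x, y) pairs, here lon/lat in WGS84
centroid += (13.25, 52.25)
assert len(centroid) == 2
print(centroid.centroid())      # -> (13.375, 52.375)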

View File

@@ -0,0 +1,149 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Streaming JSON encoder.
"""
from typing import Any, TypeVar, Optional, Callable
import io
try:
import ujson as json
except ModuleNotFoundError:
import json # type: ignore[no-redef]
T = TypeVar('T') # pylint: disable=invalid-name
class JsonWriter:
""" JSON encoder that renders the output directly into an output
stream. This is a very simple writer which produces JSON in as
compact a form as possible.
The writer does not check for syntactic correctness. It is the
responsibility of the caller to call the write functions in an
order that produces correct JSON.
All functions return the writer object itself so that function
calls can be chained.
"""
def __init__(self) -> None:
self.data = io.StringIO()
self.pending = ''
def __call__(self) -> str:
""" Return the rendered JSON content as a string.
The writer remains usable after calling this function.
"""
if self.pending:
assert self.pending in (']', '}')
self.data.write(self.pending)
self.pending = ''
return self.data.getvalue()
def start_object(self) -> 'JsonWriter':
""" Write the open bracket of a JSON object.
"""
if self.pending:
self.data.write(self.pending)
self.pending = '{'
return self
def end_object(self) -> 'JsonWriter':
""" Write the closing bracket of a JSON object.
"""
assert self.pending in (',', '{', '')
if self.pending == '{':
self.data.write(self.pending)
self.pending = '}'
return self
def start_array(self) -> 'JsonWriter':
""" Write the opening bracket of a JSON array.
"""
if self.pending:
self.data.write(self.pending)
self.pending = '['
return self
def end_array(self) -> 'JsonWriter':
""" Write the closing bracket of a JSON array.
"""
assert self.pending in (',', '[', ']', ')', '')
if self.pending not in (',', ''):
self.data.write(self.pending)
self.pending = ']'
return self
def key(self, name: str) -> 'JsonWriter':
""" Write the key string of a JSON object.
"""
assert self.pending
self.data.write(self.pending)
self.data.write(json.dumps(name, ensure_ascii=False))
self.pending = ':'
return self
def value(self, value: Any) -> 'JsonWriter':
""" Write out a value as JSON. The function uses the json.dumps()
function for encoding the JSON. Thus any value that can be
encoded by that function is permissible here.
"""
return self.raw(json.dumps(value, ensure_ascii=False))
def float(self, value: float, precision: int) -> 'JsonWriter':
""" Write out a float value with the given precision.
"""
return self.raw(f"{value:0.{precision}f}")
def next(self) -> 'JsonWriter':
""" Write out a delimiter comma between JSON object or array elements.
"""
if self.pending:
self.data.write(self.pending)
self.pending = ','
return self
def raw(self, raw_json: str) -> 'JsonWriter':
""" Write out the given value as is. This function is useful if
a value is already available in JSON format.
"""
if self.pending:
self.data.write(self.pending)
self.pending = ''
self.data.write(raw_json)
return self
def keyval(self, key: str, value: Any) -> 'JsonWriter':
""" Write out an object element with the given key and value.
This is a shortcut for calling 'key()', 'value()' and 'next()'.
"""
self.key(key)
self.value(value)
return self.next()
def keyval_not_none(self, key: str, value: Optional[T],
transform: Optional[Callable[[T], Any]] = None) -> 'JsonWriter':
""" Write out an object element only if the value is not None.
If 'transform' is given, it must be a function that takes the
value type and returns a JSON encodable type. The transform
function will be called before the value is written out.
"""
if value is not None:
self.key(key)
self.value(transform(value) if transform else value)
self.next()
return self
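A short sketch of the chaining API:

out = JsonWriter()
out.start_object()
out.keyval('osm_type', 'N')
out.keyval_not_none('name', None)          # skipped because the value is None
out.key('centroid').start_array()
out.float(13.375, 5).next().float(52.375, 5)
# next() is also needed after a nested value before the enclosing object
# continues; the pending comma is dropped automatically before '}'.
out.end_array().next()
out.end_object()
print(out())   # {"osm_type":"N","centroid":[13.37500,52.37500]}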

View File

@@ -0,0 +1,31 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Helper functions for accessing URLs.
"""
from typing import IO
import logging
import urllib.request as urlrequest
from ..version import NOMINATIM_CORE_VERSION
LOG = logging.getLogger()
def get_url(url: str) -> str:
""" Get the contents from the given URL and return it as a UTF-8 string.
This version makes sure that an appropriate user agent is sent.
"""
headers = {"User-Agent": f"Nominatim/{NOMINATIM_CORE_VERSION!s}"}
try:
request = urlrequest.Request(url, headers=headers)
with urlrequest.urlopen(request) as response: # type: IO[bytes]
return response.read().decode('utf-8')
except Exception:
LOG.fatal('Failed to load URL: %s', url)
raise

View File

@@ -0,0 +1,11 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Version information for the Nominatim core package.
"""
NOMINATIM_CORE_VERSION = '4.4.99'

View File

src/nominatim_db/cli.py Normal file
View File

@@ -0,0 +1,228 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Command-line interface to the Nominatim functions for import, update,
database administration and querying.
"""
from typing import Optional, Any
import importlib
import logging
import os
import sys
import argparse
from pathlib import Path
from nominatim_core.config import Configuration
from nominatim_core.errors import UsageError
from .tools.exec_utils import run_php_server
from . import clicmd
from . import version
from .clicmd.args import NominatimArgs, Subcommand
LOG = logging.getLogger()
class CommandlineParser:
""" Wraps some of the common functions for parsing the command line
and setting up subcommands.
"""
def __init__(self, prog: str, description: Optional[str]):
self.parser = argparse.ArgumentParser(
prog=prog,
description=description,
formatter_class=argparse.RawDescriptionHelpFormatter)
self.subs = self.parser.add_subparsers(title='available commands',
dest='subcommand')
# Global arguments that only work if no sub-command given
self.parser.add_argument('--version', action='store_true',
help='Print Nominatim version and exit')
# Arguments added to every sub-command
self.default_args = argparse.ArgumentParser(add_help=False)
group = self.default_args.add_argument_group('Default arguments')
group.add_argument('-h', '--help', action='help',
help='Show this help message and exit')
group.add_argument('-q', '--quiet', action='store_const', const=0,
dest='verbose', default=1,
help='Print only error messages')
group.add_argument('-v', '--verbose', action='count', default=1,
help='Increase verboseness of output')
group.add_argument('--project-dir', metavar='DIR', default='.',
help='Base directory of the Nominatim installation (default:.)')
group.add_argument('-j', '--threads', metavar='NUM', type=int,
help='Number of parallel threads to use')
def nominatim_version_text(self) -> str:
""" Program name and version number as string
"""
text = f'Nominatim version {version.NOMINATIM_VERSION!s}'
if version.GIT_COMMIT_HASH is not None:
text += f' ({version.GIT_COMMIT_HASH})'
return text
def add_subcommand(self, name: str, cmd: Subcommand) -> None:
""" Add a subcommand to the parser. The subcommand must be a class
with a function add_args() that adds the parameters for the
subcommand and a run() function that executes the command.
"""
assert cmd.__doc__ is not None
parser = self.subs.add_parser(name, parents=[self.default_args],
help=cmd.__doc__.split('\n', 1)[0],
description=cmd.__doc__,
formatter_class=argparse.RawDescriptionHelpFormatter,
add_help=False)
parser.set_defaults(command=cmd)
cmd.add_args(parser)
def run(self, **kwargs: Any) -> int:
""" Parse the command line arguments of the program and execute the
appropriate subcommand.
"""
args = NominatimArgs()
try:
self.parser.parse_args(args=kwargs.get('cli_args'), namespace=args)
except SystemExit:
return 1
if args.version:
print(self.nominatim_version_text())
return 0
if args.subcommand is None:
self.parser.print_help()
return 1
args.project_dir = Path(args.project_dir).resolve()
if 'cli_args' not in kwargs:
logging.basicConfig(stream=sys.stderr,
format='%(asctime)s: %(message)s',
datefmt='%Y-%m-%d %H:%M:%S',
level=max(4 - args.verbose, 1) * 10)
args.config = Configuration(args.project_dir,
environ=kwargs.get('environ', os.environ))
args.config.set_libdirs(module=kwargs['module_dir'],
osm2pgsql=kwargs['osm2pgsql_path'])
log = logging.getLogger()
log.warning('Using project directory: %s', str(args.project_dir))
try:
return args.command.run(args)
except UsageError as exception:
if log.isEnabledFor(logging.DEBUG):
raise # use Python's exception printing
log.fatal('FATAL: %s', exception)
# If we get here, then execution has failed in some way.
return 1
# Subcommand classes
#
# Each class needs to implement two functions: add_args() adds the CLI parameters
# for the subfunction, run() executes the subcommand.
#
# The class documentation doubles as the help text for the command. The
# first line is also used in the summary when calling the program without
# a subcommand.
#
# No need to document the functions each time.
# pylint: disable=C0111
class AdminServe:
"""\
Start a simple web server for serving the API.
This command starts a built-in webserver to serve the website
from the current project directory. This webserver is only suitable
for testing and development. Do not use it in production setups!
There are different webservers available. The 'php' engine runs the
classic PHP frontend. The other engines are Python servers which run
the new Python frontend code (the default is 'falcon'). This is highly
experimental at the moment and may not include the full API.
By default, the webserver can be accessed at: http://127.0.0.1:8088
"""
def add_args(self, parser: argparse.ArgumentParser) -> None:
group = parser.add_argument_group('Server arguments')
group.add_argument('--server', default='127.0.0.1:8088',
help='The address the server will listen to.')
group.add_argument('--engine', default='falcon',
choices=('php', 'falcon', 'starlette'),
help='Webserver framework to run. (default: falcon)')
def run(self, args: NominatimArgs) -> int:
if args.engine == 'php':
if args.config.lib_dir.php is None:
raise UsageError("PHP frontend not configured.")
run_php_server(args.server, args.project_dir / 'website')
else:
import uvicorn # pylint: disable=import-outside-toplevel
server_info = args.server.split(':', 1)
host = server_info[0]
if len(server_info) > 1:
if not server_info[1].isdigit():
raise UsageError('Invalid format for --server parameter. Use <host>:<port>')
port = int(server_info[1])
else:
port = 8088
server_module = importlib.import_module(f'nominatim.server.{args.engine}.server')
app = server_module.get_application(args.project_dir)
uvicorn.run(app, host=host, port=port)
return 0
def get_set_parser() -> CommandlineParser:
"""\
Initializes the parser and adds various subcommands for
nominatim cli.
"""
parser = CommandlineParser('nominatim', nominatim.__doc__)
parser.add_subcommand('import', clicmd.SetupAll())
parser.add_subcommand('freeze', clicmd.SetupFreeze())
parser.add_subcommand('replication', clicmd.UpdateReplication())
parser.add_subcommand('special-phrases', clicmd.ImportSpecialPhrases())
parser.add_subcommand('add-data', clicmd.UpdateAddData())
parser.add_subcommand('index', clicmd.UpdateIndex())
parser.add_subcommand('refresh', clicmd.UpdateRefresh())
parser.add_subcommand('admin', clicmd.AdminFuncs())
parser.add_subcommand('export', clicmd.QueryExport())
parser.add_subcommand('convert', clicmd.ConvertDB())
parser.add_subcommand('serve', AdminServe())
parser.add_subcommand('search', clicmd.APISearch())
parser.add_subcommand('reverse', clicmd.APIReverse())
parser.add_subcommand('lookup', clicmd.APILookup())
parser.add_subcommand('details', clicmd.APIDetails())
parser.add_subcommand('status', clicmd.APIStatus())
return parser
def nominatim(**kwargs: Any) -> int:
"""\
Command-line tools for importing, updating, administrating and
querying the Nominatim database.
"""
return get_set_parser().run(**kwargs)
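To illustrate the add_args()/run() protocol described in the comments above, a minimal, purely hypothetical command could look like this (version.NOMINATIM_VERSION is already imported at the top of the module):

class PrintVersion:
    """\
    Print the Nominatim version string (illustrative example only).
    """

    def add_args(self, parser: argparse.ArgumentParser) -> None:
        pass    # no arguments beyond the defaults

    def run(self, args: NominatimArgs) -> int:
        print(version.NOMINATIM_VERSION)
        return 0

# It would be registered in get_set_parser() like the commands above:
#     parser.add_subcommand('print-version', PrintVersion())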

View File

@@ -0,0 +1,28 @@
# SPDX-License-Identifier: GPL-2.0-only
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2023 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Subcommand definitions for the command-line tool.
"""
# mypy and pylint disagree about the style of explicit exports,
# see https://github.com/PyCQA/pylint/issues/6006.
# pylint: disable=useless-import-alias
from .setup import SetupAll as SetupAll
from .replication import UpdateReplication as UpdateReplication
from .api import (APISearch as APISearch,
APIReverse as APIReverse,
APILookup as APILookup,
APIDetails as APIDetails,
APIStatus as APIStatus)
from .index import UpdateIndex as UpdateIndex
from .refresh import UpdateRefresh as UpdateRefresh
from .add_data import UpdateAddData as UpdateAddData
from .admin import AdminFuncs as AdminFuncs
from .freeze import SetupFreeze as SetupFreeze
from .special_phrases import ImportSpecialPhrases as ImportSpecialPhrases
from .export import QueryExport as QueryExport
from .convert import ConvertDB as ConvertDB

View File

@@ -0,0 +1,101 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Implementation of the 'add-data' subcommand.
"""
from typing import cast
import argparse
import logging
import psutil
from .args import NominatimArgs
# Do not repeat documentation of subcommand classes.
# pylint: disable=C0111
# Using non-top-level imports to avoid potentially unused imports.
# pylint: disable=E0012,C0415
LOG = logging.getLogger()
class UpdateAddData:
"""\
Add additional data from a file or an online source.
This command allows adding or updating the search data in the database.
The data can come either from an OSM file, or single OSM objects can
be downloaded directly from the OSM API. This function only loads the
data into the database. Afterwards it still needs to be integrated
into the search index. Use the `nominatim index` command for that.
The command can also be used to add external non-OSM data to the
database. At the moment the only supported format is TIGER housenumber
data. See the online documentation at
https://nominatim.org/release-docs/latest/admin/Import/#installing-tiger-housenumber-data-for-the-us
for more information.
"""
def add_args(self, parser: argparse.ArgumentParser) -> None:
group_name = parser.add_argument_group('Source')
group1 = group_name.add_mutually_exclusive_group(required=True)
group1.add_argument('--file', metavar='FILE',
help='Import data from an OSM file or diff file')
group1.add_argument('--diff', metavar='FILE',
help='Import data from an OSM diff file (deprecated: use --file)')
group1.add_argument('--node', metavar='ID', type=int,
help='Import a single node from the API')
group1.add_argument('--way', metavar='ID', type=int,
help='Import a single way from the API')
group1.add_argument('--relation', metavar='ID', type=int,
help='Import a single relation from the API')
group1.add_argument('--tiger-data', metavar='DIR',
help='Add housenumbers from the US TIGER census database')
group2 = parser.add_argument_group('Extra arguments')
group2.add_argument('--use-main-api', action='store_true',
help='Use OSM API instead of Overpass to download objects')
group2.add_argument('--osm2pgsql-cache', metavar='SIZE', type=int,
help='Size of cache to be used by osm2pgsql (in MB)')
group2.add_argument('--socket-timeout', dest='socket_timeout', type=int, default=60,
help='Set timeout for file downloads')
def run(self, args: NominatimArgs) -> int:
from ..tokenizer import factory as tokenizer_factory
from ..tools import tiger_data, add_osm_data
if args.tiger_data:
tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)
return tiger_data.add_tiger_data(args.tiger_data,
args.config,
args.threads or psutil.cpu_count() or 1,
tokenizer)
osm2pgsql_params = args.osm2pgsql_options(default_cache=1000, default_threads=1)
if args.file or args.diff:
return add_osm_data.add_data_from_file(args.config.get_libpq_dsn(),
cast(str, args.file or args.diff),
osm2pgsql_params)
if args.node:
return add_osm_data.add_osm_object(args.config.get_libpq_dsn(),
'node', args.node,
args.use_main_api,
osm2pgsql_params)
if args.way:
return add_osm_data.add_osm_object(args.config.get_libpq_dsn(),
'way', args.way,
args.use_main_api,
osm2pgsql_params)
if args.relation:
return add_osm_data.add_osm_object(args.config.get_libpq_dsn(),
'relation', args.relation,
args.use_main_api,
osm2pgsql_params)
return 0

View File

@@ -0,0 +1,123 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Implementation of the 'admin' subcommand.
"""
import logging
import argparse
import random
import nominatim_api as napi
from nominatim_core.db.connection import connect
from .args import NominatimArgs
# Do not repeat documentation of subcommand classes.
# pylint: disable=C0111
# Using non-top-level imports to avoid potentially unused imports.
# pylint: disable=E0012,C0415
LOG = logging.getLogger()
class AdminFuncs:
"""\
Analyse and maintain the database.
"""
def add_args(self, parser: argparse.ArgumentParser) -> None:
group = parser.add_argument_group('Admin tasks')
objs = group.add_mutually_exclusive_group(required=True)
objs.add_argument('--warm', action='store_true',
help='Warm database caches for search and reverse queries')
objs.add_argument('--check-database', action='store_true',
help='Check that the database is complete and operational')
objs.add_argument('--migrate', action='store_true',
help='Migrate the database to a new software version')
objs.add_argument('--analyse-indexing', action='store_true',
help='Print performance analysis of the indexing process')
objs.add_argument('--collect-os-info', action="store_true",
help="Generate a report about the host system information")
objs.add_argument('--clean-deleted', action='store', metavar='AGE',
help='Clean up deleted relations')
group = parser.add_argument_group('Arguments for cache warming')
group.add_argument('--search-only', action='store_const', dest='target',
const='search',
help="Only pre-warm tables for search queries")
group.add_argument('--reverse-only', action='store_const', dest='target',
const='reverse',
help="Only pre-warm tables for reverse queries")
group = parser.add_argument_group('Arguments for index analysis')
mgroup = group.add_mutually_exclusive_group()
mgroup.add_argument('--osm-id', type=str,
help='Analyse indexing of the given OSM object')
mgroup.add_argument('--place-id', type=int,
help='Analyse indexing of the given Nominatim object')
def run(self, args: NominatimArgs) -> int:
# pylint: disable=too-many-return-statements
if args.warm:
return self._warm(args)
if args.check_database:
LOG.warning('Checking database')
from ..tools import check_database
return check_database.check_database(args.config)
if args.analyse_indexing:
LOG.warning('Analysing performance of indexing function')
from ..tools import admin
admin.analyse_indexing(args.config, osm_id=args.osm_id, place_id=args.place_id)
return 0
if args.migrate:
LOG.warning('Checking for necessary database migrations')
from ..tools import migration
return migration.migrate(args.config, args)
if args.collect_os_info:
LOG.warning("Reporting System Information")
from ..tools import collect_os_info
collect_os_info.report_system_information(args.config)
return 0
if args.clean_deleted:
LOG.warning('Cleaning up deleted relations')
from ..tools import admin
admin.clean_deleted_relations(args.config, age=args.clean_deleted)
return 0
return 1
def _warm(self, args: NominatimArgs) -> int:
LOG.warning('Warming database caches')
api = napi.NominatimAPI(args.project_dir)
try:
if args.target != 'search':
for _ in range(1000):
api.reverse((random.uniform(-90, 90), random.uniform(-180, 180)),
address_details=True)
if args.target != 'reverse':
from ..tokenizer import factory as tokenizer_factory
tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)
with connect(args.config.get_libpq_dsn()) as conn:
if conn.table_exists('search_name'):
words = tokenizer.most_frequent_words(conn, 1000)
else:
words = []
for word in words:
api.search(word)
finally:
api.close()
return 0

View File

@@ -0,0 +1,374 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Subcommand definitions for API calls from the command line.
"""
from typing import Dict, Any
import argparse
import logging
import json
import sys
import nominatim_api as napi
import nominatim_api.v1 as api_output
from nominatim_api.v1.helpers import zoom_to_rank, deduplicate_results
from nominatim_api.v1.format import dispatch as formatting
import nominatim_api.logging as loglib
from .args import NominatimArgs
# Do not repeat documentation of subcommand classes.
# pylint: disable=C0111
LOG = logging.getLogger()
STRUCTURED_QUERY = (
('amenity', 'name and/or type of POI'),
('street', 'housenumber and street'),
('city', 'city, town or village'),
('county', 'county'),
('state', 'state'),
('country', 'country'),
('postalcode', 'postcode')
)
EXTRADATA_PARAMS = (
('addressdetails', 'Include a breakdown of the address into elements'),
('extratags', ("Include additional information if available "
"(e.g. wikipedia link, opening hours)")),
('namedetails', 'Include a list of alternative names')
)
def _add_api_output_arguments(parser: argparse.ArgumentParser) -> None:
group = parser.add_argument_group('Output arguments')
group.add_argument('--format', default='jsonv2',
choices=formatting.list_formats(napi.SearchResults) + ['debug'],
help='Format of result')
for name, desc in EXTRADATA_PARAMS:
group.add_argument('--' + name, action='store_true', help=desc)
group.add_argument('--lang', '--accept-language', metavar='LANGS',
help='Preferred language order for presenting search results')
group.add_argument('--polygon-output',
choices=['geojson', 'kml', 'svg', 'text'],
help='Output geometry of results as a GeoJSON, KML, SVG or WKT')
group.add_argument('--polygon-threshold', type=float, default = 0.0,
metavar='TOLERANCE',
help=("Simplify output geometry."
"Parameter is difference tolerance in degrees."))
class APISearch:
"""\
Execute a search query.
This command works exactly the same as if calling the /search endpoint on
the web API. See the online documentation for more details on the
various parameters:
https://nominatim.org/release-docs/latest/api/Search/
"""
def add_args(self, parser: argparse.ArgumentParser) -> None:
group = parser.add_argument_group('Query arguments')
group.add_argument('--query',
help='Free-form query string')
for name, desc in STRUCTURED_QUERY:
group.add_argument('--' + name, help='Structured query: ' + desc)
_add_api_output_arguments(parser)
group = parser.add_argument_group('Result limitation')
group.add_argument('--countrycodes', metavar='CC,..',
help='Limit search results to one or more countries')
group.add_argument('--exclude_place_ids', metavar='ID,..',
                           help='List of search objects to be excluded')
group.add_argument('--limit', type=int, default=10,
help='Limit the number of returned results')
group.add_argument('--viewbox', metavar='X1,Y1,X2,Y2',
help='Preferred area to find search results')
group.add_argument('--bounded', action='store_true',
help='Strictly restrict results to viewbox area')
group = parser.add_argument_group('Other arguments')
group.add_argument('--no-dedupe', action='store_false', dest='dedupe',
help='Do not remove duplicates from the result list')
def run(self, args: NominatimArgs) -> int:
if args.format == 'debug':
loglib.set_log_output('text')
api = napi.NominatimAPI(args.project_dir)
params: Dict[str, Any] = {'max_results': args.limit + min(args.limit, 10),
'address_details': True, # needed for display name
'geometry_output': args.get_geometry_output(),
'geometry_simplification': args.polygon_threshold,
'countries': args.countrycodes,
'excluded': args.exclude_place_ids,
'viewbox': args.viewbox,
'bounded_viewbox': args.bounded,
'locales': args.get_locales(api.config.DEFAULT_LANGUAGE)
}
if args.query:
results = api.search(args.query, **params)
else:
results = api.search_address(amenity=args.amenity,
street=args.street,
city=args.city,
county=args.county,
state=args.state,
postalcode=args.postalcode,
country=args.country,
**params)
if args.dedupe and len(results) > 1:
results = deduplicate_results(results, args.limit)
if args.format == 'debug':
print(loglib.get_and_disable())
return 0
output = api_output.format_result(
results,
args.format,
{'extratags': args.extratags,
'namedetails': args.namedetails,
'addressdetails': args.addressdetails})
if args.format != 'xml':
# reformat the result, so it is pretty-printed
json.dump(json.loads(output), sys.stdout, indent=4, ensure_ascii=False)
else:
sys.stdout.write(output)
sys.stdout.write('\n')
return 0
class APIReverse:
"""\
Execute API reverse query.
This command works exactly the same as if calling the /reverse endpoint on
the web API. See the online documentation for more details on the
various parameters:
https://nominatim.org/release-docs/latest/api/Reverse/
"""
def add_args(self, parser: argparse.ArgumentParser) -> None:
group = parser.add_argument_group('Query arguments')
group.add_argument('--lat', type=float, required=True,
help='Latitude of coordinate to look up (in WGS84)')
group.add_argument('--lon', type=float, required=True,
help='Longitude of coordinate to look up (in WGS84)')
group.add_argument('--zoom', type=int,
help='Level of detail required for the address')
group.add_argument('--layer', metavar='LAYER',
choices=[n.name.lower() for n in napi.DataLayer if n.name],
action='append', required=False, dest='layers',
                           help='Restrict results to one or more layers (may be repeated)')
_add_api_output_arguments(parser)
def run(self, args: NominatimArgs) -> int:
if args.format == 'debug':
loglib.set_log_output('text')
api = napi.NominatimAPI(args.project_dir)
result = api.reverse(napi.Point(args.lon, args.lat),
max_rank=zoom_to_rank(args.zoom or 18),
layers=args.get_layers(napi.DataLayer.ADDRESS | napi.DataLayer.POI),
address_details=True, # needed for display name
geometry_output=args.get_geometry_output(),
geometry_simplification=args.polygon_threshold,
locales=args.get_locales(api.config.DEFAULT_LANGUAGE))
if args.format == 'debug':
print(loglib.get_and_disable())
return 0
if result:
output = api_output.format_result(
napi.ReverseResults([result]),
args.format,
{'extratags': args.extratags,
'namedetails': args.namedetails,
'addressdetails': args.addressdetails})
if args.format != 'xml':
# reformat the result, so it is pretty-printed
json.dump(json.loads(output), sys.stdout, indent=4, ensure_ascii=False)
else:
sys.stdout.write(output)
sys.stdout.write('\n')
return 0
LOG.error("Unable to geocode.")
return 42
class APILookup:
"""\
Execute API lookup query.
This command works exactly the same as if calling the /lookup endpoint on
the web API. See the online documentation for more details on the
various parameters:
https://nominatim.org/release-docs/latest/api/Lookup/
"""
def add_args(self, parser: argparse.ArgumentParser) -> None:
group = parser.add_argument_group('Query arguments')
group.add_argument('--id', metavar='OSMID',
action='append', required=True, dest='ids',
help='OSM id to lookup in format <NRW><id> (may be repeated)')
_add_api_output_arguments(parser)
def run(self, args: NominatimArgs) -> int:
if args.format == 'debug':
loglib.set_log_output('text')
api = napi.NominatimAPI(args.project_dir)
if args.format == 'debug':
print(loglib.get_and_disable())
return 0
places = [napi.OsmID(o[0], int(o[1:])) for o in args.ids]
results = api.lookup(places,
address_details=True, # needed for display name
geometry_output=args.get_geometry_output(),
geometry_simplification=args.polygon_threshold or 0.0,
locales=args.get_locales(api.config.DEFAULT_LANGUAGE))
output = api_output.format_result(
results,
args.format,
{'extratags': args.extratags,
'namedetails': args.namedetails,
'addressdetails': args.addressdetails})
if args.format != 'xml':
# reformat the result, so it is pretty-printed
json.dump(json.loads(output), sys.stdout, indent=4, ensure_ascii=False)
else:
sys.stdout.write(output)
sys.stdout.write('\n')
return 0
class APIDetails:
"""\
Execute API details query.
This command works exactly the same as if calling the /details endpoint on
the web API. See the online documentation for more details on the
various parameters:
https://nominatim.org/release-docs/latest/api/Details/
"""
def add_args(self, parser: argparse.ArgumentParser) -> None:
group = parser.add_argument_group('Query arguments')
objs = group.add_mutually_exclusive_group(required=True)
objs.add_argument('--node', '-n', type=int,
help="Look up the OSM node with the given ID.")
objs.add_argument('--way', '-w', type=int,
help="Look up the OSM way with the given ID.")
objs.add_argument('--relation', '-r', type=int,
help="Look up the OSM relation with the given ID.")
objs.add_argument('--place_id', '-p', type=int,
help='Database internal identifier of the OSM object to look up')
group.add_argument('--class', dest='object_class',
help=("Class type to disambiguated multiple entries "
"of the same object."))
group = parser.add_argument_group('Output arguments')
group.add_argument('--addressdetails', action='store_true',
help='Include a breakdown of the address into elements')
group.add_argument('--keywords', action='store_true',
help='Include a list of name keywords and address keywords')
group.add_argument('--linkedplaces', action='store_true',
                           help='Include details of places that are linked with this one')
group.add_argument('--hierarchy', action='store_true',
help='Include details of places lower in the address hierarchy')
group.add_argument('--group_hierarchy', action='store_true',
help='Group the places by type')
group.add_argument('--polygon_geojson', action='store_true',
help='Include geometry of result')
group.add_argument('--lang', '--accept-language', metavar='LANGS',
help='Preferred language order for presenting search results')
def run(self, args: NominatimArgs) -> int:
place: napi.PlaceRef
if args.node:
place = napi.OsmID('N', args.node, args.object_class)
elif args.way:
place = napi.OsmID('W', args.way, args.object_class)
elif args.relation:
place = napi.OsmID('R', args.relation, args.object_class)
else:
assert args.place_id is not None
place = napi.PlaceID(args.place_id)
api = napi.NominatimAPI(args.project_dir)
locales = args.get_locales(api.config.DEFAULT_LANGUAGE)
result = api.details(place,
address_details=args.addressdetails,
linked_places=args.linkedplaces,
parented_places=args.hierarchy,
keywords=args.keywords,
geometry_output=napi.GeometryFormat.GEOJSON
if args.polygon_geojson
else napi.GeometryFormat.NONE,
locales=locales)
if result:
output = api_output.format_result(
result,
'json',
{'locales': locales,
'group_hierarchy': args.group_hierarchy})
# reformat the result, so it is pretty-printed
json.dump(json.loads(output), sys.stdout, indent=4, ensure_ascii=False)
sys.stdout.write('\n')
return 0
LOG.error("Object not found in database.")
return 42
class APIStatus:
"""
Execute API status query.
This command works exactly the same as if calling the /status endpoint on
the web API. See the online documentation for more details on the
various parameters:
https://nominatim.org/release-docs/latest/api/Status/
"""
def add_args(self, parser: argparse.ArgumentParser) -> None:
formats = api_output.list_formats(napi.StatusResult)
group = parser.add_argument_group('API parameters')
group.add_argument('--format', default=formats[0], choices=formats,
help='Format of result')
def run(self, args: NominatimArgs) -> int:
status = napi.NominatimAPI(args.project_dir).status()
print(api_output.format_result(status, args.format, {}))
return 0
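
The subcommands above are thin wrappers around the library interface exported by the nominatim_api package. A minimal sketch of the equivalent direct library calls (the project path, coordinates and OSM id are illustrative, not taken from the commit):

from pathlib import Path
import nominatim_api as napi

api = napi.NominatimAPI(Path('/srv/nominatim-project'))   # illustrative project directory
try:
    results = api.search('Brandenburger Tor, Berlin', address_details=True)     # /search
    place = api.reverse(napi.Point(13.3777, 52.5163), address_details=True)     # /reverse (lon, lat)
    looked_up = api.lookup([napi.OsmID('N', 240109189)], address_details=True)  # /lookup
    status = api.status()                                                       # /status
finally:
    api.close()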

View File

@@ -0,0 +1,260 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Provides custom functions over command-line arguments.
"""
from typing import Optional, List, Dict, Any, Sequence, Tuple
import argparse
import logging
from functools import reduce
from pathlib import Path
from nominatim_core.errors import UsageError
from nominatim_core.config import Configuration
from nominatim_core.typing import Protocol
import nominatim_api as napi
LOG = logging.getLogger()
class Subcommand(Protocol):
"""
Interface to be implemented by classes implementing a CLI subcommand.
"""
def add_args(self, parser: argparse.ArgumentParser) -> None:
"""
Fill the given parser for the subcommand with the appropriate
parameters.
"""
def run(self, args: 'NominatimArgs') -> int:
"""
Run the subcommand with the given parsed arguments.
"""
class NominatimArgs:
""" Customized namespace class for the nominatim command line tool
to receive the command-line arguments.
"""
# Basic environment set by root program.
config: Configuration
project_dir: Path
# Global switches
version: bool
subcommand: Optional[str]
command: Subcommand
# Shared parameters
osm2pgsql_cache: Optional[int]
socket_timeout: int
# Arguments added to all subcommands.
verbose: int
threads: Optional[int]
# Arguments to 'add-data'
file: Optional[str]
diff: Optional[str]
node: Optional[int]
way: Optional[int]
relation: Optional[int]
tiger_data: Optional[str]
use_main_api: bool
# Arguments to 'admin'
warm: bool
check_database: bool
migrate: bool
collect_os_info: bool
clean_deleted: str
analyse_indexing: bool
target: Optional[str]
osm_id: Optional[str]
place_id: Optional[int]
# Arguments to 'import'
osm_file: List[str]
continue_at: Optional[str]
reverse_only: bool
no_partitions: bool
no_updates: bool
offline: bool
ignore_errors: bool
index_noanalyse: bool
prepare_database: bool
# Arguments to 'index'
boundaries_only: bool
no_boundaries: bool
minrank: int
maxrank: int
# Arguments to 'export'
output_type: str
output_format: str
output_all_postcodes: bool
language: Optional[str]
restrict_to_country: Optional[str]
# Arguments to 'convert'
output: Path
# Arguments to 'refresh'
postcodes: bool
word_tokens: bool
word_counts: bool
address_levels: bool
functions: bool
wiki_data: bool
secondary_importance: bool
importance: bool
website: bool
diffs: bool
enable_debug_statements: bool
data_object: Sequence[Tuple[str, int]]
data_area: Sequence[Tuple[str, int]]
# Arguments to 'replication'
init: bool
update_functions: bool
check_for_updates: bool
once: bool
catch_up: bool
do_index: bool
# Arguments to 'serve'
server: str
engine: str
# Arguments to 'special-phrases
import_from_wiki: bool
import_from_csv: Optional[str]
no_replace: bool
# Arguments to all query functions
format: str
addressdetails: bool
extratags: bool
namedetails: bool
lang: Optional[str]
polygon_output: Optional[str]
polygon_threshold: Optional[float]
# Arguments to 'search'
query: Optional[str]
amenity: Optional[str]
street: Optional[str]
city: Optional[str]
county: Optional[str]
state: Optional[str]
country: Optional[str]
postalcode: Optional[str]
countrycodes: Optional[str]
exclude_place_ids: Optional[str]
limit: int
viewbox: Optional[str]
bounded: bool
dedupe: bool
# Arguments to 'reverse'
lat: float
lon: float
zoom: Optional[int]
layers: Optional[Sequence[str]]
# Arguments to 'lookup'
ids: Sequence[str]
# Arguments to 'details'
object_class: Optional[str]
linkedplaces: bool
hierarchy: bool
keywords: bool
polygon_geojson: bool
group_hierarchy: bool
def osm2pgsql_options(self, default_cache: int,
default_threads: int) -> Dict[str, Any]:
""" Return the standard osm2pgsql options that can be derived
from the command line arguments. The resulting dict can be
further customized and then used in `run_osm2pgsql()`.
"""
return dict(osm2pgsql=self.config.OSM2PGSQL_BINARY or self.config.lib_dir.osm2pgsql,
osm2pgsql_cache=self.osm2pgsql_cache or default_cache,
osm2pgsql_style=self.config.get_import_style_file(),
osm2pgsql_style_path=self.config.config_dir,
threads=self.threads or default_threads,
dsn=self.config.get_libpq_dsn(),
flatnode_file=str(self.config.get_path('FLATNODE_FILE') or ''),
tablespaces=dict(slim_data=self.config.TABLESPACE_OSM_DATA,
slim_index=self.config.TABLESPACE_OSM_INDEX,
main_data=self.config.TABLESPACE_PLACE_DATA,
main_index=self.config.TABLESPACE_PLACE_INDEX
)
)
def get_osm_file_list(self) -> Optional[List[Path]]:
""" Return the --osm-file argument as a list of Paths or None
if no argument was given. The function also checks if the files
exist and raises a UsageError if one cannot be found.
"""
if not self.osm_file:
return None
files = [Path(f) for f in self.osm_file]
for fname in files:
if not fname.is_file():
LOG.fatal("OSM file '%s' does not exist.", fname)
raise UsageError('Cannot access file.')
return files
def get_geometry_output(self) -> napi.GeometryFormat:
""" Get the requested geometry output format in a API-compatible
format.
"""
if not self.polygon_output:
return napi.GeometryFormat.NONE
if self.polygon_output == 'geojson':
return napi.GeometryFormat.GEOJSON
if self.polygon_output == 'kml':
return napi.GeometryFormat.KML
if self.polygon_output == 'svg':
return napi.GeometryFormat.SVG
if self.polygon_output == 'text':
return napi.GeometryFormat.TEXT
try:
return napi.GeometryFormat[self.polygon_output.upper()]
except KeyError as exp:
raise UsageError(f"Unknown polygon output format '{self.polygon_output}'.") from exp
def get_locales(self, default: Optional[str]) -> napi.Locales:
""" Get the locales from the language parameter.
"""
if self.lang:
return napi.Locales.from_accept_languages(self.lang)
if default:
return napi.Locales.from_accept_languages(default)
return napi.Locales()
def get_layers(self, default: napi.DataLayer) -> Optional[napi.DataLayer]:
""" Get the list of selected layers as a DataLayer enum.
"""
if not self.layers:
return default
return reduce(napi.DataLayer.__or__,
(napi.DataLayer[s.upper()] for s in self.layers))
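
As a worked example of how get_layers() folds multiple --layer values into a single flag (the layer names below are illustrative):

from functools import reduce
import nominatim_api as napi

layers = ['address', 'poi']   # e.g. '--layer address --layer poi' on the command line
combined = reduce(napi.DataLayer.__or__, (napi.DataLayer[s.upper()] for s in layers))
assert combined == napi.DataLayer.ADDRESS | napi.DataLayer.POI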

View File

@@ -0,0 +1,95 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Implementation of the 'convert' subcommand.
"""
from typing import Set, Any, Union, Optional, Sequence
import argparse
import asyncio
from pathlib import Path
from nominatim_core.errors import UsageError
from .args import NominatimArgs
# Do not repeat documentation of subcommand classes.
# pylint: disable=C0111
# Using non-top-level imports to avoid eventually unused imports.
# pylint: disable=E0012,C0415
class WithAction(argparse.Action):
""" Special action that saves a list of flags, given on the command-line
as `--with-foo` or `--without-foo`.
"""
def __init__(self, option_strings: Sequence[str], dest: Any,
default: bool = True, **kwargs: Any) -> None:
if 'nargs' in kwargs:
raise ValueError("nargs not allowed.")
if option_strings is None:
raise ValueError("Positional parameter not allowed.")
self.dest_set = kwargs.pop('dest_set')
full_option_strings = []
for opt in option_strings:
if not opt.startswith('--'):
raise ValueError("short-form options not allowed")
if default:
self.dest_set.add(opt[2:])
full_option_strings.append(f"--with-{opt[2:]}")
full_option_strings.append(f"--without-{opt[2:]}")
super().__init__(full_option_strings, argparse.SUPPRESS, nargs=0, **kwargs)
def __call__(self, parser: argparse.ArgumentParser, namespace: argparse.Namespace,
values: Union[str, Sequence[Any], None],
option_string: Optional[str] = None) -> None:
assert option_string
if option_string.startswith('--with-'):
self.dest_set.add(option_string[7:])
if option_string.startswith('--without-'):
self.dest_set.discard(option_string[10:])
class ConvertDB:
""" Convert an existing database into a different format. (EXPERIMENTAL)
Dump a read-only version of the database in a different format.
At the moment only a SQLite database suitable for reverse lookup
can be created.
"""
def __init__(self) -> None:
self.options: Set[str] = set()
def add_args(self, parser: argparse.ArgumentParser) -> None:
parser.add_argument('--format', default='sqlite',
choices=('sqlite', ),
help='Format of the output database (must be sqlite currently)')
parser.add_argument('--output', '-o', required=True, type=Path,
help='File to write the database to.')
        group = parser.add_argument_group('Switches to define database layout '
'(currently no effect)')
group.add_argument('--reverse', action=WithAction, dest_set=self.options, default=True,
help='Enable/disable support for reverse and lookup API'
' (default: enabled)')
        group.add_argument('--search', action=WithAction, dest_set=self.options, default=False,
help='Enable/disable support for search API (default: disabled)')
group.add_argument('--details', action=WithAction, dest_set=self.options, default=True,
help='Enable/disable support for details API (default: enabled)')
def run(self, args: NominatimArgs) -> int:
if args.output.exists():
raise UsageError(f"File '{args.output}' already exists. Refusing to overwrite.")
if args.format == 'sqlite':
from ..tools import convert_sqlite
asyncio.run(convert_sqlite.convert(args.project_dir, args.output, self.options))
return 0
return 1
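
A minimal sketch of how the WithAction flags defined above behave when parsed (assumes WithAction from this module is in scope; the option name is illustrative):

import argparse

options = set()
parser = argparse.ArgumentParser()
# registers --with-search and --without-search; default=True pre-adds 'search'
parser.add_argument('--search', action=WithAction, dest_set=options, default=True)

parser.parse_args([])                    # options stays {'search'}
parser.parse_args(['--without-search'])  # 'search' is discarded
parser.parse_args(['--with-search'])     # 'search' is added back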

View File

@@ -0,0 +1,200 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Implementation of the 'export' subcommand.
"""
from typing import Optional, List, cast
import logging
import argparse
import asyncio
import csv
import sys
import sqlalchemy as sa
import nominatim_api as napi
from nominatim_api.results import create_from_placex_row, ReverseResult, add_result_details
from nominatim_api.types import LookupDetails
from nominatim_core.errors import UsageError
from .args import NominatimArgs
# Do not repeat documentation of subcommand classes.
# pylint: disable=C0111
# Using non-top-level imports to avoid eventually unused imports.
# pylint: disable=E0012,C0415
# Needed for SQLAlchemy
# pylint: disable=singleton-comparison
LOG = logging.getLogger()
RANK_RANGE_MAP = {
'country': (4, 4),
'state': (5, 9),
'county': (10, 12),
'city': (13, 16),
'suburb': (17, 21),
'street': (26, 26),
'path': (27, 27)
}
RANK_TO_OUTPUT_MAP = {
4: 'country',
5: 'state', 6: 'state', 7: 'state', 8: 'state', 9: 'state',
10: 'county', 11: 'county', 12: 'county',
13: 'city', 14: 'city', 15: 'city', 16: 'city',
17: 'suburb', 18: 'suburb', 19: 'suburb', 20: 'suburb', 21: 'suburb',
26: 'street', 27: 'path'}
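# e.g. an address line with rank_address 14 ends up in the 'city' column,
# one with rank_address 27 in the 'path' column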
class QueryExport:
"""\
Export places as CSV file from the database.
"""
def add_args(self, parser: argparse.ArgumentParser) -> None:
group = parser.add_argument_group('Output arguments')
group.add_argument('--output-type', default='street',
choices=('country', 'state', 'county',
'city', 'suburb', 'street', 'path'),
help='Type of places to output (default: street)')
group.add_argument('--output-format',
default='street;suburb;city;county;state;country',
help=("Semicolon-separated list of address types "
"(see --output-type). Additionally accepts:"
"placeid,postcode"))
group.add_argument('--language',
help=("Preferred language for output "
"(use local name, if omitted)"))
group = parser.add_argument_group('Filter arguments')
group.add_argument('--restrict-to-country', metavar='COUNTRY_CODE',
help='Export only objects within country')
group.add_argument('--restrict-to-osm-node', metavar='ID', type=int,
dest='node',
help='Export only children of this OSM node')
group.add_argument('--restrict-to-osm-way', metavar='ID', type=int,
dest='way',
help='Export only children of this OSM way')
group.add_argument('--restrict-to-osm-relation', metavar='ID', type=int,
dest='relation',
help='Export only children of this OSM relation')
def run(self, args: NominatimArgs) -> int:
return asyncio.run(export(args))
async def export(args: NominatimArgs) -> int:
""" The actual export as a asynchronous function.
"""
api = napi.NominatimAPIAsync(args.project_dir)
try:
output_range = RANK_RANGE_MAP[args.output_type]
writer = init_csv_writer(args.output_format)
async with api.begin() as conn, api.begin() as detail_conn:
t = conn.t.placex
sql = sa.select(t.c.place_id, t.c.parent_place_id,
t.c.osm_type, t.c.osm_id, t.c.name,
t.c.class_, t.c.type, t.c.admin_level,
t.c.address, t.c.extratags,
t.c.housenumber, t.c.postcode, t.c.country_code,
t.c.importance, t.c.wikipedia, t.c.indexed_date,
t.c.rank_address, t.c.rank_search,
t.c.centroid)\
.where(t.c.linked_place_id == None)\
.where(t.c.rank_address.between(*output_range))
parent_place_id = await get_parent_id(conn, args.node, args.way, args.relation)
if parent_place_id:
taddr = conn.t.addressline
sql = sql.join(taddr, taddr.c.place_id == t.c.place_id)\
.where(taddr.c.address_place_id == parent_place_id)\
.where(taddr.c.isaddress)
if args.restrict_to_country:
sql = sql.where(t.c.country_code == args.restrict_to_country.lower())
results = []
for row in await conn.execute(sql):
result = create_from_placex_row(row, ReverseResult)
if result is not None:
results.append(result)
if len(results) == 1000:
await dump_results(detail_conn, results, writer, args.language)
results = []
if results:
await dump_results(detail_conn, results, writer, args.language)
finally:
await api.close()
return 0
def init_csv_writer(output_format: str) -> 'csv.DictWriter[str]':
fields = output_format.split(';')
writer = csv.DictWriter(sys.stdout, fieldnames=fields, extrasaction='ignore')
writer.writeheader()
return writer
async def dump_results(conn: napi.SearchConnection,
results: List[ReverseResult],
writer: 'csv.DictWriter[str]',
lang: Optional[str]) -> None:
locale = napi.Locales([lang] if lang else None)
await add_result_details(conn, results,
LookupDetails(address_details=True, locales=locale))
for result in results:
data = {'placeid': result.place_id,
'postcode': result.postcode}
for line in (result.address_rows or []):
if line.isaddress and line.local_name:
if line.category[1] == 'postcode':
data['postcode'] = line.local_name
elif line.rank_address in RANK_TO_OUTPUT_MAP:
data[RANK_TO_OUTPUT_MAP[line.rank_address]] = line.local_name
writer.writerow(data)
async def get_parent_id(conn: napi.SearchConnection, node_id: Optional[int],
way_id: Optional[int],
relation_id: Optional[int]) -> Optional[int]:
""" Get the place ID for the given OSM object.
"""
if node_id is not None:
osm_type, osm_id = 'N', node_id
elif way_id is not None:
osm_type, osm_id = 'W', way_id
elif relation_id is not None:
osm_type, osm_id = 'R', relation_id
else:
return None
t = conn.t.placex
sql = sa.select(t.c.place_id).limit(1)\
.where(t.c.osm_type == osm_type)\
.where(t.c.osm_id == osm_id)\
.where(t.c.rank_address > 0)\
.order_by(t.c.rank_address)
for result in await conn.execute(sql):
return cast(int, result[0])
raise UsageError(f'Cannot find a place {osm_type}{osm_id}.')

View File

@@ -0,0 +1,43 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Implementation of the 'freeze' subcommand.
"""
import argparse
from nominatim_core.db.connection import connect
from .args import NominatimArgs
# Do not repeat documentation of subcommand classes.
# pylint: disable=C0111
# Using non-top-level imports to avoid eventually unused imports.
# pylint: disable=E0012,C0415
class SetupFreeze:
"""\
Make database read-only.
    About half of the data in the Nominatim database is kept only to be able to
keep the data up-to-date with new changes made in OpenStreetMap. This
command drops all this data and only keeps the part needed for geocoding
itself.
This command has the same effect as the `--no-updates` option for imports.
"""
def add_args(self, parser: argparse.ArgumentParser) -> None:
pass # No options
def run(self, args: NominatimArgs) -> int:
from ..tools import freeze
with connect(args.config.get_libpq_dsn()) as conn:
freeze.drop_update_tables(conn)
freeze.drop_flatnode_file(args.config.get_path('FLATNODE_FILE'))
return 0

View File

@@ -0,0 +1,66 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Implementation of the 'index' subcommand.
"""
import argparse
import psutil
from nominatim_core.db import status
from nominatim_core.db.connection import connect
from .args import NominatimArgs
# Do not repeat documentation of subcommand classes.
# pylint: disable=C0111
# Using non-top-level imports to avoid eventually unused imports.
# pylint: disable=E0012,C0415
class UpdateIndex:
"""\
Reindex all new and modified data.
Indexing is the process of computing the address and search terms for
the places in the database. Every time data is added or changed, indexing
needs to be run. Imports and replication updates automatically take care
    of indexing. For other cases, this command allows indexing to be run manually.
"""
def add_args(self, parser: argparse.ArgumentParser) -> None:
group = parser.add_argument_group('Filter arguments')
group.add_argument('--boundaries-only', action='store_true',
help="""Index only administrative boundaries.""")
group.add_argument('--no-boundaries', action='store_true',
help="""Index everything except administrative boundaries.""")
group.add_argument('--minrank', '-r', type=int, metavar='RANK', default=0,
help='Minimum/starting rank')
group.add_argument('--maxrank', '-R', type=int, metavar='RANK', default=30,
help='Maximum/finishing rank')
def run(self, args: NominatimArgs) -> int:
from ..indexer.indexer import Indexer
from ..tokenizer import factory as tokenizer_factory
tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)
indexer = Indexer(args.config.get_libpq_dsn(), tokenizer,
args.threads or psutil.cpu_count() or 1)
if not args.no_boundaries:
indexer.index_boundaries(args.minrank, args.maxrank)
if not args.boundaries_only:
indexer.index_by_rank(args.minrank, args.maxrank)
indexer.index_postcodes()
if not args.no_boundaries and not args.boundaries_only \
and args.minrank == 0 and args.maxrank == 30:
with connect(args.config.get_libpq_dsn()) as conn:
status.set_indexed(conn, True)
return 0

View File

@@ -0,0 +1,187 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Implementation of 'refresh' subcommand.
"""
from typing import Tuple, Optional
import argparse
import logging
from pathlib import Path
from nominatim_core.config import Configuration
from nominatim_core.db.connection import connect
from ..tokenizer.base import AbstractTokenizer
from .args import NominatimArgs
# Do not repeat documentation of subcommand classes.
# pylint: disable=C0111
# Using non-top-level imports to avoid eventually unused imports.
# pylint: disable=E0012,C0415
LOG = logging.getLogger()
def _parse_osm_object(obj: str) -> Tuple[str, int]:
""" Parse the given argument into a tuple of OSM type and ID.
Raises an ArgumentError if the format is not recognized.
"""
if len(obj) < 2 or obj[0].lower() not in 'nrw' or not obj[1:].isdigit():
raise argparse.ArgumentTypeError("Cannot parse OSM ID. Expect format: [N|W|R]<id>.")
return (obj[0].upper(), int(obj[1:]))
class UpdateRefresh:
"""\
Recompute auxiliary data used by the indexing process.
    This sub-command updates various static data and functions in the database.
It usually needs to be run after changing various aspects of the
configuration. The configuration documentation will mention the exact
command to use in such case.
Warning: the 'update' command must not be run in parallel with other update
commands like 'replication' or 'add-data'.
"""
def __init__(self) -> None:
self.tokenizer: Optional[AbstractTokenizer] = None
def add_args(self, parser: argparse.ArgumentParser) -> None:
group = parser.add_argument_group('Data arguments')
group.add_argument('--postcodes', action='store_true',
help='Update postcode centroid table')
group.add_argument('--word-tokens', action='store_true',
help='Clean up search terms')
group.add_argument('--word-counts', action='store_true',
help='Compute frequency of full-word search terms')
group.add_argument('--address-levels', action='store_true',
help='Reimport address level configuration')
group.add_argument('--functions', action='store_true',
help='Update the PL/pgSQL functions in the database')
group.add_argument('--wiki-data', action='store_true',
help='Update Wikipedia/data importance numbers')
group.add_argument('--secondary-importance', action='store_true',
help='Update secondary importance raster data')
group.add_argument('--importance', action='store_true',
help='Recompute place importances (expensive!)')
group.add_argument('--website', action='store_true',
help='Refresh the directory that serves the scripts for the web API')
group.add_argument('--data-object', action='append',
type=_parse_osm_object, metavar='OBJECT',
help='Mark the given OSM object as requiring an update'
' (format: [NWR]<id>)')
group.add_argument('--data-area', action='append',
type=_parse_osm_object, metavar='OBJECT',
help='Mark the area around the given OSM object as requiring an update'
' (format: [NWR]<id>)')
group = parser.add_argument_group('Arguments for function refresh')
group.add_argument('--no-diff-updates', action='store_false', dest='diffs',
help='Do not enable code for propagating updates')
group.add_argument('--enable-debug-statements', action='store_true',
help='Enable debug warning statements in functions')
def run(self, args: NominatimArgs) -> int: #pylint: disable=too-many-branches, too-many-statements
from ..tools import refresh, postcodes
from ..indexer.indexer import Indexer
need_function_refresh = args.functions
if args.postcodes:
if postcodes.can_compute(args.config.get_libpq_dsn()):
LOG.warning("Update postcodes centroid")
tokenizer = self._get_tokenizer(args.config)
postcodes.update_postcodes(args.config.get_libpq_dsn(),
args.project_dir, tokenizer)
indexer = Indexer(args.config.get_libpq_dsn(), tokenizer,
args.threads or 1)
indexer.index_postcodes()
else:
LOG.error("The place table doesn't exist. "
"Postcode updates on a frozen database is not possible.")
if args.word_tokens:
LOG.warning('Updating word tokens')
tokenizer = self._get_tokenizer(args.config)
tokenizer.update_word_tokens()
if args.word_counts:
LOG.warning('Recompute word statistics')
self._get_tokenizer(args.config).update_statistics(args.config,
threads=args.threads or 1)
if args.address_levels:
LOG.warning('Updating address levels')
with connect(args.config.get_libpq_dsn()) as conn:
refresh.load_address_levels_from_config(conn, args.config)
# Attention: must come BEFORE functions
if args.secondary_importance:
with connect(args.config.get_libpq_dsn()) as conn:
# If the table did not exist before, then the importance code
# needs to be enabled.
if not conn.table_exists('secondary_importance'):
args.functions = True
LOG.warning('Import secondary importance raster data from %s', args.project_dir)
if refresh.import_secondary_importance(args.config.get_libpq_dsn(),
args.project_dir) > 0:
LOG.fatal('FATAL: Cannot update secondary importance raster data')
return 1
need_function_refresh = True
if args.wiki_data:
data_path = Path(args.config.WIKIPEDIA_DATA_PATH
or args.project_dir)
LOG.warning('Import wikipedia article importance from %s', data_path)
if refresh.import_wikipedia_articles(args.config.get_libpq_dsn(),
data_path) > 0:
LOG.fatal('FATAL: Wikipedia importance file not found in %s', data_path)
return 1
need_function_refresh = True
if need_function_refresh:
LOG.warning('Create functions')
with connect(args.config.get_libpq_dsn()) as conn:
refresh.create_functions(conn, args.config,
args.diffs, args.enable_debug_statements)
self._get_tokenizer(args.config).update_sql_functions(args.config)
# Attention: importance MUST come after wiki data import and after functions.
if args.importance:
LOG.warning('Update importance values for database')
with connect(args.config.get_libpq_dsn()) as conn:
refresh.recompute_importance(conn)
if args.website:
webdir = args.project_dir / 'website'
LOG.warning('Setting up website directory at %s', webdir)
# This is a little bit hacky: call the tokenizer setup, so that
# the tokenizer directory gets repopulated as well, in case it
# wasn't there yet.
self._get_tokenizer(args.config)
with connect(args.config.get_libpq_dsn()) as conn:
refresh.setup_website(webdir, args.config, conn)
if args.data_object or args.data_area:
with connect(args.config.get_libpq_dsn()) as conn:
for obj in args.data_object or []:
refresh.invalidate_osm_object(*obj, conn, recursive=False)
for obj in args.data_area or []:
refresh.invalidate_osm_object(*obj, conn, recursive=True)
conn.commit()
return 0
def _get_tokenizer(self, config: Configuration) -> AbstractTokenizer:
if self.tokenizer is None:
from ..tokenizer import factory as tokenizer_factory
self.tokenizer = tokenizer_factory.get_tokenizer_for_db(config)
return self.tokenizer

View File

@@ -0,0 +1,200 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Implementation of the 'replication' sub-command.
"""
from typing import Optional
import argparse
import datetime as dt
import logging
import socket
import time
from nominatim_core.db import status
from nominatim_core.db.connection import connect
from nominatim_core.errors import UsageError
from .args import NominatimArgs
LOG = logging.getLogger()
# Do not repeat documentation of subcommand classes.
# pylint: disable=C0111
# Using non-top-level imports to make pyosmium optional for replication only.
# pylint: disable=C0415
class UpdateReplication:
"""\
Update the database using an online replication service.
An OSM replication service is an online service that provides regular
    updates (OSM diff files) for the planet or for the extract they cover. The OSMF
provides the primary replication service for the full planet at
https://planet.osm.org/replication/ but there are other providers of
extracts of OSM data who provide such a service as well.
    This sub-command allows you to set up such a replication service and download
and import updates at regular intervals. You need to call '--init' once to
set up the process or whenever you change the replication configuration
parameters. Without any arguments, the sub-command will go into a loop and
continuously apply updates as they become available. Giving `--once` just
downloads and imports the next batch of updates.
"""
def add_args(self, parser: argparse.ArgumentParser) -> None:
group = parser.add_argument_group('Arguments for initialisation')
group.add_argument('--init', action='store_true',
help='Initialise the update process')
group.add_argument('--no-update-functions', dest='update_functions',
action='store_false',
help="Do not update the trigger function to "
"support differential updates (EXPERT)")
group = parser.add_argument_group('Arguments for updates')
group.add_argument('--check-for-updates', action='store_true',
help='Check if new updates are available and exit')
group.add_argument('--once', action='store_true',
help="Download and apply updates only once. When "
"not set, updates are continuously applied")
group.add_argument('--catch-up', action='store_true',
help="Download and apply updates until no new "
"data is available on the server")
group.add_argument('--no-index', action='store_false', dest='do_index',
help=("Do not index the new data. Only usable "
"together with --once"))
group.add_argument('--osm2pgsql-cache', metavar='SIZE', type=int,
help='Size of cache to be used by osm2pgsql (in MB)')
group = parser.add_argument_group('Download parameters')
group.add_argument('--socket-timeout', dest='socket_timeout', type=int, default=60,
help='Set timeout for file downloads')
def _init_replication(self, args: NominatimArgs) -> int:
from ..tools import replication, refresh
LOG.warning("Initialising replication updates")
with connect(args.config.get_libpq_dsn()) as conn:
replication.init_replication(conn, base_url=args.config.REPLICATION_URL,
socket_timeout=args.socket_timeout)
if args.update_functions:
LOG.warning("Create functions")
refresh.create_functions(conn, args.config, True, False)
return 0
def _check_for_updates(self, args: NominatimArgs) -> int:
from ..tools import replication
with connect(args.config.get_libpq_dsn()) as conn:
return replication.check_for_updates(conn, base_url=args.config.REPLICATION_URL,
socket_timeout=args.socket_timeout)
def _report_update(self, batchdate: dt.datetime,
start_import: dt.datetime,
start_index: Optional[dt.datetime]) -> None:
def round_time(delta: dt.timedelta) -> dt.timedelta:
return dt.timedelta(seconds=int(delta.total_seconds()))
end = dt.datetime.now(dt.timezone.utc)
LOG.warning("Update completed. Import: %s. %sTotal: %s. Remaining backlog: %s.",
round_time((start_index or end) - start_import),
f"Indexing: {round_time(end - start_index)} " if start_index else '',
round_time(end - start_import),
round_time(end - batchdate))
def _compute_update_interval(self, args: NominatimArgs) -> int:
if args.catch_up:
return 0
update_interval = args.config.get_int('REPLICATION_UPDATE_INTERVAL')
# Sanity check to not overwhelm the Geofabrik servers.
if 'download.geofabrik.de' in args.config.REPLICATION_URL\
and update_interval < 86400:
LOG.fatal("Update interval too low for download.geofabrik.de.\n"
"Please check install documentation "
"(https://nominatim.org/release-docs/latest/admin/Import-and-Update#"
"setting-up-the-update-process).")
raise UsageError("Invalid replication update interval setting.")
return update_interval
def _update(self, args: NominatimArgs) -> None:
# pylint: disable=too-many-locals
from ..tools import replication
from ..indexer.indexer import Indexer
from ..tokenizer import factory as tokenizer_factory
update_interval = self._compute_update_interval(args)
params = args.osm2pgsql_options(default_cache=2000, default_threads=1)
params.update(base_url=args.config.REPLICATION_URL,
update_interval=update_interval,
import_file=args.project_dir / 'osmosischange.osc',
max_diff_size=args.config.get_int('REPLICATION_MAX_DIFF'),
indexed_only=not args.once)
if not args.once:
if not args.do_index:
LOG.fatal("Indexing cannot be disabled when running updates continuously.")
raise UsageError("Bad argument '--no-index'.")
recheck_interval = args.config.get_int('REPLICATION_RECHECK_INTERVAL')
tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)
indexer = Indexer(args.config.get_libpq_dsn(), tokenizer, args.threads or 1)
dsn = args.config.get_libpq_dsn()
while True:
start = dt.datetime.now(dt.timezone.utc)
state = replication.update(dsn, params, socket_timeout=args.socket_timeout)
with connect(dsn) as conn:
if state is not replication.UpdateState.NO_CHANGES:
status.log_status(conn, start, 'import')
batchdate, _, _ = status.get_status(conn)
conn.commit()
if state is not replication.UpdateState.NO_CHANGES and args.do_index:
index_start = dt.datetime.now(dt.timezone.utc)
indexer.index_full(analyse=False)
with connect(dsn) as conn:
status.set_indexed(conn, True)
status.log_status(conn, index_start, 'index')
conn.commit()
else:
index_start = None
if state is replication.UpdateState.NO_CHANGES and \
args.catch_up or update_interval > 40*60:
while indexer.has_pending():
indexer.index_full(analyse=False)
if LOG.isEnabledFor(logging.WARNING):
assert batchdate is not None
self._report_update(batchdate, start, index_start)
if args.once or (args.catch_up and state is replication.UpdateState.NO_CHANGES):
break
if state is replication.UpdateState.NO_CHANGES:
LOG.warning("No new changes. Sleeping for %d sec.", recheck_interval)
time.sleep(recheck_interval)
def run(self, args: NominatimArgs) -> int:
socket.setdefaulttimeout(args.socket_timeout)
if args.init:
return self._init_replication(args)
if args.check_for_updates:
return self._check_for_updates(args)
self._update(args)
return 0

View File

@@ -0,0 +1,229 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Implementation of the 'import' subcommand.
"""
from typing import Optional
import argparse
import logging
from pathlib import Path
import psutil
from nominatim_core.errors import UsageError
from nominatim_core.config import Configuration
from nominatim_core.db.connection import connect
from nominatim_core.db import status, properties
from ..tokenizer.base import AbstractTokenizer
from ..version import NOMINATIM_VERSION
from .args import NominatimArgs
# Do not repeat documentation of subcommand classes.
# pylint: disable=C0111
# Using non-top-level imports to avoid eventually unused imports.
# pylint: disable=C0415
LOG = logging.getLogger()
class SetupAll:
"""\
Create a new Nominatim database from an OSM file.
This sub-command sets up a new Nominatim database from scratch starting
with creating a new database in Postgresql. The user running this command
needs superuser rights on the database.
"""
def add_args(self, parser: argparse.ArgumentParser) -> None:
group1 = parser.add_argument_group('Required arguments')
group1.add_argument('--osm-file', metavar='FILE', action='append',
help='OSM file to be imported'
' (repeat for importing multiple files)',
default=None)
group1.add_argument('--continue', dest='continue_at',
choices=['import-from-file', 'load-data', 'indexing', 'db-postprocess'],
help='Continue an import that was interrupted',
default=None)
group2 = parser.add_argument_group('Optional arguments')
group2.add_argument('--osm2pgsql-cache', metavar='SIZE', type=int,
help='Size of cache to be used by osm2pgsql (in MB)')
group2.add_argument('--reverse-only', action='store_true',
help='Do not create tables and indexes for searching')
group2.add_argument('--no-partitions', action='store_true',
help=("Do not partition search indices "
"(speeds up import of single country extracts)"))
group2.add_argument('--no-updates', action='store_true',
help="Do not keep tables that are only needed for "
"updating the database later")
group2.add_argument('--offline', action='store_true',
help="Do not attempt to load any additional data from the internet")
group3 = parser.add_argument_group('Expert options')
group3.add_argument('--ignore-errors', action='store_true',
help='Continue import even when errors in SQL are present')
group3.add_argument('--index-noanalyse', action='store_true',
help='Do not perform analyse operations during index (expert only)')
group3.add_argument('--prepare-database', action='store_true',
help='Create the database but do not import any data')
def run(self, args: NominatimArgs) -> int: # pylint: disable=too-many-statements, too-many-branches
from ..data import country_info
from ..tools import database_import, refresh, postcodes, freeze
from ..indexer.indexer import Indexer
num_threads = args.threads or psutil.cpu_count() or 1
country_info.setup_country_config(args.config)
if args.osm_file is None and args.continue_at is None and not args.prepare_database:
raise UsageError("No input files (use --osm-file).")
if args.osm_file is not None and args.continue_at not in ('import-from-file', None):
raise UsageError(f"Cannot use --continue {args.continue_at} and --osm-file together.")
if args.continue_at is not None and args.prepare_database:
raise UsageError(
"Cannot use --continue and --prepare-database together."
)
if args.prepare_database or args.continue_at is None:
LOG.warning('Creating database')
database_import.setup_database_skeleton(args.config.get_libpq_dsn(),
rouser=args.config.DATABASE_WEBUSER)
if args.prepare_database:
return 0
if args.continue_at in (None, 'import-from-file'):
files = args.get_osm_file_list()
if not files:
raise UsageError("No input files (use --osm-file).")
if args.continue_at in ('import-from-file', None):
# Check if the correct plugins are installed
database_import.check_existing_database_plugins(args.config.get_libpq_dsn())
LOG.warning('Setting up country tables')
country_info.setup_country_tables(args.config.get_libpq_dsn(),
args.config.lib_dir.data,
args.no_partitions)
LOG.warning('Importing OSM data file')
database_import.import_osm_data(files,
args.osm2pgsql_options(0, 1),
drop=args.no_updates,
ignore_errors=args.ignore_errors)
LOG.warning('Importing wikipedia importance data')
data_path = Path(args.config.WIKIPEDIA_DATA_PATH or args.project_dir)
if refresh.import_wikipedia_articles(args.config.get_libpq_dsn(),
data_path) > 0:
LOG.error('Wikipedia importance dump file not found. '
'Calculating importance values of locations will not '
'use Wikipedia importance data.')
LOG.warning('Importing secondary importance raster data')
if refresh.import_secondary_importance(args.config.get_libpq_dsn(),
args.project_dir) != 0:
LOG.error('Secondary importance file not imported. '
'Falling back to default ranking.')
self._setup_tables(args.config, args.reverse_only)
if args.continue_at in ('import-from-file', 'load-data', None):
LOG.warning('Initialise tables')
with connect(args.config.get_libpq_dsn()) as conn:
database_import.truncate_data_tables(conn)
LOG.warning('Load data into placex table')
database_import.load_data(args.config.get_libpq_dsn(), num_threads)
LOG.warning("Setting up tokenizer")
tokenizer = self._get_tokenizer(args.continue_at, args.config)
if args.continue_at in ('import-from-file', 'load-data', None):
LOG.warning('Calculate postcodes')
postcodes.update_postcodes(args.config.get_libpq_dsn(),
args.project_dir, tokenizer)
if args.continue_at in \
('import-from-file', 'load-data', 'indexing', None):
LOG.warning('Indexing places')
indexer = Indexer(args.config.get_libpq_dsn(), tokenizer, num_threads)
indexer.index_full(analyse=not args.index_noanalyse)
LOG.warning('Post-process tables')
with connect(args.config.get_libpq_dsn()) as conn:
database_import.create_search_indices(conn, args.config,
drop=args.no_updates,
threads=num_threads)
LOG.warning('Create search index for default country names.')
country_info.create_country_names(conn, tokenizer,
args.config.get_str_list('LANGUAGES'))
if args.no_updates:
freeze.drop_update_tables(conn)
tokenizer.finalize_import(args.config)
LOG.warning('Recompute word counts')
tokenizer.update_statistics(args.config, threads=num_threads)
webdir = args.project_dir / 'website'
LOG.warning('Setup website at %s', webdir)
with connect(args.config.get_libpq_dsn()) as conn:
refresh.setup_website(webdir, args.config, conn)
self._finalize_database(args.config.get_libpq_dsn(), args.offline)
return 0
def _setup_tables(self, config: Configuration, reverse_only: bool) -> None:
""" Set up the basic database layout: tables, indexes and functions.
"""
from ..tools import database_import, refresh
with connect(config.get_libpq_dsn()) as conn:
LOG.warning('Create functions (1st pass)')
refresh.create_functions(conn, config, False, False)
LOG.warning('Create tables')
database_import.create_tables(conn, config, reverse_only=reverse_only)
refresh.load_address_levels_from_config(conn, config)
LOG.warning('Create functions (2nd pass)')
refresh.create_functions(conn, config, False, False)
LOG.warning('Create table triggers')
database_import.create_table_triggers(conn, config)
LOG.warning('Create partition tables')
database_import.create_partition_tables(conn, config)
LOG.warning('Create functions (3rd pass)')
refresh.create_functions(conn, config, False, False)
def _get_tokenizer(self, continue_at: Optional[str],
config: Configuration) -> AbstractTokenizer:
""" Set up a new tokenizer or load an already initialised one.
"""
from ..tokenizer import factory as tokenizer_factory
if continue_at in ('import-from-file', 'load-data', None):
# (re)initialise the tokenizer data
return tokenizer_factory.create_tokenizer(config)
# just load the tokenizer
return tokenizer_factory.get_tokenizer_for_db(config)
def _finalize_database(self, dsn: str, offline: bool) -> None:
""" Determine the database date and set the status accordingly.
"""
with connect(dsn) as conn:
properties.set_property(conn, 'database_version', str(NOMINATIM_VERSION))
try:
dbdate = status.compute_database_date(conn, offline)
status.set_status(conn, dbdate)
LOG.info('Database is at %s.', dbdate)
except Exception as exc: # pylint: disable=broad-except
LOG.error('Cannot determine date of database: %s', exc)

View File

@@ -0,0 +1,93 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Implementation of the 'special-phrases' command.
"""
import argparse
import logging
from pathlib import Path
from nominatim_core.errors import UsageError
from nominatim_core.db.connection import connect
from ..tools.special_phrases.sp_importer import SPImporter, SpecialPhraseLoader
from ..tools.special_phrases.sp_wiki_loader import SPWikiLoader
from ..tools.special_phrases.sp_csv_loader import SPCsvLoader
from .args import NominatimArgs
LOG = logging.getLogger()
# Do not repeat documentation of subcommand classes.
# pylint: disable=C0111
# Using non-top-level imports to avoid eventually unused imports.
# pylint: disable=E0012,C0415
class ImportSpecialPhrases:
"""\
Import special phrases.
Special phrases are search terms that narrow down the type of object
that should be searched. For example, you might want to search for
'Hotels in Barcelona'. The OSM wiki has a selection of special phrases
in many languages, which can be imported with this command.
You can also provide your own phrases in a CSV file. The file needs to have
the following five columns:
* phrase - the term expected for searching
* class - the OSM tag key of the object type
* type - the OSM tag value of the object type
* operator - the kind of search to be done (one of: in, near, name, -)
* plural - whether the term is a plural or not (Y/N)
An example file can be found in the Nominatim sources at
'test/testdb/full_en_phrases_test.csv'.
The import can be further configured to ignore specific key/value pairs.
This is particularly useful when importing phrases from the wiki. The
default configuration excludes some very common tags like building=yes.
The configuration can be customized by putting a file `phrase-settings.json`
with custom rules into the project directory or by using the `--config`
option to point to another configuration file.
"""
def add_args(self, parser: argparse.ArgumentParser) -> None:
group = parser.add_argument_group('Input arguments')
group.add_argument('--import-from-wiki', action='store_true',
help='Import special phrases from the OSM wiki to the database')
group.add_argument('--import-from-csv', metavar='FILE',
help='Import special phrases from a CSV file')
group.add_argument('--no-replace', action='store_true',
help='Keep the old phrases and only add the new ones')
def run(self, args: NominatimArgs) -> int:
if args.import_from_wiki:
self.start_import(args, SPWikiLoader(args.config))
if args.import_from_csv:
if not Path(args.import_from_csv).is_file():
LOG.fatal("CSV file '%s' does not exist.", args.import_from_csv)
raise UsageError('Cannot access file.')
self.start_import(args, SPCsvLoader(args.import_from_csv))
return 0
def start_import(self, args: NominatimArgs, loader: SpecialPhraseLoader) -> None:
"""
Create the SPImporter object containing the right
sp loader and then start the import of special phrases.
"""
from ..tokenizer import factory as tokenizer_factory
tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)
should_replace = not args.no_replace
with connect(args.config.get_libpq_dsn()) as db_connection:
SPImporter(
args.config, db_connection, loader
).import_phrases(tokenizer, should_replace)
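
To illustrate the five-column layout described in the class docstring, a hypothetical phrase file could be written like this (the phrase values and OSM tags are made up for the example; the header row simply names the five columns described above):

import csv

with open('my_phrases.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, fieldnames=['phrase', 'class', 'type', 'operator', 'plural'])
    writer.writeheader()
    writer.writerow({'phrase': 'Hotels', 'class': 'tourism', 'type': 'hotel',
                     'operator': 'in', 'plural': 'Y'})
    writer.writerow({'phrase': 'Bar', 'class': 'amenity', 'type': 'bar',
                     'operator': '-', 'plural': 'N'})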

View File

View File

@@ -0,0 +1,175 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Functions for importing and managing static country information.
"""
from typing import Dict, Any, Iterable, Tuple, Optional, Container, overload
from pathlib import Path
import psycopg2.extras
from nominatim_core.db import utils as db_utils
from nominatim_core.db.connection import connect, Connection
from nominatim_core.errors import UsageError
from nominatim_core.config import Configuration
from ..tokenizer.base import AbstractTokenizer
def _flatten_name_list(names: Any) -> Dict[str, str]:
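    # Flattens nested name lists, e.g. (illustrative)
    # {'name': {'default': 'Deutschland', 'en': 'Germany'}}
    # becomes {'name': 'Deutschland', 'name:en': 'Germany'}.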
if names is None:
return {}
if not isinstance(names, dict):
raise UsageError("Expected key-value list for names in country_settings.py")
flat = {}
for prefix, remain in names.items():
if isinstance(remain, str):
flat[prefix] = remain
elif not isinstance(remain, dict):
raise UsageError("Entries in names must be key-value lists.")
else:
for suffix, name in remain.items():
if suffix == 'default':
flat[prefix] = name
else:
flat[f'{prefix}:{suffix}'] = name
return flat
class _CountryInfo:
""" Caches country-specific properties from the configuration file.
"""
def __init__(self) -> None:
self._info: Dict[str, Dict[str, Any]] = {}
def load(self, config: Configuration) -> None:
""" Load the country properties from the configuration files,
if they are not loaded yet.
"""
if not self._info:
self._info = config.load_sub_configuration('country_settings.yaml')
for prop in self._info.values():
# Convert languages into a list for simpler handling.
if 'languages' not in prop:
prop['languages'] = []
elif not isinstance(prop['languages'], list):
prop['languages'] = [x.strip()
for x in prop['languages'].split(',')]
prop['names'] = _flatten_name_list(prop.get('names'))
def items(self) -> Iterable[Tuple[str, Dict[str, Any]]]:
""" Return tuples of (country_code, property dict) as iterable.
"""
return self._info.items()
def get(self, country_code: str) -> Dict[str, Any]:
""" Get country information for the country with the given country code.
"""
return self._info.get(country_code, {})
_COUNTRY_INFO = _CountryInfo()
def setup_country_config(config: Configuration) -> None:
""" Load country properties from the configuration file.
Needs to be called before using any other functions in this
file.
"""
_COUNTRY_INFO.load(config)
@overload
def iterate() -> Iterable[Tuple[str, Dict[str, Any]]]:
...
@overload
def iterate(prop: str) -> Iterable[Tuple[str, Any]]:
...
def iterate(prop: Optional[str] = None) -> Iterable[Tuple[str, Dict[str, Any]]]:
""" Iterate over country code and properties.
When `prop` is None, all countries are returned with their complete
set of properties.
If `prop` is given, then only countries are returned where the
given property is set. The second item of the tuple contains only
the content of the given property.
"""
if prop is None:
return _COUNTRY_INFO.items()
return ((c, p[prop]) for c, p in _COUNTRY_INFO.items() if prop in p)
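# Usage sketch (illustrative only): after setup_country_config(config) has been
# called, iterate() yields every country with its full property dict, while
# iterate('postcode') yields only countries that define a 'postcode' property,
# paired with just that property:
#   for ccode, postcode_cfg in iterate('postcode'):
#       ...  # postcode_cfg is the value of the 'postcode' entry for ccode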
def setup_country_tables(dsn: str, sql_dir: Path, ignore_partitions: bool = False) -> None:
""" Create and populate the tables with basic static data that provides
the background for geocoding. The data is assumed not to exist yet.
"""
db_utils.execute_file(dsn, sql_dir / 'country_osm_grid.sql.gz')
params = []
for ccode, props in _COUNTRY_INFO.items():
if ccode is not None and props is not None:
if ignore_partitions:
partition = 0
else:
partition = props.get('partition', 0)
lang = props['languages'][0] if len(
props['languages']) == 1 else None
params.append((ccode, props['names'], lang, partition))
with connect(dsn) as conn:
with conn.cursor() as cur:
psycopg2.extras.register_hstore(cur)
cur.execute(
""" CREATE TABLE public.country_name (
country_code character varying(2),
name public.hstore,
derived_name public.hstore,
country_default_language_code text,
partition integer
); """)
cur.execute_values(
""" INSERT INTO public.country_name
(country_code, name, country_default_language_code, partition) VALUES %s
""", params)
conn.commit()
def create_country_names(conn: Connection, tokenizer: AbstractTokenizer,
languages: Optional[Container[str]] = None) -> None:
""" Add default country names to search index. `languages` is a comma-
separated list of language codes as used in OSM. If `languages` is not
empty then only name translations for the given languages are added
to the index.
"""
def _include_key(key: str) -> bool:
return ':' not in key or not languages or \
key[key.index(':') + 1:] in languages
with conn.cursor() as cur:
psycopg2.extras.register_hstore(cur)
cur.execute("""SELECT country_code, name FROM country_name
WHERE country_code is not null""")
with tokenizer.name_analyzer() as analyzer:
for code, name in cur:
names = {'countrycode': code}
# country names (only in languages as provided)
if name:
names.update({k : v for k, v in name.items() if _include_key(k)})
analyzer.add_country_names(code, names)
conn.commit()

View File

@@ -0,0 +1,86 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Wrapper around place information the indexer gets from the database and hands to
the tokenizer.
"""
from typing import Optional, Mapping, Any, Tuple
class PlaceInfo:
""" This data class contains all information the tokenizer can access
about a place.
"""
def __init__(self, info: Mapping[str, Any]) -> None:
self._info = info
@property
def name(self) -> Optional[Mapping[str, str]]:
""" A dictionary with the names of the place. Keys and values represent
the full key and value of the corresponding OSM tag. Which tags
are saved as names is determined by the import style.
The property may be None if the place has no names.
"""
return self._info.get('name')
@property
def address(self) -> Optional[Mapping[str, str]]:
""" A dictionary with the address elements of the place. They key
usually corresponds to the suffix part of the key of an OSM
'addr:*' or 'isin:*' tag. There are also some special keys like
`country` or `country_code` which merge OSM keys that contain
the same information. See [Import Styles][1] for details.
The property may be None if the place has no address information.
[1]: ../customize/Import-Styles.md
"""
return self._info.get('address')
@property
def country_code(self) -> Optional[str]:
""" The country code of the country the place is in. Guaranteed
to be a two-letter lower-case string. If the place is not inside
any country, the property is set to None.
"""
return self._info.get('country_code')
@property
def rank_address(self) -> int:
""" The [rank address][1] before any rank correction is applied.
[1]: ../customize/Ranking.md#address-rank
"""
return self._info.get('rank_address', 0)
@property
def centroid(self) -> Optional[Tuple[float, float]]:
""" A center point of the place in WGS84. May be None when the
geometry of the place is unknown.
"""
x, y = self._info.get('centroid_x'), self._info.get('centroid_y')
return None if x is None or y is None else (x, y)
def is_a(self, key: str, value: str) -> bool:
""" Set to True when the place's primary tag corresponds to the given
key and value.
"""
return self._info.get('class') == key and self._info.get('type') == value
def is_country(self) -> bool:
""" Set to True when the place is a valid country boundary.
"""
return self.rank_address == 4 \
and self.is_a('boundary', 'administrative') \
and self.country_code is not None
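# Illustrative example (not part of the module): a PlaceInfo wraps the raw
# dictionary handed over by the indexer. With a hypothetical input like
#   info = PlaceInfo({'class': 'boundary', 'type': 'administrative',
#                     'rank_address': 4, 'country_code': 'de',
#                     'name': {'name': 'Deutschland'}})
# info.is_a('boundary', 'administrative') and info.is_country() are both True,
# while info.address and info.centroid are None because the corresponding
# fields are missing.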

View File

@@ -0,0 +1,78 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Data class for a single name of a place.
"""
from typing import Optional, Dict, Mapping
class PlaceName:
""" Each name and address part of a place is encapsulated in an object of
this class. It saves not only the name proper but also describes the
kind of name with two properties:
* `kind` describes the name of the OSM key used without any suffixes
(i.e. the part after the colon removed)
* `suffix` contains the suffix of the OSM tag, if any. The suffix
is the part of the key after the first colon.
In addition to that, a name may have arbitrary additional attributes.
How attributes are used depends on the sanitizers and token analysers.
The exception is the 'analyzer' attribute. This attribute determines
which token analysis module will be used to finalize the treatment of
names.
"""
def __init__(self, name: str, kind: str, suffix: Optional[str]):
self.name = name
self.kind = kind
self.suffix = suffix
self.attr: Dict[str, str] = {}
def __repr__(self) -> str:
return f"PlaceName(name={self.name!r},kind={self.kind!r},suffix={self.suffix!r})"
def clone(self, name: Optional[str] = None,
kind: Optional[str] = None,
suffix: Optional[str] = None,
attr: Optional[Mapping[str, str]] = None) -> 'PlaceName':
""" Create a deep copy of the place name, optionally with the
given parameters replaced. In the attribute list only the given
keys are updated. The list is not replaced completely.
In particular, the function cannot be used to remove an
attribute from a place name.
"""
newobj = PlaceName(name or self.name,
kind or self.kind,
suffix or self.suffix)
newobj.attr.update(self.attr)
if attr:
newobj.attr.update(attr)
return newobj
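# Illustrative example (not part of the module): clone() copies the existing
# attribute dict and only overlays the given values. For a hypothetical
#   pn = PlaceName('Main Street', 'name', None)
#   pn.set_attr('analyzer', '@street')   # '@street' is an assumed analyzer id
#   short = pn.clone(name='Main St')
# 'short' keeps kind='name' and the 'analyzer' attribute but carries the new name.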
def set_attr(self, key: str, value: str) -> None:
""" Add the given property to the name. If the property was already
set, then the value is overwritten.
"""
self.attr[key] = value
def get_attr(self, key: str, default: Optional[str] = None) -> Optional[str]:
""" Return the given property or the value of 'default' if it
is not set.
"""
return self.attr.get(key, default)
def has_attr(self, key: str) -> bool:
""" Check if the given attribute is set.
"""
return key in self.attr

View File

@@ -0,0 +1,114 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Functions for formatting postcodes according to their country-specific
format.
"""
from typing import Any, Mapping, Optional, Set, Match
import re
from nominatim_core.errors import UsageError
from . import country_info
class CountryPostcodeMatcher:
""" Matches and formats a postcode according to a format definition
of the given country.
"""
def __init__(self, country_code: str, config: Mapping[str, Any]) -> None:
if 'pattern' not in config:
raise UsageError("Field 'pattern' required for 'postcode' "
f"for country '{country_code}'")
pc_pattern = config['pattern'].replace('d', '[0-9]').replace('l', '[A-Z]')
self.norm_pattern = re.compile(f'\\s*(?:{country_code.upper()}[ -]?)?({pc_pattern})\\s*')
self.pattern = re.compile(pc_pattern)
self.output = config.get('output', r'\g<0>')
def match(self, postcode: str) -> Optional[Match[str]]:
""" Match the given postcode against the postcode pattern for this
matcher. Returns a `re.Match` object if the match was successful
and None otherwise.
"""
# Upper-case, strip spaces and leading country code.
normalized = self.norm_pattern.fullmatch(postcode.upper())
if normalized:
return self.pattern.fullmatch(normalized.group(1))
return None
def normalize(self, match: Match[str]) -> str:
""" Return the default format of the postcode for the given match.
`match` must be a `re.Match` object previously returned by
`match()`
"""
return match.expand(self.output)
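# Worked example (illustrative, with an assumed configuration): a configured
# pattern of 'ddddd' becomes the regex '[0-9][0-9][0-9][0-9][0-9]'. For a
# matcher built for country 'de', match('de-12345') first upper-cases the
# input, strips the optional leading country code and then matches '12345';
# normalize() on that match returns '12345' because the default output
# template is the whole matched group.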
class PostcodeFormatter:
""" Container for different postcode formats of the world and
access functions.
"""
def __init__(self) -> None:
# Objects without a country code can't have a postcode by definition.
self.country_without_postcode: Set[Optional[str]] = {None}
self.country_matcher = {}
self.default_matcher = CountryPostcodeMatcher('', {'pattern': '.*'})
for ccode, prop in country_info.iterate('postcode'):
if prop is False:
self.country_without_postcode.add(ccode)
elif isinstance(prop, dict):
self.country_matcher[ccode] = CountryPostcodeMatcher(ccode, prop)
else:
raise UsageError(f"Invalid entry 'postcode' for country '{ccode}'")
def set_default_pattern(self, pattern: str) -> None:
""" Set the postcode match pattern to use, when a country does not
have a specific pattern.
"""
self.default_matcher = CountryPostcodeMatcher('', {'pattern': pattern})
def get_matcher(self, country_code: Optional[str]) -> Optional[CountryPostcodeMatcher]:
""" Return the CountryPostcodeMatcher for the given country.
Returns None if the country doesn't have a postcode and the
default matcher if there is no specific matcher configured for
the country.
"""
if country_code in self.country_without_postcode:
return None
assert country_code is not None
return self.country_matcher.get(country_code, self.default_matcher)
def match(self, country_code: Optional[str], postcode: str) -> Optional[Match[str]]:
""" Match the given postcode against the postcode pattern for this
matcher. Returns a `re.Match` object if the country has a pattern
and the match was successful or None if the match failed.
"""
if country_code in self.country_without_postcode:
return None
assert country_code is not None
return self.country_matcher.get(country_code, self.default_matcher).match(postcode)
def normalize(self, country_code: str, match: Match[str]) -> str:
""" Return the default format of the postcode for the given match.
`match` must be a `re.Match` object previously returned by
`match()`
"""
return self.country_matcher.get(country_code, self.default_matcher).normalize(match)
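# Usage sketch (illustrative only): the formatter dispatches to the matcher of
# the given country and falls back to the default matcher when no specific
# configuration exists:
#   formatter = PostcodeFormatter()
#   m = formatter.match('de', ' 12345 ')
#   if m is not None:
#       canonical = formatter.normalize('de', m)
# Countries whose configuration disables postcodes (postcode set to false) and
# objects without a country code always yield None from match().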


View File

@@ -0,0 +1,242 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Main workhorse for indexing (computing addresses in) the database.
"""
from typing import Optional, Any, cast
import logging
import time
import psycopg2.extras
from nominatim_core.typing import DictCursorResults
from nominatim_core.db.async_connection import DBConnection, WorkerPool
from nominatim_core.db.connection import connect, Connection, Cursor
from ..tokenizer.base import AbstractTokenizer
from .progress import ProgressLogger
from . import runners
LOG = logging.getLogger()
class PlaceFetcher:
""" Asynchronous connection that fetches place details for processing.
"""
def __init__(self, dsn: str, setup_conn: Connection) -> None:
self.wait_time = 0.0
self.current_ids: Optional[DictCursorResults] = None
self.conn: Optional[DBConnection] = DBConnection(dsn,
cursor_factory=psycopg2.extras.DictCursor)
with setup_conn.cursor() as cur:
# need to fetch those manually because register_hstore cannot
# fetch them on an asynchronous connection below.
hstore_oid = cur.scalar("SELECT 'hstore'::regtype::oid")
hstore_array_oid = cur.scalar("SELECT 'hstore[]'::regtype::oid")
psycopg2.extras.register_hstore(self.conn.conn, oid=hstore_oid,
array_oid=hstore_array_oid)
def close(self) -> None:
""" Close the underlying asynchronous connection.
"""
if self.conn:
self.conn.close()
self.conn = None
def fetch_next_batch(self, cur: Cursor, runner: runners.Runner) -> bool:
""" Send a request for the next batch of places.
If details for the places are required, they will be fetched
asynchronously.
Returns true if there is still data available.
"""
ids = cast(Optional[DictCursorResults], cur.fetchmany(100))
if not ids:
self.current_ids = None
return False
assert self.conn is not None
self.current_ids = runner.get_place_details(self.conn, ids)
return True
def get_batch(self) -> DictCursorResults:
""" Get the next batch of data, previously requested with
`fetch_next_batch`.
"""
assert self.conn is not None
assert self.conn.cursor is not None
if self.current_ids is not None and not self.current_ids:
tstart = time.time()
self.conn.wait()
self.wait_time += time.time() - tstart
self.current_ids = cast(Optional[DictCursorResults],
self.conn.cursor.fetchall())
return self.current_ids if self.current_ids is not None else []
def __enter__(self) -> 'PlaceFetcher':
return self
def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None:
assert self.conn is not None
self.conn.wait()
self.close()
class Indexer:
""" Main indexing routine.
"""
def __init__(self, dsn: str, tokenizer: AbstractTokenizer, num_threads: int):
self.dsn = dsn
self.tokenizer = tokenizer
self.num_threads = num_threads
def has_pending(self) -> bool:
""" Check if any data still needs indexing.
This function must only be used after the import has finished.
Otherwise it will be very expensive.
"""
with connect(self.dsn) as conn:
with conn.cursor() as cur:
cur.execute("SELECT 'a' FROM placex WHERE indexed_status > 0 LIMIT 1")
return cur.rowcount > 0
def index_full(self, analyse: bool = True) -> None:
""" Index the complete database. This will first index boundaries
followed by all other objects. When `analyse` is True, then the
database will be analysed at the appropriate places to
ensure that database statistics are updated.
"""
with connect(self.dsn) as conn:
conn.autocommit = True
def _analyze() -> None:
if analyse:
with conn.cursor() as cur:
cur.execute('ANALYZE')
if self.index_by_rank(0, 4) > 0:
_analyze()
if self.index_boundaries(0, 30) > 100:
_analyze()
if self.index_by_rank(5, 25) > 100:
_analyze()
if self.index_by_rank(26, 30) > 1000:
_analyze()
if self.index_postcodes() > 100:
_analyze()
def index_boundaries(self, minrank: int, maxrank: int) -> int:
""" Index only administrative boundaries within the given rank range.
"""
total = 0
LOG.warning("Starting indexing boundaries using %s threads",
self.num_threads)
with self.tokenizer.name_analyzer() as analyzer:
for rank in range(max(minrank, 4), min(maxrank, 26)):
total += self._index(runners.BoundaryRunner(rank, analyzer))
return total
def index_by_rank(self, minrank: int, maxrank: int) -> int:
""" Index all entries of placex in the given rank range (inclusive)
in order of their address rank.
When rank 30 is requested then also interpolations and
places with address rank 0 will be indexed.
"""
total = 0
maxrank = min(maxrank, 30)
LOG.warning("Starting indexing rank (%i to %i) using %i threads",
minrank, maxrank, self.num_threads)
with self.tokenizer.name_analyzer() as analyzer:
for rank in range(max(1, minrank), maxrank + 1):
total += self._index(runners.RankRunner(rank, analyzer), 20 if rank == 30 else 1)
if maxrank == 30:
total += self._index(runners.RankRunner(0, analyzer))
total += self._index(runners.InterpolationRunner(analyzer), 20)
return total
def index_postcodes(self) -> int:
"""Index the entries of the location_postcode table.
"""
LOG.warning("Starting indexing postcodes using %s threads", self.num_threads)
return self._index(runners.PostcodeRunner(), 20)
def update_status_table(self) -> None:
""" Update the status in the status table to 'indexed'.
"""
with connect(self.dsn) as conn:
with conn.cursor() as cur:
cur.execute('UPDATE import_status SET indexed = true')
conn.commit()
def _index(self, runner: runners.Runner, batch: int = 1) -> int:
""" Index a single rank or table. `runner` describes the SQL to use
for indexing. `batch` describes the number of objects that
should be processed with a single SQL statement
"""
LOG.warning("Starting %s (using batch size %s)", runner.name(), batch)
with connect(self.dsn) as conn:
psycopg2.extras.register_hstore(conn)
with conn.cursor() as cur:
total_tuples = cur.scalar(runner.sql_count_objects())
LOG.debug("Total number of rows: %i", total_tuples)
conn.commit()
progress = ProgressLogger(runner.name(), total_tuples)
if total_tuples > 0:
with conn.cursor(name='places') as cur:
cur.execute(runner.sql_get_objects())
with PlaceFetcher(self.dsn, conn) as fetcher:
with WorkerPool(self.dsn, self.num_threads) as pool:
has_more = fetcher.fetch_next_batch(cur, runner)
while has_more:
places = fetcher.get_batch()
# asynchronously get the next batch
has_more = fetcher.fetch_next_batch(cur, runner)
# And insert the current batch
for idx in range(0, len(places), batch):
part = places[idx:idx + batch]
LOG.debug("Processing places: %s", str(part))
runner.index_places(pool.next_free_worker(), part)
progress.add(len(part))
LOG.info("Wait time: fetcher: %.2fs, pool: %.2fs",
fetcher.wait_time, pool.wait_time)
conn.commit()
return progress.done()
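# Usage sketch (illustrative only): a typical indexing run creates one Indexer
# per database and lets it work through all pending objects, e.g.
#   indexer = Indexer(dsn, tokenizer, num_threads=4)
#   indexer.index_full(analyse=True)
# _index() overlaps work: while the worker pool processes the current batch of
# places, the PlaceFetcher already requests the details for the next batch on
# its asynchronous connection.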

View File

@@ -0,0 +1,74 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Helpers for progress logging.
"""
import logging
from datetime import datetime
LOG = logging.getLogger()
INITIAL_PROGRESS = 10
class ProgressLogger:
""" Tracks and prints progress for the indexing process.
`name` is the name of the indexing step being tracked.
`total` is the total number of items that need processing.
`log_interval` denotes the interval in seconds at which progress
should be reported.
"""
def __init__(self, name: str, total: int, log_interval: int = 1) -> None:
self.name = name
self.total_places = total
self.done_places = 0
self.rank_start_time = datetime.now()
self.log_interval = log_interval
self.next_info = INITIAL_PROGRESS if LOG.isEnabledFor(logging.WARNING) else total + 1
def add(self, num: int = 1) -> None:
""" Mark `num` places as processed. Print a log message if the
logging is at least info and the log interval has passed.
"""
self.done_places += num
if self.done_places < self.next_info:
return
now = datetime.now()
done_time = (now - self.rank_start_time).total_seconds()
if done_time < 2:
self.next_info = self.done_places + INITIAL_PROGRESS
return
places_per_sec = self.done_places / done_time
eta = (self.total_places - self.done_places) / places_per_sec
LOG.warning("Done %d in %d @ %.3f per second - %s ETA (seconds): %.2f",
self.done_places, int(done_time),
places_per_sec, self.name, eta)
self.next_info += int(places_per_sec) * self.log_interval
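# Worked example (illustrative only): with log_interval=1, if 10000 places were
# processed in 20 seconds, places_per_sec is 500 and, for a total of 100000
# places, the reported ETA is (100000 - 10000) / 500 = 180 seconds. The next
# log message is then scheduled 500 places later
# (int(places_per_sec) * log_interval).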
def done(self) -> int:
""" Print final statistics about the progress.
"""
rank_end_time = datetime.now()
if rank_end_time == self.rank_start_time:
diff_seconds = 0.0
places_per_sec = float(self.done_places)
else:
diff_seconds = (rank_end_time - self.rank_start_time).total_seconds()
places_per_sec = self.done_places / diff_seconds
LOG.warning("Done %d/%d in %d @ %.3f per second - FINISHED %s\n",
self.done_places, self.total_places, int(diff_seconds),
places_per_sec, self.name)
return self.done_places

View File

@@ -0,0 +1,196 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Mix-ins that provide the actual commands for the indexer for various indexing
tasks.
"""
from typing import Any, List
import functools
from psycopg2 import sql as pysql
import psycopg2.extras
from nominatim_core.typing import Query, DictCursorResult, DictCursorResults, Protocol
from nominatim_core.db.async_connection import DBConnection
from ..data.place_info import PlaceInfo
from ..tokenizer.base import AbstractAnalyzer
# pylint: disable=C0111
def _mk_valuelist(template: str, num: int) -> pysql.Composed:
return pysql.SQL(',').join([pysql.SQL(template)] * num)
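# Illustrative example (not part of the module): _mk_valuelist() simply repeats
# the placeholder template, so _mk_valuelist(AbstractPlacexRunner.UPDATE_LINE, 2)
# renders as
#   (%s, %s::hstore, %s::hstore, %s::int, %s::jsonb),(%s, %s::hstore, %s::hstore, %s::int, %s::jsonb)
# which _index_sql() then embeds into the UPDATE ... FROM (VALUES ...) statement.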
def _analyze_place(place: DictCursorResult, analyzer: AbstractAnalyzer) -> psycopg2.extras.Json:
return psycopg2.extras.Json(analyzer.process_place(PlaceInfo(place)))
class Runner(Protocol):
def name(self) -> str: ...
def sql_count_objects(self) -> Query: ...
def sql_get_objects(self) -> Query: ...
def get_place_details(self, worker: DBConnection,
ids: DictCursorResults) -> DictCursorResults: ...
def index_places(self, worker: DBConnection, places: DictCursorResults) -> None: ...
class AbstractPlacexRunner:
""" Returns SQL commands for indexing of the placex table.
"""
SELECT_SQL = pysql.SQL('SELECT place_id FROM placex ')
UPDATE_LINE = "(%s, %s::hstore, %s::hstore, %s::int, %s::jsonb)"
def __init__(self, rank: int, analyzer: AbstractAnalyzer) -> None:
self.rank = rank
self.analyzer = analyzer
@functools.lru_cache(maxsize=1)
def _index_sql(self, num_places: int) -> pysql.Composed:
return pysql.SQL(
""" UPDATE placex
SET indexed_status = 0, address = v.addr, token_info = v.ti,
name = v.name, linked_place_id = v.linked_place_id
FROM (VALUES {}) as v(id, name, addr, linked_place_id, ti)
WHERE place_id = v.id
""").format(_mk_valuelist(AbstractPlacexRunner.UPDATE_LINE, num_places))
def get_place_details(self, worker: DBConnection, ids: DictCursorResults) -> DictCursorResults:
worker.perform("""SELECT place_id, extra.*
FROM placex, LATERAL placex_indexing_prepare(placex) as extra
WHERE place_id IN %s""",
(tuple((p[0] for p in ids)), ))
return []
def index_places(self, worker: DBConnection, places: DictCursorResults) -> None:
values: List[Any] = []
for place in places:
for field in ('place_id', 'name', 'address', 'linked_place_id'):
values.append(place[field])
values.append(_analyze_place(place, self.analyzer))
worker.perform(self._index_sql(len(places)), values)
class RankRunner(AbstractPlacexRunner):
""" Returns SQL commands for indexing one rank within the placex table.
"""
def name(self) -> str:
return f"rank {self.rank}"
def sql_count_objects(self) -> pysql.Composed:
return pysql.SQL("""SELECT count(*) FROM placex
WHERE rank_address = {} and indexed_status > 0
""").format(pysql.Literal(self.rank))
def sql_get_objects(self) -> pysql.Composed:
return self.SELECT_SQL + pysql.SQL(
"""WHERE indexed_status > 0 and rank_address = {}
ORDER BY geometry_sector
""").format(pysql.Literal(self.rank))
class BoundaryRunner(AbstractPlacexRunner):
""" Returns SQL commands for indexing the administrative boundaries
of a certain rank.
"""
def name(self) -> str:
return f"boundaries rank {self.rank}"
def sql_count_objects(self) -> pysql.Composed:
return pysql.SQL("""SELECT count(*) FROM placex
WHERE indexed_status > 0
AND rank_search = {}
AND class = 'boundary' and type = 'administrative'
""").format(pysql.Literal(self.rank))
def sql_get_objects(self) -> pysql.Composed:
return self.SELECT_SQL + pysql.SQL(
"""WHERE indexed_status > 0 and rank_search = {}
and class = 'boundary' and type = 'administrative'
ORDER BY partition, admin_level
""").format(pysql.Literal(self.rank))
class InterpolationRunner:
""" Returns SQL commands for indexing the address interpolation table
location_property_osmline.
"""
def __init__(self, analyzer: AbstractAnalyzer) -> None:
self.analyzer = analyzer
def name(self) -> str:
return "interpolation lines (location_property_osmline)"
def sql_count_objects(self) -> str:
return """SELECT count(*) FROM location_property_osmline
WHERE indexed_status > 0"""
def sql_get_objects(self) -> str:
return """SELECT place_id
FROM location_property_osmline
WHERE indexed_status > 0
ORDER BY geometry_sector"""
def get_place_details(self, worker: DBConnection, ids: DictCursorResults) -> DictCursorResults:
worker.perform("""SELECT place_id, get_interpolation_address(address, osm_id) as address
FROM location_property_osmline WHERE place_id IN %s""",
(tuple((p[0] for p in ids)), ))
return []
@functools.lru_cache(maxsize=1)
def _index_sql(self, num_places: int) -> pysql.Composed:
return pysql.SQL("""UPDATE location_property_osmline
SET indexed_status = 0, address = v.addr, token_info = v.ti
FROM (VALUES {}) as v(id, addr, ti)
WHERE place_id = v.id
""").format(_mk_valuelist("(%s, %s::hstore, %s::jsonb)", num_places))
def index_places(self, worker: DBConnection, places: DictCursorResults) -> None:
values: List[Any] = []
for place in places:
values.extend((place[x] for x in ('place_id', 'address')))
values.append(_analyze_place(place, self.analyzer))
worker.perform(self._index_sql(len(places)), values)
class PostcodeRunner(Runner):
""" Provides the SQL commands for indexing the location_postcode table.
"""
def name(self) -> str:
return "postcodes (location_postcode)"
def sql_count_objects(self) -> str:
return 'SELECT count(*) FROM location_postcode WHERE indexed_status > 0'
def sql_get_objects(self) -> str:
return """SELECT place_id FROM location_postcode
WHERE indexed_status > 0
ORDER BY country_code, postcode"""
def get_place_details(self, worker: DBConnection, ids: DictCursorResults) -> DictCursorResults:
return ids
def index_places(self, worker: DBConnection, places: DictCursorResults) -> None:
worker.perform(pysql.SQL("""UPDATE location_postcode SET indexed_status = 0
WHERE place_id IN ({})""")
.format(pysql.SQL(',').join((pysql.Literal(i[0]) for i in places))))


View File

@@ -0,0 +1,253 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Abstract class definitions for tokenizers. These base classes are here
mainly for documentation purposes.
"""
from abc import ABC, abstractmethod
from typing import List, Tuple, Dict, Any, Optional, Iterable
from pathlib import Path
from nominatim_core.typing import Protocol
from nominatim_core.config import Configuration
from nominatim_core.db.connection import Connection
from ..data.place_info import PlaceInfo
class AbstractAnalyzer(ABC):
""" The analyzer provides the functions for analysing names and building
the token database.
Analyzers are instantiated on a per-thread base. Access to global data
structures must be synchronised accordingly.
"""
def __enter__(self) -> 'AbstractAnalyzer':
return self
def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None:
self.close()
@abstractmethod
def close(self) -> None:
""" Free all resources used by the analyzer.
"""
@abstractmethod
def get_word_token_info(self, words: List[str]) -> List[Tuple[str, str, int]]:
""" Return token information for the given list of words.
The function is used for testing and debugging only
and does not need to be particularly efficient.
Arguments:
words: A list of words to look up the tokens for.
If a word starts with '#' it is assumed to be a full name,
otherwise a partial term.
Returns:
The function returns the list of all tuples that could be
found for the given words. Each list entry is a tuple of
(original word, word token, word id).
"""
@abstractmethod
def normalize_postcode(self, postcode: str) -> str:
""" Convert the postcode to its standardized form.
This function must yield exactly the same result as the SQL function
`token_normalized_postcode()`.
Arguments:
postcode: The postcode to be normalized.
Returns:
The given postcode after normalization.
"""
@abstractmethod
def update_postcodes_from_db(self) -> None:
""" Update the tokenizer's postcode tokens from the current content
of the `location_postcode` table.
"""
@abstractmethod
def update_special_phrases(self,
phrases: Iterable[Tuple[str, str, str, str]],
should_replace: bool) -> None:
""" Update the tokenizer's special phrase tokens from the given
list of special phrases.
Arguments:
phrases: The new list of special phrases. Each entry is
a tuple of (phrase, class, type, operator).
should_replace: If true, replace the current list of phrases.
When false, just add the given phrases to the
ones that already exist.
"""
@abstractmethod
def add_country_names(self, country_code: str, names: Dict[str, str]) -> None:
""" Add the given names to the tokenizer's list of country tokens.
Arguments:
country_code: two-letter country code for the country the names
refer to.
names: Dictionary of name type to name.
"""
@abstractmethod
def process_place(self, place: PlaceInfo) -> Any:
""" Extract tokens for the given place and compute the
information to be handed to the PL/pgSQL processor for building
the search index.
Arguments:
place: Place information retrieved from the database.
Returns:
A JSON-serialisable structure that will be handed into
the database via the `token_info` field.
"""
class AbstractTokenizer(ABC):
""" The tokenizer instance is the central instance of the tokenizer in
the system. There will only be a single instance of the tokenizer
active at any time.
"""
@abstractmethod
def init_new_db(self, config: Configuration, init_db: bool = True) -> None:
""" Set up a new tokenizer for the database.
The function should copy all necessary data into the project
directory or save it in the property table to make sure that
the tokenizer remains stable over updates.
Arguments:
config: Read-only object with configuration options.
init_db: When set to False, then initialisation of database
tables should be skipped. This option is only required for
migration purposes and can be safely ignored by custom
tokenizers.
"""
@abstractmethod
def init_from_project(self, config: Configuration) -> None:
""" Initialise the tokenizer from an existing database setup.
The function should load all previously saved configuration from
the project directory and/or the property table.
Arguments:
config: Read-only object with configuration options.
"""
@abstractmethod
def finalize_import(self, config: Configuration) -> None:
""" This function is called at the very end of an import when all
data has been imported and indexed. The tokenizer may create
at this point any additional indexes and data structures needed
during query time.
Arguments:
config: Read-only object with configuration options.
"""
@abstractmethod
def update_sql_functions(self, config: Configuration) -> None:
""" Update the SQL part of the tokenizer. This function is called
automatically on migrations or may be called explicitly by the
user through the `nominatim refresh --functions` command.
The tokenizer must only update the code of the tokenizer. The
data structures or data itself must not be changed by this function.
Arguments:
config: Read-only object with configuration options.
"""
@abstractmethod
def check_database(self, config: Configuration) -> Optional[str]:
""" Check that the database is set up correctly and ready for being
queried.
Arguments:
config: Read-only object with configuration options.
Returns:
If an issue was found, return an error message with the
description of the issue as well as hints for the user on
how to resolve the issue. If everything is okay, return `None`.
"""
@abstractmethod
def update_statistics(self, config: Configuration, threads: int = 1) -> None:
""" Recompute any tokenizer statistics necessary for efficient lookup.
This function is meant to be called from time to time by the user
to improve performance. However, the tokenizer must not depend on
it to be called in order to work.
"""
@abstractmethod
def update_word_tokens(self) -> None:
""" Do house-keeping on the tokenizers internal data structures.
Remove unused word tokens, resort data etc.
"""
@abstractmethod
def name_analyzer(self) -> AbstractAnalyzer:
""" Create a new analyzer for tokenizing names and queries
using this tokenizer. Analyzers are context managers and should
be used accordingly:
```
with tokenizer.name_analyzer() as analyzer:
analyzer.tokenize()
```
When used outside the with construct, the caller must ensure that
the close() function is called before the analyzer is discarded.
"""
@abstractmethod
def most_frequent_words(self, conn: Connection, num: int) -> List[str]:
""" Return a list of the most frequent full words in the database.
Arguments:
conn: Open connection to the database which may be used to
retrieve the words.
num: Maximum number of words to return.
"""
class TokenizerModule(Protocol):
""" Interface that must be exported by modules that implement their
own tokenizer.
"""
def create(self, dsn: str, data_dir: Path) -> AbstractTokenizer:
""" Factory for new tokenizers.
"""

View File

@@ -0,0 +1,102 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Functions for creating a tokenizer or initialising the right one for an
existing database.
A tokenizer is something that is bound to the lifetime of a database. It
can be chosen and configured before the initial import but then needs to
be used consistently when querying and updating the database.
This module provides the functions to create and configure a new tokenizer
as well as instantiating the appropriate tokenizer for updating an existing
database.
A tokenizer usually also includes PHP code for querying. The appropriate PHP
normalizer module is installed when the tokenizer is created.
"""
from typing import Optional
import logging
import importlib
from pathlib import Path
from nominatim_core.errors import UsageError
from nominatim_core.db import properties
from nominatim_core.db.connection import connect
from nominatim_core.config import Configuration
from ..tokenizer.base import AbstractTokenizer, TokenizerModule
LOG = logging.getLogger()
def _import_tokenizer(name: str) -> TokenizerModule:
""" Load the tokenizer.py module from project directory.
"""
src_file = Path(__file__).parent / (name + '_tokenizer.py')
if not src_file.is_file():
LOG.fatal("No tokenizer named '%s' available. "
"Check the setting of NOMINATIM_TOKENIZER.", name)
raise UsageError('Tokenizer not found')
return importlib.import_module('nominatim.tokenizer.' + name + '_tokenizer')
def create_tokenizer(config: Configuration, init_db: bool = True,
module_name: Optional[str] = None) -> AbstractTokenizer:
""" Create a new tokenizer as defined by the given configuration.
The tokenizer data and code is copied into the 'tokenizer' directory
of the project directory and the tokenizer loaded from its new location.
"""
if module_name is None:
module_name = config.TOKENIZER
# Create the directory for the tokenizer data
assert config.project_dir is not None
basedir = config.project_dir / 'tokenizer'
if not basedir.exists():
basedir.mkdir()
elif not basedir.is_dir():
LOG.fatal("Tokenizer directory '%s' cannot be created.", basedir)
raise UsageError("Tokenizer setup failed.")
# Import and initialize the tokenizer.
tokenizer_module = _import_tokenizer(module_name)
tokenizer = tokenizer_module.create(config.get_libpq_dsn(), basedir)
tokenizer.init_new_db(config, init_db=init_db)
with connect(config.get_libpq_dsn()) as conn:
properties.set_property(conn, 'tokenizer', module_name)
return tokenizer
def get_tokenizer_for_db(config: Configuration) -> AbstractTokenizer:
""" Instantiate a tokenizer for an existing database.
The function looks up the appropriate tokenizer in the database
and initialises it.
"""
assert config.project_dir is not None
basedir = config.project_dir / 'tokenizer'
if not basedir.is_dir():
# Directory will be repopulated by tokenizer below.
basedir.mkdir()
with connect(config.get_libpq_dsn()) as conn:
name = properties.get_property(conn, 'tokenizer')
if name is None:
LOG.fatal("Tokenizer was not set up properly. Database property missing.")
raise UsageError('Cannot initialize tokenizer.')
tokenizer_module = _import_tokenizer(name)
tokenizer = tokenizer_module.create(config.get_libpq_dsn(), basedir)
tokenizer.init_from_project(config)
return tokenizer
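# Usage sketch (illustrative only): during the initial import the tokenizer is
# created once and its name is stored as a database property; later runs look
# it up again through that property:
#   tokenizer = create_tokenizer(config)        # at import time
#   tokenizer = get_tokenizer_for_db(config)    # for updates and queries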

View File

@@ -0,0 +1,196 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Helper class to create ICU rules from a configuration file.
"""
from typing import Mapping, Any, Dict, Optional
import io
import json
import logging
from icu import Transliterator
from nominatim_core.config import flatten_config_list, Configuration
from nominatim_core.db.properties import set_property, get_property
from nominatim_core.db.connection import Connection
from nominatim_core.errors import UsageError
from .place_sanitizer import PlaceSanitizer
from .icu_token_analysis import ICUTokenAnalysis
from .token_analysis.base import AnalysisModule, Analyzer
from ..data import country_info
LOG = logging.getLogger()
DBCFG_IMPORT_NORM_RULES = "tokenizer_import_normalisation"
DBCFG_IMPORT_TRANS_RULES = "tokenizer_import_transliteration"
DBCFG_IMPORT_ANALYSIS_RULES = "tokenizer_import_analysis_rules"
def _get_section(rules: Mapping[str, Any], section: str) -> Any:
""" Get the section named 'section' from the rules. If the section does
not exist, raise a usage error with a meaningful message.
"""
if section not in rules:
LOG.fatal("Section '%s' not found in tokenizer config.", section)
raise UsageError("Syntax error in tokenizer configuration file.")
return rules[section]
class ICURuleLoader:
""" Compiler for ICU rules from a tokenizer configuration file.
"""
def __init__(self, config: Configuration) -> None:
self.config = config
rules = config.load_sub_configuration('icu_tokenizer.yaml',
config='TOKENIZER_CONFIG')
# Make sure country information is available to analyzers and sanitizers.
country_info.setup_country_config(config)
self.normalization_rules = self._cfg_to_icu_rules(rules, 'normalization')
self.transliteration_rules = self._cfg_to_icu_rules(rules, 'transliteration')
self.analysis_rules = _get_section(rules, 'token-analysis')
self._setup_analysis()
# Load optional sanitizer rule set.
self.sanitizer_rules = rules.get('sanitizers', [])
def load_config_from_db(self, conn: Connection) -> None:
""" Get previously saved parts of the configuration from the
database.
"""
rules = get_property(conn, DBCFG_IMPORT_NORM_RULES)
if rules is not None:
self.normalization_rules = rules
rules = get_property(conn, DBCFG_IMPORT_TRANS_RULES)
if rules is not None:
self.transliteration_rules = rules
rules = get_property(conn, DBCFG_IMPORT_ANALYSIS_RULES)
if rules:
self.analysis_rules = json.loads(rules)
else:
self.analysis_rules = []
self._setup_analysis()
def save_config_to_db(self, conn: Connection) -> None:
""" Save the part of the configuration that cannot be changed into
the database.
"""
set_property(conn, DBCFG_IMPORT_NORM_RULES, self.normalization_rules)
set_property(conn, DBCFG_IMPORT_TRANS_RULES, self.transliteration_rules)
set_property(conn, DBCFG_IMPORT_ANALYSIS_RULES, json.dumps(self.analysis_rules))
def make_sanitizer(self) -> PlaceSanitizer:
""" Create a place sanitizer from the configured rules.
"""
return PlaceSanitizer(self.sanitizer_rules, self.config)
def make_token_analysis(self) -> ICUTokenAnalysis:
""" Create a token analyser from the reviouly loaded rules.
"""
return ICUTokenAnalysis(self.normalization_rules,
self.transliteration_rules, self.analysis)
def get_search_rules(self) -> str:
""" Return the ICU rules to be used during search.
The rules combine normalization and transliteration.
"""
# First apply the normalization rules.
rules = io.StringIO()
rules.write(self.normalization_rules)
# Then add transliteration.
rules.write(self.transliteration_rules)
return rules.getvalue()
def get_normalization_rules(self) -> str:
""" Return rules for normalisation of a term.
"""
return self.normalization_rules
def get_transliteration_rules(self) -> str:
""" Return the rules for converting a string into its asciii representation.
"""
return self.transliteration_rules
def _setup_analysis(self) -> None:
""" Process the rules used for creating the various token analyzers.
"""
self.analysis: Dict[Optional[str], TokenAnalyzerRule] = {}
if not isinstance(self.analysis_rules, list):
raise UsageError("Configuration section 'token-analysis' must be a list.")
norm = Transliterator.createFromRules("rule_loader_normalization",
self.normalization_rules)
trans = Transliterator.createFromRules("rule_loader_transliteration",
self.transliteration_rules)
for section in self.analysis_rules:
name = section.get('id', None)
if name in self.analysis:
if name is None:
LOG.fatal("ICU tokenizer configuration has two default token analyzers.")
else:
LOG.fatal("ICU tokenizer configuration has two token "
"analyzers with id '%s'.", name)
raise UsageError("Syntax error in ICU tokenizer config.")
self.analysis[name] = TokenAnalyzerRule(section, norm, trans,
self.config)
@staticmethod
def _cfg_to_icu_rules(rules: Mapping[str, Any], section: str) -> str:
""" Load an ICU ruleset from the given section. If the section is a
simple string, it is interpreted as a file name and the rules are
loaded verbatim from the given file. The filename is expected to be
relative to the tokenizer rule file. If the section is a list then
each line is assumed to be a rule. All rules are concatenated and returned.
"""
content = _get_section(rules, section)
if content is None:
return ''
return ';'.join(flatten_config_list(content, section)) + ';'
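# Illustrative example (not part of the module): given a hypothetical section
#   normalization:
#     - ":: lower ()"
#     - ":: NFC ()"
# _cfg_to_icu_rules() returns the single string ':: lower ();:: NFC ();', i.e.
# all rules joined by ';' with a trailing ';'.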
class TokenAnalyzerRule:
""" Factory for a single analysis module. The class saves the configuration
and creates a new token analyzer on request.
"""
def __init__(self, rules: Mapping[str, Any],
normalizer: Any, transliterator: Any,
config: Configuration) -> None:
analyzer_name = _get_section(rules, 'analyzer')
if not analyzer_name or not isinstance(analyzer_name, str):
raise UsageError("'analyzer' parameter needs to be simple string")
self._analysis_mod: AnalysisModule = \
config.load_plugin_module(analyzer_name, 'nominatim.tokenizer.token_analysis')
self.config = self._analysis_mod.configure(rules, normalizer,
transliterator)
def create(self, normalizer: Any, transliterator: Any) -> Analyzer:
""" Create a new analyser instance for the given rule.
"""
return self._analysis_mod.create(normalizer, transliterator, self.config)

View File

@@ -0,0 +1,43 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Container class collecting all components required to transform an OSM name
into a Nominatim token.
"""
from typing import Mapping, Optional, TYPE_CHECKING
from icu import Transliterator
from .token_analysis.base import Analyzer
if TYPE_CHECKING:
from typing import Any
from .icu_rule_loader import TokenAnalyzerRule # pylint: disable=cyclic-import
class ICUTokenAnalysis:
""" Container class collecting the transliterators and token analysis
modules for a single Analyser instance.
"""
def __init__(self, norm_rules: str, trans_rules: str,
analysis_rules: Mapping[Optional[str], 'TokenAnalyzerRule']):
self.normalizer = Transliterator.createFromRules("icu_normalization",
norm_rules)
trans_rules += ";[:Space:]+ > ' '"
self.to_ascii = Transliterator.createFromRules("icu_to_ascii",
trans_rules)
self.search = Transliterator.createFromRules("icu_search",
norm_rules + trans_rules)
self.analysis = {name: arules.create(self.normalizer, self.to_ascii)
for name, arules in analysis_rules.items()}
def get_analyzer(self, name: Optional[str]) -> Analyzer:
""" Return the given named analyzer. If no analyzer with that
name exists, return the default analyzer.
"""
return self.analysis.get(name) or self.analysis[None]

View File

@@ -0,0 +1,952 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Tokenizer implementing normalisation as used before Nominatim 4 but using
libICU instead of the PostgreSQL module.
"""
from typing import Optional, Sequence, List, Tuple, Mapping, Any, cast, \
Dict, Set, Iterable
import itertools
import json
import logging
from pathlib import Path
from textwrap import dedent
from nominatim_core.db.connection import connect, Connection, Cursor
from nominatim_core.config import Configuration
from nominatim_core.db.utils import CopyBuffer
from nominatim_core.db.sql_preprocessor import SQLPreprocessor
from ..data.place_info import PlaceInfo
from ..data.place_name import PlaceName
from .icu_rule_loader import ICURuleLoader
from .place_sanitizer import PlaceSanitizer
from .icu_token_analysis import ICUTokenAnalysis
from .base import AbstractAnalyzer, AbstractTokenizer
DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"
LOG = logging.getLogger()
WORD_TYPES = (('country_names', 'C'),
('postcodes', 'P'),
('full_word', 'W'),
('housenumbers', 'H'))
def create(dsn: str, data_dir: Path) -> 'ICUTokenizer':
""" Create a new instance of the tokenizer provided by this module.
"""
return ICUTokenizer(dsn, data_dir)
class ICUTokenizer(AbstractTokenizer):
""" This tokenizer uses libICU to convert names and queries to ASCII.
Otherwise it uses the same algorithms and data structures as the
normalization routines in Nominatim 3.
"""
def __init__(self, dsn: str, data_dir: Path) -> None:
self.dsn = dsn
self.data_dir = data_dir
self.loader: Optional[ICURuleLoader] = None
def init_new_db(self, config: Configuration, init_db: bool = True) -> None:
""" Set up a new tokenizer for the database.
This copies all necessary data in the project directory to make
sure the tokenizer remains stable even over updates.
"""
self.loader = ICURuleLoader(config)
self._install_php(config.lib_dir.php, overwrite=True)
self._save_config()
if init_db:
self.update_sql_functions(config)
self._setup_db_tables(config)
self._create_base_indices(config, 'word')
def init_from_project(self, config: Configuration) -> None:
""" Initialise the tokenizer from the project directory.
"""
self.loader = ICURuleLoader(config)
with connect(self.dsn) as conn:
self.loader.load_config_from_db(conn)
self._install_php(config.lib_dir.php, overwrite=False)
def finalize_import(self, config: Configuration) -> None:
""" Do any required postprocessing to make the tokenizer data ready
for use.
"""
self._create_lookup_indices(config, 'word')
def update_sql_functions(self, config: Configuration) -> None:
""" Reimport the SQL functions for this tokenizer.
"""
with connect(self.dsn) as conn:
sqlp = SQLPreprocessor(conn, config)
sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')
def check_database(self, config: Configuration) -> None:
""" Check that the tokenizer is set up correctly.
"""
# Will throw an error if there is an issue.
self.init_from_project(config)
def update_statistics(self, config: Configuration, threads: int = 2) -> None:
""" Recompute frequencies for all name words.
"""
with connect(self.dsn) as conn:
if not conn.table_exists('search_name'):
return
with conn.cursor() as cur:
cur.execute('ANALYSE search_name')
if threads > 1:
cur.execute('SET max_parallel_workers_per_gather TO %s',
(min(threads, 6),))
if conn.server_version_tuple() < (12, 0):
LOG.info('Computing word frequencies')
cur.drop_table('word_frequencies')
cur.drop_table('addressword_frequencies')
cur.execute("""CREATE TEMP TABLE word_frequencies AS
SELECT unnest(name_vector) as id, count(*)
FROM search_name GROUP BY id""")
cur.execute('CREATE INDEX ON word_frequencies(id)')
cur.execute("""CREATE TEMP TABLE addressword_frequencies AS
SELECT unnest(nameaddress_vector) as id, count(*)
FROM search_name GROUP BY id""")
cur.execute('CREATE INDEX ON addressword_frequencies(id)')
cur.execute("""CREATE OR REPLACE FUNCTION word_freq_update(wid INTEGER,
INOUT info JSONB)
AS $$
DECLARE rec RECORD;
BEGIN
IF info is null THEN
info = '{}'::jsonb;
END IF;
FOR rec IN SELECT count FROM word_frequencies WHERE id = wid
LOOP
info = info || jsonb_build_object('count', rec.count);
END LOOP;
FOR rec IN SELECT count FROM addressword_frequencies WHERE id = wid
LOOP
info = info || jsonb_build_object('addr_count', rec.count);
END LOOP;
IF info = '{}'::jsonb THEN
info = null;
END IF;
END;
$$ LANGUAGE plpgsql IMMUTABLE;
""")
LOG.info('Update word table with recomputed frequencies')
cur.drop_table('tmp_word')
cur.execute("""CREATE TABLE tmp_word AS
SELECT word_id, word_token, type, word,
word_freq_update(word_id, info) as info
FROM word
""")
cur.drop_table('word_frequencies')
cur.drop_table('addressword_frequencies')
else:
LOG.info('Computing word frequencies')
cur.drop_table('word_frequencies')
cur.execute("""
CREATE TEMP TABLE word_frequencies AS
WITH word_freq AS MATERIALIZED (
SELECT unnest(name_vector) as id, count(*)
FROM search_name GROUP BY id),
addr_freq AS MATERIALIZED (
SELECT unnest(nameaddress_vector) as id, count(*)
FROM search_name GROUP BY id)
SELECT coalesce(a.id, w.id) as id,
(CASE WHEN w.count is null THEN '{}'::JSONB
ELSE jsonb_build_object('count', w.count) END
||
CASE WHEN a.count is null THEN '{}'::JSONB
ELSE jsonb_build_object('addr_count', a.count) END) as info
FROM word_freq w FULL JOIN addr_freq a ON a.id = w.id;
""")
cur.execute('CREATE UNIQUE INDEX ON word_frequencies(id) INCLUDE(info)')
cur.execute('ANALYSE word_frequencies')
LOG.info('Update word table with recomputed frequencies')
cur.drop_table('tmp_word')
cur.execute("""CREATE TABLE tmp_word AS
SELECT word_id, word_token, type, word,
(CASE WHEN wf.info is null THEN word.info
ELSE coalesce(word.info, '{}'::jsonb) || wf.info
END) as info
FROM word LEFT JOIN word_frequencies wf
ON word.word_id = wf.id
""")
cur.drop_table('word_frequencies')
with conn.cursor() as cur:
cur.execute('SET max_parallel_workers_per_gather TO 0')
sqlp = SQLPreprocessor(conn, config)
sqlp.run_string(conn,
'GRANT SELECT ON tmp_word TO "{{config.DATABASE_WEBUSER}}"')
conn.commit()
self._create_base_indices(config, 'tmp_word')
self._create_lookup_indices(config, 'tmp_word')
self._move_temporary_word_table('tmp_word')
def _cleanup_housenumbers(self) -> None:
""" Remove unused house numbers.
"""
with connect(self.dsn) as conn:
if not conn.table_exists('search_name'):
return
with conn.cursor(name="hnr_counter") as cur:
cur.execute("""SELECT DISTINCT word_id, coalesce(info->>'lookup', word_token)
FROM word
WHERE type = 'H'
AND NOT EXISTS(SELECT * FROM search_name
WHERE ARRAY[word.word_id] && name_vector)
AND (char_length(coalesce(word, word_token)) > 6
OR coalesce(word, word_token) not similar to '\\d+')
""")
candidates = {token: wid for wid, token in cur}
with conn.cursor(name="hnr_counter") as cur:
cur.execute("""SELECT housenumber FROM placex
WHERE housenumber is not null
AND (char_length(housenumber) > 6
OR housenumber not similar to '\\d+')
""")
for row in cur:
for hnr in row[0].split(';'):
candidates.pop(hnr, None)
LOG.info("There are %s outdated housenumbers.", len(candidates))
LOG.debug("Outdated housenumbers: %s", candidates.keys())
if candidates:
with conn.cursor() as cur:
cur.execute("""DELETE FROM word WHERE word_id = any(%s)""",
(list(candidates.values()), ))
conn.commit()
def update_word_tokens(self) -> None:
""" Remove unused tokens.
"""
LOG.warning("Cleaning up housenumber tokens.")
self._cleanup_housenumbers()
LOG.warning("Tokenizer house-keeping done.")
def name_analyzer(self) -> 'ICUNameAnalyzer':
""" Create a new analyzer for tokenizing names and queries
using this tokenizer. Analyzers are context managers and should
be used accordingly:
```
with tokenizer.name_analyzer() as analyzer:
analyzer.tokenize()
```
When used outside the with construct, the caller must ensure that
the close() function is called before the analyzer is discarded.
Analyzers are not thread-safe. You need to instantiate one per thread.
"""
assert self.loader is not None
return ICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
self.loader.make_token_analysis())
def most_frequent_words(self, conn: Connection, num: int) -> List[str]:
""" Return a list of the `num` most frequent full words
in the database.
"""
with conn.cursor() as cur:
cur.execute("""SELECT word, sum((info->>'count')::int) as count
FROM word WHERE type = 'W'
GROUP BY word
ORDER BY count DESC LIMIT %s""", (num,))
return list(s[0].split('@')[0] for s in cur)
def _install_php(self, phpdir: Optional[Path], overwrite: bool = True) -> None:
""" Install the php script for the tokenizer.
"""
if phpdir is not None:
assert self.loader is not None
php_file = self.data_dir / "tokenizer.php"
if not php_file.exists() or overwrite:
php_file.write_text(dedent(f"""\
<?php
@define('CONST_Max_Word_Frequency', 10000000);
@define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
@define('CONST_Transliteration', "{self.loader.get_search_rules()}");
require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""), encoding='utf-8')
def _save_config(self) -> None:
""" Save the configuration that needs to remain stable for the given
database as database properties.
"""
assert self.loader is not None
with connect(self.dsn) as conn:
self.loader.save_config_to_db(conn)
def _setup_db_tables(self, config: Configuration) -> None:
""" Set up the word table and fill it with pre-computed word
frequencies.
"""
with connect(self.dsn) as conn:
with conn.cursor() as cur:
cur.drop_table('word')
sqlp = SQLPreprocessor(conn, config)
sqlp.run_string(conn, """
CREATE TABLE word (
word_id INTEGER,
word_token text NOT NULL,
type text NOT NULL,
word text,
info jsonb
) {{db.tablespace.search_data}};
GRANT SELECT ON word TO "{{config.DATABASE_WEBUSER}}";
DROP SEQUENCE IF EXISTS seq_word;
CREATE SEQUENCE seq_word start 1;
GRANT SELECT ON seq_word to "{{config.DATABASE_WEBUSER}}";
""")
conn.commit()
def _create_base_indices(self, config: Configuration, table_name: str) -> None:
""" Set up the word table and fill it with pre-computed word
frequencies.
"""
with connect(self.dsn) as conn:
sqlp = SQLPreprocessor(conn, config)
sqlp.run_string(conn,
"""CREATE INDEX idx_{{table_name}}_word_token ON {{table_name}}
USING BTREE (word_token) {{db.tablespace.search_index}}""",
table_name=table_name)
for name, ctype in WORD_TYPES:
sqlp.run_string(conn,
"""CREATE INDEX idx_{{table_name}}_{{idx_name}} ON {{table_name}}
USING BTREE (word) {{db.tablespace.address_index}}
WHERE type = '{{column_type}}'
""",
table_name=table_name, idx_name=name,
column_type=ctype)
conn.commit()
def _create_lookup_indices(self, config: Configuration, table_name: str) -> None:
""" Create additional indexes used when running the API.
"""
with connect(self.dsn) as conn:
sqlp = SQLPreprocessor(conn, config)
# Index required for details lookup.
sqlp.run_string(conn, """
CREATE INDEX IF NOT EXISTS idx_{{table_name}}_word_id
ON {{table_name}} USING BTREE (word_id) {{db.tablespace.search_index}}
""",
table_name=table_name)
conn.commit()
def _move_temporary_word_table(self, old: str) -> None:
""" Rename all tables and indexes used by the tokenizer.
"""
with connect(self.dsn) as conn:
with conn.cursor() as cur:
cur.drop_table('word')
cur.execute(f"ALTER TABLE {old} RENAME TO word")
for idx in ('word_token', 'word_id'):
cur.execute(f"""ALTER INDEX idx_{old}_{idx}
RENAME TO idx_word_{idx}""")
for name, _ in WORD_TYPES:
cur.execute(f"""ALTER INDEX idx_{old}_{name}
RENAME TO idx_word_{name}""")
conn.commit()
class ICUNameAnalyzer(AbstractAnalyzer):
""" The ICU analyzer uses the ICU library for splitting names.
Each instance opens a connection to the database to request the
normalization.
"""
def __init__(self, dsn: str, sanitizer: PlaceSanitizer,
token_analysis: ICUTokenAnalysis) -> None:
self.conn: Optional[Connection] = connect(dsn).connection
self.conn.autocommit = True
self.sanitizer = sanitizer
self.token_analysis = token_analysis
self._cache = _TokenCache()
def close(self) -> None:
""" Free all resources used by the analyzer.
"""
if self.conn:
self.conn.close()
self.conn = None
def _search_normalized(self, name: str) -> str:
""" Return the search token transliteration of the given name.
"""
return cast(str, self.token_analysis.search.transliterate(name)).strip()
def _normalized(self, name: str) -> str:
""" Return the normalized version of the given name with all
non-relevant information removed.
"""
return cast(str, self.token_analysis.normalizer.transliterate(name)).strip()
def get_word_token_info(self, words: Sequence[str]) -> List[Tuple[str, str, int]]:
""" Return token information for the given list of words.
If a word starts with # it is assumed to be a full name
otherwise it is assumed to be a partial name.
The function returns a list of tuples with
(original word, word token, word id).
The function is used for testing and debugging only
and is not necessarily efficient.
"""
assert self.conn is not None
full_tokens = {}
partial_tokens = {}
for word in words:
if word.startswith('#'):
full_tokens[word] = self._search_normalized(word[1:])
else:
partial_tokens[word] = self._search_normalized(word)
with self.conn.cursor() as cur:
cur.execute("""SELECT word_token, word_id
FROM word WHERE word_token = ANY(%s) and type = 'W'
""", (list(full_tokens.values()),))
full_ids = {r[0]: r[1] for r in cur}
cur.execute("""SELECT word_token, word_id
FROM word WHERE word_token = ANY(%s) and type = 'w'""",
(list(partial_tokens.values()),))
part_ids = {r[0]: r[1] for r in cur}
return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
+ [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]
def normalize_postcode(self, postcode: str) -> str:
""" Convert the postcode to a standardized form.
This function must yield exactly the same result as the SQL function
'token_normalized_postcode()'.
"""
return postcode.strip().upper()
def update_postcodes_from_db(self) -> None:
""" Update postcode tokens in the word table from the location_postcode
table.
"""
assert self.conn is not None
analyzer = self.token_analysis.analysis.get('@postcode')
with self.conn.cursor() as cur:
# First get all postcode names currently in the word table.
cur.execute("SELECT DISTINCT word FROM word WHERE type = 'P'")
word_entries = set((entry[0] for entry in cur))
# Then compute the required postcode names from the postcode table.
needed_entries = set()
cur.execute("SELECT country_code, postcode FROM location_postcode")
for cc, postcode in cur:
info = PlaceInfo({'country_code': cc,
'class': 'place', 'type': 'postcode',
'address': {'postcode': postcode}})
address = self.sanitizer.process_names(info)[1]
for place in address:
if place.kind == 'postcode':
if analyzer is None:
postcode_name = place.name.strip().upper()
variant_base = None
else:
postcode_name = analyzer.get_canonical_id(place)
variant_base = place.get_attr("variant")
if variant_base:
needed_entries.add(f'{postcode_name}@{variant_base}')
else:
needed_entries.add(postcode_name)
break
# Now update the word table.
self._delete_unused_postcode_words(word_entries - needed_entries)
self._add_missing_postcode_words(needed_entries - word_entries)
def _delete_unused_postcode_words(self, tokens: Iterable[str]) -> None:
assert self.conn is not None
if tokens:
with self.conn.cursor() as cur:
cur.execute("DELETE FROM word WHERE type = 'P' and word = any(%s)",
(list(tokens), ))
def _add_missing_postcode_words(self, tokens: Iterable[str]) -> None:
assert self.conn is not None
if not tokens:
return
analyzer = self.token_analysis.analysis.get('@postcode')
terms = []
for postcode_name in tokens:
if '@' in postcode_name:
term, variant = postcode_name.split('@', 1)
term = self._search_normalized(term)
if analyzer is None:
variants = [term]
else:
variants = analyzer.compute_variants(variant)
if term not in variants:
variants.append(term)
else:
variants = [self._search_normalized(postcode_name)]
terms.append((postcode_name, variants))
if terms:
with self.conn.cursor() as cur:
cur.execute_values("""SELECT create_postcode_word(pc, var)
FROM (VALUES %s) AS v(pc, var)""",
terms)
def update_special_phrases(self, phrases: Iterable[Tuple[str, str, str, str]],
should_replace: bool) -> None:
""" Replace the search index for special phrases with the new phrases.
If `should_replace` is True, then the previous set of phrases will be
completely replaced. Otherwise the phrases are added to the
already existing ones.
"""
assert self.conn is not None
norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
for p in phrases))
with self.conn.cursor() as cur:
# Get the old phrases.
existing_phrases = set()
cur.execute("SELECT word, info FROM word WHERE type = 'S'")
for word, info in cur:
existing_phrases.add((word, info['class'], info['type'],
info.get('op') or '-'))
added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
if should_replace:
deleted = self._remove_special_phrases(cur, norm_phrases,
existing_phrases)
else:
deleted = 0
LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
len(norm_phrases), added, deleted)
def _add_special_phrases(self, cursor: Cursor,
new_phrases: Set[Tuple[str, str, str, str]],
existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
""" Add all phrases to the database that are not yet there.
"""
to_add = new_phrases - existing_phrases
added = 0
with CopyBuffer() as copystr:
for word, cls, typ, oper in to_add:
term = self._search_normalized(word)
if term:
copystr.add(term, 'S', word,
json.dumps({'class': cls, 'type': typ,
'op': oper if oper in ('in', 'near') else None}))
added += 1
copystr.copy_out(cursor, 'word',
columns=['word_token', 'type', 'word', 'info'])
return added
def _remove_special_phrases(self, cursor: Cursor,
new_phrases: Set[Tuple[str, str, str, str]],
existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
""" Remove all phrases from the database that are no longer in the
new phrase list.
"""
to_delete = existing_phrases - new_phrases
if to_delete:
cursor.execute_values(
""" DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
WHERE type = 'S' and word = name
and info->>'class' = in_class and info->>'type' = in_type
and ((op = '-' and info->>'op' is null) or op = info->>'op')
""", to_delete)
return len(to_delete)
def add_country_names(self, country_code: str, names: Mapping[str, str]) -> None:
""" Add default names for the given country to the search index.
"""
# Make sure any name preprocessing for country names applies.
info = PlaceInfo({'name': names, 'country_code': country_code,
'rank_address': 4, 'class': 'boundary',
'type': 'administrative'})
self._add_country_full_names(country_code,
self.sanitizer.process_names(info)[0],
internal=True)
def _add_country_full_names(self, country_code: str, names: Sequence[PlaceName],
internal: bool = False) -> None:
""" Add names for the given country from an already sanitized
name list.
"""
assert self.conn is not None
word_tokens = set()
for name in names:
norm_name = self._search_normalized(name.name)
if norm_name:
word_tokens.add(norm_name)
with self.conn.cursor() as cur:
# Get existing names
cur.execute("""SELECT word_token, coalesce(info ? 'internal', false) as is_internal
FROM word
WHERE type = 'C' and word = %s""",
(country_code, ))
# internal/external names
existing_tokens: Dict[bool, Set[str]] = {True: set(), False: set()}
for word in cur:
existing_tokens[word[1]].add(word[0])
# Delete names that no longer exist.
gone_tokens = existing_tokens[internal] - word_tokens
if internal:
gone_tokens.update(existing_tokens[False] & word_tokens)
if gone_tokens:
cur.execute("""DELETE FROM word
USING unnest(%s) as token
WHERE type = 'C' and word = %s
and word_token = token""",
(list(gone_tokens), country_code))
# Only add those names that are not yet in the list.
new_tokens = word_tokens - existing_tokens[True]
if not internal:
new_tokens -= existing_tokens[False]
if new_tokens:
if internal:
sql = """INSERT INTO word (word_token, type, word, info)
(SELECT token, 'C', %s, '{"internal": "yes"}'
FROM unnest(%s) as token)
"""
else:
sql = """INSERT INTO word (word_token, type, word)
(SELECT token, 'C', %s
FROM unnest(%s) as token)
"""
cur.execute(sql, (country_code, list(new_tokens)))
def process_place(self, place: PlaceInfo) -> Mapping[str, Any]:
""" Determine tokenizer information about the given place.
Returns a JSON-serializable structure that will be handed into
the database via the token_info field.
"""
token_info = _TokenInfo()
names, address = self.sanitizer.process_names(place)
if names:
token_info.set_names(*self._compute_name_tokens(names))
if place.is_country():
assert place.country_code is not None
self._add_country_full_names(place.country_code, names)
if address:
self._process_place_address(token_info, address)
return token_info.to_dict()
def _process_place_address(self, token_info: '_TokenInfo',
address: Sequence[PlaceName]) -> None:
for item in address:
if item.kind == 'postcode':
token_info.set_postcode(self._add_postcode(item))
elif item.kind == 'housenumber':
token_info.add_housenumber(*self._compute_housenumber_token(item))
elif item.kind == 'street':
token_info.add_street(self._retrieve_full_tokens(item.name))
elif item.kind == 'place':
if not item.suffix:
token_info.add_place(itertools.chain(*self._compute_name_tokens([item])))
elif not item.kind.startswith('_') and not item.suffix and \
item.kind not in ('country', 'full', 'inclusion'):
token_info.add_address_term(item.kind,
itertools.chain(*self._compute_name_tokens([item])))
def _compute_housenumber_token(self, hnr: PlaceName) -> Tuple[Optional[int], Optional[str]]:
""" Normalize the housenumber and return the word token and the
canonical form.
"""
assert self.conn is not None
analyzer = self.token_analysis.analysis.get('@housenumber')
result: Tuple[Optional[int], Optional[str]] = (None, None)
if analyzer is None:
# When no custom analyzer is set, simply normalize and transliterate
norm_name = self._search_normalized(hnr.name)
if norm_name:
result = self._cache.housenumbers.get(norm_name, result)
if result[0] is None:
with self.conn.cursor() as cur:
hid = cur.scalar("SELECT getorcreate_hnr_id(%s)", (norm_name, ))
result = hid, norm_name
self._cache.housenumbers[norm_name] = result
else:
# Otherwise use the analyzer to determine the canonical name.
# Per convention we use the first variant as the 'lookup name', the
# name that gets saved in the housenumber field of the place.
word_id = analyzer.get_canonical_id(hnr)
if word_id:
result = self._cache.housenumbers.get(word_id, result)
if result[0] is None:
variants = analyzer.compute_variants(word_id)
if variants:
with self.conn.cursor() as cur:
hid = cur.scalar("SELECT create_analyzed_hnr_id(%s, %s)",
(word_id, list(variants)))
result = hid, variants[0]
self._cache.housenumbers[word_id] = result
return result
def _retrieve_full_tokens(self, name: str) -> List[int]:
""" Get the full name token for the given name, if it exists.
The name is only retrieved for the standard analyser.
"""
assert self.conn is not None
norm_name = self._search_normalized(name)
# return cached if possible
if norm_name in self._cache.fulls:
return self._cache.fulls[norm_name]
with self.conn.cursor() as cur:
cur.execute("SELECT word_id FROM word WHERE word_token = %s and type = 'W'",
(norm_name, ))
full = [row[0] for row in cur]
self._cache.fulls[norm_name] = full
return full
def _compute_name_tokens(self, names: Sequence[PlaceName]) -> Tuple[Set[int], Set[int]]:
""" Computes the full name and partial name tokens for the given
dictionary of names.
"""
assert self.conn is not None
full_tokens: Set[int] = set()
partial_tokens: Set[int] = set()
for name in names:
analyzer_id = name.get_attr('analyzer')
analyzer = self.token_analysis.get_analyzer(analyzer_id)
word_id = analyzer.get_canonical_id(name)
if analyzer_id is None:
token_id = word_id
else:
token_id = f'{word_id}@{analyzer_id}'
full, part = self._cache.names.get(token_id, (None, None))
if full is None:
variants = analyzer.compute_variants(word_id)
if not variants:
continue
with self.conn.cursor() as cur:
cur.execute("SELECT * FROM getorcreate_full_word(%s, %s)",
(token_id, variants))
full, part = cast(Tuple[int, List[int]], cur.fetchone())
self._cache.names[token_id] = (full, part)
assert part is not None
full_tokens.add(full)
partial_tokens.update(part)
return full_tokens, partial_tokens
def _add_postcode(self, item: PlaceName) -> Optional[str]:
""" Make sure the normalized postcode is present in the word table.
"""
assert self.conn is not None
analyzer = self.token_analysis.analysis.get('@postcode')
if analyzer is None:
postcode_name = item.name.strip().upper()
variant_base = None
else:
postcode_name = analyzer.get_canonical_id(item)
variant_base = item.get_attr("variant")
if variant_base:
postcode = f'{postcode_name}@{variant_base}'
else:
postcode = postcode_name
if postcode not in self._cache.postcodes:
term = self._search_normalized(postcode_name)
if not term:
return None
variants = {term}
if analyzer is not None and variant_base:
variants.update(analyzer.compute_variants(variant_base))
with self.conn.cursor() as cur:
cur.execute("SELECT create_postcode_word(%s, %s)",
(postcode, list(variants)))
self._cache.postcodes.add(postcode)
return postcode_name
class _TokenInfo:
""" Collect token information to be sent back to the database.
"""
def __init__(self) -> None:
self.names: Optional[str] = None
self.housenumbers: Set[str] = set()
self.housenumber_tokens: Set[int] = set()
self.street_tokens: Optional[Set[int]] = None
self.place_tokens: Set[int] = set()
self.address_tokens: Dict[str, str] = {}
self.postcode: Optional[str] = None
def _mk_array(self, tokens: Iterable[Any]) -> str:
return f"{{{','.join((str(s) for s in tokens))}}}"
def to_dict(self) -> Dict[str, Any]:
""" Return the token information in database importable format.
"""
out: Dict[str, Any] = {}
if self.names:
out['names'] = self.names
if self.housenumbers:
out['hnr'] = ';'.join(self.housenumbers)
out['hnr_tokens'] = self._mk_array(self.housenumber_tokens)
if self.street_tokens is not None:
out['street'] = self._mk_array(self.street_tokens)
if self.place_tokens:
out['place'] = self._mk_array(self.place_tokens)
if self.address_tokens:
out['addr'] = self.address_tokens
if self.postcode:
out['postcode'] = self.postcode
return out
def set_names(self, fulls: Iterable[int], partials: Iterable[int]) -> None:
""" Adds token information for the normalised names.
"""
self.names = self._mk_array(itertools.chain(fulls, partials))
def add_housenumber(self, token: Optional[int], hnr: Optional[str]) -> None:
""" Extract housenumber information from a list of normalised
housenumbers.
"""
if token:
assert hnr is not None
self.housenumbers.add(hnr)
self.housenumber_tokens.add(token)
def add_street(self, tokens: Iterable[int]) -> None:
""" Add addr:street match terms.
"""
if self.street_tokens is None:
self.street_tokens = set()
self.street_tokens.update(tokens)
def add_place(self, tokens: Iterable[int]) -> None:
""" Add addr:place search and match terms.
"""
self.place_tokens.update(tokens)
def add_address_term(self, key: str, partials: Iterable[int]) -> None:
""" Add additional address terms.
"""
array = self._mk_array(partials)
if len(array) > 2:
self.address_tokens[key] = array
def set_postcode(self, postcode: Optional[str]) -> None:
""" Set the postcode to the given one.
"""
self.postcode = postcode
class _TokenCache:
""" Cache for token information to avoid repeated database queries.
This cache is not thread-safe and needs to be instantiated per
analyzer.
"""
def __init__(self) -> None:
self.names: Dict[str, Tuple[int, List[int]]] = {}
self.partials: Dict[str, int] = {}
self.fulls: Dict[str, List[int]] = {}
self.postcodes: Set[str] = set()
self.housenumbers: Dict[str, Tuple[Optional[int], Optional[str]]] = {}
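# Illustrative sketch, not part of this commit: how _TokenInfo (defined above)
# assembles the JSON-serializable payload that process_place() hands to the
# database. The token ids are invented for the example.
info = _TokenInfo()
info.set_names([101, 102], [201, 202])
info.add_housenumber(301, '25a')
info.add_street([401])
info.set_postcode('80331')
print(info.to_dict())
# -> {'names': '{101,102,201,202}', 'hnr': '25a', 'hnr_tokens': '{301}',
#     'street': '{401}', 'postcode': '80331'}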

View File

@@ -0,0 +1,681 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Tokenizer implementing normalisation as used before Nominatim 4.
"""
from typing import Optional, Sequence, List, Tuple, Mapping, Any, Callable, \
cast, Dict, Set, Iterable
from collections import OrderedDict
import logging
from pathlib import Path
import re
import shutil
from textwrap import dedent
from icu import Transliterator
import psycopg2
import psycopg2.extras
from nominatim_core.errors import UsageError
from nominatim_core.db.connection import connect, Connection
from nominatim_core.config import Configuration
from nominatim_core.db import properties
from nominatim_core.db import utils as db_utils
from nominatim_core.db.sql_preprocessor import SQLPreprocessor
from ..data.place_info import PlaceInfo
from .base import AbstractAnalyzer, AbstractTokenizer
DBCFG_NORMALIZATION = "tokenizer_normalization"
DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
LOG = logging.getLogger()
def create(dsn: str, data_dir: Path) -> 'LegacyTokenizer':
""" Create a new instance of the tokenizer provided by this module.
"""
return LegacyTokenizer(dsn, data_dir)
def _install_module(config_module_path: str, src_dir: Path, module_dir: Path) -> str:
""" Copies the PostgreSQL normalisation module into the project
directory if necessary. For historical reasons the module is
saved in the '/module' subdirectory and not with the other tokenizer
data.
The function detects when the installation is run from the
build directory. It doesn't touch the module in that case.
"""
# Custom module locations are simply used as is.
if config_module_path:
LOG.info("Using custom path for database module at '%s'", config_module_path)
return config_module_path
# Compatibility mode for builddir installations.
if module_dir.exists() and src_dir.samefile(module_dir):
LOG.info('Running from build directory. Leaving database module as is.')
return str(module_dir)
# In any other case install the module in the project directory.
if not module_dir.exists():
module_dir.mkdir()
destfile = module_dir / 'nominatim.so'
shutil.copy(str(src_dir / 'nominatim.so'), str(destfile))
destfile.chmod(0o755)
LOG.info('Database module installed at %s', str(destfile))
return str(module_dir)
def _check_module(module_dir: str, conn: Connection) -> None:
""" Try to use the PostgreSQL module to confirm that it is correctly
installed and accessible from PostgreSQL.
"""
with conn.cursor() as cur:
try:
cur.execute("""CREATE FUNCTION nominatim_test_import_func(text)
RETURNS text AS %s, 'transliteration'
LANGUAGE c IMMUTABLE STRICT;
DROP FUNCTION nominatim_test_import_func(text)
""", (f'{module_dir}/nominatim.so', ))
except psycopg2.DatabaseError as err:
LOG.fatal("Error accessing database module: %s", err)
raise UsageError("Database module cannot be accessed.") from err
class LegacyTokenizer(AbstractTokenizer):
""" The legacy tokenizer uses a special PostgreSQL module to normalize
names and queries. The tokenizer thus implements normalization through
calls to the database.
"""
def __init__(self, dsn: str, data_dir: Path) -> None:
self.dsn = dsn
self.data_dir = data_dir
self.normalization: Optional[str] = None
def init_new_db(self, config: Configuration, init_db: bool = True) -> None:
""" Set up a new tokenizer for the database.
This copies all necessary data in the project directory to make
sure the tokenizer remains stable even over updates.
"""
assert config.project_dir is not None
module_dir = _install_module(config.DATABASE_MODULE_PATH,
config.lib_dir.module,
config.project_dir / 'module')
self.normalization = config.TERM_NORMALIZATION
self._install_php(config, overwrite=True)
with connect(self.dsn) as conn:
_check_module(module_dir, conn)
self._save_config(conn, config)
conn.commit()
if init_db:
self.update_sql_functions(config)
self._init_db_tables(config)
def init_from_project(self, config: Configuration) -> None:
""" Initialise the tokenizer from the project directory.
"""
assert config.project_dir is not None
with connect(self.dsn) as conn:
self.normalization = properties.get_property(conn, DBCFG_NORMALIZATION)
if not (config.project_dir / 'module' / 'nominatim.so').exists():
_install_module(config.DATABASE_MODULE_PATH,
config.lib_dir.module,
config.project_dir / 'module')
self._install_php(config, overwrite=False)
def finalize_import(self, config: Configuration) -> None:
""" Do any required postprocessing to make the tokenizer data ready
for use.
"""
with connect(self.dsn) as conn:
sqlp = SQLPreprocessor(conn, config)
sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')
def update_sql_functions(self, config: Configuration) -> None:
""" Reimport the SQL functions for this tokenizer.
"""
assert config.project_dir is not None
with connect(self.dsn) as conn:
max_word_freq = properties.get_property(conn, DBCFG_MAXWORDFREQ)
modulepath = config.DATABASE_MODULE_PATH or \
str((config.project_dir / 'module').resolve())
sqlp = SQLPreprocessor(conn, config)
sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer.sql',
max_word_freq=max_word_freq,
modulepath=modulepath)
def check_database(self, _: Configuration) -> Optional[str]:
""" Check that the tokenizer is set up correctly.
"""
hint = """\
The Postgresql extension nominatim.so was not correctly loaded.
Error: {error}
Hints:
* Check the output of the CMake/make installation step
* Does nominatim.so exist?
* Does nominatim.so exist on the database server?
* Can nominatim.so be accessed by the database user?
"""
with connect(self.dsn) as conn:
with conn.cursor() as cur:
try:
out = cur.scalar("SELECT make_standard_name('a')")
except psycopg2.Error as err:
return hint.format(error=str(err))
if out != 'a':
return hint.format(error='Unexpected result for make_standard_name()')
return None
def migrate_database(self, config: Configuration) -> None:
""" Initialise the project directory of an existing database for
use with this tokenizer.
This is a special migration function for updating existing databases
to new software versions.
"""
assert config.project_dir is not None
self.normalization = config.TERM_NORMALIZATION
module_dir = _install_module(config.DATABASE_MODULE_PATH,
config.lib_dir.module,
config.project_dir / 'module')
with connect(self.dsn) as conn:
_check_module(module_dir, conn)
self._save_config(conn, config)
def update_statistics(self, config: Configuration, threads: int = 1) -> None:
""" Recompute the frequency of full words.
"""
with connect(self.dsn) as conn:
if conn.table_exists('search_name'):
with conn.cursor() as cur:
cur.drop_table("word_frequencies")
LOG.info("Computing word frequencies")
cur.execute("""CREATE TEMP TABLE word_frequencies AS
SELECT unnest(name_vector) as id, count(*)
FROM search_name GROUP BY id""")
cur.execute("CREATE INDEX ON word_frequencies(id)")
LOG.info("Update word table with recomputed frequencies")
cur.execute("""UPDATE word SET search_name_count = count
FROM word_frequencies
WHERE word_token like ' %' and word_id = id""")
cur.drop_table("word_frequencies")
conn.commit()
def update_word_tokens(self) -> None:
""" No house-keeping implemented for the legacy tokenizer.
"""
LOG.info("No tokenizer clean-up available.")
def name_analyzer(self) -> 'LegacyNameAnalyzer':
""" Create a new analyzer for tokenizing names and queries
using this tokenizer. Analyzers are context managers and should
be used accordingly:
```
with tokenizer.name_analyzer() as analyzer:
analyzer.tokenize()
```
When used outside the with construct, the caller must make sure to
call the close() function before destroying the analyzer.
Analyzers are not thread-safe. You need to instantiate one per thread.
"""
normalizer = Transliterator.createFromRules("phrase normalizer",
self.normalization)
return LegacyNameAnalyzer(self.dsn, normalizer)
def most_frequent_words(self, conn: Connection, num: int) -> List[str]:
""" Return a list of the `num` most frequent full words
in the database.
"""
with conn.cursor() as cur:
cur.execute(""" SELECT word FROM word WHERE word is not null
ORDER BY search_name_count DESC LIMIT %s""", (num,))
return list(s[0] for s in cur)
def _install_php(self, config: Configuration, overwrite: bool = True) -> None:
""" Install the php script for the tokenizer.
"""
if config.lib_dir.php is not None:
php_file = self.data_dir / "tokenizer.php"
if not php_file.exists() or overwrite:
php_file.write_text(dedent(f"""\
<?php
@define('CONST_Max_Word_Frequency', {config.MAX_WORD_FREQUENCY});
@define('CONST_Term_Normalization_Rules', "{config.TERM_NORMALIZATION}");
require_once('{config.lib_dir.php}/tokenizer/legacy_tokenizer.php');
"""), encoding='utf-8')
def _init_db_tables(self, config: Configuration) -> None:
""" Set up the word table and fill it with pre-computed word
frequencies.
"""
with connect(self.dsn) as conn:
sqlp = SQLPreprocessor(conn, config)
sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_tables.sql')
conn.commit()
LOG.warning("Precomputing word tokens")
db_utils.execute_file(self.dsn, config.lib_dir.data / 'words.sql')
def _save_config(self, conn: Connection, config: Configuration) -> None:
""" Save the configuration that needs to remain stable for the given
database as database properties.
"""
assert self.normalization is not None
properties.set_property(conn, DBCFG_NORMALIZATION, self.normalization)
properties.set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
class LegacyNameAnalyzer(AbstractAnalyzer):
""" The legacy analyzer uses the special Postgresql module for
splitting names.
Each instance opens a connection to the database to request the
normalization.
"""
def __init__(self, dsn: str, normalizer: Any):
self.conn: Optional[Connection] = connect(dsn).connection
self.conn.autocommit = True
self.normalizer = normalizer
psycopg2.extras.register_hstore(self.conn)
self._cache = _TokenCache(self.conn)
def close(self) -> None:
""" Free all resources used by the analyzer.
"""
if self.conn:
self.conn.close()
self.conn = None
def get_word_token_info(self, words: Sequence[str]) -> List[Tuple[str, str, int]]:
""" Return token information for the given list of words.
If a word starts with # it is assumed to be a full name
otherwise it is assumed to be a partial name.
The function returns a list of tuples with
(original word, word token, word id).
The function is used for testing and debugging only
and is not necessarily efficient.
"""
assert self.conn is not None
with self.conn.cursor() as cur:
cur.execute("""SELECT t.term, word_token, word_id
FROM word, (SELECT unnest(%s::TEXT[]) as term) t
WHERE word_token = (CASE
WHEN left(t.term, 1) = '#' THEN
' ' || make_standard_name(substring(t.term from 2))
ELSE
make_standard_name(t.term)
END)
and class is null and country_code is null""",
(words, ))
return [(r[0], r[1], r[2]) for r in cur]
def normalize(self, phrase: str) -> str:
""" Normalize the given phrase, i.e. remove all properties that
are irrelevant for search.
"""
return cast(str, self.normalizer.transliterate(phrase))
def normalize_postcode(self, postcode: str) -> str:
""" Convert the postcode to a standardized form.
This function must yield exactly the same result as the SQL function
'token_normalized_postcode()'.
"""
return postcode.strip().upper()
def update_postcodes_from_db(self) -> None:
""" Update postcode tokens in the word table from the location_postcode
table.
"""
assert self.conn is not None
with self.conn.cursor() as cur:
# This finds the rows in location_postcode and word that are
# missing from the other table.
cur.execute("""SELECT * FROM
(SELECT pc, word FROM
(SELECT distinct(postcode) as pc FROM location_postcode) p
FULL JOIN
(SELECT word FROM word
WHERE class ='place' and type = 'postcode') w
ON pc = word) x
WHERE pc is null or word is null""")
to_delete = []
to_add = []
for postcode, word in cur:
if postcode is None:
to_delete.append(word)
else:
to_add.append(postcode)
if to_delete:
cur.execute("""DELETE FROM WORD
WHERE class ='place' and type = 'postcode'
and word = any(%s)
""", (to_delete, ))
if to_add:
cur.execute("""SELECT count(create_postcode_id(pc))
FROM unnest(%s) as pc
""", (to_add, ))
def update_special_phrases(self, phrases: Iterable[Tuple[str, str, str, str]],
should_replace: bool) -> None:
""" Replace the search index for special phrases with the new phrases.
"""
assert self.conn is not None
norm_phrases = set(((self.normalize(p[0]), p[1], p[2], p[3])
for p in phrases))
with self.conn.cursor() as cur:
# Get the old phrases.
existing_phrases = set()
cur.execute("""SELECT word, class, type, operator FROM word
WHERE class != 'place'
OR (type != 'house' AND type != 'postcode')""")
for label, cls, typ, oper in cur:
existing_phrases.add((label, cls, typ, oper or '-'))
to_add = norm_phrases - existing_phrases
to_delete = existing_phrases - norm_phrases
if to_add:
cur.execute_values(
""" INSERT INTO word (word_id, word_token, word, class, type,
search_name_count, operator)
(SELECT nextval('seq_word'), ' ' || make_standard_name(name), name,
class, type, 0,
CASE WHEN op in ('in', 'near') THEN op ELSE null END
FROM (VALUES %s) as v(name, class, type, op))""",
to_add)
if to_delete and should_replace:
cur.execute_values(
""" DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
WHERE word = name and class = in_class and type = in_type
and ((op = '-' and operator is null) or op = operator)""",
to_delete)
LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
len(norm_phrases), len(to_add), len(to_delete))
def add_country_names(self, country_code: str, names: Mapping[str, str]) -> None:
""" Add names for the given country to the search index.
"""
assert self.conn is not None
with self.conn.cursor() as cur:
cur.execute(
"""INSERT INTO word (word_id, word_token, country_code)
(SELECT nextval('seq_word'), lookup_token, %s
FROM (SELECT DISTINCT ' ' || make_standard_name(n) as lookup_token
FROM unnest(%s)n) y
WHERE NOT EXISTS(SELECT * FROM word
WHERE word_token = lookup_token and country_code = %s))
""", (country_code, list(names.values()), country_code))
def process_place(self, place: PlaceInfo) -> Mapping[str, Any]:
""" Determine tokenizer information about the given place.
Returns a JSON-serialisable structure that will be handed into
the database via the token_info field.
"""
assert self.conn is not None
token_info = _TokenInfo(self._cache)
names = place.name
if names:
token_info.add_names(self.conn, names)
if place.is_country():
assert place.country_code is not None
self.add_country_names(place.country_code, names)
address = place.address
if address:
self._process_place_address(token_info, address)
return token_info.data
def _process_place_address(self, token_info: '_TokenInfo', address: Mapping[str, str]) -> None:
assert self.conn is not None
hnrs = []
addr_terms = []
for key, value in address.items():
if key == 'postcode':
# Make sure the normalized postcode is present in the word table.
if re.search(r'[:,;]', value) is None:
norm_pc = self.normalize_postcode(value)
token_info.set_postcode(norm_pc)
self._cache.add_postcode(self.conn, norm_pc)
elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
hnrs.append(value)
elif key == 'street':
token_info.add_street(self.conn, value)
elif key == 'place':
token_info.add_place(self.conn, value)
elif not key.startswith('_') \
and key not in ('country', 'full', 'inclusion'):
addr_terms.append((key, value))
if hnrs:
token_info.add_housenumbers(self.conn, hnrs)
if addr_terms:
token_info.add_address_terms(self.conn, addr_terms)
class _TokenInfo:
""" Collect token information to be sent back to the database.
"""
def __init__(self, cache: '_TokenCache') -> None:
self.cache = cache
self.data: Dict[str, Any] = {}
def add_names(self, conn: Connection, names: Mapping[str, str]) -> None:
""" Add token information for the names of the place.
"""
with conn.cursor() as cur:
# Create the token IDs for all names.
self.data['names'] = cur.scalar("SELECT make_keywords(%s)::text",
(names, ))
def add_housenumbers(self, conn: Connection, hnrs: Sequence[str]) -> None:
""" Extract housenumber information from the address.
"""
if len(hnrs) == 1:
token = self.cache.get_housenumber(hnrs[0])
if token is not None:
self.data['hnr_tokens'] = token
self.data['hnr'] = hnrs[0]
return
# split numbers if necessary
simple_list: List[str] = []
for hnr in hnrs:
simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))
if len(simple_list) > 1:
simple_list = list(set(simple_list))
with conn.cursor() as cur:
cur.execute("SELECT * FROM create_housenumbers(%s)", (simple_list, ))
result = cur.fetchone()
assert result is not None
self.data['hnr_tokens'], self.data['hnr'] = result
def set_postcode(self, postcode: str) -> None:
""" Set or replace the postcode token with the given value.
"""
self.data['postcode'] = postcode
def add_street(self, conn: Connection, street: str) -> None:
""" Add addr:street match terms.
"""
def _get_street(name: str) -> Optional[str]:
with conn.cursor() as cur:
return cast(Optional[str],
cur.scalar("SELECT word_ids_from_name(%s)::text", (name, )))
tokens = self.cache.streets.get(street, _get_street)
self.data['street'] = tokens or '{}'
def add_place(self, conn: Connection, place: str) -> None:
""" Add addr:place search and match terms.
"""
def _get_place(name: str) -> Tuple[List[int], List[int]]:
with conn.cursor() as cur:
cur.execute("""SELECT make_keywords(hstore('name' , %s))::text,
word_ids_from_name(%s)::text""",
(name, name))
return cast(Tuple[List[int], List[int]], cur.fetchone())
self.data['place_search'], self.data['place_match'] = \
self.cache.places.get(place, _get_place)
def add_address_terms(self, conn: Connection, terms: Sequence[Tuple[str, str]]) -> None:
""" Add additional address terms.
"""
def _get_address_term(name: str) -> Tuple[List[int], List[int]]:
with conn.cursor() as cur:
cur.execute("""SELECT addr_ids_from_name(%s)::text,
word_ids_from_name(%s)::text""",
(name, name))
return cast(Tuple[List[int], List[int]], cur.fetchone())
tokens = {}
for key, value in terms:
items = self.cache.address_terms.get(value, _get_address_term)
if items[0] or items[1]:
tokens[key] = items
if tokens:
self.data['addr'] = tokens
class _LRU:
""" Least recently used cache that accepts a generator function to
produce the item when there is a cache miss.
"""
def __init__(self, maxsize: int = 128):
self.data: 'OrderedDict[str, Any]' = OrderedDict()
self.maxsize = maxsize
def get(self, key: str, generator: Callable[[str], Any]) -> Any:
""" Get the item with the given key from the cache. If nothing
is found in the cache, generate the value through the
generator function and store it in the cache.
"""
value = self.data.get(key)
if value is not None:
self.data.move_to_end(key)
else:
value = generator(key)
if len(self.data) >= self.maxsize:
self.data.popitem(last=False)
self.data[key] = value
return value
class _TokenCache:
""" Cache for token information to avoid repeated database queries.
This cache is not thread-safe and needs to be instantiated per
analyzer.
"""
def __init__(self, conn: Connection):
# various LRU caches
self.streets = _LRU(maxsize=256)
self.places = _LRU(maxsize=128)
self.address_terms = _LRU(maxsize=1024)
# Look up housenumbers up to 100 and cache them
with conn.cursor() as cur:
cur.execute("""SELECT i, ARRAY[getorcreate_housenumber_id(i::text)]::text
FROM generate_series(1, 100) as i""")
self._cached_housenumbers: Dict[str, str] = {str(r[0]): r[1] for r in cur}
# For postcodes remember the ones that have already been added
self.postcodes: Set[str] = set()
def get_housenumber(self, number: str) -> Optional[str]:
""" Get a housenumber token from the cache.
"""
return self._cached_housenumbers.get(number)
def add_postcode(self, conn: Connection, postcode: str) -> None:
""" Make sure the given postcode is in the database.
"""
if postcode not in self.postcodes:
with conn.cursor() as cur:
cur.execute('SELECT create_postcode_id(%s)', (postcode, ))
self.postcodes.add(postcode)
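# Illustrative sketch, not part of this commit: the _LRU helper defined above
# computes a value on a cache miss and drops the least recently used entry
# once maxsize is reached.
lru = _LRU(maxsize=2)
lru.get('a', str.upper)   # miss, stores 'A'
lru.get('b', str.upper)   # miss, stores 'B'
lru.get('a', str.upper)   # hit, 'a' becomes the most recently used entry
lru.get('c', str.upper)   # miss, evicts 'b' and stores 'C'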

View File

@@ -0,0 +1,53 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Handler for cleaning name and address tags in place information before it
is handed to the token analysis.
"""
from typing import Optional, List, Mapping, Sequence, Callable, Any, Tuple
from nominatim_core.errors import UsageError
from nominatim_core.config import Configuration
from .sanitizers.config import SanitizerConfig
from .sanitizers.base import SanitizerHandler, ProcessInfo
from ..data.place_name import PlaceName
from ..data.place_info import PlaceInfo
class PlaceSanitizer:
""" Controller class which applies sanitizer functions on the place
names and address before they are used by the token analysers.
"""
def __init__(self, rules: Optional[Sequence[Mapping[str, Any]]],
config: Configuration) -> None:
self.handlers: List[Callable[[ProcessInfo], None]] = []
if rules:
for func in rules:
if 'step' not in func:
raise UsageError("Sanitizer rule is missing the 'step' attribute.")
if not isinstance(func['step'], str):
raise UsageError("'step' attribute must be a simple string.")
module: SanitizerHandler = \
config.load_plugin_module(func['step'], 'nominatim.tokenizer.sanitizers')
self.handlers.append(module.create(SanitizerConfig(func)))
def process_names(self, place: PlaceInfo) -> Tuple[List[PlaceName], List[PlaceName]]:
""" Extract a sanitized list of names and address parts from the
given place. The function returns a tuple
(list of names, list of address names)
"""
obj = ProcessInfo(place)
for func in self.handlers:
func(obj)
return obj.names, obj.address
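# Illustrative sketch, not part of this commit. It assumes `config` is a loaded
# nominatim_core Configuration and that the step names below exist as sanitizer
# modules (they follow Nominatim's default configuration, assumptions here).
rules = [{'step': 'clean-housenumbers'},
         {'step': 'clean-postcodes'}]
sanitizer = PlaceSanitizer(rules, config)
names, address = sanitizer.process_names(
    PlaceInfo({'name': {'name': 'Main Street'},
               'address': {'housenumber': '3;5', 'postcode': '80331'}}))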

View File

@@ -0,0 +1,64 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Common data types and protocols for sanitizers.
"""
from typing import Optional, List, Mapping, Callable
from nominatim_core.typing import Protocol, Final
from ...data.place_info import PlaceInfo
from ...data.place_name import PlaceName
from .config import SanitizerConfig
class ProcessInfo:
""" Container class for information handed into to handler functions.
The 'names' and 'address' members are mutable. A handler must change
them either by modifying the lists in place or by replacing the old content
with a new list.
"""
def __init__(self, place: PlaceInfo):
self.place: Final = place
self.names = self._convert_name_dict(place.name)
self.address = self._convert_name_dict(place.address)
@staticmethod
def _convert_name_dict(names: Optional[Mapping[str, str]]) -> List[PlaceName]:
""" Convert a dictionary of names into a list of PlaceNames.
The dictionary key is split into the primary part of the key
and the suffix (the part after an optional colon).
"""
out = []
if names:
for key, value in names.items():
parts = key.split(':', 1)
out.append(PlaceName(value.strip(),
parts[0].strip(),
parts[1].strip() if len(parts) > 1 else None))
return out
class SanitizerHandler(Protocol):
""" Protocol for sanitizer modules.
"""
def create(self, config: SanitizerConfig) -> Callable[[ProcessInfo], None]:
"""
Create a function for sanitizing a place.
Arguments:
config: A dictionary with the additional configuration options
specified in the tokenizer configuration
Return:
The result must be a callable that takes a place description
and transforms name and address as required.
"""

View File

@@ -0,0 +1,80 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Sanitizer that preprocesses address tags for house numbers. The sanitizer
allows you to
* define which tags are to be considered house numbers (see 'filter-kind')
* split house number lists into individual numbers (see 'delimiters')
Arguments:
delimiters: Define the set of characters to be used for
splitting a list of house numbers into parts. (default: ',;')
filter-kind: Define the address tags that are considered to be a
house number. Either takes a single string or a list of strings,
where each string is a regular expression. An address item
is considered a house number if the 'kind' fully matches any
of the given regular expressions. (default: 'housenumber')
convert-to-name: Define house numbers that should be treated as a name
instead of a house number. Either takes a single string
or a list of strings, where each string is a regular
expression that must match the full house number value.
"""
from typing import Callable, Iterator, List
from ...data.place_name import PlaceName
from .base import ProcessInfo
from .config import SanitizerConfig
class _HousenumberSanitizer:
def __init__(self, config: SanitizerConfig) -> None:
self.filter_kind = config.get_filter('filter-kind', ['housenumber'])
self.split_regexp = config.get_delimiter()
self.filter_name = config.get_filter('convert-to-name', 'FAIL_ALL')
def __call__(self, obj: ProcessInfo) -> None:
if not obj.address:
return
new_address: List[PlaceName] = []
for item in obj.address:
if self.filter_kind(item.kind):
if self.filter_name(item.name):
obj.names.append(item.clone(kind='housenumber'))
else:
new_address.extend(item.clone(kind='housenumber', name=n)
for n in self.sanitize(item.name))
else:
# Don't touch other address items.
new_address.append(item)
obj.address = new_address
def sanitize(self, value: str) -> Iterator[str]:
""" Extract housenumbers in a regularized format from an OSM value.
The function works as a generator that yields all valid housenumbers
that can be created from the value.
"""
for hnr in self.split_regexp.split(value):
if hnr:
yield from self._regularize(hnr)
def _regularize(self, hnr: str) -> Iterator[str]:
yield hnr
def create(config: SanitizerConfig) -> Callable[[ProcessInfo], None]:
""" Create a housenumber processing function.
"""
return _HousenumberSanitizer(config)
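# Illustrative sketch, not part of this commit, assuming the default settings
# documented above (PlaceInfo from ..data.place_info is assumed importable):
# a delimited house number list becomes one address item per number.
sanitize = create(SanitizerConfig({'step': 'clean-housenumbers'}))
obj = ProcessInfo(PlaceInfo({'address': {'housenumber': '3;5'}}))
sanitize(obj)
# obj.address now holds two items of kind 'housenumber', named '3' and '5'.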

View File

@@ -0,0 +1,80 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Sanitizer that filters postcodes by their officially allowed pattern.
Arguments:
convert-to-address: If set to 'yes' (the default), then postcodes that do
not conform with their country-specific pattern are
converted to an address component. That means that
the postcode is not taken into account when computing the
postcode centroids of a country but is still searchable.
When set to 'no', non-conforming postcodes are not
searchable either.
default-pattern: Pattern to use, when there is none available for the
country in question. Warning: will not be used for
objects that have no country assigned. These are always
assumed to have no postcode.
"""
from typing import Callable, Optional, Tuple
from ...data.postcode_format import PostcodeFormatter
from .base import ProcessInfo
from .config import SanitizerConfig
class _PostcodeSanitizer:
def __init__(self, config: SanitizerConfig) -> None:
self.convert_to_address = config.get_bool('convert-to-address', True)
self.matcher = PostcodeFormatter()
default_pattern = config.get('default-pattern')
if default_pattern is not None and isinstance(default_pattern, str):
self.matcher.set_default_pattern(default_pattern)
def __call__(self, obj: ProcessInfo) -> None:
if not obj.address:
return
postcodes = ((i, o) for i, o in enumerate(obj.address) if o.kind == 'postcode')
for pos, postcode in postcodes:
formatted = self.scan(postcode.name, obj.place.country_code)
if formatted is None:
if self.convert_to_address:
postcode.kind = 'unofficial_postcode'
else:
obj.address.pop(pos)
else:
postcode.name = formatted[0]
postcode.set_attr('variant', formatted[1])
def scan(self, postcode: str, country: Optional[str]) -> Optional[Tuple[str, str]]:
""" Check the postcode for correct formatting and return the
normalized version. Returns None if the postcode does not
correspond to the official format of the given country.
"""
match = self.matcher.match(country, postcode)
if match is None:
return None
assert country is not None
return self.matcher.normalize(country, match),\
' '.join(filter(lambda p: p is not None, match.groups()))
def create(config: SanitizerConfig) -> Callable[[ProcessInfo], None]:
""" Create a function that filters postcodes by their officially allowed pattern.
"""
return _PostcodeSanitizer(config)
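# Illustrative sketch, not part of this commit, assuming a postcode pattern is
# known for 'de' and PlaceInfo is importable: a well-formed postcode is
# normalized in place; a non-matching one would be downgraded to an
# 'unofficial_postcode' item by default.
check = create(SanitizerConfig({'step': 'clean-postcodes'}))
obj = ProcessInfo(PlaceInfo({'country_code': 'de',
                             'address': {'postcode': '80331'}}))
check(obj)
# obj.address[0].name now holds the normalized postcode.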

View File

@@ -0,0 +1,46 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Sanitizer that preprocesses tags from the TIGER import.
It makes the following changes:
* remove state reference from tiger:county
"""
from typing import Callable
import re
from .base import ProcessInfo
from .config import SanitizerConfig
COUNTY_MATCH = re.compile('(.*), [A-Z][A-Z]')
def _clean_tiger_county(obj: ProcessInfo) -> None:
""" Remove the state reference from tiger:county tags.
This transforms a name like 'Hamilton, AL' into 'Hamilton'.
If no state reference is detected at the end, the name is left as is.
"""
if not obj.address:
return
for item in obj.address:
if item.kind == 'tiger' and item.suffix == 'county':
m = COUNTY_MATCH.fullmatch(item.name)
if m:
item.name = m[1]
# Switch kind and suffix, the split left them reversed.
item.kind = 'county'
item.suffix = 'tiger'
return
def create(_: SanitizerConfig) -> Callable[[ProcessInfo], None]:
""" Create a function that preprocesses tags from the TIGER import.
"""
return _clean_tiger_county
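# Illustrative sketch, not part of this commit (the step name and the PlaceInfo
# import are assumptions): stripping the state reference from a tiger:county tag.
clean = create(SanitizerConfig({'step': 'clean-tiger-tags'}))
obj = ProcessInfo(PlaceInfo({'address': {'tiger:county': 'Hamilton, AL'}}))
clean(obj)
# obj.address[0] is now PlaceName('Hamilton', 'county', 'tiger').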

Some files were not shown because too many files have changed in this diff