forked from hans/Nominatim
split code into submodules
38
src/nominatim_api/__init__.py
Normal file
@@ -0,0 +1,38 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
The public interface of the Nominatim library.

Classes and functions defined in this file are considered stable. Always
import from this file, not from the source files directly.
"""

# See also https://github.com/PyCQA/pylint/issues/6006
# pylint: disable=useless-import-alias

from .core import (NominatimAPI as NominatimAPI,
                   NominatimAPIAsync as NominatimAPIAsync)
from .connection import (SearchConnection as SearchConnection)
from .status import (StatusResult as StatusResult)
from .types import (PlaceID as PlaceID,
                    OsmID as OsmID,
                    PlaceRef as PlaceRef,
                    Point as Point,
                    Bbox as Bbox,
                    GeometryFormat as GeometryFormat,
                    DataLayer as DataLayer)
from .results import (SourceTable as SourceTable,
                      AddressLine as AddressLine,
                      AddressLines as AddressLines,
                      WordInfo as WordInfo,
                      WordInfos as WordInfos,
                      DetailedResult as DetailedResult,
                      ReverseResult as ReverseResult,
                      ReverseResults as ReverseResults,
                      SearchResult as SearchResult,
                      SearchResults as SearchResults)
from .localization import (Locales as Locales)
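As a quick orientation, a minimal usage sketch of this public interface; the project directory path is a placeholder for a local Nominatim installation, not something defined in this commit:

from pathlib import Path

from nominatim_api import NominatimAPI

# Hypothetical project directory; adjust to your installation.
api = NominatimAPI(Path('/srv/nominatim-project'))
try:
    for place in api.search('Birkenhead'):
        print(place.place_id, place.centroid)
finally:
    api.close()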
149
src/nominatim_api/connection.py
Normal file
@@ -0,0 +1,149 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Extended SQLAlchemy connection class that also includes access to the schema.
"""
from typing import cast, Any, Mapping, Sequence, Union, Dict, Optional, Set, \
                   Awaitable, Callable, TypeVar
import asyncio

import sqlalchemy as sa
from sqlalchemy.ext.asyncio import AsyncConnection

from nominatim_core.typing import SaFromClause
from nominatim_core.db.sqlalchemy_schema import SearchTables
from nominatim_core.db.sqlalchemy_types import Geometry
from .logging import log

T = TypeVar('T')


class SearchConnection:
    """ An extended SQLAlchemy connection class that also contains
        the table definitions. The underlying asynchronous SQLAlchemy
        connection can be accessed with the 'connection' property.
        The 't' property is the collection of Nominatim tables.
    """

    def __init__(self, conn: AsyncConnection,
                 tables: SearchTables,
                 properties: Dict[str, Any]) -> None:
        self.connection = conn
        self.t = tables # pylint: disable=invalid-name
        self._property_cache = properties
        self._classtables: Optional[Set[str]] = None
        self.query_timeout: Optional[int] = None


    def set_query_timeout(self, timeout: Optional[int]) -> None:
        """ Set the timeout after which a query over this connection
            is cancelled.
        """
        self.query_timeout = timeout


    async def scalar(self, sql: sa.sql.base.Executable,
                     params: Union[Mapping[str, Any], None] = None
                    ) -> Any:
        """ Execute a 'scalar()' query on the connection.
        """
        log().sql(self.connection, sql, params)
        return await asyncio.wait_for(self.connection.scalar(sql, params), self.query_timeout)


    async def execute(self, sql: 'sa.Executable',
                      params: Union[Mapping[str, Any], Sequence[Mapping[str, Any]], None] = None
                     ) -> 'sa.Result[Any]':
        """ Execute an 'execute()' query on the connection.
        """
        log().sql(self.connection, sql, params)
        return await asyncio.wait_for(self.connection.execute(sql, params), self.query_timeout)


    async def get_property(self, name: str, cached: bool = True) -> str:
        """ Get a property from Nominatim's property table.

            Property values are normally cached so that they are only
            retrieved from the database when they are queried for the
            first time with this function. Set 'cached' to False to force
            reading the property from the database.

            Raises a ValueError if the property does not exist.
        """
        lookup_name = f'DBPROP:{name}'

        if cached and lookup_name in self._property_cache:
            return cast(str, self._property_cache[lookup_name])

        sql = sa.select(self.t.properties.c.value)\
                .where(self.t.properties.c.property == name)
        value = await self.connection.scalar(sql)

        if value is None:
            raise ValueError(f"Property '{name}' not found in database.")

        self._property_cache[lookup_name] = cast(str, value)

        return cast(str, value)


    async def get_db_property(self, name: str) -> Any:
        """ Get a setting from the database. At the moment, only
            'server_version', the version of the database software, can
            be retrieved with this function.

            Raises a ValueError if the property does not exist.
        """
        if name != 'server_version':
            raise ValueError(f"DB setting '{name}' not found in database.")

        return self._property_cache['DB:server_version']


    async def get_cached_value(self, group: str, name: str,
                               factory: Callable[[], Awaitable[T]]) -> T:
        """ Access the cache for this Nominatim instance.
            Each cache value needs to belong to a group and have a name.
            This function is for internal API use only.

            `factory` is an async callback function that produces
            the value if it is not already cached.

            Returns the cached value or the result of factory (also caching
            the result).
        """
        full_name = f'{group}:{name}'

        if full_name in self._property_cache:
            return cast(T, self._property_cache[full_name])

        value = await factory()
        self._property_cache[full_name] = value

        return value


    async def get_class_table(self, cls: str, typ: str) -> Optional[SaFromClause]:
        """ Look up if there is a classtype table for the given category
            and return a SQLAlchemy table for it, if it exists.
        """
        if self._classtables is None:
            res = await self.execute(sa.text("""SELECT tablename FROM pg_tables
                                                WHERE tablename LIKE 'place_classtype_%'
                                             """))
            self._classtables = {r[0] for r in res}

        tablename = f"place_classtype_{cls}_{typ}"

        if tablename not in self._classtables:
            return None

        if tablename in self.t.meta.tables:
            return self.t.meta.tables[tablename]

        return sa.Table(tablename, self.t.meta,
                        sa.Column('place_id', sa.BigInteger),
                        sa.Column('centroid', Geometry))
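A short sketch of how this wrapper is meant to be used: the begin() context manager of NominatimAPIAsync (defined in core.py below) yields a SearchConnection, through which low-level queries and cached database properties are available. The project path is again a placeholder:

import asyncio
from pathlib import Path

from nominatim_api import NominatimAPIAsync

async def show_server_version() -> None:
    api = NominatimAPIAsync(Path('/srv/nominatim-project'))  # hypothetical path
    try:
        async with api.begin() as conn:
            # 'server_version' is the only DB setting supported by get_db_property().
            print(await conn.get_db_property('server_version'))
    finally:
        await api.close()

asyncio.run(show_server_version())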
974
src/nominatim_api/core.py
Normal file
@@ -0,0 +1,974 @@
|
||||
# SPDX-License-Identifier: GPL-3.0-or-later
|
||||
#
|
||||
# This file is part of Nominatim. (https://nominatim.org)
|
||||
#
|
||||
# Copyright (C) 2024 by the Nominatim developer community.
|
||||
# For a full list of authors see the git log.
|
||||
"""
|
||||
Implementation of classes for API access via libraries.
|
||||
"""
|
||||
from typing import Mapping, Optional, Any, AsyncIterator, Dict, Sequence, List, Tuple
|
||||
import asyncio
|
||||
import sys
|
||||
import contextlib
|
||||
from pathlib import Path
|
||||
|
||||
import sqlalchemy as sa
|
||||
import sqlalchemy.ext.asyncio as sa_asyncio
|
||||
|
||||
from nominatim_core.errors import UsageError
|
||||
from nominatim_core.db.sqlalchemy_schema import SearchTables
|
||||
from nominatim_core.db.async_core_library import PGCORE_LIB, PGCORE_ERROR
|
||||
from nominatim_core.config import Configuration
|
||||
from .sql import sqlite_functions, sqlalchemy_functions #pylint: disable=unused-import
|
||||
from .connection import SearchConnection
|
||||
from .status import get_status, StatusResult
|
||||
from .lookup import get_detailed_place, get_simple_place
|
||||
from .reverse import ReverseGeocoder
|
||||
from .search import ForwardGeocoder, Phrase, PhraseType, make_query_analyzer
|
||||
from . import types as ntyp
|
||||
from .results import DetailedResult, ReverseResult, SearchResults
|
||||
|
||||
|
||||
class NominatimAPIAsync: #pylint: disable=too-many-instance-attributes
|
||||
""" The main frontend to the Nominatim database implements the
|
||||
functions for lookup, forward and reverse geocoding using
|
||||
asynchronous functions.
|
||||
|
||||
This class shares most of the functions with its synchronous
|
||||
version. There are some additional functions or parameters,
|
||||
which are documented below.
|
||||
"""
|
||||
def __init__(self, project_dir: Path,
|
||||
environ: Optional[Mapping[str, str]] = None,
|
||||
loop: Optional[asyncio.AbstractEventLoop] = None) -> None:
|
||||
""" Initiate a new frontend object with synchronous API functions.
|
||||
|
||||
Parameters:
|
||||
project_dir: Path to the
|
||||
[project directory](../admin/Import.md#creating-the-project-directory)
|
||||
of the local Nominatim installation.
|
||||
environ: Mapping of [configuration parameters](../customize/Settings.md).
|
||||
When set, replaces any configuration via environment variables.
|
||||
Settings in this mapping also have precedence over any
|
||||
parameters found in the `.env` file of the project directory.
|
||||
loop: The asyncio event loop that will be used when calling
|
||||
functions. Only needed, when a custom event loop is used
|
||||
and the Python version is 3.9 or earlier.
|
||||
"""
|
||||
self.config = Configuration(project_dir, environ)
|
||||
self.query_timeout = self.config.get_int('QUERY_TIMEOUT') \
|
||||
if self.config.QUERY_TIMEOUT else None
|
||||
self.reverse_restrict_to_country_area = self.config.get_bool('SEARCH_WITHIN_COUNTRIES')
|
||||
self.server_version = 0
|
||||
|
||||
if sys.version_info >= (3, 10):
|
||||
self._engine_lock = asyncio.Lock()
|
||||
else:
|
||||
self._engine_lock = asyncio.Lock(loop=loop) # pylint: disable=unexpected-keyword-arg
|
||||
self._engine: Optional[sa_asyncio.AsyncEngine] = None
|
||||
self._tables: Optional[SearchTables] = None
|
||||
self._property_cache: Dict[str, Any] = {'DB:server_version': 0}
|
||||
|
||||
|
||||
async def setup_database(self) -> None:
|
||||
""" Set up the SQL engine and connections.
|
||||
|
||||
This function will be implicitly called when the database is
|
||||
accessed for the first time. You may also call it explicitly to
|
||||
avoid that the first call is delayed by the setup.
|
||||
"""
|
||||
async with self._engine_lock:
|
||||
if self._engine:
|
||||
return
|
||||
|
||||
extra_args: Dict[str, Any] = {'future': True,
|
||||
'echo': self.config.get_bool('DEBUG_SQL')}
|
||||
|
||||
if self.config.get_int('API_POOL_SIZE') == 0:
|
||||
extra_args['poolclass'] = sa.pool.NullPool
|
||||
else:
|
||||
extra_args['poolclass'] = sa.pool.AsyncAdaptedQueuePool
|
||||
extra_args['max_overflow'] = 0
|
||||
extra_args['pool_size'] = self.config.get_int('API_POOL_SIZE')
|
||||
|
||||
|
||||
is_sqlite = self.config.DATABASE_DSN.startswith('sqlite:')
|
||||
|
||||
if is_sqlite:
|
||||
params = dict((p.split('=', 1)
|
||||
for p in self.config.DATABASE_DSN[7:].split(';')))
|
||||
dburl = sa.engine.URL.create('sqlite+aiosqlite',
|
||||
database=params.get('dbname'))
|
||||
|
||||
if not ('NOMINATIM_DATABASE_RW' in self.config.environ
|
||||
and self.config.get_bool('DATABASE_RW')) \
|
||||
and not Path(params.get('dbname', '')).is_file():
|
||||
raise UsageError(f"SQlite database '{params.get('dbname')}' does not exist.")
|
||||
else:
|
||||
dsn = self.config.get_database_params()
|
||||
query = {k: v for k, v in dsn.items()
|
||||
if k not in ('user', 'password', 'dbname', 'host', 'port')}
|
||||
|
||||
dburl = sa.engine.URL.create(
|
||||
f'postgresql+{PGCORE_LIB}',
|
||||
database=dsn.get('dbname'),
|
||||
username=dsn.get('user'),
|
||||
password=dsn.get('password'),
|
||||
host=dsn.get('host'),
|
||||
port=int(dsn['port']) if 'port' in dsn else None,
|
||||
query=query)
|
||||
|
||||
engine = sa_asyncio.create_async_engine(dburl, **extra_args)
|
||||
|
||||
if is_sqlite:
|
||||
server_version = 0
|
||||
|
||||
@sa.event.listens_for(engine.sync_engine, "connect")
|
||||
def _on_sqlite_connect(dbapi_con: Any, _: Any) -> None:
|
||||
dbapi_con.run_async(lambda conn: conn.enable_load_extension(True))
|
||||
sqlite_functions.install_custom_functions(dbapi_con)
|
||||
cursor = dbapi_con.cursor()
|
||||
cursor.execute("SELECT load_extension('mod_spatialite')")
|
||||
cursor.execute('SELECT SetDecimalPrecision(7)')
|
||||
dbapi_con.run_async(lambda conn: conn.enable_load_extension(False))
|
||||
else:
|
||||
try:
|
||||
async with engine.begin() as conn:
|
||||
result = await conn.scalar(sa.text('SHOW server_version_num'))
|
||||
server_version = int(result)
|
||||
if server_version >= 110000:
|
||||
await conn.execute(sa.text("SET jit_above_cost TO '-1'"))
|
||||
await conn.execute(sa.text(
|
||||
"SET max_parallel_workers_per_gather TO '0'"))
|
||||
except (PGCORE_ERROR, sa.exc.OperationalError):
|
||||
server_version = 0
|
||||
|
||||
if server_version >= 110000:
|
||||
@sa.event.listens_for(engine.sync_engine, "connect")
|
||||
def _on_connect(dbapi_con: Any, _: Any) -> None:
|
||||
cursor = dbapi_con.cursor()
|
||||
cursor.execute("SET jit_above_cost TO '-1'")
|
||||
cursor.execute("SET max_parallel_workers_per_gather TO '0'")
|
||||
|
||||
self._property_cache['DB:server_version'] = server_version
|
||||
|
||||
self._tables = SearchTables(sa.MetaData()) # pylint: disable=no-member
|
||||
self._engine = engine
|
||||
|
||||
|
||||
async def close(self) -> None:
|
||||
""" Close all active connections to the database. The NominatimAPIAsync
|
||||
object remains usable after closing. If a new API functions is
|
||||
called, new connections are created.
|
||||
"""
|
||||
if self._engine is not None:
|
||||
await self._engine.dispose()
|
||||
|
||||
|
||||
@contextlib.asynccontextmanager
|
||||
async def begin(self) -> AsyncIterator[SearchConnection]:
|
||||
""" Create a new connection with automatic transaction handling.
|
||||
|
||||
This function may be used to get low-level access to the database.
|
||||
Refer to the documentation of SQLAlchemy for details how to use
|
||||
the connection object.
|
||||
"""
|
||||
if self._engine is None:
|
||||
await self.setup_database()
|
||||
|
||||
assert self._engine is not None
|
||||
assert self._tables is not None
|
||||
|
||||
async with self._engine.begin() as conn:
|
||||
yield SearchConnection(conn, self._tables, self._property_cache)
|
||||
|
||||
|
||||
async def status(self) -> StatusResult:
|
||||
""" Return the status of the database.
|
||||
"""
|
||||
try:
|
||||
async with self.begin() as conn:
|
||||
conn.set_query_timeout(self.query_timeout)
|
||||
status = await get_status(conn)
|
||||
except (PGCORE_ERROR, sa.exc.OperationalError):
|
||||
return StatusResult(700, 'Database connection failed')
|
||||
|
||||
return status
|
||||
|
||||
|
||||
async def details(self, place: ntyp.PlaceRef, **params: Any) -> Optional[DetailedResult]:
|
||||
""" Get detailed information about a place in the database.
|
||||
|
||||
Returns None if there is no entry under the given ID.
|
||||
"""
|
||||
details = ntyp.LookupDetails.from_kwargs(params)
|
||||
async with self.begin() as conn:
|
||||
conn.set_query_timeout(self.query_timeout)
|
||||
if details.keywords:
|
||||
await make_query_analyzer(conn)
|
||||
return await get_detailed_place(conn, place, details)
|
||||
|
||||
|
||||
async def lookup(self, places: Sequence[ntyp.PlaceRef], **params: Any) -> SearchResults:
|
||||
""" Get simple information about a list of places.
|
||||
|
||||
Returns a list of place information for all IDs that were found.
|
||||
"""
|
||||
details = ntyp.LookupDetails.from_kwargs(params)
|
||||
async with self.begin() as conn:
|
||||
conn.set_query_timeout(self.query_timeout)
|
||||
if details.keywords:
|
||||
await make_query_analyzer(conn)
|
||||
return SearchResults(filter(None,
|
||||
[await get_simple_place(conn, p, details) for p in places]))
|
||||
|
||||
|
||||
async def reverse(self, coord: ntyp.AnyPoint, **params: Any) -> Optional[ReverseResult]:
|
||||
""" Find a place by its coordinates. Also known as reverse geocoding.
|
||||
|
||||
Returns the closest result that can be found or None if
|
||||
no place matches the given criteria.
|
||||
"""
|
||||
# The following negation handles NaN correctly. Don't change.
|
||||
if not abs(coord[0]) <= 180 or not abs(coord[1]) <= 90:
|
||||
# There are no results to be expected outside valid coordinates.
|
||||
return None
|
||||
|
||||
details = ntyp.ReverseDetails.from_kwargs(params)
|
||||
async with self.begin() as conn:
|
||||
conn.set_query_timeout(self.query_timeout)
|
||||
if details.keywords:
|
||||
await make_query_analyzer(conn)
|
||||
geocoder = ReverseGeocoder(conn, details,
|
||||
self.reverse_restrict_to_country_area)
|
||||
return await geocoder.lookup(coord)
|
||||
|
||||
|
||||
async def search(self, query: str, **params: Any) -> SearchResults:
|
||||
""" Find a place by free-text search. Also known as forward geocoding.
|
||||
"""
|
||||
query = query.strip()
|
||||
if not query:
|
||||
raise UsageError('Nothing to search for.')
|
||||
|
||||
async with self.begin() as conn:
|
||||
conn.set_query_timeout(self.query_timeout)
|
||||
geocoder = ForwardGeocoder(conn, ntyp.SearchDetails.from_kwargs(params),
|
||||
self.config.get_int('REQUEST_TIMEOUT') \
|
||||
if self.config.REQUEST_TIMEOUT else None)
|
||||
phrases = [Phrase(PhraseType.NONE, p.strip()) for p in query.split(',')]
|
||||
return await geocoder.lookup(phrases)
|
||||
|
||||
|
||||
# pylint: disable=too-many-arguments,too-many-branches
|
||||
async def search_address(self, amenity: Optional[str] = None,
|
||||
street: Optional[str] = None,
|
||||
city: Optional[str] = None,
|
||||
county: Optional[str] = None,
|
||||
state: Optional[str] = None,
|
||||
country: Optional[str] = None,
|
||||
postalcode: Optional[str] = None,
|
||||
**params: Any) -> SearchResults:
|
||||
""" Find an address using structured search.
|
||||
"""
|
||||
async with self.begin() as conn:
|
||||
conn.set_query_timeout(self.query_timeout)
|
||||
details = ntyp.SearchDetails.from_kwargs(params)
|
||||
|
||||
phrases: List[Phrase] = []
|
||||
|
||||
if amenity:
|
||||
phrases.append(Phrase(PhraseType.AMENITY, amenity))
|
||||
if street:
|
||||
phrases.append(Phrase(PhraseType.STREET, street))
|
||||
if city:
|
||||
phrases.append(Phrase(PhraseType.CITY, city))
|
||||
if county:
|
||||
phrases.append(Phrase(PhraseType.COUNTY, county))
|
||||
if state:
|
||||
phrases.append(Phrase(PhraseType.STATE, state))
|
||||
if postalcode:
|
||||
phrases.append(Phrase(PhraseType.POSTCODE, postalcode))
|
||||
if country:
|
||||
phrases.append(Phrase(PhraseType.COUNTRY, country))
|
||||
|
||||
if not phrases:
|
||||
raise UsageError('Nothing to search for.')
|
||||
|
||||
if amenity or street:
|
||||
details.restrict_min_max_rank(26, 30)
|
||||
elif city:
|
||||
details.restrict_min_max_rank(13, 25)
|
||||
elif county:
|
||||
details.restrict_min_max_rank(10, 12)
|
||||
elif state:
|
||||
details.restrict_min_max_rank(5, 9)
|
||||
elif postalcode:
|
||||
details.restrict_min_max_rank(5, 11)
|
||||
else:
|
||||
details.restrict_min_max_rank(4, 4)
|
||||
|
||||
if 'layers' not in params:
|
||||
details.layers = ntyp.DataLayer.ADDRESS
|
||||
if amenity:
|
||||
details.layers |= ntyp.DataLayer.POI
|
||||
|
||||
geocoder = ForwardGeocoder(conn, details,
|
||||
self.config.get_int('REQUEST_TIMEOUT') \
|
||||
if self.config.REQUEST_TIMEOUT else None)
|
||||
return await geocoder.lookup(phrases)
|
||||
|
||||
|
||||
async def search_category(self, categories: List[Tuple[str, str]],
|
||||
near_query: Optional[str] = None,
|
||||
**params: Any) -> SearchResults:
|
||||
""" Find an object of a certain category near another place.
|
||||
The near place may either be given as an unstructured search
|
||||
query in itself or as coordinates.
|
||||
"""
|
||||
if not categories:
|
||||
return SearchResults()
|
||||
|
||||
details = ntyp.SearchDetails.from_kwargs(params)
|
||||
async with self.begin() as conn:
|
||||
conn.set_query_timeout(self.query_timeout)
|
||||
if near_query:
|
||||
phrases = [Phrase(PhraseType.NONE, p) for p in near_query.split(',')]
|
||||
else:
|
||||
phrases = []
|
||||
if details.keywords:
|
||||
await make_query_analyzer(conn)
|
||||
|
||||
geocoder = ForwardGeocoder(conn, details,
|
||||
self.config.get_int('REQUEST_TIMEOUT') \
|
||||
if self.config.REQUEST_TIMEOUT else None)
|
||||
return await geocoder.lookup_pois(categories, phrases)
|
||||
|
||||
|
||||
|
||||
class NominatimAPI:
|
||||
""" This class provides a thin synchronous wrapper around the asynchronous
|
||||
Nominatim functions. It creates its own event loop and runs each
|
||||
synchronous function call to completion using that loop.
|
||||
"""
|
||||
|
||||
def __init__(self, project_dir: Path,
|
||||
environ: Optional[Mapping[str, str]] = None) -> None:
|
||||
""" Initiate a new frontend object with synchronous API functions.
|
||||
|
||||
Parameters:
|
||||
project_dir: Path to the
|
||||
[project directory](../admin/Import.md#creating-the-project-directory)
|
||||
of the local Nominatim installation.
|
||||
environ: Mapping of [configuration parameters](../customize/Settings.md).
|
||||
When set, replaces any configuration via environment variables.
|
||||
Settings in this mapping also have precedence over any
|
||||
parameters found in the `.env` file of the project directory.
|
||||
"""
|
||||
self._loop = asyncio.new_event_loop()
|
||||
self._async_api = NominatimAPIAsync(project_dir, environ, loop=self._loop)
|
||||
|
||||
|
||||
def close(self) -> None:
|
||||
""" Close all active connections to the database.
|
||||
|
||||
This function also closes the asynchronous worker loop making
|
||||
the NominatimAPI object unusable.
|
||||
"""
|
||||
self._loop.run_until_complete(self._async_api.close())
|
||||
self._loop.close()
|
||||
|
||||
|
||||
@property
|
||||
def config(self) -> Configuration:
|
||||
""" Provide read-only access to the [configuration](#Configuration)
|
||||
used by the API.
|
||||
"""
|
||||
return self._async_api.config
|
||||
|
||||
def status(self) -> StatusResult:
|
||||
""" Return the status of the database as a dataclass object
|
||||
with the fields described below.
|
||||
|
||||
Returns:
|
||||
status(int): A status code as described on the status page.
|
||||
message(str): Either 'OK' or a human-readable message of the
|
||||
problem encountered.
|
||||
software_version(tuple): A tuple with the version of the
|
||||
Nominatim library consisting of (major, minor, patch, db-patch)
|
||||
version.
|
||||
database_version(tuple): A tuple with the version of the library
|
||||
which was used for the import or last migration.
|
||||
Also consists of (major, minor, patch, db-patch).
|
||||
data_updated(datetime): Timestamp with the age of the data.
|
||||
"""
|
||||
return self._loop.run_until_complete(self._async_api.status())
|
||||
|
||||
|
||||
def details(self, place: ntyp.PlaceRef, **params: Any) -> Optional[DetailedResult]:
|
||||
""" Get detailed information about a place in the database.
|
||||
|
||||
The result is a dataclass object with the fields described below
|
||||
or `None` if the place could not be found in the database.
|
||||
|
||||
Parameters:
|
||||
place: Description of the place to look up. See
|
||||
[Place identification](Input-Parameter-Types.md#place-identification)
|
||||
for the various ways to reference a place.
|
||||
|
||||
Other parameters:
|
||||
geometry_output (enum): Add the full geometry of the place to the result.
|
||||
Multiple formats may be selected. Note that geometries can become
|
||||
quite large. (Default: none)
|
||||
geometry_simplification (float): Simplification factor to use on
|
||||
the geometries before returning them. The factor expresses
|
||||
the tolerance in degrees from which the geometry may differ.
|
||||
Topology is preserved. (Default: 0.0)
|
||||
address_details (bool): Add detailed information about the places
|
||||
that make up the address of the requested object. (Default: False)
|
||||
linked_places (bool): Add detailed information about the places
|
||||
that link to the result. (Default: False)
|
||||
parented_places (bool): Add detailed information about all places
|
||||
for which the requested object is a parent, i.e. all places for
|
||||
which the object provides the address details.
|
||||
Only POI places can have parents. (Default: False)
|
||||
keywords (bool): Add detailed information about the search terms
|
||||
used for this place.
|
||||
|
||||
Returns:
|
||||
source_table (enum): Data source of the place. See below for possible values.
|
||||
category (tuple): A tuple of two strings with the primary OSM tag
|
||||
and value.
|
||||
centroid (Point): Point position of the place.
|
||||
place_id (Optional[int]): Internal ID of the place. This ID may differ
|
||||
for the same place between different installations.
|
||||
parent_place_id (Optional(int]): Internal ID of the parent of this
|
||||
place. Only meaning full for POI-like objects (places with a
|
||||
rank_address of 30).
|
||||
linked_place_id (Optional[int]): Internal ID of the place this object
|
||||
links to. When this ID is set then there is no guarantee that
|
||||
the rest of the result information is complete.
|
||||
admin_level (int): Value of the `admin_level` OSM tag. Only meaningful
|
||||
for administrative boundary objects.
|
||||
indexed_date (datetime): Timestamp when the place was last updated.
|
||||
osm_object (Optional[tuple]): OSM type and ID of the place, if available.
|
||||
names (Optional[dict]): Dictionary of names of the place. Keys are
|
||||
usually the corresponding OSM tag keys.
|
||||
address (Optional[dict]): Dictionary of address parts directly
|
||||
attributed to the place. Keys are usually the corresponding
|
||||
OSM tag keys with the `addr:` prefix removed.
|
||||
extratags (Optional[dict]): Dictionary of additional attributes for
|
||||
the place. Usually OSM tag keys and values.
|
||||
housenumber (Optional[str]): House number of the place, normalised
|
||||
for lookup. To get the house number in its original spelling,
|
||||
use `address['housenumber']`.
|
||||
postcode (Optional[str]): Computed postcode for the place. To get
|
||||
directly attributed postcodes, use `address['postcode']` instead.
|
||||
wikipedia (Optional[str]): Reference to a wikipedia site for the place.
|
||||
The string has the format <language code>:<wikipedia title>.
|
||||
rank_address (int): [Address rank](../customize/Ranking.md#address-rank).
|
||||
rank_search (int): [Search rank](../customize/Ranking.md#search-rank).
|
||||
importance (Optional[float]): Relative importance of the place. This is a measure
|
||||
how likely the place will be searched for.
|
||||
country_code (Optional[str]): Country the feature is in as
|
||||
ISO 3166-1 alpha-2 country code.
|
||||
address_rows (Optional[AddressLines]): List of places that make up the
|
||||
computed address. `None` when `address_details` parameter was False.
|
||||
linked_rows (Optional[AddressLines]): List of places that link to the object.
|
||||
`None` when `linked_places` parameter was False.
|
||||
parented_rows (Optional[AddressLines]): List of direct children of the place.
|
||||
`None` when `parented_places` parameter was False.
|
||||
name_keywords (Optional[WordInfos]): List of search words for the name of
|
||||
the place. `None` when `keywords` parameter is set to False.
|
||||
address_keywords (Optional[WordInfos]): List of search word for the address of
|
||||
the place. `None` when `keywords` parameter is set to False.
|
||||
geometry (dict): Dictionary containing the full geometry of the place
|
||||
in the formats requested in the `geometry_output` parameter.
|
||||
"""
|
||||
return self._loop.run_until_complete(self._async_api.details(place, **params))
|
||||
|
||||
|
||||
def lookup(self, places: Sequence[ntyp.PlaceRef], **params: Any) -> SearchResults:
|
||||
""" Get simple information about a list of places.
|
||||
|
||||
Returns a list of place information for all IDs that were found.
|
||||
Each result is a dataclass with the fields detailed below.
|
||||
|
||||
Parameters:
|
||||
places: List of descriptions of the place to look up. See
|
||||
[Place identification](Input-Parameter-Types.md#place-identification)
|
||||
for the various ways to reference a place.
|
||||
|
||||
Other parameters:
|
||||
geometry_output (enum): Add the full geometry of the place to the result.
|
||||
Multiple formats may be selected. Note that geometries can become
|
||||
quite large. (Default: none)
|
||||
geometry_simplification (float): Simplification factor to use on
|
||||
the geometries before returning them. The factor expresses
|
||||
the tolerance in degrees from which the geometry may differ.
|
||||
Topology is preserved. (Default: 0.0)
|
||||
address_details (bool): Add detailed information about the places
|
||||
that make up the address of the requested object. (Default: False)
|
||||
linked_places (bool): Add detailed information about the places
|
||||
that link to the result. (Default: False)
|
||||
parented_places (bool): Add detailed information about all places
|
||||
for which the requested object is a parent, i.e. all places for
|
||||
which the object provides the address details.
|
||||
Only POI places can have parents. (Default: False)
|
||||
keywords (bool): Add detailed information about the search terms
|
||||
used for this place.
|
||||
|
||||
Returns:
|
||||
source_table (enum): Data source of the place. See below for possible values.
|
||||
category (tuple): A tuple of two strings with the primary OSM tag
|
||||
and value.
|
||||
centroid (Point): Point position of the place.
|
||||
place_id (Optional[int]): Internal ID of the place. This ID may differ
|
||||
for the same place between different installations.
|
||||
osm_object (Optional[tuple]): OSM type and ID of the place, if available.
|
||||
names (Optional[dict]): Dictionary of names of the place. Keys are
|
||||
usually the corresponding OSM tag keys.
|
||||
address (Optional[dict]): Dictionary of address parts directly
|
||||
attributed to the place. Keys are usually the corresponding
|
||||
OSM tag keys with the `addr:` prefix removed.
|
||||
extratags (Optional[dict]): Dictionary of additional attributes for
|
||||
the place. Usually OSM tag keys and values.
|
||||
housenumber (Optional[str]): House number of the place, normalised
|
||||
for lookup. To get the house number in its original spelling,
|
||||
use `address['housenumber']`.
|
||||
postcode (Optional[str]): Computed postcode for the place. To get
|
||||
directly attributed postcodes, use `address['postcode']` instead.
|
||||
wikipedia (Optional[str]): Reference to a wikipedia site for the place.
|
||||
The string has the format <language code>:<wikipedia title>.
|
||||
rank_address (int): [Address rank](../customize/Ranking.md#address-rank).
|
||||
rank_search (int): [Search rank](../customize/Ranking.md#search-rank).
|
||||
importance (Optional[float]): Relative importance of the place. This is a measure
|
||||
how likely the place will be searched for.
|
||||
country_code (Optional[str]): Country the feature is in as
|
||||
ISO 3166-1 alpha-2 country code.
|
||||
address_rows (Optional[AddressLines]): List of places that make up the
|
||||
computed address. `None` when `address_details` parameter was False.
|
||||
linked_rows (Optional[AddressLines]): List of places that link to the object.
|
||||
`None` when `linked_places` parameter was False.
|
||||
parented_rows (Optional[AddressLines]): List of direct children of the place.
|
||||
`None` when `parented_places` parameter was False.
|
||||
name_keywords (Optional[WordInfos]): List of search words for the name of
|
||||
the place. `None` when `keywords` parameter is set to False.
|
||||
address_keywords (Optional[WordInfos]): List of search word for the address of
|
||||
the place. `None` when `keywords` parameter is set to False.
|
||||
bbox (Bbox): Bounding box of the full geometry of the place.
|
||||
If the place is a single point, then the size of the bounding
|
||||
box is guessed according to the type of place.
|
||||
geometry (dict): Dictionary containing the full geometry of the place
|
||||
in the formats requested in the `geometry_output` parameter.
|
||||
"""
|
||||
return self._loop.run_until_complete(self._async_api.lookup(places, **params))
|
||||
|
||||
|
||||
def reverse(self, coord: ntyp.AnyPoint, **params: Any) -> Optional[ReverseResult]:
|
||||
""" Find a place by its coordinates. Also known as reverse geocoding.
|
||||
|
||||
Returns the closest result that can be found or `None` if
|
||||
no place matches the given criteria. The result is a dataclass
|
||||
with the fields as detailed below.
|
||||
|
||||
Parameters:
|
||||
coord: Coordinate to lookup the place for as a Point
|
||||
or a tuple (x, y). Must be in WGS84 projection.
|
||||
|
||||
Other parameters:
|
||||
max_rank (int): Highest address rank to return. Can be used to
|
||||
restrict search to streets or settlements.
|
||||
layers (enum): Defines the kind of data to take into account.
|
||||
See description of layers below. (Default: addresses and POIs)
|
||||
geometry_output (enum): Add the full geometry of the place to the result.
|
||||
Multiple formats may be selected. Note that geometries can become
|
||||
quite large. (Default: none)
|
||||
geometry_simplification (float): Simplification factor to use on
|
||||
the geometries before returning them. The factor expresses
|
||||
the tolerance in degrees from which the geometry may differ.
|
||||
Topology is preserved. (Default: 0.0)
|
||||
address_details (bool): Add detailed information about the places
|
||||
that make up the address of the requested object. (Default: False)
|
||||
linked_places (bool): Add detailed information about the places
|
||||
that link to the result. (Default: False)
|
||||
parented_places (bool): Add detailed information about all places
|
||||
for which the requested object is a parent, i.e. all places for
|
||||
which the object provides the address details.
|
||||
Only POI places can have parents. (Default: False)
|
||||
keywords (bool): Add detailed information about the search terms
|
||||
used for this place.
|
||||
|
||||
Returns:
|
||||
source_table (enum): Data source of the place. See below for possible values.
|
||||
category (tuple): A tuple of two strings with the primary OSM tag
|
||||
and value.
|
||||
centroid (Point): Point position of the place.
|
||||
place_id (Optional[int]): Internal ID of the place. This ID may differ
|
||||
for the same place between different installations.
|
||||
osm_object (Optional[tuple]): OSM type and ID of the place, if available.
|
||||
names (Optional[dict]): Dictionary of names of the place. Keys are
|
||||
usually the corresponding OSM tag keys.
|
||||
address (Optional[dict]): Dictionary of address parts directly
|
||||
attributed to the place. Keys are usually the corresponding
|
||||
OSM tag keys with the `addr:` prefix removed.
|
||||
extratags (Optional[dict]): Dictionary of additional attributes for
|
||||
the place. Usually OSM tag keys and values.
|
||||
housenumber (Optional[str]): House number of the place, normalised
|
||||
for lookup. To get the house number in its original spelling,
|
||||
use `address['housenumber']`.
|
||||
postcode (Optional[str]): Computed postcode for the place. To get
|
||||
directly attributed postcodes, use `address['postcode']` instead.
|
||||
wikipedia (Optional[str]): Reference to a wikipedia site for the place.
|
||||
The string has the format <language code>:<wikipedia title>.
|
||||
rank_address (int): [Address rank](../customize/Ranking.md#address-rank).
|
||||
rank_search (int): [Search rank](../customize/Ranking.md#search-rank).
|
||||
importance (Optional[float]): Relative importance of the place. This is a measure
|
||||
how likely the place will be searched for.
|
||||
country_code (Optional[str]): Country the feature is in as
|
||||
ISO 3166-1 alpha-2 country code.
|
||||
address_rows (Optional[AddressLines]): List of places that make up the
|
||||
computed address. `None` when `address_details` parameter was False.
|
||||
linked_rows (Optional[AddressLines]): List of places that link to the object.
|
||||
`None` when `linked_places` parameter was False.
|
||||
parented_rows (Optional[AddressLines]): List of direct children of the place.
|
||||
`None` when `parented_places` parameter was False.
|
||||
name_keywords (Optional[WordInfos]): List of search words for the name of
|
||||
the place. `None` when `keywords` parameter is set to False.
|
||||
address_keywords (Optional[WordInfos]): List of search word for the address of
|
||||
the place. `None` when `keywords` parameter is set to False.
|
||||
bbox (Bbox): Bounding box of the full geometry of the place.
|
||||
If the place is a single point, then the size of the bounding
|
||||
box is guessed according to the type of place.
|
||||
geometry (dict): Dictionary containing the full geometry of the place
|
||||
in the formats requested in the `geometry_output` parameter.
|
||||
distance (Optional[float]): Distance in degree from the input point.
|
||||
"""
|
||||
return self._loop.run_until_complete(self._async_api.reverse(coord, **params))
|
||||
|
||||
|
||||
def search(self, query: str, **params: Any) -> SearchResults:
|
||||
""" Find a place by free-text search. Also known as forward geocoding.
|
||||
|
||||
Parameters:
|
||||
query: Free-form text query searching for a place.
|
||||
|
||||
Other parameters:
|
||||
max_results (int): Maximum number of results to return. The
|
||||
actual number of results may be less. (Default: 10)
|
||||
min_rank (int): Lowest permissible rank for the result.
|
||||
For addressable places this is the minimum
|
||||
[address rank](../customize/Ranking.md#address-rank). For all
|
||||
other places the [search rank](../customize/Ranking.md#search-rank)
|
||||
is used.
|
||||
max_rank (int): Highest permissible rank for the result. See min_rank above.
|
||||
layers (enum): Defines the kind of data to take into account.
|
||||
See [layers section](Input-Parameter-Types.md#layers) for details.
|
||||
(Default: addresses and POIs)
|
||||
countries (list[str]): Restrict search to countries with the given
|
||||
ISO 3166-1 alpha-2 country code. An empty list (the default)
|
||||
disables this filter.
|
||||
excluded (list[int]): A list of internal IDs of places to exclude
|
||||
from the search.
|
||||
viewbox (Optional[Bbox]): Bounding box of an area to focus search on.
|
||||
bounded_viewbox (bool): Consider the bounding box given in `viewbox`
|
||||
as a filter and return only results within the bounding box.
|
||||
near (Optional[Point]): Focus search around the given point and
|
||||
return results ordered by distance to the given point.
|
||||
near_radius (Optional[float]): Restrict results to results within
|
||||
the given distance in degrees of `near` point. Ignored, when
|
||||
`near` is not set.
|
||||
categories (list[tuple]): Restrict search to places of the given
|
||||
categories. The category is the main OSM tag assigned to each
|
||||
place. An empty list (the default) disables this filter.
|
||||
geometry_output (enum): Add the full geometry of the place to the result.
|
||||
Multiple formats may be selected. Note that geometries can become
|
||||
quite large. (Default: none)
|
||||
geometry_simplification (float): Simplification factor to use on
|
||||
the geometries before returning them. The factor expresses
|
||||
the tolerance in degrees from which the geometry may differ.
|
||||
Topology is preserved. (Default: 0.0)
|
||||
address_details (bool): Add detailed information about the places
|
||||
that make up the address of the requested object. (Default: False)
|
||||
linked_places (bool): Add detailed information about the places
|
||||
that link to the result. (Default: False)
|
||||
parented_places (bool): Add detailed information about all places
|
||||
for which the requested object is a parent, i.e. all places for
|
||||
which the object provides the address details.
|
||||
Only POI places can have parents. (Default: False)
|
||||
keywords (bool): Add detailed information about the search terms
|
||||
used for this place.
|
||||
|
||||
Returns:
|
||||
source_table (enum): Data source of the place. See below for possible values.
|
||||
category (tuple): A tuple of two strings with the primary OSM tag
|
||||
and value.
|
||||
centroid (Point): Point position of the place.
|
||||
place_id (Optional[int]): Internal ID of the place. This ID may differ
|
||||
for the same place between different installations.
|
||||
osm_object (Optional[tuple]): OSM type and ID of the place, if available.
|
||||
names (Optional[dict]): Dictionary of names of the place. Keys are
|
||||
usually the corresponding OSM tag keys.
|
||||
address (Optional[dict]): Dictionary of address parts directly
|
||||
attributed to the place. Keys are usually the corresponding
|
||||
OSM tag keys with the `addr:` prefix removed.
|
||||
extratags (Optional[dict]): Dictionary of additional attributes for
|
||||
the place. Usually OSM tag keys and values.
|
||||
housenumber (Optional[str]): House number of the place, normalised
|
||||
for lookup. To get the house number in its original spelling,
|
||||
use `address['housenumber']`.
|
||||
postcode (Optional[str]): Computed postcode for the place. To get
|
||||
directly attributed postcodes, use `address['postcode']` instead.
|
||||
wikipedia (Optional[str]): Reference to a wikipedia site for the place.
|
||||
The string has the format <language code>:<wikipedia title>.
|
||||
rank_address (int): [Address rank](../customize/Ranking.md#address-rank).
|
||||
rank_search (int): [Search rank](../customize/Ranking.md#search-rank).
|
||||
importance (Optional[float]): Relative importance of the place. This is a measure
|
||||
how likely the place will be searched for.
|
||||
country_code (Optional[str]): Country the feature is in as
|
||||
ISO 3166-1 alpha-2 country code.
|
||||
address_rows (Optional[AddressLines]): List of places that make up the
|
||||
computed address. `None` when `address_details` parameter was False.
|
||||
linked_rows (Optional[AddressLines]): List of places that link to the object.
|
||||
`None` when `linked_places` parameter was False.
|
||||
parented_rows (Optional[AddressLines]): List of direct children of the place.
|
||||
`None` when `parented_places` parameter was False.
|
||||
name_keywords (Optional[WordInfos]): List of search words for the name of
|
||||
the place. `None` when `keywords` parameter is set to False.
|
||||
address_keywords (Optional[WordInfos]): List of search word for the address of
|
||||
the place. `None` when `keywords` parameter is set to False.
|
||||
bbox (Bbox): Bounding box of the full geometry of the place.
|
||||
If the place is a single point, then the size of the bounding
|
||||
box is guessed according to the type of place.
|
||||
geometry (dict): Dictionary containing the full geometry of the place
|
||||
in the formats requested in the `geometry_output` parameter.
|
||||
"""
|
||||
return self._loop.run_until_complete(
|
||||
self._async_api.search(query, **params))
|
||||
|
||||
|
||||
# pylint: disable=too-many-arguments
|
||||
def search_address(self, amenity: Optional[str] = None,
|
||||
street: Optional[str] = None,
|
||||
city: Optional[str] = None,
|
||||
county: Optional[str] = None,
|
||||
state: Optional[str] = None,
|
||||
country: Optional[str] = None,
|
||||
postalcode: Optional[str] = None,
|
||||
**params: Any) -> SearchResults:
|
||||
""" Find an address using structured search.
|
||||
|
||||
Parameters:
|
||||
amenity: Name of a POI.
|
||||
street: Street and optionally housenumber of the address. If the address
|
||||
does not have a street, then the place the housenumber references to.
|
||||
city: Postal city of the address.
|
||||
county: County equivalent of the address. Does not exist in all
|
||||
jurisdictions.
|
||||
state: State or province of the address.
|
||||
country: Country with its full name or its ISO 3166-1 alpha-2 country code.
|
||||
Do not use together with the country_code filter.
|
||||
postalcode: Post code or ZIP for the place.
|
||||
|
||||
Other parameters:
|
||||
max_results (int): Maximum number of results to return. The
|
||||
actual number of results may be less. (Default: 10)
|
||||
min_rank (int): Lowest permissible rank for the result.
|
||||
For addressable places this is the minimum
|
||||
[address rank](../customize/Ranking.md#address-rank). For all
|
||||
other places the [search rank](../customize/Ranking.md#search-rank)
|
||||
is used.
|
||||
max_rank (int): Highest permissible rank for the result. See min_rank above.
|
||||
layers (enum): Defines the kind of data to take into account.
|
||||
See [layers section](Input-Parameter-Types.md#layers) for details.
|
||||
(Default: addresses and POIs)
|
||||
countries (list[str]): Restrict search to countries with the given
|
||||
ISO 3166-1 alpha-2 country code. An empty list (the default)
|
||||
disables this filter. Do not use, when the country parameter
|
||||
is used.
|
||||
excluded (list[int]): A list of internal IDs of places to exclude
|
||||
from the search.
|
||||
viewbox (Optional[Bbox]): Bounding box of an area to focus search on.
|
||||
bounded_viewbox (bool): Consider the bounding box given in `viewbox`
|
||||
as a filter and return only results within the bounding box.
|
||||
near (Optional[Point]): Focus search around the given point and
|
||||
return results ordered by distance to the given point.
|
||||
near_radius (Optional[float]): Restrict results to results within
|
||||
the given distance in degrees of `near` point. Ignored, when
|
||||
`near` is not set.
|
||||
categories (list[tuple]): Restrict search to places of the given
|
||||
categories. The category is the main OSM tag assigned to each
|
||||
place. An empty list (the default) disables this filter.
|
||||
geometry_output (enum): Add the full geometry of the place to the result.
|
||||
Multiple formats may be selected. Note that geometries can become
|
||||
quite large. (Default: none)
|
||||
geometry_simplification (float): Simplification factor to use on
|
||||
the geometries before returning them. The factor expresses
|
||||
the tolerance in degrees from which the geometry may differ.
|
||||
Topology is preserved. (Default: 0.0)
|
||||
address_details (bool): Add detailed information about the places
|
||||
that make up the address of the requested object. (Default: False)
|
||||
linked_places (bool): Add detailed information about the places
|
||||
that link to the result. (Default: False)
|
||||
parented_places (bool): Add detailed information about all places
|
||||
for which the requested object is a parent, i.e. all places for
|
||||
which the object provides the address details.
|
||||
Only POI places can have parents. (Default: False)
|
||||
keywords (bool): Add detailed information about the search terms
|
||||
used for this place.
|
||||
|
||||
Returns:
|
||||
source_table (enum): Data source of the place. See below for possible values.
|
||||
category (tuple): A tuple of two strings with the primary OSM tag
|
||||
and value.
|
||||
centroid (Point): Point position of the place.
|
||||
place_id (Optional[int]): Internal ID of the place. This ID may differ
|
||||
for the same place between different installations.
|
||||
osm_object (Optional[tuple]): OSM type and ID of the place, if available.
|
||||
names (Optional[dict]): Dictionary of names of the place. Keys are
|
||||
usually the corresponding OSM tag keys.
|
||||
address (Optional[dict]): Dictionary of address parts directly
|
||||
attributed to the place. Keys are usually the corresponding
|
||||
OSM tag keys with the `addr:` prefix removed.
|
||||
extratags (Optional[dict]): Dictionary of additional attributes for
|
||||
the place. Usually OSM tag keys and values.
|
||||
housenumber (Optional[str]): House number of the place, normalised
|
||||
for lookup. To get the house number in its original spelling,
|
||||
use `address['housenumber']`.
|
||||
postcode (Optional[str]): Computed postcode for the place. To get
|
||||
directly attributed postcodes, use `address['postcode']` instead.
|
||||
wikipedia (Optional[str]): Reference to a wikipedia site for the place.
|
||||
The string has the format <language code>:<wikipedia title>.
|
||||
rank_address (int): [Address rank](../customize/Ranking.md#address-rank).
|
||||
rank_search (int): [Search rank](../customize/Ranking.md#search-rank).
|
||||
importance (Optional[float]): Relative importance of the place. This is a measure
|
||||
how likely the place will be searched for.
|
||||
country_code (Optional[str]): Country the feature is in as
|
||||
ISO 3166-1 alpha-2 country code.
|
||||
address_rows (Optional[AddressLines]): List of places that make up the
|
||||
computed address. `None` when `address_details` parameter was False.
|
||||
linked_rows (Optional[AddressLines]): List of places that link to the object.
|
||||
`None` when `linked_places` parameter was False.
|
||||
parented_rows (Optional[AddressLines]): List of direct children of the place.
|
||||
`None` when `parented_places` parameter was False.
|
||||
name_keywords (Optional[WordInfos]): List of search words for the name of
|
||||
the place. `None` when `keywords` parameter is set to False.
|
||||
address_keywords (Optional[WordInfos]): List of search word for the address of
|
||||
the place. `None` when `keywords` parameter is set to False.
|
||||
bbox (Bbox): Bounding box of the full geometry of the place.
|
||||
If the place is a single point, then the size of the bounding
|
||||
box is guessed according to the type of place.
|
||||
geometry (dict): Dictionary containing the full geometry of the place
|
||||
in the formats requested in the `geometry_output` parameter.
|
||||
"""
|
||||
return self._loop.run_until_complete(
|
||||
self._async_api.search_address(amenity, street, city, county,
|
||||
state, country, postalcode, **params))
|
||||
|
||||
|
||||
def search_category(self, categories: List[Tuple[str, str]],
|
||||
near_query: Optional[str] = None,
|
||||
**params: Any) -> SearchResults:
|
||||
""" Find an object of a certain category near another place.
|
||||
|
||||
The near place may either be given as an unstructured search
|
||||
query in itself or as a geographic area through the
|
||||
viewbox or near parameters.
|
||||
|
||||
Parameters:
|
||||
categories: Restrict search to places of the given
|
||||
categories. The category is the main OSM tag assigned to each
|
||||
place.
|
||||
near_query: Optional free-text query to define the are to
|
||||
restrict search to.
|
||||
|
||||
Other parameters:
|
||||
max_results (int): Maximum number of results to return. The
|
||||
actual number of results may be less. (Default: 10)
|
||||
min_rank (int): Lowest permissible rank for the result.
|
||||
For addressable places this is the minimum
|
||||
[address rank](../customize/Ranking.md#address-rank). For all
|
||||
other places the [search rank](../customize/Ranking.md#search-rank)
|
||||
is used.
|
||||
max_rank (int): Highest permissible rank for the result. See min_rank above.
|
||||
layers (enum): Defines the kind of data to take into account.
|
||||
See [layers section](Input-Parameter-Types.md#layers) for details.
|
||||
(Default: addresses and POIs)
|
||||
countries (list[str]): Restrict search to countries with the given
|
||||
ISO 3166-1 alpha-2 country code. An empty list (the default)
|
||||
disables this filter.
|
||||
excluded (list[int]): A list of internal IDs of places to exclude
|
||||
from the search.
|
||||
viewbox (Optional[Bbox]): Bounding box of an area to focus search on.
|
||||
bounded_viewbox (bool): Consider the bounding box given in `viewbox`
|
||||
as a filter and return only results within the bounding box.
|
||||
near (Optional[Point]): Focus search around the given point and
|
||||
return results ordered by distance to the given point.
|
||||
near_radius (Optional[float]): Restrict results to results within
|
||||
the given distance in degrees of `near` point. Ignored, when
|
||||
`near` is not set.
|
||||
geometry_output (enum): Add the full geometry of the place to the result.
|
||||
Multiple formats may be selected. Note that geometries can become
|
||||
                quite large. (Default: none)
            geometry_simplification (float): Simplification factor to use on
                the geometries before returning them. The factor expresses
                the tolerance in degrees from which the geometry may differ.
                Topology is preserved. (Default: 0.0)
            address_details (bool): Add detailed information about the places
                that make up the address of the requested object. (Default: False)
            linked_places (bool): Add detailed information about the places
                that link to the result. (Default: False)
            parented_places (bool): Add detailed information about all places
                for which the requested object is a parent, i.e. all places for
                which the object provides the address details.
                Only POI places can have parents. (Default: False)
            keywords (bool): Add detailed information about the search terms
                used for this place. (Default: False)

        Returns:
            source_table (enum): Data source of the place. See below for possible values.
            category (tuple): A tuple of two strings with the primary OSM tag
                and value.
            centroid (Point): Point position of the place.
            place_id (Optional[int]): Internal ID of the place. This ID may differ
                for the same place between different installations.
            osm_object (Optional[tuple]): OSM type and ID of the place, if available.
            names (Optional[dict]): Dictionary of names of the place. Keys are
                usually the corresponding OSM tag keys.
            address (Optional[dict]): Dictionary of address parts directly
                attributed to the place. Keys are usually the corresponding
                OSM tag keys with the `addr:` prefix removed.
            extratags (Optional[dict]): Dictionary of additional attributes for
                the place. Usually OSM tag keys and values.
            housenumber (Optional[str]): House number of the place, normalised
                for lookup. To get the house number in its original spelling,
                use `address['housenumber']`.
            postcode (Optional[str]): Computed postcode for the place. To get
                directly attributed postcodes, use `address['postcode']` instead.
            wikipedia (Optional[str]): Reference to a wikipedia site for the place.
                The string has the format <language code>:<wikipedia title>.
            rank_address (int): [Address rank](../customize/Ranking.md#address-rank).
            rank_search (int): [Search rank](../customize/Ranking.md#search-rank).
            importance (Optional[float]): Relative importance of the place.
                This is a measure of how likely the place will be searched for.
            country_code (Optional[str]): Country the feature is in as
                ISO 3166-1 alpha-2 country code.
            address_rows (Optional[AddressLines]): List of places that make up the
                computed address. `None` when `address_details` parameter was False.
            linked_rows (Optional[AddressLines]): List of places that link to the object.
                `None` when `linked_places` parameter was False.
            parented_rows (Optional[AddressLines]): List of direct children of the place.
                `None` when `parented_places` parameter was False.
            name_keywords (Optional[WordInfos]): List of search words for the name of
                the place. `None` when `keywords` parameter is set to False.
            address_keywords (Optional[WordInfos]): List of search words for the address of
                the place. `None` when `keywords` parameter is set to False.
            bbox (Bbox): Bounding box of the full geometry of the place.
                If the place is a single point, then the size of the bounding
                box is guessed according to the type of place.
            geometry (dict): Dictionary containing the full geometry of the place
                in the formats requested in the `geometry_output` parameter.
        """
        return self._loop.run_until_complete(
            self._async_api.search_category(categories, near_query, **params))
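For orientation, a minimal sketch of how this synchronous wrapper might be used; the project directory, the category list and the query string are illustrative assumptions and not part of the diff:

from pathlib import Path
from nominatim_api import NominatimAPI

api = NominatimAPI(Path('.'))    # project directory of a local installation (assumed)
results = api.search_category([('amenity', 'cafe')], near_query='Berlin',
                              address_details=True)
for place in results:
    print(place.centroid, place.locale_name)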
97
src/nominatim_api/localization.py
Normal file
@@ -0,0 +1,97 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Helper functions for localizing names of results.
"""
from typing import Mapping, List, Optional

import re


class Locales:
    """ Helper class for localization of names.

        It takes a list of language prefixes in their order of preferred
        usage.
    """

    def __init__(self, langs: Optional[List[str]] = None):
        self.languages = langs or []
        self.name_tags: List[str] = []

        # Build the list of supported tags. It is currently hard-coded.
        self._add_lang_tags('name')
        self._add_tags('name', 'brand')
        self._add_lang_tags('official_name', 'short_name')
        self._add_tags('official_name', 'short_name', 'ref')


    def __bool__(self) -> bool:
        return len(self.languages) > 0


    def _add_tags(self, *tags: str) -> None:
        for tag in tags:
            self.name_tags.append(tag)
            self.name_tags.append(f"_place_{tag}")


    def _add_lang_tags(self, *tags: str) -> None:
        for tag in tags:
            for lang in self.languages:
                self.name_tags.append(f"{tag}:{lang}")
                self.name_tags.append(f"_place_{tag}:{lang}")


    def display_name(self, names: Optional[Mapping[str, str]]) -> str:
        """ Return the best matching name from a dictionary of names
            containing different name variants.

            If 'names' is null or empty, an empty string is returned. If no
            appropriate localization is found, the first name is returned.
        """
        if not names:
            return ''

        if len(names) > 1:
            for tag in self.name_tags:
                if tag in names:
                    return names[tag]

        # Nothing? Return any of the other names as a default.
        return next(iter(names.values()))


    @staticmethod
    def from_accept_languages(langstr: str) -> 'Locales':
        """ Create a localization object from a language list in the
            format of the HTTP Accept-Language header.

            The function tries to be forgiving of format errors by first splitting
            the string into comma-separated parts and then parsing each
            description separately. Badly formatted parts are then ignored.
        """
        # split string into languages
        candidates = []
        for desc in langstr.split(','):
            m = re.fullmatch(r'\s*([a-z_-]+)(?:;\s*q\s*=\s*([01](?:\.\d+)?))?\s*',
                             desc, flags=re.I)
            if m:
                candidates.append((m[1], float(m[2] or 1.0)))

        # sort the results by the weight of each language (preserving order).
        candidates.sort(reverse=True, key=lambda e: e[1])

        # If a language has a region variant, also add the language without
        # the variant, but only if it isn't already in the list, so as not
        # to mess up the weights.
        languages = []
        for lid, _ in candidates:
            languages.append(lid)
            parts = lid.split('-', 1)
            if len(parts) > 1 and all(c[0] != parts[0] for c in candidates):
                languages.append(parts[0])

        return Locales(languages)
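A quick illustration of the behaviour of Locales, derived from the code above; the tag values are made up:

locales = Locales.from_accept_languages('de;q=0.8, en-GB, fr;q=0.5')
# Preference order becomes ['en-GB', 'en', 'de', 'fr']: the bare 'en' is
# appended as a fallback for the regional variant 'en-GB'.
locales.display_name({'name': 'München', 'name:en': 'Munich'})   # -> 'Munich'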
433
src/nominatim_api/logging.py
Normal file
@@ -0,0 +1,433 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Functions for specialised logging with HTML output.
"""
from typing import Any, Iterator, Optional, List, Tuple, cast, Union, Mapping, Sequence
from contextvars import ContextVar
import datetime as dt
import textwrap
import io
import re
import html

import sqlalchemy as sa
from sqlalchemy.ext.asyncio import AsyncConnection

try:
    from pygments import highlight
    from pygments.lexers import PythonLexer, PostgresLexer
    from pygments.formatters import HtmlFormatter
    CODE_HIGHLIGHT = True
except ModuleNotFoundError:
    CODE_HIGHLIGHT = False


def _debug_name(res: Any) -> str:
    if res.names:
        return cast(str, res.names.get('name', next(iter(res.names.values()))))

    return f"Hnr {res.housenumber}" if res.housenumber is not None else '[NONE]'


class BaseLogger:
    """ Interface for logging functions.

        The base implementation does nothing. Overwrite the functions
        in derived classes which implement logging functionality.
    """
    def get_buffer(self) -> str:
        """ Return the current content of the log buffer.
        """
        return ''

    def function(self, func: str, **kwargs: Any) -> None:
        """ Start a new debug chapter for the given function and its parameters.
        """


    def section(self, heading: str) -> None:
        """ Start a new section with the given title.
        """


    def comment(self, text: str) -> None:
        """ Add a simple comment to the debug output.
        """


    def var_dump(self, heading: str, var: Any) -> None:
        """ Print the content of the variable to the debug output prefixed by
            the given heading.
        """


    def table_dump(self, heading: str, rows: Iterator[Optional[List[Any]]]) -> None:
        """ Print the table generated by the generator function.
        """


    def result_dump(self, heading: str, results: Iterator[Tuple[Any, Any]]) -> None:
        """ Print a list of search results generated by the generator function.
        """


    def sql(self, conn: AsyncConnection, statement: 'sa.Executable',
            params: Union[Mapping[str, Any], Sequence[Mapping[str, Any]], None]) -> None:
        """ Print the SQL for the given statement.
        """

    def format_sql(self, conn: AsyncConnection, statement: 'sa.Executable',
                   extra_params: Union[Mapping[str, Any],
                                       Sequence[Mapping[str, Any]], None]) -> str:
        """ Return the compiled version of the statement.
        """
        compiled = cast('sa.ClauseElement', statement).compile(conn.sync_engine)

        params = dict(compiled.params)
        if isinstance(extra_params, Mapping):
            for k, v in extra_params.items():
                if hasattr(v, 'to_wkt'):
                    params[k] = v.to_wkt()
                elif isinstance(v, (int, float)):
                    params[k] = v
                else:
                    params[k] = str(v)
        elif isinstance(extra_params, Sequence) and extra_params:
            for k in extra_params[0]:
                params[k] = f':{k}'

        sqlstr = str(compiled)

        if conn.dialect.name == 'postgresql':
            if sa.__version__.startswith('1'):
                try:
                    sqlstr = re.sub(r'__\[POSTCOMPILE_[^]]*\]', '%s', sqlstr)
                    return sqlstr % tuple((repr(params.get(name, None))
                                           for name in compiled.positiontup)) # type: ignore
                except TypeError:
                    return sqlstr

            # Fixes an odd issue with Python 3.7 where percentages are not
            # quoted correctly.
            sqlstr = re.sub(r'%(?!\()', '%%', sqlstr)
            sqlstr = re.sub(r'__\[POSTCOMPILE_([^]]*)\]', r'%(\1)s', sqlstr)
            return sqlstr % params

        assert conn.dialect.name == 'sqlite'

        # params in positional order
        pparams = (repr(params.get(name, None)) for name in compiled.positiontup) # type: ignore

        sqlstr = re.sub(r'__\[POSTCOMPILE_([^]]*)\]', '?', sqlstr)
        sqlstr = re.sub(r"\?", lambda m: next(pparams), sqlstr)

        return sqlstr
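    # Note on the substitutions above: the __[POSTCOMPILE_x] markers are
    # SQLAlchemy's placeholders for late-expanded parameters such as IN
    # lists; splicing the parameter values back into the statement text
    # here is purely for readable debug output, not for execution.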

class HTMLLogger(BaseLogger):
    """ Logger that formats messages in HTML.
    """
    def __init__(self) -> None:
        self.buffer = io.StringIO()


    def _timestamp(self) -> None:
        self._write(f'<p class="timestamp">[{dt.datetime.now()}]</p>')


    def get_buffer(self) -> str:
        return HTML_HEADER + self.buffer.getvalue() + HTML_FOOTER


    def function(self, func: str, **kwargs: Any) -> None:
        self._timestamp()
        self._write(f"<h1>Debug output for {func}()</h1>\n<p>Parameters:<dl>")
        for name, value in kwargs.items():
            self._write(f'<dt>{name}</dt><dd>{self._python_var(value)}</dd>')
        self._write('</dl></p>')


    def section(self, heading: str) -> None:
        self._timestamp()
        self._write(f"<h2>{heading}</h2>")


    def comment(self, text: str) -> None:
        self._timestamp()
        self._write(f"<p>{text}</p>")


    def var_dump(self, heading: str, var: Any) -> None:
        self._timestamp()
        if callable(var):
            var = var()

        self._write(f'<h5>{heading}</h5>{self._python_var(var)}')


    def table_dump(self, heading: str, rows: Iterator[Optional[List[Any]]]) -> None:
        self._timestamp()
        head = next(rows)
        assert head
        self._write(f'<table><thead><tr><th colspan="{len(head)}">{heading}</th></tr><tr>')
        for cell in head:
            self._write(f'<th>{cell}</th>')
        self._write('</tr></thead><tbody>')
        for row in rows:
            if row is not None:
                self._write('<tr>')
                for cell in row:
                    self._write(f'<td>{cell}</td>')
                self._write('</tr>')
        self._write('</tbody></table>')


    def result_dump(self, heading: str, results: Iterator[Tuple[Any, Any]]) -> None:
        """ Print a list of search results generated by the generator function.
        """
        self._timestamp()
        def format_osm(osm_object: Optional[Tuple[str, int]]) -> str:
            if not osm_object:
                return '-'

            t, i = osm_object
            if t == 'N':
                fullt = 'node'
            elif t == 'W':
                fullt = 'way'
            elif t == 'R':
                fullt = 'relation'
            else:
                return f'{t}{i}'

            return f'<a href="https://www.openstreetmap.org/{fullt}/{i}">{t}{i}</a>'

        self._write(f'<h5>{heading}</h5><p><dl>')
        total = 0
        for rank, res in results:
            self._write(f'<dt>[{rank:.3f}]</dt> <dd>{res.source_table.name}(')
            self._write(f"{_debug_name(res)}, type=({','.join(res.category)}), ")
            self._write(f"rank={res.rank_address}, ")
            self._write(f"osm={format_osm(res.osm_object)}, ")
            self._write(f'cc={res.country_code}, ')
            self._write(f'importance={res.importance or float("nan"):.5f})</dd>')
            total += 1
        self._write(f'</dl><b>TOTAL:</b> {total}</p>')


    def sql(self, conn: AsyncConnection, statement: 'sa.Executable',
            params: Union[Mapping[str, Any], Sequence[Mapping[str, Any]], None]) -> None:
        self._timestamp()
        sqlstr = self.format_sql(conn, statement, params)
        if CODE_HIGHLIGHT:
            sqlstr = highlight(sqlstr, PostgresLexer(),
                               HtmlFormatter(nowrap=True, lineseparator='<br />'))
            self._write(f'<div class="highlight"><code class="lang-sql">{sqlstr}</code></div>')
        else:
            self._write(f'<code class="lang-sql">{html.escape(sqlstr)}</code>')


    def _python_var(self, var: Any) -> str:
        if CODE_HIGHLIGHT:
            fmt = highlight(str(var), PythonLexer(), HtmlFormatter(nowrap=True))
            return f'<div class="highlight"><code class="lang-python">{fmt}</code></div>'

        return f'<code class="lang-python">{html.escape(str(var))}</code>'


    def _write(self, text: str) -> None:
        """ Add the raw text to the debug output.
        """
        self.buffer.write(text)


class TextLogger(BaseLogger):
    """ Logger creating output suitable for the console.
    """
    def __init__(self) -> None:
        self.buffer = io.StringIO()


    def _timestamp(self) -> None:
        self._write(f'[{dt.datetime.now()}]\n')


    def get_buffer(self) -> str:
        return self.buffer.getvalue()


    def function(self, func: str, **kwargs: Any) -> None:
        self._write(f"#### Debug output for {func}()\n\nParameters:\n")
        for name, value in kwargs.items():
            self._write(f'  {name}: {self._python_var(value)}\n')
        self._write('\n')


    def section(self, heading: str) -> None:
        self._timestamp()
        self._write(f"\n# {heading}\n\n")


    def comment(self, text: str) -> None:
        self._write(f"{text}\n")


    def var_dump(self, heading: str, var: Any) -> None:
        if callable(var):
            var = var()

        self._write(f'{heading}:\n  {self._python_var(var)}\n\n')


    def table_dump(self, heading: str, rows: Iterator[Optional[List[Any]]]) -> None:
        self._write(f'{heading}:\n')
        data = [list(map(self._python_var, row)) if row else None for row in rows]
        assert data[0] is not None
        num_cols = len(data[0])

        maxlens = [max(len(d[i]) for d in data if d) for i in range(num_cols)]
        tablewidth = sum(maxlens) + 3 * num_cols + 1
        row_format = '| ' + ' | '.join(f'{{:<{l}}}' for l in maxlens) + ' |\n'
        self._write('-'*tablewidth + '\n')
        self._write(row_format.format(*data[0]))
        self._write('-'*tablewidth + '\n')
        for row in data[1:]:
            if row:
                self._write(row_format.format(*row))
            else:
                self._write('-'*tablewidth + '\n')
        if data[-1]:
            self._write('-'*tablewidth + '\n')


    def result_dump(self, heading: str, results: Iterator[Tuple[Any, Any]]) -> None:
        self._timestamp()
        self._write(f'{heading}:\n')
        total = 0
        for rank, res in results:
            self._write(f'[{rank:.3f}] {res.source_table.name}(')
            self._write(f"{_debug_name(res)}, type=({','.join(res.category)}), ")
            self._write(f"rank={res.rank_address}, ")
            self._write(f"osm={''.join(map(str, res.osm_object or []))}, ")
            self._write(f'cc={res.country_code}, ')
            self._write(f'importance={res.importance or -1:.5f})\n')
            total += 1
        self._write(f'TOTAL: {total}\n\n')


    def sql(self, conn: AsyncConnection, statement: 'sa.Executable',
            params: Union[Mapping[str, Any], Sequence[Mapping[str, Any]], None]) -> None:
        self._timestamp()
        sqlstr = '\n| '.join(textwrap.wrap(self.format_sql(conn, statement, params), width=78))
        self._write(f"| {sqlstr}\n\n")


    def _python_var(self, var: Any) -> str:
        return str(var)


    def _write(self, text: str) -> None:
        self.buffer.write(text)


logger: ContextVar[BaseLogger] = ContextVar('logger', default=BaseLogger())


def set_log_output(fmt: str) -> None:
    """ Enable collecting debug information.
    """
    if fmt == 'html':
        logger.set(HTMLLogger())
    elif fmt == 'text':
        logger.set(TextLogger())
    else:
        logger.set(BaseLogger())


def log() -> BaseLogger:
    """ Return the logger for the current context.
    """
    return logger.get()


def get_and_disable() -> str:
    """ Return the current content of the debug buffer and disable logging.
    """
    buf = logger.get().get_buffer()
    logger.set(BaseLogger())
    return buf
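For illustration, the context logger defined above might be driven like this (a minimal sketch, not part of the diff):

set_log_output('text')                 # install a TextLogger for this context
log().section('Reverse lookup')
log().comment('Looking up place details')
debug_output = get_and_disable()       # fetch the buffer, reset to the no-op logger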

HTML_HEADER: str = """<!DOCTYPE html>
<html>
<head>
  <title>Nominatim - Debug</title>
  <style>
""" + \
(HtmlFormatter(nobackground=True).get_style_defs('.highlight') if CODE_HIGHLIGHT else '') +\
"""
    h2 { font-size: x-large }

    dl {
      padding-left: 10pt;
      font-family: monospace
    }

    dt {
      float: left;
      font-weight: bold;
      margin-right: 0.5em
    }

    dt::after { content: ": "; }

    dd::after {
      clear: left;
      display: block
    }

    .lang-sql {
      color: #555;
      font-size: small
    }

    h5 {
      border: solid lightgrey 0.1pt;
      margin-bottom: 0;
      background-color: #f7f7f7
    }

    h5 + .highlight {
      padding: 3pt;
      border: solid lightgrey 0.1pt
    }

    table, th, tbody {
      border: thin solid;
      border-collapse: collapse;
    }
    td {
      border-right: thin solid;
      padding-left: 3pt;
      padding-right: 3pt;
    }

    .timestamp {
      font-size: 0.8em;
      color: darkblue;
      width: calc(100% - 5pt);
      text-align: right;
      position: absolute;
      left: 0;
      margin-top: -5px;
    }
  </style>
</head>
<body>
"""

HTML_FOOTER: str = "</body></html>"
250
src/nominatim_api/lookup.py
Normal file
@@ -0,0 +1,250 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Implementation of place lookup by ID.
"""
from typing import Optional, Callable, Tuple, Type
import datetime as dt

import sqlalchemy as sa

from nominatim_core.typing import SaColumn, SaRow, SaSelect
from .connection import SearchConnection
from .logging import log
from . import types as ntyp
from . import results as nres

RowFunc = Callable[[Optional[SaRow], Type[nres.BaseResultT]], Optional[nres.BaseResultT]]

GeomFunc = Callable[[SaSelect, SaColumn], SaSelect]


async def find_in_placex(conn: SearchConnection, place: ntyp.PlaceRef,
                         add_geometries: GeomFunc) -> Optional[SaRow]:
    """ Search for the given place in the placex table and return the
        base information.
    """
    log().section("Find in placex table")
    t = conn.t.placex
    sql = sa.select(t.c.place_id, t.c.osm_type, t.c.osm_id, t.c.name,
                    t.c.class_, t.c.type, t.c.admin_level,
                    t.c.address, t.c.extratags,
                    t.c.housenumber, t.c.postcode, t.c.country_code,
                    t.c.importance, t.c.wikipedia, t.c.indexed_date,
                    t.c.parent_place_id, t.c.rank_address, t.c.rank_search,
                    t.c.linked_place_id,
                    t.c.geometry.ST_Expand(0).label('bbox'),
                    t.c.centroid)

    if isinstance(place, ntyp.PlaceID):
        sql = sql.where(t.c.place_id == place.place_id)
    elif isinstance(place, ntyp.OsmID):
        sql = sql.where(t.c.osm_type == place.osm_type)\
                 .where(t.c.osm_id == place.osm_id)
        if place.osm_class:
            sql = sql.where(t.c.class_ == place.osm_class)
        else:
            sql = sql.order_by(t.c.class_)
        sql = sql.limit(1)
    else:
        return None

    return (await conn.execute(add_geometries(sql, t.c.geometry))).one_or_none()


async def find_in_osmline(conn: SearchConnection, place: ntyp.PlaceRef,
                          add_geometries: GeomFunc) -> Optional[SaRow]:
    """ Search for the given place in the osmline table and return the
        base information.
    """
    log().section("Find in interpolation table")
    t = conn.t.osmline
    sql = sa.select(t.c.place_id, t.c.osm_id, t.c.parent_place_id,
                    t.c.indexed_date, t.c.startnumber, t.c.endnumber,
                    t.c.step, t.c.address, t.c.postcode, t.c.country_code,
                    t.c.linegeo.ST_Centroid().label('centroid'))

    if isinstance(place, ntyp.PlaceID):
        sql = sql.where(t.c.place_id == place.place_id)
    elif isinstance(place, ntyp.OsmID) and place.osm_type == 'W':
        # There may be multiple interpolations for a single way.
        # If 'class' contains a number, return the one that belongs to that number.
        sql = sql.where(t.c.osm_id == place.osm_id).limit(1)
        if place.osm_class and place.osm_class.isdigit():
            sql = sql.order_by(sa.func.greatest(0,
                                                int(place.osm_class) - t.c.endnumber,
                                                t.c.startnumber - int(place.osm_class)))
    else:
        return None

    return (await conn.execute(add_geometries(sql, t.c.linegeo))).one_or_none()
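    # (The ORDER BY above ranks interpolation lines by how far the requested
    # house number falls outside their [startnumber, endnumber] range, so the
    # line that actually contains the number sorts first.)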

async def find_in_tiger(conn: SearchConnection, place: ntyp.PlaceRef,
                        add_geometries: GeomFunc) -> Optional[SaRow]:
    """ Search for the given place in the table of Tiger addresses and return
        the base information. Only lookup by place ID is supported.
    """
    if not isinstance(place, ntyp.PlaceID):
        return None

    log().section("Find in TIGER table")
    t = conn.t.tiger
    parent = conn.t.placex
    sql = sa.select(t.c.place_id, t.c.parent_place_id,
                    parent.c.osm_type, parent.c.osm_id,
                    t.c.startnumber, t.c.endnumber, t.c.step,
                    t.c.postcode,
                    t.c.linegeo.ST_Centroid().label('centroid'))\
            .where(t.c.place_id == place.place_id)\
            .join(parent, t.c.parent_place_id == parent.c.place_id, isouter=True)

    return (await conn.execute(add_geometries(sql, t.c.linegeo))).one_or_none()


async def find_in_postcode(conn: SearchConnection, place: ntyp.PlaceRef,
                           add_geometries: GeomFunc) -> Optional[SaRow]:
    """ Search for the given place in the postcode table and return the
        base information. Only lookup by place ID is supported.
    """
    if not isinstance(place, ntyp.PlaceID):
        return None

    log().section("Find in postcode table")
    t = conn.t.postcode
    sql = sa.select(t.c.place_id, t.c.parent_place_id,
                    t.c.rank_search, t.c.rank_address,
                    t.c.indexed_date, t.c.postcode, t.c.country_code,
                    t.c.geometry.label('centroid')) \
            .where(t.c.place_id == place.place_id)

    return (await conn.execute(add_geometries(sql, t.c.geometry))).one_or_none()


async def find_in_all_tables(conn: SearchConnection, place: ntyp.PlaceRef,
                             add_geometries: GeomFunc
                            ) -> Tuple[Optional[SaRow], RowFunc[nres.BaseResultT]]:
    """ Search for the given place in all data tables
        and return the base information.
    """
    row = await find_in_placex(conn, place, add_geometries)
    log().var_dump('Result (placex)', row)
    if row is not None:
        return row, nres.create_from_placex_row

    row = await find_in_osmline(conn, place, add_geometries)
    log().var_dump('Result (osmline)', row)
    if row is not None:
        return row, nres.create_from_osmline_row

    row = await find_in_postcode(conn, place, add_geometries)
    log().var_dump('Result (postcode)', row)
    if row is not None:
        return row, nres.create_from_postcode_row

    row = await find_in_tiger(conn, place, add_geometries)
    log().var_dump('Result (tiger)', row)
    return row, nres.create_from_tiger_row


async def get_detailed_place(conn: SearchConnection, place: ntyp.PlaceRef,
                             details: ntyp.LookupDetails) -> Optional[nres.DetailedResult]:
    """ Retrieve a place with additional details from the database.
    """
    log().function('get_detailed_place', place=place, details=details)

    if details.geometry_output and details.geometry_output != ntyp.GeometryFormat.GEOJSON:
        raise ValueError("lookup only supports GeoJSON polygon output.")

    if details.geometry_output & ntyp.GeometryFormat.GEOJSON:
        def _add_geometry(sql: SaSelect, column: SaColumn) -> SaSelect:
            return sql.add_columns(sa.func.ST_AsGeoJSON(
                sa.case((sa.func.ST_NPoints(column) > 5000,
                         sa.func.ST_SimplifyPreserveTopology(column, 0.0001)),
                        else_=column), 7).label('geometry_geojson'))
    else:
        def _add_geometry(sql: SaSelect, column: SaColumn) -> SaSelect:
            return sql.add_columns(sa.func.ST_GeometryType(column).label('geometry_type'))

    row_func: RowFunc[nres.DetailedResult]
    row, row_func = await find_in_all_tables(conn, place, _add_geometry)

    if row is None:
        return None

    result = row_func(row, nres.DetailedResult)
    assert result is not None

    # add missing details
    if 'type' in result.geometry:
        result.geometry['type'] = GEOMETRY_TYPE_MAP.get(result.geometry['type'],
                                                        result.geometry['type'])
    indexed_date = getattr(row, 'indexed_date', None)
    if indexed_date is not None:
        result.indexed_date = indexed_date.replace(tzinfo=dt.timezone.utc)

    await nres.add_result_details(conn, [result], details)

    return result


async def get_simple_place(conn: SearchConnection, place: ntyp.PlaceRef,
                           details: ntyp.LookupDetails) -> Optional[nres.SearchResult]:
    """ Retrieve a place as a simple search result from the database.
    """
    log().function('get_simple_place', place=place, details=details)

    def _add_geometry(sql: SaSelect, col: SaColumn) -> SaSelect:
        if not details.geometry_output:
            return sql

        out = []

        if details.geometry_simplification > 0.0:
            col = sa.func.ST_SimplifyPreserveTopology(col, details.geometry_simplification)

        if details.geometry_output & ntyp.GeometryFormat.GEOJSON:
            out.append(sa.func.ST_AsGeoJSON(col, 7).label('geometry_geojson'))
        if details.geometry_output & ntyp.GeometryFormat.TEXT:
            out.append(sa.func.ST_AsText(col).label('geometry_text'))
        if details.geometry_output & ntyp.GeometryFormat.KML:
            out.append(sa.func.ST_AsKML(col, 7).label('geometry_kml'))
        if details.geometry_output & ntyp.GeometryFormat.SVG:
            out.append(sa.func.ST_AsSVG(col, 0, 7).label('geometry_svg'))

        return sql.add_columns(*out)


    row_func: RowFunc[nres.SearchResult]
    row, row_func = await find_in_all_tables(conn, place, _add_geometry)

    if row is None:
        return None

    result = row_func(row, nres.SearchResult)
    assert result is not None

    # add missing details
    if hasattr(row, 'bbox'):
        result.bbox = ntyp.Bbox.from_wkb(row.bbox)

    await nres.add_result_details(conn, [result], details)

    return result
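A hedged sketch of calling the simple lookup; it assumes an open SearchConnection named conn inside an async context, and the OSM way id is a placeholder:

details = ntyp.LookupDetails(
    geometry_output=ntyp.GeometryFormat.GEOJSON | ntyp.GeometryFormat.KML,
    geometry_simplification=0.001)
result = await get_simple_place(conn, ntyp.OsmID('W', 12345678), details)
if result is not None:
    print(result.geometry['geojson'])    # keys mirror the requested formats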

GEOMETRY_TYPE_MAP = {
    'POINT': 'ST_Point',
    'MULTIPOINT': 'ST_MultiPoint',
    'LINESTRING': 'ST_LineString',
    'MULTILINESTRING': 'ST_MultiLineString',
    'POLYGON': 'ST_Polygon',
    'MULTIPOLYGON': 'ST_MultiPolygon',
    'GEOMETRYCOLLECTION': 'ST_GeometryCollection'
}
0
src/nominatim_api/py.typed
Normal file
56
src/nominatim_api/result_formatting.py
Normal file
@@ -0,0 +1,56 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Helper classes and functions for formatting results into API responses.
"""
from typing import Type, TypeVar, Dict, List, Callable, Any, Mapping
from collections import defaultdict

T = TypeVar('T') # pylint: disable=invalid-name
FormatFunc = Callable[[T, Mapping[str, Any]], str]


class FormatDispatcher:
    """ Helper class to conveniently create formatting functions in
        a module using decorators.
    """

    def __init__(self) -> None:
        self.format_functions: Dict[Type[Any], Dict[str, FormatFunc[Any]]] = defaultdict(dict)


    def format_func(self, result_class: Type[T],
                    fmt: str) -> Callable[[FormatFunc[T]], FormatFunc[T]]:
        """ Decorator for a function that formats a given type of result into the
            selected format.
        """
        def decorator(func: FormatFunc[T]) -> FormatFunc[T]:
            self.format_functions[result_class][fmt] = func
            return func

        return decorator


    def list_formats(self, result_type: Type[Any]) -> List[str]:
        """ Return a list of formats supported by this formatter.
        """
        return list(self.format_functions[result_type].keys())


    def supports_format(self, result_type: Type[Any], fmt: str) -> bool:
        """ Check if the given format is supported by this formatter.
        """
        return fmt in self.format_functions[result_type]


    def format_result(self, result: Any, fmt: str, options: Mapping[str, Any]) -> str:
        """ Convert the given result into a string using the given format.

            The format is expected to be in the list returned by
            `list_formats()`.
        """
        return self.format_functions[type(result)][fmt](result, options)
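To illustrate the decorator pattern, a minimal sketch; PlainResult is a made-up stand-in type, not part of the API:

dispatch = FormatDispatcher()

class PlainResult:                                  # hypothetical result type
    def __init__(self, msg: str) -> None:
        self.msg = msg

@dispatch.format_func(PlainResult, 'text')
def _format_plain_text(result: PlainResult, _options: Mapping[str, Any]) -> str:
    return result.msg

assert dispatch.supports_format(PlainResult, 'text')
print(dispatch.format_result(PlainResult('OK'), 'text', {}))   # -> OK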
752
src/nominatim_api/results.py
Normal file
@@ -0,0 +1,752 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Dataclasses for search results and helper functions to fill them.

Data classes are part of the public API while the functions are for
internal use only. That's why they are implemented as free-standing functions
instead of member functions.
"""
from typing import Optional, Tuple, Dict, Sequence, TypeVar, Type, List, cast, Callable
import enum
import dataclasses
import datetime as dt

import sqlalchemy as sa

from nominatim_core.typing import SaSelect, SaRow
from nominatim_core.db.sqlalchemy_types import Geometry
from .types import Point, Bbox, LookupDetails
from .connection import SearchConnection
from .logging import log
from .localization import Locales

# This file defines complex result data classes.
# pylint: disable=too-many-instance-attributes

def _mingle_name_tags(names: Optional[Dict[str, str]]) -> Optional[Dict[str, str]]:
    """ Mix-in names from linked places, so that they show up
        as standard names where necessary.
    """
    if not names:
        return None

    out = {}
    for k, v in names.items():
        if k.startswith('_place_'):
            outkey = k[7:]
            out[k if outkey in names else outkey] = v
        else:
            out[k] = v

    return out
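# For illustration: linked-place names keep their '_place_' prefix only when
# they would clash with an existing name, e.g.
#   _mingle_name_tags({'name': 'Berlin', '_place_name': 'Brandenburg',
#                      '_place_name:de': 'Brandenburg'})
#   -> {'name': 'Berlin', '_place_name': 'Brandenburg', 'name:de': 'Brandenburg'}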

class SourceTable(enum.Enum):
    """ The `SourceTable` type lists the possible sources a result can have.
    """
    PLACEX = 1
    """ The placex table is the main source for results, usually containing
        OSM data.
    """
    OSMLINE = 2
    """ The osmline table contains address interpolations from OSM data.
        Interpolation addresses are always approximate. The OSM id in the
        result refers to the OSM way with the interpolation line object.
    """
    TIGER = 3
    """ TIGER address data contains US addresses imported on the side,
        see [Installing TIGER data](../customize/Tiger.md).
        TIGER addresses are also interpolations. The addresses always refer
        to a street from OSM data. The OSM id in the result refers to
        that street.
    """
    POSTCODE = 4
    """ The postcode table contains artificial centroids for postcodes,
        computed from the postcodes available with address points. Results
        are always approximate.
    """
    COUNTRY = 5
    """ The country table provides a fallback, when country data is missing
        in the OSM data.
    """


@dataclasses.dataclass
class AddressLine:
    """ The `AddressLine` may contain the following fields about a related place
        and its function as an address object. Most fields are optional.
        Their presence depends on the kind and function of the address part.
    """
    category: Tuple[str, str]
    """ Main category of the place, described by a key-value pair.
    """
    names: Dict[str, str]
    """ All available names for the place including references, alternative
        names and translations.
    """
    fromarea: bool
    """ If true, then the exact area of the place is known. Without area
        information, Nominatim has to make an educated guess if an address
        belongs to one place or another.
    """
    isaddress: bool
    """ If true, this place should be considered for the final address display.
        Nominatim will sometimes include more than one candidate for
        the address in the list when it cannot reliably determine where the
        place belongs. It will consider names of all candidates when searching
        but when displaying the result, only the most likely candidate should
        be shown.
    """
    rank_address: int
    """ [Address rank](../customize/Ranking.md#address-rank) of the place.
    """
    distance: float
    """ Distance in degrees between the result place and this address part.
    """
    place_id: Optional[int] = None
    """ Internal ID of the place.
    """
    osm_object: Optional[Tuple[str, int]] = None
    """ OSM type and ID of the place, if such an object exists.
    """
    extratags: Optional[Dict[str, str]] = None
    """ Any extra information available about the place. This is a dictionary
        that usually contains OSM tag key-value pairs.
    """

    admin_level: Optional[int] = None
    """ The administrative level of a boundary as tagged in the input data.
        This field is only meaningful for places of the category
        (boundary, administrative).
    """

    local_name: Optional[str] = None
    """ Placeholder for the localization of this address part. See
        [Localization](#localization) below.
    """


class AddressLines(List[AddressLine]):
    """ Sequence of address lines ordered in descending order by their rank.
    """

    def localize(self, locales: Locales) -> List[str]:
        """ Set the local name of address parts according to the chosen
            locale. Return the list of local names without duplicates.

            Only address parts that are marked as isaddress are localized
            and returned.
        """
        label_parts: List[str] = []

        for line in self:
            if line.isaddress and line.names:
                line.local_name = locales.display_name(line.names)
                if not label_parts or label_parts[-1] != line.local_name:
                    label_parts.append(line.local_name)

        return label_parts
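# For illustration (made-up values): with locales = Locales(['en']),
# address_rows.localize(locales) fills line.local_name on every line marked
# isaddress and returns the labels in rank order with consecutive duplicates
# dropped, e.g. ['Main Street', 'London', 'United Kingdom'].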

@dataclasses.dataclass
class WordInfo:
    """ Each entry in the list of search terms contains the
        following detailed information.
    """
    word_id: int
    """ Internal identifier for the word.
    """
    word_token: str
    """ Normalised and transliterated form of the word.
        This form is used for searching.
    """
    word: Optional[str] = None
    """ Untransliterated form, if available.
    """


WordInfos = Sequence[WordInfo]


@dataclasses.dataclass
class BaseResult:
    """ Data class collecting information common to all
        types of search results.
    """
    source_table: SourceTable
    category: Tuple[str, str]
    centroid: Point

    place_id: Optional[int] = None
    osm_object: Optional[Tuple[str, int]] = None
    parent_place_id: Optional[int] = None
    linked_place_id: Optional[int] = None
    admin_level: int = 15

    locale_name: Optional[str] = None
    display_name: Optional[str] = None

    names: Optional[Dict[str, str]] = None
    address: Optional[Dict[str, str]] = None
    extratags: Optional[Dict[str, str]] = None

    housenumber: Optional[str] = None
    postcode: Optional[str] = None
    wikipedia: Optional[str] = None

    rank_address: int = 30
    rank_search: int = 30
    importance: Optional[float] = None

    country_code: Optional[str] = None

    address_rows: Optional[AddressLines] = None
    linked_rows: Optional[AddressLines] = None
    parented_rows: Optional[AddressLines] = None
    name_keywords: Optional[WordInfos] = None
    address_keywords: Optional[WordInfos] = None

    geometry: Dict[str, str] = dataclasses.field(default_factory=dict)

    @property
    def lat(self) -> float:
        """ Get the latitude (or y) of the center point of the place.
        """
        return self.centroid[1]


    @property
    def lon(self) -> float:
        """ Get the longitude (or x) of the center point of the place.
        """
        return self.centroid[0]


    def calculated_importance(self) -> float:
        """ Get a valid importance value. This is either the stored importance
            of the value or an artificial value computed from the place's
            search rank.
        """
        return self.importance or (0.40001 - (self.rank_search/75.0))
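    # Worked example: with the default rank_search of 30 the fallback
    # evaluates to 0.40001 - 30/75.0 = 0.00001, so places without a stored
    # importance keep a tiny positive value, and better (lower) search
    # ranks yield larger fallback values.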

    def localize(self, locales: Locales) -> None:
        """ Fill the locale_name and the display_name field for the
            place and, if available, its address information.
        """
        self.locale_name = locales.display_name(self.names)
        if self.address_rows:
            self.display_name = ', '.join(self.address_rows.localize(locales))
        else:
            self.display_name = self.locale_name


BaseResultT = TypeVar('BaseResultT', bound=BaseResult)

@dataclasses.dataclass
class DetailedResult(BaseResult):
    """ A search result with more internal information from the database
        added.
    """
    indexed_date: Optional[dt.datetime] = None


@dataclasses.dataclass
class ReverseResult(BaseResult):
    """ A search result for reverse geocoding.
    """
    distance: Optional[float] = None
    bbox: Optional[Bbox] = None


class ReverseResults(List[ReverseResult]):
    """ Sequence of reverse lookup results ordered by distance.
        May be empty when no result was found.
    """


@dataclasses.dataclass
class SearchResult(BaseResult):
    """ A search result for forward geocoding.
    """
    bbox: Optional[Bbox] = None
    accuracy: float = 0.0


    @property
    def ranking(self) -> float:
        """ Return the ranking, a combined measure of accuracy and importance.
        """
        return (self.accuracy if self.accuracy is not None else 1) \
               - self.calculated_importance()


class SearchResults(List[SearchResult]):
    """ Sequence of forward lookup results ordered by relevance.
        May be empty when no result was found.
    """


def _filter_geometries(row: SaRow) -> Dict[str, str]:
    return {k[9:]: v for k, v in row._mapping.items() # pylint: disable=W0212
            if k.startswith('geometry_')}


def create_from_placex_row(row: Optional[SaRow],
                           class_type: Type[BaseResultT]) -> Optional[BaseResultT]:
    """ Construct a new result and add the data from the result row
        from the placex table. 'class_type' defines the type of result
        to return. Returns None if the row is None.
    """
    if row is None:
        return None

    return class_type(source_table=SourceTable.PLACEX,
                      place_id=row.place_id,
                      osm_object=(row.osm_type, row.osm_id),
                      category=(row.class_, row.type),
                      parent_place_id=row.parent_place_id,
                      linked_place_id=getattr(row, 'linked_place_id', None),
                      admin_level=getattr(row, 'admin_level', 15),
                      names=_mingle_name_tags(row.name),
                      address=row.address,
                      extratags=row.extratags,
                      housenumber=row.housenumber,
                      postcode=row.postcode,
                      wikipedia=row.wikipedia,
                      rank_address=row.rank_address,
                      rank_search=row.rank_search,
                      importance=row.importance,
                      country_code=row.country_code,
                      centroid=Point.from_wkb(row.centroid),
                      geometry=_filter_geometries(row))


def create_from_osmline_row(row: Optional[SaRow],
                            class_type: Type[BaseResultT]) -> Optional[BaseResultT]:
    """ Construct a new result and add the data from the result row
        from the address interpolation table osmline. 'class_type' defines
        the type of result to return. Returns None if the row is None.

        If the row contains a housenumber, then the housenumber is filled out.
        Otherwise the result contains the interpolation information in extratags.
    """
    if row is None:
        return None

    hnr = getattr(row, 'housenumber', None)

    res = class_type(source_table=SourceTable.OSMLINE,
                     place_id=row.place_id,
                     parent_place_id=row.parent_place_id,
                     osm_object=('W', row.osm_id),
                     category=('place', 'houses' if hnr is None else 'house'),
                     address=row.address,
                     postcode=row.postcode,
                     country_code=row.country_code,
                     centroid=Point.from_wkb(row.centroid),
                     geometry=_filter_geometries(row))

    if hnr is None:
        res.extratags = {'startnumber': str(row.startnumber),
                         'endnumber': str(row.endnumber),
                         'step': str(row.step)}
    else:
        res.housenumber = str(hnr)

    return res


def create_from_tiger_row(row: Optional[SaRow],
                          class_type: Type[BaseResultT],
                          osm_type: Optional[str] = None,
                          osm_id: Optional[int] = None) -> Optional[BaseResultT]:
    """ Construct a new result and add the data from the result row
        from the Tiger data interpolation table. 'class_type' defines
        the type of result to return. Returns None if the row is None.

        If the row contains a housenumber, then the housenumber is filled out.
        Otherwise the result contains the interpolation information in extratags.
    """
    if row is None:
        return None

    hnr = getattr(row, 'housenumber', None)

    res = class_type(source_table=SourceTable.TIGER,
                     place_id=row.place_id,
                     parent_place_id=row.parent_place_id,
                     osm_object=(osm_type or row.osm_type, osm_id or row.osm_id),
                     category=('place', 'houses' if hnr is None else 'house'),
                     postcode=row.postcode,
                     country_code='us',
                     centroid=Point.from_wkb(row.centroid),
                     geometry=_filter_geometries(row))

    if hnr is None:
        res.extratags = {'startnumber': str(row.startnumber),
                         'endnumber': str(row.endnumber),
                         'step': str(row.step)}
    else:
        res.housenumber = str(hnr)

    return res


def create_from_postcode_row(row: Optional[SaRow],
                             class_type: Type[BaseResultT]) -> Optional[BaseResultT]:
    """ Construct a new result and add the data from the result row
        from the postcode table. 'class_type' defines
        the type of result to return. Returns None if the row is None.
    """
    if row is None:
        return None

    return class_type(source_table=SourceTable.POSTCODE,
                      place_id=row.place_id,
                      parent_place_id=row.parent_place_id,
                      category=('place', 'postcode'),
                      names={'ref': row.postcode},
                      rank_search=row.rank_search,
                      rank_address=row.rank_address,
                      country_code=row.country_code,
                      centroid=Point.from_wkb(row.centroid),
                      geometry=_filter_geometries(row))


def create_from_country_row(row: Optional[SaRow],
                            class_type: Type[BaseResultT]) -> Optional[BaseResultT]:
    """ Construct a new result and add the data from the result row
        from the fallback country tables. 'class_type' defines
        the type of result to return. Returns None if the row is None.
    """
    if row is None:
        return None

    return class_type(source_table=SourceTable.COUNTRY,
                      category=('place', 'country'),
                      centroid=Point.from_wkb(row.centroid),
                      names=row.name,
                      rank_address=4, rank_search=4,
                      country_code=row.country_code,
                      geometry=_filter_geometries(row))


async def add_result_details(conn: SearchConnection, results: List[BaseResultT],
                             details: LookupDetails) -> None:
    """ Retrieve more details from the database according to the
        parameters specified in 'details'.
    """
    if results:
        log().section('Query details for result')
        if details.address_details:
            log().comment('Query address details')
            await complete_address_details(conn, results)
        if details.linked_places:
            log().comment('Query linked places')
            for result in results:
                await complete_linked_places(conn, result)
        if details.parented_places:
            log().comment('Query parent places')
            for result in results:
                await complete_parented_places(conn, result)
        if details.keywords:
            log().comment('Query keywords')
            for result in results:
                await complete_keywords(conn, result)
        for result in results:
            result.localize(details.locales)


def _result_row_to_address_row(row: SaRow, isaddress: Optional[bool] = None) -> AddressLine:
    """ Create a new AddressLine from the results of a database query.
    """
    extratags: Dict[str, str] = getattr(row, 'extratags', {}) or {}
    if 'linked_place' in extratags:
        extratags['place'] = extratags['linked_place']

    names = _mingle_name_tags(row.name) or {}
    if getattr(row, 'housenumber', None) is not None:
        names['housenumber'] = row.housenumber

    if isaddress is None:
        isaddress = getattr(row, 'isaddress', True)

    return AddressLine(place_id=row.place_id,
                       osm_object=None if row.osm_type is None else (row.osm_type, row.osm_id),
                       category=(getattr(row, 'class'), row.type),
                       names=names,
                       extratags=extratags,
                       admin_level=row.admin_level,
                       fromarea=row.fromarea,
                       isaddress=isaddress,
                       rank_address=row.rank_address,
                       distance=row.distance)


def _get_address_lookup_id(result: BaseResultT) -> int:
    assert result.place_id
    if result.source_table != SourceTable.PLACEX or result.rank_search > 27:
        return result.parent_place_id or result.place_id

    return result.linked_place_id or result.place_id
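# (POI-level placex entries (rank_search > 27) and results from the other
# source tables resolve their address via the parent place; area results
# follow their linked place instead.)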

async def _finalize_entry(conn: SearchConnection, result: BaseResultT) -> None:
    assert result.address_rows is not None
    if result.category[0] not in ('boundary', 'place')\
       or result.category[1] not in ('postal_code', 'postcode'):
        postcode = result.postcode
        if not postcode and result.address:
            postcode = result.address.get('postcode')
        if postcode and ',' not in postcode and ';' not in postcode:
            result.address_rows.append(AddressLine(
                category=('place', 'postcode'),
                names={'ref': postcode},
                fromarea=False, isaddress=True, rank_address=5,
                distance=0.0))
    if result.country_code:
        async def _get_country_names() -> Optional[Dict[str, str]]:
            t = conn.t.country_name
            sql = sa.select(t.c.name, t.c.derived_name)\
                    .where(t.c.country_code == result.country_code)
            for cres in await conn.execute(sql):
                names = cast(Dict[str, str], cres[0])
                if cres[1]:
                    names.update(cast(Dict[str, str], cres[1]))
                return names
            return None

        country_names = await conn.get_cached_value('COUNTRY_NAME',
                                                    result.country_code,
                                                    _get_country_names)
        if country_names:
            result.address_rows.append(AddressLine(
                category=('place', 'country'),
                names=country_names,
                fromarea=False, isaddress=True, rank_address=4,
                distance=0.0))
        result.address_rows.append(AddressLine(
            category=('place', 'country_code'),
            names={'ref': result.country_code}, extratags={},
            fromarea=True, isaddress=False, rank_address=4,
            distance=0.0))


def _setup_address_details(result: BaseResultT) -> None:
    """ Retrieve information about places that make up the address of the result.
    """
    result.address_rows = AddressLines()
    if result.names:
        result.address_rows.append(AddressLine(
            place_id=result.place_id,
            osm_object=result.osm_object,
            category=result.category,
            names=result.names,
            extratags=result.extratags or {},
            admin_level=result.admin_level,
            fromarea=True, isaddress=True,
            rank_address=result.rank_address, distance=0.0))
    if result.source_table == SourceTable.PLACEX and result.address:
        housenumber = result.address.get('housenumber')\
                      or result.address.get('streetnumber')\
                      or result.address.get('conscriptionnumber')
    elif result.housenumber:
        housenumber = result.housenumber
    else:
        housenumber = None
    if housenumber:
        result.address_rows.append(AddressLine(
            category=('place', 'house_number'),
            names={'ref': housenumber},
            fromarea=True, isaddress=True, rank_address=28, distance=0))
    if result.address and '_unlisted_place' in result.address:
        result.address_rows.append(AddressLine(
            category=('place', 'locality'),
            names={'name': result.address['_unlisted_place']},
            fromarea=False, isaddress=True, rank_address=25, distance=0))


async def complete_address_details(conn: SearchConnection, results: List[BaseResultT]) -> None:
    """ Retrieve information about places that make up the address of the result.
    """
    for result in results:
        _setup_address_details(result)

    ### Lookup entries from the addressline table.

    lookup_ids = [{'pid': r.place_id,
                   'lid': _get_address_lookup_id(r),
                   'names': list(r.address.values()) if r.address else [],
                   'c': ('SRID=4326;' + r.centroid.to_wkt()) if r.centroid else ''}
                  for r in results if r.place_id]

    if not lookup_ids:
        return

    ltab = sa.func.JsonArrayEach(sa.type_coerce(lookup_ids, sa.JSON))\
             .table_valued(sa.column('value', type_=sa.JSON))

    t = conn.t.placex
    taddr = conn.t.addressline

    sql = sa.select(ltab.c.value['pid'].as_integer().label('src_place_id'),
                    t.c.place_id, t.c.osm_type, t.c.osm_id, t.c.name,
                    t.c.class_, t.c.type, t.c.extratags,
                    t.c.admin_level, taddr.c.fromarea,
                    sa.case((t.c.rank_address == 11, 5),
                            else_=t.c.rank_address).label('rank_address'),
                    taddr.c.distance, t.c.country_code, t.c.postcode)\
            .join(taddr, sa.or_(taddr.c.place_id == ltab.c.value['pid'].as_integer(),
                                taddr.c.place_id == ltab.c.value['lid'].as_integer()))\
            .join(t, taddr.c.address_place_id == t.c.place_id)\
            .order_by('src_place_id')\
            .order_by(sa.column('rank_address').desc())\
            .order_by((taddr.c.place_id == ltab.c.value['pid'].as_integer()).desc())\
            .order_by(sa.case((sa.func.CrosscheckNames(t.c.name, ltab.c.value['names']), 2),
                              (taddr.c.isaddress, 0),
                              (sa.and_(taddr.c.fromarea,
                                       t.c.geometry.ST_Contains(
                                           sa.func.ST_GeomFromEWKT(
                                               ltab.c.value['c'].as_string()))), 1),
                              else_=-1).desc())\
            .order_by(taddr.c.fromarea.desc())\
            .order_by(taddr.c.distance.desc())\
            .order_by(t.c.rank_search.desc())
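    # The ORDER BY cascade groups rows per source place and walks address
    # ranks from high to low; within a rank it prefers rows whose names
    # cross-match the result's address, then areas containing the result's
    # centroid, then rows already flagged as isaddress.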

    current_result = None
    current_rank_address = -1
    for row in await conn.execute(sql):
        if current_result is None or row.src_place_id != current_result.place_id:
            current_result = next((r for r in results if r.place_id == row.src_place_id), None)
            assert current_result is not None
            current_rank_address = -1

        location_isaddress = row.rank_address != current_rank_address

        if current_result.country_code is None and row.country_code:
            current_result.country_code = row.country_code

        if row.type in ('postcode', 'postal_code') and location_isaddress:
            if not row.fromarea or \
               (current_result.address and 'postcode' in current_result.address):
                location_isaddress = False
            else:
                current_result.postcode = None

        assert current_result.address_rows is not None
        current_result.address_rows.append(_result_row_to_address_row(row, location_isaddress))
        current_rank_address = row.rank_address

    for result in results:
        await _finalize_entry(conn, result)


    ### Finally add the record for the parent entry where necessary.

    parent_lookup_ids = list(filter(lambda e: e['pid'] != e['lid'], lookup_ids))
    if parent_lookup_ids:
        ltab = sa.func.JsonArrayEach(sa.type_coerce(parent_lookup_ids, sa.JSON))\
                 .table_valued(sa.column('value', type_=sa.JSON))
        sql = sa.select(ltab.c.value['pid'].as_integer().label('src_place_id'),
                        t.c.place_id, t.c.osm_type, t.c.osm_id, t.c.name,
                        t.c.class_, t.c.type, t.c.extratags,
                        t.c.admin_level,
                        t.c.rank_address)\
                .where(t.c.place_id == ltab.c.value['lid'].as_integer())

        for row in await conn.execute(sql):
            current_result = next((r for r in results if r.place_id == row.src_place_id), None)
            assert current_result is not None
            assert current_result.address_rows is not None

            current_result.address_rows.append(AddressLine(
                place_id=row.place_id,
                osm_object=(row.osm_type, row.osm_id),
                category=(row.class_, row.type),
                names=row.name, extratags=row.extratags or {},
                admin_level=row.admin_level,
                fromarea=True, isaddress=True,
                rank_address=row.rank_address, distance=0.0))

    ### Now sort everything
    def mk_sort_key(place_id: Optional[int]) -> Callable[[AddressLine], Tuple[bool, int, bool]]:
        return lambda a: (a.place_id != place_id, -a.rank_address, a.isaddress)
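    # (Sort order: the result's own entry first, then descending address
    # rank; at equal rank, lines not marked isaddress sort before the
    # chosen address line.)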

    for result in results:
        assert result.address_rows is not None
        result.address_rows.sort(key=mk_sort_key(result.place_id))


def _placex_select_address_row(conn: SearchConnection,
                               centroid: Point) -> SaSelect:
    t = conn.t.placex
    return sa.select(t.c.place_id, t.c.osm_type, t.c.osm_id, t.c.name,
                     t.c.class_.label('class'), t.c.type,
                     t.c.admin_level, t.c.housenumber,
                     t.c.geometry.is_area().label('fromarea'),
                     t.c.rank_address,
                     t.c.geometry.distance_spheroid(
                         sa.bindparam('centroid', value=centroid, type_=Geometry)).label('distance'))


async def complete_linked_places(conn: SearchConnection, result: BaseResult) -> None:
    """ Retrieve information about places that link to the result.
    """
    result.linked_rows = AddressLines()
    if result.source_table != SourceTable.PLACEX:
        return

    sql = _placex_select_address_row(conn, result.centroid)\
            .where(conn.t.placex.c.linked_place_id == result.place_id)

    for row in await conn.execute(sql):
        result.linked_rows.append(_result_row_to_address_row(row))


async def complete_keywords(conn: SearchConnection, result: BaseResult) -> None:
    """ Retrieve information about the search terms used for this place.

        Requires that the query analyzer was initialised to get access to
        the word table.
    """
    t = conn.t.search_name
    sql = sa.select(t.c.name_vector, t.c.nameaddress_vector)\
            .where(t.c.place_id == result.place_id)

    result.name_keywords = []
    result.address_keywords = []

    t = conn.t.meta.tables['word']
    sel = sa.select(t.c.word_id, t.c.word_token, t.c.word)

    for name_tokens, address_tokens in await conn.execute(sql):
        for row in await conn.execute(sel.where(t.c.word_id.in_(name_tokens))):
            result.name_keywords.append(WordInfo(*row))

        for row in await conn.execute(sel.where(t.c.word_id.in_(address_tokens))):
            result.address_keywords.append(WordInfo(*row))


async def complete_parented_places(conn: SearchConnection, result: BaseResult) -> None:
    """ Retrieve information about places that the result provides the
        address for.
    """
    result.parented_rows = AddressLines()
    if result.source_table != SourceTable.PLACEX:
        return

    sql = _placex_select_address_row(conn, result.centroid)\
            .where(conn.t.placex.c.parent_place_id == result.place_id)\
            .where(conn.t.placex.c.rank_search == 30)

    for row in await conn.execute(sql):
        result.parented_rows.append(_result_row_to_address_row(row))
603
src/nominatim_api/reverse.py
Normal file
603
src/nominatim_api/reverse.py
Normal file
@@ -0,0 +1,603 @@
|
||||
# SPDX-License-Identifier: GPL-3.0-or-later
|
||||
#
|
||||
# This file is part of Nominatim. (https://nominatim.org)
|
||||
#
|
||||
# Copyright (C) 2024 by the Nominatim developer community.
|
||||
# For a full list of authors see the git log.
|
||||
"""
|
||||
Implementation of reverse geocoding.
|
||||
"""
|
||||
from typing import Optional, List, Callable, Type, Tuple, Dict, Any, cast, Union
|
||||
import functools
|
||||
|
||||
import sqlalchemy as sa
|
||||
|
||||
from nominatim_core.typing import SaColumn, SaSelect, SaFromClause, SaLabel, SaRow,\
|
||||
SaBind, SaLambdaSelect
|
||||
from nominatim_core.db.sqlalchemy_types import Geometry
|
||||
from .connection import SearchConnection
|
||||
from . import results as nres
|
||||
from .logging import log
|
||||
from .types import AnyPoint, DataLayer, ReverseDetails, GeometryFormat, Bbox
|
||||
|
||||
# In SQLAlchemy expression which compare with NULL need to be expressed with
|
||||
# the equal sign.
|
||||
# pylint: disable=singleton-comparison
|
||||
|
||||
RowFunc = Callable[[Optional[SaRow], Type[nres.ReverseResult]], Optional[nres.ReverseResult]]
|
||||
|
||||
WKT_PARAM: SaBind = sa.bindparam('wkt', type_=Geometry)
|
||||
MAX_RANK_PARAM: SaBind = sa.bindparam('max_rank')
|
||||
|
||||
def no_index(expr: SaColumn) -> SaColumn:
|
||||
""" Wrap the given expression, so that the query planner will
|
||||
refrain from using the expression for index lookup.
|
||||
"""
|
||||
return sa.func.coalesce(sa.null(), expr) # pylint: disable=not-callable
|
||||
|
||||
|
||||
def _select_from_placex(t: SaFromClause, use_wkt: bool = True) -> SaSelect:
|
||||
""" Create a select statement with the columns relevant for reverse
|
||||
results.
|
||||
"""
|
||||
if not use_wkt:
|
||||
distance = t.c.distance
|
||||
centroid = t.c.centroid
|
||||
else:
|
||||
distance = t.c.geometry.ST_Distance(WKT_PARAM)
|
||||
centroid = sa.case((t.c.geometry.is_line_like(), t.c.geometry.ST_ClosestPoint(WKT_PARAM)),
|
||||
else_=t.c.centroid).label('centroid')
|
||||
|
||||
|
||||
return sa.select(t.c.place_id, t.c.osm_type, t.c.osm_id, t.c.name,
|
||||
t.c.class_, t.c.type,
|
||||
t.c.address, t.c.extratags,
|
||||
t.c.housenumber, t.c.postcode, t.c.country_code,
|
||||
t.c.importance, t.c.wikipedia,
|
||||
t.c.parent_place_id, t.c.rank_address, t.c.rank_search,
|
||||
centroid,
|
||||
t.c.linked_place_id, t.c.admin_level,
|
||||
distance.label('distance'),
|
||||
t.c.geometry.ST_Expand(0).label('bbox'))
|
||||
|
||||
|
||||
def _interpolated_housenumber(table: SaFromClause) -> SaLabel:
|
||||
return sa.cast(table.c.startnumber
|
||||
+ sa.func.round(((table.c.endnumber - table.c.startnumber) * table.c.position)
|
||||
/ table.c.step) * table.c.step,
|
||||
sa.Integer).label('housenumber')
|
||||
|
||||
|
||||
def _interpolated_position(table: SaFromClause) -> SaLabel:
|
||||
fac = sa.cast(table.c.step, sa.Float) / (table.c.endnumber - table.c.startnumber)
|
||||
rounded_pos = sa.func.round(table.c.position / fac) * fac
|
||||
return sa.case(
|
||||
(table.c.endnumber == table.c.startnumber, table.c.linegeo.ST_Centroid()),
|
||||
else_=table.c.linegeo.ST_LineInterpolatePoint(rounded_pos)).label('centroid')
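
# Plain-Python sketch of the interpolation arithmetic above (illustrative
# values, not library code): the fractional position along the line is
# snapped to the housenumber grid given by 'step'.
#
#   def interpolated_housenumber(start, end, step, position):
#       return start + round(((end - start) * position) / step) * step
#
#   interpolated_housenumber(2, 10, 2, 0.3)   # -> 4, the nearest even number 30% along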


def _locate_interpolation(table: SaFromClause) -> SaLabel:
    """ Given a position, locate the closest point on the line.
    """
    return sa.case((table.c.linegeo.is_line_like(),
                    table.c.linegeo.ST_LineLocatePoint(WKT_PARAM)),
                   else_=0).label('position')


def _get_closest(*rows: Optional[SaRow]) -> Optional[SaRow]:
    return min(rows, key=lambda row: 1000 if row is None else row.distance)
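
# Behavioural note on _get_closest: a missing row is treated as if it were
# 1000 units away, so any real row wins over None and an all-None call
# returns None.
#
#   _get_closest(None, row_at_distance_0_2)   # -> row_at_distance_0_2 (hypothetical row)
#   _get_closest(None, None)                  # -> None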


class ReverseGeocoder:
    """ Class implementing the logic for looking up a place from a
        coordinate.
    """

    def __init__(self, conn: SearchConnection, params: ReverseDetails,
                 restrict_to_country_areas: bool = False) -> None:
        self.conn = conn
        self.params = params
        self.restrict_to_country_areas = restrict_to_country_areas

        self.bind_params: Dict[str, Any] = {'max_rank': params.max_rank}


    @property
    def max_rank(self) -> int:
        """ Return the maximum configured rank.
        """
        return self.params.max_rank


    def has_geometries(self) -> bool:
        """ Check if any geometries are requested.
        """
        return bool(self.params.geometry_output)


    def layer_enabled(self, *layer: DataLayer) -> bool:
        """ Return true when any of the given layer types are requested.
        """
        return any(self.params.layers & l for l in layer)


    def layer_disabled(self, *layer: DataLayer) -> bool:
        """ Return true when none of the given layer types is requested.
        """
        return not any(self.params.layers & l for l in layer)


    def has_feature_layers(self) -> bool:
        """ Return true if any layer other than ADDRESS or POI is requested.
        """
        return self.layer_enabled(DataLayer.RAILWAY, DataLayer.MANMADE, DataLayer.NATURAL)
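
# Hedged illustration of the layer checks, assuming DataLayer is a flag
# enum so that 'params.layers & l' tests membership in the bitmask:
#
#   layers = DataLayer.ADDRESS | DataLayer.RAILWAY
#   layer_enabled(DataLayer.RAILWAY, DataLayer.NATURAL)   # True: railway is set
#   layer_disabled(DataLayer.POI)                         # True: poi is not set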


    def _add_geometry_columns(self, sql: SaLambdaSelect, col: SaColumn) -> SaSelect:
        out = []

        if self.params.geometry_simplification > 0.0:
            col = sa.func.ST_SimplifyPreserveTopology(col, self.params.geometry_simplification)

        if self.params.geometry_output & GeometryFormat.GEOJSON:
            out.append(sa.func.ST_AsGeoJSON(col, 7).label('geometry_geojson'))
        if self.params.geometry_output & GeometryFormat.TEXT:
            out.append(sa.func.ST_AsText(col).label('geometry_text'))
        if self.params.geometry_output & GeometryFormat.KML:
            out.append(sa.func.ST_AsKML(col, 7).label('geometry_kml'))
        if self.params.geometry_output & GeometryFormat.SVG:
            out.append(sa.func.ST_AsSVG(col, 0, 7).label('geometry_svg'))

        return sql.add_columns(*out)


    def _filter_by_layer(self, table: SaFromClause) -> SaColumn:
        if self.layer_enabled(DataLayer.MANMADE):
            exclude = []
            if self.layer_disabled(DataLayer.RAILWAY):
                exclude.append('railway')
            if self.layer_disabled(DataLayer.NATURAL):
                exclude.extend(('natural', 'water', 'waterway'))
            return table.c.class_.not_in(tuple(exclude))

        include = []
        if self.layer_enabled(DataLayer.RAILWAY):
            include.append('railway')
        if self.layer_enabled(DataLayer.NATURAL):
            include.extend(('natural', 'water', 'waterway'))
        return table.c.class_.in_(tuple(include))


    async def _find_closest_street_or_poi(self, distance: float) -> Optional[SaRow]:
        """ Look up the closest rank 26+ place in the database that is
            closer than the given distance.
        """
        t = self.conn.t.placex

        # PostgreSQL must not get the distance as a parameter because
        # there is a danger it won't be able to properly estimate index use
        # when used with prepared statements.
        diststr = sa.text(f"{distance}")

        sql: SaLambdaSelect = sa.lambda_stmt(lambda: _select_from_placex(t)
                .where(t.c.geometry.within_distance(WKT_PARAM, diststr))
                .where(t.c.indexed_status == 0)
                .where(t.c.linked_place_id == None)
                .where(sa.or_(sa.not_(t.c.geometry.is_area()),
                              t.c.centroid.ST_Distance(WKT_PARAM) < diststr))
                .order_by('distance')
                .limit(2))

        if self.has_geometries():
            sql = self._add_geometry_columns(sql, t.c.geometry)

        restrict: List[Union[SaColumn, Callable[[], SaColumn]]] = []

        if self.layer_enabled(DataLayer.ADDRESS):
            max_rank = min(29, self.max_rank)
            restrict.append(lambda: no_index(t.c.rank_address).between(26, max_rank))
            if self.max_rank == 30:
                restrict.append(lambda: sa.func.IsAddressPoint(t))
        if self.layer_enabled(DataLayer.POI) and self.max_rank == 30:
            restrict.append(lambda: sa.and_(no_index(t.c.rank_search) == 30,
                                            t.c.class_.not_in(('place', 'building')),
                                            sa.not_(t.c.geometry.is_line_like())))
        if self.has_feature_layers():
            restrict.append(sa.and_(no_index(t.c.rank_search).between(26, MAX_RANK_PARAM),
                                    no_index(t.c.rank_address) == 0,
                                    self._filter_by_layer(t)))

        if not restrict:
            return None

        sql = sql.where(sa.or_(*restrict))

        # If the closest object is inside an area, then check if there is a
        # POI node nearby and return that.
        prev_row = None
        for row in await self.conn.execute(sql, self.bind_params):
            if prev_row is None:
                if row.rank_search <= 27 or row.osm_type == 'N' or row.distance > 0:
                    return row
                prev_row = row
            else:
                if row.rank_search > 27 and row.osm_type == 'N'\
                   and row.distance < 0.0001:
                    return row

        return prev_row


    async def _find_housenumber_for_street(self, parent_place_id: int) -> Optional[SaRow]:
        t = self.conn.t.placex

        def _base_query() -> SaSelect:
            return _select_from_placex(t)\
                .where(t.c.geometry.within_distance(WKT_PARAM, 0.001))\
                .where(t.c.parent_place_id == parent_place_id)\
                .where(sa.func.IsAddressPoint(t))\
                .where(t.c.indexed_status == 0)\
                .where(t.c.linked_place_id == None)\
                .order_by('distance')\
                .limit(1)

        sql: SaLambdaSelect
        if self.has_geometries():
            sql = self._add_geometry_columns(_base_query(), t.c.geometry)
        else:
            sql = sa.lambda_stmt(_base_query)

        return (await self.conn.execute(sql, self.bind_params)).one_or_none()


    async def _find_interpolation_for_street(self, parent_place_id: Optional[int],
                                             distance: float) -> Optional[SaRow]:
        t = self.conn.t.osmline

        sql = sa.select(t,
                        t.c.linegeo.ST_Distance(WKT_PARAM).label('distance'),
                        _locate_interpolation(t))\
                .where(t.c.linegeo.within_distance(WKT_PARAM, distance))\
                .where(t.c.startnumber != None)\
                .order_by('distance')\
                .limit(1)

        if parent_place_id is not None:
            sql = sql.where(t.c.parent_place_id == parent_place_id)

        inner = sql.subquery('ipol')

        sql = sa.select(inner.c.place_id, inner.c.osm_id,
                        inner.c.parent_place_id, inner.c.address,
                        _interpolated_housenumber(inner),
                        _interpolated_position(inner),
                        inner.c.postcode, inner.c.country_code,
                        inner.c.distance)

        if self.has_geometries():
            sub = sql.subquery('geom')
            sql = self._add_geometry_columns(sa.select(sub), sub.c.centroid)

        return (await self.conn.execute(sql, self.bind_params)).one_or_none()


    async def _find_tiger_number_for_street(self, parent_place_id: int) -> Optional[SaRow]:
        t = self.conn.t.tiger

        def _base_query() -> SaSelect:
            inner = sa.select(t,
                              t.c.linegeo.ST_Distance(WKT_PARAM).label('distance'),
                              _locate_interpolation(t))\
                      .where(t.c.linegeo.within_distance(WKT_PARAM, 0.001))\
                      .where(t.c.parent_place_id == parent_place_id)\
                      .order_by('distance')\
                      .limit(1)\
                      .subquery('tiger')

            return sa.select(inner.c.place_id,
                             inner.c.parent_place_id,
                             _interpolated_housenumber(inner),
                             _interpolated_position(inner),
                             inner.c.postcode,
                             inner.c.distance)

        sql: SaLambdaSelect
        if self.has_geometries():
            sub = _base_query().subquery('geom')
            sql = self._add_geometry_columns(sa.select(sub), sub.c.centroid)
        else:
            sql = sa.lambda_stmt(_base_query)

        return (await self.conn.execute(sql, self.bind_params)).one_or_none()


    async def lookup_street_poi(self) -> Tuple[Optional[SaRow], RowFunc]:
        """ Find a street or POI/address for the given WKT point.
        """
        log().section('Reverse lookup on street/address level')
        distance = 0.006
        parent_place_id = None

        row = await self._find_closest_street_or_poi(distance)
        row_func: RowFunc = nres.create_from_placex_row
        log().var_dump('Result (street/building)', row)

        # If the closest result was a street, but an address was requested,
        # check for a housenumber nearby which is part of the street.
        if row is not None:
            if self.max_rank > 27 \
               and self.layer_enabled(DataLayer.ADDRESS) \
               and row.rank_address <= 27:
                distance = 0.001
                parent_place_id = row.place_id
                log().comment('Find housenumber for street')
                addr_row = await self._find_housenumber_for_street(parent_place_id)
                log().var_dump('Result (street housenumber)', addr_row)

                if addr_row is not None:
                    row = addr_row
                    row_func = nres.create_from_placex_row
                    distance = addr_row.distance
                elif row.country_code == 'us' and parent_place_id is not None:
                    log().comment('Find TIGER housenumber for street')
                    addr_row = await self._find_tiger_number_for_street(parent_place_id)
                    log().var_dump('Result (street Tiger housenumber)', addr_row)

                    if addr_row is not None:
                        row_func = cast(RowFunc,
                                        functools.partial(nres.create_from_tiger_row,
                                                          osm_type=row.osm_type,
                                                          osm_id=row.osm_id))
                        row = addr_row
            else:
                distance = row.distance

        # Check for an interpolation that is either closer than our result
        # or belongs to a close street found.
        if self.max_rank > 27 and self.layer_enabled(DataLayer.ADDRESS):
            log().comment('Find interpolation for street')
            addr_row = await self._find_interpolation_for_street(parent_place_id,
                                                                 distance)
            log().var_dump('Result (street interpolation)', addr_row)
            if addr_row is not None:
                row = addr_row
                row_func = nres.create_from_osmline_row

        return row, row_func


    async def _lookup_area_address(self) -> Optional[SaRow]:
        """ Lookup large addressable areas for the given WKT point.
        """
        log().comment('Reverse lookup by larger address area features')
        t = self.conn.t.placex

        def _base_query() -> SaSelect:
            # The inner SQL brings results in the right order, so that
            # later only a minimum number of results needs to be checked
            # with ST_Contains.
            inner = sa.select(t, sa.literal(0.0).label('distance'))\
                      .where(t.c.rank_search.between(5, MAX_RANK_PARAM))\
                      .where(t.c.geometry.intersects(WKT_PARAM))\
                      .where(sa.func.PlacexGeometryReverseLookuppolygon())\
                      .order_by(sa.desc(t.c.rank_search))\
                      .limit(50)\
                      .subquery('area')

            return _select_from_placex(inner, False)\
                .where(inner.c.geometry.ST_Contains(WKT_PARAM))\
                .order_by(sa.desc(inner.c.rank_search))\
                .limit(1)

        sql: SaLambdaSelect = sa.lambda_stmt(_base_query)
        if self.has_geometries():
            sql = self._add_geometry_columns(sql, sa.literal_column('area.geometry'))

        address_row = (await self.conn.execute(sql, self.bind_params)).one_or_none()
        log().var_dump('Result (area)', address_row)

        if address_row is not None and address_row.rank_search < self.max_rank:
            log().comment('Search for better matching place nodes inside the area')

            address_rank = address_row.rank_search
            address_id = address_row.place_id

            def _place_inside_area_query() -> SaSelect:
                inner = \
                    sa.select(t,
                              t.c.geometry.ST_Distance(WKT_PARAM).label('distance'))\
                      .where(t.c.rank_search > address_rank)\
                      .where(t.c.rank_search <= MAX_RANK_PARAM)\
                      .where(t.c.indexed_status == 0)\
                      .where(sa.func.IntersectsReverseDistance(t, WKT_PARAM))\
                      .order_by(sa.desc(t.c.rank_search))\
                      .limit(50)\
                      .subquery('places')

                touter = t.alias('outer')
                return _select_from_placex(inner, False)\
                    .join(touter, touter.c.geometry.ST_Contains(inner.c.geometry))\
                    .where(touter.c.place_id == address_id)\
                    .where(sa.func.IsBelowReverseDistance(inner.c.distance, inner.c.rank_search))\
                    .order_by(sa.desc(inner.c.rank_search), inner.c.distance)\
                    .limit(1)

            if self.has_geometries():
                sql = self._add_geometry_columns(_place_inside_area_query(),
                                                 sa.literal_column('places.geometry'))
            else:
                sql = sa.lambda_stmt(_place_inside_area_query)

            place_address_row = (await self.conn.execute(sql, self.bind_params)).one_or_none()
            log().var_dump('Result (place node)', place_address_row)

            if place_address_row is not None:
                return place_address_row

        return address_row


    async def _lookup_area_others(self) -> Optional[SaRow]:
        t = self.conn.t.placex

        inner = sa.select(t, t.c.geometry.ST_Distance(WKT_PARAM).label('distance'))\
                  .where(t.c.rank_address == 0)\
                  .where(t.c.rank_search.between(5, MAX_RANK_PARAM))\
                  .where(t.c.name != None)\
                  .where(t.c.indexed_status == 0)\
                  .where(t.c.linked_place_id == None)\
                  .where(self._filter_by_layer(t))\
                  .where(t.c.geometry.intersects(sa.func.ST_Expand(WKT_PARAM, 0.007)))\
                  .order_by(sa.desc(t.c.rank_search))\
                  .order_by('distance')\
                  .limit(50)\
                  .subquery()

        sql = _select_from_placex(inner, False)\
            .where(sa.or_(sa.not_(inner.c.geometry.is_area()),
                          inner.c.geometry.ST_Contains(WKT_PARAM)))\
            .order_by(sa.desc(inner.c.rank_search), inner.c.distance)\
            .limit(1)

        if self.has_geometries():
            sql = self._add_geometry_columns(sql, inner.c.geometry)

        row = (await self.conn.execute(sql, self.bind_params)).one_or_none()
        log().var_dump('Result (non-address feature)', row)

        return row


    async def lookup_area(self) -> Optional[SaRow]:
        """ Lookup large areas for the current search.
        """
        log().section('Reverse lookup by larger area features')

        if self.layer_enabled(DataLayer.ADDRESS):
            address_row = await self._lookup_area_address()
        else:
            address_row = None

        if self.has_feature_layers():
            other_row = await self._lookup_area_others()
        else:
            other_row = None

        return _get_closest(address_row, other_row)


    async def lookup_country_codes(self) -> List[str]:
        """ Lookup the country codes for the current search.
        """
        log().section('Reverse lookup by country code')
        t = self.conn.t.country_grid
        sql = sa.select(t.c.country_code).distinct()\
                .where(t.c.geometry.ST_Contains(WKT_PARAM))

        ccodes = [cast(str, r[0]) for r in await self.conn.execute(sql, self.bind_params)]
        log().var_dump('Country codes', ccodes)
        return ccodes


    async def lookup_country(self, ccodes: List[str]) -> Optional[SaRow]:
        """ Lookup the country for the current search.
        """
        if not ccodes:
            ccodes = await self.lookup_country_codes()

        if not ccodes:
            return None

        t = self.conn.t.placex
        if self.max_rank > 4:
            log().comment('Search for place nodes in country')

            def _base_query() -> SaSelect:
                inner = \
                    sa.select(t,
                              t.c.geometry.ST_Distance(WKT_PARAM).label('distance'))\
                      .where(t.c.rank_search > 4)\
                      .where(t.c.rank_search <= MAX_RANK_PARAM)\
                      .where(t.c.indexed_status == 0)\
                      .where(t.c.country_code.in_(ccodes))\
                      .where(sa.func.IntersectsReverseDistance(t, WKT_PARAM))\
                      .order_by(sa.desc(t.c.rank_search))\
                      .limit(50)\
                      .subquery('area')

                return _select_from_placex(inner, False)\
                    .where(sa.func.IsBelowReverseDistance(inner.c.distance, inner.c.rank_search))\
                    .order_by(sa.desc(inner.c.rank_search), inner.c.distance)\
                    .limit(1)

            sql: SaLambdaSelect
            if self.has_geometries():
                sql = self._add_geometry_columns(_base_query(),
                                                 sa.literal_column('area.geometry'))
            else:
                sql = sa.lambda_stmt(_base_query)

            address_row = (await self.conn.execute(sql, self.bind_params)).one_or_none()
            log().var_dump('Result (addressable place node)', address_row)
        else:
            address_row = None

        if address_row is None:
            # Still nothing, then return a country with the appropriate country code.
            def _country_base_query() -> SaSelect:
                return _select_from_placex(t)\
                    .where(t.c.country_code.in_(ccodes))\
                    .where(t.c.rank_address == 4)\
                    .where(t.c.rank_search == 4)\
                    .where(t.c.linked_place_id == None)\
                    .order_by('distance')\
                    .limit(1)

            if self.has_geometries():
                sql = self._add_geometry_columns(_country_base_query(), t.c.geometry)
            else:
                sql = sa.lambda_stmt(_country_base_query)

            address_row = (await self.conn.execute(sql, self.bind_params)).one_or_none()

        return address_row


    async def lookup(self, coord: AnyPoint) -> Optional[nres.ReverseResult]:
        """ Look up a single coordinate. Returns the place information,
            if a place was found near the coordinates, or None otherwise.
        """
        log().function('reverse_lookup', coord=coord, params=self.params)

        self.bind_params['wkt'] = f'POINT({coord[0]} {coord[1]})'

        row: Optional[SaRow] = None
        row_func: RowFunc = nres.create_from_placex_row

        if self.max_rank >= 26:
            row, tmp_row_func = await self.lookup_street_poi()
            if row is not None:
                row_func = tmp_row_func

        if row is None:
            if self.restrict_to_country_areas:
                ccodes = await self.lookup_country_codes()
                if not ccodes:
                    return None
            else:
                ccodes = []

            if self.max_rank > 4:
                row = await self.lookup_area()
            if row is None and self.layer_enabled(DataLayer.ADDRESS):
                row = await self.lookup_country(ccodes)

        result = row_func(row, nres.ReverseResult)
        if result is not None:
            assert row is not None
            result.distance = row.distance
            if hasattr(row, 'bbox'):
                result.bbox = Bbox.from_wkb(row.bbox)
            await nres.add_result_details(self.conn, [result], self.params)

        return result
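
# A hedged usage sketch for the class as a whole; the surrounding setup is
# an assumption based on the public API of this commit, not shown here:
#
#   api = NominatimAPIAsync(project_dir)
#   async with api.begin() as conn:
#       geocoder = ReverseGeocoder(conn, ReverseDetails(max_rank=30))
#       result = await geocoder.lookup((8.6821, 50.1109))   # lon, lat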
15
src/nominatim_api/search/__init__.py
Normal file
@@ -0,0 +1,15 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Module for forward search.
"""
# pylint: disable=useless-import-alias

from .geocoder import (ForwardGeocoder as ForwardGeocoder)
from .query import (Phrase as Phrase,
                    PhraseType as PhraseType)
from .query_analyzer_factory import (make_query_analyzer as make_query_analyzer)
459
src/nominatim_api/search/db_search_builder.py
Normal file
@@ -0,0 +1,459 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Conversion from token assignment to an abstract DB search.
"""
from typing import Optional, List, Tuple, Iterator, Dict
import heapq

from ..types import SearchDetails, DataLayer
from .query import QueryStruct, Token, TokenType, TokenRange, BreakType
from .token_assignment import TokenAssignment
from . import db_search_fields as dbf
from . import db_searches as dbs
from . import db_search_lookups as lookups


def wrap_near_search(categories: List[Tuple[str, str]],
                     search: dbs.AbstractSearch) -> dbs.NearSearch:
    """ Create a new search that wraps the given search in a search
        for near places of the given category.
    """
    return dbs.NearSearch(penalty=search.penalty,
                          categories=dbf.WeightedCategories(categories,
                                                            [0.0] * len(categories)),
                          search=search)


def build_poi_search(category: List[Tuple[str, str]],
                     countries: Optional[List[str]]) -> dbs.PoiSearch:
    """ Create a new search for places of the given category, possibly
        constrained to the given countries.
    """
    if countries:
        ccs = dbf.WeightedStrings(countries, [0.0] * len(countries))
    else:
        ccs = dbf.WeightedStrings([], [])

    class _PoiData(dbf.SearchData):
        penalty = 0.0
        qualifiers = dbf.WeightedCategories(category, [0.0] * len(category))
        countries = ccs

    return dbs.PoiSearch(_PoiData())


class SearchBuilder:
    """ Build the abstract search queries from token assignments.
    """

    def __init__(self, query: QueryStruct, details: SearchDetails) -> None:
        self.query = query
        self.details = details


    @property
    def configured_for_country(self) -> bool:
        """ Return true if the search details are configured to
            allow countries in the result.
        """
        return self.details.min_rank <= 4 and self.details.max_rank >= 4 \
               and self.details.layer_enabled(DataLayer.ADDRESS)


    @property
    def configured_for_postcode(self) -> bool:
        """ Return true if the search details are configured to
            allow postcodes in the result.
        """
        return self.details.min_rank <= 5 and self.details.max_rank >= 11\
               and self.details.layer_enabled(DataLayer.ADDRESS)


    @property
    def configured_for_housenumbers(self) -> bool:
        """ Return true if the search details are configured to
            allow addresses in the result.
        """
        return self.details.max_rank >= 30 \
               and self.details.layer_enabled(DataLayer.ADDRESS)


    def build(self, assignment: TokenAssignment) -> Iterator[dbs.AbstractSearch]:
        """ Yield all possible abstract searches for the given token assignment.
        """
        sdata = self.get_search_data(assignment)
        if sdata is None:
            return

        near_items = self.get_near_items(assignment)
        if near_items is not None and not near_items:
            return  # impossible combination of near items and category parameter

        if assignment.name is None:
            if near_items and not sdata.postcodes:
                sdata.qualifiers = near_items
                near_items = None
                builder = self.build_poi_search(sdata)
            elif assignment.housenumber:
                hnr_tokens = self.query.get_tokens(assignment.housenumber,
                                                   TokenType.HOUSENUMBER)
                builder = self.build_housenumber_search(sdata, hnr_tokens, assignment.address)
            else:
                builder = self.build_special_search(sdata, assignment.address,
                                                    bool(near_items))
        else:
            builder = self.build_name_search(sdata, assignment.name, assignment.address,
                                             bool(near_items))

        if near_items:
            penalty = min(near_items.penalties)
            near_items.penalties = [p - penalty for p in near_items.penalties]
            for search in builder:
                search_penalty = search.penalty
                search.penalty = 0.0
                yield dbs.NearSearch(penalty + assignment.penalty + search_penalty,
                                     near_items, search)
        else:
            for search in builder:
                search.penalty += assignment.penalty
                yield search


    def build_poi_search(self, sdata: dbf.SearchData) -> Iterator[dbs.AbstractSearch]:
        """ Build an abstract search query for a simple category search.
            This kind of search requires an additional geographic constraint.
        """
        if not sdata.housenumbers \
           and ((self.details.viewbox and self.details.bounded_viewbox) or self.details.near):
            yield dbs.PoiSearch(sdata)


    def build_special_search(self, sdata: dbf.SearchData,
                             address: List[TokenRange],
                             is_category: bool) -> Iterator[dbs.AbstractSearch]:
        """ Build abstract search queries for searches that do not involve
            a named place.
        """
        if sdata.qualifiers:
            # No special searches over qualifiers supported.
            return

        if sdata.countries and not address and not sdata.postcodes \
           and self.configured_for_country:
            yield dbs.CountrySearch(sdata)

        if sdata.postcodes and (is_category or self.configured_for_postcode):
            penalty = 0.0 if sdata.countries else 0.1
            if address:
                sdata.lookups = [dbf.FieldLookup('nameaddress_vector',
                                                 [t.token for r in address
                                                  for t in self.query.get_partials_list(r)],
                                                 lookups.Restrict)]
                penalty += 0.2
            yield dbs.PostcodeSearch(penalty, sdata)


    def build_housenumber_search(self, sdata: dbf.SearchData, hnrs: List[Token],
                                 address: List[TokenRange]) -> Iterator[dbs.AbstractSearch]:
        """ Build a simple address search for special entries where the
            housenumber is the main name token.
        """
        sdata.lookups = [dbf.FieldLookup('name_vector', [t.token for t in hnrs], lookups.LookupAny)]
        expected_count = sum(t.count for t in hnrs)

        partials = {t.token: t.addr_count for trange in address
                    for t in self.query.get_partials_list(trange)}

        if expected_count < 8000:
            sdata.lookups.append(dbf.FieldLookup('nameaddress_vector',
                                                 list(partials), lookups.Restrict))
        elif len(partials) != 1 or list(partials.values())[0] < 10000:
            sdata.lookups.append(dbf.FieldLookup('nameaddress_vector',
                                                 list(partials), lookups.LookupAll))
        else:
            addr_fulls = [t.token for t
                          in self.query.get_tokens(address[0], TokenType.WORD)]
            if len(addr_fulls) > 5:
                return
            sdata.lookups.append(
                dbf.FieldLookup('nameaddress_vector', addr_fulls, lookups.LookupAny))

        sdata.housenumbers = dbf.WeightedStrings([], [])
        yield dbs.PlaceSearch(0.05, sdata, expected_count)


    def build_name_search(self, sdata: dbf.SearchData,
                          name: TokenRange, address: List[TokenRange],
                          is_category: bool) -> Iterator[dbs.AbstractSearch]:
        """ Build abstract search queries for simple name or address searches.
        """
        if is_category or not sdata.housenumbers or self.configured_for_housenumbers:
            ranking = self.get_name_ranking(name)
            name_penalty = ranking.normalize_penalty()
            if ranking.rankings:
                sdata.rankings.append(ranking)
            for penalty, count, lookup in self.yield_lookups(name, address):
                sdata.lookups = lookup
                yield dbs.PlaceSearch(penalty + name_penalty, sdata, count)


    def yield_lookups(self, name: TokenRange, address: List[TokenRange])\
            -> Iterator[Tuple[float, int, List[dbf.FieldLookup]]]:
        """ Yield all variants of how the given name and address should best
            be searched for. This takes into account how frequent the terms
            are and tries to find a lookup that optimizes index use.
        """
        penalty = 0.0  # extra penalty
        name_partials = {t.token: t for t in self.query.get_partials_list(name)}

        addr_partials = [t for r in address for t in self.query.get_partials_list(r)]
        addr_tokens = list({t.token for t in addr_partials})

        partials_indexed = all(t.is_indexed for t in name_partials.values()) \
                           and all(t.is_indexed for t in addr_partials)
        exp_count = min(t.count for t in name_partials.values()) / (2**(len(name_partials) - 1))

        if (len(name_partials) > 3 or exp_count < 8000) and partials_indexed:
            yield penalty, exp_count, dbf.lookup_by_names(list(name_partials.keys()), addr_tokens)
            return

        addr_count = min(t.addr_count for t in addr_partials) if addr_partials else 30000
        # Partial terms are too frequent. Try looking up by rare full names first.
        name_fulls = self.query.get_tokens(name, TokenType.WORD)
        if name_fulls:
            fulls_count = sum(t.count for t in name_fulls)
            if partials_indexed:
                penalty += 1.2 * sum(t.penalty for t in addr_partials if not t.is_indexed)

            if fulls_count < 50000 or addr_count < 30000:
                yield penalty, fulls_count / (2**len(addr_tokens)), \
                    self.get_full_name_ranking(name_fulls, addr_partials,
                                               fulls_count > 30000 / max(1, len(addr_tokens)))

        # To catch remaining results, look up by name and address.
        # We only do this if there is a reasonable number of results expected.
        exp_count = exp_count / (2**len(addr_tokens)) if addr_tokens else exp_count
        if exp_count < 10000 and addr_count < 20000\
           and all(t.is_indexed for t in name_partials.values()):
            penalty += 0.35 * max(1 if name_fulls else 0.1,
                                  5 - len(name_partials) - len(addr_tokens))
            yield penalty, exp_count,\
                self.get_name_address_ranking(list(name_partials.keys()), addr_partials)
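
# Back-of-the-envelope behind exp_count (made-up frequencies): each extra
# name partial roughly halves the candidate set, so for partial counts
# [12000, 9000, 20000] the estimate is min(...) / 2**(3 - 1) = 9000 / 4
# = 2250, which is below the 8000 cutoff and keeps the cheap
# lookup-by-partials path viable.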


    def get_name_address_ranking(self, name_tokens: List[int],
                                 addr_partials: List[Token]) -> List[dbf.FieldLookup]:
        """ Create a ranking expression looking up by name and address.
        """
        lookup = [dbf.FieldLookup('name_vector', name_tokens, lookups.LookupAll)]

        addr_restrict_tokens = []
        addr_lookup_tokens = []
        for t in addr_partials:
            if t.is_indexed:
                if t.addr_count > 20000:
                    addr_restrict_tokens.append(t.token)
                else:
                    addr_lookup_tokens.append(t.token)

        if addr_restrict_tokens:
            lookup.append(dbf.FieldLookup('nameaddress_vector',
                                          addr_restrict_tokens, lookups.Restrict))
        if addr_lookup_tokens:
            lookup.append(dbf.FieldLookup('nameaddress_vector',
                                          addr_lookup_tokens, lookups.LookupAll))

        return lookup


    def get_full_name_ranking(self, name_fulls: List[Token], addr_partials: List[Token],
                              use_lookup: bool) -> List[dbf.FieldLookup]:
        """ Create a ranking expression with full name terms and
            additional address lookup. When 'use_lookup' is true,
            address lookups will use the index, provided the occurrences
            are not too many.
        """
        # At this point drop unindexed partials from the address.
        # This might yield wrong results, nothing we can do about that.
        if use_lookup:
            addr_restrict_tokens = []
            addr_lookup_tokens = []
            for t in addr_partials:
                if t.is_indexed:
                    if t.addr_count > 20000:
                        addr_restrict_tokens.append(t.token)
                    else:
                        addr_lookup_tokens.append(t.token)
        else:
            addr_restrict_tokens = [t.token for t in addr_partials if t.is_indexed]
            addr_lookup_tokens = []

        return dbf.lookup_by_any_name([t.token for t in name_fulls],
                                      addr_restrict_tokens, addr_lookup_tokens)


    def get_name_ranking(self, trange: TokenRange,
                         db_field: str = 'name_vector') -> dbf.FieldRanking:
        """ Create a ranking expression for a name term in the given range.
        """
        name_fulls = self.query.get_tokens(trange, TokenType.WORD)
        ranks = [dbf.RankedTokens(t.penalty, [t.token]) for t in name_fulls]
        ranks.sort(key=lambda r: r.penalty)
        # Fallback, sum of penalty for partials
        name_partials = self.query.get_partials_list(trange)
        default = sum(t.penalty for t in name_partials) + 0.2
        return dbf.FieldRanking(db_field, default, ranks)


    def get_addr_ranking(self, trange: TokenRange) -> dbf.FieldRanking:
        """ Create a list of ranking expressions for an address term
            for the given ranges.
        """
        todo: List[Tuple[int, int, dbf.RankedTokens]] = []
        heapq.heappush(todo, (0, trange.start, dbf.RankedTokens(0.0, [])))
        ranks: List[dbf.RankedTokens] = []

        while todo:  # pylint: disable=too-many-nested-blocks
            neglen, pos, rank = heapq.heappop(todo)
            for tlist in self.query.nodes[pos].starting:
                if tlist.ttype in (TokenType.PARTIAL, TokenType.WORD):
                    if tlist.end < trange.end:
                        chgpenalty = PENALTY_WORDCHANGE[self.query.nodes[tlist.end].btype]
                        if tlist.ttype == TokenType.PARTIAL:
                            penalty = rank.penalty + chgpenalty \
                                      + max(t.penalty for t in tlist.tokens)
                            heapq.heappush(todo, (neglen - 1, tlist.end,
                                                  dbf.RankedTokens(penalty, rank.tokens)))
                        else:
                            for t in tlist.tokens:
                                heapq.heappush(todo, (neglen - 1, tlist.end,
                                                      rank.with_token(t, chgpenalty)))
                    elif tlist.end == trange.end:
                        if tlist.ttype == TokenType.PARTIAL:
                            ranks.append(dbf.RankedTokens(rank.penalty
                                                          + max(t.penalty for t in tlist.tokens),
                                                          rank.tokens))
                        else:
                            ranks.extend(rank.with_token(t, 0.0) for t in tlist.tokens)
                        if len(ranks) >= 10:
                            # Too many variants, bail out and only add the
                            # worst-case fallback: the sum of the partial penalties.
                            name_partials = self.query.get_partials_list(trange)
                            default = sum(t.penalty for t in name_partials) + 0.2
                            ranks.append(dbf.RankedTokens(rank.penalty + default, []))
                            # Bail out of the outer loop as well.
                            todo.clear()
                            break

        ranks.sort(key=lambda r: len(r.tokens))
        default = ranks[0].penalty + 0.3
        del ranks[0]
        ranks.sort(key=lambda r: r.penalty)

        return dbf.FieldRanking('nameaddress_vector', default, ranks)


    def get_search_data(self, assignment: TokenAssignment) -> Optional[dbf.SearchData]:
        """ Collect the tokens for the non-name search fields in the
            assignment.
        """
        sdata = dbf.SearchData()
        sdata.penalty = assignment.penalty
        if assignment.country:
            tokens = self.get_country_tokens(assignment.country)
            if not tokens:
                return None
            sdata.set_strings('countries', tokens)
        elif self.details.countries:
            sdata.countries = dbf.WeightedStrings(self.details.countries,
                                                  [0.0] * len(self.details.countries))
        if assignment.housenumber:
            sdata.set_strings('housenumbers',
                              self.query.get_tokens(assignment.housenumber,
                                                    TokenType.HOUSENUMBER))
        if assignment.postcode:
            sdata.set_strings('postcodes',
                              self.query.get_tokens(assignment.postcode,
                                                    TokenType.POSTCODE))
        if assignment.qualifier:
            tokens = self.get_qualifier_tokens(assignment.qualifier)
            if not tokens:
                return None
            sdata.set_qualifiers(tokens)
        elif self.details.categories:
            sdata.qualifiers = dbf.WeightedCategories(self.details.categories,
                                                      [0.0] * len(self.details.categories))

        if assignment.address:
            if not assignment.name and assignment.housenumber:
                # housenumber search: the first item needs to be handled like
                # a name in ranking or penalties are not comparable with
                # normal searches.
                sdata.set_ranking([self.get_name_ranking(assignment.address[0],
                                                         db_field='nameaddress_vector')]
                                  + [self.get_addr_ranking(r) for r in assignment.address[1:]])
            else:
                sdata.set_ranking([self.get_addr_ranking(r) for r in assignment.address])
        else:
            sdata.rankings = []

        return sdata


    def get_country_tokens(self, trange: TokenRange) -> List[Token]:
        """ Return the list of country tokens for the given range,
            optionally filtered by the country list from the details
            parameters.
        """
        tokens = self.query.get_tokens(trange, TokenType.COUNTRY)
        if self.details.countries:
            tokens = [t for t in tokens if t.lookup_word in self.details.countries]

        return tokens


    def get_qualifier_tokens(self, trange: TokenRange) -> List[Token]:
        """ Return the list of qualifier tokens for the given range,
            optionally filtered by the qualifier list from the details
            parameters.
        """
        tokens = self.query.get_tokens(trange, TokenType.QUALIFIER)
        if self.details.categories:
            tokens = [t for t in tokens if t.get_category() in self.details.categories]

        return tokens


    def get_near_items(self, assignment: TokenAssignment) -> Optional[dbf.WeightedCategories]:
        """ Collect tokens for near items search or use the categories
            requested per parameter.
            Returns None if no category search is requested.
        """
        if assignment.near_item:
            tokens: Dict[Tuple[str, str], float] = {}
            for t in self.query.get_tokens(assignment.near_item, TokenType.NEAR_ITEM):
                cat = t.get_category()
                # The category of a near search will be that of near_item.
                # Thus, if the search is restricted by a category parameter,
                # the two sets must intersect.
                if (not self.details.categories or cat in self.details.categories)\
                   and t.penalty < tokens.get(cat, 1000.0):
                    tokens[cat] = t.penalty
            return dbf.WeightedCategories(list(tokens.keys()), list(tokens.values()))

        return None


PENALTY_WORDCHANGE = {
    BreakType.START: 0.0,
    BreakType.END: 0.0,
    BreakType.PHRASE: 0.0,
    BreakType.WORD: 0.1,
    BreakType.PART: 0.2,
    BreakType.TOKEN: 0.4
}
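
# PENALTY_WORDCHANGE prices a token switch at each break type: free at
# string and phrase boundaries, 0.1 between whole words, and progressively
# more expensive the deeper the split sits inside a word, e.g.
#
#   PENALTY_WORDCHANGE[BreakType.WORD]    # -> 0.1
#   PENALTY_WORDCHANGE[BreakType.TOKEN]   # -> 0.4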
254
src/nominatim_api/search/db_search_fields.py
Normal file
@@ -0,0 +1,254 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Data structures for more complex fields in abstract search descriptions.
"""
from typing import List, Tuple, Iterator, Dict, Type
import dataclasses

import sqlalchemy as sa

from nominatim_core.typing import SaFromClause, SaColumn, SaExpression
from nominatim_core.utils.json_writer import JsonWriter
from .query import Token
from . import db_search_lookups as lookups


@dataclasses.dataclass
class WeightedStrings:
    """ A list of strings together with a penalty.
    """
    values: List[str]
    penalties: List[float]

    def __bool__(self) -> bool:
        return bool(self.values)


    def __iter__(self) -> Iterator[Tuple[str, float]]:
        return iter(zip(self.values, self.penalties))


    def get_penalty(self, value: str, default: float = 1000.0) -> float:
        """ Get the penalty for the given value. Returns the given default
            if the value does not exist.
        """
        try:
            return self.penalties[self.values.index(value)]
        except ValueError:
            pass
        return default
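
# Minimal illustration of WeightedStrings (made-up values):
#
#   ws = WeightedStrings(['de', 'fr'], [0.0, 0.2])
#   ws.get_penalty('fr')            # -> 0.2
#   ws.get_penalty('it')            # -> 1000.0, the default for unknown values
#   bool(WeightedStrings([], []))   # -> False, empty lists are falsy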


@dataclasses.dataclass
class WeightedCategories:
    """ A list of class/type tuples together with a penalty.
    """
    values: List[Tuple[str, str]]
    penalties: List[float]

    def __bool__(self) -> bool:
        return bool(self.values)


    def __iter__(self) -> Iterator[Tuple[Tuple[str, str], float]]:
        return iter(zip(self.values, self.penalties))


    def get_penalty(self, value: Tuple[str, str], default: float = 1000.0) -> float:
        """ Get the penalty for the given value. Returns the given default
            if the value does not exist.
        """
        try:
            return self.penalties[self.values.index(value)]
        except ValueError:
            pass
        return default


    def sql_restrict(self, table: SaFromClause) -> SaExpression:
        """ Return an SQLAlchemy expression that restricts the
            class and type columns of the given table to the values
            in the list.
            Must not be used with an empty list.
        """
        assert self.values
        if len(self.values) == 1:
            return sa.and_(table.c.class_ == self.values[0][0],
                           table.c.type == self.values[0][1])

        return sa.or_(*(sa.and_(table.c.class_ == c, table.c.type == t)
                        for c, t in self.values))


@dataclasses.dataclass(order=True)
class RankedTokens:
    """ List of tokens together with the penalty of using it.
    """
    penalty: float
    tokens: List[int]

    def with_token(self, t: Token, transition_penalty: float) -> 'RankedTokens':
        """ Create a new RankedTokens list with the given token appended.
            The token's penalty as well as the given transition penalty
            are added to the overall penalty.
        """
        return RankedTokens(self.penalty + t.penalty + transition_penalty,
                            self.tokens + [t.token])


@dataclasses.dataclass
class FieldRanking:
    """ A list of rankings to be applied sequentially until one matches.
        The matched ranking determines the penalty. If none matches, a
        default penalty is applied.
    """
    column: str
    default: float
    rankings: List[RankedTokens]

    def normalize_penalty(self) -> float:
        """ Reduce the default and ranking penalties such that the minimum
            penalty is 0. Return the penalty that was subtracted.
        """
        if self.rankings:
            min_penalty = min(self.default, min(r.penalty for r in self.rankings))
        else:
            min_penalty = self.default
        if min_penalty > 0.0:
            self.default -= min_penalty
            for ranking in self.rankings:
                ranking.penalty -= min_penalty
        return min_penalty
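
# Worked example of normalize_penalty (illustrative numbers):
#
#   fr = FieldRanking('name_vector', 0.5, [RankedTokens(0.3, [1]),
#                                          RankedTokens(0.7, [2])])
#   fr.normalize_penalty()   # -> 0.3, the minimum, is subtracted and returned
#   # afterwards: fr.default is 0.2 (up to float rounding), penalties [0.0, 0.4]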


    def sql_penalty(self, table: SaFromClause) -> SaColumn:
        """ Create an SQL expression for the rankings.
        """
        assert self.rankings

        rout = JsonWriter().start_array()
        for rank in self.rankings:
            rout.start_array().value(rank.penalty).next()
            rout.start_array()
            for token in rank.tokens:
                rout.value(token).next()
            rout.end_array()
            rout.end_array().next()
        rout.end_array()

        return sa.func.weigh_search(table.c[self.column], rout(), self.default)


@dataclasses.dataclass
class FieldLookup:
    """ A list of tokens to be searched for. The column names the database
        column to search in and the lookup_type the operator that is applied.
        'lookup_all' requires all tokens to match. 'lookup_any' requires
        one of the tokens to match. 'restrict' requires all tokens to match
        but avoids the use of indexes.
    """
    column: str
    tokens: List[int]
    lookup_type: Type[lookups.LookupType]

    def sql_condition(self, table: SaFromClause) -> SaColumn:
        """ Create an SQL expression for the given match condition.
        """
        return self.lookup_type(table, self.column, self.tokens)


class SearchData:
    """ Search fields derived from query and token assignment
        to be used with the SQL queries.
    """
    penalty: float

    lookups: List[FieldLookup] = []
    rankings: List[FieldRanking]

    housenumbers: WeightedStrings = WeightedStrings([], [])
    postcodes: WeightedStrings = WeightedStrings([], [])
    countries: WeightedStrings = WeightedStrings([], [])

    qualifiers: WeightedCategories = WeightedCategories([], [])


    def set_strings(self, field: str, tokens: List[Token]) -> None:
        """ Set one of the WeightedStrings properties from the given
            token list. Adapt the global penalty, so that the
            minimum penalty is 0.
        """
        if tokens:
            min_penalty = min(t.penalty for t in tokens)
            self.penalty += min_penalty
            wstrs = WeightedStrings([t.lookup_word for t in tokens],
                                    [t.penalty - min_penalty for t in tokens])

            setattr(self, field, wstrs)


    def set_qualifiers(self, tokens: List[Token]) -> None:
        """ Set the qualifier field from the given tokens.
        """
        if tokens:
            categories: Dict[Tuple[str, str], float] = {}
            min_penalty = 1000.0
            for t in tokens:
                min_penalty = min(min_penalty, t.penalty)
                cat = t.get_category()
                if t.penalty < categories.get(cat, 1000.0):
                    categories[cat] = t.penalty
            self.penalty += min_penalty
            self.qualifiers = WeightedCategories(list(categories.keys()),
                                                 list(categories.values()))


    def set_ranking(self, rankings: List[FieldRanking]) -> None:
        """ Set the list of rankings and normalize the ranking.
        """
        self.rankings = []
        for ranking in rankings:
            if ranking.rankings:
                self.penalty += ranking.normalize_penalty()
                self.rankings.append(ranking)
            else:
                self.penalty += ranking.default


def lookup_by_names(name_tokens: List[int], addr_tokens: List[int]) -> List[FieldLookup]:
    """ Create a lookup list where name tokens are looked up via index
        and potential address tokens are used to restrict the search further.
    """
    lookup = [FieldLookup('name_vector', name_tokens, lookups.LookupAll)]
    if addr_tokens:
        lookup.append(FieldLookup('nameaddress_vector', addr_tokens, lookups.Restrict))

    return lookup


def lookup_by_any_name(name_tokens: List[int], addr_restrict_tokens: List[int],
                       addr_lookup_tokens: List[int]) -> List[FieldLookup]:
    """ Create a lookup list where name tokens are looked up via index
        and only one of the name tokens must be present.
        Potential address tokens are used to restrict the search further.
    """
    lookup = [FieldLookup('name_vector', name_tokens, lookups.LookupAny)]
    if addr_restrict_tokens:
        lookup.append(FieldLookup('nameaddress_vector', addr_restrict_tokens, lookups.Restrict))
    if addr_lookup_tokens:
        lookup.append(FieldLookup('nameaddress_vector', addr_lookup_tokens, lookups.LookupAll))

    return lookup


def lookup_by_addr(name_tokens: List[int], addr_tokens: List[int]) -> List[FieldLookup]:
    """ Create a lookup list where address tokens are looked up via index
        and the name tokens are only used to restrict the search further.
    """
    return [FieldLookup('name_vector', name_tokens, lookups.Restrict),
            FieldLookup('nameaddress_vector', addr_tokens, lookups.LookupAll)]
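
# Hedged comparison of the three helpers (token ids are made up):
#
#   lookup_by_names([1, 2], [10])           # index scan on names, address restricts
#   lookup_by_any_name([1, 2], [10], [11])  # any-of name scan, mixed address use
#   lookup_by_addr([1], [10, 11])           # index scan on address, names restrict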
|
||||
114
src/nominatim_api/search/db_search_lookups.py
Normal file
114
src/nominatim_api/search/db_search_lookups.py
Normal file
@@ -0,0 +1,114 @@
|
||||
# SPDX-License-Identifier: GPL-3.0-or-later
|
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Implementation of lookup functions for the search_name table.
"""
from typing import List, Any

import sqlalchemy as sa
from sqlalchemy.ext.compiler import compiles

from nominatim_core.typing import SaFromClause
from nominatim_core.db.sqlalchemy_types import IntArray

# pylint: disable=consider-using-f-string

LookupType = sa.sql.expression.FunctionElement[Any]

class LookupAll(LookupType):
    """ Find all entries in search_name table that contain all of
        a given list of tokens using an index for the search.
    """
    inherit_cache = True

    def __init__(self, table: SaFromClause, column: str, tokens: List[int]) -> None:
        super().__init__(table.c.place_id, getattr(table.c, column), column,
                         sa.type_coerce(tokens, IntArray))


@compiles(LookupAll)  # type: ignore[no-untyped-call, misc]
def _default_lookup_all(element: LookupAll,
                        compiler: 'sa.Compiled', **kw: Any) -> str:
    _, col, _, tokens = list(element.clauses)
    return "(%s @> %s)" % (compiler.process(col, **kw),
                           compiler.process(tokens, **kw))


@compiles(LookupAll, 'sqlite')  # type: ignore[no-untyped-call, misc]
def _sqlite_lookup_all(element: LookupAll,
                       compiler: 'sa.Compiled', **kw: Any) -> str:
    place, col, colname, tokens = list(element.clauses)
    return "(%s IN (SELECT CAST(value as bigint) FROM"\
           " (SELECT array_intersect_fuzzy(places) as p FROM"\
           " (SELECT places FROM reverse_search_name"\
           " WHERE word IN (SELECT value FROM json_each('[' || %s || ']'))"\
           " AND column = %s"\
           " ORDER BY length(places)) as x) as u,"\
           " json_each('[' || u.p || ']'))"\
           " AND array_contains(%s, %s))"\
           % (compiler.process(place, **kw),
              compiler.process(tokens, **kw),
              compiler.process(colname, **kw),
              compiler.process(col, **kw),
              compiler.process(tokens, **kw)
              )


class LookupAny(LookupType):
    """ Find all entries that contain at least one of the given tokens.
        Use an index for the search.
    """
    inherit_cache = True

    def __init__(self, table: SaFromClause, column: str, tokens: List[int]) -> None:
        super().__init__(table.c.place_id, getattr(table.c, column), column,
                         sa.type_coerce(tokens, IntArray))

@compiles(LookupAny)  # type: ignore[no-untyped-call, misc]
def _default_lookup_any(element: LookupAny,
                        compiler: 'sa.Compiled', **kw: Any) -> str:
    _, col, _, tokens = list(element.clauses)
    return "(%s && %s)" % (compiler.process(col, **kw),
                           compiler.process(tokens, **kw))

@compiles(LookupAny, 'sqlite')  # type: ignore[no-untyped-call, misc]
def _sqlite_lookup_any(element: LookupAny,
                       compiler: 'sa.Compiled', **kw: Any) -> str:
    place, _, colname, tokens = list(element.clauses)
    return "%s IN (SELECT CAST(value as bigint) FROM"\
           " (SELECT array_union(places) as p FROM reverse_search_name"\
           " WHERE word IN (SELECT value FROM json_each('[' || %s || ']'))"\
           " AND column = %s) as u,"\
           " json_each('[' || u.p || ']'))" % (compiler.process(place, **kw),
                                               compiler.process(tokens, **kw),
                                               compiler.process(colname, **kw))


class Restrict(LookupType):
    """ Find all entries that contain all of the given tokens.
        Do not use an index for the search.
    """
    inherit_cache = True

    def __init__(self, table: SaFromClause, column: str, tokens: List[int]) -> None:
        super().__init__(getattr(table.c, column),
                         sa.type_coerce(tokens, IntArray))


@compiles(Restrict)  # type: ignore[no-untyped-call, misc]
def _default_restrict(element: Restrict,
                      compiler: 'sa.Compiled', **kw: Any) -> str:
    arg1, arg2 = list(element.clauses)
    return "(coalesce(null, %s) @> %s)" % (compiler.process(arg1, **kw),
                                           compiler.process(arg2, **kw))

@compiles(Restrict, 'sqlite')  # type: ignore[no-untyped-call, misc]
def _sqlite_restrict(element: Restrict,
                     compiler: 'sa.Compiled', **kw: Any) -> str:
    return "array_contains(%s)" % compiler.process(element.clauses, **kw)
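
# On the default (PostgreSQL) dialect the three operators compile to plain
# array expressions, e.g. against search_name.name_vector (illustrative
# rendering; exact parameter formatting depends on the compiler):
#   LookupAll  -> (name_vector @> <tokens>)                  -- indexed containment
#   LookupAny  -> (name_vector && <tokens>)                  -- indexed overlap
#   Restrict   -> (coalesce(null, name_vector) @> <tokens>)  -- bypasses the index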
874
src/nominatim_api/search/db_searches.py
Normal file
874
src/nominatim_api/search/db_searches.py
Normal file
@@ -0,0 +1,874 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Implementation of the actual database accesses for forward search.
"""
from typing import List, Tuple, AsyncIterator, Dict, Any, Callable, cast
import abc

import sqlalchemy as sa

from nominatim_core.typing import SaFromClause, SaScalarSelect, SaColumn, \
                                  SaExpression, SaSelect, SaLambdaSelect, SaRow, SaBind
from nominatim_core.db.sqlalchemy_types import Geometry, IntArray
from ..connection import SearchConnection
from ..types import SearchDetails, DataLayer, GeometryFormat, Bbox
from .. import results as nres
from .db_search_fields import SearchData, WeightedCategories

#pylint: disable=singleton-comparison,not-callable
#pylint: disable=too-many-branches,too-many-arguments,too-many-locals,too-many-statements

def no_index(expr: SaColumn) -> SaColumn:
    """ Wrap the given expression, so that the query planner will
        refrain from using the expression for index lookup.
    """
    return sa.func.coalesce(sa.null(), expr)  # pylint: disable=not-callable


def _details_to_bind_params(details: SearchDetails) -> Dict[str, Any]:
    """ Create a dictionary from search parameters that can be used
        as bind parameter for SQL execute.
    """
    return {'limit': details.max_results,
            'min_rank': details.min_rank,
            'max_rank': details.max_rank,
            'viewbox': details.viewbox,
            'viewbox2': details.viewbox_x2,
            'near': details.near,
            'near_radius': details.near_radius,
            'excluded': details.excluded,
            'countries': details.countries}


LIMIT_PARAM: SaBind = sa.bindparam('limit')
MIN_RANK_PARAM: SaBind = sa.bindparam('min_rank')
MAX_RANK_PARAM: SaBind = sa.bindparam('max_rank')
VIEWBOX_PARAM: SaBind = sa.bindparam('viewbox', type_=Geometry)
VIEWBOX2_PARAM: SaBind = sa.bindparam('viewbox2', type_=Geometry)
NEAR_PARAM: SaBind = sa.bindparam('near', type_=Geometry)
NEAR_RADIUS_PARAM: SaBind = sa.bindparam('near_radius')
COUNTRIES_PARAM: SaBind = sa.bindparam('countries')
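
# The bind parameters above are filled in at execution time from the search
# parameters, e.g. (sketch):
#   await conn.execute(sql, _details_to_bind_params(details))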


def filter_by_area(sql: SaSelect, t: SaFromClause,
                   details: SearchDetails, avoid_index: bool = False) -> SaSelect:
    """ Apply SQL statements for filtering by viewbox and near point,
        if applicable.
    """
    if details.near is not None and details.near_radius is not None:
        if details.near_radius < 0.1 and not avoid_index:
            sql = sql.where(t.c.geometry.within_distance(NEAR_PARAM, NEAR_RADIUS_PARAM))
        else:
            sql = sql.where(t.c.geometry.ST_Distance(NEAR_PARAM) <= NEAR_RADIUS_PARAM)
    if details.viewbox is not None and details.bounded_viewbox:
        sql = sql.where(t.c.geometry.intersects(VIEWBOX_PARAM,
                                                use_index=not avoid_index and
                                                          details.viewbox.area < 0.2))

    return sql


def _exclude_places(t: SaFromClause) -> Callable[[], SaExpression]:
    return lambda: t.c.place_id.not_in(sa.bindparam('excluded'))


def _select_placex(t: SaFromClause) -> SaSelect:
    return sa.select(t.c.place_id, t.c.osm_type, t.c.osm_id, t.c.name,
                     t.c.class_, t.c.type,
                     t.c.address, t.c.extratags,
                     t.c.housenumber, t.c.postcode, t.c.country_code,
                     t.c.wikipedia,
                     t.c.parent_place_id, t.c.rank_address, t.c.rank_search,
                     t.c.linked_place_id, t.c.admin_level,
                     t.c.centroid,
                     t.c.geometry.ST_Expand(0).label('bbox'))


def _add_geometry_columns(sql: SaLambdaSelect, col: SaColumn, details: SearchDetails) -> SaSelect:
    out = []

    if details.geometry_simplification > 0.0:
        col = sa.func.ST_SimplifyPreserveTopology(col, details.geometry_simplification)

    if details.geometry_output & GeometryFormat.GEOJSON:
        out.append(sa.func.ST_AsGeoJSON(col, 7).label('geometry_geojson'))
    if details.geometry_output & GeometryFormat.TEXT:
        out.append(sa.func.ST_AsText(col).label('geometry_text'))
    if details.geometry_output & GeometryFormat.KML:
        out.append(sa.func.ST_AsKML(col, 7).label('geometry_kml'))
    if details.geometry_output & GeometryFormat.SVG:
        out.append(sa.func.ST_AsSVG(col, 0, 7).label('geometry_svg'))

    return sql.add_columns(*out)


def _make_interpolation_subquery(table: SaFromClause, inner: SaFromClause,
                                 numerals: List[int], details: SearchDetails) -> SaScalarSelect:
    all_ids = sa.func.ArrayAgg(table.c.place_id)
    sql = sa.select(all_ids).where(table.c.parent_place_id == inner.c.place_id)

    if len(numerals) == 1:
        sql = sql.where(sa.between(numerals[0], table.c.startnumber, table.c.endnumber))\
                 .where((numerals[0] - table.c.startnumber) % table.c.step == 0)
    else:
        sql = sql.where(sa.or_(
            *(sa.and_(sa.between(n, table.c.startnumber, table.c.endnumber),
                      (n - table.c.startnumber) % table.c.step == 0)
              for n in numerals)))

    if details.excluded:
        sql = sql.where(_exclude_places(table))

    return sql.scalar_subquery()


def _filter_by_layer(table: SaFromClause, layers: DataLayer) -> SaColumn:
    orexpr: List[SaExpression] = []
    if layers & DataLayer.ADDRESS and layers & DataLayer.POI:
        orexpr.append(no_index(table.c.rank_address).between(1, 30))
    elif layers & DataLayer.ADDRESS:
        orexpr.append(no_index(table.c.rank_address).between(1, 29))
        orexpr.append(sa.func.IsAddressPoint(table))
    elif layers & DataLayer.POI:
        orexpr.append(sa.and_(no_index(table.c.rank_address) == 30,
                              table.c.class_.not_in(('place', 'building'))))

    if layers & DataLayer.MANMADE:
        exclude = []
        if not layers & DataLayer.RAILWAY:
            exclude.append('railway')
        if not layers & DataLayer.NATURAL:
            exclude.extend(('natural', 'water', 'waterway'))
        orexpr.append(sa.and_(table.c.class_.not_in(tuple(exclude)),
                              no_index(table.c.rank_address) == 0))
    else:
        include = []
        if layers & DataLayer.RAILWAY:
            include.append('railway')
        if layers & DataLayer.NATURAL:
            include.extend(('natural', 'water', 'waterway'))
        orexpr.append(sa.and_(table.c.class_.in_(tuple(include)),
                              no_index(table.c.rank_address) == 0))

    if len(orexpr) == 1:
        return orexpr[0]

    return sa.or_(*orexpr)
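
# Example (illustrative): DataLayer.ADDRESS | DataLayer.POI reduces to
# rank_address BETWEEN 1 AND 30, while DataLayer.POI alone additionally
# excludes the generic 'place' and 'building' classes at rank 30.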


def _interpolated_position(table: SaFromClause, nr: SaColumn) -> SaColumn:
    pos = sa.cast(nr - table.c.startnumber, sa.Float) / (table.c.endnumber - table.c.startnumber)
    return sa.case(
        (table.c.endnumber == table.c.startnumber, table.c.linegeo.ST_Centroid()),
        else_=table.c.linegeo.ST_LineInterpolatePoint(pos)).label('centroid')


async def _get_placex_housenumbers(conn: SearchConnection,
                                   place_ids: List[int],
                                   details: SearchDetails) -> AsyncIterator[nres.SearchResult]:
    t = conn.t.placex
    sql = _select_placex(t).add_columns(t.c.importance)\
                           .where(t.c.place_id.in_(place_ids))

    if details.geometry_output:
        sql = _add_geometry_columns(sql, t.c.geometry, details)

    for row in await conn.execute(sql):
        result = nres.create_from_placex_row(row, nres.SearchResult)
        assert result
        result.bbox = Bbox.from_wkb(row.bbox)
        yield result


def _int_list_to_subquery(inp: List[int]) -> 'sa.Subquery':
    """ Create a subselect that returns the given list of integers
        as rows in the column 'nr'.
    """
    vtab = sa.func.JsonArrayEach(sa.type_coerce(inp, sa.JSON))\
             .table_valued(sa.column('value', type_=sa.JSON))
    return sa.select(sa.cast(sa.cast(vtab.c.value, sa.Text), sa.Integer).label('nr')).subquery()
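
# Note: JsonArrayEach is a Nominatim-specific SQLAlchemy function; routing the
# integer list through JSON presumably keeps the subquery portable between
# PostgreSQL and SQLite instead of relying on unnest() of a native array.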


async def _get_osmline(conn: SearchConnection, place_ids: List[int],
                       numerals: List[int],
                       details: SearchDetails) -> AsyncIterator[nres.SearchResult]:
    t = conn.t.osmline

    values = _int_list_to_subquery(numerals)
    sql = sa.select(t.c.place_id, t.c.osm_id,
                    t.c.parent_place_id, t.c.address,
                    values.c.nr.label('housenumber'),
                    _interpolated_position(t, values.c.nr),
                    t.c.postcode, t.c.country_code)\
            .where(t.c.place_id.in_(place_ids))\
            .join(values, values.c.nr.between(t.c.startnumber, t.c.endnumber))

    if details.geometry_output:
        sub = sql.subquery()
        sql = _add_geometry_columns(sa.select(sub), sub.c.centroid, details)

    for row in await conn.execute(sql):
        result = nres.create_from_osmline_row(row, nres.SearchResult)
        assert result
        yield result


async def _get_tiger(conn: SearchConnection, place_ids: List[int],
                     numerals: List[int], osm_id: int,
                     details: SearchDetails) -> AsyncIterator[nres.SearchResult]:
    t = conn.t.tiger
    values = _int_list_to_subquery(numerals)
    sql = sa.select(t.c.place_id, t.c.parent_place_id,
                    sa.literal('W').label('osm_type'),
                    sa.literal(osm_id).label('osm_id'),
                    values.c.nr.label('housenumber'),
                    _interpolated_position(t, values.c.nr),
                    t.c.postcode)\
            .where(t.c.place_id.in_(place_ids))\
            .join(values, values.c.nr.between(t.c.startnumber, t.c.endnumber))

    if details.geometry_output:
        sub = sql.subquery()
        sql = _add_geometry_columns(sa.select(sub), sub.c.centroid, details)

    for row in await conn.execute(sql):
        result = nres.create_from_tiger_row(row, nres.SearchResult)
        assert result
        yield result


class AbstractSearch(abc.ABC):
    """ Encapsulation of a single lookup in the database.
    """
    SEARCH_PRIO: int = 2

    def __init__(self, penalty: float) -> None:
        self.penalty = penalty

    @abc.abstractmethod
    async def lookup(self, conn: SearchConnection,
                     details: SearchDetails) -> nres.SearchResults:
        """ Find results for the search in the database.
        """


class NearSearch(AbstractSearch):
    """ Category search of a place type near the result of another search.
    """
    def __init__(self, penalty: float, categories: WeightedCategories,
                 search: AbstractSearch) -> None:
        super().__init__(penalty)
        self.search = search
        self.categories = categories


    async def lookup(self, conn: SearchConnection,
                     details: SearchDetails) -> nres.SearchResults:
        """ Find results for the search in the database.
        """
        results = nres.SearchResults()
        base = await self.search.lookup(conn, details)

        if not base:
            return results

        base.sort(key=lambda r: (r.accuracy, r.rank_search))
        max_accuracy = base[0].accuracy + 0.5
        if base[0].rank_address == 0:
            min_rank = 0
            max_rank = 0
        elif base[0].rank_address < 26:
            min_rank = 1
            max_rank = min(25, base[0].rank_address + 4)
        else:
            min_rank = 26
            max_rank = 30
        base = nres.SearchResults(r for r in base if r.source_table == nres.SourceTable.PLACEX
                                  and r.accuracy <= max_accuracy
                                  and r.bbox and r.bbox.area < 20
                                  and r.rank_address >= min_rank
                                  and r.rank_address <= max_rank)

        if base:
            baseids = [b.place_id for b in base[:5] if b.place_id]

            for category, penalty in self.categories:
                await self.lookup_category(results, conn, baseids, category, penalty, details)
                if len(results) >= details.max_results:
                    break

        return results


    async def lookup_category(self, results: nres.SearchResults,
                              conn: SearchConnection, ids: List[int],
                              category: Tuple[str, str], penalty: float,
                              details: SearchDetails) -> None:
        """ Find places of the given category near the list of
            place ids and add the results to 'results'.
        """
        table = await conn.get_class_table(*category)

        tgeom = conn.t.placex.alias('pgeom')

        if table is None:
            # No classtype table available, do a simplified lookup in placex.
            table = conn.t.placex
            sql = sa.select(table.c.place_id,
                            sa.func.min(tgeom.c.centroid.ST_Distance(table.c.centroid))
                              .label('dist'))\
                    .join(tgeom, table.c.geometry.intersects(tgeom.c.centroid.ST_Expand(0.01)))\
                    .where(table.c.class_ == category[0])\
                    .where(table.c.type == category[1])
        else:
            # Use classtype table. We can afford to use a larger
            # radius for the lookup.
            sql = sa.select(table.c.place_id,
                            sa.func.min(tgeom.c.centroid.ST_Distance(table.c.centroid))
                              .label('dist'))\
                    .join(tgeom,
                          table.c.centroid.ST_CoveredBy(
                              sa.case((sa.and_(tgeom.c.rank_address > 9,
                                               tgeom.c.geometry.is_area()),
                                       tgeom.c.geometry),
                                      else_=tgeom.c.centroid.ST_Expand(0.05))))

        inner = sql.where(tgeom.c.place_id.in_(ids))\
                   .group_by(table.c.place_id).subquery()

        t = conn.t.placex
        sql = _select_placex(t).add_columns((-inner.c.dist).label('importance'))\
                               .join(inner, inner.c.place_id == t.c.place_id)\
                               .order_by(inner.c.dist)

        sql = sql.where(no_index(t.c.rank_address).between(MIN_RANK_PARAM, MAX_RANK_PARAM))
        if details.countries:
            sql = sql.where(t.c.country_code.in_(COUNTRIES_PARAM))
        if details.excluded:
            sql = sql.where(_exclude_places(t))
        if details.layers is not None:
            sql = sql.where(_filter_by_layer(t, details.layers))

        sql = sql.limit(LIMIT_PARAM)
        for row in await conn.execute(sql, _details_to_bind_params(details)):
            result = nres.create_from_placex_row(row, nres.SearchResult)
            assert result
            result.accuracy = self.penalty + penalty
            result.bbox = Bbox.from_wkb(row.bbox)
            results.append(result)
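
# Example flow (illustrative): for a query like "restaurant near Main Street"
# the wrapped search finds Main Street first, the five best base results are
# kept, and a distance-ordered lookup for ('amenity', 'restaurant') runs
# around their centroids via lookup_category().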


class PoiSearch(AbstractSearch):
    """ Category search in a geographic area.
    """
    def __init__(self, sdata: SearchData) -> None:
        super().__init__(sdata.penalty)
        self.qualifiers = sdata.qualifiers
        self.countries = sdata.countries


    async def lookup(self, conn: SearchConnection,
                     details: SearchDetails) -> nres.SearchResults:
        """ Find results for the search in the database.
        """
        bind_params = _details_to_bind_params(details)
        t = conn.t.placex

        rows: List[SaRow] = []

        if details.near and details.near_radius is not None and details.near_radius < 0.2:
            # simply search in placex table
            def _base_query() -> SaSelect:
                return _select_placex(t) \
                           .add_columns((-t.c.centroid.ST_Distance(NEAR_PARAM))
                                         .label('importance'))\
                           .where(t.c.linked_place_id == None) \
                           .where(t.c.geometry.within_distance(NEAR_PARAM, NEAR_RADIUS_PARAM)) \
                           .order_by(t.c.centroid.ST_Distance(NEAR_PARAM)) \
                           .limit(LIMIT_PARAM)

            classtype = self.qualifiers.values
            if len(classtype) == 1:
                cclass, ctype = classtype[0]
                sql: SaLambdaSelect = sa.lambda_stmt(lambda: _base_query()
                                                             .where(t.c.class_ == cclass)
                                                             .where(t.c.type == ctype))
            else:
                sql = _base_query().where(sa.or_(*(sa.and_(t.c.class_ == cls, t.c.type == typ)
                                                   for cls, typ in classtype)))

            if self.countries:
                sql = sql.where(t.c.country_code.in_(self.countries.values))

            if details.viewbox is not None and details.bounded_viewbox:
                sql = sql.where(t.c.geometry.intersects(VIEWBOX_PARAM))

            rows.extend(await conn.execute(sql, bind_params))
        else:
            # use the class type tables
            for category in self.qualifiers.values:
                table = await conn.get_class_table(*category)
                if table is not None:
                    sql = _select_placex(t)\
                            .add_columns(t.c.importance)\
                            .join(table, t.c.place_id == table.c.place_id)\
                            .where(t.c.class_ == category[0])\
                            .where(t.c.type == category[1])

                    if details.viewbox is not None and details.bounded_viewbox:
                        sql = sql.where(table.c.centroid.intersects(VIEWBOX_PARAM))

                    if details.near and details.near_radius is not None:
                        sql = sql.order_by(table.c.centroid.ST_Distance(NEAR_PARAM))\
                                 .where(table.c.centroid.within_distance(NEAR_PARAM,
                                                                         NEAR_RADIUS_PARAM))

                    if self.countries:
                        sql = sql.where(t.c.country_code.in_(self.countries.values))

                    sql = sql.limit(LIMIT_PARAM)
                    rows.extend(await conn.execute(sql, bind_params))

        results = nres.SearchResults()
        for row in rows:
            result = nres.create_from_placex_row(row, nres.SearchResult)
            assert result
            result.accuracy = self.penalty + self.qualifiers.get_penalty((row.class_, row.type))
            result.bbox = Bbox.from_wkb(row.bbox)
            results.append(result)

        return results
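
# Design note: sa.lambda_stmt() lets SQLAlchemy cache the compiled statement
# across calls; presumably this is only worthwhile for the common
# single-category case, so the multi-category branch builds the query directly.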


class CountrySearch(AbstractSearch):
    """ Search for a country name or country code.
    """
    SEARCH_PRIO = 0

    def __init__(self, sdata: SearchData) -> None:
        super().__init__(sdata.penalty)
        self.countries = sdata.countries


    async def lookup(self, conn: SearchConnection,
                     details: SearchDetails) -> nres.SearchResults:
        """ Find results for the search in the database.
        """
        t = conn.t.placex

        ccodes = self.countries.values
        sql = _select_placex(t)\
                .add_columns(t.c.importance)\
                .where(t.c.country_code.in_(ccodes))\
                .where(t.c.rank_address == 4)

        if details.geometry_output:
            sql = _add_geometry_columns(sql, t.c.geometry, details)

        if details.excluded:
            sql = sql.where(_exclude_places(t))

        sql = filter_by_area(sql, t, details)

        results = nres.SearchResults()
        for row in await conn.execute(sql, _details_to_bind_params(details)):
            result = nres.create_from_placex_row(row, nres.SearchResult)
            assert result
            result.accuracy = self.penalty + self.countries.get_penalty(row.country_code, 5.0)
            result.bbox = Bbox.from_wkb(row.bbox)
            results.append(result)

        if not results:
            results = await self.lookup_in_country_table(conn, details)

        if results:
            details.min_rank = min(5, details.max_rank)
            details.max_rank = min(25, details.max_rank)

        return results


    async def lookup_in_country_table(self, conn: SearchConnection,
                                      details: SearchDetails) -> nres.SearchResults:
        """ Look up the country in the fallback country tables.
        """
        # Avoid the fallback search when this is a 'more' search (i.e. with
        # excluded place ids). Country results usually are in the first batch
        # of results and it is not possible to exclude these fallbacks.
        if details.excluded:
            return nres.SearchResults()

        t = conn.t.country_name
        tgrid = conn.t.country_grid

        sql = sa.select(tgrid.c.country_code,
                        tgrid.c.geometry.ST_Centroid().ST_Collect().ST_Centroid()
                             .label('centroid'),
                        tgrid.c.geometry.ST_Collect().ST_Expand(0).label('bbox'))\
                .where(tgrid.c.country_code.in_(self.countries.values))\
                .group_by(tgrid.c.country_code)

        sql = filter_by_area(sql, tgrid, details, avoid_index=True)

        sub = sql.subquery('grid')

        sql = sa.select(t.c.country_code,
                        t.c.name.merge(t.c.derived_name).label('name'),
                        sub.c.centroid, sub.c.bbox)\
                .join(sub, t.c.country_code == sub.c.country_code)

        if details.geometry_output:
            sql = _add_geometry_columns(sql, sub.c.centroid, details)

        results = nres.SearchResults()
        for row in await conn.execute(sql, _details_to_bind_params(details)):
            result = nres.create_from_country_row(row, nres.SearchResult)
            assert result
            result.bbox = Bbox.from_wkb(row.bbox)
            result.accuracy = self.penalty + self.countries.get_penalty(row.country_code, 5.0)
            results.append(result)

        return results


class PostcodeSearch(AbstractSearch):
    """ Search for a postcode.
    """
    def __init__(self, extra_penalty: float, sdata: SearchData) -> None:
        super().__init__(sdata.penalty + extra_penalty)
        self.countries = sdata.countries
        self.postcodes = sdata.postcodes
        self.lookups = sdata.lookups
        self.rankings = sdata.rankings


    async def lookup(self, conn: SearchConnection,
                     details: SearchDetails) -> nres.SearchResults:
        """ Find results for the search in the database.
        """
        t = conn.t.postcode
        pcs = self.postcodes.values

        sql = sa.select(t.c.place_id, t.c.parent_place_id,
                        t.c.rank_search, t.c.rank_address,
                        t.c.postcode, t.c.country_code,
                        t.c.geometry.label('centroid'))\
                .where(t.c.postcode.in_(pcs))

        if details.geometry_output:
            sql = _add_geometry_columns(sql, t.c.geometry, details)

        penalty: SaExpression = sa.literal(self.penalty)

        if details.viewbox is not None and not details.bounded_viewbox:
            penalty += sa.case((t.c.geometry.intersects(VIEWBOX_PARAM), 0.0),
                               (t.c.geometry.intersects(VIEWBOX2_PARAM), 0.5),
                               else_=1.0)

        if details.near is not None:
            sql = sql.order_by(t.c.geometry.ST_Distance(NEAR_PARAM))

        sql = filter_by_area(sql, t, details)

        if self.countries:
            sql = sql.where(t.c.country_code.in_(self.countries.values))

        if details.excluded:
            sql = sql.where(_exclude_places(t))

        if self.lookups:
            assert len(self.lookups) == 1
            tsearch = conn.t.search_name
            sql = sql.where(tsearch.c.place_id == t.c.parent_place_id)\
                     .where((tsearch.c.name_vector + tsearch.c.nameaddress_vector)
                            .contains(sa.type_coerce(self.lookups[0].tokens,
                                                     IntArray)))

        for ranking in self.rankings:
            penalty += ranking.sql_penalty(conn.t.search_name)
        penalty += sa.case(*((t.c.postcode == v, p) for v, p in self.postcodes),
                           else_=1.0)


        sql = sql.add_columns(penalty.label('accuracy'))
        sql = sql.order_by('accuracy').limit(LIMIT_PARAM)

        results = nres.SearchResults()
        for row in await conn.execute(sql, _details_to_bind_params(details)):
            p = conn.t.placex
            placex_sql = _select_placex(p).add_columns(p.c.importance)\
                             .where(sa.text("""class = 'boundary'
                                               AND type = 'postal_code'
                                               AND osm_type = 'R'"""))\
                             .where(p.c.country_code == row.country_code)\
                             .where(p.c.postcode == row.postcode)\
                             .limit(1)

            if details.geometry_output:
                placex_sql = _add_geometry_columns(placex_sql, p.c.geometry, details)

            for prow in await conn.execute(placex_sql, _details_to_bind_params(details)):
                result = nres.create_from_placex_row(prow, nres.SearchResult)
                break
            else:
                result = nres.create_from_postcode_row(row, nres.SearchResult)

            assert result
            if result.place_id not in details.excluded:
                result.accuracy = row.accuracy
                results.append(result)

        return results
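
# Penalty composition above (illustrative): an exactly matching postcode adds
# its configured penalty, any other candidate falls back to 1.0, and an
# unbounded viewbox contributes 0.0, 0.5 or 1.0 depending on which box the
# geometry intersects.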


class PlaceSearch(AbstractSearch):
    """ Generic search for an address or named place.
    """
    SEARCH_PRIO = 1

    def __init__(self, extra_penalty: float, sdata: SearchData, expected_count: int) -> None:
        super().__init__(sdata.penalty + extra_penalty)
        self.countries = sdata.countries
        self.postcodes = sdata.postcodes
        self.housenumbers = sdata.housenumbers
        self.qualifiers = sdata.qualifiers
        self.lookups = sdata.lookups
        self.rankings = sdata.rankings
        self.expected_count = expected_count


    def _inner_search_name_cte(self, conn: SearchConnection,
                               details: SearchDetails) -> 'sa.CTE':
        """ Create a subquery that preselects the rows in the search_name
            table.
        """
        t = conn.t.search_name

        penalty: SaExpression = sa.literal(self.penalty)
        for ranking in self.rankings:
            penalty += ranking.sql_penalty(t)

        sql = sa.select(t.c.place_id, t.c.search_rank, t.c.address_rank,
                        t.c.country_code, t.c.centroid,
                        t.c.name_vector, t.c.nameaddress_vector,
                        sa.case((t.c.importance > 0, t.c.importance),
                                else_=0.40001-(sa.cast(t.c.search_rank, sa.Float())/75))
                          .label('importance'),
                        penalty.label('penalty'))

        for lookup in self.lookups:
            sql = sql.where(lookup.sql_condition(t))

        if self.countries:
            sql = sql.where(t.c.country_code.in_(self.countries.values))

        if self.postcodes:
            # if a postcode is given, don't search for state or country level objects
            sql = sql.where(t.c.address_rank > 9)
            if self.expected_count > 10000:
                # Many results expected. Restrict by postcode.
                tpc = conn.t.postcode
                sql = sql.where(sa.select(tpc.c.postcode)
                                  .where(tpc.c.postcode.in_(self.postcodes.values))
                                  .where(t.c.centroid.within_distance(tpc.c.geometry, 0.4))
                                  .exists())

        if details.viewbox is not None:
            if details.bounded_viewbox:
                sql = sql.where(t.c.centroid
                                   .intersects(VIEWBOX_PARAM,
                                               use_index=details.viewbox.area < 0.2))
            elif not self.postcodes and not self.housenumbers and self.expected_count >= 10000:
                sql = sql.where(t.c.centroid
                                   .intersects(VIEWBOX2_PARAM,
                                               use_index=details.viewbox.area < 0.5))

        if details.near is not None and details.near_radius is not None:
            if details.near_radius < 0.1:
                sql = sql.where(t.c.centroid.within_distance(NEAR_PARAM,
                                                             NEAR_RADIUS_PARAM))
            else:
                sql = sql.where(t.c.centroid
                                   .ST_Distance(NEAR_PARAM) < NEAR_RADIUS_PARAM)

        if self.housenumbers:
            sql = sql.where(t.c.address_rank.between(16, 30))
        else:
            if details.excluded:
                sql = sql.where(_exclude_places(t))
            if details.min_rank > 0:
                sql = sql.where(sa.or_(t.c.address_rank >= MIN_RANK_PARAM,
                                       t.c.search_rank >= MIN_RANK_PARAM))
            if details.max_rank < 30:
                sql = sql.where(sa.or_(t.c.address_rank <= MAX_RANK_PARAM,
                                       t.c.search_rank <= MAX_RANK_PARAM))

        inner = sql.limit(10000).order_by(sa.desc(sa.text('importance'))).subquery()

        sql = sa.select(inner.c.place_id, inner.c.search_rank, inner.c.address_rank,
                        inner.c.country_code, inner.c.centroid, inner.c.importance,
                        inner.c.penalty)

        # If the query is not an address search or has a geographic preference,
        # preselect most important items to restrict the number of places
        # that need to be looked up in placex.
        if not self.housenumbers\
           and (details.viewbox is None or details.bounded_viewbox)\
           and (details.near is None or details.near_radius is not None)\
           and not self.qualifiers:
            sql = sql.add_columns(sa.func.first_value(inner.c.penalty - inner.c.importance)
                                    .over(order_by=inner.c.penalty - inner.c.importance)
                                    .label('min_penalty'))

            inner = sql.subquery()

            sql = sa.select(inner.c.place_id, inner.c.search_rank, inner.c.address_rank,
                            inner.c.country_code, inner.c.centroid, inner.c.importance,
                            inner.c.penalty)\
                    .where(inner.c.penalty - inner.c.importance < inner.c.min_penalty + 0.5)

        return sql.cte('searches')
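
    # The window function above attaches the best (lowest) penalty-importance
    # score to every row; the final WHERE then keeps only rows within 0.5 of
    # that optimum, all in a single pass over the preselected rows.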


    async def lookup(self, conn: SearchConnection,
                     details: SearchDetails) -> nres.SearchResults:
        """ Find results for the search in the database.
        """
        t = conn.t.placex
        tsearch = self._inner_search_name_cte(conn, details)

        sql = _select_placex(t).join(tsearch, t.c.place_id == tsearch.c.place_id)

        if details.geometry_output:
            sql = _add_geometry_columns(sql, t.c.geometry, details)

        penalty: SaExpression = tsearch.c.penalty

        if self.postcodes:
            tpc = conn.t.postcode
            pcs = self.postcodes.values

            pc_near = sa.select(sa.func.min(tpc.c.geometry.ST_Distance(t.c.centroid)))\
                        .where(tpc.c.postcode.in_(pcs))\
                        .scalar_subquery()
            penalty += sa.case((t.c.postcode.in_(pcs), 0.0),
                               else_=sa.func.coalesce(pc_near, cast(SaColumn, 2.0)))

        if details.viewbox is not None and not details.bounded_viewbox:
            penalty += sa.case((t.c.geometry.intersects(VIEWBOX_PARAM, use_index=False), 0.0),
                               (t.c.geometry.intersects(VIEWBOX2_PARAM, use_index=False), 0.5),
                               else_=1.0)

        if details.near is not None:
            sql = sql.add_columns((-tsearch.c.centroid.ST_Distance(NEAR_PARAM))
                                   .label('importance'))
            sql = sql.order_by(sa.desc(sa.text('importance')))
        else:
            sql = sql.order_by(penalty - tsearch.c.importance)
            sql = sql.add_columns(tsearch.c.importance)


        sql = sql.add_columns(penalty.label('accuracy'))\
                 .order_by(sa.text('accuracy'))

        if self.housenumbers:
            hnr_list = '|'.join(self.housenumbers.values)
            inner = sql.where(sa.or_(tsearch.c.address_rank < 30,
                                     sa.func.RegexpWord(hnr_list, t.c.housenumber)))\
                       .subquery()

            # Housenumbers from placex
            thnr = conn.t.placex.alias('hnr')
            pid_list = sa.func.ArrayAgg(thnr.c.place_id)
            place_sql = sa.select(pid_list)\
                          .where(thnr.c.parent_place_id == inner.c.place_id)\
                          .where(sa.func.RegexpWord(hnr_list, thnr.c.housenumber))\
                          .where(thnr.c.linked_place_id == None)\
                          .where(thnr.c.indexed_status == 0)

            if details.excluded:
                place_sql = place_sql.where(thnr.c.place_id.not_in(sa.bindparam('excluded')))
            if self.qualifiers:
                place_sql = place_sql.where(self.qualifiers.sql_restrict(thnr))

            numerals = [int(n) for n in self.housenumbers.values
                        if n.isdigit() and len(n) < 8]
            interpol_sql: SaColumn
            tiger_sql: SaColumn
            if numerals and \
               (not self.qualifiers or ('place', 'house') in self.qualifiers.values):
                # Housenumbers from interpolations
                interpol_sql = _make_interpolation_subquery(conn.t.osmline, inner,
                                                            numerals, details)
                # Housenumbers from Tiger
                tiger_sql = sa.case((inner.c.country_code == 'us',
                                     _make_interpolation_subquery(conn.t.tiger, inner,
                                                                  numerals, details)
                                     ), else_=None)
            else:
                interpol_sql = sa.null()
                tiger_sql = sa.null()

            unsort = sa.select(inner, place_sql.scalar_subquery().label('placex_hnr'),
                               interpol_sql.label('interpol_hnr'),
                               tiger_sql.label('tiger_hnr')).subquery('unsort')
            sql = sa.select(unsort)\
                    .order_by(sa.case((unsort.c.placex_hnr != None, 1),
                                      (unsort.c.interpol_hnr != None, 2),
                                      (unsort.c.tiger_hnr != None, 3),
                                      else_=4),
                              unsort.c.accuracy)
        else:
            sql = sql.where(t.c.linked_place_id == None)\
                     .where(t.c.indexed_status == 0)
            if self.qualifiers:
                sql = sql.where(self.qualifiers.sql_restrict(t))
            if details.layers is not None:
                sql = sql.where(_filter_by_layer(t, details.layers))

        sql = sql.limit(LIMIT_PARAM)

        results = nres.SearchResults()
        for row in await conn.execute(sql, _details_to_bind_params(details)):
            result = nres.create_from_placex_row(row, nres.SearchResult)
            assert result
            result.bbox = Bbox.from_wkb(row.bbox)
            result.accuracy = row.accuracy
            if self.housenumbers and row.rank_address < 30:
                if row.placex_hnr:
                    subs = _get_placex_housenumbers(conn, row.placex_hnr, details)
                elif row.interpol_hnr:
                    subs = _get_osmline(conn, row.interpol_hnr, numerals, details)
                elif row.tiger_hnr:
                    subs = _get_tiger(conn, row.tiger_hnr, numerals, row.osm_id, details)
                else:
                    subs = None

                if subs is not None:
                    async for sub in subs:
                        assert sub.housenumber
                        sub.accuracy = result.accuracy
                        if not any(nr in self.housenumbers.values
                                   for nr in sub.housenumber.split(';')):
                            sub.accuracy += 0.6
                        results.append(sub)

                # Only add the street as a result, if it meets all other
                # filter conditions.
                if (not details.excluded or result.place_id not in details.excluded)\
                   and (not self.qualifiers or result.category in self.qualifiers.values)\
                   and result.rank_address >= details.min_rank:
                    result.accuracy += 1.0  # penalty for missing housenumber
                    results.append(result)
            else:
                results.append(result)

        return results
274
src/nominatim_api/search/geocoder.py
Normal file
274
src/nominatim_api/search/geocoder.py
Normal file
@@ -0,0 +1,274 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Public interface to the search code.
"""
from typing import List, Any, Optional, Iterator, Tuple, Dict
import itertools
import re
import datetime as dt
import difflib

from ..connection import SearchConnection
from ..types import SearchDetails
from ..results import SearchResult, SearchResults, add_result_details
from ..logging import log
from .token_assignment import yield_token_assignments
from .db_search_builder import SearchBuilder, build_poi_search, wrap_near_search
from .db_searches import AbstractSearch
from .query_analyzer_factory import make_query_analyzer, AbstractQueryAnalyzer
from .query import Phrase, QueryStruct

class ForwardGeocoder:
    """ Main class responsible for place search.
    """

    def __init__(self, conn: SearchConnection,
                 params: SearchDetails, timeout: Optional[int]) -> None:
        self.conn = conn
        self.params = params
        self.timeout = dt.timedelta(seconds=timeout or 1000000)
        self.query_analyzer: Optional[AbstractQueryAnalyzer] = None


    @property
    def limit(self) -> int:
        """ Return the configured maximum number of search results.
        """
        return self.params.max_results


    async def build_searches(self,
                             phrases: List[Phrase]) -> Tuple[QueryStruct, List[AbstractSearch]]:
        """ Analyse the query and return the tokenized query and list of
            possible searches over it.
        """
        if self.query_analyzer is None:
            self.query_analyzer = await make_query_analyzer(self.conn)

        query = await self.query_analyzer.analyze_query(phrases)

        searches: List[AbstractSearch] = []
        if query.num_token_slots() > 0:
            # 2. Compute all possible search interpretations
            log().section('Compute abstract searches')
            search_builder = SearchBuilder(query, self.params)
            num_searches = 0
            for assignment in yield_token_assignments(query):
                searches.extend(search_builder.build(assignment))
                if num_searches < len(searches):
                    log().table_dump('Searches for assignment',
                                     _dump_searches(searches, query, num_searches))
                num_searches = len(searches)
            searches.sort(key=lambda s: (s.penalty, s.SEARCH_PRIO))

        return query, searches


    async def execute_searches(self, query: QueryStruct,
                               searches: List[AbstractSearch]) -> SearchResults:
        """ Run the abstract searches against the database until a result
            is found.
        """
        log().section('Execute database searches')
        results: Dict[Any, SearchResult] = {}

        end_time = dt.datetime.now() + self.timeout

        min_ranking = searches[0].penalty + 2.0
        prev_penalty = 0.0
        for i, search in enumerate(searches):
            if search.penalty > prev_penalty and (search.penalty > min_ranking or i > 20):
                break
            log().table_dump(f"{i + 1}. Search", _dump_searches([search], query))
            log().var_dump('Params', self.params)
            lookup_results = await search.lookup(self.conn, self.params)
            for result in lookup_results:
                rhash = (result.source_table, result.place_id,
                         result.housenumber, result.country_code)
                prevresult = results.get(rhash)
                if prevresult:
                    prevresult.accuracy = min(prevresult.accuracy, result.accuracy)
                else:
                    results[rhash] = result
                min_ranking = min(min_ranking, result.accuracy * 1.2, 2.0)
            log().result_dump('Results', ((r.accuracy, r) for r in lookup_results))
            prev_penalty = search.penalty
            if dt.datetime.now() >= end_time:
                break

        return SearchResults(results.values())
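
    # Loop behaviour above: searches run in ascending penalty order and stop
    # at the next penalty increase once min_ranking (continuously tightened by
    # good results) is exceeded or more than 20 searches have run, or when the
    # timeout is hit.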


    def pre_filter_results(self, results: SearchResults) -> SearchResults:
        """ Remove results that are significantly worse than the
            best match.
        """
        if results:
            max_ranking = min(r.ranking for r in results) + 0.5
            results = SearchResults(r for r in results if r.ranking < max_ranking)

        return results


    def sort_and_cut_results(self, results: SearchResults) -> SearchResults:
        """ Remove badly matching results, sort by ranking and
            limit to the configured number of results.
        """
        if results:
            results.sort(key=lambda r: r.ranking)
            min_rank = results[0].rank_search
            min_ranking = results[0].ranking
            results = SearchResults(r for r in results
                                    if r.ranking + 0.03 * (r.rank_search - min_rank)
                                       < min_ranking + 0.5)

            results = SearchResults(results[:self.limit])

        return results


    def rerank_by_query(self, query: QueryStruct, results: SearchResults) -> None:
        """ Adjust the accuracy of the localized results according to how well
            they match the original query.
        """
        assert self.query_analyzer is not None
        qwords = [word for phrase in query.source
                  for word in re.split('[, ]+', phrase.text) if word]
        if not qwords:
            return

        for result in results:
            # Negative importance indicates ordering by distance, which is
            # more important than word matching.
            if not result.display_name\
               or (result.importance is not None and result.importance < 0):
                continue
            distance = 0.0
            norm = self.query_analyzer.normalize_text(' '.join((result.display_name,
                                                                result.country_code or '')))
            words = set((w for w in norm.split(' ') if w))
            if not words:
                continue
            for qword in qwords:
                wdist = max(difflib.SequenceMatcher(a=qword, b=w).quick_ratio() for w in words)
                if wdist < 0.5:
                    distance += len(qword)
                else:
                    distance += (1.0 - wdist) * len(qword)
            # Compensate for the fact that country names do not get a
            # match penalty yet by the tokenizer.
            # Temporary hack that needs to be removed!
            if result.rank_address == 4:
                distance *= 2
            result.accuracy += distance * 0.4 / sum(len(w) for w in qwords)


    async def lookup_pois(self, categories: List[Tuple[str, str]],
                          phrases: List[Phrase]) -> SearchResults:
        """ Look up places by category. If phrases are given, a place search
            over the phrases will be executed first and places close to the
            results returned.
        """
        log().function('forward_lookup_pois', categories=categories, params=self.params)

        if phrases:
            query, searches = await self.build_searches(phrases)

            if query:
                searches = [wrap_near_search(categories, s) for s in searches[:50]]
                results = await self.execute_searches(query, searches)
                results = self.pre_filter_results(results)
                await add_result_details(self.conn, results, self.params)
                log().result_dump('Preliminary Results', ((r.accuracy, r) for r in results))
                results = self.sort_and_cut_results(results)
            else:
                results = SearchResults()
        else:
            search = build_poi_search(categories, self.params.countries)
            results = await search.lookup(self.conn, self.params)
            await add_result_details(self.conn, results, self.params)

        log().result_dump('Final Results', ((r.accuracy, r) for r in results))

        return results


    async def lookup(self, phrases: List[Phrase]) -> SearchResults:
        """ Look up a single free-text query.
        """
        log().function('forward_lookup', phrases=phrases, params=self.params)
        results = SearchResults()

        if self.params.is_impossible():
            return results

        query, searches = await self.build_searches(phrases)

        if searches:
            # Execute SQL until an appropriate result is found.
            results = await self.execute_searches(query, searches[:50])
            results = self.pre_filter_results(results)
            await add_result_details(self.conn, results, self.params)
            log().result_dump('Preliminary Results', ((r.accuracy, r) for r in results))
            self.rerank_by_query(query, results)
            log().result_dump('Results after reranking', ((r.accuracy, r) for r in results))
            results = self.sort_and_cut_results(results)
            log().result_dump('Final Results', ((r.accuracy, r) for r in results))

        return results


# pylint: disable=invalid-name,too-many-locals
def _dump_searches(searches: List[AbstractSearch], query: QueryStruct,
                   start: int = 0) -> Iterator[Optional[List[Any]]]:
    yield ['Penalty', 'Lookups', 'Housenr', 'Postcode', 'Countries',
           'Qualifier', 'Category', 'Rankings']

    def tk(tl: List[int]) -> str:
        tstr = [f"{query.find_lookup_word_by_id(t)}({t})" for t in tl]

        return f"[{','.join(tstr)}]"

    def fmt_ranking(f: Any) -> str:
        if not f:
            return ''
        ranks = ','.join((f"{tk(r.tokens)}^{r.penalty:.3g}" for r in f.rankings))
        if len(ranks) > 100:
            ranks = ranks[:100] + '...'
        return f"{f.column}({ranks},def={f.default:.3g})"

    def fmt_lookup(l: Any) -> str:
        if not l:
            return ''

        return f"{l.lookup_type}({l.column}{tk(l.tokens)})"


    def fmt_cstr(c: Any) -> str:
        if not c:
            return ''

        return f'{c[0]}^{c[1]}'

    for search in searches[start:]:
        fields = ('lookups', 'rankings', 'countries', 'housenumbers',
                  'postcodes', 'qualifiers')
        if hasattr(search, 'search'):
            iters = itertools.zip_longest([f"{search.penalty:.3g}"],
                                          *(getattr(search.search, attr, []) for attr in fields),
                                          getattr(search, 'categories', []),
                                          fillvalue='')
        else:
            iters = itertools.zip_longest([f"{search.penalty:.3g}"],
                                          *(getattr(search, attr, []) for attr in fields),
                                          [],
                                          fillvalue='')
        for penalty, lookup, rank, cc, hnr, pc, qual, cat in iters:
            yield [penalty, fmt_lookup(lookup), fmt_cstr(hnr),
                   fmt_cstr(pc), fmt_cstr(cc), fmt_cstr(qual), fmt_cstr(cat), fmt_ranking(rank)]
        yield None
314
src/nominatim_api/search/icu_tokenizer.py
Normal file
314
src/nominatim_api/search/icu_tokenizer.py
Normal file
@@ -0,0 +1,314 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Implementation of query analysis for the ICU tokenizer.
"""
from typing import Tuple, Dict, List, Optional, NamedTuple, Iterator, Any, cast
from collections import defaultdict
import dataclasses
import difflib

from icu import Transliterator

import sqlalchemy as sa

from nominatim_core.typing import SaRow
from nominatim_core.db.sqlalchemy_types import Json
from ..connection import SearchConnection
from ..logging import log
from ..search import query as qmod
from ..search.query_analyzer_factory import AbstractQueryAnalyzer


DB_TO_TOKEN_TYPE = {
    'W': qmod.TokenType.WORD,
    'w': qmod.TokenType.PARTIAL,
    'H': qmod.TokenType.HOUSENUMBER,
    'P': qmod.TokenType.POSTCODE,
    'C': qmod.TokenType.COUNTRY
}


class QueryPart(NamedTuple):
    """ Normalized and transliterated form of a single term in the query.
        When the term came out of a split during the transliteration,
        the normalized string is the full word before transliteration.
        The word number keeps track of the word before transliteration
        and can be used to identify partially transliterated terms.
    """
    token: str
    normalized: str
    word_number: int


QueryParts = List[QueryPart]
WordDict = Dict[str, List[qmod.TokenRange]]

def yield_words(terms: List[QueryPart], start: int) -> Iterator[Tuple[str, qmod.TokenRange]]:
    """ Return all combinations of words in the terms list after the
        given position.
    """
    total = len(terms)
    for first in range(start, total):
        word = terms[first].token
        yield word, qmod.TokenRange(first, first + 1)
        for last in range(first + 1, min(first + 20, total)):
            word = ' '.join((word, terms[last].token))
            yield word, qmod.TokenRange(first, last + 1)
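
# Example: for parts 'new', 'york', 'city' and start=0 this yields
# 'new'(0,1), 'new york'(0,2), 'new york city'(0,3), 'york'(1,2),
# 'york city'(1,3) and 'city'(2,3); combinations are capped at 20 terms.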


@dataclasses.dataclass
class ICUToken(qmod.Token):
    """ Specialised token for ICU tokenizer.
    """
    word_token: str
    info: Optional[Dict[str, Any]]

    def get_category(self) -> Tuple[str, str]:
        assert self.info
        return self.info.get('class', ''), self.info.get('type', '')


    def rematch(self, norm: str) -> None:
        """ Check how well the token matches the given normalized string
            and add a penalty, if necessary.
        """
        if not self.lookup_word:
            return

        seq = difflib.SequenceMatcher(a=self.lookup_word, b=norm)
        distance = 0
        for tag, afrom, ato, bfrom, bto in seq.get_opcodes():
            if tag in ('delete', 'insert') and (afrom == 0 or ato == len(self.lookup_word)):
                distance += 1
            elif tag == 'replace':
                distance += max((ato-afrom), (bto-bfrom))
            elif tag != 'equal':
                distance += abs((ato-afrom) - (bto-bfrom))
        self.penalty += (distance/len(self.lookup_word))


    @staticmethod
    def from_db_row(row: SaRow) -> 'ICUToken':
        """ Create an ICUToken from the row of the word table.
        """
        count = 1 if row.info is None else row.info.get('count', 1)
        addr_count = 1 if row.info is None else row.info.get('addr_count', 1)

        penalty = 0.0
        if row.type == 'w':
            penalty = 0.3
        elif row.type == 'W':
            if len(row.word_token) == 1 and row.word_token == row.word:
                penalty = 0.2 if row.word.isdigit() else 0.3
        elif row.type == 'H':
            penalty = sum(0.1 for c in row.word_token if c != ' ' and not c.isdigit())
            if all(not c.isdigit() for c in row.word_token):
                penalty += 0.2 * (len(row.word_token) - 1)
        elif row.type == 'C':
            if len(row.word_token) == 1:
                penalty = 0.3

        if row.info is None:
            lookup_word = row.word
        else:
            lookup_word = row.info.get('lookup', row.word)
        if lookup_word:
            lookup_word = lookup_word.split('@', 1)[0]
        else:
            lookup_word = row.word_token

        return ICUToken(penalty=penalty, token=row.word_id, count=max(1, count),
                        lookup_word=lookup_word, is_indexed=True,
                        word_token=row.word_token, info=row.info,
                        addr_count=max(1, addr_count))
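
# Base penalties assigned above: partial words ('w') start at 0.3,
# single-character full words ('W') at 0.2-0.3, housenumbers ('H') pay 0.1
# per non-digit, non-space character, and single-character country tokens
# ('C') get 0.3.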
|
||||
|
||||
|
||||
class ICUQueryAnalyzer(AbstractQueryAnalyzer):
|
||||
""" Converter for query strings into a tokenized query
|
||||
using the tokens created by a ICU tokenizer.
|
||||
"""
|
||||
|
||||
def __init__(self, conn: SearchConnection) -> None:
|
||||
self.conn = conn
|
||||
|
||||
|
||||
async def setup(self) -> None:
|
||||
""" Set up static data structures needed for the analysis.
|
||||
"""
|
||||
async def _make_normalizer() -> Any:
|
||||
rules = await self.conn.get_property('tokenizer_import_normalisation')
|
||||
return Transliterator.createFromRules("normalization", rules)
|
||||
|
||||
self.normalizer = await self.conn.get_cached_value('ICUTOK', 'normalizer',
|
||||
_make_normalizer)
|
||||
|
||||
async def _make_transliterator() -> Any:
|
||||
rules = await self.conn.get_property('tokenizer_import_transliteration')
|
||||
return Transliterator.createFromRules("transliteration", rules)
|
||||
|
||||
self.transliterator = await self.conn.get_cached_value('ICUTOK', 'transliterator',
|
||||
_make_transliterator)
|
||||
|
||||
if 'word' not in self.conn.t.meta.tables:
|
||||
sa.Table('word', self.conn.t.meta,
|
||||
sa.Column('word_id', sa.Integer),
|
||||
sa.Column('word_token', sa.Text, nullable=False),
|
||||
sa.Column('type', sa.Text, nullable=False),
|
||||
sa.Column('word', sa.Text),
|
||||
sa.Column('info', Json))
|
||||
|
||||
|
||||
async def analyze_query(self, phrases: List[qmod.Phrase]) -> qmod.QueryStruct:
|
||||
""" Analyze the given list of phrases and return the
|
||||
tokenized query.
|
||||
"""
|
||||
log().section('Analyze query (using ICU tokenizer)')
|
||||
normalized = list(filter(lambda p: p.text,
|
||||
(qmod.Phrase(p.ptype, self.normalize_text(p.text))
|
||||
for p in phrases)))
|
||||
query = qmod.QueryStruct(normalized)
|
||||
log().var_dump('Normalized query', query.source)
|
||||
if not query.source:
|
||||
return query
|
||||
|
||||
parts, words = self.split_query(query)
|
||||
log().var_dump('Transliterated query', lambda: _dump_transliterated(query, parts))
|
||||
|
||||
for row in await self.lookup_in_db(list(words.keys())):
|
||||
for trange in words[row.word_token]:
|
||||
token = ICUToken.from_db_row(row)
|
||||
if row.type == 'S':
|
||||
if row.info['op'] in ('in', 'near'):
|
||||
if trange.start == 0:
|
||||
query.add_token(trange, qmod.TokenType.NEAR_ITEM, token)
|
||||
else:
|
||||
if trange.start == 0 and trange.end == query.num_token_slots():
|
||||
query.add_token(trange, qmod.TokenType.NEAR_ITEM, token)
|
||||
else:
|
||||
query.add_token(trange, qmod.TokenType.QUALIFIER, token)
|
||||
else:
|
||||
query.add_token(trange, DB_TO_TOKEN_TYPE[row.type], token)
|
||||
|
||||
self.add_extra_tokens(query, parts)
|
||||
self.rerank_tokens(query, parts)
|
||||
|
||||
log().table_dump('Word tokens', _dump_word_tokens(query))
|
||||
|
||||
return query
|
||||
|
||||
|
||||
def normalize_text(self, text: str) -> str:
|
||||
""" Bring the given text into a normalized form. That is the
|
||||
            standardized form search will work with. All information removed
            at this stage is inevitably lost.
        """
        return cast(str, self.normalizer.transliterate(text))


    def split_query(self, query: qmod.QueryStruct) -> Tuple[QueryParts, WordDict]:
        """ Transliterate the phrases and split them into tokens.

            Returns the list of transliterated tokens together with their
            normalized form and a dictionary of words for lookup together
            with their position.
        """
        parts: QueryParts = []
        phrase_start = 0
        words = defaultdict(list)
        wordnr = 0
        for phrase in query.source:
            query.nodes[-1].ptype = phrase.ptype
            for word in phrase.text.split(' '):
                trans = self.transliterator.transliterate(word)
                if trans:
                    for term in trans.split(' '):
                        if term:
                            parts.append(QueryPart(term, word, wordnr))
                            query.add_node(qmod.BreakType.TOKEN, phrase.ptype)
                    query.nodes[-1].btype = qmod.BreakType.WORD
                wordnr += 1
            query.nodes[-1].btype = qmod.BreakType.PHRASE

            for word, wrange in yield_words(parts, phrase_start):
                words[word].append(wrange)

            phrase_start = len(parts)
        query.nodes[-1].btype = qmod.BreakType.END

        return parts, words


    async def lookup_in_db(self, words: List[str]) -> 'sa.Result[Any]':
        """ Return the token information from the database for the
            given word tokens.
        """
        t = self.conn.t.meta.tables['word']
        return await self.conn.execute(t.select().where(t.c.word_token.in_(words)))


    def add_extra_tokens(self, query: qmod.QueryStruct, parts: QueryParts) -> None:
        """ Add tokens to query that are not saved in the database.
        """
        for part, node, i in zip(parts, query.nodes, range(1000)):
            if len(part.token) <= 4 and part[0].isdigit()\
               and not node.has_tokens(i+1, qmod.TokenType.HOUSENUMBER):
                query.add_token(qmod.TokenRange(i, i+1), qmod.TokenType.HOUSENUMBER,
                                ICUToken(0.5, 0, 1, 1, part.token, True, part.token, None))


    def rerank_tokens(self, query: qmod.QueryStruct, parts: QueryParts) -> None:
        """ Add penalties to tokens that depend on the presence of other tokens.
        """
        for i, node, tlist in query.iter_token_lists():
            if tlist.ttype == qmod.TokenType.POSTCODE:
                for repl in node.starting:
                    if repl.end == tlist.end and repl.ttype != qmod.TokenType.POSTCODE \
                       and (repl.ttype != qmod.TokenType.HOUSENUMBER
                            or len(tlist.tokens[0].lookup_word) > 4):
                        repl.add_penalty(0.39)
            elif tlist.ttype == qmod.TokenType.HOUSENUMBER \
                 and len(tlist.tokens[0].lookup_word) <= 3:
                if any(c.isdigit() for c in tlist.tokens[0].lookup_word):
                    for repl in node.starting:
                        if repl.end == tlist.end and repl.ttype != qmod.TokenType.HOUSENUMBER:
                            repl.add_penalty(0.5 - tlist.tokens[0].penalty)
            elif tlist.ttype not in (qmod.TokenType.COUNTRY, qmod.TokenType.PARTIAL):
                norm = parts[i].normalized
                for j in range(i + 1, tlist.end):
                    if parts[j - 1].word_number != parts[j].word_number:
                        norm += ' ' + parts[j].normalized
                for token in tlist.tokens:
                    cast(ICUToken, token).rematch(norm)


def _dump_transliterated(query: qmod.QueryStruct, parts: QueryParts) -> str:
    out = query.nodes[0].btype.value
    for node, part in zip(query.nodes[1:], parts):
        out += part.token + node.btype.value
    return out


def _dump_word_tokens(query: qmod.QueryStruct) -> Iterator[List[Any]]:
    yield ['type', 'token', 'word_token', 'lookup_word', 'penalty', 'count', 'info']
    for node in query.nodes:
        for tlist in node.starting:
            for token in tlist.tokens:
                t = cast(ICUToken, token)
                yield [tlist.ttype.name, t.token, t.word_token or '',
                       t.lookup_word or '', t.penalty, t.count, t.info]


async def create_query_analyzer(conn: SearchConnection) -> AbstractQueryAnalyzer:
    """ Create and set up a new query analyzer for a database based
        on the ICU tokenizer.
    """
    out = ICUQueryAnalyzer(conn)
    await out.setup()

    return out
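A short usage sketch (not part of the commit): the ICU analyzer is created from
an open SearchConnection and turns a list of phrases into a QueryStruct. The
project directory, the sample phrase and the use of NominatimAPIAsync.begin()
to obtain a connection are illustrative assumptions.

    import asyncio
    from pathlib import Path

    from nominatim_api import NominatimAPIAsync
    from nominatim_api.search.query import Phrase, PhraseType
    from nominatim_api.search.icu_tokenizer import create_query_analyzer

    async def tokenize_example() -> None:
        api = NominatimAPIAsync(Path('.'))  # hypothetical project directory
        async with api.begin() as conn:     # assumed to yield a SearchConnection
            analyzer = await create_query_analyzer(conn)
            query = await analyzer.analyze_query(
                [Phrase(PhraseType.NONE, 'Hauptstr 45, Berlin')])
            print(query.num_token_slots())

    # asyncio.run(tokenize_example())  # needs a database set up with the ICU tokenizer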
272
src/nominatim_api/search/legacy_tokenizer.py
Normal file
@@ -0,0 +1,272 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Implementation of query analysis for the legacy tokenizer.
"""
from typing import Tuple, Dict, List, Optional, Iterator, Any, cast
from copy import copy
from collections import defaultdict
import dataclasses

import sqlalchemy as sa

from nominatim_core.typing import SaRow
from ..connection import SearchConnection
from ..logging import log
from . import query as qmod
from .query_analyzer_factory import AbstractQueryAnalyzer

def yield_words(terms: List[str], start: int) -> Iterator[Tuple[str, qmod.TokenRange]]:
    """ Return all combinations of words in the terms list after the
        given position.
    """
    total = len(terms)
    for first in range(start, total):
        word = terms[first]
        yield word, qmod.TokenRange(first, first + 1)
        for last in range(first + 1, min(first + 20, total)):
            word = ' '.join((word, terms[last]))
            yield word, qmod.TokenRange(first, last + 1)
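To make the combination rule concrete, here is a standalone sketch of what
yield_words produces for a three-word query (TokenRange stubbed as a plain
tuple for illustration; the real class lives in query.py):

    from typing import Iterator, List, Tuple

    def yield_words(terms: List[str], start: int) -> Iterator[Tuple[str, Tuple[int, int]]]:
        total = len(terms)
        for first in range(start, total):
            word = terms[first]
            yield word, (first, first + 1)
            for last in range(first + 1, min(first + 20, total)):
                word = ' '.join((word, terms[last]))
                yield word, (first, last + 1)

    print(list(yield_words(['main', 'st', 'berlin'], 0)))
    # [('main', (0, 1)), ('main st', (0, 2)), ('main st berlin', (0, 3)),
    #  ('st', (1, 2)), ('st berlin', (1, 3)), ('berlin', (2, 3))]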
@dataclasses.dataclass
class LegacyToken(qmod.Token):
    """ Specialised token for legacy tokenizer.
    """
    word_token: str
    category: Optional[Tuple[str, str]]
    country: Optional[str]
    operator: Optional[str]

    @property
    def info(self) -> Dict[str, Any]:
        """ Dictionary of additional properties of the token.
            Should only be used for debugging purposes.
        """
        return {'category': self.category,
                'country': self.country,
                'operator': self.operator}


    def get_category(self) -> Tuple[str, str]:
        assert self.category
        return self.category


class LegacyQueryAnalyzer(AbstractQueryAnalyzer):
    """ Converter for query strings into a tokenized query
        using the tokens created by a legacy tokenizer.
    """

    def __init__(self, conn: SearchConnection) -> None:
        self.conn = conn

    async def setup(self) -> None:
        """ Set up static data structures needed for the analysis.
        """
        self.max_word_freq = int(await self.conn.get_property('tokenizer_maxwordfreq'))
        if 'word' not in self.conn.t.meta.tables:
            sa.Table('word', self.conn.t.meta,
                     sa.Column('word_id', sa.Integer),
                     sa.Column('word_token', sa.Text, nullable=False),
                     sa.Column('word', sa.Text),
                     sa.Column('class', sa.Text),
                     sa.Column('type', sa.Text),
                     sa.Column('country_code', sa.Text),
                     sa.Column('search_name_count', sa.Integer),
                     sa.Column('operator', sa.Text))


    async def analyze_query(self, phrases: List[qmod.Phrase]) -> qmod.QueryStruct:
        """ Analyze the given list of phrases and return the
            tokenized query.
        """
        log().section('Analyze query (using Legacy tokenizer)')

        normalized = []
        if phrases:
            for row in await self.conn.execute(sa.select(*(sa.func.make_standard_name(p.text)
                                                           for p in phrases))):
                normalized = [qmod.Phrase(p.ptype, r) for r, p in zip(row, phrases) if r]
                break

        query = qmod.QueryStruct(normalized)
        log().var_dump('Normalized query', query.source)
        if not query.source:
            return query

        parts, words = self.split_query(query)
        lookup_words = list(words.keys())
        log().var_dump('Split query', parts)
        log().var_dump('Extracted words', lookup_words)

        for row in await self.lookup_in_db(lookup_words):
            for trange in words[row.word_token.strip()]:
                token, ttype = self.make_token(row)
                if ttype == qmod.TokenType.NEAR_ITEM:
                    if trange.start == 0:
                        query.add_token(trange, qmod.TokenType.NEAR_ITEM, token)
                elif ttype == qmod.TokenType.QUALIFIER:
                    query.add_token(trange, qmod.TokenType.QUALIFIER, token)
                    if trange.start == 0 or trange.end == query.num_token_slots():
                        token = copy(token)
                        token.penalty += 0.1 * (query.num_token_slots())
                        query.add_token(trange, qmod.TokenType.NEAR_ITEM, token)
                elif ttype != qmod.TokenType.PARTIAL or trange.start + 1 == trange.end:
                    query.add_token(trange, ttype, token)

        self.add_extra_tokens(query, parts)
        self.rerank_tokens(query)

        log().table_dump('Word tokens', _dump_word_tokens(query))

        return query


    def normalize_text(self, text: str) -> str:
        """ Bring the given text into a normalized form.

            This only removes case, so some difference with the normalization
            in the phrase remains.
        """
        return text.lower()


    def split_query(self, query: qmod.QueryStruct) -> Tuple[List[str],
                                                            Dict[str, List[qmod.TokenRange]]]:
        """ Transliterate the phrases and split them into tokens.

            Returns a list of transliterated tokens and a dictionary
            of words for lookup together with their position.
        """
        parts: List[str] = []
        phrase_start = 0
        words = defaultdict(list)
        for phrase in query.source:
            query.nodes[-1].ptype = phrase.ptype
            for trans in phrase.text.split(' '):
                if trans:
                    for term in trans.split(' '):
                        if term:
                            parts.append(trans)
                            query.add_node(qmod.BreakType.TOKEN, phrase.ptype)
                    query.nodes[-1].btype = qmod.BreakType.WORD
            query.nodes[-1].btype = qmod.BreakType.PHRASE
            for word, wrange in yield_words(parts, phrase_start):
                words[word].append(wrange)
            phrase_start = len(parts)
        query.nodes[-1].btype = qmod.BreakType.END

        return parts, words


    async def lookup_in_db(self, words: List[str]) -> 'sa.Result[Any]':
        """ Return the token information from the database for the
            given word tokens.
        """
        t = self.conn.t.meta.tables['word']

        sql = t.select().where(t.c.word_token.in_(words + [' ' + w for w in words]))

        return await self.conn.execute(sql)


    def make_token(self, row: SaRow) -> Tuple[LegacyToken, qmod.TokenType]:
        """ Create a LegacyToken from the row of the word table.
            Also determines the type of token.
        """
        penalty = 0.0
        is_indexed = True

        rowclass = getattr(row, 'class')

        if row.country_code is not None:
            ttype = qmod.TokenType.COUNTRY
            lookup_word = row.country_code
        elif rowclass is not None:
            if rowclass == 'place' and row.type == 'house':
                ttype = qmod.TokenType.HOUSENUMBER
                lookup_word = row.word_token[1:]
            elif rowclass == 'place' and row.type == 'postcode':
                ttype = qmod.TokenType.POSTCODE
                lookup_word = row.word_token[1:]
            else:
                ttype = qmod.TokenType.NEAR_ITEM if row.operator in ('in', 'near')\
                        else qmod.TokenType.QUALIFIER
                lookup_word = row.word
        elif row.word_token.startswith(' '):
            ttype = qmod.TokenType.WORD
            lookup_word = row.word or row.word_token[1:]
        else:
            ttype = qmod.TokenType.PARTIAL
            lookup_word = row.word_token
            penalty = 0.21
            if row.search_name_count > self.max_word_freq:
                is_indexed = False

        return LegacyToken(penalty=penalty, token=row.word_id,
                           count=max(1, row.search_name_count or 1),
                           addr_count=1, # not supported
                           lookup_word=lookup_word,
                           word_token=row.word_token.strip(),
                           category=(rowclass, row.type) if rowclass is not None else None,
                           country=row.country_code,
                           operator=row.operator,
                           is_indexed=is_indexed),\
               ttype


    def add_extra_tokens(self, query: qmod.QueryStruct, parts: List[str]) -> None:
        """ Add tokens to query that are not saved in the database.
        """
        for part, node, i in zip(parts, query.nodes, range(1000)):
            if len(part) <= 4 and part.isdigit()\
               and not node.has_tokens(i+1, qmod.TokenType.HOUSENUMBER):
                query.add_token(qmod.TokenRange(i, i+1), qmod.TokenType.HOUSENUMBER,
                                LegacyToken(penalty=0.5, token=0, count=1, addr_count=1,
                                            lookup_word=part, word_token=part,
                                            category=None, country=None,
                                            operator=None, is_indexed=True))


    def rerank_tokens(self, query: qmod.QueryStruct) -> None:
        """ Add penalties to tokens that depend on the presence of other tokens.
        """
        for _, node, tlist in query.iter_token_lists():
            if tlist.ttype == qmod.TokenType.POSTCODE:
                for repl in node.starting:
                    if repl.end == tlist.end and repl.ttype != qmod.TokenType.POSTCODE \
                       and (repl.ttype != qmod.TokenType.HOUSENUMBER
                            or len(tlist.tokens[0].lookup_word) > 4):
                        repl.add_penalty(0.39)
            elif tlist.ttype == qmod.TokenType.HOUSENUMBER \
                 and len(tlist.tokens[0].lookup_word) <= 3:
                if any(c.isdigit() for c in tlist.tokens[0].lookup_word):
                    for repl in node.starting:
                        if repl.end == tlist.end and repl.ttype != qmod.TokenType.HOUSENUMBER:
                            repl.add_penalty(0.5 - tlist.tokens[0].penalty)


def _dump_word_tokens(query: qmod.QueryStruct) -> Iterator[List[Any]]:
    yield ['type', 'token', 'word_token', 'lookup_word', 'penalty', 'count', 'info']
    for node in query.nodes:
        for tlist in node.starting:
            for token in tlist.tokens:
                t = cast(LegacyToken, token)
                yield [tlist.ttype.name, t.token, t.word_token or '',
                       t.lookup_word or '', t.penalty, t.count, t.info]


async def create_query_analyzer(conn: SearchConnection) -> AbstractQueryAnalyzer:
    """ Create and set up a new query analyzer for a database based
        on the legacy tokenizer.
    """
    out = LegacyQueryAnalyzer(conn)
    await out.setup()

    return out
297
src/nominatim_api/search/query.py
Normal file
@@ -0,0 +1,297 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Datastructures for a tokenized query.
"""
from typing import List, Tuple, Optional, Iterator
from abc import ABC, abstractmethod
import dataclasses
import enum

class BreakType(enum.Enum):
    """ Type of break between tokens.
    """
    START = '<'
    """ Begin of the query. """
    END = '>'
    """ End of the query. """
    PHRASE = ','
    """ Break between two phrases. """
    WORD = ' '
    """ Break between words. """
    PART = '-'
    """ Break inside a word, for example a hyphen or apostrophe. """
    TOKEN = '`'
    """ Break created as a result of tokenization.
        This may happen in languages without spaces between words.
    """


class TokenType(enum.Enum):
    """ Type of token.
    """
    WORD = enum.auto()
    """ Full name of a place. """
    PARTIAL = enum.auto()
    """ Word term without breaks, does not necessarily represent a full name. """
    HOUSENUMBER = enum.auto()
    """ Housenumber term. """
    POSTCODE = enum.auto()
    """ Postal code term. """
    COUNTRY = enum.auto()
    """ Country name or reference. """
    QUALIFIER = enum.auto()
    """ Special term used together with name (e.g. _Hotel_ Bellevue). """
    NEAR_ITEM = enum.auto()
    """ Special term used as searchable object (e.g. supermarket in ...). """


class PhraseType(enum.Enum):
    """ Designation of a phrase.
    """
    NONE = 0
    """ No specific designation (i.e. source is free-form query). """
    AMENITY = enum.auto()
    """ Contains name or type of a POI. """
    STREET = enum.auto()
    """ Contains a street name optionally with a housenumber. """
    CITY = enum.auto()
    """ Contains the postal city. """
    COUNTY = enum.auto()
    """ Contains the equivalent of a county. """
    STATE = enum.auto()
    """ Contains a state or province. """
    POSTCODE = enum.auto()
    """ Contains a postal code. """
    COUNTRY = enum.auto()
    """ Contains the country name or code. """

    def compatible_with(self, ttype: TokenType,
                        is_full_phrase: bool) -> bool:
        """ Check if the given token type can be used with the phrase type.
        """
        if self == PhraseType.NONE:
            return not is_full_phrase or ttype != TokenType.QUALIFIER
        if self == PhraseType.AMENITY:
            return ttype in (TokenType.WORD, TokenType.PARTIAL)\
                   or (is_full_phrase and ttype == TokenType.NEAR_ITEM)\
                   or (not is_full_phrase and ttype == TokenType.QUALIFIER)
        if self == PhraseType.STREET:
            return ttype in (TokenType.WORD, TokenType.PARTIAL, TokenType.HOUSENUMBER)
        if self == PhraseType.POSTCODE:
            return ttype == TokenType.POSTCODE
        if self == PhraseType.COUNTRY:
            return ttype == TokenType.COUNTRY

        return ttype in (TokenType.WORD, TokenType.PARTIAL)
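A quick check of the compatibility rules (a sketch, assuming the package is
importable as nominatim_api): a postcode token is only accepted in postcode
phrases or free-form text, never in a typed city phrase.

    from nominatim_api.search.query import PhraseType, TokenType

    assert PhraseType.POSTCODE.compatible_with(TokenType.POSTCODE, True)
    assert PhraseType.NONE.compatible_with(TokenType.POSTCODE, True)
    assert not PhraseType.CITY.compatible_with(TokenType.POSTCODE, True)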
@dataclasses.dataclass
class Token(ABC):
    """ Base type for tokens.
        Specific query analyzers must implement the concrete token class.
    """

    penalty: float
    token: int
    count: int
    addr_count: int
    lookup_word: str
    is_indexed: bool


    @abstractmethod
    def get_category(self) -> Tuple[str, str]:
        """ Return the category restriction for qualifier terms and
            category objects.
        """

@dataclasses.dataclass
class TokenRange:
    """ Indexes of query nodes over which a token spans.
    """
    start: int
    end: int

    def __lt__(self, other: 'TokenRange') -> bool:
        return self.end <= other.start


    def __le__(self, other: 'TokenRange') -> bool:
        return NotImplemented


    def __gt__(self, other: 'TokenRange') -> bool:
        return self.start >= other.end


    def __ge__(self, other: 'TokenRange') -> bool:
        return NotImplemented


    def replace_start(self, new_start: int) -> 'TokenRange':
        """ Return a new token range with the new start.
        """
        return TokenRange(new_start, self.end)


    def replace_end(self, new_end: int) -> 'TokenRange':
        """ Return a new token range with the new end.
        """
        return TokenRange(self.start, new_end)


    def split(self, index: int) -> Tuple['TokenRange', 'TokenRange']:
        """ Split the span into two spans at the given index.
            The index must be within the span.
        """
        return self.replace_end(index), self.replace_start(index)
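The comparison operators implement a strict interval order: a range compares
as smaller only when it ends before the other starts, so overlapping ranges
are ordered in neither direction. A small sketch (assuming the package is
importable as nominatim_api):

    from nominatim_api.search.query import TokenRange

    assert TokenRange(0, 2) < TokenRange(2, 4)       # ends before the other starts
    assert not TokenRange(0, 3) < TokenRange(2, 4)   # overlapping spans are not ordered
    assert TokenRange(3, 5) > TokenRange(1, 3)
    assert TokenRange(1, 3).split(2) == (TokenRange(1, 2), TokenRange(2, 3))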
@dataclasses.dataclass
class TokenList:
    """ List of all tokens of a given type going from one breakpoint to another.
    """
    end: int
    ttype: TokenType
    tokens: List[Token]


    def add_penalty(self, penalty: float) -> None:
        """ Add the given penalty to all tokens in the list.
        """
        for token in self.tokens:
            token.penalty += penalty


@dataclasses.dataclass
class QueryNode:
    """ A node of the query representing a break between terms.
    """
    btype: BreakType
    ptype: PhraseType
    starting: List[TokenList] = dataclasses.field(default_factory=list)

    def has_tokens(self, end: int, *ttypes: TokenType) -> bool:
        """ Check if there are tokens of the given types ending at the
            given node.
        """
        return any(tl.end == end and tl.ttype in ttypes for tl in self.starting)


    def get_tokens(self, end: int, ttype: TokenType) -> Optional[List[Token]]:
        """ Get the list of tokens of the given type starting at this node
            and ending at the node 'end'. Returns 'None' if no such
            tokens exist.
        """
        for tlist in self.starting:
            if tlist.end == end and tlist.ttype == ttype:
                return tlist.tokens
        return None


@dataclasses.dataclass
class Phrase:
    """ A normalized query part. Phrases may be typed which means that
        they then represent a specific part of the address.
    """
    ptype: PhraseType
    text: str


class QueryStruct:
    """ A tokenized search query together with the normalized source
        from which the tokens have been parsed.

        The query contains a list of nodes that represent the breaks
        between words. Tokens span between nodes, which don't necessarily
        need to be direct neighbours. Thus the query is represented as a
        directed acyclic graph.

        When created, a query contains a single node: the start of the
        query. Further nodes can be added by appending to 'nodes'.
    """

    def __init__(self, source: List[Phrase]) -> None:
        self.source = source
        self.nodes: List[QueryNode] = \
            [QueryNode(BreakType.START, source[0].ptype if source else PhraseType.NONE)]


    def num_token_slots(self) -> int:
        """ Return the length of the query in vertex steps.
        """
        return len(self.nodes) - 1


    def add_node(self, btype: BreakType, ptype: PhraseType) -> None:
        """ Append a new break node with the given break type.
            The phrase type denotes the type for any tokens starting
            at the node.
        """
        self.nodes.append(QueryNode(btype, ptype))


    def add_token(self, trange: TokenRange, ttype: TokenType, token: Token) -> None:
        """ Add a token to the query. 'start' and 'end' are the indexes of the
            nodes from which to which the token spans. The indexes must exist
            and are expected to be in the same phrase.
            'ttype' denotes the type of the token and 'token' the token to
            be inserted.

            If the token type is not compatible with the phrase it should
            be added to, then the token is silently dropped.
        """
        snode = self.nodes[trange.start]
        full_phrase = snode.btype in (BreakType.START, BreakType.PHRASE)\
                      and self.nodes[trange.end].btype in (BreakType.PHRASE, BreakType.END)
        if snode.ptype.compatible_with(ttype, full_phrase):
            tlist = snode.get_tokens(trange.end, ttype)
            if tlist is None:
                snode.starting.append(TokenList(trange.end, ttype, [token]))
            else:
                tlist.append(token)


    def get_tokens(self, trange: TokenRange, ttype: TokenType) -> List[Token]:
        """ Get the list of tokens of a given type, spanning the given
            nodes. The nodes must exist. If no tokens exist, an
            empty list is returned.
        """
        return self.nodes[trange.start].get_tokens(trange.end, ttype) or []


    def get_partials_list(self, trange: TokenRange) -> List[Token]:
        """ Create a list of partial tokens between the given nodes.
            The list is composed of the first token of type PARTIAL
            going to the subsequent node. Such PARTIAL tokens are
            assumed to exist.
        """
        return [next(iter(self.get_tokens(TokenRange(i, i+1), TokenType.PARTIAL)))
                for i in range(trange.start, trange.end)]


    def iter_token_lists(self) -> Iterator[Tuple[int, QueryNode, TokenList]]:
        """ Iterator over all token lists in the query.
        """
        for i, node in enumerate(self.nodes):
            for tlist in node.starting:
                yield i, node, tlist


    def find_lookup_word_by_id(self, token: int) -> str:
        """ Find the first token with the given token ID and return
            its lookup word. Returns 'None' if no such token exists.
            The function is very slow and must only be used for
            debugging.
        """
        for node in self.nodes:
            for tlist in node.starting:
                for t in tlist.tokens:
                    if t.token == token:
                        return f"[{tlist.ttype.name[0]}]{t.lookup_word}"
        return 'None'
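A minimal usage sketch of the graph structure (assuming the package is
importable as nominatim_api): the analyzers build queries exactly like this,
one node per break, with tokens later attached between node indexes. The
sample phrase is an illustrative assumption.

    from nominatim_api.search.query import (BreakType, Phrase, PhraseType,
                                            QueryStruct)

    query = QueryStruct([Phrase(PhraseType.NONE, 'hauptstr 45')])
    query.add_node(BreakType.WORD, PhraseType.NONE)   # break after 'hauptstr'
    query.add_node(BreakType.END, PhraseType.NONE)    # end of the query
    print(query.num_token_slots())                    # 2: one slot per word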
54
src/nominatim_api/search/query_analyzer_factory.py
Normal file
@@ -0,0 +1,54 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Factory for creating a query analyzer for the configured tokenizer.
"""
from typing import List, cast, TYPE_CHECKING
from abc import ABC, abstractmethod
from pathlib import Path
import importlib

from ..logging import log
from ..connection import SearchConnection

if TYPE_CHECKING:
    from .query import Phrase, QueryStruct

class AbstractQueryAnalyzer(ABC):
    """ Class for analysing incoming queries.

        Query analyzers are tied to the tokenizer used on import.
    """

    @abstractmethod
    async def analyze_query(self, phrases: List['Phrase']) -> 'QueryStruct':
        """ Analyze the given phrases and return the tokenized query.
        """


    @abstractmethod
    def normalize_text(self, text: str) -> str:
        """ Bring the given text into a normalized form. That is the
            standardized form search will work with. All information removed
            at this stage is inevitably lost.
        """


async def make_query_analyzer(conn: SearchConnection) -> AbstractQueryAnalyzer:
    """ Create a query analyzer for the tokenizer used by the database.
    """
    name = await conn.get_property('tokenizer')

    src_file = Path(__file__).parent / f'{name}_tokenizer.py'
    if not src_file.is_file():
        log().comment(f"No tokenizer named '{name}' available. Database not set up properly.")
        raise RuntimeError('Tokenizer not found')

    module = importlib.import_module(f'nominatim_api.search.{name}_tokenizer')

    return cast(AbstractQueryAnalyzer, await module.create_query_analyzer(conn))
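A usage sketch for the factory (not part of the commit; conn stands for an
already opened SearchConnection): the tokenizer name stored in the database
decides which analyzer module gets loaded.

    from nominatim_api.search.query_analyzer_factory import make_query_analyzer

    async def tokenize(conn, phrases):
        analyzer = await make_query_analyzer(conn)
        return await analyzer.analyze_query(phrases)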
422
src/nominatim_api/search/token_assignment.py
Normal file
@@ -0,0 +1,422 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Create query interpretations where each vertex in the query is assigned
a specific function (expressed as a token type).
"""
from typing import Optional, List, Iterator
import dataclasses

from ..logging import log
from . import query as qmod

# pylint: disable=too-many-return-statements,too-many-branches

@dataclasses.dataclass
class TypedRange:
    """ A token range for a specific type of tokens.
    """
    ttype: qmod.TokenType
    trange: qmod.TokenRange


PENALTY_TOKENCHANGE = {
    qmod.BreakType.START: 0.0,
    qmod.BreakType.END: 0.0,
    qmod.BreakType.PHRASE: 0.0,
    qmod.BreakType.WORD: 0.1,
    qmod.BreakType.PART: 0.2,
    qmod.BreakType.TOKEN: 0.4
}

TypedRangeSeq = List[TypedRange]

@dataclasses.dataclass
class TokenAssignment: # pylint: disable=too-many-instance-attributes
    """ Representation of a possible assignment of token types
        to the tokens in a tokenized query.
    """
    penalty: float = 0.0
    name: Optional[qmod.TokenRange] = None
    address: List[qmod.TokenRange] = dataclasses.field(default_factory=list)
    housenumber: Optional[qmod.TokenRange] = None
    postcode: Optional[qmod.TokenRange] = None
    country: Optional[qmod.TokenRange] = None
    near_item: Optional[qmod.TokenRange] = None
    qualifier: Optional[qmod.TokenRange] = None


    @staticmethod
    def from_ranges(ranges: TypedRangeSeq) -> 'TokenAssignment':
        """ Create a new token assignment from a sequence of typed spans.
        """
        out = TokenAssignment()
        for token in ranges:
            if token.ttype == qmod.TokenType.PARTIAL:
                out.address.append(token.trange)
            elif token.ttype == qmod.TokenType.HOUSENUMBER:
                out.housenumber = token.trange
            elif token.ttype == qmod.TokenType.POSTCODE:
                out.postcode = token.trange
            elif token.ttype == qmod.TokenType.COUNTRY:
                out.country = token.trange
            elif token.ttype == qmod.TokenType.NEAR_ITEM:
                out.near_item = token.trange
            elif token.ttype == qmod.TokenType.QUALIFIER:
                out.qualifier = token.trange
        return out
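A sketch of from_ranges in action (assuming the package is importable as
nominatim_api): PARTIAL spans collect into the address list, all other types
fill their dedicated slot.

    from nominatim_api.search import query as qmod
    from nominatim_api.search.token_assignment import TokenAssignment, TypedRange

    ranges = [TypedRange(qmod.TokenType.PARTIAL, qmod.TokenRange(0, 2)),
              TypedRange(qmod.TokenType.HOUSENUMBER, qmod.TokenRange(2, 3)),
              TypedRange(qmod.TokenType.COUNTRY, qmod.TokenRange(3, 4))]
    assignment = TokenAssignment.from_ranges(ranges)
    print(assignment.address)      # [TokenRange(start=0, end=2)]
    print(assignment.housenumber)  # TokenRange(start=2, end=3)
    print(assignment.country)      # TokenRange(start=3, end=4)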
class _TokenSequence:
    """ Working state used to put together the token assignments.

        Represents an intermediate state while traversing the tokenized
        query.
    """
    def __init__(self, seq: TypedRangeSeq,
                 direction: int = 0, penalty: float = 0.0) -> None:
        self.seq = seq
        self.direction = direction
        self.penalty = penalty


    def __str__(self) -> str:
        seq = ''.join(f'[{r.trange.start} - {r.trange.end}: {r.ttype.name}]' for r in self.seq)
        return f'{seq} (dir: {self.direction}, penalty: {self.penalty})'


    @property
    def end_pos(self) -> int:
        """ Return the index of the global end of the current sequence.
        """
        return self.seq[-1].trange.end if self.seq else 0


    def has_types(self, *ttypes: qmod.TokenType) -> bool:
        """ Check if the current sequence contains any typed ranges of
            the given types.
        """
        return any(s.ttype in ttypes for s in self.seq)


    def is_final(self) -> bool:
        """ Return true when the sequence cannot be extended by any
            form of token anymore.
        """
        # Country and category must be the final term for left-to-right
        return len(self.seq) > 1 and \
               self.seq[-1].ttype in (qmod.TokenType.COUNTRY, qmod.TokenType.NEAR_ITEM)


    def appendable(self, ttype: qmod.TokenType) -> Optional[int]:
        """ Check if the given token type is appendable to the existing sequence.

            Returns None if the token type is not appendable, otherwise the
            new direction of the sequence after adding such a type. The
            token is not added.
        """
        if ttype == qmod.TokenType.WORD:
            return None

        if not self.seq:
            # Append unconditionally to the empty list
            if ttype == qmod.TokenType.COUNTRY:
                return -1
            if ttype in (qmod.TokenType.HOUSENUMBER, qmod.TokenType.QUALIFIER):
                return 1
            return self.direction

        # Name tokens are always acceptable and don't change direction
        if ttype == qmod.TokenType.PARTIAL:
            # qualifiers cannot appear in the middle of the query. They need
            # to be near the next phrase.
            if self.direction == -1 \
               and any(t.ttype == qmod.TokenType.QUALIFIER for t in self.seq[:-1]):
                return None
            return self.direction

        # Other tokens may only appear once
        if self.has_types(ttype):
            return None

        if ttype == qmod.TokenType.HOUSENUMBER:
            if self.direction == 1:
                if len(self.seq) == 1 and self.seq[0].ttype == qmod.TokenType.QUALIFIER:
                    return None
                if len(self.seq) > 2 \
                   or self.has_types(qmod.TokenType.POSTCODE, qmod.TokenType.COUNTRY):
                    return None # direction left-to-right: housenumber must come before anything
            elif self.direction == -1 \
                 or self.has_types(qmod.TokenType.POSTCODE, qmod.TokenType.COUNTRY):
                return -1 # force direction right-to-left if after other terms

            return self.direction

        if ttype == qmod.TokenType.POSTCODE:
            if self.direction == -1:
                if self.has_types(qmod.TokenType.HOUSENUMBER, qmod.TokenType.QUALIFIER):
                    return None
                return -1
            if self.direction == 1:
                return None if self.has_types(qmod.TokenType.COUNTRY) else 1
            if self.has_types(qmod.TokenType.HOUSENUMBER, qmod.TokenType.QUALIFIER):
                return 1
            return self.direction

        if ttype == qmod.TokenType.COUNTRY:
            return None if self.direction == -1 else 1

        if ttype == qmod.TokenType.NEAR_ITEM:
            return self.direction

        if ttype == qmod.TokenType.QUALIFIER:
            if self.direction == 1:
                if (len(self.seq) == 1
                        and self.seq[0].ttype in (qmod.TokenType.PARTIAL, qmod.TokenType.NEAR_ITEM)) \
                   or (len(self.seq) == 2
                       and self.seq[0].ttype == qmod.TokenType.NEAR_ITEM
                       and self.seq[1].ttype == qmod.TokenType.PARTIAL):
                    return 1
                return None
            if self.direction == -1:
                return -1

            tempseq = self.seq[1:] if self.seq[0].ttype == qmod.TokenType.NEAR_ITEM else self.seq
            if len(tempseq) == 0:
                return 1
            if len(tempseq) == 1 and self.seq[0].ttype == qmod.TokenType.HOUSENUMBER:
                return None
            if len(tempseq) > 1 or self.has_types(qmod.TokenType.POSTCODE, qmod.TokenType.COUNTRY):
                return -1
            return 0

        return None


    def advance(self, ttype: qmod.TokenType, end_pos: int,
                btype: qmod.BreakType) -> Optional['_TokenSequence']:
        """ Return a new token sequence state with the given token type
            extended.
        """
        newdir = self.appendable(ttype)
        if newdir is None:
            return None

        if not self.seq:
            newseq = [TypedRange(ttype, qmod.TokenRange(0, end_pos))]
            new_penalty = 0.0
        else:
            last = self.seq[-1]
            if btype != qmod.BreakType.PHRASE and last.ttype == ttype:
                # extend the existing range
                newseq = self.seq[:-1] + [TypedRange(ttype, last.trange.replace_end(end_pos))]
                new_penalty = 0.0
            else:
                # start a new range
                newseq = list(self.seq) + [TypedRange(ttype,
                                                      qmod.TokenRange(last.trange.end, end_pos))]
                new_penalty = PENALTY_TOKENCHANGE[btype]

        return _TokenSequence(newseq, newdir, self.penalty + new_penalty)


    def _adapt_penalty_from_priors(self, priors: int, new_dir: int) -> bool:
        if priors >= 2:
            if self.direction == 0:
                self.direction = new_dir
            else:
                if priors == 2:
                    self.penalty += 0.8
                else:
                    return False

        return True


    def recheck_sequence(self) -> bool:
        """ Check that the sequence is a fully valid token assignment
            and adapt direction and penalties further if necessary.

            This function catches some impossible assignments that need
            forward context and can therefore not be excluded when building
            the assignment.
        """
        # housenumbers may not be further than 2 words from the beginning.
        # If there are two words in front, give it a penalty.
        hnrpos = next((i for i, tr in enumerate(self.seq)
                       if tr.ttype == qmod.TokenType.HOUSENUMBER),
                      None)
        if hnrpos is not None:
            if self.direction != -1:
                priors = sum(1 for t in self.seq[:hnrpos] if t.ttype == qmod.TokenType.PARTIAL)
                if not self._adapt_penalty_from_priors(priors, -1):
                    return False
            if self.direction != 1:
                priors = sum(1 for t in self.seq[hnrpos+1:] if t.ttype == qmod.TokenType.PARTIAL)
                if not self._adapt_penalty_from_priors(priors, 1):
                    return False
        if any(t.ttype == qmod.TokenType.NEAR_ITEM for t in self.seq):
            self.penalty += 1.0

        return True


    def _get_assignments_postcode(self, base: TokenAssignment,
                                  query_len: int) -> Iterator[TokenAssignment]:
        """ Yield possible assignments of postcode searches with an
            address component.
        """
        assert base.postcode is not None

        if (base.postcode.start == 0 and self.direction != -1)\
           or (base.postcode.end == query_len and self.direction != 1):
            log().comment('postcode search')
            # <address>,<postcode> should give preference to address search
            if base.postcode.start == 0:
                penalty = self.penalty
                self.direction = -1 # name searches are only possible backwards
            else:
                penalty = self.penalty + 0.1
                self.direction = 1 # name searches are only possible forwards
            yield dataclasses.replace(base, penalty=penalty)


    def _get_assignments_address_forward(self, base: TokenAssignment,
                                         query: qmod.QueryStruct) -> Iterator[TokenAssignment]:
        """ Yield possible assignments of address searches with
            left-to-right reading.
        """
        first = base.address[0]

        log().comment('first word = name')
        yield dataclasses.replace(base, penalty=self.penalty,
                                  name=first, address=base.address[1:])

        # Splitting the first word is not allowed when:
        #  * another name term comes after the first one and before the
        #    housenumber
        #  * a qualifier comes after the name
        #  * the containing phrase is strictly typed
        if (base.housenumber and first.end < base.housenumber.start)\
           or (base.qualifier and base.qualifier > first)\
           or (query.nodes[first.start].ptype != qmod.PhraseType.NONE):
            return

        penalty = self.penalty

        # Penalty for:
        #  * <name>, <street>, <housenumber> , ...
        #  * queries that are comma-separated
        if (base.housenumber and base.housenumber > first) or len(query.source) > 1:
            penalty += 0.25

        for i in range(first.start + 1, first.end):
            name, addr = first.split(i)
            log().comment(f'split first word = name ({i - first.start})')
            yield dataclasses.replace(base, name=name, address=[addr] + base.address[1:],
                                      penalty=penalty + PENALTY_TOKENCHANGE[query.nodes[i].btype])


    def _get_assignments_address_backward(self, base: TokenAssignment,
                                          query: qmod.QueryStruct) -> Iterator[TokenAssignment]:
        """ Yield possible assignments of address searches with
            right-to-left reading.
        """
        last = base.address[-1]

        if self.direction == -1 or len(base.address) > 1:
            log().comment('last word = name')
            yield dataclasses.replace(base, penalty=self.penalty,
                                      name=last, address=base.address[:-1])

        # Splitting the last word is not allowed when:
        #  * another name term comes before the last one and after the
        #    housenumber
        #  * a qualifier comes before the name
        #  * the containing phrase is strictly typed
        if (base.housenumber and last.start > base.housenumber.end)\
           or (base.qualifier and base.qualifier < last)\
           or (query.nodes[last.start].ptype != qmod.PhraseType.NONE):
            return

        penalty = self.penalty
        if base.housenumber and base.housenumber < last:
            penalty += 0.4
        if len(query.source) > 1:
            penalty += 0.25

        for i in range(last.start + 1, last.end):
            addr, name = last.split(i)
            log().comment(f'split last word = name ({i - last.start})')
            yield dataclasses.replace(base, name=name, address=base.address[:-1] + [addr],
                                      penalty=penalty + PENALTY_TOKENCHANGE[query.nodes[i].btype])


    def get_assignments(self, query: qmod.QueryStruct) -> Iterator[TokenAssignment]:
        """ Yield possible assignments for the current sequence.

            This function splits up general name assignments into name
            and address and yields all possible variants of that.
        """
        base = TokenAssignment.from_ranges(self.seq)

        num_addr_tokens = sum(t.end - t.start for t in base.address)
        if num_addr_tokens > 50:
            return

        # Postcode search (postcode-only search is covered in next case)
        if base.postcode is not None and base.address:
            yield from self._get_assignments_postcode(base, query.num_token_slots())

        # Postcode or country-only search
        if not base.address:
            if not base.housenumber and (base.postcode or base.country or base.near_item):
                log().comment('postcode/country search')
                yield dataclasses.replace(base, penalty=self.penalty)
        else:
            # <postcode>,<address> should give preference to postcode search
            if base.postcode and base.postcode.start == 0:
                self.penalty += 0.1

            # Left-to-right reading of the address
            if self.direction != -1:
                yield from self._get_assignments_address_forward(base, query)

            # Right-to-left reading of the address
            if self.direction != 1:
                yield from self._get_assignments_address_backward(base, query)

            # variant for special housenumber searches
            if base.housenumber and not base.qualifier:
                yield dataclasses.replace(base, penalty=self.penalty)


def yield_token_assignments(query: qmod.QueryStruct) -> Iterator[TokenAssignment]:
    """ Return possible word type assignments to word positions.

        The assignments are computed from the concrete tokens listed
        in the tokenized query.

        The result includes the penalty for transitions from one word type to
        another. It does not include penalties for transitions within a
        type.
    """
    todo = [_TokenSequence([], direction=0 if query.source[0].ptype == qmod.PhraseType.NONE else 1)]

    while todo:
        state = todo.pop()
        node = query.nodes[state.end_pos]

        for tlist in node.starting:
            newstate = state.advance(tlist.ttype, tlist.end, node.btype)
            if newstate is not None:
                if newstate.end_pos == query.num_token_slots():
                    if newstate.recheck_sequence():
                        log().var_dump('Assignment', newstate)
                        yield from newstate.get_assignments(query)
                elif not newstate.is_final():
                    todo.append(newstate)
0
src/nominatim_api/server/__init__.py
Normal file
0
src/nominatim_api/server/falcon/__init__.py
Normal file
194
src/nominatim_api/server/falcon/server.py
Normal file
@@ -0,0 +1,194 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Server implementation using the falcon webserver framework.
"""
from typing import Optional, Mapping, cast, Any, List
from pathlib import Path
import datetime as dt
import asyncio

from falcon.asgi import App, Request, Response

from nominatim_core.config import Configuration
from ...core import NominatimAPIAsync
from ... import v1 as api_impl
from ... import logging as loglib

class HTTPNominatimError(Exception):
    """ A special exception class for errors raised during processing.
    """
    def __init__(self, msg: str, status: int, content_type: str) -> None:
        self.msg = msg
        self.status = status
        self.content_type = content_type


async def nominatim_error_handler(req: Request, resp: Response, #pylint: disable=unused-argument
                                  exception: HTTPNominatimError,
                                  _: Any) -> None:
    """ Special error handler that passes message and content type as
        per exception info.
    """
    resp.status = exception.status
    resp.text = exception.msg
    resp.content_type = exception.content_type


async def timeout_error_handler(req: Request, resp: Response, #pylint: disable=unused-argument
                                exception: TimeoutError, #pylint: disable=unused-argument
                                _: Any) -> None:
    """ Error handler for queries that run into the query timeout.
        Returns a 503 response with the debug log, if enabled.
    """
    resp.status = 503

    loglib.log().comment('Aborted: Query took too long to process.')
    logdata = loglib.get_and_disable()
    if logdata:
        resp.text = logdata
        resp.content_type = 'text/html; charset=utf-8'
    else:
        resp.text = "Query took too long to process."
        resp.content_type = 'text/plain; charset=utf-8'


class ParamWrapper(api_impl.ASGIAdaptor):
    """ Adaptor class for server glue to Falcon framework.
    """

    def __init__(self, req: Request, resp: Response,
                 config: Configuration) -> None:
        self.request = req
        self.response = resp
        self._config = config


    def get(self, name: str, default: Optional[str] = None) -> Optional[str]:
        return cast(Optional[str], self.request.get_param(name, default=default))


    def get_header(self, name: str, default: Optional[str] = None) -> Optional[str]:
        return cast(Optional[str], self.request.get_header(name, default=default))


    def error(self, msg: str, status: int = 400) -> HTTPNominatimError:
        return HTTPNominatimError(msg, status, self.content_type)


    def create_response(self, status: int, output: str, num_results: int) -> None:
        self.response.context.num_results = num_results
        self.response.status = status
        self.response.text = output
        self.response.content_type = self.content_type


    def base_uri(self) -> str:
        return cast(str, self.request.forwarded_prefix)

    def config(self) -> Configuration:
        return self._config


class EndpointWrapper:
    """ Converter for server glue endpoint functions to Falcon request handlers.
    """

    def __init__(self, name: str, func: api_impl.EndpointFunc, api: NominatimAPIAsync) -> None:
        self.name = name
        self.func = func
        self.api = api


    async def on_get(self, req: Request, resp: Response) -> None:
        """ Implementation of the endpoint.
        """
        await self.func(self.api, ParamWrapper(req, resp, self.api.config))


class FileLoggingMiddleware:
    """ Middleware to log selected requests into a file.
    """

    def __init__(self, file_name: str):
        self.fd = open(file_name, 'a', buffering=1, encoding='utf8') # pylint: disable=R1732


    async def process_request(self, req: Request, _: Response) -> None:
        """ Callback before the request starts timing.
        """
        req.context.start = dt.datetime.now(tz=dt.timezone.utc)


    async def process_response(self, req: Request, resp: Response,
                               resource: Optional[EndpointWrapper],
                               req_succeeded: bool) -> None:
        """ Callback after the request that writes to the logfile. It only
            writes logs for successful requests for search, reverse,
            lookup and details.
        """
        if not req_succeeded or resource is None or resp.status != 200\
            or resource.name not in ('reverse', 'search', 'lookup', 'details'):
            return

        finish = dt.datetime.now(tz=dt.timezone.utc)
        duration = (finish - req.context.start).total_seconds()
        params = req.scope['query_string'].decode('utf8')
        start = req.context.start.replace(tzinfo=None)\
                                 .isoformat(sep=' ', timespec='milliseconds')

        self.fd.write(f"[{start}] "
                      f"{duration:.4f} {getattr(resp.context, 'num_results', 0)} "
                      f'{resource.name} "{params}"\n')


class APIShutdown:
    """ Middleware that closes any open database connections.
    """

    def __init__(self, api: NominatimAPIAsync) -> None:
        self.api = api

    async def process_shutdown(self, *_: Any) -> None:
        """Process the ASGI lifespan shutdown event.
        """
        await self.api.close()


def get_application(project_dir: Path,
                    environ: Optional[Mapping[str, str]] = None) -> App:
    """ Create a Nominatim Falcon ASGI application.
    """
    api = NominatimAPIAsync(project_dir, environ)

    middleware: List[object] = [APIShutdown(api)]
    log_file = api.config.LOG_FILE
    if log_file:
        middleware.append(FileLoggingMiddleware(log_file))

    app = App(cors_enable=api.config.get_bool('CORS_NOACCESSCONTROL'),
              middleware=middleware)
    app.add_error_handler(HTTPNominatimError, nominatim_error_handler)
    app.add_error_handler(TimeoutError, timeout_error_handler)
    # different from TimeoutError in Python <= 3.10
    app.add_error_handler(asyncio.TimeoutError, timeout_error_handler)

    legacy_urls = api.config.get_bool('SERVE_LEGACY_URLS')
    for name, func in api_impl.ROUTES:
        endpoint = EndpointWrapper(name, func, api)
        app.add_route(f"/{name}", endpoint)
        if legacy_urls:
            app.add_route(f"/{name}.php", endpoint)

    return app


def run_wsgi() -> App:
    """ Entry point for uvicorn.

        Make sure uvicorn is run from the project directory.
    """
    return get_application(Path('.'))
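For reference, a typical way to serve this application (assuming uvicorn is
installed and the command is run from the project directory):

    uvicorn --factory nominatim_api.server.falcon.server:run_wsgi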
0
src/nominatim_api/server/starlette/__init__.py
Normal file
174
src/nominatim_api/server/starlette/server.py
Normal file
@@ -0,0 +1,174 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Server implementation using the starlette webserver framework.
"""
from typing import Any, Optional, Mapping, Callable, cast, Coroutine, Dict, Awaitable
from pathlib import Path
import datetime as dt
import asyncio

from starlette.applications import Starlette
from starlette.routing import Route
from starlette.exceptions import HTTPException
from starlette.responses import Response, PlainTextResponse, HTMLResponse
from starlette.requests import Request
from starlette.middleware import Middleware
from starlette.middleware.base import BaseHTTPMiddleware, RequestResponseEndpoint
from starlette.middleware.cors import CORSMiddleware

from nominatim_core.config import Configuration
from ...core import NominatimAPIAsync
from ... import v1 as api_impl
from ... import logging as loglib

class ParamWrapper(api_impl.ASGIAdaptor):
    """ Adaptor class for server glue to Starlette framework.
    """

    def __init__(self, request: Request) -> None:
        self.request = request


    def get(self, name: str, default: Optional[str] = None) -> Optional[str]:
        return self.request.query_params.get(name, default=default)


    def get_header(self, name: str, default: Optional[str] = None) -> Optional[str]:
        return self.request.headers.get(name, default)


    def error(self, msg: str, status: int = 400) -> HTTPException:
        return HTTPException(status, detail=msg,
                             headers={'content-type': self.content_type})


    def create_response(self, status: int, output: str, num_results: int) -> Response:
        self.request.state.num_results = num_results
        return Response(output, status_code=status, media_type=self.content_type)


    def base_uri(self) -> str:
        scheme = self.request.url.scheme
        host = self.request.url.hostname
        port = self.request.url.port
        root = self.request.scope['root_path']
        if (scheme == 'http' and port == 80) or (scheme == 'https' and port == 443):
            port = None
        if port is not None:
            return f"{scheme}://{host}:{port}{root}"

        return f"{scheme}://{host}{root}"


    def config(self) -> Configuration:
        return cast(Configuration, self.request.app.state.API.config)


def _wrap_endpoint(func: api_impl.EndpointFunc)\
        -> Callable[[Request], Coroutine[Any, Any, Response]]:
    async def _callback(request: Request) -> Response:
        return cast(Response, await func(request.app.state.API, ParamWrapper(request)))

    return _callback


class FileLoggingMiddleware(BaseHTTPMiddleware):
    """ Middleware to log selected requests into a file.
    """

    def __init__(self, app: Starlette, file_name: str = ''):
        super().__init__(app)
        self.fd = open(file_name, 'a', buffering=1, encoding='utf8') # pylint: disable=R1732

    async def dispatch(self, request: Request,
                       call_next: RequestResponseEndpoint) -> Response:
        start = dt.datetime.now(tz=dt.timezone.utc)
        response = await call_next(request)

        if response.status_code != 200:
            return response

        finish = dt.datetime.now(tz=dt.timezone.utc)

        for endpoint in ('reverse', 'search', 'lookup', 'details'):
            if request.url.path.startswith('/' + endpoint):
                qtype = endpoint
                break
        else:
            return response

        duration = (finish - start).total_seconds()
        params = request.scope['query_string'].decode('utf8')

        self.fd.write(f"[{start.replace(tzinfo=None).isoformat(sep=' ', timespec='milliseconds')}] "
                      f"{duration:.4f} {getattr(request.state, 'num_results', 0)} "
                      f'{qtype} "{params}"\n')

        return response


async def timeout_error(request: Request, #pylint: disable=unused-argument
                        _: Exception) -> Response:
    """ Error handler for query timeouts.
    """
    loglib.log().comment('Aborted: Query took too long to process.')
    logdata = loglib.get_and_disable()

    if logdata:
        return HTMLResponse(logdata)

    return PlainTextResponse("Query took too long to process.", status_code=503)


def get_application(project_dir: Path,
                    environ: Optional[Mapping[str, str]] = None,
                    debug: bool = True) -> Starlette:
    """ Create a Nominatim Starlette ASGI application.
    """
    config = Configuration(project_dir, environ)

    routes = []
    legacy_urls = config.get_bool('SERVE_LEGACY_URLS')
    for name, func in api_impl.ROUTES:
        endpoint = _wrap_endpoint(func)
        routes.append(Route(f"/{name}", endpoint=endpoint))
        if legacy_urls:
            routes.append(Route(f"/{name}.php", endpoint=endpoint))

    middleware = []
    if config.get_bool('CORS_NOACCESSCONTROL'):
        middleware.append(Middleware(CORSMiddleware,
                                     allow_origins=['*'],
                                     allow_methods=['GET', 'OPTIONS'],
                                     max_age=86400))

    log_file = config.LOG_FILE
    if log_file:
        middleware.append(Middleware(FileLoggingMiddleware, file_name=log_file))

    exceptions: Dict[Any, Callable[[Request, Exception], Awaitable[Response]]] = {
        TimeoutError: timeout_error,
        asyncio.TimeoutError: timeout_error
    }

    async def _shutdown() -> None:
        await app.state.API.close()

    app = Starlette(debug=debug, routes=routes, middleware=middleware,
                    exception_handlers=exceptions,
                    on_shutdown=[_shutdown])

    app.state.API = NominatimAPIAsync(project_dir, environ)

    return app


def run_wsgi() -> Starlette:
    """ Entry point for uvicorn.
    """
    return get_application(Path('.'), debug=False)
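For illustration, a log line written by FileLoggingMiddleware in either server
implementation has the shape below (the concrete values are made up):

    [2024-05-01 12:00:00.123] 0.0421 10 search "q=berlin&format=jsonv2"

That is: request start time, duration in seconds, number of results, endpoint
name and the raw query string.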
221
src/nominatim_api/sql/sqlalchemy_functions.py
Normal file
@@ -0,0 +1,221 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Custom functions and expressions for SQLAlchemy.
"""
from __future__ import annotations
from typing import Any

import sqlalchemy as sa
from sqlalchemy.ext.compiler import compiles

from nominatim_core.typing import SaColumn

# pylint: disable=all

class PlacexGeometryReverseLookuppolygon(sa.sql.functions.GenericFunction[Any]):
    """ Check for conditions that allow partial index use on
        'idx_placex_geometry_reverse_lookupPolygon'.

        Needs to be constant, so that the query planner picks them up correctly
        in prepared statements.
    """
    name = 'PlacexGeometryReverseLookuppolygon'
    inherit_cache = True


@compiles(PlacexGeometryReverseLookuppolygon) # type: ignore[no-untyped-call, misc]
def _default_intersects(element: PlacexGeometryReverseLookuppolygon,
                        compiler: 'sa.Compiled', **kw: Any) -> str:
    return ("(ST_GeometryType(placex.geometry) in ('ST_Polygon', 'ST_MultiPolygon')"
            " AND placex.rank_address between 4 and 25"
            " AND placex.type != 'postcode'"
            " AND placex.name is not null"
            " AND placex.indexed_status = 0"
            " AND placex.linked_place_id is null)")


@compiles(PlacexGeometryReverseLookuppolygon, 'sqlite') # type: ignore[no-untyped-call, misc]
def _sqlite_intersects(element: PlacexGeometryReverseLookuppolygon,
                       compiler: 'sa.Compiled', **kw: Any) -> str:
    return ("(ST_GeometryType(placex.geometry) in ('POLYGON', 'MULTIPOLYGON')"
            " AND placex.rank_address between 4 and 25"
            " AND placex.type != 'postcode'"
            " AND placex.name is not null"
            " AND placex.indexed_status = 0"
            " AND placex.linked_place_id is null)")


class IntersectsReverseDistance(sa.sql.functions.GenericFunction[Any]):
    name = 'IntersectsReverseDistance'
    inherit_cache = True

    def __init__(self, table: sa.Table, geom: SaColumn) -> None:
        super().__init__(table.c.geometry,
                         table.c.rank_search, geom)
        self.tablename = table.name


@compiles(IntersectsReverseDistance) # type: ignore[no-untyped-call, misc]
def default_reverse_place_diameter(element: IntersectsReverseDistance,
                                   compiler: 'sa.Compiled', **kw: Any) -> str:
    table = element.tablename
    return f"({table}.rank_address between 4 and 25"\
           f" AND {table}.type != 'postcode'"\
           f" AND {table}.name is not null"\
           f" AND {table}.linked_place_id is null"\
           f" AND {table}.osm_type = 'N'" + \
           " AND ST_Buffer(%s, reverse_place_diameter(%s)) && %s)" % \
               tuple(map(lambda c: compiler.process(c, **kw), element.clauses))


@compiles(IntersectsReverseDistance, 'sqlite') # type: ignore[no-untyped-call, misc]
def sqlite_reverse_place_diameter(element: IntersectsReverseDistance,
                                  compiler: 'sa.Compiled', **kw: Any) -> str:
    geom1, rank, geom2 = list(element.clauses)
    table = element.tablename

    return (f"({table}.rank_address between 4 and 25"\
            f" AND {table}.type != 'postcode'"\
            f" AND {table}.name is not null"\
            f" AND {table}.linked_place_id is null"\
            f" AND {table}.osm_type = 'N'"\
            " AND MbrIntersects(%s, ST_Expand(%s, 14.0 * exp(-0.2 * %s) - 0.03))"\
            f" AND {table}.place_id IN"\
            " (SELECT place_id FROM placex_place_node_areas"\
            " WHERE ROWID IN (SELECT ROWID FROM SpatialIndex"\
            " WHERE f_table_name = 'placex_place_node_areas'"\
            " AND search_frame = %s)))") % (
                compiler.process(geom1, **kw),
                compiler.process(geom2, **kw),
                compiler.process(rank, **kw),
                compiler.process(geom2, **kw))


class IsBelowReverseDistance(sa.sql.functions.GenericFunction[Any]):
    name = 'IsBelowReverseDistance'
    inherit_cache = True


@compiles(IsBelowReverseDistance) # type: ignore[no-untyped-call, misc]
def default_is_below_reverse_distance(element: IsBelowReverseDistance,
                                      compiler: 'sa.Compiled', **kw: Any) -> str:
    dist, rank = list(element.clauses)
    return "%s < reverse_place_diameter(%s)" % (compiler.process(dist, **kw),
                                                compiler.process(rank, **kw))


@compiles(IsBelowReverseDistance, 'sqlite') # type: ignore[no-untyped-call, misc]
def sqlite_is_below_reverse_distance(element: IsBelowReverseDistance,
                                     compiler: 'sa.Compiled', **kw: Any) -> str:
    dist, rank = list(element.clauses)
    return "%s < 14.0 * exp(-0.2 * %s) - 0.03" % (compiler.process(dist, **kw),
                                                  compiler.process(rank, **kw))
|
||||
|
||||
class IsAddressPoint(sa.sql.functions.GenericFunction[Any]):
|
||||
name = 'IsAddressPoint'
|
||||
inherit_cache = True
|
||||
|
||||
def __init__(self, table: sa.Table) -> None:
|
||||
super().__init__(table.c.rank_address,
|
||||
table.c.housenumber, table.c.name)
|
||||
|
||||
|
||||
@compiles(IsAddressPoint) # type: ignore[no-untyped-call, misc]
|
||||
def default_is_address_point(element: IsAddressPoint,
|
||||
compiler: 'sa.Compiled', **kw: Any) -> str:
|
||||
rank, hnr, name = list(element.clauses)
|
||||
return "(%s = 30 AND (%s IS NOT NULL OR %s ? 'addr:housename'))" % (
|
||||
compiler.process(rank, **kw),
|
||||
compiler.process(hnr, **kw),
|
||||
compiler.process(name, **kw))
|
||||
|
||||
|
||||
@compiles(IsAddressPoint, 'sqlite') # type: ignore[no-untyped-call, misc]
|
||||
def sqlite_is_address_point(element: IsAddressPoint,
|
||||
compiler: 'sa.Compiled', **kw: Any) -> str:
|
||||
rank, hnr, name = list(element.clauses)
|
||||
return "(%s = 30 AND coalesce(%s, json_extract(%s, '$.addr:housename')) IS NOT NULL)" % (
|
||||
compiler.process(rank, **kw),
|
||||
compiler.process(hnr, **kw),
|
||||
compiler.process(name, **kw))
|
||||
|
||||
|
||||
class CrosscheckNames(sa.sql.functions.GenericFunction[Any]):
|
||||
""" Check if in the given list of names in parameters 1 any of the names
|
||||
from the JSON array in parameter 2 are contained.
|
||||
"""
|
||||
name = 'CrosscheckNames'
|
||||
inherit_cache = True
|
||||
|
||||
@compiles(CrosscheckNames) # type: ignore[no-untyped-call, misc]
|
||||
def compile_crosscheck_names(element: CrosscheckNames,
|
||||
compiler: 'sa.Compiled', **kw: Any) -> str:
|
||||
arg1, arg2 = list(element.clauses)
|
||||
return "coalesce(avals(%s) && ARRAY(SELECT * FROM json_array_elements_text(%s)), false)" % (
|
||||
compiler.process(arg1, **kw), compiler.process(arg2, **kw))
|
||||
|
||||
|
||||
@compiles(CrosscheckNames, 'sqlite') # type: ignore[no-untyped-call, misc]
|
||||
def compile_sqlite_crosscheck_names(element: CrosscheckNames,
|
||||
compiler: 'sa.Compiled', **kw: Any) -> str:
|
||||
arg1, arg2 = list(element.clauses)
|
||||
return "EXISTS(SELECT *"\
|
||||
" FROM json_each(%s) as name, json_each(%s) as match_name"\
|
||||
" WHERE name.value = match_name.value)"\
|
||||
% (compiler.process(arg1, **kw), compiler.process(arg2, **kw))
|
||||
|
||||
|
||||
class JsonArrayEach(sa.sql.functions.GenericFunction[Any]):
|
||||
""" Return elements of a json array as a set.
|
||||
"""
|
||||
name = 'JsonArrayEach'
|
||||
inherit_cache = True
|
||||
|
||||
|
||||
@compiles(JsonArrayEach) # type: ignore[no-untyped-call, misc]
|
||||
def default_json_array_each(element: JsonArrayEach, compiler: 'sa.Compiled', **kw: Any) -> str:
|
||||
return "json_array_elements(%s)" % compiler.process(element.clauses, **kw)
|
||||
|
||||
|
||||
@compiles(JsonArrayEach, 'sqlite') # type: ignore[no-untyped-call, misc]
|
||||
def sqlite_json_array_each(element: JsonArrayEach, compiler: 'sa.Compiled', **kw: Any) -> str:
|
||||
return "json_each(%s)" % compiler.process(element.clauses, **kw)
|
||||
|
||||
|
||||
|
||||
class Greatest(sa.sql.functions.GenericFunction[Any]):
|
||||
""" Function to compute maximum of all its input parameters.
|
||||
"""
|
||||
name = 'greatest'
|
||||
inherit_cache = True
|
||||
|
||||
|
||||
@compiles(Greatest, 'sqlite') # type: ignore[no-untyped-call, misc]
|
||||
def sqlite_greatest(element: Greatest, compiler: 'sa.Compiled', **kw: Any) -> str:
|
||||
return "max(%s)" % compiler.process(element.clauses, **kw)
|
||||
|
||||
|
||||
|
||||
class RegexpWord(sa.sql.functions.GenericFunction[Any]):
|
||||
""" Check if a full word is in a given string.
|
||||
"""
|
||||
name = 'RegexpWord'
|
||||
inherit_cache = True
|
||||
|
||||
|
||||
@compiles(RegexpWord, 'postgresql') # type: ignore[no-untyped-call, misc]
|
||||
def postgres_regexp_nocase(element: RegexpWord, compiler: 'sa.Compiled', **kw: Any) -> str:
|
||||
arg1, arg2 = list(element.clauses)
|
||||
return "%s ~* ('\\m(' || %s || ')\\M')::text" % (compiler.process(arg2, **kw), compiler.process(arg1, **kw))
|
||||
|
||||
|
||||
@compiles(RegexpWord, 'sqlite') # type: ignore[no-untyped-call, misc]
|
||||
def sqlite_regexp_nocase(element: RegexpWord, compiler: 'sa.Compiled', **kw: Any) -> str:
|
||||
arg1, arg2 = list(element.clauses)
|
||||
return "regexp('\\b(' || %s || ')\\b', %s)" % (compiler.process(arg1, **kw), compiler.process(arg2, **kw))
|
||||
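A minimal usage sketch (not part of this commit): GenericFunction subclasses register under their `name`, so they are reachable through `sa.func`, and the `@compiles` hooks above select the dialect-specific SQL when the statement is compiled. `t` is assumed to be the `SearchTables` collection from the connection module.

import sqlalchemy as sa

def reverse_candidates(t, geom):
    # Builds a query that the PostgreSQL and SQLite compilers above
    # expand differently for the same Python-side expression.
    return sa.select(t.placex.c.place_id)\
             .where(sa.func.IntersectsReverseDistance(t.placex, geom))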
122
src/nominatim_api/sql/sqlite_functions.py
Normal file
@@ -0,0 +1,122 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Custom functions for SQLite.
"""
from typing import cast, Optional, Set, Any
import json

# pylint: disable=protected-access

def weigh_search(search_vector: Optional[str], rankings: str, default: float) -> float:
    """ Custom weight function for search results.
    """
    if search_vector is not None:
        svec = [int(x) for x in search_vector.split(',')]
        for rank in json.loads(rankings):
            if all(r in svec for r in rank[1]):
                return cast(float, rank[0])

    return default


class ArrayIntersectFuzzy:
    """ Compute the array of common elements of all input integer arrays.
        Very large input parameters may be ignored to speed up
        computation. Therefore, the result is a superset of common elements.

        Input and output arrays are given as comma-separated lists.
    """
    def __init__(self) -> None:
        self.first = ''
        self.values: Optional[Set[int]] = None

    def step(self, value: Optional[str]) -> None:
        """ Add the next array to the intersection.
        """
        if value is not None:
            if not self.first:
                self.first = value
            elif len(value) < 10000000:
                if self.values is None:
                    self.values = {int(x) for x in self.first.split(',')}
                self.values.intersection_update((int(x) for x in value.split(',')))

    def finalize(self) -> str:
        """ Return the final result.
        """
        if self.values is not None:
            return ','.join(map(str, self.values))

        return self.first


class ArrayUnion:
    """ Compute the set of all elements of the input integer arrays.

        Input and output arrays are given as strings of comma-separated lists.
    """
    def __init__(self) -> None:
        self.values: Optional[Set[str]] = None

    def step(self, value: Optional[str]) -> None:
        """ Add the next array to the union.
        """
        if value is not None:
            if self.values is None:
                self.values = set(value.split(','))
            else:
                self.values.update(value.split(','))

    def finalize(self) -> str:
        """ Return the final result.
        """
        return '' if self.values is None else ','.join(self.values)


def array_contains(container: Optional[str], containee: Optional[str]) -> Optional[bool]:
    """ Is the array 'containee' completely contained in array 'container'.
    """
    if container is None or containee is None:
        return None

    vset = container.split(',')
    return all(v in vset for v in containee.split(','))


def array_pair_contains(container1: Optional[str], container2: Optional[str],
                        containee: Optional[str]) -> Optional[bool]:
    """ Is the array 'containee' completely contained in the union of
        array 'container1' and array 'container2'.
    """
    if container1 is None or container2 is None or containee is None:
        return None

    vset = container1.split(',') + container2.split(',')
    return all(v in vset for v in containee.split(','))


def install_custom_functions(conn: Any) -> None:
    """ Install helper functions for Nominatim into the given SQLite
        database connection.
    """
    conn.create_function('weigh_search', 3, weigh_search, deterministic=True)
    conn.create_function('array_contains', 2, array_contains, deterministic=True)
    conn.create_function('array_pair_contains', 3, array_pair_contains, deterministic=True)
    _create_aggregate(conn, 'array_intersect_fuzzy', 1, ArrayIntersectFuzzy)
    _create_aggregate(conn, 'array_union', 1, ArrayUnion)


async def _make_aggregate(aioconn: Any, *args: Any) -> None:
    await aioconn._execute(aioconn._conn.create_aggregate, *args)


def _create_aggregate(conn: Any, name: str, nargs: int, aggregate: Any) -> None:
    try:
        conn.await_(_make_aggregate(conn._connection, name, nargs, aggregate))
    except Exception as error:  # pylint: disable=broad-exception-caught
        conn._handle_exception(error)
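The aggregate classes follow the standard `step()`/`finalize()` protocol of Python's sqlite3 module, so they can be exercised directly with the standard library. A minimal sketch (not part of this commit):

import sqlite3

db = sqlite3.connect(':memory:')
db.create_aggregate('array_union', 1, ArrayUnion)
db.execute('CREATE TABLE t (ids TEXT)')
db.executemany('INSERT INTO t VALUES (?)', [('1,2',), ('2,3',), (None,)])
# NULL rows are skipped by step(); prints e.g. '1,2,3'
# (element order is unspecified because a set is used internally).
print(db.execute('SELECT array_union(ids) FROM t').fetchone()[0])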
51
src/nominatim_api/status.py
Normal file
@@ -0,0 +1,51 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Classes and functions related to the status call.
"""
from typing import Optional
import datetime as dt
import dataclasses

import sqlalchemy as sa

from .connection import SearchConnection
from .version import NOMINATIM_API_VERSION

@dataclasses.dataclass
class StatusResult:
    """ Result of a call to the status API.
    """
    status: int
    message: str
    software_version = NOMINATIM_API_VERSION
    data_updated: Optional[dt.datetime] = None
    database_version: Optional[str] = None


async def get_status(conn: SearchConnection) -> StatusResult:
    """ Execute a status API call.
    """
    status = StatusResult(0, 'OK')

    # Last update date
    sql = sa.select(conn.t.import_status.c.lastimportdate).limit(1)
    status.data_updated = await conn.scalar(sql)

    if status.data_updated is not None:
        if status.data_updated.tzinfo is None:
            status.data_updated = status.data_updated.replace(tzinfo=dt.timezone.utc)
        else:
            status.data_updated = status.data_updated.astimezone(dt.timezone.utc)

    # Database version
    try:
        status.database_version = await conn.get_property('database_version')
    except ValueError:
        pass

    return status
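`get_status()` is normally reached through the public library interface rather than called directly. A minimal sketch, assuming a configured project directory (the directory path is a placeholder):

import asyncio
from pathlib import Path
from nominatim_api import NominatimAPIAsync

async def main() -> None:
    api = NominatimAPIAsync(Path('.'))  # project directory assumed
    status = await api.status()
    print(status.status, status.message, status.data_updated)
    await api.close()

asyncio.run(main())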
550
src/nominatim_api/types.py
Normal file
@@ -0,0 +1,550 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Complex datatypes used by the Nominatim API.
"""
from typing import Optional, Union, Tuple, NamedTuple, TypeVar, Type, Dict, \
                   Any, List, Sequence
from collections import abc
import dataclasses
import enum
import math
from struct import unpack
from binascii import unhexlify

from nominatim_core.errors import UsageError
from .localization import Locales

# pylint: disable=no-member,too-many-boolean-expressions,too-many-instance-attributes

@dataclasses.dataclass
class PlaceID:
    """ Reference a place by Nominatim's internal ID.

        A PlaceID may reference a place from the main table placex, from
        the interpolation tables or the postcode tables. Place IDs are not
        stable between installations. You may therefore use this type only
        with place IDs obtained from the same database.
    """
    place_id: int
    """
    The internal ID of the place to reference.
    """


@dataclasses.dataclass
class OsmID:
    """ Reference a place by its OSM ID and potentially the basic category.

        The OSM ID may refer to places in the main table placex and OSM
        interpolation lines.
    """
    osm_type: str
    """ OSM type of the object. Must be one of `N`(node), `W`(way) or
        `R`(relation).
    """
    osm_id: int
    """ The OSM ID of the object.
    """
    osm_class: Optional[str] = None
    """ The same OSM object may appear multiple times in the database under
        different categories. The optional class parameter makes it possible
        to distinguish the different categories and corresponds to the key
        part of the category. If there are multiple objects in the database
        and `osm_class` is left out, then one of the objects is returned
        at random.
    """

    def __post_init__(self) -> None:
        if self.osm_type not in ('N', 'W', 'R'):
            raise ValueError(f"Illegal OSM type '{self.osm_type}'. Must be one of N, W, R.")


PlaceRef = Union[PlaceID, OsmID]


class Point(NamedTuple):
    """ A geographic point in WGS84 projection.
    """
    x: float
    y: float


    @property
    def lat(self) -> float:
        """ Return the latitude of the point.
        """
        return self.y


    @property
    def lon(self) -> float:
        """ Return the longitude of the point.
        """
        return self.x


    def to_geojson(self) -> str:
        """ Return the point in GeoJSON format.
        """
        return f'{{"type": "Point","coordinates": [{self.x}, {self.y}]}}'


    @staticmethod
    def from_wkb(wkb: Union[str, bytes]) -> 'Point':
        """ Create a point from EWKB as returned from the database.
        """
        if isinstance(wkb, str):
            wkb = unhexlify(wkb)
        if len(wkb) != 25:
            raise ValueError(f"Point wkb has unexpected length {len(wkb)}")
        if wkb[0] == 0:
            gtype, srid, x, y = unpack('>iidd', wkb[1:])
        elif wkb[0] == 1:
            gtype, srid, x, y = unpack('<iidd', wkb[1:])
        else:
            raise ValueError("WKB has unknown endian value.")

        if gtype != 0x20000001:
            raise ValueError("WKB must be a point geometry.")
        if srid != 4326:
            raise ValueError("Only WGS84 WKB supported.")

        return Point(x, y)


    @staticmethod
    def from_param(inp: Any) -> 'Point':
        """ Create a point from an input parameter. The parameter
            may be given as a point, a string or a sequence of
            strings or floats. Raises a UsageError if the format is
            not correct.
        """
        if isinstance(inp, Point):
            return inp

        seq: Sequence[str]
        if isinstance(inp, str):
            seq = inp.split(',')
        elif isinstance(inp, abc.Sequence):
            seq = inp

        if len(seq) != 2:
            raise UsageError('Point parameter needs 2 coordinates.')
        try:
            x, y = filter(math.isfinite, map(float, seq))
        except ValueError as exc:
            raise UsageError('Point parameter needs to be numbers.') from exc

        if x < -180.0 or x > 180.0 or y < -90.0 or y > 90.0:
            raise UsageError('Point coordinates invalid.')

        return Point(x, y)


    def to_wkt(self) -> str:
        """ Return the WKT representation of the point.
        """
        return f'POINT({self.x} {self.y})'


AnyPoint = Union[Point, Tuple[float, float]]

WKB_BBOX_HEADER_LE = b'\x01\x03\x00\x00\x20\xE6\x10\x00\x00\x01\x00\x00\x00\x05\x00\x00\x00'
WKB_BBOX_HEADER_BE = b'\x00\x20\x00\x00\x03\x00\x00\x10\xe6\x00\x00\x00\x01\x00\x00\x00\x05'

class Bbox:
    """ A bounding box in WGS84 projection.

        The coordinates are available as an array in the 'coords'
        property in the order (minx, miny, maxx, maxy).
    """
    def __init__(self, minx: float, miny: float, maxx: float, maxy: float) -> None:
        """ Create a new bounding box with the given coordinates in WGS84
            projection.
        """
        self.coords = (minx, miny, maxx, maxy)


    @property
    def minlat(self) -> float:
        """ Southern-most latitude, corresponding to the minimum y coordinate.
        """
        return self.coords[1]


    @property
    def maxlat(self) -> float:
        """ Northern-most latitude, corresponding to the maximum y coordinate.
        """
        return self.coords[3]


    @property
    def minlon(self) -> float:
        """ Western-most longitude, corresponding to the minimum x coordinate.
        """
        return self.coords[0]


    @property
    def maxlon(self) -> float:
        """ Eastern-most longitude, corresponding to the maximum x coordinate.
        """
        return self.coords[2]


    @property
    def area(self) -> float:
        """ Return the area of the box in WGS84.
        """
        return (self.coords[2] - self.coords[0]) * (self.coords[3] - self.coords[1])


    def contains(self, pt: Point) -> bool:
        """ Check if the point is inside or on the boundary of the box.
        """
        return self.coords[0] <= pt[0] and self.coords[1] <= pt[1]\
               and self.coords[2] >= pt[0] and self.coords[3] >= pt[1]


    def to_wkt(self) -> str:
        """ Return the WKT representation of the Bbox. This
            is a simple polygon with four points.
        """
        return 'POLYGON(({0} {1},{0} {3},{2} {3},{2} {1},{0} {1}))'\
                   .format(*self.coords)  # pylint: disable=consider-using-f-string


    @staticmethod
    def from_wkb(wkb: Union[None, str, bytes]) -> 'Optional[Bbox]':
        """ Create a Bbox from a bounding box polygon as returned by
            the database. Returns `None` if the input value is None.
        """
        if wkb is None:
            return None

        if isinstance(wkb, str):
            wkb = unhexlify(wkb)

        if len(wkb) != 97:
            raise ValueError("WKB must be a bounding box polygon")
        if wkb.startswith(WKB_BBOX_HEADER_LE):
            x1, y1, _, _, x2, y2 = unpack('<dddddd', wkb[17:65])
        elif wkb.startswith(WKB_BBOX_HEADER_BE):
            x1, y1, _, _, x2, y2 = unpack('>dddddd', wkb[17:65])
        else:
            raise ValueError("WKB has wrong header")

        return Bbox(min(x1, x2), min(y1, y2), max(x1, x2), max(y1, y2))


    @staticmethod
    def from_point(pt: Point, buffer: float) -> 'Bbox':
        """ Return a Bbox around the point with the buffer added to all sides.
        """
        return Bbox(pt[0] - buffer, pt[1] - buffer,
                    pt[0] + buffer, pt[1] + buffer)


    @staticmethod
    def from_param(inp: Any) -> 'Bbox':
        """ Return a Bbox from an input parameter. The box may be
            given as a Bbox, a string, or a list of strings or numbers.
            Raises a UsageError if the format is incorrect.
        """
        if isinstance(inp, Bbox):
            return inp

        seq: Sequence[str]
        if isinstance(inp, str):
            seq = inp.split(',')
        elif isinstance(inp, abc.Sequence):
            seq = inp

        if len(seq) != 4:
            raise UsageError('Bounding box parameter needs 4 coordinates.')
        try:
            x1, y1, x2, y2 = filter(math.isfinite, map(float, seq))
        except ValueError as exc:
            raise UsageError('Bounding box parameter needs to be numbers.') from exc

        x1 = min(180, max(-180, x1))
        x2 = min(180, max(-180, x2))
        y1 = min(90, max(-90, y1))
        y2 = min(90, max(-90, y2))

        if x1 == x2 or y1 == y2:
            raise UsageError('Bounding box with invalid parameters.')

        return Bbox(min(x1, x2), min(y1, y2), max(x1, x2), max(y1, y2))


class GeometryFormat(enum.Flag):
    """ All search functions support returning the full geometry of a place in
        various formats. The internal geometry is converted by PostGIS to
        the desired format and then returned as a string. It is possible to
        request multiple formats at the same time.
    """
    NONE = 0
    """ No geometry requested. Alias for an empty flag.
    """
    GEOJSON = enum.auto()
    """
    [GeoJSON](https://geojson.org/) format
    """
    KML = enum.auto()
    """
    [KML](https://en.wikipedia.org/wiki/Keyhole_Markup_Language) format
    """
    SVG = enum.auto()
    """
    [SVG](http://www.w3.org/TR/SVG/paths.html) format
    """
    TEXT = enum.auto()
    """
    [WKT](https://en.wikipedia.org/wiki/Well-known_text_representation_of_geometry) format
    """


class DataLayer(enum.Flag):
    """ The `DataLayer` flag type defines the layers that can be selected
        for reverse and forward search.
    """
    ADDRESS = enum.auto()
    """ The address layer contains all places relevant for addresses:
        fully qualified addresses with a house number (or a house name equivalent,
        for some addresses) and places that can be part of an address like
        roads, cities, states.
    """
    POI = enum.auto()
    """ Layer for points of interest like shops, restaurants but also
        recycling bins or postboxes.
    """
    RAILWAY = enum.auto()
    """ Layer with railway features including tracks and other infrastructure.
        Note that in Nominatim's standard configuration, only very few railway
        features are imported into the database. Thus a custom configuration
        is required to make full use of this layer.
    """
    NATURAL = enum.auto()
    """ Layer with natural features like rivers, lakes and mountains.
    """
    MANMADE = enum.auto()
    """ Layer with other human-made features and boundaries. This layer is
        the catch-all and includes all features not covered by the other
        layers. A typical example for this layer are national park boundaries.
    """


def format_country(cc: Any) -> List[str]:
    """ Extract a list of country codes from the input which may be either
        a string or a list of strings. Filters out all values that are not
        a two-letter string.
    """
    clist: Sequence[str]
    if isinstance(cc, str):
        clist = cc.split(',')
    elif isinstance(cc, abc.Sequence):
        clist = cc
    else:
        raise UsageError("Parameter 'country' needs to be a comma-separated list "
                         "or a Python list of strings.")

    return [cc.lower() for cc in clist if isinstance(cc, str) and len(cc) == 2]


def format_excluded(ids: Any) -> List[int]:
    """ Extract a list of place ids from the input which may be either
        a string or a list of strings or ints. Ignores empty values but
        throws a UsageError on anything that cannot be converted to int.
    """
    plist: Sequence[str]
    if isinstance(ids, str):
        plist = [s.strip() for s in ids.split(',')]
    elif isinstance(ids, abc.Sequence):
        plist = ids
    else:
        raise UsageError("Parameter 'excluded' needs to be a comma-separated list "
                         "or a Python list of numbers.")
    if not all(isinstance(i, int) or
               (isinstance(i, str) and (not i or i.isdigit())) for i in plist):
        raise UsageError("Parameter 'excluded' only takes place IDs.")

    return [int(id) for id in plist if id] or [0]


def format_categories(categories: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
    """ Extract a list of categories. Currently a noop.
    """
    return categories

TParam = TypeVar('TParam', bound='LookupDetails')  # pylint: disable=invalid-name

@dataclasses.dataclass
class LookupDetails:
    """ Collection of parameters that define which kind of details are
        returned with a lookup or details result.
    """
    geometry_output: GeometryFormat = GeometryFormat.NONE
    """ Add the full geometry of the place to the result. Multiple
        formats may be selected. Note that geometries can become quite large.
    """
    address_details: bool = False
    """ Get detailed information on the places that make up the address
        for the result.
    """
    linked_places: bool = False
    """ Get detailed information on the places that link to the result.
    """
    parented_places: bool = False
    """ Get detailed information on all places that this place is a parent
        for, i.e. all places for which it provides the address details.
        Only POI places can have parents.
    """
    keywords: bool = False
    """ Add information about the search terms used for this place.
    """
    geometry_simplification: float = 0.0
    """ Simplification factor for a geometry in degrees WGS. A factor of
        0.0 means the original geometry is kept. The higher the value, the
        more the geometry gets simplified.
    """
    locales: Locales = Locales()
    """ Preferred languages for localization of results.
    """

    @classmethod
    def from_kwargs(cls: Type[TParam], kwargs: Dict[str, Any]) -> TParam:
        """ Load the data fields of the class from a dictionary.
            Unknown entries in the dictionary are ignored, missing ones
            get the default setting.

            The function supports type checking and throws a UsageError
            when the value does not fit.
        """
        def _check_field(v: Any, field: 'dataclasses.Field[Any]') -> Any:
            if v is None:
                return field.default_factory() \
                       if field.default_factory != dataclasses.MISSING \
                       else field.default
            if field.metadata and 'transform' in field.metadata:
                return field.metadata['transform'](v)
            if not isinstance(v, field.type):
                raise UsageError(f"Parameter '{field.name}' needs to be of {field.type!s}.")
            return v

        return cls(**{f.name: _check_field(kwargs[f.name], f)
                      for f in dataclasses.fields(cls) if f.name in kwargs})


@dataclasses.dataclass
class ReverseDetails(LookupDetails):
    """ Collection of parameters for the reverse call.
    """
    max_rank: int = dataclasses.field(default=30,
                                      metadata={'transform': lambda v: max(0, min(v, 30))}
                                     )
    """ Highest address rank to return.
    """
    layers: DataLayer = DataLayer.ADDRESS | DataLayer.POI
    """ Filter which kind of data to include.
    """

@dataclasses.dataclass
class SearchDetails(LookupDetails):
    """ Collection of parameters for the search call.
    """
    max_results: int = 10
    """ Maximum number of results to be returned. The actual number of results
        may be less.
    """
    min_rank: int = dataclasses.field(default=0,
                                      metadata={'transform': lambda v: max(0, min(v, 30))}
                                     )
    """ Lowest address rank to return.
    """
    max_rank: int = dataclasses.field(default=30,
                                      metadata={'transform': lambda v: max(0, min(v, 30))}
                                     )
    """ Highest address rank to return.
    """
    layers: Optional[DataLayer] = dataclasses.field(default=None,
                                                    metadata={'transform': lambda r: r})
    """ Filter which kind of data to include. When 'None' (the default) then
        filtering by layers is disabled.
    """
    countries: List[str] = dataclasses.field(default_factory=list,
                                             metadata={'transform': format_country})
    """ Restrict search results to the given countries. An empty list (the
        default) will disable this filter.
    """
    excluded: List[int] = dataclasses.field(default_factory=list,
                                            metadata={'transform': format_excluded})
    """ List of OSM objects to exclude from the results. Currently only
        works when the internal place ID is given.
        An empty list (the default) will disable this filter.
    """
    viewbox: Optional[Bbox] = dataclasses.field(default=None,
                                                metadata={'transform': Bbox.from_param})
    """ Focus the search on a given map area.
    """
    bounded_viewbox: bool = False
    """ Use 'viewbox' as a filter and restrict results to places within the
        given area.
    """
    near: Optional[Point] = dataclasses.field(default=None,
                                              metadata={'transform': Point.from_param})
    """ Order results by distance to the given point.
    """
    near_radius: Optional[float] = dataclasses.field(default=None,
                                                     metadata={'transform': lambda r: r})
    """ Use near point as a filter and drop results outside the given
        radius. Radius is given in degrees WGS84.
    """
    categories: List[Tuple[str, str]] = dataclasses.field(default_factory=list,
                                                          metadata={'transform': format_categories})
    """ Restrict search to places with one of the given class/type categories.
        An empty list (the default) will disable this filter.
    """
    viewbox_x2: Optional[Bbox] = None

    def __post_init__(self) -> None:
        if self.viewbox is not None:
            xext = (self.viewbox.maxlon - self.viewbox.minlon)/2
            yext = (self.viewbox.maxlat - self.viewbox.minlat)/2
            self.viewbox_x2 = Bbox(self.viewbox.minlon - xext, self.viewbox.minlat - yext,
                                   self.viewbox.maxlon + xext, self.viewbox.maxlat + yext)


    def restrict_min_max_rank(self, new_min: int, new_max: int) -> None:
        """ Change the min_rank and max_rank fields to respect the
            given boundaries.
        """
        assert new_min <= new_max
        self.min_rank = max(self.min_rank, new_min)
        self.max_rank = min(self.max_rank, new_max)


    def is_impossible(self) -> bool:
        """ Check if the parameter configuration is contradictory and
            cannot yield any results.
        """
        return (self.min_rank > self.max_rank
                or (self.bounded_viewbox
                    and self.viewbox is not None and self.near is not None
                    and not self.viewbox.contains(self.near))
                or (self.layers is not None and not self.layers)
                or (self.max_rank <= 4 and
                    self.layers is not None and not self.layers & DataLayer.ADDRESS))


    def layer_enabled(self, layer: DataLayer) -> bool:
        """ Check if the given layer has been chosen. Also returns
            true when layer restriction has been disabled completely.
        """
        return self.layers is None or bool(self.layers & layer)
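A minimal sketch of how the parameter dataclasses are meant to be fed (not part of this commit): `from_kwargs()` applies the per-field 'transform' metadata, so raw query parameters can be passed through directly.

details = SearchDetails.from_kwargs({
    'max_results': 5,
    'countries': 'de,FR',             # becomes ['de', 'fr'] via format_country
    'viewbox': '5.6,49.4,15.1,55.1',  # becomes a Bbox via Bbox.from_param
})
# layers defaults to None, so no layer filtering is applied.
assert details.layer_enabled(DataLayer.ADDRESS)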
21
src/nominatim_api/v1/__init__.py
Normal file
@@ -0,0 +1,21 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Implementation of API version v1 (aka the legacy version).
"""

#pylint: disable=useless-import-alias

from .server_glue import (ASGIAdaptor as ASGIAdaptor,
                          EndpointFunc as EndpointFunc,
                          ROUTES as ROUTES)

from . import format as _format

list_formats = _format.dispatch.list_formats
supports_format = _format.dispatch.supports_format
format_result = _format.dispatch.format_result
201
src/nominatim_api/v1/classtypes.py
Normal file
@@ -0,0 +1,201 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Hard-coded information about tag categories.

These tables have been copied verbatim from the old PHP code. For future
versions a more flexible formatting is required.
"""
from typing import Tuple, Optional, Mapping, Union

from ..results import ReverseResult, SearchResult
from ..types import Bbox

def get_label_tag(category: Tuple[str, str], extratags: Optional[Mapping[str, str]],
                  rank: int, country: Optional[str]) -> str:
    """ Create a label tag for the given place that can be used as an XML name.
    """
    if rank < 26 and extratags and 'place' in extratags:
        label = extratags['place']
    elif rank < 26 and extratags and 'linked_place' in extratags:
        label = extratags['linked_place']
    elif category == ('boundary', 'administrative'):
        label = ADMIN_LABELS.get((country or '', int(rank/2)))\
                or ADMIN_LABELS.get(('', int(rank/2)))\
                or 'Administrative'
    elif category[1] == 'postal_code':
        label = 'postcode'
    elif rank < 26:
        label = category[1] if category[1] != 'yes' else category[0]
    elif rank < 28:
        label = 'road'
    elif category[0] == 'place'\
         and category[1] in ('house_number', 'house_name', 'country_code'):
        label = category[1]
    else:
        label = category[0]

    return label.lower().replace(' ', '_')


def bbox_from_result(result: Union[ReverseResult, SearchResult]) -> Bbox:
    """ Compute a bounding box for the result. For ways and relations
        a given boundingbox is used. For all other objects, a box is computed
        around the centroid according to dimensions derived from the
        search rank.
    """
    if (result.osm_object and result.osm_object[0] == 'N') or result.bbox is None:
        extent = NODE_EXTENT.get(result.category, 0.00005)
        return Bbox.from_point(result.centroid, extent)

    return result.bbox


# pylint: disable=line-too-long
OSM_ATTRIBUTION = 'Data © OpenStreetMap contributors, ODbL 1.0. http://osm.org/copyright'


OSM_TYPE_NAME = {
    'N': 'node',
    'W': 'way',
    'R': 'relation'
}


ADMIN_LABELS = {
    ('', 1): 'Continent',
    ('', 2): 'Country',
    ('', 3): 'Region',
    ('', 4): 'State',
    ('', 5): 'State District',
    ('', 6): 'County',
    ('', 7): 'Municipality',
    ('', 8): 'City',
    ('', 9): 'City District',
    ('', 10): 'Suburb',
    ('', 11): 'Neighbourhood',
    ('', 12): 'City Block',
    ('no', 3): 'State',
    ('no', 4): 'County',
    ('se', 3): 'State',
    ('se', 4): 'County'
}


ICONS = {
    ('boundary', 'administrative'): 'poi_boundary_administrative',
    ('place', 'city'): 'poi_place_city',
    ('place', 'town'): 'poi_place_town',
    ('place', 'village'): 'poi_place_village',
    ('place', 'hamlet'): 'poi_place_village',
    ('place', 'suburb'): 'poi_place_village',
    ('place', 'locality'): 'poi_place_village',
    ('place', 'airport'): 'transport_airport2',
    ('aeroway', 'aerodrome'): 'transport_airport2',
    ('railway', 'station'): 'transport_train_station2',
    ('amenity', 'place_of_worship'): 'place_of_worship_unknown3',
    ('amenity', 'pub'): 'food_pub',
    ('amenity', 'bar'): 'food_bar',
    ('amenity', 'university'): 'education_university',
    ('tourism', 'museum'): 'tourist_museum',
    ('amenity', 'arts_centre'): 'tourist_art_gallery2',
    ('tourism', 'zoo'): 'tourist_zoo',
    ('tourism', 'theme_park'): 'poi_point_of_interest',
    ('tourism', 'attraction'): 'poi_point_of_interest',
    ('leisure', 'golf_course'): 'sport_golf',
    ('historic', 'castle'): 'tourist_castle',
    ('amenity', 'hospital'): 'health_hospital',
    ('amenity', 'school'): 'education_school',
    ('amenity', 'theatre'): 'tourist_theatre',
    ('amenity', 'library'): 'amenity_library',
    ('amenity', 'fire_station'): 'amenity_firestation3',
    ('amenity', 'police'): 'amenity_police2',
    ('amenity', 'bank'): 'money_bank2',
    ('amenity', 'post_office'): 'amenity_post_office',
    ('tourism', 'hotel'): 'accommodation_hotel2',
    ('amenity', 'cinema'): 'tourist_cinema',
    ('tourism', 'artwork'): 'tourist_art_gallery2',
    ('historic', 'archaeological_site'): 'tourist_archaeological2',
    ('amenity', 'doctors'): 'health_doctors',
    ('leisure', 'sports_centre'): 'sport_leisure_centre',
    ('leisure', 'swimming_pool'): 'sport_swimming_outdoor',
    ('shop', 'supermarket'): 'shopping_supermarket',
    ('shop', 'convenience'): 'shopping_convenience',
    ('amenity', 'restaurant'): 'food_restaurant',
    ('amenity', 'fast_food'): 'food_fastfood',
    ('amenity', 'cafe'): 'food_cafe',
    ('tourism', 'guest_house'): 'accommodation_bed_and_breakfast',
    ('amenity', 'pharmacy'): 'health_pharmacy_dispensing',
    ('amenity', 'fuel'): 'transport_fuel',
    ('natural', 'peak'): 'poi_peak',
    ('natural', 'wood'): 'landuse_coniferous_and_deciduous',
    ('shop', 'bicycle'): 'shopping_bicycle',
    ('shop', 'clothes'): 'shopping_clothes',
    ('shop', 'hairdresser'): 'shopping_hairdresser',
    ('shop', 'doityourself'): 'shopping_diy',
    ('shop', 'estate_agent'): 'shopping_estateagent2',
    ('shop', 'car'): 'shopping_car',
    ('shop', 'garden_centre'): 'shopping_garden_centre',
    ('shop', 'car_repair'): 'shopping_car_repair',
    ('shop', 'bakery'): 'shopping_bakery',
    ('shop', 'butcher'): 'shopping_butcher',
    ('shop', 'apparel'): 'shopping_clothes',
    ('shop', 'laundry'): 'shopping_laundrette',
    ('shop', 'beverages'): 'shopping_alcohol',
    ('shop', 'alcohol'): 'shopping_alcohol',
    ('shop', 'optician'): 'health_opticians',
    ('shop', 'chemist'): 'health_pharmacy',
    ('shop', 'gallery'): 'tourist_art_gallery2',
    ('shop', 'jewelry'): 'shopping_jewelry',
    ('tourism', 'information'): 'amenity_information',
    ('historic', 'ruins'): 'tourist_ruin',
    ('amenity', 'college'): 'education_school',
    ('historic', 'monument'): 'tourist_monument',
    ('historic', 'memorial'): 'tourist_monument',
    ('historic', 'mine'): 'poi_mine',
    ('tourism', 'caravan_site'): 'accommodation_caravan_park',
    ('amenity', 'bus_station'): 'transport_bus_station',
    ('amenity', 'atm'): 'money_atm2',
    ('tourism', 'viewpoint'): 'tourist_view_point',
    ('tourism', 'guesthouse'): 'accommodation_bed_and_breakfast',
    ('railway', 'tram'): 'transport_tram_stop',
    ('amenity', 'courthouse'): 'amenity_court',
    ('amenity', 'recycling'): 'amenity_recycling',
    ('amenity', 'dentist'): 'health_dentist',
    ('natural', 'beach'): 'tourist_beach',
    ('railway', 'tram_stop'): 'transport_tram_stop',
    ('amenity', 'prison'): 'amenity_prison',
    ('highway', 'bus_stop'): 'transport_bus_stop2'
}

NODE_EXTENT = {
    ('place', 'continent'): 25,
    ('place', 'country'): 7,
    ('place', 'state'): 2.6,
    ('place', 'province'): 2.6,
    ('place', 'region'): 1.0,
    ('place', 'county'): 0.7,
    ('place', 'city'): 0.16,
    ('place', 'municipality'): 0.16,
    ('place', 'island'): 0.32,
    ('place', 'postcode'): 0.16,
    ('place', 'town'): 0.04,
    ('place', 'village'): 0.02,
    ('place', 'hamlet'): 0.02,
    ('place', 'district'): 0.02,
    ('place', 'borough'): 0.02,
    ('place', 'suburb'): 0.02,
    ('place', 'locality'): 0.01,
    ('place', 'neighbourhood'): 0.01,
    ('place', 'quarter'): 0.01,
    ('place', 'city_block'): 0.01,
    ('landuse', 'farm'): 0.01,
    ('place', 'farm'): 0.01,
    ('place', 'airport'): 0.015,
    ('aeroway', 'aerodrome'): 0.015,
    ('railway', 'station'): 0.005
}
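A short worked example of the label mapping above (values taken from the tables in this file): an administrative boundary at rank_address 8 maps to admin level 4, i.e. the generic 'State' label, while a rank-26 highway falls through to the road bucket.

print(get_label_tag(('boundary', 'administrative'), None, 8, None))   # 'state'
print(get_label_tag(('highway', 'residential'), None, 26, None))     # 'road'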
259
src/nominatim_api/v1/format.py
Normal file
@@ -0,0 +1,259 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Output formatters for API version v1.
"""
from typing import List, Dict, Mapping, Any
import collections
import datetime as dt

from nominatim_core.utils.json_writer import JsonWriter
from ..status import StatusResult
from ..results import DetailedResult, ReverseResults, SearchResults, \
                      AddressLines, AddressLine
from ..localization import Locales
from ..result_formatting import FormatDispatcher
from .classtypes import ICONS
from . import format_json, format_xml

class RawDataList(List[Dict[str, Any]]):
    """ Data type for formatting raw data lists 'as is' in json.
    """

dispatch = FormatDispatcher()

@dispatch.format_func(StatusResult, 'text')
def _format_status_text(result: StatusResult, _: Mapping[str, Any]) -> str:
    if result.status:
        return f"ERROR: {result.message}"

    return 'OK'


@dispatch.format_func(StatusResult, 'json')
def _format_status_json(result: StatusResult, _: Mapping[str, Any]) -> str:
    out = JsonWriter()

    out.start_object()\
         .keyval('status', result.status)\
         .keyval('message', result.message)\
         .keyval_not_none('data_updated', result.data_updated,
                          lambda v: v.isoformat())\
         .keyval('software_version', str(result.software_version))\
         .keyval_not_none('database_version', result.database_version, str)\
       .end_object()

    return out()


def _add_address_row(writer: JsonWriter, row: AddressLine,
                     locales: Locales) -> None:
    writer.start_object()\
            .keyval('localname', locales.display_name(row.names))\
            .keyval_not_none('place_id', row.place_id)

    if row.osm_object is not None:
        writer.keyval('osm_id', row.osm_object[1])\
              .keyval('osm_type', row.osm_object[0])

    if row.extratags:
        writer.keyval_not_none('place_type', row.extratags.get('place_type'))

    writer.keyval('class', row.category[0])\
          .keyval('type', row.category[1])\
          .keyval_not_none('admin_level', row.admin_level)\
          .keyval('rank_address', row.rank_address)\
          .keyval('distance', row.distance)\
          .keyval('isaddress', row.isaddress)\
          .end_object()


def _add_address_rows(writer: JsonWriter, section: str, rows: AddressLines,
                      locales: Locales) -> None:
    writer.key(section).start_array()
    for row in rows:
        _add_address_row(writer, row, locales)
        writer.next()
    writer.end_array().next()


def _add_parent_rows_grouped(writer: JsonWriter, rows: AddressLines,
                             locales: Locales) -> None:
    # group by category type
    data = collections.defaultdict(list)
    for row in rows:
        sub = JsonWriter()
        _add_address_row(sub, row, locales)
        data[row.category[1]].append(sub())

    writer.key('hierarchy').start_object()
    for group, grouped in data.items():
        writer.key(group).start_array()
        grouped.sort()  # sorts alphabetically by local name
        for line in grouped:
            writer.raw(line).next()
        writer.end_array().next()

    writer.end_object().next()


@dispatch.format_func(DetailedResult, 'json')
def _format_details_json(result: DetailedResult, options: Mapping[str, Any]) -> str:
    locales = options.get('locales', Locales())
    geom = result.geometry.get('geojson')
    centroid = result.centroid.to_geojson()

    out = JsonWriter()
    out.start_object()\
         .keyval_not_none('place_id', result.place_id)\
         .keyval_not_none('parent_place_id', result.parent_place_id)

    if result.osm_object is not None:
        out.keyval('osm_type', result.osm_object[0])\
           .keyval('osm_id', result.osm_object[1])

    out.keyval('category', result.category[0])\
       .keyval('type', result.category[1])\
       .keyval('admin_level', result.admin_level)\
       .keyval('localname', result.locale_name or '')\
       .keyval('names', result.names or {})\
       .keyval('addresstags', result.address or {})\
       .keyval_not_none('housenumber', result.housenumber)\
       .keyval_not_none('calculated_postcode', result.postcode)\
       .keyval_not_none('country_code', result.country_code)\
       .keyval_not_none('indexed_date', result.indexed_date, lambda v: v.isoformat())\
       .keyval_not_none('importance', result.importance)\
       .keyval('calculated_importance', result.calculated_importance())\
       .keyval('extratags', result.extratags or {})\
       .keyval_not_none('calculated_wikipedia', result.wikipedia)\
       .keyval('rank_address', result.rank_address)\
       .keyval('rank_search', result.rank_search)\
       .keyval('isarea', 'Polygon' in (geom or result.geometry.get('type') or ''))\
       .key('centroid').raw(centroid).next()\
       .key('geometry').raw(geom or centroid).next()

    if options.get('icon_base_url', None):
        icon = ICONS.get(result.category)
        if icon:
            out.keyval('icon', f"{options['icon_base_url']}/{icon}.p.20.png")

    if result.address_rows is not None:
        _add_address_rows(out, 'address', result.address_rows, locales)

    if result.linked_rows:
        _add_address_rows(out, 'linked_places', result.linked_rows, locales)

    if result.name_keywords is not None or result.address_keywords is not None:
        out.key('keywords').start_object()

        for sec, klist in (('name', result.name_keywords), ('address', result.address_keywords)):
            out.key(sec).start_array()
            for word in (klist or []):
                out.start_object()\
                     .keyval('id', word.word_id)\
                     .keyval('token', word.word_token)\
                   .end_object().next()
            out.end_array().next()

        out.end_object().next()

    if result.parented_rows is not None:
        if options.get('group_hierarchy', False):
            _add_parent_rows_grouped(out, result.parented_rows, locales)
        else:
            _add_address_rows(out, 'hierarchy', result.parented_rows, locales)

    out.end_object()

    return out()


@dispatch.format_func(ReverseResults, 'xml')
def _format_reverse_xml(results: ReverseResults, options: Mapping[str, Any]) -> str:
    return format_xml.format_base_xml(results,
                                      options, True, 'reversegeocode',
                                      {'querystring': options.get('query', '')})


@dispatch.format_func(ReverseResults, 'geojson')
def _format_reverse_geojson(results: ReverseResults,
                            options: Mapping[str, Any]) -> str:
    return format_json.format_base_geojson(results, options, True)


@dispatch.format_func(ReverseResults, 'geocodejson')
def _format_reverse_geocodejson(results: ReverseResults,
                                options: Mapping[str, Any]) -> str:
    return format_json.format_base_geocodejson(results, options, True)


@dispatch.format_func(ReverseResults, 'json')
def _format_reverse_json(results: ReverseResults,
                         options: Mapping[str, Any]) -> str:
    return format_json.format_base_json(results, options, True,
                                        class_label='class')


@dispatch.format_func(ReverseResults, 'jsonv2')
def _format_reverse_jsonv2(results: ReverseResults,
                           options: Mapping[str, Any]) -> str:
    return format_json.format_base_json(results, options, True,
                                        class_label='category')


@dispatch.format_func(SearchResults, 'xml')
def _format_search_xml(results: SearchResults, options: Mapping[str, Any]) -> str:
    extra = {'querystring': options.get('query', '')}
    for attr in ('more_url', 'exclude_place_ids', 'viewbox'):
        if options.get(attr):
            extra[attr] = options[attr]
    return format_xml.format_base_xml(results, options, False, 'searchresults',
                                      extra)


@dispatch.format_func(SearchResults, 'geojson')
def _format_search_geojson(results: SearchResults,
                           options: Mapping[str, Any]) -> str:
    return format_json.format_base_geojson(results, options, False)


@dispatch.format_func(SearchResults, 'geocodejson')
def _format_search_geocodejson(results: SearchResults,
                               options: Mapping[str, Any]) -> str:
    return format_json.format_base_geocodejson(results, options, False)


@dispatch.format_func(SearchResults, 'json')
def _format_search_json(results: SearchResults,
                        options: Mapping[str, Any]) -> str:
    return format_json.format_base_json(results, options, False,
                                        class_label='class')


@dispatch.format_func(SearchResults, 'jsonv2')
def _format_search_jsonv2(results: SearchResults,
                          options: Mapping[str, Any]) -> str:
    return format_json.format_base_json(results, options, False,
                                        class_label='category')

@dispatch.format_func(RawDataList, 'json')
def _format_raw_data_json(results: RawDataList, _: Mapping[str, Any]) -> str:
    out = JsonWriter()
    out.start_array()
    for res in results:
        out.start_object()
        for k, v in res.items():
            if isinstance(v, dt.datetime):
                out.keyval(k, v.isoformat(sep=' ', timespec='seconds'))
            else:
                out.keyval(k, v)
        out.end_object().next()

    out.end_array()

    return out()
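A minimal sketch of the dispatch mechanism (not part of this commit): the module-level FormatDispatcher collects every function decorated with @dispatch.format_func, keyed by result type and format name, and the v1 package re-exports its lookup helpers.

from nominatim_api.v1 import format_result, supports_format
from nominatim_api import StatusResult

assert supports_format(StatusResult, 'json')
# Serialises via _format_status_json registered above.
print(format_result(StatusResult(0, 'OK'), 'json', {}))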
275
src/nominatim_api/v1/format_json.py
Normal file
@@ -0,0 +1,275 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Helper functions for output of results in json formats.
"""
from typing import Mapping, Any, Optional, Tuple, Union

from nominatim_core.utils.json_writer import JsonWriter
from ..results import AddressLines, ReverseResults, SearchResults
from . import classtypes as cl

#pylint: disable=too-many-branches

def _write_osm_id(out: JsonWriter, osm_object: Optional[Tuple[str, int]]) -> None:
    if osm_object is not None:
        out.keyval_not_none('osm_type', cl.OSM_TYPE_NAME.get(osm_object[0], None))\
           .keyval('osm_id', osm_object[1])


def _write_typed_address(out: JsonWriter, address: Optional[AddressLines],
                         country_code: Optional[str]) -> None:
    parts = {}
    for line in (address or []):
        if line.isaddress:
            if line.local_name:
                label = cl.get_label_tag(line.category, line.extratags,
                                         line.rank_address, country_code)
                if label not in parts:
                    parts[label] = line.local_name
            if line.names and 'ISO3166-2' in line.names and line.admin_level:
                parts[f"ISO3166-2-lvl{line.admin_level}"] = line.names['ISO3166-2']

    for k, v in parts.items():
        out.keyval(k, v)

    if country_code:
        out.keyval('country_code', country_code)


def _write_geocodejson_address(out: JsonWriter,
                               address: Optional[AddressLines],
                               obj_place_id: Optional[int],
                               country_code: Optional[str]) -> None:
    extra = {}
    for line in (address or []):
        if line.isaddress and line.local_name:
            if line.category[1] in ('postcode', 'postal_code'):
                out.keyval('postcode', line.local_name)
            elif line.category[1] == 'house_number':
                out.keyval('housenumber', line.local_name)
            elif (obj_place_id is None or obj_place_id != line.place_id) \
                 and line.rank_address >= 4 and line.rank_address < 28:
                rank_name = GEOCODEJSON_RANKS[line.rank_address]
                if rank_name not in extra:
                    extra[rank_name] = line.local_name

    for k, v in extra.items():
        out.keyval(k, v)

    if country_code:
        out.keyval('country_code', country_code)


def format_base_json(results: Union[ReverseResults, SearchResults],
                     options: Mapping[str, Any], simple: bool,
                     class_label: str) -> str:
    """ Return the result list as a simple json string in custom Nominatim format.
    """
    out = JsonWriter()

    if simple:
        if not results:
            return '{"error":"Unable to geocode"}'
    else:
        out.start_array()

    for result in results:
        out.start_object()\
             .keyval_not_none('place_id', result.place_id)\
             .keyval('licence', cl.OSM_ATTRIBUTION)

        _write_osm_id(out, result.osm_object)

        out.keyval('lat', f"{result.centroid.lat}")\
           .keyval('lon', f"{result.centroid.lon}")\
           .keyval(class_label, result.category[0])\
           .keyval('type', result.category[1])\
           .keyval('place_rank', result.rank_search)\
           .keyval('importance', result.calculated_importance())\
           .keyval('addresstype', cl.get_label_tag(result.category, result.extratags,
                                                   result.rank_address,
                                                   result.country_code))\
           .keyval('name', result.locale_name or '')\
           .keyval('display_name', result.display_name or '')

        if options.get('icon_base_url', None):
            icon = cl.ICONS.get(result.category)
            if icon:
                out.keyval('icon', f"{options['icon_base_url']}/{icon}.p.20.png")

        if options.get('addressdetails', False):
            out.key('address').start_object()
            _write_typed_address(out, result.address_rows, result.country_code)
            out.end_object().next()

        if options.get('extratags', False):
            out.keyval('extratags', result.extratags)

        if options.get('namedetails', False):
            out.keyval('namedetails', result.names)

        bbox = cl.bbox_from_result(result)
        out.key('boundingbox').start_array()\
             .value(f"{bbox.minlat:0.7f}").next()\
             .value(f"{bbox.maxlat:0.7f}").next()\
             .value(f"{bbox.minlon:0.7f}").next()\
             .value(f"{bbox.maxlon:0.7f}").next()\
           .end_array().next()

        if result.geometry:
            for key in ('text', 'kml'):
                out.keyval_not_none('geo' + key, result.geometry.get(key))
            if 'geojson' in result.geometry:
                out.key('geojson').raw(result.geometry['geojson']).next()
            out.keyval_not_none('svg', result.geometry.get('svg'))

        out.end_object()

        if simple:
            return out()

        out.next()

    out.end_array()

    return out()


def format_base_geojson(results: Union[ReverseResults, SearchResults],
                        options: Mapping[str, Any],
                        simple: bool) -> str:
    """ Return the result list as a geojson string.
    """
    if not results and simple:
        return '{"error":"Unable to geocode"}'

    out = JsonWriter()

    out.start_object()\
         .keyval('type', 'FeatureCollection')\
         .keyval('licence', cl.OSM_ATTRIBUTION)\
         .key('features').start_array()

    for result in results:
        out.start_object()\
             .keyval('type', 'Feature')\
             .key('properties').start_object()

        out.keyval_not_none('place_id', result.place_id)

        _write_osm_id(out, result.osm_object)

        out.keyval('place_rank', result.rank_search)\
           .keyval('category', result.category[0])\
           .keyval('type', result.category[1])\
           .keyval('importance', result.calculated_importance())\
           .keyval('addresstype', cl.get_label_tag(result.category, result.extratags,
                                                   result.rank_address,
                                                   result.country_code))\
           .keyval('name', result.locale_name or '')\
           .keyval('display_name', result.display_name or '')

        if options.get('addressdetails', False):
            out.key('address').start_object()
            _write_typed_address(out, result.address_rows, result.country_code)
            out.end_object().next()

        if options.get('extratags', False):
            out.keyval('extratags', result.extratags)

        if options.get('namedetails', False):
            out.keyval('namedetails', result.names)

        out.end_object().next()  # properties

        out.key('bbox').start_array()
        for coord in cl.bbox_from_result(result).coords:
            out.float(coord, 7).next()
        out.end_array().next()

        out.key('geometry').raw(result.geometry.get('geojson')
                                or result.centroid.to_geojson()).next()

        out.end_object().next()

    out.end_array().next().end_object()

    return out()


def format_base_geocodejson(results: Union[ReverseResults, SearchResults],
                            options: Mapping[str, Any], simple: bool) -> str:
    """ Return the result list as a geocodejson string.
    """
    if not results and simple:
        return '{"error":"Unable to geocode"}'

    out = JsonWriter()

    out.start_object()\
         .keyval('type', 'FeatureCollection')\
         .key('geocoding').start_object()\
           .keyval('version', '0.1.0')\
           .keyval('attribution', cl.OSM_ATTRIBUTION)\
           .keyval('licence', 'ODbL')\
           .keyval_not_none('query', options.get('query'))\
           .end_object().next()\
         .key('features').start_array()

    for result in results:
        out.start_object()\
             .keyval('type', 'Feature')\
             .key('properties').start_object()\
|
||||
.key('geocoding').start_object()
|
||||
|
||||
out.keyval_not_none('place_id', result.place_id)
|
||||
|
||||
_write_osm_id(out, result.osm_object)
|
||||
|
||||
out.keyval('osm_key', result.category[0])\
|
||||
.keyval('osm_value', result.category[1])\
|
||||
.keyval('type', GEOCODEJSON_RANKS[max(3, min(28, result.rank_address))])\
|
||||
.keyval_not_none('accuracy', getattr(result, 'distance', None), transform=int)\
|
||||
.keyval('label', result.display_name or '')\
|
||||
.keyval_not_none('name', result.locale_name or None)\
|
||||
|
||||
if options.get('addressdetails', False):
|
||||
_write_geocodejson_address(out, result.address_rows, result.place_id,
|
||||
result.country_code)
|
||||
|
||||
out.key('admin').start_object()
|
||||
if result.address_rows:
|
||||
for line in result.address_rows:
|
||||
if line.isaddress and (line.admin_level or 15) < 15 and line.local_name \
|
||||
and line.category[0] == 'boundary' and line.category[1] == 'administrative':
|
||||
out.keyval(f"level{line.admin_level}", line.local_name)
|
||||
out.end_object().next()
|
||||
|
||||
out.end_object().next().end_object().next()
|
||||
|
||||
out.key('geometry').raw(result.geometry.get('geojson')
|
||||
or result.centroid.to_geojson()).next()
|
||||
|
||||
out.end_object().next()
|
||||
|
||||
out.end_array().next().end_object()
|
||||
|
||||
return out()
|
||||
|
||||
|
||||
GEOCODEJSON_RANKS = {
|
||||
3: 'locality',
|
||||
4: 'country',
|
||||
5: 'state', 6: 'state', 7: 'state', 8: 'state', 9: 'state',
|
||||
10: 'county', 11: 'county', 12: 'county',
|
||||
13: 'city', 14: 'city', 15: 'city', 16: 'city',
|
||||
17: 'district', 18: 'district', 19: 'district', 20: 'district', 21: 'district',
|
||||
22: 'locality', 23: 'locality', 24: 'locality',
|
||||
25: 'street', 26: 'street', 27: 'street', 28: 'house'}
|
||||
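
# Illustration (editor's note, not part of the original module): the rank
# table above is what _write_geocodejson_address and format_base_geocodejson
# use to translate Nominatim address ranks into geocodejson type labels, e.g.:
#
#   GEOCODEJSON_RANKS[16]   # -> 'city'
#   GEOCODEJSON_RANKS[26]   # -> 'street'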
126
src/nominatim_api/v1/format_xml.py
Normal file
@@ -0,0 +1,126 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Helper functions for output of results in XML format.
"""
from typing import Mapping, Any, Optional, Union
import datetime as dt
import xml.etree.ElementTree as ET

from ..results import AddressLines, ReverseResult, ReverseResults, \
                      SearchResult, SearchResults
from . import classtypes as cl

#pylint: disable=too-many-branches

def _write_xml_address(root: ET.Element, address: AddressLines,
                       country_code: Optional[str]) -> None:
    parts = {}
    for line in address:
        if line.isaddress:
            if line.local_name:
                label = cl.get_label_tag(line.category, line.extratags,
                                         line.rank_address, country_code)
                if label not in parts:
                    parts[label] = line.local_name
            if line.names and 'ISO3166-2' in line.names and line.admin_level:
                parts[f"ISO3166-2-lvl{line.admin_level}"] = line.names['ISO3166-2']

    for k, v in parts.items():
        ET.SubElement(root, k).text = v

    if country_code:
        ET.SubElement(root, 'country_code').text = country_code


def _create_base_entry(result: Union[ReverseResult, SearchResult],
                       root: ET.Element, simple: bool) -> ET.Element:
    place = ET.SubElement(root, 'result' if simple else 'place')
    if result.place_id is not None:
        place.set('place_id', str(result.place_id))
    if result.osm_object:
        osm_type = cl.OSM_TYPE_NAME.get(result.osm_object[0], None)
        if osm_type is not None:
            place.set('osm_type', osm_type)
        place.set('osm_id', str(result.osm_object[1]))
    if result.names and 'ref' in result.names:
        place.set('ref', result.names['ref'])
    elif result.locale_name:
        # bug reproduced from PHP
        place.set('ref', result.locale_name)
    place.set('lat', f"{result.centroid.lat:.7f}")
    place.set('lon', f"{result.centroid.lon:.7f}")

    bbox = cl.bbox_from_result(result)
    place.set('boundingbox',
              f"{bbox.minlat:.7f},{bbox.maxlat:.7f},{bbox.minlon:.7f},{bbox.maxlon:.7f}")

    place.set('place_rank', str(result.rank_search))
    place.set('address_rank', str(result.rank_address))

    if result.geometry:
        for key in ('text', 'svg'):
            if key in result.geometry:
                place.set('geo' + key, result.geometry[key])
        if 'kml' in result.geometry:
            ET.SubElement(root if simple else place, 'geokml')\
              .append(ET.fromstring(result.geometry['kml']))
        if 'geojson' in result.geometry:
            place.set('geojson', result.geometry['geojson'])

    if simple:
        place.text = result.display_name or ''
    else:
        place.set('display_name', result.display_name or '')
        place.set('class', result.category[0])
        place.set('type', result.category[1])
        place.set('importance', str(result.calculated_importance()))

    return place


def format_base_xml(results: Union[ReverseResults, SearchResults],
                    options: Mapping[str, Any],
                    simple: bool, xml_root_tag: str,
                    xml_extra_info: Mapping[str, str]) -> str:
    """ Format the result into an XML response. With 'simple' exactly one
        result will be output, otherwise a list.
    """
    root = ET.Element(xml_root_tag)
    root.set('timestamp', dt.datetime.utcnow().strftime('%a, %d %b %Y %H:%M:%S +00:00'))
    root.set('attribution', cl.OSM_ATTRIBUTION)
    for k, v in xml_extra_info.items():
        root.set(k, v)

    if simple and not results:
        ET.SubElement(root, 'error').text = 'Unable to geocode'

    for result in results:
        place = _create_base_entry(result, root, simple)

        if not simple and options.get('icon_base_url', None):
            icon = cl.ICONS.get(result.category)
            if icon:
                place.set('icon', icon)

        if options.get('addressdetails', False) and result.address_rows:
            _write_xml_address(ET.SubElement(root, 'addressparts') if simple else place,
                               result.address_rows, result.country_code)

        if options.get('extratags', False):
            eroot = ET.SubElement(root if simple else place, 'extratags')
            if result.extratags:
                for k, v in result.extratags.items():
                    ET.SubElement(eroot, 'tag', attrib={'key': k, 'value': v})

        if options.get('namedetails', False):
            eroot = ET.SubElement(root if simple else place, 'namedetails')
            if result.names:
                for k, v in result.names.items():
                    ET.SubElement(eroot, 'name', attrib={'desc': k}).text = v

    return '<?xml version="1.0" encoding="UTF-8" ?>\n' + ET.tostring(root, encoding='unicode')
201
src/nominatim_api/v1/helpers.py
Normal file
@@ -0,0 +1,201 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Helper functions for parsing parameters and outputting data
specifically for the v1 version of the API.
"""
from typing import Tuple, Optional, Any, Dict, Iterable
from itertools import chain
import re

from ..results import SearchResult, SearchResults, SourceTable
from ..types import SearchDetails, GeometryFormat

REVERSE_MAX_RANKS = [2, 2, 2,   # 0-2   Continent/Sea
                     4, 4,      # 3-4   Country
                     8,         # 5     State
                     10, 10,    # 6-7   Region
                     12, 12,    # 8-9   County
                     16, 17,    # 10-11 City
                     18,        # 12    Town
                     19,        # 13    Village/Suburb
                     22,        # 14    Hamlet/Neighbourhood
                     25,        # 15    Localities
                     26,        # 16    Major Streets
                     27,        # 17    Minor Streets
                     30         # 18    Building
                    ]


def zoom_to_rank(zoom: int) -> int:
    """ Convert a zoom parameter into a rank according to the v1 API spec.
    """
    return REVERSE_MAX_RANKS[max(0, min(18, zoom))]
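
# Illustration (editor's note, not part of the original module): the zoom
# value is clamped to the 0..18 range before the table lookup, so any
# integer input is safe:
#
#   zoom_to_rank(10)    # -> 16 (city level)
#   zoom_to_rank(99)    # -> 30 (clamped to 18, i.e. building level)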

FEATURE_TYPE_TO_RANK: Dict[Optional[str], Tuple[int, int]] = {
    'country': (4, 4),
    'state': (8, 8),
    'city': (14, 16),
    'settlement': (8, 20)
}


def feature_type_to_rank(feature_type: Optional[str]) -> Tuple[int, int]:
    """ Convert a feature type parameter into a tuple of minimum and
        maximum rank. Unknown feature types map to the full rank range.
    """
    return FEATURE_TYPE_TO_RANK.get(feature_type, (0, 30))
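
# Illustration (editor's note, not part of the original module):
#
#   feature_type_to_rank('city')      # -> (14, 16)
#   feature_type_to_rank('volcano')   # -> (0, 30), the unrestricted default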

#pylint: disable=too-many-arguments,too-many-branches
def extend_query_parts(queryparts: Dict[str, Any], details: Dict[str, Any],
                       feature_type: Optional[str],
                       namedetails: bool, extratags: bool,
                       excluded: Iterable[str]) -> None:
    """ Add parameters from the details dictionary to the query parts
        dictionary, which is suitable as a URL parameter dictionary.
    """
    parsed = SearchDetails.from_kwargs(details)
    if parsed.geometry_output != GeometryFormat.NONE:
        if GeometryFormat.GEOJSON in parsed.geometry_output:
            queryparts['polygon_geojson'] = '1'
        if GeometryFormat.KML in parsed.geometry_output:
            queryparts['polygon_kml'] = '1'
        if GeometryFormat.SVG in parsed.geometry_output:
            queryparts['polygon_svg'] = '1'
        if GeometryFormat.TEXT in parsed.geometry_output:
            queryparts['polygon_text'] = '1'
    if parsed.address_details:
        queryparts['addressdetails'] = '1'
    if namedetails:
        queryparts['namedetails'] = '1'
    if extratags:
        queryparts['extratags'] = '1'
    if parsed.geometry_simplification > 0.0:
        queryparts['polygon_threshold'] = f"{parsed.geometry_simplification:.6g}"
    if parsed.max_results != 10:
        queryparts['limit'] = str(parsed.max_results)
    if parsed.countries:
        queryparts['countrycodes'] = ','.join(parsed.countries)
    queryparts['exclude_place_ids'] = \
        ','.join(chain(excluded, map(str, (e for e in parsed.excluded if e > 0))))
    if parsed.viewbox:
        queryparts['viewbox'] = ','.join(f"{c:.7g}" for c in parsed.viewbox.coords)
    if parsed.bounded_viewbox:
        queryparts['bounded'] = '1'
    if not details['dedupe']:
        queryparts['dedupe'] = '0'
    if feature_type in FEATURE_TYPE_TO_RANK:
        queryparts['featureType'] = feature_type


def deduplicate_results(results: SearchResults, max_results: int) -> SearchResults:
    """ Remove results that look like duplicates.

        Two results are considered the same if they have the same OSM ID
        or if they have the same category, display name and rank.
    """
    osm_ids_done = set()
    classification_done = set()
    deduped = SearchResults()
    for result in results:
        if result.source_table == SourceTable.POSTCODE:
            assert result.names and 'ref' in result.names
            if any(_is_postcode_relation_for(r, result.names['ref']) for r in results):
                continue
        if result.source_table == SourceTable.PLACEX:
            classification = (result.osm_object[0] if result.osm_object else None,
                              result.category,
                              result.display_name,
                              result.rank_address)
            if result.osm_object not in osm_ids_done \
               and classification not in classification_done:
                deduped.append(result)
            osm_ids_done.add(result.osm_object)
            classification_done.add(classification)
        else:
            deduped.append(result)
        if len(deduped) >= max_results:
            break

    return deduped


def _is_postcode_relation_for(result: SearchResult, postcode: str) -> bool:
    return result.source_table == SourceTable.PLACEX \
           and result.osm_object is not None \
           and result.osm_object[0] == 'R' \
           and result.category == ('boundary', 'postal_code') \
           and result.names is not None \
           and result.names.get('ref') == postcode


def _deg(axis: str) -> str:
    return f"(?P<{axis}_deg>\\d+\\.\\d+)°?"

def _deg_min(axis: str) -> str:
    return f"(?P<{axis}_deg>\\d+)[°\\s]+(?P<{axis}_min>[\\d.]+)[′']*"

def _deg_min_sec(axis: str) -> str:
    return f"(?P<{axis}_deg>\\d+)[°\\s]+(?P<{axis}_min>\\d+)[′'\\s]+(?P<{axis}_sec>[\\d.]+)[\"″]*"

COORD_REGEX = [re.compile(r'(?:(?P<pre>.*?)\s+)??' + r + r'(?:\s+(?P<post>.*))?') for r in (
    r"(?P<ns>[NS])\s*" + _deg('lat') + r"[\s,]+" + r"(?P<ew>[EW])\s*" + _deg('lon'),
    _deg('lat') + r"\s*(?P<ns>[NS])[\s,]+" + _deg('lon') + r"\s*(?P<ew>[EW])",
    r"(?P<ns>[NS])\s*" + _deg_min('lat') + r"[\s,]+" + r"(?P<ew>[EW])\s*" + _deg_min('lon'),
    _deg_min('lat') + r"\s*(?P<ns>[NS])[\s,]+" + _deg_min('lon') + r"\s*(?P<ew>[EW])",
    r"(?P<ns>[NS])\s*" + _deg_min_sec('lat') + r"[\s,]+" + r"(?P<ew>[EW])\s*" + _deg_min_sec('lon'),
    _deg_min_sec('lat') + r"\s*(?P<ns>[NS])[\s,]+" + _deg_min_sec('lon') + r"\s*(?P<ew>[EW])",
    r"\[?(?P<lat_deg>[+-]?\d+\.\d+)[\s,]+(?P<lon_deg>[+-]?\d+\.\d+)\]?"
)]

def extract_coords_from_query(query: str) -> Tuple[str, Optional[float], Optional[float]]:
    """ Look for something that is formatted like a coordinate at the
        beginning or end of the query. If found, extract the coordinate and
        return the remaining query (or the empty string if the query
        consisted of nothing but a coordinate).

        Only the first match will be returned.
    """
    for regex in COORD_REGEX:
        match = regex.fullmatch(query)
        if match is None:
            continue
        groups = match.groupdict()
        if not groups['pre'] or not groups['post']:
            x = float(groups['lon_deg']) \
                + float(groups.get('lon_min', 0.0)) / 60.0 \
                + float(groups.get('lon_sec', 0.0)) / 3600.0
            if groups.get('ew') == 'W':
                x = -x
            y = float(groups['lat_deg']) \
                + float(groups.get('lat_min', 0.0)) / 60.0 \
                + float(groups.get('lat_sec', 0.0)) / 3600.0
            if groups.get('ns') == 'S':
                y = -y
            return groups['pre'] or groups['post'] or '', x, y

    return query, None, None
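
# Illustration (editor's note, not part of the original module): a plain
# decimal pair and a hemisphere notation both match, with the coordinate
# stripped from the returned query string:
#
#   extract_coords_from_query('N 52.5 E 13.4')    # -> ('', 13.4, 52.5)
#   extract_coords_from_query('pub 52.5, 13.4')   # -> ('pub', 13.4, 52.5)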

CATEGORY_REGEX = re.compile(r'(?P<pre>.*?)\[(?P<cls>[a-zA-Z_]+)=(?P<typ>[a-zA-Z_]+)\](?P<post>.*)')

def extract_category_from_query(query: str) -> Tuple[str, Optional[str], Optional[str]]:
    """ Extract a hidden category specification of the form '[key=value]' from
        the query. If found, extract key and value and
        return the remaining query (or the empty string if the query
        consisted of nothing but a category).

        Only the first match will be returned.
    """
    match = CATEGORY_REGEX.search(query)
    if match is not None:
        return (match.group('pre').strip() + ' ' + match.group('post').strip()).strip(), \
               match.group('cls'), match.group('typ')

    return query, None, None
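
# Illustration (editor's note, not part of the original module): the category
# marker may appear anywhere in the query; the surrounding text is rejoined:
#
#   extract_category_from_query('bars [amenity=pub] berlin')
#   # -> ('bars berlin', 'amenity', 'pub')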
577
src/nominatim_api/v1/server_glue.py
Normal file
@@ -0,0 +1,577 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Generic part of the server implementation of the v1 API.
Combine with the scaffolding provided for the various Python ASGI frameworks.
"""
from typing import Optional, Any, Type, Callable, NoReturn, Dict, cast
from functools import reduce
import abc
import dataclasses
import math
from urllib.parse import urlencode

import sqlalchemy as sa

from nominatim_core.errors import UsageError
from nominatim_core.config import Configuration
from .. import logging as loglib
from ..core import NominatimAPIAsync
from .format import dispatch as formatting
from .format import RawDataList
from ..types import DataLayer, GeometryFormat, PlaceRef, PlaceID, OsmID, Point
from ..status import StatusResult
from ..results import DetailedResult, ReverseResults, SearchResult, SearchResults
from ..localization import Locales
from . import helpers

CONTENT_TEXT = 'text/plain; charset=utf-8'
CONTENT_XML = 'text/xml; charset=utf-8'
CONTENT_HTML = 'text/html; charset=utf-8'
CONTENT_JSON = 'application/json; charset=utf-8'

CONTENT_TYPE = {'text': CONTENT_TEXT, 'xml': CONTENT_XML, 'debug': CONTENT_HTML}

class ASGIAdaptor(abc.ABC):
    """ Adaptor class for the different ASGI frameworks.
        Wraps functionality over concrete requests and responses.
    """
    content_type: str = CONTENT_TEXT

    @abc.abstractmethod
    def get(self, name: str, default: Optional[str] = None) -> Optional[str]:
        """ Return an input parameter as a string. If the parameter was
            not provided, return the 'default' value.
        """

    @abc.abstractmethod
    def get_header(self, name: str, default: Optional[str] = None) -> Optional[str]:
        """ Return an HTTP header parameter as a string. If the parameter was
            not provided, return the 'default' value.
        """


    @abc.abstractmethod
    def error(self, msg: str, status: int = 400) -> Exception:
        """ Construct an appropriate exception from the given error message.
            The exception must result in an HTTP error with the given status.
        """


    @abc.abstractmethod
    def create_response(self, status: int, output: str, num_results: int) -> Any:
        """ Create a response from the given parameters. The result will
            be returned by the endpoint functions. The adaptor may also
            return None when the response is created internally with some
            different means.

            The response must return the given HTTP status code 'status', set
            the HTTP content-type header to the string provided and the
            body of the response to 'output'.
        """

    @abc.abstractmethod
    def base_uri(self) -> str:
        """ Return the URI of the original request.
        """


    @abc.abstractmethod
    def config(self) -> Configuration:
        """ Return the current configuration object.
        """


    def build_response(self, output: str, status: int = 200, num_results: int = 0) -> Any:
        """ Create a response from the given output. Wraps a JSONP function
            around the response, if necessary.
        """
        if self.content_type == CONTENT_JSON and status == 200:
            jsonp = self.get('json_callback')
            if jsonp is not None:
                if any(not part.isidentifier() for part in jsonp.split('.')):
                    self.raise_error('Invalid json_callback value')
                output = f"{jsonp}({output})"
                self.content_type = 'application/javascript; charset=utf-8'

        return self.create_response(status, output, num_results)
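
    # Illustration (editor's note, not part of the original module): with a
    # request like /search?q=berlin&format=json&json_callback=cb.done, the
    # JSON body is wrapped as 'cb.done({...})' and the content type switches
    # to application/javascript. A callback such as 'cb;alert(1)' fails the
    # isidentifier() check above and is rejected with HTTP 400.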

    def raise_error(self, msg: str, status: int = 400) -> NoReturn:
        """ Raise an exception resulting in the given HTTP status and
            message. The message will be formatted according to the
            output format chosen by the request.
        """
        if self.content_type == CONTENT_XML:
            msg = f"""<?xml version="1.0" encoding="UTF-8" ?>
                      <error>
                        <code>{status}</code>
                        <message>{msg}</message>
                      </error>
                   """
        elif self.content_type == CONTENT_JSON:
            msg = f"""{{"error":{{"code":{status},"message":"{msg}"}}}}"""
        elif self.content_type == CONTENT_HTML:
            loglib.log().section('Execution error')
            loglib.log().var_dump('Status', status)
            loglib.log().var_dump('Message', msg)
            msg = loglib.get_and_disable()

        raise self.error(msg, status)


    def get_int(self, name: str, default: Optional[int] = None) -> int:
        """ Return an input parameter as an int. Raises an exception if
            the parameter is given but not in an integer format.

            If 'default' is given, then it will be returned when the parameter
            is missing completely. When 'default' is None, an error will be
            raised on a missing parameter.
        """
        value = self.get(name)

        if value is None:
            if default is not None:
                return default

            self.raise_error(f"Parameter '{name}' missing.")

        try:
            intval = int(value)
        except ValueError:
            self.raise_error(f"Parameter '{name}' must be a number.")

        return intval


    def get_float(self, name: str, default: Optional[float] = None) -> float:
        """ Return an input parameter as a floating-point number. Raises an
            exception if the parameter is given but not in a float format.

            If 'default' is given, then it will be returned when the parameter
            is missing completely. When 'default' is None, an error will be
            raised on a missing parameter.
        """
        value = self.get(name)

        if value is None:
            if default is not None:
                return default

            self.raise_error(f"Parameter '{name}' missing.")

        try:
            fval = float(value)
        except ValueError:
            self.raise_error(f"Parameter '{name}' must be a number.")

        if math.isnan(fval) or math.isinf(fval):
            self.raise_error(f"Parameter '{name}' must be a number.")

        return fval


    def get_bool(self, name: str, default: Optional[bool] = None) -> bool:
        """ Return an input parameter as bool. Only '0' is accepted as
            an input for 'false', all other inputs will be interpreted as 'true'.

            If 'default' is given, then it will be returned when the parameter
            is missing completely. When 'default' is None, an error will be
            raised on a missing parameter.
        """
        value = self.get(name)

        if value is None:
            if default is not None:
                return default

            self.raise_error(f"Parameter '{name}' missing.")

        return value != '0'


    def get_accepted_languages(self) -> str:
        """ Return the accepted languages.
        """
        return self.get('accept-language')\
               or self.get_header('accept-language')\
               or self.config().DEFAULT_LANGUAGE


    def setup_debugging(self) -> bool:
        """ Set up collection of debug information if requested.

            Return True when debugging was requested.
        """
        if self.get_bool('debug', False):
            loglib.set_log_output('html')
            self.content_type = CONTENT_HTML
            return True

        return False


    def get_layers(self) -> Optional[DataLayer]:
        """ Return a parsed version of the layer parameter.
        """
        param = self.get('layer', None)
        if param is None:
            return None

        return cast(DataLayer,
                    reduce(DataLayer.__or__,
                           (getattr(DataLayer, s.upper()) for s in param.split(','))))
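
    # Illustration (editor's note, not part of the original module): the
    # layer parameter is a comma-separated list that is folded into a single
    # flag set, e.g. layer=address,poi becomes DataLayer.ADDRESS | DataLayer.POI.
    # Unknown layer names raise an AttributeError from getattr().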

    def parse_format(self, result_type: Type[Any], default: str) -> str:
        """ Get and check the 'format' parameter and prepare the formatter.
            `result_type` is the type of result to be returned by the function
            and `default` the format value to assume when no parameter is present.
        """
        fmt = self.get('format', default=default)
        assert fmt is not None

        if not formatting.supports_format(result_type, fmt):
            self.raise_error("Parameter 'format' must be one of: " +
                             ', '.join(formatting.list_formats(result_type)))

        self.content_type = CONTENT_TYPE.get(fmt, CONTENT_JSON)
        return fmt


    def parse_geometry_details(self, fmt: str) -> Dict[str, Any]:
        """ Create details structure from the supplied geometry parameters.
        """
        numgeoms = 0
        output = GeometryFormat.NONE
        if self.get_bool('polygon_geojson', False):
            output |= GeometryFormat.GEOJSON
            numgeoms += 1
        if fmt not in ('geojson', 'geocodejson'):
            if self.get_bool('polygon_text', False):
                output |= GeometryFormat.TEXT
                numgeoms += 1
            if self.get_bool('polygon_kml', False):
                output |= GeometryFormat.KML
                numgeoms += 1
            if self.get_bool('polygon_svg', False):
                output |= GeometryFormat.SVG
                numgeoms += 1

        if numgeoms > self.config().get_int('POLYGON_OUTPUT_MAX_TYPES'):
            self.raise_error('Too many polygon output options selected.')

        return {'address_details': True,
                'geometry_simplification': self.get_float('polygon_threshold', 0.0),
                'geometry_output': output
               }


async def status_endpoint(api: NominatimAPIAsync, params: ASGIAdaptor) -> Any:
    """ Server glue for /status endpoint. See API docs for details.
    """
    result = await api.status()

    fmt = params.parse_format(StatusResult, 'text')

    if fmt == 'text' and result.status:
        status_code = 500
    else:
        status_code = 200

    return params.build_response(formatting.format_result(result, fmt, {}),
                                 status=status_code)


async def details_endpoint(api: NominatimAPIAsync, params: ASGIAdaptor) -> Any:
    """ Server glue for /details endpoint. See API docs for details.
    """
    fmt = params.parse_format(DetailedResult, 'json')
    place_id = params.get_int('place_id', 0)
    place: PlaceRef
    if place_id:
        place = PlaceID(place_id)
    else:
        osmtype = params.get('osmtype')
        if osmtype is None:
            params.raise_error("Missing ID parameter 'place_id' or 'osmtype'.")
        place = OsmID(osmtype, params.get_int('osmid'), params.get('class'))

    debug = params.setup_debugging()

    locales = Locales.from_accept_languages(params.get_accepted_languages())

    result = await api.details(place,
                               address_details=params.get_bool('addressdetails', False),
                               linked_places=params.get_bool('linkedplaces', True),
                               parented_places=params.get_bool('hierarchy', False),
                               keywords=params.get_bool('keywords', False),
                               geometry_output=GeometryFormat.GEOJSON
                                               if params.get_bool('polygon_geojson', False)
                                               else GeometryFormat.NONE,
                               locales=locales
                              )

    if debug:
        return params.build_response(loglib.get_and_disable())

    if result is None:
        params.raise_error('No place with that OSM ID found.', status=404)

    output = formatting.format_result(result, fmt,
                                      {'locales': locales,
                                       'group_hierarchy': params.get_bool('group_hierarchy', False),
                                       'icon_base_url': params.config().MAPICON_URL})

    return params.build_response(output, num_results=1)


async def reverse_endpoint(api: NominatimAPIAsync, params: ASGIAdaptor) -> Any:
    """ Server glue for /reverse endpoint. See API docs for details.
    """
    fmt = params.parse_format(ReverseResults, 'xml')
    debug = params.setup_debugging()
    coord = Point(params.get_float('lon'), params.get_float('lat'))

    details = params.parse_geometry_details(fmt)
    details['max_rank'] = helpers.zoom_to_rank(params.get_int('zoom', 18))
    details['layers'] = params.get_layers()
    details['locales'] = Locales.from_accept_languages(params.get_accepted_languages())

    result = await api.reverse(coord, **details)

    if debug:
        return params.build_response(loglib.get_and_disable(), num_results=1 if result else 0)

    if fmt == 'xml':
        queryparts = {'lat': str(coord.lat), 'lon': str(coord.lon), 'format': 'xml'}
        zoom = params.get('zoom', None)
        if zoom:
            queryparts['zoom'] = zoom
        query = urlencode(queryparts)
    else:
        query = ''

    fmt_options = {'query': query,
                   'extratags': params.get_bool('extratags', False),
                   'namedetails': params.get_bool('namedetails', False),
                   'addressdetails': params.get_bool('addressdetails', True)}

    output = formatting.format_result(ReverseResults([result] if result else []),
                                      fmt, fmt_options)

    return params.build_response(output, num_results=1 if result else 0)


async def lookup_endpoint(api: NominatimAPIAsync, params: ASGIAdaptor) -> Any:
    """ Server glue for /lookup endpoint. See API docs for details.
    """
    fmt = params.parse_format(SearchResults, 'xml')
    debug = params.setup_debugging()
    details = params.parse_geometry_details(fmt)
    details['locales'] = Locales.from_accept_languages(params.get_accepted_languages())

    places = []
    for oid in (params.get('osm_ids') or '').split(','):
        oid = oid.strip()
        if len(oid) > 1 and oid[0] in 'RNWrnw' and oid[1:].isdigit():
            places.append(OsmID(oid[0].upper(), int(oid[1:])))

    if len(places) > params.config().get_int('LOOKUP_MAX_COUNT'):
        params.raise_error('Too many object IDs.')

    if places:
        results = await api.lookup(places, **details)
    else:
        results = SearchResults()

    if debug:
        return params.build_response(loglib.get_and_disable(), num_results=len(results))

    fmt_options = {'extratags': params.get_bool('extratags', False),
                   'namedetails': params.get_bool('namedetails', False),
                   'addressdetails': params.get_bool('addressdetails', True)}

    output = formatting.format_result(results, fmt, fmt_options)

    return params.build_response(output, num_results=len(results))


async def _unstructured_search(query: str, api: NominatimAPIAsync,
                               details: Dict[str, Any]) -> SearchResults:
    if not query:
        return SearchResults()

    # Extract special format for coordinates from query.
    query, x, y = helpers.extract_coords_from_query(query)
    if x is not None:
        assert y is not None
        details['near'] = Point(x, y)
        details['near_radius'] = 0.1

    # If no query is left, revert to reverse search.
    if x is not None and not query:
        result = await api.reverse(details['near'], **details)
        if not result:
            return SearchResults()

        return SearchResults(
                  [SearchResult(**{f.name: getattr(result, f.name)
                                   for f in dataclasses.fields(SearchResult)
                                   if hasattr(result, f.name)})])

    query, cls, typ = helpers.extract_category_from_query(query)
    if cls is not None:
        assert typ is not None
        return await api.search_category([(cls, typ)], near_query=query, **details)

    return await api.search(query, **details)


async def search_endpoint(api: NominatimAPIAsync, params: ASGIAdaptor) -> Any:
    """ Server glue for /search endpoint. See API docs for details.
    """
    fmt = params.parse_format(SearchResults, 'jsonv2')
    debug = params.setup_debugging()
    details = params.parse_geometry_details(fmt)

    details['countries'] = params.get('countrycodes', None)
    details['excluded'] = params.get('exclude_place_ids', None)
    details['viewbox'] = params.get('viewbox', None) or params.get('viewboxlbrt', None)
    details['bounded_viewbox'] = params.get_bool('bounded', False)
    details['dedupe'] = params.get_bool('dedupe', True)

    max_results = max(1, min(50, params.get_int('limit', 10)))
    # Fetch a few extra results when deduplication is enabled, so that the
    # requested limit can still be filled after duplicates are dropped.
    details['max_results'] = max_results + min(10, max_results) \
                             if details['dedupe'] else max_results

    details['min_rank'], details['max_rank'] = \
        helpers.feature_type_to_rank(params.get('featureType', ''))
    if params.get('featureType', None) is not None:
        details['layers'] = DataLayer.ADDRESS
    else:
        details['layers'] = params.get_layers()

    details['locales'] = Locales.from_accept_languages(params.get_accepted_languages())

    # unstructured query parameters
    query = params.get('q', None)
    # structured query parameters
    queryparts = {}
    for key in ('amenity', 'street', 'city', 'county', 'state', 'postalcode', 'country'):
        details[key] = params.get(key, None)
        if details[key]:
            queryparts[key] = details[key]

    try:
        if query is not None:
            if queryparts:
                params.raise_error("Structured query parameters"
                                   " (amenity, street, city, county, state, postalcode, country)"
                                   " cannot be used together with 'q' parameter.")
            queryparts['q'] = query
            results = await _unstructured_search(query, api, details)
        else:
            query = ', '.join(queryparts.values())

            results = await api.search_address(**details)
    except UsageError as err:
        params.raise_error(str(err))

    if details['dedupe'] and len(results) > 1:
        results = helpers.deduplicate_results(results, max_results)

    if debug:
        return params.build_response(loglib.get_and_disable(), num_results=len(results))

    if fmt == 'xml':
        helpers.extend_query_parts(queryparts, details,
                                   params.get('featureType', ''),
                                   params.get_bool('namedetails', False),
                                   params.get_bool('extratags', False),
                                   (str(r.place_id) for r in results if r.place_id))
        queryparts['format'] = fmt

        moreurl = params.base_uri() + '/search?' + urlencode(queryparts)
    else:
        moreurl = ''

    fmt_options = {'query': query, 'more_url': moreurl,
                   'exclude_place_ids': queryparts.get('exclude_place_ids'),
                   'viewbox': queryparts.get('viewbox'),
                   'extratags': params.get_bool('extratags', False),
                   'namedetails': params.get_bool('namedetails', False),
                   'addressdetails': params.get_bool('addressdetails', False)}

    output = formatting.format_result(results, fmt, fmt_options)

    return params.build_response(output, num_results=len(results))


async def deletable_endpoint(api: NominatimAPIAsync, params: ASGIAdaptor) -> Any:
    """ Server glue for /deletable endpoint.
        This is a special endpoint that shows polygons that have been
        deleted or are broken in the OSM data but are kept in the
        Nominatim database to minimize disruption.
    """
    fmt = params.parse_format(RawDataList, 'json')

    async with api.begin() as conn:
        sql = sa.text(""" SELECT p.place_id, country_code,
                                 name->'name' as name, i.*
                          FROM placex p, import_polygon_delete i
                          WHERE p.osm_id = i.osm_id AND p.osm_type = i.osm_type
                                AND p.class = i.class AND p.type = i.type
                      """)
        results = RawDataList(r._asdict() for r in await conn.execute(sql))

    return params.build_response(formatting.format_result(results, fmt, {}))


async def polygons_endpoint(api: NominatimAPIAsync, params: ASGIAdaptor) -> Any:
    """ Server glue for /polygons endpoint.
        This is a special endpoint that shows polygons that have changed
        their size but are kept in the Nominatim database with their
        old area to minimize disruption.
    """
    fmt = params.parse_format(RawDataList, 'json')
    sql_params: Dict[str, Any] = {
        'days': params.get_int('days', -1),
        'cls': params.get('class')
    }
    reduced = params.get_bool('reduced', False)

    async with api.begin() as conn:
        sql = sa.select(sa.text("""osm_type, osm_id, class, type,
                                   name->'name' as name,
                                   country_code, errormessage, updated"""))\
                .select_from(sa.text('import_polygon_error'))
        if sql_params['days'] > 0:
            sql = sql.where(sa.text("updated > 'now'::timestamp - make_interval(days => :days)"))
        if reduced:
            sql = sql.where(sa.text("errormessage like 'Area reduced%'"))
        if sql_params['cls'] is not None:
            sql = sql.where(sa.text("class = :cls"))

        sql = sql.order_by(sa.literal_column('updated').desc()).limit(1000)

        results = RawDataList(r._asdict() for r in await conn.execute(sql, sql_params))

    return params.build_response(formatting.format_result(results, fmt, {}))


EndpointFunc = Callable[[NominatimAPIAsync, ASGIAdaptor], Any]

ROUTES = [
    ('status', status_endpoint),
    ('details', details_endpoint),
    ('reverse', reverse_endpoint),
    ('lookup', lookup_endpoint),
    ('search', search_endpoint),
    ('deletable', deletable_endpoint),
    ('polygons', polygons_endpoint),
]
11
src/nominatim_api/version.py
Normal file
@@ -0,0 +1,11 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Version information for the Nominatim API.
"""

NOMINATIM_API_VERSION = '4.4.99'
0
src/nominatim_core/__init__.py
Normal file
374
src/nominatim_core/config.py
Normal file
@@ -0,0 +1,374 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2022 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Nominatim configuration accessor.
"""
from typing import Dict, Any, List, Mapping, Optional
import importlib.util
import logging
import os
import sys
from pathlib import Path
import json
import yaml

from dotenv import dotenv_values
from psycopg2.extensions import parse_dsn

from .typing import StrPath
from .errors import UsageError
from . import paths

LOG = logging.getLogger()
CONFIG_CACHE: Dict[str, Any] = {}

def flatten_config_list(content: Any, section: str = '') -> List[Any]:
    """ Flatten YAML configuration lists that contain include sections
        which are lists themselves.
    """
    if not content:
        return []

    if not isinstance(content, list):
        raise UsageError(f"List expected in section '{section}'.")

    output = []
    for ele in content:
        if isinstance(ele, list):
            output.extend(flatten_config_list(ele, section))
        else:
            output.append(ele)

    return output
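
# Illustration (editor's note, not part of the original module): nested
# include lists are flattened recursively into a single level:
#
#   flatten_config_list([1, [2, [3, 4]], 5])   # -> [1, 2, 3, 4, 5]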

class Configuration:
    """ This class wraps access to the configuration settings
        for the Nominatim instance in use.

        All Nominatim configuration options are prefixed with 'NOMINATIM_' to
        avoid conflicts with other environment variables. All settings can
        be accessed as properties of the class under the same name as the
        setting but with the `NOMINATIM_` prefix removed. In addition, there
        are accessor functions that convert the setting values to types
        other than string.
    """

    def __init__(self, project_dir: Optional[Path],
                 environ: Optional[Mapping[str, str]] = None) -> None:
        self.environ = environ or os.environ
        self.project_dir = project_dir
        self.config_dir = paths.CONFIG_DIR
        self._config = dotenv_values(str(self.config_dir / 'env.defaults'))
        if self.project_dir is not None and (self.project_dir / '.env').is_file():
            self.project_dir = self.project_dir.resolve()
            self._config.update(dotenv_values(str(self.project_dir / '.env')))

        class _LibDirs:
            module: Path
            osm2pgsql: Path
            php = paths.PHPLIB_DIR
            sql = paths.SQLLIB_DIR
            data = paths.DATA_DIR

        self.lib_dir = _LibDirs()
        self._private_plugins: Dict[str, object] = {}


    def set_libdirs(self, **kwargs: StrPath) -> None:
        """ Set paths to library functions and data.
        """
        for key, value in kwargs.items():
            setattr(self.lib_dir, key, None if value is None else Path(value))


    def __getattr__(self, name: str) -> str:
        name = 'NOMINATIM_' + name

        if name in self.environ:
            return self.environ[name]

        return self._config[name] or ''
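
    # Illustration (editor's note, not part of the original module):
    # attribute access resolves in order process environment, project .env
    # file, then the shipped env.defaults. With NOMINATIM_DATABASE_DSN
    # exported in the environment, a hypothetical lookup prefers it:
    #
    #   config = Configuration(Path('/srv/nominatim-project'))
    #   config.DATABASE_DSN   # value of NOMINATIM_DATABASE_DSN, if set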

    def get_bool(self, name: str) -> bool:
        """ Return the given configuration parameter as a boolean.

            Parameters:
              name: Name of the configuration parameter with the NOMINATIM_
                prefix removed.

            Returns:
              `True` for values of '1', 'yes' and 'true', `False` otherwise.
        """
        return getattr(self, name).lower() in ('1', 'yes', 'true')


    def get_int(self, name: str) -> int:
        """ Return the given configuration parameter as an int.

            Parameters:
              name: Name of the configuration parameter with the NOMINATIM_
                prefix removed.

            Returns:
              The configuration value converted to int.

            Raises:
              ValueError: when the value is not a number.
        """
        try:
            return int(getattr(self, name))
        except ValueError as exp:
            LOG.fatal("Invalid setting NOMINATIM_%s. Needs to be a number.", name)
            raise UsageError("Configuration error.") from exp


    def get_str_list(self, name: str) -> Optional[List[str]]:
        """ Return the given configuration parameter as a list of strings.
            The values are assumed to be given as a comma-separated list and
            will be stripped before returning them.

            Parameters:
              name: Name of the configuration parameter with the NOMINATIM_
                prefix removed.

            Returns:
              (List[str]): The comma-split parameter as a list. The
                elements are stripped of leading and final spaces before
                being returned.
              (None): The configuration parameter was unset or empty.
        """
        raw = getattr(self, name)

        return [v.strip() for v in raw.split(',')] if raw else None


    def get_path(self, name: str) -> Optional[Path]:
        """ Return the given configuration parameter as a Path.

            Parameters:
              name: Name of the configuration parameter with the NOMINATIM_
                prefix removed.

            Returns:
              (Path): A Path object of the parameter value.
                If a relative path is configured, then the function converts this
                into an absolute path with the project directory as root path.
              (None): The configuration parameter was unset or empty.
        """
        value = getattr(self, name)
        if not value:
            return None

        cfgpath = Path(value)

        if not cfgpath.is_absolute():
            assert self.project_dir is not None
            cfgpath = self.project_dir / cfgpath

        return cfgpath.resolve()


    def get_libpq_dsn(self) -> str:
        """ Get configured database DSN converted into the key/value format
            understood by libpq and psycopg.
        """
        dsn = self.DATABASE_DSN

        def quote_param(param: str) -> str:
            key, val = param.split('=')
            val = val.replace('\\', '\\\\').replace("'", "\\'")
            if ' ' in val:
                val = "'" + val + "'"
            return key + '=' + val

        if dsn.startswith('pgsql:'):
            # Old PHP DSN format. Convert before returning.
            return ' '.join([quote_param(p) for p in dsn[6:].split(';')])

        return dsn
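
    # Illustration (editor's note, not part of the original module): the
    # legacy PHP-style DSN is rewritten into libpq key/value form:
    #
    #   'pgsql:host=localhost;dbname=nominatim'
    #   # -> 'host=localhost dbname=nominatim'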

    def get_database_params(self) -> Mapping[str, str]:
        """ Get the configured parameters for the database connection
            as a mapping.
        """
        dsn = self.DATABASE_DSN

        if dsn.startswith('pgsql:'):
            return dict((p.split('=', 1) for p in dsn[6:].split(';')))

        return parse_dsn(dsn)


    def get_import_style_file(self) -> Path:
        """ Return the import style file as a path object. Translates the
            name of the standard styles automatically into a file in the
            config style.
        """
        style = getattr(self, 'IMPORT_STYLE')

        if style in ('admin', 'street', 'address', 'full', 'extratags'):
            return self.config_dir / f'import-{style}.lua'

        return self.find_config_file('', 'IMPORT_STYLE')


    def get_os_env(self) -> Dict[str, str]:
        """ Return a copy of the OS environment with the Nominatim configuration
            merged in.
        """
        env = {k: v for k, v in self._config.items() if v is not None}
        env.update(self.environ)

        return env


    def load_sub_configuration(self, filename: StrPath,
                               config: Optional[str] = None) -> Any:
        """ Load additional configuration from a file. `filename` is the name
            of the configuration file. The file is first searched in the
            project directory and then in the global settings directory.

            If `config` is set, then the name of the configuration file can
            be additionally given through a .env configuration option. When
            the option is set, then the file will be exclusively loaded as set:
            if the name is an absolute path, the file name is taken as is,
            if the name is relative, it is taken to be relative to the
            project directory.

            The format of the file is determined from the filename suffix.
            Currently only files with extension '.yaml' are supported.

            YAML files support a special '!include' construct. When the
            directive is given, the value is taken to be a filename, the file
            is loaded using this function and added at the position in the
            configuration tree.
        """
        configfile = self.find_config_file(filename, config)

        if str(configfile) in CONFIG_CACHE:
            return CONFIG_CACHE[str(configfile)]

        if configfile.suffix in ('.yaml', '.yml'):
            result = self._load_from_yaml(configfile)
        elif configfile.suffix == '.json':
            with configfile.open('r', encoding='utf-8') as cfg:
                result = json.load(cfg)
        else:
            raise UsageError(f"Config file '{configfile}' has unknown format.")

        CONFIG_CACHE[str(configfile)] = result
        return result


    def load_plugin_module(self, module_name: str, internal_path: str) -> Any:
        """ Load a Python module as a plugin.

            The module_name may have three variants:

            * A name without any '.' is assumed to be an internal module
              and will be searched relative to `internal_path`.
            * If the name ends in `.py`, module_name is assumed to be a
              file name relative to the project directory.
            * Any other name is assumed to be an absolute module name.

            In either of the variants the module name must start with a letter.
        """
        if not module_name or not module_name[0].isidentifier():
            raise UsageError(f'Invalid module name {module_name}')

        if '.' not in module_name:
            module_name = module_name.replace('-', '_')
            full_module = f'{internal_path}.{module_name}'
            return sys.modules.get(full_module) or importlib.import_module(full_module)

        if module_name.endswith('.py'):
            if self.project_dir is None or not (self.project_dir / module_name).exists():
                raise UsageError(f"Cannot find module '{module_name}' in project directory.")

            if module_name in self._private_plugins:
                return self._private_plugins[module_name]

            file_path = str(self.project_dir / module_name)
            spec = importlib.util.spec_from_file_location(module_name, file_path)
            if spec:
                module = importlib.util.module_from_spec(spec)
                # Do not add to global modules because there is no standard
                # module name that Python can resolve.
                self._private_plugins[module_name] = module
                assert spec.loader is not None
                spec.loader.exec_module(module)

                return module

        return sys.modules.get(module_name) or importlib.import_module(module_name)


    def find_config_file(self, filename: StrPath,
                         config: Optional[str] = None) -> Path:
        """ Resolve the location of a configuration file given a filename and
            an optional configuration option with the file name.
            Raises a UsageError when the file cannot be found or is not
            a regular file.
        """
        if config is not None:
            cfg_value = getattr(self, config)
            if cfg_value:
                cfg_filename = Path(cfg_value)

                if cfg_filename.is_absolute():
                    cfg_filename = cfg_filename.resolve()

                    if not cfg_filename.is_file():
                        LOG.fatal("Cannot find config file '%s'.", cfg_filename)
                        raise UsageError("Config file not found.")

                    return cfg_filename

                filename = cfg_filename


        search_paths = [self.project_dir, self.config_dir]
        for path in search_paths:
            if path is not None and (path / filename).is_file():
                return path / filename

        LOG.fatal("Configuration file '%s' not found.\nDirectories searched: %s",
                  filename, search_paths)
        raise UsageError("Config file not found.")


    def _load_from_yaml(self, cfgfile: Path) -> Any:
        """ Load a YAML configuration file. This installs a special handler that
            allows including other YAML files using the '!include' operator.
        """
        yaml.add_constructor('!include', self._yaml_include_representer,
                             Loader=yaml.SafeLoader)
        return yaml.safe_load(cfgfile.read_text(encoding='utf-8'))


    def _yaml_include_representer(self, loader: Any, node: yaml.Node) -> Any:
        """ Handler for the '!include' operator in YAML files.

            When the filename is relative, then the file is first searched in the
            project directory and then in the global settings directory.
        """
        fname = loader.construct_scalar(node)

        if Path(fname).is_absolute():
            configfile = Path(fname)
        else:
            configfile = self.find_config_file(loader.construct_scalar(node))

        if configfile.suffix != '.yaml':
            LOG.fatal("Format error while reading '%s': only YAML format supported.",
                      configfile)
            raise UsageError("Cannot handle config file format.")

        return yaml.safe_load(configfile.read_text(encoding='utf-8'))
0
src/nominatim_core/db/__init__.py
Normal file
236
src/nominatim_core/db/async_connection.py
Normal file
@@ -0,0 +1,236 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
""" Non-blocking database connections.
"""
from typing import Callable, Any, Optional, Iterator, Sequence
import logging
import select
import time

import psycopg2
from psycopg2.extras import wait_select

# psycopg2 emits different exceptions pre and post 2.8. Detect if the new error
# module is available and adapt the error handling accordingly.
try:
    import psycopg2.errors  # pylint: disable=no-name-in-module,import-error
    __has_psycopg2_errors__ = True
except ImportError:
    __has_psycopg2_errors__ = False

from ..typing import T_cursor, Query

LOG = logging.getLogger()

class DeadlockHandler:
    """ Context manager that catches deadlock exceptions and calls
        the given handler function. All other exceptions are passed on
        normally.
    """

    def __init__(self, handler: Callable[[], None], ignore_sql_errors: bool = False) -> None:
        self.handler = handler
        self.ignore_sql_errors = ignore_sql_errors

    def __enter__(self) -> 'DeadlockHandler':
        return self

    def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> bool:
        if __has_psycopg2_errors__:
            if exc_type == psycopg2.errors.DeadlockDetected:  # pylint: disable=E1101
                self.handler()
                return True
        elif exc_type == psycopg2.extensions.TransactionRollbackError \
             and exc_value.pgcode == '40P01':
            self.handler()
            return True

        if self.ignore_sql_errors and isinstance(exc_value, psycopg2.Error):
            LOG.info("SQL error ignored: %s", exc_value)
            return True

        return False
|
||||
|
||||
class DBConnection:
|
||||
""" A single non-blocking database connection.
|
||||
"""
|
||||
|
||||
def __init__(self, dsn: str,
|
||||
cursor_factory: Optional[Callable[..., T_cursor]] = None,
|
||||
ignore_sql_errors: bool = False) -> None:
|
||||
self.dsn = dsn
|
||||
|
||||
self.current_query: Optional[Query] = None
|
||||
self.current_params: Optional[Sequence[Any]] = None
|
||||
self.ignore_sql_errors = ignore_sql_errors
|
||||
|
||||
self.conn: Optional['psycopg2._psycopg.connection'] = None
|
||||
self.cursor: Optional['psycopg2._psycopg.cursor'] = None
|
||||
self.connect(cursor_factory=cursor_factory)
|
||||
|
||||
def close(self) -> None:
|
||||
""" Close all open connections. Does not wait for pending requests.
|
||||
"""
|
||||
if self.conn is not None:
|
||||
if self.cursor is not None:
|
||||
self.cursor.close()
|
||||
self.cursor = None
|
||||
self.conn.close()
|
||||
|
||||
self.conn = None
|
||||
|
||||
def connect(self, cursor_factory: Optional[Callable[..., T_cursor]] = None) -> None:
|
||||
""" (Re)connect to the database. Creates an asynchronous connection
|
||||
with JIT and parallel processing disabled. If a connection was
|
||||
already open, it is closed and a new connection established.
|
||||
The caller must ensure that no query is pending before reconnecting.
|
||||
"""
|
||||
self.close()
|
||||
|
||||
# Use a dict to hand in the parameters because async is a reserved
|
||||
# word in Python3.
|
||||
self.conn = psycopg2.connect(**{'dsn': self.dsn, 'async': True}) # type: ignore
|
||||
assert self.conn
|
||||
self.wait()
|
||||
|
||||
if cursor_factory is not None:
|
||||
self.cursor = self.conn.cursor(cursor_factory=cursor_factory)
|
||||
else:
|
||||
self.cursor = self.conn.cursor()
|
||||
# Disable JIT and parallel workers as they are known to cause problems.
|
||||
# Update pg_settings instead of using SET because it does not yield
|
||||
# errors on older versions of Postgres where the settings are not
|
||||
# implemented.
|
||||
self.perform(
|
||||
""" UPDATE pg_settings SET setting = -1 WHERE name = 'jit_above_cost';
|
||||
UPDATE pg_settings SET setting = 0
|
||||
WHERE name = 'max_parallel_workers_per_gather';""")
|
||||
self.wait()
|
||||
|
||||
def _deadlock_handler(self) -> None:
|
||||
LOG.info("Deadlock detected (params = %s), retry.", str(self.current_params))
|
||||
assert self.cursor is not None
|
||||
assert self.current_query is not None
|
||||
assert self.current_params is not None
|
||||
|
||||
self.cursor.execute(self.current_query, self.current_params)
|
||||
|
||||
def wait(self) -> None:
|
||||
""" Block until any pending operation is done.
|
||||
"""
|
||||
while True:
|
||||
with DeadlockHandler(self._deadlock_handler, self.ignore_sql_errors):
|
||||
wait_select(self.conn)
|
||||
self.current_query = None
|
||||
return
|
||||
|
||||
def perform(self, sql: Query, args: Optional[Sequence[Any]] = None) -> None:
|
||||
""" Send SQL query to the server. Returns immediately without
|
||||
blocking.
|
||||
"""
|
||||
assert self.cursor is not None
|
||||
self.current_query = sql
|
||||
self.current_params = args
|
||||
self.cursor.execute(sql, args)
|
||||
|
||||
def fileno(self) -> int:
|
||||
""" File descriptor to wait for. (Makes this class select()able.)
|
||||
"""
|
||||
assert self.conn is not None
|
||||
return self.conn.fileno()
|
||||
|
||||
def is_done(self) -> bool:
|
||||
""" Check if the connection is available for a new query.
|
||||
|
||||
Also checks if the previous query has run into a deadlock.
|
||||
If so, then the previous query is repeated.
|
||||
"""
|
||||
assert self.conn is not None
|
||||
|
||||
if self.current_query is None:
|
||||
return True
|
||||
|
||||
with DeadlockHandler(self._deadlock_handler, self.ignore_sql_errors):
|
||||
if self.conn.poll() == psycopg2.extensions.POLL_OK:
|
||||
self.current_query = None
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
|
||||
class WorkerPool:
|
||||
""" A pool of asynchronous database connections.
|
||||
|
||||
The pool may be used as a context manager.
|
||||
"""
|
||||
REOPEN_CONNECTIONS_AFTER = 100000
|
||||
|
||||
def __init__(self, dsn: str, pool_size: int, ignore_sql_errors: bool = False) -> None:
|
||||
self.threads = [DBConnection(dsn, ignore_sql_errors=ignore_sql_errors)
|
||||
for _ in range(pool_size)]
|
||||
self.free_workers = self._yield_free_worker()
|
||||
self.wait_time = 0.0
|
||||
|
||||
|
||||
def finish_all(self) -> None:
|
||||
""" Wait for all connection to finish.
|
||||
"""
|
||||
for thread in self.threads:
|
||||
while not thread.is_done():
|
||||
thread.wait()
|
||||
|
||||
self.free_workers = self._yield_free_worker()
|
||||
|
||||
def close(self) -> None:
|
||||
""" Close all connections and clear the pool.
|
||||
"""
|
||||
for thread in self.threads:
|
||||
thread.close()
|
||||
self.threads = []
|
||||
self.free_workers = iter([])
|
||||
|
||||
|
||||
def next_free_worker(self) -> DBConnection:
|
||||
""" Get the next free connection.
|
||||
"""
|
||||
return next(self.free_workers)
|
||||
|
||||
|
||||
def _yield_free_worker(self) -> Iterator[DBConnection]:
|
||||
ready = self.threads
|
||||
command_stat = 0
|
||||
while True:
|
||||
for thread in ready:
|
||||
if thread.is_done():
|
||||
command_stat += 1
|
||||
yield thread
|
||||
|
||||
if command_stat > self.REOPEN_CONNECTIONS_AFTER:
|
||||
self._reconnect_threads()
|
||||
ready = self.threads
|
||||
command_stat = 0
|
||||
else:
|
||||
tstart = time.time()
|
||||
_, ready, _ = select.select([], self.threads, [])
|
||||
self.wait_time += time.time() - tstart
|
||||
|
||||
|
||||
def _reconnect_threads(self) -> None:
|
||||
for thread in self.threads:
|
||||
while not thread.is_done():
|
||||
thread.wait()
|
||||
thread.connect()
|
||||
|
||||
|
||||
def __enter__(self) -> 'WorkerPool':
|
||||
return self
|
||||
|
||||
|
||||
def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None:
|
||||
self.finish_all()
|
||||
self.close()
|
||||
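A minimal usage sketch for the pool (the DSN and statements are illustrative, not part of this commit):

from nominatim_core.db.async_connection import WorkerPool

with WorkerPool('dbname=nominatim', pool_size=4) as pool:
    for place_id in range(100):
        # Each statement goes to the next idle connection;
        # next_free_worker() blocks in select() until one is ready.
        pool.next_free_worker().perform(
            'UPDATE placex SET indexed_status = 0 WHERE place_id = %s',
            (place_id, ))
# __exit__ waits for all pending statements, then closes the connections.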
21
src/nominatim_core/db/async_core_library.py
Normal file
@@ -0,0 +1,21 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Import the base library to use with asynchronous SQLAlchemy.
"""
# pylint: disable=invalid-name

from typing import Any

try:
    import psycopg
    PGCORE_LIB = 'psycopg'
    PGCORE_ERROR: Any = psycopg.Error
except ModuleNotFoundError:
    import asyncpg
    PGCORE_LIB = 'asyncpg'
    PGCORE_ERROR = asyncpg.PostgresError
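Downstream code can stay driver-agnostic by branching on the detected library. A sketch (the database name is assumed; the URL format is SQLAlchemy's standard dialect+driver notation):

from nominatim_core.db.async_core_library import PGCORE_LIB, PGCORE_ERROR

dburl = f'postgresql+{PGCORE_LIB}://@/nominatim'
try:
    ...  # run asynchronous queries here
except PGCORE_ERROR:
    ...  # one handler covers both psycopg and asyncpg errors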
254
src/nominatim_core/db/connection.py
Normal file
@@ -0,0 +1,254 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Specialised connection and cursor functions.
"""
from typing import Optional, Any, Callable, ContextManager, Dict, cast, overload, Tuple, Iterable
import contextlib
import logging
import os

import psycopg2
import psycopg2.extensions
import psycopg2.extras
from psycopg2 import sql as pysql

from ..typing import SysEnv, Query, T_cursor
from ..errors import UsageError

LOG = logging.getLogger()

class Cursor(psycopg2.extras.DictCursor):
    """ A cursor returning dict-like objects and providing specialised
        execution functions.
    """
    # pylint: disable=arguments-renamed,arguments-differ
    def execute(self, query: Query, args: Any = None) -> None:
        """ Query execution that logs the SQL query when debugging is enabled.
        """
        if LOG.isEnabledFor(logging.DEBUG):
            LOG.debug(self.mogrify(query, args).decode('utf-8'))

        super().execute(query, args)


    def execute_values(self, sql: Query, argslist: Iterable[Tuple[Any, ...]],
                       template: Optional[Query] = None) -> None:
        """ Wrapper for the psycopg2 convenience function to execute
            SQL for a list of values.
        """
        LOG.debug("SQL execute_values(%s, %s)", sql, argslist)

        psycopg2.extras.execute_values(self, sql, argslist, template=template)


    def scalar(self, sql: Query, args: Any = None) -> Any:
        """ Execute a query that returns a single value. The value is returned.
            If the query yields more than one row, a RuntimeError is raised.
        """
        self.execute(sql, args)

        if self.rowcount != 1:
            raise RuntimeError("Query did not return a single row.")

        result = self.fetchone()
        assert result is not None

        return result[0]


    def drop_table(self, name: str, if_exists: bool = True, cascade: bool = False) -> None:
        """ Drop the table with the given name.
            Set `if_exists` to False if a non-existent table should raise
            an exception instead of just being ignored. If 'cascade' is set
            to True then all dependent tables are deleted as well.
        """
        sql = 'DROP TABLE '
        if if_exists:
            sql += 'IF EXISTS '
        sql += '{}'
        if cascade:
            sql += ' CASCADE'

        self.execute(pysql.SQL(sql).format(pysql.Identifier(name)))


class Connection(psycopg2.extensions.connection):
    """ A connection that provides the specialised cursor by default and
        adds convenience functions for administrating the database.
    """
    @overload # type: ignore[override]
    def cursor(self) -> Cursor:
        ...

    @overload
    def cursor(self, name: str) -> Cursor:
        ...

    @overload
    def cursor(self, cursor_factory: Callable[..., T_cursor]) -> T_cursor:
        ...

    def cursor(self, cursor_factory = Cursor, **kwargs): # type: ignore
        """ Return a new cursor. By default the specialised cursor is returned.
        """
        return super().cursor(cursor_factory=cursor_factory, **kwargs)


    def table_exists(self, table: str) -> bool:
        """ Check that a table with the given name exists in the database.
        """
        with self.cursor() as cur:
            num = cur.scalar("""SELECT count(*) FROM pg_tables
                                WHERE tablename = %s and schemaname = 'public'""", (table, ))
            return num == 1 if isinstance(num, int) else False


    def table_has_column(self, table: str, column: str) -> bool:
        """ Check if the table 'table' exists and has a column with name 'column'.
        """
        with self.cursor() as cur:
            has_column = cur.scalar("""SELECT count(*) FROM information_schema.columns
                                       WHERE table_name = %s
                                             and column_name = %s""",
                                    (table, column))
            return has_column > 0 if isinstance(has_column, int) else False


    def index_exists(self, index: str, table: Optional[str] = None) -> bool:
        """ Check that an index with the given name exists in the database.
            If table is not None then the index must relate to the given
            table.
        """
        with self.cursor() as cur:
            cur.execute("""SELECT tablename FROM pg_indexes
                           WHERE indexname = %s and schemaname = 'public'""", (index, ))
            if cur.rowcount == 0:
                return False

            if table is not None:
                row = cur.fetchone()
                if row is None or not isinstance(row[0], str):
                    return False
                return row[0] == table

        return True


    def drop_table(self, name: str, if_exists: bool = True, cascade: bool = False) -> None:
        """ Drop the table with the given name.
            Set `if_exists` to False if a non-existent table should raise
            an exception instead of just being ignored.
        """
        with self.cursor() as cur:
            cur.drop_table(name, if_exists, cascade)
        self.commit()


    def server_version_tuple(self) -> Tuple[int, int]:
        """ Return the server version as a tuple of (major, minor).
            Converts correctly for pre-10 and post-10 PostgreSQL versions.
        """
        version = self.server_version
        if version < 100000:
            return (int(version / 10000), int((version % 10000) / 100))

        return (int(version / 10000), version % 10000)


    def postgis_version_tuple(self) -> Tuple[int, int]:
        """ Return the postgis version installed in the database as a
            tuple of (major, minor). Assumes that the PostGIS extension
            has been installed already.
        """
        with self.cursor() as cur:
            version = cur.scalar('SELECT postgis_lib_version()')

        version_parts = version.split('.')
        if len(version_parts) < 2:
            raise UsageError(f"Error fetching Postgis version. Bad format: {version}")

        return (int(version_parts[0]), int(version_parts[1]))


    def extension_loaded(self, extension_name: str) -> bool:
        """ Return True if the named extension is loaded in the database.
        """
        with self.cursor() as cur:
            cur.execute('SELECT extname FROM pg_extension WHERE extname = %s', (extension_name, ))
            return cur.rowcount > 0


class ConnectionContext(ContextManager[Connection]):
    """ Context manager of the connection that also provides direct access
        to the underlying connection.
    """
    connection: Connection

def connect(dsn: str) -> ConnectionContext:
    """ Open a connection to the database using the specialised connection
        factory. The returned object may be used in conjunction with 'with'.
        When used outside a context manager, use the `connection` attribute
        to get the connection.
    """
    try:
        conn = psycopg2.connect(dsn, connection_factory=Connection)
        ctxmgr = cast(ConnectionContext, contextlib.closing(conn))
        ctxmgr.connection = conn
        return ctxmgr
    except psycopg2.OperationalError as err:
        raise UsageError(f"Cannot connect to database: {err}") from err


# Translation from PG connection string parameters to PG environment variables.
# Derived from https://www.postgresql.org/docs/current/libpq-envars.html.
_PG_CONNECTION_STRINGS = {
    'host': 'PGHOST',
    'hostaddr': 'PGHOSTADDR',
    'port': 'PGPORT',
    'dbname': 'PGDATABASE',
    'user': 'PGUSER',
    'password': 'PGPASSWORD',
    'passfile': 'PGPASSFILE',
    'channel_binding': 'PGCHANNELBINDING',
    'service': 'PGSERVICE',
    'options': 'PGOPTIONS',
    'application_name': 'PGAPPNAME',
    'sslmode': 'PGSSLMODE',
    'requiressl': 'PGREQUIRESSL',
    'sslcompression': 'PGSSLCOMPRESSION',
    'sslcert': 'PGSSLCERT',
    'sslkey': 'PGSSLKEY',
    'sslrootcert': 'PGSSLROOTCERT',
    'sslcrl': 'PGSSLCRL',
    'requirepeer': 'PGREQUIREPEER',
    'ssl_min_protocol_version': 'PGSSLMINPROTOCOLVERSION',
    'ssl_max_protocol_version': 'PGSSLMAXPROTOCOLVERSION',
    'gssencmode': 'PGGSSENCMODE',
    'krbsrvname': 'PGKRBSRVNAME',
    'gsslib': 'PGGSSLIB',
    'connect_timeout': 'PGCONNECT_TIMEOUT',
    'target_session_attrs': 'PGTARGETSESSIONATTRS',
}


def get_pg_env(dsn: str,
               base_env: Optional[SysEnv] = None) -> Dict[str, str]:
    """ Return a copy of `base_env` with the environment variables for
        PostgreSQL set up from the given database connection string.
        If `base_env` is None, then the OS environment is used as a base
        environment.
    """
    env = dict(base_env if base_env is not None else os.environ)

    for param, value in psycopg2.extensions.parse_dsn(dsn).items():
        if param in _PG_CONNECTION_STRINGS:
            env[_PG_CONNECTION_STRINGS[param]] = value
        else:
            LOG.error("Unknown connection parameter '%s' ignored.", param)

    return env
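A sketch of both usage styles for connect() (the DSN is illustrative):

from nominatim_core.db.connection import connect

# As a context manager:
with connect('dbname=nominatim') as conn:
    if conn.table_exists('placex'):
        print(conn.server_version_tuple())

# Outside a with-block, via the connection attribute:
ctx = connect('dbname=nominatim')
conn = ctx.connection
conn.close()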
47
src/nominatim_core/db/properties.py
Normal file
@@ -0,0 +1,47 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Query and access functions for the in-database property table.
"""
from typing import Optional, cast

from .connection import Connection

def set_property(conn: Connection, name: str, value: str) -> None:
    """ Add or replace the property with the given name.
    """
    with conn.cursor() as cur:
        cur.execute('SELECT value FROM nominatim_properties WHERE property = %s',
                    (name, ))

        if cur.rowcount == 0:
            sql = 'INSERT INTO nominatim_properties (value, property) VALUES (%s, %s)'
        else:
            sql = 'UPDATE nominatim_properties SET value = %s WHERE property = %s'

        cur.execute(sql, (value, name))
    conn.commit()


def get_property(conn: Connection, name: str) -> Optional[str]:
    """ Return the current value of the given property or None if the property
        is not set.
    """
    if not conn.table_exists('nominatim_properties'):
        return None

    with conn.cursor() as cur:
        cur.execute('SELECT value FROM nominatim_properties WHERE property = %s',
                    (name, ))

        if cur.rowcount == 0:
            return None

        result = cur.fetchone()
        assert result is not None

        return cast(Optional[str], result[0])
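A round-trip sketch (the property name is invented for illustration; `connect()` is from the connection module above):

from nominatim_core.db.connection import connect
from nominatim_core.db.properties import set_property, get_property

with connect('dbname=nominatim') as conn:
    set_property(conn, 'demo_marker', '42')
    assert get_property(conn, 'demo_marker') == '42'
    # Unset properties (or a missing table) simply yield None:
    assert get_property(conn, 'no_such_property') is None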
143
src/nominatim_core/db/sql_preprocessor.py
Normal file
@@ -0,0 +1,143 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Preprocessing of SQL files.
"""
from typing import Set, Dict, Any, cast
import jinja2

from .connection import Connection
from .async_connection import WorkerPool
from ..config import Configuration

def _get_partitions(conn: Connection) -> Set[int]:
    """ Get the set of partitions currently in use.
    """
    with conn.cursor() as cur:
        cur.execute('SELECT DISTINCT partition FROM country_name')
        partitions = set([0])
        for row in cur:
            partitions.add(row[0])

    return partitions


def _get_tables(conn: Connection) -> Set[str]:
    """ Return the set of tables currently in use.
    """
    with conn.cursor() as cur:
        cur.execute("SELECT tablename FROM pg_tables WHERE schemaname = 'public'")

        return set((row[0] for row in list(cur)))


def _get_middle_db_format(conn: Connection, tables: Set[str]) -> str:
    """ Returns the version of the slim middle tables.
    """
    if 'osm2pgsql_properties' not in tables:
        return '1'

    with conn.cursor() as cur:
        cur.execute("SELECT value FROM osm2pgsql_properties WHERE property = 'db_format'")
        row = cur.fetchone()

        return cast(str, row[0]) if row is not None else '1'


def _setup_tablespace_sql(config: Configuration) -> Dict[str, str]:
    """ Returns a dict with tablespace expressions for the different tablespace
        kinds depending on whether a tablespace is configured or not.
    """
    out = {}
    for subset in ('ADDRESS', 'SEARCH', 'AUX'):
        for kind in ('DATA', 'INDEX'):
            tspace = getattr(config, f'TABLESPACE_{subset}_{kind}')
            if tspace:
                tspace = f'TABLESPACE "{tspace}"'
            out[f'{subset.lower()}_{kind.lower()}'] = tspace

    return out


def _setup_postgresql_features(conn: Connection) -> Dict[str, Any]:
    """ Set up a dictionary with various optional Postgresql/Postgis features that
        depend on the database version.
    """
    pg_version = conn.server_version_tuple()
    postgis_version = conn.postgis_version_tuple()
    # server_version_tuple() returns a 2-tuple, so compare against a 2-tuple
    # (a 3-tuple would misclassify version 11.0 exactly).
    pg11plus = pg_version >= (11, 0)
    ps3 = postgis_version >= (3, 0)
    return {
        'has_index_non_key_column': pg11plus,
        'spgist_geom' : 'SPGIST' if pg11plus and ps3 else 'GIST'
    }

class SQLPreprocessor:
    """ An environment for preprocessing SQL files from the
        lib-sql directory.

        The preprocessor provides a number of default filters and variables.
        The variables may be overwritten when rendering an SQL file.

        The preprocessing is currently based on the jinja2 templating library
        and follows its syntax.
    """

    def __init__(self, conn: Connection, config: Configuration) -> None:
        self.env = jinja2.Environment(autoescape=False,
                                      loader=jinja2.FileSystemLoader(str(config.lib_dir.sql)))

        db_info: Dict[str, Any] = {}
        db_info['partitions'] = _get_partitions(conn)
        db_info['tables'] = _get_tables(conn)
        db_info['reverse_only'] = 'search_name' not in db_info['tables']
        db_info['tablespace'] = _setup_tablespace_sql(config)
        db_info['middle_db_format'] = _get_middle_db_format(conn, db_info['tables'])

        self.env.globals['config'] = config
        self.env.globals['db'] = db_info
        self.env.globals['postgres'] = _setup_postgresql_features(conn)


    def run_string(self, conn: Connection, template: str, **kwargs: Any) -> None:
        """ Execute the given SQL template string on the connection.
            The keyword arguments may supply additional parameters
            for preprocessing.
        """
        sql = self.env.from_string(template).render(**kwargs)

        with conn.cursor() as cur:
            cur.execute(sql)
        conn.commit()


    def run_sql_file(self, conn: Connection, name: str, **kwargs: Any) -> None:
        """ Execute the given SQL file on the connection. The keyword arguments
            may supply additional parameters for preprocessing.
        """
        sql = self.env.get_template(name).render(**kwargs)

        with conn.cursor() as cur:
            cur.execute(sql)
        conn.commit()


    def run_parallel_sql_file(self, dsn: str, name: str, num_threads: int = 1,
                              **kwargs: Any) -> None:
        """ Execute the given SQL file using parallel asynchronous connections.
            The keyword arguments may supply additional parameters for
            preprocessing.

            After preprocessing the SQL code is cut at lines containing only
            '---'. Each chunk is sent to one of the `num_threads` workers.
        """
        sql = self.env.get_template(name).render(**kwargs)

        parts = sql.split('\n---\n')

        with WorkerPool(dsn, num_threads) as pool:
            for part in parts:
                pool.next_free_worker().perform(part)
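A sketch of how a template sees the collected globals (the template string and table name are invented; `conn` and `config` are as in the constructor above):

proc = SQLPreprocessor(conn, config)
# `table` is a caller-supplied variable; `db.tablespace.aux_data` comes from
# the globals set up in __init__ and expands to a TABLESPACE clause or ''.
proc.run_string(conn,
                'CREATE TABLE {{table}} (id BIGINT) {{db.tablespace.aux_data}}',
                table='test_table')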
119
src/nominatim_core/db/sqlalchemy_schema.py
Normal file
@@ -0,0 +1,119 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
SQLAlchemy definitions for all tables used by the frontend.
"""
import sqlalchemy as sa

from .sqlalchemy_types import Geometry, KeyValueStore, IntArray

#pylint: disable=too-many-instance-attributes
class SearchTables:
    """ Data class that holds the tables of the Nominatim database.

        This schema strictly reflects the read-access view of the database.
        Any data used for updates only will not be visible.
    """

    def __init__(self, meta: sa.MetaData) -> None:
        self.meta = meta

        self.import_status = sa.Table('import_status', meta,
            sa.Column('lastimportdate', sa.DateTime(True), nullable=False),
            sa.Column('sequence_id', sa.Integer),
            sa.Column('indexed', sa.Boolean))

        self.properties = sa.Table('nominatim_properties', meta,
            sa.Column('property', sa.Text, nullable=False),
            sa.Column('value', sa.Text))

        self.placex = sa.Table('placex', meta,
            sa.Column('place_id', sa.BigInteger, nullable=False),
            sa.Column('parent_place_id', sa.BigInteger),
            sa.Column('linked_place_id', sa.BigInteger),
            sa.Column('importance', sa.Float),
            sa.Column('indexed_date', sa.DateTime),
            sa.Column('rank_address', sa.SmallInteger),
            sa.Column('rank_search', sa.SmallInteger),
            sa.Column('indexed_status', sa.SmallInteger),
            sa.Column('osm_type', sa.String(1), nullable=False),
            sa.Column('osm_id', sa.BigInteger, nullable=False),
            sa.Column('class', sa.Text, nullable=False, key='class_'),
            sa.Column('type', sa.Text, nullable=False),
            sa.Column('admin_level', sa.SmallInteger),
            sa.Column('name', KeyValueStore),
            sa.Column('address', KeyValueStore),
            sa.Column('extratags', KeyValueStore),
            sa.Column('geometry', Geometry, nullable=False),
            sa.Column('wikipedia', sa.Text),
            sa.Column('country_code', sa.String(2)),
            sa.Column('housenumber', sa.Text),
            sa.Column('postcode', sa.Text),
            sa.Column('centroid', Geometry))

        self.addressline = sa.Table('place_addressline', meta,
            sa.Column('place_id', sa.BigInteger),
            sa.Column('address_place_id', sa.BigInteger),
            sa.Column('distance', sa.Float),
            sa.Column('fromarea', sa.Boolean),
            sa.Column('isaddress', sa.Boolean))

        self.postcode = sa.Table('location_postcode', meta,
            sa.Column('place_id', sa.BigInteger),
            sa.Column('parent_place_id', sa.BigInteger),
            sa.Column('rank_search', sa.SmallInteger),
            sa.Column('rank_address', sa.SmallInteger),
            sa.Column('indexed_status', sa.SmallInteger),
            sa.Column('indexed_date', sa.DateTime),
            sa.Column('country_code', sa.String(2)),
            sa.Column('postcode', sa.Text),
            sa.Column('geometry', Geometry))

        self.osmline = sa.Table('location_property_osmline', meta,
            sa.Column('place_id', sa.BigInteger, nullable=False),
            sa.Column('osm_id', sa.BigInteger),
            sa.Column('parent_place_id', sa.BigInteger),
            sa.Column('indexed_date', sa.DateTime),
            sa.Column('startnumber', sa.Integer),
            sa.Column('endnumber', sa.Integer),
            sa.Column('step', sa.SmallInteger),
            sa.Column('indexed_status', sa.SmallInteger),
            sa.Column('linegeo', Geometry),
            sa.Column('address', KeyValueStore),
            sa.Column('postcode', sa.Text),
            sa.Column('country_code', sa.String(2)))

        self.country_name = sa.Table('country_name', meta,
            sa.Column('country_code', sa.String(2)),
            sa.Column('name', KeyValueStore),
            sa.Column('derived_name', KeyValueStore),
            sa.Column('partition', sa.Integer))

        self.country_grid = sa.Table('country_osm_grid', meta,
            sa.Column('country_code', sa.String(2)),
            sa.Column('area', sa.Float),
            sa.Column('geometry', Geometry))

        # The following tables are not necessarily present.
        self.search_name = sa.Table('search_name', meta,
            sa.Column('place_id', sa.BigInteger),
            sa.Column('importance', sa.Float),
            sa.Column('search_rank', sa.SmallInteger),
            sa.Column('address_rank', sa.SmallInteger),
            sa.Column('name_vector', IntArray),
            sa.Column('nameaddress_vector', IntArray),
            sa.Column('country_code', sa.String(2)),
            sa.Column('centroid', Geometry))

        self.tiger = sa.Table('location_property_tiger', meta,
            sa.Column('place_id', sa.BigInteger),
            sa.Column('parent_place_id', sa.BigInteger),
            sa.Column('startnumber', sa.Integer),
            sa.Column('endnumber', sa.Integer),
            sa.Column('step', sa.SmallInteger),
            sa.Column('linegeo', Geometry),
            sa.Column('postcode', sa.Text))
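The table collection plugs straight into SQLAlchemy Core queries. A sketch (values are illustrative); note that the reserved column name `class` is reachable from Python under the key `class_`:

import sqlalchemy as sa

t = SearchTables(sa.MetaData())
stmt = sa.select(t.placex.c.place_id)\
         .where(t.placex.c.class_ == 'highway')\
         .where(t.placex.c.rank_address == 26)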
17
src/nominatim_core/db/sqlalchemy_types/__init__.py
Normal file
@@ -0,0 +1,17 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Module with custom types for SQLAlchemy.
"""

# See also https://github.com/PyCQA/pylint/issues/6006
# pylint: disable=useless-import-alias

from .geometry import (Geometry as Geometry)
from .int_array import (IntArray as IntArray)
from .key_value import (KeyValueStore as KeyValueStore)
from .json import (Json as Json)
308
src/nominatim_core/db/sqlalchemy_types/geometry.py
Normal file
@@ -0,0 +1,308 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Custom types for SQLAlchemy.
"""
from __future__ import annotations
from typing import Callable, Any, cast
import sys

import sqlalchemy as sa
from sqlalchemy.ext.compiler import compiles
from sqlalchemy import types

from ...typing import SaColumn, SaBind

#pylint: disable=all

class Geometry_DistanceSpheroid(sa.sql.expression.FunctionElement[float]):
    """ Function to compute the spherical distance in meters.
    """
    type = sa.Float()
    name = 'Geometry_DistanceSpheroid'
    inherit_cache = True


@compiles(Geometry_DistanceSpheroid) # type: ignore[no-untyped-call, misc]
def _default_distance_spheroid(element: Geometry_DistanceSpheroid,
                               compiler: 'sa.Compiled', **kw: Any) -> str:
    return "ST_DistanceSpheroid(%s,"\
           " 'SPHEROID[\"WGS 84\",6378137,298.257223563, AUTHORITY[\"EPSG\",\"7030\"]]')"\
               % compiler.process(element.clauses, **kw)


@compiles(Geometry_DistanceSpheroid, 'sqlite') # type: ignore[no-untyped-call, misc]
def _spatialite_distance_spheroid(element: Geometry_DistanceSpheroid,
                                  compiler: 'sa.Compiled', **kw: Any) -> str:
    return "COALESCE(Distance(%s, true), 0.0)" % compiler.process(element.clauses, **kw)


class Geometry_IsLineLike(sa.sql.expression.FunctionElement[Any]):
    """ Check if the geometry is a line or multiline.
    """
    name = 'Geometry_IsLineLike'
    inherit_cache = True


@compiles(Geometry_IsLineLike) # type: ignore[no-untyped-call, misc]
def _default_is_line_like(element: Geometry_IsLineLike,
                          compiler: 'sa.Compiled', **kw: Any) -> str:
    return "ST_GeometryType(%s) IN ('ST_LineString', 'ST_MultiLineString')" % \
               compiler.process(element.clauses, **kw)


@compiles(Geometry_IsLineLike, 'sqlite') # type: ignore[no-untyped-call, misc]
def _sqlite_is_line_like(element: Geometry_IsLineLike,
                         compiler: 'sa.Compiled', **kw: Any) -> str:
    return "ST_GeometryType(%s) IN ('LINESTRING', 'MULTILINESTRING')" % \
               compiler.process(element.clauses, **kw)


class Geometry_IsAreaLike(sa.sql.expression.FunctionElement[Any]):
    """ Check if the geometry is a polygon or multipolygon.
    """
    name = 'Geometry_IsAreaLike'
    inherit_cache = True


@compiles(Geometry_IsAreaLike) # type: ignore[no-untyped-call, misc]
def _default_is_area_like(element: Geometry_IsAreaLike,
                          compiler: 'sa.Compiled', **kw: Any) -> str:
    return "ST_GeometryType(%s) IN ('ST_Polygon', 'ST_MultiPolygon')" % \
               compiler.process(element.clauses, **kw)


@compiles(Geometry_IsAreaLike, 'sqlite') # type: ignore[no-untyped-call, misc]
def _sqlite_is_area_like(element: Geometry_IsAreaLike,
                         compiler: 'sa.Compiled', **kw: Any) -> str:
    return "ST_GeometryType(%s) IN ('POLYGON', 'MULTIPOLYGON')" % \
               compiler.process(element.clauses, **kw)


class Geometry_IntersectsBbox(sa.sql.expression.FunctionElement[Any]):
    """ Check if the bounding boxes of the given geometries intersect.
    """
    name = 'Geometry_IntersectsBbox'
    inherit_cache = True


@compiles(Geometry_IntersectsBbox) # type: ignore[no-untyped-call, misc]
def _default_intersects(element: Geometry_IntersectsBbox,
                        compiler: 'sa.Compiled', **kw: Any) -> str:
    arg1, arg2 = list(element.clauses)
    return "%s && %s" % (compiler.process(arg1, **kw), compiler.process(arg2, **kw))


@compiles(Geometry_IntersectsBbox, 'sqlite') # type: ignore[no-untyped-call, misc]
def _sqlite_intersects(element: Geometry_IntersectsBbox,
                       compiler: 'sa.Compiled', **kw: Any) -> str:
    return "MbrIntersects(%s) = 1" % compiler.process(element.clauses, **kw)


class Geometry_ColumnIntersectsBbox(sa.sql.expression.FunctionElement[Any]):
    """ Check if the bounding box of the geometry intersects with the
        given table column, using the spatial index for the column.

        The index must exist or the query may return nothing.
    """
    name = 'Geometry_ColumnIntersectsBbox'
    inherit_cache = True


@compiles(Geometry_ColumnIntersectsBbox) # type: ignore[no-untyped-call, misc]
def default_intersects_column(element: Geometry_ColumnIntersectsBbox,
                              compiler: 'sa.Compiled', **kw: Any) -> str:
    arg1, arg2 = list(element.clauses)
    return "%s && %s" % (compiler.process(arg1, **kw), compiler.process(arg2, **kw))


@compiles(Geometry_ColumnIntersectsBbox, 'sqlite') # type: ignore[no-untyped-call, misc]
def spatialite_intersects_column(element: Geometry_ColumnIntersectsBbox,
                                 compiler: 'sa.Compiled', **kw: Any) -> str:
    arg1, arg2 = list(element.clauses)
    return "MbrIntersects(%s, %s) = 1 and "\
           "%s.ROWID IN (SELECT ROWID FROM SpatialIndex "\
           "WHERE f_table_name = '%s' AND f_geometry_column = '%s' "\
           "AND search_frame = %s)" %(
              compiler.process(arg1, **kw),
              compiler.process(arg2, **kw),
              arg1.table.name, arg1.table.name, arg1.name,
              compiler.process(arg2, **kw))


class Geometry_ColumnDWithin(sa.sql.expression.FunctionElement[Any]):
    """ Check if the geometry is within the distance of the
        given table column, using the spatial index for the column.

        The index must exist or the query may return nothing.
    """
    name = 'Geometry_ColumnDWithin'
    inherit_cache = True


@compiles(Geometry_ColumnDWithin) # type: ignore[no-untyped-call, misc]
def default_dwithin_column(element: Geometry_ColumnDWithin,
                           compiler: 'sa.Compiled', **kw: Any) -> str:
    return "ST_DWithin(%s)" % compiler.process(element.clauses, **kw)

@compiles(Geometry_ColumnDWithin, 'sqlite') # type: ignore[no-untyped-call, misc]
def spatialite_dwithin_column(element: Geometry_ColumnDWithin,
                              compiler: 'sa.Compiled', **kw: Any) -> str:
    geom1, geom2, dist = list(element.clauses)
    return "ST_Distance(%s, %s) < %s and "\
           "%s.ROWID IN (SELECT ROWID FROM SpatialIndex "\
           "WHERE f_table_name = '%s' AND f_geometry_column = '%s' "\
           "AND search_frame = ST_Expand(%s, %s))" %(
              compiler.process(geom1, **kw),
              compiler.process(geom2, **kw),
              compiler.process(dist, **kw),
              geom1.table.name, geom1.table.name, geom1.name,
              compiler.process(geom2, **kw),
              compiler.process(dist, **kw))


class Geometry(types.UserDefinedType): # type: ignore[type-arg]
    """ Simplified type decorator for PostGIS geometry. This type
        only supports geometries in 4326 projection.
    """
    cache_ok = True

    def __init__(self, subtype: str = 'Geometry'):
        self.subtype = subtype


    def get_col_spec(self) -> str:
        return f'GEOMETRY({self.subtype}, 4326)'


    def bind_processor(self, dialect: 'sa.Dialect') -> Callable[[Any], str]:
        def process(value: Any) -> str:
            if isinstance(value, str):
                return value

            return cast(str, value.to_wkt())
        return process


    def result_processor(self, dialect: 'sa.Dialect', coltype: object) -> Callable[[Any], str]:
        def process(value: Any) -> str:
            assert isinstance(value, str)
            return value
        return process


    def column_expression(self, col: SaColumn) -> SaColumn:
        return sa.func.ST_AsEWKB(col)


    def bind_expression(self, bindvalue: SaBind) -> SaColumn:
        return sa.func.ST_GeomFromText(bindvalue, sa.text('4326'), type_=self)


    class comparator_factory(types.UserDefinedType.Comparator): # type: ignore[type-arg]

        def intersects(self, other: SaColumn, use_index: bool = True) -> 'sa.Operators':
            if not use_index:
                return Geometry_IntersectsBbox(sa.func.coalesce(sa.null(), self.expr), other)

            if isinstance(self.expr, sa.Column):
                return Geometry_ColumnIntersectsBbox(self.expr, other)

            return Geometry_IntersectsBbox(self.expr, other)


        def is_line_like(self) -> SaColumn:
            return Geometry_IsLineLike(self)


        def is_area(self) -> SaColumn:
            return Geometry_IsAreaLike(self)


        def within_distance(self, other: SaColumn, distance: SaColumn) -> SaColumn:
            if isinstance(self.expr, sa.Column):
                return Geometry_ColumnDWithin(self.expr, other, distance)

            return self.ST_Distance(other) < distance


        def ST_Distance(self, other: SaColumn) -> SaColumn:
            return sa.func.ST_Distance(self, other, type_=sa.Float)


        def ST_Contains(self, other: SaColumn) -> SaColumn:
            return sa.func.ST_Contains(self, other, type_=sa.Boolean)


        def ST_CoveredBy(self, other: SaColumn) -> SaColumn:
            return sa.func.ST_CoveredBy(self, other, type_=sa.Boolean)


        def ST_ClosestPoint(self, other: SaColumn) -> SaColumn:
            return sa.func.coalesce(sa.func.ST_ClosestPoint(self, other, type_=Geometry),
                                    other)


        def ST_Buffer(self, other: SaColumn) -> SaColumn:
            return sa.func.ST_Buffer(self, other, type_=Geometry)


        def ST_Expand(self, other: SaColumn) -> SaColumn:
            return sa.func.ST_Expand(self, other, type_=Geometry)


        def ST_Collect(self) -> SaColumn:
            return sa.func.ST_Collect(self, type_=Geometry)


        def ST_Centroid(self) -> SaColumn:
            return sa.func.ST_Centroid(self, type_=Geometry)


        def ST_LineInterpolatePoint(self, other: SaColumn) -> SaColumn:
            return sa.func.ST_LineInterpolatePoint(self, other, type_=Geometry)


        def ST_LineLocatePoint(self, other: SaColumn) -> SaColumn:
            return sa.func.ST_LineLocatePoint(self, other, type_=sa.Float)


        def distance_spheroid(self, other: SaColumn) -> SaColumn:
            return Geometry_DistanceSpheroid(self, other)


@compiles(Geometry, 'sqlite') # type: ignore[no-untyped-call]
def get_col_spec(self, *args, **kwargs): # type: ignore[no-untyped-def]
    return 'GEOMETRY'


SQLITE_FUNCTION_ALIAS = (
    ('ST_AsEWKB', sa.Text, 'AsEWKB'),
    ('ST_GeomFromEWKT', Geometry, 'GeomFromEWKT'),
    ('ST_AsGeoJSON', sa.Text, 'AsGeoJSON'),
    ('ST_AsKML', sa.Text, 'AsKML'),
    ('ST_AsSVG', sa.Text, 'AsSVG'),
    ('ST_LineLocatePoint', sa.Float, 'ST_Line_Locate_Point'),
    ('ST_LineInterpolatePoint', sa.Float, 'ST_Line_Interpolate_Point'),
)

def _add_function_alias(func: str, ftype: type, alias: str) -> None:
    _FuncDef = type(func, (sa.sql.functions.GenericFunction, ), {
        "type": ftype(),
        "name": func,
        "identifier": func,
        "inherit_cache": True})

    func_templ = f"{alias}(%s)"

    def _sqlite_impl(element: Any, compiler: Any, **kw: Any) -> Any:
        return func_templ % compiler.process(element.clauses, **kw)

    compiles(_FuncDef, 'sqlite')(_sqlite_impl) # type: ignore[no-untyped-call]

for alias in SQLITE_FUNCTION_ALIAS:
    _add_function_alias(*alias)
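A sketch of the comparator in action (table name and WKT value invented). The same Python expression compiles to `&&` with an index-backed variant on PostgreSQL and to the MbrIntersects()/SpatialIndex emulation on SQLite:

import sqlalchemy as sa

demo = sa.Table('demo', sa.MetaData(), sa.Column('geometry', Geometry))
# Bind the WKT literal with the Geometry type so bind_expression() wraps it
# in ST_GeomFromText(..., 4326):
view = sa.bindparam('view', 'POINT(8.54 47.37)', type_=Geometry())
stmt = sa.select(demo).where(demo.c.geometry.intersects(view))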
123
src/nominatim_core/db/sqlalchemy_types/int_array.py
Normal file
@@ -0,0 +1,123 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Custom type for an array of integers.
"""
from typing import Any, List, cast, Optional

import sqlalchemy as sa
from sqlalchemy.ext.compiler import compiles
from sqlalchemy.dialects.postgresql import ARRAY

from ...typing import SaDialect, SaColumn

# pylint: disable=all

class IntList(sa.types.TypeDecorator[Any]):
    """ A list of integers saved as a text of comma-separated numbers.
    """
    impl = sa.types.Unicode
    cache_ok = True

    def process_bind_param(self, value: Optional[Any], dialect: 'sa.Dialect') -> Optional[str]:
        if value is None:
            return None

        assert isinstance(value, list)
        return ','.join(map(str, value))

    def process_result_value(self, value: Optional[Any],
                             dialect: SaDialect) -> Optional[List[int]]:
        return [int(v) for v in value.split(',')] if value is not None else None

    def copy(self, **kw: Any) -> 'IntList':
        return IntList(self.impl.length)


class IntArray(sa.types.TypeDecorator[Any]):
    """ Dialect-independent list of integers.
    """
    impl = IntList
    cache_ok = True

    def load_dialect_impl(self, dialect: SaDialect) -> sa.types.TypeEngine[Any]:
        if dialect.name == 'postgresql':
            return ARRAY(sa.Integer()) #pylint: disable=invalid-name

        return IntList()


    class comparator_factory(sa.types.UserDefinedType.Comparator): # type: ignore[type-arg]

        def __add__(self, other: SaColumn) -> 'sa.ColumnOperators':
            """ Concatenate the array with the given array. If one of the
                operands is null, the value of the other will be returned.
            """
            return ArrayCat(self.expr, other)


        def contains(self, other: SaColumn, **kwargs: Any) -> 'sa.ColumnOperators':
            """ Return true if the array contains all the values of the argument
                array.
            """
            return ArrayContains(self.expr, other)



class ArrayAgg(sa.sql.functions.GenericFunction[Any]):
    """ Aggregate function to collect elements in an array.
    """
    type = IntArray()
    identifier = 'ArrayAgg'
    name = 'array_agg'
    inherit_cache = True


@compiles(ArrayAgg, 'sqlite') # type: ignore[no-untyped-call, misc]
def sqlite_array_agg(element: ArrayAgg, compiler: 'sa.Compiled', **kw: Any) -> str:
    return "group_concat(%s, ',')" % compiler.process(element.clauses, **kw)



class ArrayContains(sa.sql.expression.FunctionElement[Any]):
    """ Function to check if an array is fully contained in another.
    """
    name = 'ArrayContains'
    inherit_cache = True


@compiles(ArrayContains) # type: ignore[no-untyped-call, misc]
def generic_array_contains(element: ArrayContains, compiler: 'sa.Compiled', **kw: Any) -> str:
    arg1, arg2 = list(element.clauses)
    return "(%s @> %s)" % (compiler.process(arg1, **kw),
                           compiler.process(arg2, **kw))


@compiles(ArrayContains, 'sqlite') # type: ignore[no-untyped-call, misc]
def sqlite_array_contains(element: ArrayContains, compiler: 'sa.Compiled', **kw: Any) -> str:
    return "array_contains(%s)" % compiler.process(element.clauses, **kw)



class ArrayCat(sa.sql.expression.FunctionElement[Any]):
    """ Function to concatenate two arrays.
    """
    type = IntArray()
    identifier = 'ArrayCat'
    inherit_cache = True


@compiles(ArrayCat) # type: ignore[no-untyped-call, misc]
def generic_array_cat(element: ArrayCat, compiler: 'sa.Compiled', **kw: Any) -> str:
    return "array_cat(%s)" % compiler.process(element.clauses, **kw)


@compiles(ArrayCat, 'sqlite') # type: ignore[no-untyped-call, misc]
def sqlite_array_cat(element: ArrayCat, compiler: 'sa.Compiled', **kw: Any) -> str:
    arg1, arg2 = list(element.clauses)
    return "(%s || ',' || %s)" % (compiler.process(arg1, **kw), compiler.process(arg2, **kw))
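A sketch of the operators on a column of this type (table name and values invented; literal binding details simplified):

import sqlalchemy as sa

t = sa.Table('demo', sa.MetaData(), sa.Column('name_vector', IntArray))
# Compiles to `name_vector @> ...` on PostgreSQL and to the
# array_contains() emulation on SQLite:
stmt = sa.select(t).where(t.c.name_vector.contains([112, 555]))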
30
src/nominatim_core/db/sqlalchemy_types/json.py
Normal file
@@ -0,0 +1,30 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Common json type for different dialects.
"""
from typing import Any

import sqlalchemy as sa
from sqlalchemy.dialects.postgresql import JSONB
from sqlalchemy.dialects.sqlite import JSON as sqlite_json

from ...typing import SaDialect

# pylint: disable=all

class Json(sa.types.TypeDecorator[Any]):
    """ Dialect-independent type for JSON.
    """
    impl = sa.types.JSON
    cache_ok = True

    def load_dialect_impl(self, dialect: SaDialect) -> sa.types.TypeEngine[Any]:
        if dialect.name == 'postgresql':
            return JSONB(none_as_null=True) # type: ignore[no-untyped-call]

        return sqlite_json(none_as_null=True)
62
src/nominatim_core/db/sqlalchemy_types/key_value.py
Normal file
@@ -0,0 +1,62 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
A custom type that implements a simple key-value store of strings.
"""
from typing import Any

import sqlalchemy as sa
from sqlalchemy.ext.compiler import compiles
from sqlalchemy.dialects.postgresql import HSTORE
from sqlalchemy.dialects.sqlite import JSON as sqlite_json

from ...typing import SaDialect, SaColumn

# pylint: disable=all

class KeyValueStore(sa.types.TypeDecorator[Any]):
    """ Dialect-independent type of a simple key-value store of strings.
    """
    impl = HSTORE
    cache_ok = True

    def load_dialect_impl(self, dialect: SaDialect) -> sa.types.TypeEngine[Any]:
        if dialect.name == 'postgresql':
            return HSTORE() # type: ignore[no-untyped-call]

        return sqlite_json(none_as_null=True)


    class comparator_factory(sa.types.UserDefinedType.Comparator): # type: ignore[type-arg]

        def merge(self, other: SaColumn) -> 'sa.Operators':
            """ Merge the values from the given KeyValueStore into this
                one, overwriting values where necessary. When the argument
                is null, nothing happens.
            """
            return KeyValueConcat(self.expr, other)


class KeyValueConcat(sa.sql.expression.FunctionElement[Any]):
    """ Return the merged key-value store from the input parameters.
    """
    type = KeyValueStore()
    name = 'JsonConcat'
    inherit_cache = True

@compiles(KeyValueConcat) # type: ignore[no-untyped-call, misc]
def default_json_concat(element: KeyValueConcat, compiler: 'sa.Compiled', **kw: Any) -> str:
    arg1, arg2 = list(element.clauses)
    return "(%s || coalesce(%s, ''::hstore))" % (compiler.process(arg1, **kw), compiler.process(arg2, **kw))

@compiles(KeyValueConcat, 'sqlite') # type: ignore[no-untyped-call, misc]
def sqlite_json_concat(element: KeyValueConcat, compiler: 'sa.Compiled', **kw: Any) -> str:
    arg1, arg2 = list(element.clauses)
    return "json_patch(%s, coalesce(%s, '{}'))" % (compiler.process(arg1, **kw), compiler.process(arg2, **kw))
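A merge sketch (table name invented): on PostgreSQL the expression compiles to hstore concatenation, on SQLite to json_patch(); a NULL right-hand side is a no-op in both dialects thanks to the coalesce:

import sqlalchemy as sa

t = sa.Table('demo', sa.MetaData(),
             sa.Column('name', KeyValueStore),
             sa.Column('derived_name', KeyValueStore))
stmt = sa.select(t.c.name.merge(t.c.derived_name))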
127
src/nominatim_core/db/status.py
Normal file
@@ -0,0 +1,127 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Access and helper functions for the status and status log table.
"""
from typing import Optional, Tuple, cast
import datetime as dt
import logging
import re

from .connection import Connection
from ..utils.url_utils import get_url
from ..errors import UsageError
from ..typing import TypedDict

LOG = logging.getLogger()
ISODATE_FORMAT = '%Y-%m-%dT%H:%M:%S'


class StatusRow(TypedDict):
    """ Dictionary of columns of the import_status table.
    """
    lastimportdate: dt.datetime
    sequence_id: Optional[int]
    indexed: Optional[bool]


def compute_database_date(conn: Connection, offline: bool = False) -> dt.datetime:
    """ Determine the date of the database from the newest object in the
        database.
    """
    # If there is a date from osm2pgsql available, use that.
    if conn.table_exists('osm2pgsql_properties'):
        with conn.cursor() as cur:
            cur.execute(""" SELECT value FROM osm2pgsql_properties
                            WHERE property = 'current_timestamp' """)
            row = cur.fetchone()
            if row is not None:
                return dt.datetime.strptime(row[0], "%Y-%m-%dT%H:%M:%SZ")\
                                  .replace(tzinfo=dt.timezone.utc)

    if offline:
        raise UsageError("Cannot determine database date from data in offline mode.")

    # Else, find the node with the highest ID in the database.
    with conn.cursor() as cur:
        if conn.table_exists('place'):
            osmid = cur.scalar("SELECT max(osm_id) FROM place WHERE osm_type='N'")
        else:
            osmid = cur.scalar("SELECT max(osm_id) FROM placex WHERE osm_type='N'")

        if osmid is None:
            LOG.fatal("No data found in the database.")
            raise UsageError("No data found in the database.")

    LOG.info("Using node id %d for timestamp lookup", osmid)
    # Get the node from the API to find the timestamp when it was created.
    node_url = f'https://www.openstreetmap.org/api/0.6/node/{osmid}/1'
    data = get_url(node_url)

    match = re.search(r'timestamp="((\d{4})-(\d{2})-(\d{2})T(\d{2}):(\d{2}):(\d{2}))Z"', data)

    if match is None:
        LOG.fatal("The node data downloaded from the API does not contain valid data.\n"
                  "URL used: %s", node_url)
        raise UsageError("Bad API data.")

    LOG.debug("Found timestamp %s", match.group(1))

    return dt.datetime.strptime(match.group(1), ISODATE_FORMAT).replace(tzinfo=dt.timezone.utc)


def set_status(conn: Connection, date: Optional[dt.datetime],
               seq: Optional[int] = None, indexed: bool = True) -> None:
    """ Replace the current status with the given status. If date is `None`
        then only sequence and indexed will be updated as given. Otherwise
        the whole status is replaced.
        The change will be committed to the database.
    """
    assert date is None or date.tzinfo == dt.timezone.utc
    with conn.cursor() as cur:
        if date is None:
            cur.execute("UPDATE import_status set sequence_id = %s, indexed = %s",
                        (seq, indexed))
        else:
            cur.execute("TRUNCATE TABLE import_status")
            cur.execute("""INSERT INTO import_status (lastimportdate, sequence_id, indexed)
                           VALUES (%s, %s, %s)""", (date, seq, indexed))

    conn.commit()


def get_status(conn: Connection) -> Tuple[Optional[dt.datetime], Optional[int], Optional[bool]]:
    """ Return the current status as a triple of (date, sequence, indexed).
        If status has not been set up yet, a triple of None is returned.
    """
    with conn.cursor() as cur:
        cur.execute("SELECT * FROM import_status LIMIT 1")
        if cur.rowcount < 1:
            return None, None, None

        row = cast(StatusRow, cur.fetchone())
        return row['lastimportdate'], row['sequence_id'], row['indexed']


def set_indexed(conn: Connection, state: bool) -> None:
    """ Set the indexed flag in the status table to the given state.
    """
    with conn.cursor() as cur:
        cur.execute("UPDATE import_status SET indexed = %s", (state, ))
    conn.commit()


def log_status(conn: Connection, start: dt.datetime,
               event: str, batchsize: Optional[int] = None) -> None:
    """ Write a new status line to the `import_osmosis_log` table.
    """
    with conn.cursor() as cur:
        cur.execute("""INSERT INTO import_osmosis_log
                       (batchend, batchseq, batchsize, starttime, endtime, event)
                       SELECT lastimportdate, sequence_id, %s, %s, now(), %s FROM import_status""",
                    (batchsize, start, event))
    conn.commit()
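A typical replication bookkeeping sequence, sketched (the sequence number is invented; `conn` is a Connection as above):

import datetime as dt

start = dt.datetime.now(dt.timezone.utc)
date = compute_database_date(conn)
set_status(conn, date, seq=12345, indexed=False)
# ... run the indexer ...
set_indexed(conn, True)
log_status(conn, start, 'index')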
129
src/nominatim_core/db/utils.py
Normal file
@@ -0,0 +1,129 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Helper functions for handling DB accesses.
"""
from typing import IO, Optional, Union, Any, Iterable
import subprocess
import logging
import gzip
import io
from pathlib import Path

from .connection import get_pg_env, Cursor
from ..errors import UsageError

LOG = logging.getLogger()

def _pipe_to_proc(proc: 'subprocess.Popen[bytes]',
                  fdesc: Union[IO[bytes], gzip.GzipFile]) -> int:
    assert proc.stdin is not None
    chunk = fdesc.read(2048)
    while chunk and proc.poll() is None:
        try:
            proc.stdin.write(chunk)
        except BrokenPipeError as exc:
            raise UsageError("Failed to execute SQL file.") from exc
        chunk = fdesc.read(2048)

    return len(chunk)

def execute_file(dsn: str, fname: Path,
                 ignore_errors: bool = False,
                 pre_code: Optional[str] = None,
                 post_code: Optional[str] = None) -> None:
    """ Read an SQL file and run its contents against the given database
        using psql. Use `pre_code` and `post_code` to run extra commands
        before or after executing the file. The commands are run within the
        same session, so they may be used to wrap the file execution in a
        transaction.
    """
    cmd = ['psql']
    if not ignore_errors:
        cmd.extend(('-v', 'ON_ERROR_STOP=1'))
    if not LOG.isEnabledFor(logging.INFO):
        cmd.append('--quiet')

    with subprocess.Popen(cmd, env=get_pg_env(dsn), stdin=subprocess.PIPE) as proc:
        assert proc.stdin is not None
        try:
            if not LOG.isEnabledFor(logging.INFO):
                proc.stdin.write('set client_min_messages to WARNING;'.encode('utf-8'))

            if pre_code:
                proc.stdin.write((pre_code + ';').encode('utf-8'))

            if fname.suffix == '.gz':
                with gzip.open(str(fname), 'rb') as fdesc:
                    remain = _pipe_to_proc(proc, fdesc)
            else:
                with fname.open('rb') as fdesc:
                    remain = _pipe_to_proc(proc, fdesc)

            if remain == 0 and post_code:
                proc.stdin.write((';' + post_code).encode('utf-8'))
        finally:
            proc.stdin.close()
            ret = proc.wait()

    if ret != 0 or remain > 0:
        raise UsageError("Failed to execute SQL file.")


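A short sketch of execute_file() in use, wrapping the file in a transaction via pre_code/post_code (DSN and file name are placeholders):

from pathlib import Path

from nominatim_core.db.utils import execute_file

execute_file('dbname=nominatim', Path('lib-sql/functions.sql'),
             pre_code='BEGIN', post_code='COMMIT')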
# List of characters that need to be quoted for the copy command.
_SQL_TRANSLATION = {ord('\\'): '\\\\',
                    ord('\t'): '\\t',
                    ord('\n'): '\\n'}


class CopyBuffer:
    """ Data collector for the copy_from command.
    """

    def __init__(self) -> None:
        self.buffer = io.StringIO()


    def __enter__(self) -> 'CopyBuffer':
        return self


    def size(self) -> int:
        """ Return the number of bytes the buffer currently contains.
        """
        return self.buffer.tell()

    def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None:
        if self.buffer is not None:
            self.buffer.close()


    def add(self, *data: Any) -> None:
        """ Add another row of data to the copy buffer.
        """
        first = True
        for column in data:
            if first:
                first = False
            else:
                self.buffer.write('\t')
            if column is None:
                self.buffer.write('\\N')
            else:
                self.buffer.write(str(column).translate(_SQL_TRANSLATION))
        self.buffer.write('\n')


    def copy_out(self, cur: Cursor, table: str, columns: Optional[Iterable[str]] = None) -> None:
        """ Copy all collected data into the given table.

            The buffer is empty and reusable after this operation.
        """
        if self.buffer.tell() > 0:
            self.buffer.seek(0)
            cur.copy_from(self.buffer, table, columns=columns)
            self.buffer = io.StringIO()
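A brief usage sketch for CopyBuffer (`cur` is assumed to be an open psycopg2 cursor; table and column names are illustrative):

with CopyBuffer() as buf:
    # Rows become tab-separated lines; None turns into \N, the COPY NULL marker.
    buf.add(1, 'first place', None)
    buf.add(2, 'second place', 'extra')
    buf.copy_out(cur, 'place_rows', columns=('id', 'name', 'extra'))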
14
src/nominatim_core/errors.py
Normal file
@@ -0,0 +1,14 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Custom exception and error classes for Nominatim.
"""

class UsageError(Exception):
    """ An error raised because of bad user input. This error will usually
        not cause a stack trace to be printed unless debugging is enabled.
    """
15
src/nominatim_core/paths.py
Normal file
@@ -0,0 +1,15 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Path settings for extra data used by Nominatim.
"""
from pathlib import Path

PHPLIB_DIR = (Path(__file__) / '..' / '..' / '..' / 'lib-php').resolve()
SQLLIB_DIR = (Path(__file__) / '..' / '..' / '..' / 'lib-sql').resolve()
DATA_DIR = (Path(__file__) / '..' / '..' / '..' / 'data').resolve()
CONFIG_DIR = (Path(__file__) / '..' / '..' / '..' / 'settings').resolve()
0
src/nominatim_core/py.typed
Normal file
75
src/nominatim_core/typing.py
Normal file
@@ -0,0 +1,75 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Type definitions for typing annotations.

Complex type definitions are moved here, to keep the source files readable.
"""
from typing import Any, Union, Mapping, TypeVar, Sequence, TYPE_CHECKING

# Generic variable names do not conform to naming styles, ignore globally here.
# pylint: disable=invalid-name,abstract-method,multiple-statements
# pylint: disable=missing-class-docstring,useless-import-alias

if TYPE_CHECKING:
    import psycopg2.sql
    import psycopg2.extensions
    import psycopg2.extras
    import os

StrPath = Union[str, 'os.PathLike[str]']

SysEnv = Mapping[str, str]

# psycopg2-related types

Query = Union[str, bytes, 'psycopg2.sql.Composable']

T_ResultKey = TypeVar('T_ResultKey', int, str)

class DictCursorResult(Mapping[str, Any]):
    def __getitem__(self, x: Union[int, str]) -> Any: ...

DictCursorResults = Sequence[DictCursorResult]

T_cursor = TypeVar('T_cursor', bound='psycopg2.extensions.cursor')

# The following typing features require typing_extensions to work
# on all supported Python versions.
# Only require this for type checking but not for normal operations.

if TYPE_CHECKING:
    from typing_extensions import (Protocol as Protocol,
                                   Final as Final,
                                   TypedDict as TypedDict)
else:
    Protocol = object
    Final = 'Final'
    TypedDict = dict


# SQLAlchemy introduced generic types in version 2.0 making typing
# incompatible with older versions. Add wrappers here so we don't have
# to litter the code with bare-string types.

if TYPE_CHECKING:
    import sqlalchemy as sa
    from typing_extensions import (TypeAlias as TypeAlias)
else:
    TypeAlias = str

SaLambdaSelect: TypeAlias = 'Union[sa.Select[Any], sa.StatementLambdaElement]'
SaSelect: TypeAlias = 'sa.Select[Any]'
SaScalarSelect: TypeAlias = 'sa.ScalarSelect[Any]'
SaRow: TypeAlias = 'sa.Row[Any]'
SaColumn: TypeAlias = 'sa.ColumnElement[Any]'
SaExpression: TypeAlias = 'sa.ColumnElement[bool]'
SaLabel: TypeAlias = 'sa.Label[Any]'
SaFromClause: TypeAlias = 'sa.FromClause'
SaSelectable: TypeAlias = 'sa.Selectable'
SaBind: TypeAlias = 'sa.BindParameter[Any]'
SaDialect: TypeAlias = 'sa.Dialect'
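A small sketch of what the fallback buys at runtime (illustrative): outside of type checking, the annotated assignments above simply bind strings, so importing this module never requires SQLAlchemy 2.0.

from nominatim_core.typing import SaRow

# mypy sees 'sa.Row[Any]'; the Python interpreter just sees a plain string.
assert isinstance(SaRow, str) and SaRow == 'sa.Row[Any]'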
0
src/nominatim_core/utils/__init__.py
Normal file
49
src/nominatim_core/utils/centroid.py
Normal file
@@ -0,0 +1,49 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Functions for computation of centroids.
"""
from typing import Tuple, Any
from collections.abc import Collection

class PointsCentroid:
    """ Centroid computation from single points using an online algorithm.
        More points may be added at any time.

        Coordinates are internally treated as a 7-digit fixed-point float
        (i.e. in OSM style).
    """

    def __init__(self) -> None:
        self.sum_x = 0
        self.sum_y = 0
        self.count = 0

    def centroid(self) -> Tuple[float, float]:
        """ Return the centroid of all points collected so far.
        """
        if self.count == 0:
            raise ValueError("No points available for centroid.")

        return (float(self.sum_x/self.count)/10000000,
                float(self.sum_y/self.count)/10000000)


    def __len__(self) -> int:
        return self.count


    def __iadd__(self, other: Any) -> 'PointsCentroid':
        if isinstance(other, Collection) and len(other) == 2:
            if all(isinstance(p, (float, int)) for p in other):
                x, y = other
                self.sum_x += int(x * 10000000)
                self.sum_y += int(y * 10000000)
                self.count += 1
                return self

        raise ValueError("Can only add 2-element tuples to centroid.")
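A quick usage sketch: add coordinate pairs with `+=` and read off the running centroid.

from nominatim_core.utils.centroid import PointsCentroid

points = PointsCentroid()
points += (0.0, 0.0)
points += (1.0, 2.0)
assert len(points) == 2
print(points.centroid())   # approximately (0.5, 1.0)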
149
src/nominatim_core/utils/json_writer.py
Normal file
@@ -0,0 +1,149 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Streaming JSON encoder.
"""
from typing import Any, TypeVar, Optional, Callable
import io
try:
    import ujson as json
except ModuleNotFoundError:
    import json  # type: ignore[no-redef]

T = TypeVar('T')  # pylint: disable=invalid-name

class JsonWriter:
    """ JSON encoder that renders the output directly into an output
        stream. This is a very simple writer which produces JSON in
        as compact a form as possible.

        The writer does not check for syntactic correctness. It is the
        responsibility of the caller to call the write functions in an
        order that produces correct JSON.

        All functions return the writer object itself so that function
        calls can be chained.
    """

    def __init__(self) -> None:
        self.data = io.StringIO()
        self.pending = ''


    def __call__(self) -> str:
        """ Return the rendered JSON content as a string.
            The writer remains usable after calling this function.
        """
        if self.pending:
            assert self.pending in (']', '}')
            self.data.write(self.pending)
            self.pending = ''
        return self.data.getvalue()


    def start_object(self) -> 'JsonWriter':
        """ Write the open bracket of a JSON object.
        """
        if self.pending:
            self.data.write(self.pending)
        self.pending = '{'
        return self


    def end_object(self) -> 'JsonWriter':
        """ Write the closing bracket of a JSON object.
        """
        # A pending closing bracket from a nested structure must be
        # flushed before this object can be closed.
        assert self.pending in (',', '{', '}', ']', '')
        if self.pending not in (',', ''):
            self.data.write(self.pending)
        self.pending = '}'
        return self


    def start_array(self) -> 'JsonWriter':
        """ Write the opening bracket of a JSON array.
        """
        if self.pending:
            self.data.write(self.pending)
        self.pending = '['
        return self


    def end_array(self) -> 'JsonWriter':
        """ Write the closing bracket of a JSON array.
        """
        assert self.pending in (',', '[', ']', '}', '')
        if self.pending not in (',', ''):
            self.data.write(self.pending)
        self.pending = ']'
        return self


    def key(self, name: str) -> 'JsonWriter':
        """ Write the key string of a JSON object.
        """
        assert self.pending
        self.data.write(self.pending)
        self.data.write(json.dumps(name, ensure_ascii=False))
        self.pending = ':'
        return self


    def value(self, value: Any) -> 'JsonWriter':
        """ Write out a value as JSON. The function uses the json.dumps()
            function for encoding the JSON. Thus any value that can be
            encoded by that function is permissible here.
        """
        return self.raw(json.dumps(value, ensure_ascii=False))


    def float(self, value: float, precision: int) -> 'JsonWriter':
        """ Write out a float value with the given precision.
        """
        return self.raw(f"{value:0.{precision}f}")

    def next(self) -> 'JsonWriter':
        """ Write out a delimiter comma between JSON object or array elements.
        """
        if self.pending:
            self.data.write(self.pending)
        self.pending = ','
        return self


    def raw(self, raw_json: str) -> 'JsonWriter':
        """ Write out the given value as is. This function is useful if
            a value is already available in JSON format.
        """
        if self.pending:
            self.data.write(self.pending)
            self.pending = ''
        self.data.write(raw_json)
        return self


    def keyval(self, key: str, value: Any) -> 'JsonWriter':
        """ Write out an object element with the given key and value.
            This is a shortcut for calling 'key()', 'value()' and 'next()'.
        """
        self.key(key)
        self.value(value)
        return self.next()


    def keyval_not_none(self, key: str, value: Optional[T],
                        transform: Optional[Callable[[T], Any]] = None) -> 'JsonWriter':
        """ Write out an object element only if the value is not None.
            If 'transform' is given, it must be a function that takes the
            value type and returns a JSON encodable type. The transform
            function will be called before the value is written out.
        """
        if value is not None:
            self.key(key)
            self.value(transform(value) if transform else value)
            self.next()
        return self
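A quick sketch of the chained writer interface (output shown in the comment):

from nominatim_core.utils.json_writer import JsonWriter

out = JsonWriter()
out.start_object().keyval('name', 'Berlin')
out.key('point').start_array().float(52.52, 4).next().float(13.405, 4).end_array()
out.end_object()
print(out())   # {"name":"Berlin","point":[52.5200,13.4050]}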
31
src/nominatim_core/utils/url_utils.py
Normal file
@@ -0,0 +1,31 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Helper functions for accessing URLs.
"""
from typing import IO
import logging
import urllib.request as urlrequest

from ..version import NOMINATIM_CORE_VERSION

LOG = logging.getLogger()

def get_url(url: str) -> str:
    """ Get the contents from the given URL and return it as a UTF-8 string.

        This version makes sure that an appropriate user agent is sent.
    """
    headers = {"User-Agent": f"Nominatim/{NOMINATIM_CORE_VERSION!s}"}

    try:
        request = urlrequest.Request(url, headers=headers)
        with urlrequest.urlopen(request) as response:  # type: IO[bytes]
            return response.read().decode('utf-8')
    except Exception:
        LOG.fatal('Failed to load URL: %s', url)
        raise
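A one-line usage sketch (the URL is illustrative):

from nominatim_core.utils.url_utils import get_url

state = get_url('https://planet.openstreetmap.org/replication/minute/state.txt')
print(state.splitlines()[0])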
11
src/nominatim_core/version.py
Normal file
@@ -0,0 +1,11 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Version information for the Nominatim core package.
"""

NOMINATIM_CORE_VERSION = '4.4.99'
0
src/nominatim_db/__init__.py
Normal file
228
src/nominatim_db/cli.py
Normal file
@@ -0,0 +1,228 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Command-line interface to the Nominatim functions for import, update,
database administration and querying.
"""
from typing import Optional, Any
import importlib
import logging
import os
import sys
import argparse
from pathlib import Path

from nominatim_core.config import Configuration
from nominatim_core.errors import UsageError
from .tools.exec_utils import run_php_server
from . import clicmd
from . import version
from .clicmd.args import NominatimArgs, Subcommand

LOG = logging.getLogger()

class CommandlineParser:
    """ Wraps some of the common functions for parsing the command line
        and setting up subcommands.
    """
    def __init__(self, prog: str, description: Optional[str]):
        self.parser = argparse.ArgumentParser(
            prog=prog,
            description=description,
            formatter_class=argparse.RawDescriptionHelpFormatter)

        self.subs = self.parser.add_subparsers(title='available commands',
                                               dest='subcommand')

        # Global arguments that only work if no sub-command given
        self.parser.add_argument('--version', action='store_true',
                                 help='Print Nominatim version and exit')

        # Arguments added to every sub-command
        self.default_args = argparse.ArgumentParser(add_help=False)
        group = self.default_args.add_argument_group('Default arguments')
        group.add_argument('-h', '--help', action='help',
                           help='Show this help message and exit')
        group.add_argument('-q', '--quiet', action='store_const', const=0,
                           dest='verbose', default=1,
                           help='Print only error messages')
        group.add_argument('-v', '--verbose', action='count', default=1,
                           help='Increase verbosity of output')
        group.add_argument('--project-dir', metavar='DIR', default='.',
                           help='Base directory of the Nominatim installation (default: .)')
        group.add_argument('-j', '--threads', metavar='NUM', type=int,
                           help='Number of parallel threads to use')


    def nominatim_version_text(self) -> str:
        """ Program name and version number as string
        """
        text = f'Nominatim version {version.NOMINATIM_VERSION!s}'
        if version.GIT_COMMIT_HASH is not None:
            text += f' ({version.GIT_COMMIT_HASH})'
        return text


    def add_subcommand(self, name: str, cmd: Subcommand) -> None:
        """ Add a subcommand to the parser. The subcommand must be a class
            with a function add_args() that adds the parameters for the
            subcommand and a run() function that executes the command.
        """
        assert cmd.__doc__ is not None

        parser = self.subs.add_parser(name, parents=[self.default_args],
                                      help=cmd.__doc__.split('\n', 1)[0],
                                      description=cmd.__doc__,
                                      formatter_class=argparse.RawDescriptionHelpFormatter,
                                      add_help=False)
        parser.set_defaults(command=cmd)
        cmd.add_args(parser)


    def run(self, **kwargs: Any) -> int:
        """ Parse the command line arguments of the program and execute the
            appropriate subcommand.
        """
        args = NominatimArgs()
        try:
            self.parser.parse_args(args=kwargs.get('cli_args'), namespace=args)
        except SystemExit:
            return 1

        if args.version:
            print(self.nominatim_version_text())
            return 0

        if args.subcommand is None:
            self.parser.print_help()
            return 1

        args.project_dir = Path(args.project_dir).resolve()

        if 'cli_args' not in kwargs:
            logging.basicConfig(stream=sys.stderr,
                                format='%(asctime)s: %(message)s',
                                datefmt='%Y-%m-%d %H:%M:%S',
                                level=max(4 - args.verbose, 1) * 10)

        args.config = Configuration(args.project_dir,
                                    environ=kwargs.get('environ', os.environ))
        args.config.set_libdirs(module=kwargs['module_dir'],
                                osm2pgsql=kwargs['osm2pgsql_path'])

        log = logging.getLogger()
        log.warning('Using project directory: %s', str(args.project_dir))

        try:
            return args.command.run(args)
        except UsageError as exception:
            if log.isEnabledFor(logging.DEBUG):
                raise  # use Python's exception printing
            log.fatal('FATAL: %s', exception)

        # If we get here, then execution has failed in some way.
        return 1

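A minimal sketch of a class that satisfies the subcommand contract described in add_subcommand() (class name and behaviour are illustrative):

class HelloWorld:
    """\
    Print a friendly greeting. (Illustrative example command.)
    """

    def add_args(self, parser: argparse.ArgumentParser) -> None:
        parser.add_argument('--name', default='world')

    def run(self, args: NominatimArgs) -> int:
        # argparse sets 'name' on the namespace dynamically.
        print(f'Hello, {args.name}!')
        return 0

# Registered like any other command:
#   parser.add_subcommand('hello', HelloWorld())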
# Subcommand classes
#
# Each class needs to implement two functions: add_args() adds the CLI parameters
# for the subcommand, run() executes the subcommand.
#
# The class documentation doubles as the help text for the command. The
# first line is also used in the summary when calling the program without
# a subcommand.
#
# No need to document the functions each time.
# pylint: disable=C0111
class AdminServe:
    """\
    Start a simple web server for serving the API.

    This command starts a built-in webserver to serve the website
    from the current project directory. This webserver is only suitable
    for testing and development. Do not use it in production setups!

    There are different webservers available. The 'php' engine runs
    the classic PHP frontend. The other engines are Python servers
    which run the new Python frontend code. This is highly experimental
    at the moment and may not include the full API. The default engine
    is 'falcon'.

    By default, the webserver can be accessed at: http://127.0.0.1:8088
    """

    def add_args(self, parser: argparse.ArgumentParser) -> None:
        group = parser.add_argument_group('Server arguments')
        group.add_argument('--server', default='127.0.0.1:8088',
                           help='The address the server will listen to.')
        group.add_argument('--engine', default='falcon',
                           choices=('php', 'falcon', 'starlette'),
                           help='Webserver framework to run. (default: falcon)')


    def run(self, args: NominatimArgs) -> int:
        if args.engine == 'php':
            if args.config.lib_dir.php is None:
                raise UsageError("PHP frontend not configured.")
            run_php_server(args.server, args.project_dir / 'website')
        else:
            import uvicorn  # pylint: disable=import-outside-toplevel
            server_info = args.server.split(':', 1)
            host = server_info[0]
            if len(server_info) > 1:
                if not server_info[1].isdigit():
                    raise UsageError('Invalid format for --server parameter. Use <host>:<port>')
                port = int(server_info[1])
            else:
                port = 8088

            # The Python frontends live in the nominatim_api package.
            server_module = importlib.import_module(f'nominatim_api.server.{args.engine}.server')

            app = server_module.get_application(args.project_dir)
            uvicorn.run(app, host=host, port=port)

        return 0


def get_set_parser() -> CommandlineParser:
    """\
    Initializes the parser and adds the various subcommands for
    the nominatim cli.
    """
    parser = CommandlineParser('nominatim', nominatim.__doc__)

    parser.add_subcommand('import', clicmd.SetupAll())
    parser.add_subcommand('freeze', clicmd.SetupFreeze())
    parser.add_subcommand('replication', clicmd.UpdateReplication())

    parser.add_subcommand('special-phrases', clicmd.ImportSpecialPhrases())

    parser.add_subcommand('add-data', clicmd.UpdateAddData())
    parser.add_subcommand('index', clicmd.UpdateIndex())
    parser.add_subcommand('refresh', clicmd.UpdateRefresh())

    parser.add_subcommand('admin', clicmd.AdminFuncs())

    parser.add_subcommand('export', clicmd.QueryExport())
    parser.add_subcommand('convert', clicmd.ConvertDB())
    parser.add_subcommand('serve', AdminServe())

    parser.add_subcommand('search', clicmd.APISearch())
    parser.add_subcommand('reverse', clicmd.APIReverse())
    parser.add_subcommand('lookup', clicmd.APILookup())
    parser.add_subcommand('details', clicmd.APIDetails())
    parser.add_subcommand('status', clicmd.APIStatus())

    return parser


def nominatim(**kwargs: Any) -> int:
    """\
    Command-line tools for importing, updating, administrating and
    querying the Nominatim database.
    """
    return get_set_parser().run(**kwargs)
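A sketch of calling the entry point from Python instead of the console script (the two library paths are placeholders that must point at a real installation):

from nominatim_db.cli import nominatim

exit_code = nominatim(module_dir='/usr/local/lib/nominatim/module',
                      osm2pgsql_path='/usr/local/bin/osm2pgsql',
                      cli_args=['status'])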
28
src/nominatim_db/clicmd/__init__.py
Normal file
@@ -0,0 +1,28 @@
# SPDX-License-Identifier: GPL-2.0-only
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2023 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Subcommand definitions for the command-line tool.
"""
# mypy and pylint disagree about the style of explicit exports,
# see https://github.com/PyCQA/pylint/issues/6006.
# pylint: disable=useless-import-alias

from .setup import SetupAll as SetupAll
from .replication import UpdateReplication as UpdateReplication
from .api import (APISearch as APISearch,
                  APIReverse as APIReverse,
                  APILookup as APILookup,
                  APIDetails as APIDetails,
                  APIStatus as APIStatus)
from .index import UpdateIndex as UpdateIndex
from .refresh import UpdateRefresh as UpdateRefresh
from .add_data import UpdateAddData as UpdateAddData
from .admin import AdminFuncs as AdminFuncs
from .freeze import SetupFreeze as SetupFreeze
from .special_phrases import ImportSpecialPhrases as ImportSpecialPhrases
from .export import QueryExport as QueryExport
from .convert import ConvertDB as ConvertDB
101
src/nominatim_db/clicmd/add_data.py
Normal file
@@ -0,0 +1,101 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Implementation of the 'add-data' subcommand.
"""
from typing import cast
import argparse
import logging

import psutil

from .args import NominatimArgs

# Do not repeat documentation of subcommand classes.
# pylint: disable=C0111
# Using non-top-level imports to avoid loading modules that may not be needed.
# pylint: disable=E0012,C0415

LOG = logging.getLogger()

class UpdateAddData:
    """\
    Add additional data from a file or an online source.

    This command allows adding or updating the search data in the database.
    The data can come either from an OSM file, or single OSM objects can
    directly be downloaded from the OSM API. This function only loads the
    data into the database. Afterwards it still needs to be integrated
    into the search index. Use the `nominatim index` command for that.

    The command can also be used to add external non-OSM data to the
    database. At the moment the only supported format is TIGER housenumber
    data. See the online documentation at
    https://nominatim.org/release-docs/latest/admin/Import/#installing-tiger-housenumber-data-for-the-us
    for more information.
    """

    def add_args(self, parser: argparse.ArgumentParser) -> None:
        group_name = parser.add_argument_group('Source')
        group1 = group_name.add_mutually_exclusive_group(required=True)
        group1.add_argument('--file', metavar='FILE',
                            help='Import data from an OSM file or diff file')
        group1.add_argument('--diff', metavar='FILE',
                            help='Import data from an OSM diff file (deprecated: use --file)')
        group1.add_argument('--node', metavar='ID', type=int,
                            help='Import a single node from the API')
        group1.add_argument('--way', metavar='ID', type=int,
                            help='Import a single way from the API')
        group1.add_argument('--relation', metavar='ID', type=int,
                            help='Import a single relation from the API')
        group1.add_argument('--tiger-data', metavar='DIR',
                            help='Add housenumbers from the US TIGER census database')
        group2 = parser.add_argument_group('Extra arguments')
        group2.add_argument('--use-main-api', action='store_true',
                            help='Use OSM API instead of Overpass to download objects')
        group2.add_argument('--osm2pgsql-cache', metavar='SIZE', type=int,
                            help='Size of cache to be used by osm2pgsql (in MB)')
        group2.add_argument('--socket-timeout', dest='socket_timeout', type=int, default=60,
                            help='Set timeout for file downloads')


    def run(self, args: NominatimArgs) -> int:
        from ..tokenizer import factory as tokenizer_factory
        from ..tools import tiger_data, add_osm_data

        if args.tiger_data:
            tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)
            return tiger_data.add_tiger_data(args.tiger_data,
                                             args.config,
                                             args.threads or psutil.cpu_count() or 1,
                                             tokenizer)

        osm2pgsql_params = args.osm2pgsql_options(default_cache=1000, default_threads=1)
        if args.file or args.diff:
            return add_osm_data.add_data_from_file(args.config.get_libpq_dsn(),
                                                   cast(str, args.file or args.diff),
                                                   osm2pgsql_params)

        if args.node:
            return add_osm_data.add_osm_object(args.config.get_libpq_dsn(),
                                               'node', args.node,
                                               args.use_main_api,
                                               osm2pgsql_params)

        if args.way:
            return add_osm_data.add_osm_object(args.config.get_libpq_dsn(),
                                               'way', args.way,
                                               args.use_main_api,
                                               osm2pgsql_params)

        if args.relation:
            return add_osm_data.add_osm_object(args.config.get_libpq_dsn(),
                                               'relation', args.relation,
                                               args.use_main_api,
                                               osm2pgsql_params)

        return 0
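A sketch of the documented two-step workflow through the same Python entry point (file name and library paths are placeholders):

from nominatim_db.cli import nominatim

common = dict(module_dir='/usr/local/lib/nominatim/module',
              osm2pgsql_path='/usr/local/bin/osm2pgsql')

# Step 1: load the OSM data; step 2: integrate it into the search index.
nominatim(cli_args=['add-data', '--file', 'changes.osc.gz'], **common)
nominatim(cli_args=['index'], **common)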
123
src/nominatim_db/clicmd/admin.py
Normal file
@@ -0,0 +1,123 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Implementation of the 'admin' subcommand.
"""
import logging
import argparse
import random

import nominatim_api as napi
from nominatim_core.db.connection import connect
from .args import NominatimArgs

# Do not repeat documentation of subcommand classes.
# pylint: disable=C0111
# Using non-top-level imports to avoid loading modules that may not be needed.
# pylint: disable=E0012,C0415

LOG = logging.getLogger()


class AdminFuncs:
    """\
    Analyse and maintain the database.
    """

    def add_args(self, parser: argparse.ArgumentParser) -> None:
        group = parser.add_argument_group('Admin tasks')
        objs = group.add_mutually_exclusive_group(required=True)
        objs.add_argument('--warm', action='store_true',
                          help='Warm database caches for search and reverse queries')
        objs.add_argument('--check-database', action='store_true',
                          help='Check that the database is complete and operational')
        objs.add_argument('--migrate', action='store_true',
                          help='Migrate the database to a new software version')
        objs.add_argument('--analyse-indexing', action='store_true',
                          help='Print performance analysis of the indexing process')
        objs.add_argument('--collect-os-info', action="store_true",
                          help="Generate a report about the host system information")
        objs.add_argument('--clean-deleted', action='store', metavar='AGE',
                          help='Clean up deleted relations')
        group = parser.add_argument_group('Arguments for cache warming')
        group.add_argument('--search-only', action='store_const', dest='target',
                           const='search',
                           help="Only pre-warm tables for search queries")
        group.add_argument('--reverse-only', action='store_const', dest='target',
                           const='reverse',
                           help="Only pre-warm tables for reverse queries")
        group = parser.add_argument_group('Arguments for index analysis')
        mgroup = group.add_mutually_exclusive_group()
        mgroup.add_argument('--osm-id', type=str,
                            help='Analyse indexing of the given OSM object')
        mgroup.add_argument('--place-id', type=int,
                            help='Analyse indexing of the given Nominatim object')


    def run(self, args: NominatimArgs) -> int:
        # pylint: disable=too-many-return-statements
        if args.warm:
            return self._warm(args)

        if args.check_database:
            LOG.warning('Checking database')
            from ..tools import check_database
            return check_database.check_database(args.config)

        if args.analyse_indexing:
            LOG.warning('Analysing performance of indexing function')
            from ..tools import admin
            admin.analyse_indexing(args.config, osm_id=args.osm_id, place_id=args.place_id)
            return 0

        if args.migrate:
            LOG.warning('Checking for necessary database migrations')
            from ..tools import migration
            return migration.migrate(args.config, args)

        if args.collect_os_info:
            LOG.warning("Reporting System Information")
            from ..tools import collect_os_info
            collect_os_info.report_system_information(args.config)
            return 0

        if args.clean_deleted:
            LOG.warning('Cleaning up deleted relations')
            from ..tools import admin
            admin.clean_deleted_relations(args.config, age=args.clean_deleted)
            return 0

        return 1


    def _warm(self, args: NominatimArgs) -> int:
        LOG.warning('Warming database caches')

        api = napi.NominatimAPI(args.project_dir)

        try:
            if args.target != 'search':
                for _ in range(1000):
                    api.reverse((random.uniform(-90, 90), random.uniform(-180, 180)),
                                address_details=True)

            if args.target != 'reverse':
                from ..tokenizer import factory as tokenizer_factory

                tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)
                with connect(args.config.get_libpq_dsn()) as conn:
                    if conn.table_exists('search_name'):
                        words = tokenizer.most_frequent_words(conn, 1000)
                    else:
                        words = []

                for word in words:
                    api.search(word)
        finally:
            api.close()

        return 0
374
src/nominatim_db/clicmd/api.py
Normal file
@@ -0,0 +1,374 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Subcommand definitions for API calls from the command line.
"""
from typing import Dict, Any
import argparse
import logging
import json
import sys

import nominatim_api as napi
import nominatim_api.v1 as api_output
from nominatim_api.v1.helpers import zoom_to_rank, deduplicate_results
from nominatim_api.v1.format import dispatch as formatting
import nominatim_api.logging as loglib
from .args import NominatimArgs

# Do not repeat documentation of subcommand classes.
# pylint: disable=C0111

LOG = logging.getLogger()

STRUCTURED_QUERY = (
    ('amenity', 'name and/or type of POI'),
    ('street', 'housenumber and street'),
    ('city', 'city, town or village'),
    ('county', 'county'),
    ('state', 'state'),
    ('country', 'country'),
    ('postalcode', 'postcode')
)

EXTRADATA_PARAMS = (
    ('addressdetails', 'Include a breakdown of the address into elements'),
    ('extratags', ("Include additional information if available "
                   "(e.g. wikipedia link, opening hours)")),
    ('namedetails', 'Include a list of alternative names')
)

def _add_api_output_arguments(parser: argparse.ArgumentParser) -> None:
    group = parser.add_argument_group('Output arguments')
    group.add_argument('--format', default='jsonv2',
                       choices=formatting.list_formats(napi.SearchResults) + ['debug'],
                       help='Format of result')
    for name, desc in EXTRADATA_PARAMS:
        group.add_argument('--' + name, action='store_true', help=desc)

    group.add_argument('--lang', '--accept-language', metavar='LANGS',
                       help='Preferred language order for presenting search results')
    group.add_argument('--polygon-output',
                       choices=['geojson', 'kml', 'svg', 'text'],
                       help='Output geometry of results as a GeoJSON, KML, SVG or WKT')
    group.add_argument('--polygon-threshold', type=float, default=0.0,
                       metavar='TOLERANCE',
                       help=("Simplify output geometry. "
                             "Parameter is difference tolerance in degrees."))


class APISearch:
    """\
    Execute a search query.

    This command works exactly the same as if calling the /search endpoint on
    the web API. See the online documentation for more details on the
    various parameters:
    https://nominatim.org/release-docs/latest/api/Search/
    """

    def add_args(self, parser: argparse.ArgumentParser) -> None:
        group = parser.add_argument_group('Query arguments')
        group.add_argument('--query',
                           help='Free-form query string')
        for name, desc in STRUCTURED_QUERY:
            group.add_argument('--' + name, help='Structured query: ' + desc)

        _add_api_output_arguments(parser)

        group = parser.add_argument_group('Result limitation')
        group.add_argument('--countrycodes', metavar='CC,..',
                           help='Limit search results to one or more countries')
        group.add_argument('--exclude_place_ids', metavar='ID,..',
                           help='List of search objects to be excluded')
        group.add_argument('--limit', type=int, default=10,
                           help='Limit the number of returned results')
        group.add_argument('--viewbox', metavar='X1,Y1,X2,Y2',
                           help='Preferred area to find search results')
        group.add_argument('--bounded', action='store_true',
                           help='Strictly restrict results to viewbox area')

        group = parser.add_argument_group('Other arguments')
        group.add_argument('--no-dedupe', action='store_false', dest='dedupe',
                           help='Do not remove duplicates from the result list')


    def run(self, args: NominatimArgs) -> int:
        if args.format == 'debug':
            loglib.set_log_output('text')

        api = napi.NominatimAPI(args.project_dir)

        params: Dict[str, Any] = {'max_results': args.limit + min(args.limit, 10),
                                  'address_details': True,  # needed for display name
                                  'geometry_output': args.get_geometry_output(),
                                  'geometry_simplification': args.polygon_threshold,
                                  'countries': args.countrycodes,
                                  'excluded': args.exclude_place_ids,
                                  'viewbox': args.viewbox,
                                  'bounded_viewbox': args.bounded,
                                  'locales': args.get_locales(api.config.DEFAULT_LANGUAGE)
                                  }

        if args.query:
            results = api.search(args.query, **params)
        else:
            results = api.search_address(amenity=args.amenity,
                                         street=args.street,
                                         city=args.city,
                                         county=args.county,
                                         state=args.state,
                                         postalcode=args.postalcode,
                                         country=args.country,
                                         **params)

        if args.dedupe and len(results) > 1:
            results = deduplicate_results(results, args.limit)

        if args.format == 'debug':
            print(loglib.get_and_disable())
            return 0

        output = api_output.format_result(
            results,
            args.format,
            {'extratags': args.extratags,
             'namedetails': args.namedetails,
             'addressdetails': args.addressdetails})
        if args.format != 'xml':
            # reformat the result, so it is pretty-printed
            json.dump(json.loads(output), sys.stdout, indent=4, ensure_ascii=False)
        else:
            sys.stdout.write(output)
        sys.stdout.write('\n')

        return 0

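The same search flow in library form, for comparison (a minimal sketch; project directory and query are placeholders):

import nominatim_api as napi
from nominatim_api.v1.helpers import deduplicate_results

api = napi.NominatimAPI('.')
try:
    results = api.search('Eiffel Tower', max_results=20, address_details=True)
    results = deduplicate_results(results, 10)
    for result in results:
        print(result.place_id)
finally:
    api.close()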
class APIReverse:
    """\
    Execute API reverse query.

    This command works exactly the same as if calling the /reverse endpoint on
    the web API. See the online documentation for more details on the
    various parameters:
    https://nominatim.org/release-docs/latest/api/Reverse/
    """

    def add_args(self, parser: argparse.ArgumentParser) -> None:
        group = parser.add_argument_group('Query arguments')
        group.add_argument('--lat', type=float, required=True,
                           help='Latitude of coordinate to look up (in WGS84)')
        group.add_argument('--lon', type=float, required=True,
                           help='Longitude of coordinate to look up (in WGS84)')
        group.add_argument('--zoom', type=int,
                           help='Level of detail required for the address')
        group.add_argument('--layer', metavar='LAYER',
                           choices=[n.name.lower() for n in napi.DataLayer if n.name],
                           action='append', required=False, dest='layers',
                           help='Restrict results to the given layer(s) (may be repeated)')

        _add_api_output_arguments(parser)


    def run(self, args: NominatimArgs) -> int:
        if args.format == 'debug':
            loglib.set_log_output('text')

        api = napi.NominatimAPI(args.project_dir)

        result = api.reverse(napi.Point(args.lon, args.lat),
                             max_rank=zoom_to_rank(args.zoom or 18),
                             layers=args.get_layers(napi.DataLayer.ADDRESS | napi.DataLayer.POI),
                             address_details=True,  # needed for display name
                             geometry_output=args.get_geometry_output(),
                             geometry_simplification=args.polygon_threshold,
                             locales=args.get_locales(api.config.DEFAULT_LANGUAGE))

        if args.format == 'debug':
            print(loglib.get_and_disable())
            return 0

        if result:
            output = api_output.format_result(
                napi.ReverseResults([result]),
                args.format,
                {'extratags': args.extratags,
                 'namedetails': args.namedetails,
                 'addressdetails': args.addressdetails})
            if args.format != 'xml':
                # reformat the result, so it is pretty-printed
                json.dump(json.loads(output), sys.stdout, indent=4, ensure_ascii=False)
            else:
                sys.stdout.write(output)
            sys.stdout.write('\n')

            return 0

        LOG.error("Unable to geocode.")
        return 42


class APILookup:
    """\
    Execute API lookup query.

    This command works exactly the same as if calling the /lookup endpoint on
    the web API. See the online documentation for more details on the
    various parameters:
    https://nominatim.org/release-docs/latest/api/Lookup/
    """

    def add_args(self, parser: argparse.ArgumentParser) -> None:
        group = parser.add_argument_group('Query arguments')
        group.add_argument('--id', metavar='OSMID',
                           action='append', required=True, dest='ids',
                           help='OSM id to lookup in format <NRW><id> (may be repeated)')

        _add_api_output_arguments(parser)


    def run(self, args: NominatimArgs) -> int:
        if args.format == 'debug':
            loglib.set_log_output('text')

        api = napi.NominatimAPI(args.project_dir)

        places = [napi.OsmID(o[0], int(o[1:])) for o in args.ids]

        results = api.lookup(places,
                             address_details=True,  # needed for display name
                             geometry_output=args.get_geometry_output(),
                             geometry_simplification=args.polygon_threshold or 0.0,
                             locales=args.get_locales(api.config.DEFAULT_LANGUAGE))

        # Print the debug log only after the query has actually run.
        if args.format == 'debug':
            print(loglib.get_and_disable())
            return 0

        output = api_output.format_result(
            results,
            args.format,
            {'extratags': args.extratags,
             'namedetails': args.namedetails,
             'addressdetails': args.addressdetails})
        if args.format != 'xml':
            # reformat the result, so it is pretty-printed
            json.dump(json.loads(output), sys.stdout, indent=4, ensure_ascii=False)
        else:
            sys.stdout.write(output)
        sys.stdout.write('\n')

        return 0


class APIDetails:
    """\
    Execute API details query.

    This command works exactly the same as if calling the /details endpoint on
    the web API. See the online documentation for more details on the
    various parameters:
    https://nominatim.org/release-docs/latest/api/Details/
    """

    def add_args(self, parser: argparse.ArgumentParser) -> None:
        group = parser.add_argument_group('Query arguments')
        objs = group.add_mutually_exclusive_group(required=True)
        objs.add_argument('--node', '-n', type=int,
                          help="Look up the OSM node with the given ID.")
        objs.add_argument('--way', '-w', type=int,
                          help="Look up the OSM way with the given ID.")
        objs.add_argument('--relation', '-r', type=int,
                          help="Look up the OSM relation with the given ID.")
        objs.add_argument('--place_id', '-p', type=int,
                          help='Database internal identifier of the OSM object to look up')
        group.add_argument('--class', dest='object_class',
                           help=("Class type to disambiguate multiple entries "
                                 "of the same object."))

        group = parser.add_argument_group('Output arguments')
        group.add_argument('--addressdetails', action='store_true',
                           help='Include a breakdown of the address into elements')
        group.add_argument('--keywords', action='store_true',
                           help='Include a list of name keywords and address keywords')
        group.add_argument('--linkedplaces', action='store_true',
                           help='Include details of places that are linked with this one')
        group.add_argument('--hierarchy', action='store_true',
                           help='Include details of places lower in the address hierarchy')
        group.add_argument('--group_hierarchy', action='store_true',
                           help='Group the places by type')
        group.add_argument('--polygon_geojson', action='store_true',
                           help='Include geometry of result')
        group.add_argument('--lang', '--accept-language', metavar='LANGS',
                           help='Preferred language order for presenting search results')


    def run(self, args: NominatimArgs) -> int:
        place: napi.PlaceRef
        if args.node:
            place = napi.OsmID('N', args.node, args.object_class)
        elif args.way:
            place = napi.OsmID('W', args.way, args.object_class)
        elif args.relation:
            place = napi.OsmID('R', args.relation, args.object_class)
        else:
            assert args.place_id is not None
            place = napi.PlaceID(args.place_id)

        api = napi.NominatimAPI(args.project_dir)

        locales = args.get_locales(api.config.DEFAULT_LANGUAGE)
        result = api.details(place,
                             address_details=args.addressdetails,
                             linked_places=args.linkedplaces,
                             parented_places=args.hierarchy,
                             keywords=args.keywords,
                             geometry_output=napi.GeometryFormat.GEOJSON
                                             if args.polygon_geojson
                                             else napi.GeometryFormat.NONE,
                             locales=locales)


        if result:
            output = api_output.format_result(
                result,
                'json',
                {'locales': locales,
                 'group_hierarchy': args.group_hierarchy})
            # reformat the result, so it is pretty-printed
            json.dump(json.loads(output), sys.stdout, indent=4, ensure_ascii=False)
            sys.stdout.write('\n')

            return 0

        LOG.error("Object not found in database.")
        return 42


class APIStatus:
    """
    Execute API status query.

    This command works exactly the same as if calling the /status endpoint on
    the web API. See the online documentation for more details on the
    various parameters:
    https://nominatim.org/release-docs/latest/api/Status/
    """

    def add_args(self, parser: argparse.ArgumentParser) -> None:
        formats = api_output.list_formats(napi.StatusResult)
        group = parser.add_argument_group('API parameters')
        group.add_argument('--format', default=formats[0], choices=formats,
                           help='Format of result')


    def run(self, args: NominatimArgs) -> int:
        status = napi.NominatimAPI(args.project_dir).status()
        print(api_output.format_result(status, args.format, {}))
        return 0
260
src/nominatim_db/clicmd/args.py
Normal file
@@ -0,0 +1,260 @@
|
||||
# SPDX-License-Identifier: GPL-3.0-or-later
|
||||
#
|
||||
# This file is part of Nominatim. (https://nominatim.org)
|
||||
#
|
||||
# Copyright (C) 2024 by the Nominatim developer community.
|
||||
# For a full list of authors see the git log.
|
||||
"""
|
||||
Provides custom functions over command-line arguments.
|
||||
"""
|
||||
from typing import Optional, List, Dict, Any, Sequence, Tuple
|
||||
import argparse
|
||||
import logging
|
||||
from functools import reduce
|
||||
from pathlib import Path
|
||||
|
||||
from nominatim_core.errors import UsageError
|
||||
from nominatim_core.config import Configuration
|
||||
from nominatim_core.typing import Protocol
|
||||
import nominatim_api as napi
|
||||
|
||||
LOG = logging.getLogger()
|
||||
|
||||
class Subcommand(Protocol):
|
||||
"""
|
||||
Interface to be implemented by classes implementing a CLI subcommand.
|
||||
"""
|
||||
|
||||
def add_args(self, parser: argparse.ArgumentParser) -> None:
|
||||
"""
|
||||
Fill the given parser for the subcommand with the appropriate
|
||||
parameters.
|
||||
"""
|
||||
|
||||
def run(self, args: 'NominatimArgs') -> int:
|
||||
"""
|
||||
Run the subcommand with the given parsed arguments.
|
||||
"""
|
||||
|
||||
|
||||
class NominatimArgs:
|
||||
""" Customized namespace class for the nominatim command line tool
|
||||
to receive the command-line arguments.
|
||||
"""
|
||||
# Basic environment set by root program.
|
||||
config: Configuration
|
||||
project_dir: Path
|
||||
|
||||
# Global switches
|
||||
version: bool
|
||||
subcommand: Optional[str]
|
||||
command: Subcommand
|
||||
|
||||
# Shared parameters
|
||||
osm2pgsql_cache: Optional[int]
|
||||
socket_timeout: int
|
||||
|
||||
# Arguments added to all subcommands.
|
||||
verbose: int
|
||||
threads: Optional[int]
|
||||
|
||||
# Arguments to 'add-data'
|
||||
file: Optional[str]
|
||||
diff: Optional[str]
|
||||
node: Optional[int]
|
||||
way: Optional[int]
|
||||
relation: Optional[int]
|
||||
tiger_data: Optional[str]
|
||||
use_main_api: bool
|
||||
|
||||
# Arguments to 'admin'
|
||||
warm: bool
|
||||
check_database: bool
|
||||
migrate: bool
|
||||
collect_os_info: bool
|
||||
clean_deleted: str
|
||||
analyse_indexing: bool
|
||||
target: Optional[str]
|
||||
osm_id: Optional[str]
|
||||
place_id: Optional[int]
|
||||
|
||||
# Arguments to 'import'
|
||||
osm_file: List[str]
|
||||
continue_at: Optional[str]
|
||||
reverse_only: bool
|
||||
no_partitions: bool
|
||||
no_updates: bool
|
||||
offline: bool
|
||||
ignore_errors: bool
|
||||
index_noanalyse: bool
|
||||
prepare_database: bool
|
||||
|
||||
# Arguments to 'index'
|
||||
boundaries_only: bool
|
||||
no_boundaries: bool
|
||||
minrank: int
|
||||
maxrank: int
|
||||
|
||||
# Arguments to 'export'
|
||||
output_type: str
|
||||
output_format: str
|
||||
output_all_postcodes: bool
|
||||
language: Optional[str]
|
||||
restrict_to_country: Optional[str]
|
||||
|
||||
# Arguments to 'convert'
|
||||
output: Path
|
||||
|
||||
# Arguments to 'refresh'
|
||||
postcodes: bool
|
||||
word_tokens: bool
|
||||
word_counts: bool
|
||||
address_levels: bool
|
||||
functions: bool
|
||||
wiki_data: bool
|
||||
secondary_importance: bool
|
||||
importance: bool
|
||||
website: bool
|
||||
diffs: bool
|
||||
enable_debug_statements: bool
|
||||
    data_object: Sequence[Tuple[str, int]]
    data_area: Sequence[Tuple[str, int]]

    # Arguments to 'replication'
    init: bool
    update_functions: bool
    check_for_updates: bool
    once: bool
    catch_up: bool
    do_index: bool

    # Arguments to 'serve'
    server: str
    engine: str

    # Arguments to 'special-phrases'
    import_from_wiki: bool
    import_from_csv: Optional[str]
    no_replace: bool

    # Arguments to all query functions
    format: str
    addressdetails: bool
    extratags: bool
    namedetails: bool
    lang: Optional[str]
    polygon_output: Optional[str]
    polygon_threshold: Optional[float]

    # Arguments to 'search'
    query: Optional[str]
    amenity: Optional[str]
    street: Optional[str]
    city: Optional[str]
    county: Optional[str]
    state: Optional[str]
    country: Optional[str]
    postalcode: Optional[str]
    countrycodes: Optional[str]
    exclude_place_ids: Optional[str]
    limit: int
    viewbox: Optional[str]
    bounded: bool
    dedupe: bool

    # Arguments to 'reverse'
    lat: float
    lon: float
    zoom: Optional[int]
    layers: Optional[Sequence[str]]

    # Arguments to 'lookup'
    ids: Sequence[str]

    # Arguments to 'details'
    object_class: Optional[str]
    linkedplaces: bool
    hierarchy: bool
    keywords: bool
    polygon_geojson: bool
    group_hierarchy: bool


    def osm2pgsql_options(self, default_cache: int,
                          default_threads: int) -> Dict[str, Any]:
        """ Return the standard osm2pgsql options that can be derived
            from the command line arguments. The resulting dict can be
            further customized and then used in `run_osm2pgsql()`.
        """
        return dict(osm2pgsql=self.config.OSM2PGSQL_BINARY or self.config.lib_dir.osm2pgsql,
                    osm2pgsql_cache=self.osm2pgsql_cache or default_cache,
                    osm2pgsql_style=self.config.get_import_style_file(),
                    osm2pgsql_style_path=self.config.config_dir,
                    threads=self.threads or default_threads,
                    dsn=self.config.get_libpq_dsn(),
                    flatnode_file=str(self.config.get_path('FLATNODE_FILE') or ''),
                    tablespaces=dict(slim_data=self.config.TABLESPACE_OSM_DATA,
                                     slim_index=self.config.TABLESPACE_OSM_INDEX,
                                     main_data=self.config.TABLESPACE_PLACE_DATA,
                                     main_index=self.config.TABLESPACE_PLACE_INDEX
                                     )
                    )


    def get_osm_file_list(self) -> Optional[List[Path]]:
        """ Return the --osm-file argument as a list of Paths or None
            if no argument was given. The function also checks if the files
            exist and raises a UsageError if one cannot be found.
        """
        if not self.osm_file:
            return None

        files = [Path(f) for f in self.osm_file]
        for fname in files:
            if not fname.is_file():
                LOG.fatal("OSM file '%s' does not exist.", fname)
                raise UsageError('Cannot access file.')

        return files


    def get_geometry_output(self) -> napi.GeometryFormat:
        """ Get the requested geometry output format in an API-compatible
            format.
        """
        if not self.polygon_output:
            return napi.GeometryFormat.NONE
        if self.polygon_output == 'geojson':
            return napi.GeometryFormat.GEOJSON
        if self.polygon_output == 'kml':
            return napi.GeometryFormat.KML
        if self.polygon_output == 'svg':
            return napi.GeometryFormat.SVG
        if self.polygon_output == 'text':
            return napi.GeometryFormat.TEXT

        try:
            return napi.GeometryFormat[self.polygon_output.upper()]
        except KeyError as exp:
            raise UsageError(f"Unknown polygon output format '{self.polygon_output}'.") from exp


    def get_locales(self, default: Optional[str]) -> napi.Locales:
        """ Get the locales from the language parameter.
        """
        if self.lang:
            return napi.Locales.from_accept_languages(self.lang)
        if default:
            return napi.Locales.from_accept_languages(default)

        return napi.Locales()


    def get_layers(self, default: napi.DataLayer) -> Optional[napi.DataLayer]:
        """ Get the list of selected layers as a DataLayer enum.
        """
        if not self.layers:
            return default

        return reduce(napi.DataLayer.__or__,
                      (napi.DataLayer[s.upper()] for s in self.layers))
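
For illustration (not part of this commit): get_layers above folds the repeated --layers values into one combined flag. A minimal, self-contained sketch of that folding step; the DataLayer enum here is a stand-in for napi.DataLayer, not the real class:

from enum import Flag, auto
from functools import reduce

class DataLayer(Flag):          # stand-in for napi.DataLayer
    ADDRESS = auto()
    POI = auto()
    RAILWAY = auto()

selected = ['address', 'poi']   # as parsed from --layers
combined = reduce(DataLayer.__or__, (DataLayer[s.upper()] for s in selected))
print(combined)                 # e.g. DataLayer.ADDRESS|POI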
95
src/nominatim_db/clicmd/convert.py
Normal file
@@ -0,0 +1,95 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Implementation of the 'convert' subcommand.
"""
from typing import Set, Any, Union, Optional, Sequence
import argparse
import asyncio
from pathlib import Path

from nominatim_core.errors import UsageError
from .args import NominatimArgs

# Do not repeat documentation of subcommand classes.
# pylint: disable=C0111
# Using non-top-level imports to avoid eventually unused imports.
# pylint: disable=E0012,C0415

class WithAction(argparse.Action):
    """ Special action that saves a list of flags, given on the command-line
        as `--with-foo` or `--without-foo`.
    """
    def __init__(self, option_strings: Sequence[str], dest: Any,
                 default: bool = True, **kwargs: Any) -> None:
        if 'nargs' in kwargs:
            raise ValueError("nargs not allowed.")
        if option_strings is None:
            raise ValueError("Positional parameter not allowed.")

        self.dest_set = kwargs.pop('dest_set')
        full_option_strings = []
        for opt in option_strings:
            if not opt.startswith('--'):
                raise ValueError("short-form options not allowed")
            if default:
                self.dest_set.add(opt[2:])
            full_option_strings.append(f"--with-{opt[2:]}")
            full_option_strings.append(f"--without-{opt[2:]}")

        super().__init__(full_option_strings, argparse.SUPPRESS, nargs=0, **kwargs)


    def __call__(self, parser: argparse.ArgumentParser, namespace: argparse.Namespace,
                 values: Union[str, Sequence[Any], None],
                 option_string: Optional[str] = None) -> None:
        assert option_string
        if option_string.startswith('--with-'):
            self.dest_set.add(option_string[7:])
        if option_string.startswith('--without-'):
            self.dest_set.discard(option_string[10:])


class ConvertDB:
    """ Convert an existing database into a different format. (EXPERIMENTAL)

        Dump a read-only version of the database in a different format.
        At the moment only a SQLite database suitable for reverse lookup
        can be created.
    """

    def __init__(self) -> None:
        self.options: Set[str] = set()

    def add_args(self, parser: argparse.ArgumentParser) -> None:
        parser.add_argument('--format', default='sqlite',
                            choices=('sqlite', ),
                            help='Format of the output database (must be sqlite currently)')
        parser.add_argument('--output', '-o', required=True, type=Path,
                            help='File to write the database to.')
        group = parser.add_argument_group('Switches to define database layout '
                                          '(currently no effect)')
        group.add_argument('--reverse', action=WithAction, dest_set=self.options, default=True,
                           help='Enable/disable support for reverse and lookup API'
                                ' (default: enabled)')
        group.add_argument('--search', action=WithAction, dest_set=self.options, default=False,
                           help='Enable/disable support for search API (default: disabled)')
        group.add_argument('--details', action=WithAction, dest_set=self.options, default=True,
                           help='Enable/disable support for details API (default: enabled)')


    def run(self, args: NominatimArgs) -> int:
        if args.output.exists():
            raise UsageError(f"File '{args.output}' already exists. Refusing to overwrite.")

        if args.format == 'sqlite':
            from ..tools import convert_sqlite

            asyncio.run(convert_sqlite.convert(args.project_dir, args.output, self.options))
            return 0

        return 1
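
For illustration (not part of this commit), a minimal sketch of how WithAction behaves, assuming the class above is in scope: declaring --reverse registers a --with-reverse/--without-reverse pair that adds to or removes from the shared set.

import argparse

options = set()
parser = argparse.ArgumentParser()
parser.add_argument('--reverse', action=WithAction, dest_set=options, default=True)

parser.parse_args([])                     # options == {'reverse'} (added by default)
parser.parse_args(['--without-reverse'])  # 'reverse' is discarded from the set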
200
src/nominatim_db/clicmd/export.py
Normal file
@@ -0,0 +1,200 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Implementation of the 'export' subcommand.
"""
from typing import Optional, List, cast
import logging
import argparse
import asyncio
import csv
import sys

import sqlalchemy as sa

import nominatim_api as napi
from nominatim_api.results import create_from_placex_row, ReverseResult, add_result_details
from nominatim_api.types import LookupDetails
from nominatim_core.errors import UsageError
from .args import NominatimArgs

# Do not repeat documentation of subcommand classes.
# pylint: disable=C0111
# Using non-top-level imports to avoid eventually unused imports.
# pylint: disable=E0012,C0415
# Needed for SQLAlchemy
# pylint: disable=singleton-comparison

LOG = logging.getLogger()

RANK_RANGE_MAP = {
    'country': (4, 4),
    'state': (5, 9),
    'county': (10, 12),
    'city': (13, 16),
    'suburb': (17, 21),
    'street': (26, 26),
    'path': (27, 27)
}

RANK_TO_OUTPUT_MAP = {
    4: 'country',
    5: 'state', 6: 'state', 7: 'state', 8: 'state', 9: 'state',
    10: 'county', 11: 'county', 12: 'county',
    13: 'city', 14: 'city', 15: 'city', 16: 'city',
    17: 'suburb', 18: 'suburb', 19: 'suburb', 20: 'suburb', 21: 'suburb',
    26: 'street', 27: 'path'}

class QueryExport:
    """\
    Export places as CSV file from the database.
    """

    def add_args(self, parser: argparse.ArgumentParser) -> None:
        group = parser.add_argument_group('Output arguments')
        group.add_argument('--output-type', default='street',
                           choices=('country', 'state', 'county',
                                    'city', 'suburb', 'street', 'path'),
                           help='Type of places to output (default: street)')
        group.add_argument('--output-format',
                           default='street;suburb;city;county;state;country',
                           help=("Semicolon-separated list of address types "
                                 "(see --output-type). Additionally accepts: "
                                 "placeid,postcode"))
        group.add_argument('--language',
                           help=("Preferred language for output "
                                 "(use local name, if omitted)"))
        group = parser.add_argument_group('Filter arguments')
        group.add_argument('--restrict-to-country', metavar='COUNTRY_CODE',
                           help='Export only objects within country')
        group.add_argument('--restrict-to-osm-node', metavar='ID', type=int,
                           dest='node',
                           help='Export only children of this OSM node')
        group.add_argument('--restrict-to-osm-way', metavar='ID', type=int,
                           dest='way',
                           help='Export only children of this OSM way')
        group.add_argument('--restrict-to-osm-relation', metavar='ID', type=int,
                           dest='relation',
                           help='Export only children of this OSM relation')


    def run(self, args: NominatimArgs) -> int:
        return asyncio.run(export(args))


async def export(args: NominatimArgs) -> int:
    """ The actual export as an asynchronous function.
    """

    api = napi.NominatimAPIAsync(args.project_dir)

    try:
        output_range = RANK_RANGE_MAP[args.output_type]

        writer = init_csv_writer(args.output_format)

        async with api.begin() as conn, api.begin() as detail_conn:
            t = conn.t.placex

            sql = sa.select(t.c.place_id, t.c.parent_place_id,
                            t.c.osm_type, t.c.osm_id, t.c.name,
                            t.c.class_, t.c.type, t.c.admin_level,
                            t.c.address, t.c.extratags,
                            t.c.housenumber, t.c.postcode, t.c.country_code,
                            t.c.importance, t.c.wikipedia, t.c.indexed_date,
                            t.c.rank_address, t.c.rank_search,
                            t.c.centroid)\
                    .where(t.c.linked_place_id == None)\
                    .where(t.c.rank_address.between(*output_range))

            parent_place_id = await get_parent_id(conn, args.node, args.way, args.relation)
            if parent_place_id:
                taddr = conn.t.addressline

                sql = sql.join(taddr, taddr.c.place_id == t.c.place_id)\
                         .where(taddr.c.address_place_id == parent_place_id)\
                         .where(taddr.c.isaddress)

            if args.restrict_to_country:
                sql = sql.where(t.c.country_code == args.restrict_to_country.lower())

            results = []
            for row in await conn.execute(sql):
                result = create_from_placex_row(row, ReverseResult)
                if result is not None:
                    results.append(result)

                if len(results) == 1000:
                    await dump_results(detail_conn, results, writer, args.language)
                    results = []

            if results:
                await dump_results(detail_conn, results, writer, args.language)
    finally:
        await api.close()

    return 0


def init_csv_writer(output_format: str) -> 'csv.DictWriter[str]':
    fields = output_format.split(';')
    writer = csv.DictWriter(sys.stdout, fieldnames=fields, extrasaction='ignore')
    writer.writeheader()

    return writer


async def dump_results(conn: napi.SearchConnection,
                       results: List[ReverseResult],
                       writer: 'csv.DictWriter[str]',
                       lang: Optional[str]) -> None:
    locale = napi.Locales([lang] if lang else None)
    await add_result_details(conn, results,
                             LookupDetails(address_details=True, locales=locale))

    for result in results:
        data = {'placeid': result.place_id,
                'postcode': result.postcode}

        for line in (result.address_rows or []):
            if line.isaddress and line.local_name:
                if line.category[1] == 'postcode':
                    data['postcode'] = line.local_name
                elif line.rank_address in RANK_TO_OUTPUT_MAP:
                    data[RANK_TO_OUTPUT_MAP[line.rank_address]] = line.local_name

        writer.writerow(data)


async def get_parent_id(conn: napi.SearchConnection, node_id: Optional[int],
                        way_id: Optional[int],
                        relation_id: Optional[int]) -> Optional[int]:
    """ Get the place ID for the given OSM object.
    """
    if node_id is not None:
        osm_type, osm_id = 'N', node_id
    elif way_id is not None:
        osm_type, osm_id = 'W', way_id
    elif relation_id is not None:
        osm_type, osm_id = 'R', relation_id
    else:
        return None

    t = conn.t.placex
    sql = sa.select(t.c.place_id).limit(1)\
            .where(t.c.osm_type == osm_type)\
            .where(t.c.osm_id == osm_id)\
            .where(t.c.rank_address > 0)\
            .order_by(t.c.rank_address)

    for result in await conn.execute(sql):
        return cast(int, result[0])

    raise UsageError(f'Cannot find a place {osm_type}{osm_id}.')
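
For illustration (not part of this commit): because init_csv_writer creates the DictWriter with extrasaction='ignore', address parts not listed in --output-format are silently dropped from each row. A self-contained sketch:

import csv
import sys

writer = csv.DictWriter(sys.stdout, fieldnames=['street', 'city', 'postcode'],
                        extrasaction='ignore')
writer.writeheader()
writer.writerow({'street': 'Main St', 'city': 'Springfield',
                 'postcode': '12345', 'suburb': 'dropped silently'})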
43
src/nominatim_db/clicmd/freeze.py
Normal file
@@ -0,0 +1,43 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Implementation of the 'freeze' subcommand.
"""
import argparse

from nominatim_core.db.connection import connect
from .args import NominatimArgs

# Do not repeat documentation of subcommand classes.
# pylint: disable=C0111
# Using non-top-level imports to avoid eventually unused imports.
# pylint: disable=E0012,C0415

class SetupFreeze:
    """\
    Make database read-only.

    About half of the data in the Nominatim database is kept only to be able
    to keep the data up-to-date with new changes made in OpenStreetMap. This
    command drops all this data and only keeps the part needed for geocoding
    itself.

    This command has the same effect as the `--no-updates` option for imports.
    """

    def add_args(self, parser: argparse.ArgumentParser) -> None:
        pass # No options


    def run(self, args: NominatimArgs) -> int:
        from ..tools import freeze

        with connect(args.config.get_libpq_dsn()) as conn:
            freeze.drop_update_tables(conn)
        freeze.drop_flatnode_file(args.config.get_path('FLATNODE_FILE'))

        return 0
66
src/nominatim_db/clicmd/index.py
Normal file
@@ -0,0 +1,66 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Implementation of the 'index' subcommand.
"""
import argparse

import psutil

from nominatim_core.db import status
from nominatim_core.db.connection import connect
from .args import NominatimArgs

# Do not repeat documentation of subcommand classes.
# pylint: disable=C0111
# Using non-top-level imports to avoid eventually unused imports.
# pylint: disable=E0012,C0415


class UpdateIndex:
    """\
    Reindex all new and modified data.

    Indexing is the process of computing the address and search terms for
    the places in the database. Every time data is added or changed, indexing
    needs to be run. Imports and replication updates automatically take care
    of indexing. For other cases, this function allows you to run indexing
    manually.
    """

    def add_args(self, parser: argparse.ArgumentParser) -> None:
        group = parser.add_argument_group('Filter arguments')
        group.add_argument('--boundaries-only', action='store_true',
                           help="""Index only administrative boundaries.""")
        group.add_argument('--no-boundaries', action='store_true',
                           help="""Index everything except administrative boundaries.""")
        group.add_argument('--minrank', '-r', type=int, metavar='RANK', default=0,
                           help='Minimum/starting rank')
        group.add_argument('--maxrank', '-R', type=int, metavar='RANK', default=30,
                           help='Maximum/finishing rank')


    def run(self, args: NominatimArgs) -> int:
        from ..indexer.indexer import Indexer
        from ..tokenizer import factory as tokenizer_factory

        tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)

        indexer = Indexer(args.config.get_libpq_dsn(), tokenizer,
                          args.threads or psutil.cpu_count() or 1)

        if not args.no_boundaries:
            indexer.index_boundaries(args.minrank, args.maxrank)
        if not args.boundaries_only:
            indexer.index_by_rank(args.minrank, args.maxrank)
            indexer.index_postcodes()

        if not args.no_boundaries and not args.boundaries_only \
           and args.minrank == 0 and args.maxrank == 30:
            with connect(args.config.get_libpq_dsn()) as conn:
                status.set_indexed(conn, True)

        return 0
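
For illustration (not part of this commit): the final status.set_indexed call above only fires for an unfiltered run. The guard condition can be read as a small predicate:

def is_full_run(no_boundaries: bool, boundaries_only: bool,
                minrank: int, maxrank: int) -> bool:
    # True only when both object kinds were indexed over the full rank range.
    return not no_boundaries and not boundaries_only \
           and minrank == 0 and maxrank == 30

assert is_full_run(False, False, 0, 30)
assert not is_full_run(False, True, 0, 30)   # --boundaries-only
assert not is_full_run(False, False, 5, 30)  # --minrank 5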
187
src/nominatim_db/clicmd/refresh.py
Normal file
@@ -0,0 +1,187 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Implementation of the 'refresh' subcommand.
"""
from typing import Tuple, Optional
import argparse
import logging
from pathlib import Path

from nominatim_core.config import Configuration
from nominatim_core.db.connection import connect
from ..tokenizer.base import AbstractTokenizer
from .args import NominatimArgs

# Do not repeat documentation of subcommand classes.
# pylint: disable=C0111
# Using non-top-level imports to avoid eventually unused imports.
# pylint: disable=E0012,C0415

LOG = logging.getLogger()

def _parse_osm_object(obj: str) -> Tuple[str, int]:
    """ Parse the given argument into a tuple of OSM type and ID.
        Raises an ArgumentError if the format is not recognized.
    """
    if len(obj) < 2 or obj[0].lower() not in 'nrw' or not obj[1:].isdigit():
        raise argparse.ArgumentTypeError("Cannot parse OSM ID. Expect format: [N|W|R]<id>.")

    return (obj[0].upper(), int(obj[1:]))


class UpdateRefresh:
    """\
    Recompute auxiliary data used by the indexing process.

    This sub-command updates various static data and functions in the database.
    It usually needs to be run after changing various aspects of the
    configuration. The configuration documentation will mention the exact
    command to use in such cases.

    Warning: the 'update' command must not be run in parallel with other update
    commands like 'replication' or 'add-data'.
    """
    def __init__(self) -> None:
        self.tokenizer: Optional[AbstractTokenizer] = None

    def add_args(self, parser: argparse.ArgumentParser) -> None:
        group = parser.add_argument_group('Data arguments')
        group.add_argument('--postcodes', action='store_true',
                           help='Update postcode centroid table')
        group.add_argument('--word-tokens', action='store_true',
                           help='Clean up search terms')
        group.add_argument('--word-counts', action='store_true',
                           help='Compute frequency of full-word search terms')
        group.add_argument('--address-levels', action='store_true',
                           help='Reimport address level configuration')
        group.add_argument('--functions', action='store_true',
                           help='Update the PL/pgSQL functions in the database')
        group.add_argument('--wiki-data', action='store_true',
                           help='Update Wikipedia/data importance numbers')
        group.add_argument('--secondary-importance', action='store_true',
                           help='Update secondary importance raster data')
        group.add_argument('--importance', action='store_true',
                           help='Recompute place importances (expensive!)')
        group.add_argument('--website', action='store_true',
                           help='Refresh the directory that serves the scripts for the web API')
        group.add_argument('--data-object', action='append',
                           type=_parse_osm_object, metavar='OBJECT',
                           help='Mark the given OSM object as requiring an update'
                                ' (format: [NWR]<id>)')
        group.add_argument('--data-area', action='append',
                           type=_parse_osm_object, metavar='OBJECT',
                           help='Mark the area around the given OSM object as requiring an update'
                                ' (format: [NWR]<id>)')

        group = parser.add_argument_group('Arguments for function refresh')
        group.add_argument('--no-diff-updates', action='store_false', dest='diffs',
                           help='Do not enable code for propagating updates')
        group.add_argument('--enable-debug-statements', action='store_true',
                           help='Enable debug warning statements in functions')


    def run(self, args: NominatimArgs) -> int: #pylint: disable=too-many-branches, too-many-statements
        from ..tools import refresh, postcodes
        from ..indexer.indexer import Indexer

        need_function_refresh = args.functions

        if args.postcodes:
            if postcodes.can_compute(args.config.get_libpq_dsn()):
                LOG.warning("Update postcodes centroid")
                tokenizer = self._get_tokenizer(args.config)
                postcodes.update_postcodes(args.config.get_libpq_dsn(),
                                           args.project_dir, tokenizer)
                indexer = Indexer(args.config.get_libpq_dsn(), tokenizer,
                                  args.threads or 1)
                indexer.index_postcodes()
            else:
                LOG.error("The place table doesn't exist. "
                          "Postcode updates on a frozen database are not possible.")

        if args.word_tokens:
            LOG.warning('Updating word tokens')
            tokenizer = self._get_tokenizer(args.config)
            tokenizer.update_word_tokens()

        if args.word_counts:
            LOG.warning('Recompute word statistics')
            self._get_tokenizer(args.config).update_statistics(args.config,
                                                               threads=args.threads or 1)

        if args.address_levels:
            LOG.warning('Updating address levels')
            with connect(args.config.get_libpq_dsn()) as conn:
                refresh.load_address_levels_from_config(conn, args.config)

        # Attention: must come BEFORE functions
        if args.secondary_importance:
            with connect(args.config.get_libpq_dsn()) as conn:
                # If the table did not exist before, then the importance code
                # needs to be enabled.
                if not conn.table_exists('secondary_importance'):
                    args.functions = True

            LOG.warning('Import secondary importance raster data from %s', args.project_dir)
            if refresh.import_secondary_importance(args.config.get_libpq_dsn(),
                                                   args.project_dir) > 0:
                LOG.fatal('FATAL: Cannot update secondary importance raster data')
                return 1
            need_function_refresh = True

        if args.wiki_data:
            data_path = Path(args.config.WIKIPEDIA_DATA_PATH
                             or args.project_dir)
            LOG.warning('Import wikipedia article importance from %s', data_path)
            if refresh.import_wikipedia_articles(args.config.get_libpq_dsn(),
                                                 data_path) > 0:
                LOG.fatal('FATAL: Wikipedia importance file not found in %s', data_path)
                return 1
            need_function_refresh = True

        if need_function_refresh:
            LOG.warning('Create functions')
            with connect(args.config.get_libpq_dsn()) as conn:
                refresh.create_functions(conn, args.config,
                                         args.diffs, args.enable_debug_statements)
                self._get_tokenizer(args.config).update_sql_functions(args.config)

        # Attention: importance MUST come after wiki data import and after functions.
        if args.importance:
            LOG.warning('Update importance values for database')
            with connect(args.config.get_libpq_dsn()) as conn:
                refresh.recompute_importance(conn)

        if args.website:
            webdir = args.project_dir / 'website'
            LOG.warning('Setting up website directory at %s', webdir)
            # This is a little bit hacky: call the tokenizer setup, so that
            # the tokenizer directory gets repopulated as well, in case it
            # wasn't there yet.
            self._get_tokenizer(args.config)
            with connect(args.config.get_libpq_dsn()) as conn:
                refresh.setup_website(webdir, args.config, conn)

        if args.data_object or args.data_area:
            with connect(args.config.get_libpq_dsn()) as conn:
                for obj in args.data_object or []:
                    refresh.invalidate_osm_object(*obj, conn, recursive=False)
                for obj in args.data_area or []:
                    refresh.invalidate_osm_object(*obj, conn, recursive=True)
                conn.commit()

        return 0


    def _get_tokenizer(self, config: Configuration) -> AbstractTokenizer:
        if self.tokenizer is None:
            from ..tokenizer import factory as tokenizer_factory

            self.tokenizer = tokenizer_factory.get_tokenizer_for_db(config)

        return self.tokenizer
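
For illustration (not part of this commit), the accepted formats for --data-object/--data-area values, assuming the module-level _parse_osm_object above is in scope:

assert _parse_osm_object('N123') == ('N', 123)
assert _parse_osm_object('w42') == ('W', 42)   # type letter is case-insensitive
# Anything else, e.g. 'X99' or a bare 'N', raises argparse.ArgumentTypeError.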
200
src/nominatim_db/clicmd/replication.py
Normal file
@@ -0,0 +1,200 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Implementation of the 'replication' sub-command.
"""
from typing import Optional
import argparse
import datetime as dt
import logging
import socket
import time

from nominatim_core.db import status
from nominatim_core.db.connection import connect
from nominatim_core.errors import UsageError
from .args import NominatimArgs

LOG = logging.getLogger()

# Do not repeat documentation of subcommand classes.
# pylint: disable=C0111
# Using non-top-level imports to make pyosmium optional for replication only.
# pylint: disable=C0415

class UpdateReplication:
    """\
    Update the database using an online replication service.

    An OSM replication service is an online service that provides regular
    updates (OSM diff files) for the planet or for the extracts they provide.
    The OSMF provides the primary replication service for the full planet at
    https://planet.osm.org/replication/ but there are other providers of
    extracts of OSM data who provide such a service as well.

    This sub-command allows you to set up such a replication service and
    download and import updates at regular intervals. You need to call
    '--init' once to set up the process or whenever you change the replication
    configuration parameters. Without any arguments, the sub-command will go
    into a loop and continuously apply updates as they become available.
    Giving `--once` just downloads and imports the next batch of updates.
    """

    def add_args(self, parser: argparse.ArgumentParser) -> None:
        group = parser.add_argument_group('Arguments for initialisation')
        group.add_argument('--init', action='store_true',
                           help='Initialise the update process')
        group.add_argument('--no-update-functions', dest='update_functions',
                           action='store_false',
                           help="Do not update the trigger function to "
                                "support differential updates (EXPERT)")
        group = parser.add_argument_group('Arguments for updates')
        group.add_argument('--check-for-updates', action='store_true',
                           help='Check if new updates are available and exit')
        group.add_argument('--once', action='store_true',
                           help="Download and apply updates only once. When "
                                "not set, updates are continuously applied")
        group.add_argument('--catch-up', action='store_true',
                           help="Download and apply updates until no new "
                                "data is available on the server")
        group.add_argument('--no-index', action='store_false', dest='do_index',
                           help=("Do not index the new data. Only usable "
                                 "together with --once"))
        group.add_argument('--osm2pgsql-cache', metavar='SIZE', type=int,
                           help='Size of cache to be used by osm2pgsql (in MB)')
        group = parser.add_argument_group('Download parameters')
        group.add_argument('--socket-timeout', dest='socket_timeout', type=int, default=60,
                           help='Set timeout for file downloads')


    def _init_replication(self, args: NominatimArgs) -> int:
        from ..tools import replication, refresh

        LOG.warning("Initialising replication updates")
        with connect(args.config.get_libpq_dsn()) as conn:
            replication.init_replication(conn, base_url=args.config.REPLICATION_URL,
                                         socket_timeout=args.socket_timeout)
            if args.update_functions:
                LOG.warning("Create functions")
                refresh.create_functions(conn, args.config, True, False)
        return 0


    def _check_for_updates(self, args: NominatimArgs) -> int:
        from ..tools import replication

        with connect(args.config.get_libpq_dsn()) as conn:
            return replication.check_for_updates(conn, base_url=args.config.REPLICATION_URL,
                                                 socket_timeout=args.socket_timeout)


    def _report_update(self, batchdate: dt.datetime,
                       start_import: dt.datetime,
                       start_index: Optional[dt.datetime]) -> None:
        def round_time(delta: dt.timedelta) -> dt.timedelta:
            return dt.timedelta(seconds=int(delta.total_seconds()))

        end = dt.datetime.now(dt.timezone.utc)
        LOG.warning("Update completed. Import: %s. %sTotal: %s. Remaining backlog: %s.",
                    round_time((start_index or end) - start_import),
                    f"Indexing: {round_time(end - start_index)} " if start_index else '',
                    round_time(end - start_import),
                    round_time(end - batchdate))


    def _compute_update_interval(self, args: NominatimArgs) -> int:
        if args.catch_up:
            return 0

        update_interval = args.config.get_int('REPLICATION_UPDATE_INTERVAL')
        # Sanity check to not overwhelm the Geofabrik servers.
        if 'download.geofabrik.de' in args.config.REPLICATION_URL\
           and update_interval < 86400:
            LOG.fatal("Update interval too low for download.geofabrik.de.\n"
                      "Please check install documentation "
                      "(https://nominatim.org/release-docs/latest/admin/Import-and-Update#"
                      "setting-up-the-update-process).")
            raise UsageError("Invalid replication update interval setting.")

        return update_interval


    def _update(self, args: NominatimArgs) -> None:
        # pylint: disable=too-many-locals
        from ..tools import replication
        from ..indexer.indexer import Indexer
        from ..tokenizer import factory as tokenizer_factory

        update_interval = self._compute_update_interval(args)

        params = args.osm2pgsql_options(default_cache=2000, default_threads=1)
        params.update(base_url=args.config.REPLICATION_URL,
                      update_interval=update_interval,
                      import_file=args.project_dir / 'osmosischange.osc',
                      max_diff_size=args.config.get_int('REPLICATION_MAX_DIFF'),
                      indexed_only=not args.once)

        if not args.once:
            if not args.do_index:
                LOG.fatal("Indexing cannot be disabled when running updates continuously.")
                raise UsageError("Bad argument '--no-index'.")
            recheck_interval = args.config.get_int('REPLICATION_RECHECK_INTERVAL')

        tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)
        indexer = Indexer(args.config.get_libpq_dsn(), tokenizer, args.threads or 1)

        dsn = args.config.get_libpq_dsn()

        while True:
            start = dt.datetime.now(dt.timezone.utc)
            state = replication.update(dsn, params, socket_timeout=args.socket_timeout)

            with connect(dsn) as conn:
                if state is not replication.UpdateState.NO_CHANGES:
                    status.log_status(conn, start, 'import')
                batchdate, _, _ = status.get_status(conn)
                conn.commit()

            if state is not replication.UpdateState.NO_CHANGES and args.do_index:
                index_start = dt.datetime.now(dt.timezone.utc)
                indexer.index_full(analyse=False)

                with connect(dsn) as conn:
                    status.set_indexed(conn, True)
                    status.log_status(conn, index_start, 'index')
                    conn.commit()
            else:
                index_start = None

            if state is replication.UpdateState.NO_CHANGES and \
               args.catch_up or update_interval > 40*60:
                while indexer.has_pending():
                    indexer.index_full(analyse=False)

            if LOG.isEnabledFor(logging.WARNING):
                assert batchdate is not None
                self._report_update(batchdate, start, index_start)

            if args.once or (args.catch_up and state is replication.UpdateState.NO_CHANGES):
                break

            if state is replication.UpdateState.NO_CHANGES:
                LOG.warning("No new changes. Sleeping for %d sec.", recheck_interval)
                time.sleep(recheck_interval)


    def run(self, args: NominatimArgs) -> int:
        socket.setdefaulttimeout(args.socket_timeout)

        if args.init:
            return self._init_replication(args)

        if args.check_for_updates:
            return self._check_for_updates(args)

        self._update(args)
        return 0
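
For illustration (not part of this commit): _report_update trims all reported timedeltas to whole seconds via its local round_time helper. A self-contained sketch of that rounding:

import datetime as dt

def round_time(delta: dt.timedelta) -> dt.timedelta:
    # drop sub-second precision before logging
    return dt.timedelta(seconds=int(delta.total_seconds()))

print(round_time(dt.timedelta(seconds=12.7)))                  # 0:00:12
print(round_time(dt.timedelta(minutes=3, microseconds=500)))   # 0:03:00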
229
src/nominatim_db/clicmd/setup.py
Normal file
@@ -0,0 +1,229 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Implementation of the 'import' subcommand.
"""
from typing import Optional
import argparse
import logging
from pathlib import Path

import psutil

from nominatim_core.errors import UsageError
from nominatim_core.config import Configuration
from nominatim_core.db.connection import connect
from nominatim_core.db import status, properties
from ..tokenizer.base import AbstractTokenizer
from ..version import NOMINATIM_VERSION
from .args import NominatimArgs

# Do not repeat documentation of subcommand classes.
# pylint: disable=C0111
# Using non-top-level imports to avoid eventually unused imports.
# pylint: disable=C0415

LOG = logging.getLogger()

class SetupAll:
    """\
    Create a new Nominatim database from an OSM file.

    This sub-command sets up a new Nominatim database from scratch starting
    with creating a new database in PostgreSQL. The user running this command
    needs superuser rights on the database.
    """

    def add_args(self, parser: argparse.ArgumentParser) -> None:
        group1 = parser.add_argument_group('Required arguments')
        group1.add_argument('--osm-file', metavar='FILE', action='append',
                            help='OSM file to be imported'
                                 ' (repeat for importing multiple files)',
                            default=None)
        group1.add_argument('--continue', dest='continue_at',
                            choices=['import-from-file', 'load-data', 'indexing', 'db-postprocess'],
                            help='Continue an import that was interrupted',
                            default=None)
        group2 = parser.add_argument_group('Optional arguments')
        group2.add_argument('--osm2pgsql-cache', metavar='SIZE', type=int,
                            help='Size of cache to be used by osm2pgsql (in MB)')
        group2.add_argument('--reverse-only', action='store_true',
                            help='Do not create tables and indexes for searching')
        group2.add_argument('--no-partitions', action='store_true',
                            help=("Do not partition search indices "
                                  "(speeds up import of single country extracts)"))
        group2.add_argument('--no-updates', action='store_true',
                            help="Do not keep tables that are only needed for "
                                 "updating the database later")
        group2.add_argument('--offline', action='store_true',
                            help="Do not attempt to load any additional data from the internet")
        group3 = parser.add_argument_group('Expert options')
        group3.add_argument('--ignore-errors', action='store_true',
                            help='Continue import even when errors in SQL are present')
        group3.add_argument('--index-noanalyse', action='store_true',
                            help='Do not perform analyse operations during index (expert only)')
        group3.add_argument('--prepare-database', action='store_true',
                            help='Create the database but do not import any data')


    def run(self, args: NominatimArgs) -> int: # pylint: disable=too-many-statements, too-many-branches
        from ..data import country_info
        from ..tools import database_import, refresh, postcodes, freeze
        from ..indexer.indexer import Indexer

        num_threads = args.threads or psutil.cpu_count() or 1

        country_info.setup_country_config(args.config)

        if args.osm_file is None and args.continue_at is None and not args.prepare_database:
            raise UsageError("No input files (use --osm-file).")

        if args.osm_file is not None and args.continue_at not in ('import-from-file', None):
            raise UsageError(f"Cannot use --continue {args.continue_at} and --osm-file together.")

        if args.continue_at is not None and args.prepare_database:
            raise UsageError(
                "Cannot use --continue and --prepare-database together."
            )


        if args.prepare_database or args.continue_at is None:
            LOG.warning('Creating database')
            database_import.setup_database_skeleton(args.config.get_libpq_dsn(),
                                                    rouser=args.config.DATABASE_WEBUSER)
            if args.prepare_database:
                return 0

        if args.continue_at in (None, 'import-from-file'):
            files = args.get_osm_file_list()
            if not files:
                raise UsageError("No input files (use --osm-file).")

            if args.continue_at in ('import-from-file', None):
                # Check if the correct plugins are installed
                database_import.check_existing_database_plugins(args.config.get_libpq_dsn())
                LOG.warning('Setting up country tables')
                country_info.setup_country_tables(args.config.get_libpq_dsn(),
                                                  args.config.lib_dir.data,
                                                  args.no_partitions)

                LOG.warning('Importing OSM data file')
                database_import.import_osm_data(files,
                                                args.osm2pgsql_options(0, 1),
                                                drop=args.no_updates,
                                                ignore_errors=args.ignore_errors)

                LOG.warning('Importing wikipedia importance data')
                data_path = Path(args.config.WIKIPEDIA_DATA_PATH or args.project_dir)
                if refresh.import_wikipedia_articles(args.config.get_libpq_dsn(),
                                                     data_path) > 0:
                    LOG.error('Wikipedia importance dump file not found. '
                              'Calculating importance values of locations will not '
                              'use Wikipedia importance data.')

                LOG.warning('Importing secondary importance raster data')
                if refresh.import_secondary_importance(args.config.get_libpq_dsn(),
                                                       args.project_dir) != 0:
                    LOG.error('Secondary importance file not imported. '
                              'Falling back to default ranking.')

                self._setup_tables(args.config, args.reverse_only)

        if args.continue_at in ('import-from-file', 'load-data', None):
            LOG.warning('Initialise tables')
            with connect(args.config.get_libpq_dsn()) as conn:
                database_import.truncate_data_tables(conn)

            LOG.warning('Load data into placex table')
            database_import.load_data(args.config.get_libpq_dsn(), num_threads)

        LOG.warning("Setting up tokenizer")
        tokenizer = self._get_tokenizer(args.continue_at, args.config)

        if args.continue_at in ('import-from-file', 'load-data', None):
            LOG.warning('Calculate postcodes')
            postcodes.update_postcodes(args.config.get_libpq_dsn(),
                                       args.project_dir, tokenizer)

        if args.continue_at in \
           ('import-from-file', 'load-data', 'indexing', None):
            LOG.warning('Indexing places')
            indexer = Indexer(args.config.get_libpq_dsn(), tokenizer, num_threads)
            indexer.index_full(analyse=not args.index_noanalyse)

        LOG.warning('Post-process tables')
        with connect(args.config.get_libpq_dsn()) as conn:
            database_import.create_search_indices(conn, args.config,
                                                  drop=args.no_updates,
                                                  threads=num_threads)
            LOG.warning('Create search index for default country names.')
            country_info.create_country_names(conn, tokenizer,
                                              args.config.get_str_list('LANGUAGES'))
            if args.no_updates:
                freeze.drop_update_tables(conn)
        tokenizer.finalize_import(args.config)

        LOG.warning('Recompute word counts')
        tokenizer.update_statistics(args.config, threads=num_threads)

        webdir = args.project_dir / 'website'
        LOG.warning('Setup website at %s', webdir)
        with connect(args.config.get_libpq_dsn()) as conn:
            refresh.setup_website(webdir, args.config, conn)

        self._finalize_database(args.config.get_libpq_dsn(), args.offline)

        return 0


    def _setup_tables(self, config: Configuration, reverse_only: bool) -> None:
        """ Set up the basic database layout: tables, indexes and functions.
        """
        from ..tools import database_import, refresh

        with connect(config.get_libpq_dsn()) as conn:
            LOG.warning('Create functions (1st pass)')
            refresh.create_functions(conn, config, False, False)
            LOG.warning('Create tables')
            database_import.create_tables(conn, config, reverse_only=reverse_only)
            refresh.load_address_levels_from_config(conn, config)
            LOG.warning('Create functions (2nd pass)')
            refresh.create_functions(conn, config, False, False)
            LOG.warning('Create table triggers')
            database_import.create_table_triggers(conn, config)
            LOG.warning('Create partition tables')
            database_import.create_partition_tables(conn, config)
            LOG.warning('Create functions (3rd pass)')
            refresh.create_functions(conn, config, False, False)


    def _get_tokenizer(self, continue_at: Optional[str],
                       config: Configuration) -> AbstractTokenizer:
        """ Set up a new tokenizer or load an already initialised one.
        """
        from ..tokenizer import factory as tokenizer_factory

        if continue_at in ('import-from-file', 'load-data', None):
            # (re)initialise the tokenizer data
            return tokenizer_factory.create_tokenizer(config)

        # just load the tokenizer
        return tokenizer_factory.get_tokenizer_for_db(config)


    def _finalize_database(self, dsn: str, offline: bool) -> None:
        """ Determine the database date and set the status accordingly.
        """
        with connect(dsn) as conn:
            properties.set_property(conn, 'database_version', str(NOMINATIM_VERSION))

            try:
                dbdate = status.compute_database_date(conn, offline)
                status.set_status(conn, dbdate)
                LOG.info('Database is at %s.', dbdate)
            except Exception as exc: # pylint: disable=broad-except
                LOG.error('Cannot determine date of database: %s', exc)
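
For illustration (not part of this commit): the repeated `args.continue_at in (...)` checks in run() implement a simple "resume at stage N, run everything from there" rule. A sketch of the same logic as a predicate (stage names as used by --continue; the ordering here is an interpretation of the membership tuples above):

STAGES = ['import-from-file', 'load-data', 'indexing', 'db-postprocess']

def stage_runs(continue_at, stage):
    # None means a fresh import, which runs every stage.
    start = 0 if continue_at is None else STAGES.index(continue_at)
    return STAGES.index(stage) >= start

assert stage_runs(None, 'load-data')
assert stage_runs('load-data', 'indexing')
assert not stage_runs('indexing', 'load-data')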
93
src/nominatim_db/clicmd/special_phrases.py
Normal file
@@ -0,0 +1,93 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Implementation of the 'special-phrases' command.
"""
import argparse
import logging
from pathlib import Path

from nominatim_core.errors import UsageError
from nominatim_core.db.connection import connect
from ..tools.special_phrases.sp_importer import SPImporter, SpecialPhraseLoader
from ..tools.special_phrases.sp_wiki_loader import SPWikiLoader
from ..tools.special_phrases.sp_csv_loader import SPCsvLoader
from .args import NominatimArgs

LOG = logging.getLogger()

# Do not repeat documentation of subcommand classes.
# pylint: disable=C0111
# Using non-top-level imports to avoid eventually unused imports.
# pylint: disable=E0012,C0415

class ImportSpecialPhrases:
    """\
    Import special phrases.

    Special phrases are search terms that narrow down the type of object
    that should be searched. For example, you might want to search for
    'Hotels in Barcelona'. The OSM wiki has a selection of special phrases
    in many languages, which can be imported with this command.

    You can also provide your own phrases in a CSV file. The file needs to have
    the following five columns:
    * phrase - the term expected for searching
    * class - the OSM tag key of the object type
    * type - the OSM tag value of the object type
    * operator - the kind of search to be done (one of: in, near, name, -)
    * plural - whether the term is a plural or not (Y/N)

    An example file can be found in the Nominatim sources at
    'test/testdb/full_en_phrases_test.csv'.

    The import can be further configured to ignore specific key/value pairs.
    This is particularly useful when importing phrases from the wiki. The
    default configuration excludes some very common tags like building=yes.
    The configuration can be customized by putting a file `phrase-settings.json`
    with custom rules into the project directory or by using the `--config`
    option to point to another configuration file.
    """

    def add_args(self, parser: argparse.ArgumentParser) -> None:
        group = parser.add_argument_group('Input arguments')
        group.add_argument('--import-from-wiki', action='store_true',
                           help='Import special phrases from the OSM wiki to the database')
        group.add_argument('--import-from-csv', metavar='FILE',
                           help='Import special phrases from a CSV file')
        group.add_argument('--no-replace', action='store_true',
                           help='Keep the old phrases and only add the new ones')


    def run(self, args: NominatimArgs) -> int:

        if args.import_from_wiki:
            self.start_import(args, SPWikiLoader(args.config))

        if args.import_from_csv:
            if not Path(args.import_from_csv).is_file():
                LOG.fatal("CSV file '%s' does not exist.", args.import_from_csv)
                raise UsageError('Cannot access file.')

            self.start_import(args, SPCsvLoader(args.import_from_csv))

        return 0


    def start_import(self, args: NominatimArgs, loader: SpecialPhraseLoader) -> None:
        """
        Create the SPImporter object containing the right
        sp loader and then start the import of special phrases.
        """
        from ..tokenizer import factory as tokenizer_factory

        tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)
        should_replace = not args.no_replace
        with connect(args.config.get_libpq_dsn()) as db_connection:
            SPImporter(
                args.config, db_connection, loader
            ).import_phrases(tokenizer, should_replace)
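
For illustration (not part of this commit): a phrase file with the five columns described in the docstring, suitable for --import-from-csv (the file name here is hypothetical):

import csv

with open('my_phrases.csv', 'w', newline='') as f:
    w = csv.writer(f)
    w.writerow(['phrase', 'class', 'type', 'operator', 'plural'])
    w.writerow(['hotel', 'tourism', 'hotel', 'in', 'N'])
    w.writerow(['hotels', 'tourism', 'hotel', 'in', 'Y'])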
0
src/nominatim_db/data/__init__.py
Normal file
175
src/nominatim_db/data/country_info.py
Normal file
@@ -0,0 +1,175 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Functions for importing and managing static country information.
"""
from typing import Dict, Any, Iterable, Tuple, Optional, Container, overload
from pathlib import Path
import psycopg2.extras

from nominatim_core.db import utils as db_utils
from nominatim_core.db.connection import connect, Connection
from nominatim_core.errors import UsageError
from nominatim_core.config import Configuration
from ..tokenizer.base import AbstractTokenizer

def _flatten_name_list(names: Any) -> Dict[str, str]:
    if names is None:
        return {}

    if not isinstance(names, dict):
        raise UsageError("Expected key-value list for names in country_settings.yaml")

    flat = {}
    for prefix, remain in names.items():
        if isinstance(remain, str):
            flat[prefix] = remain
        elif not isinstance(remain, dict):
            raise UsageError("Entries in names must be key-value lists.")
        else:
            for suffix, name in remain.items():
                if suffix == 'default':
                    flat[prefix] = name
                else:
                    flat[f'{prefix}:{suffix}'] = name

    return flat


class _CountryInfo:
    """ Caches country-specific properties from the configuration file.
    """

    def __init__(self) -> None:
        self._info: Dict[str, Dict[str, Any]] = {}


    def load(self, config: Configuration) -> None:
        """ Load the country properties from the configuration files,
            if they are not loaded yet.
        """
        if not self._info:
            self._info = config.load_sub_configuration('country_settings.yaml')
            for prop in self._info.values():
                # Convert languages into a list for simpler handling.
                if 'languages' not in prop:
                    prop['languages'] = []
                elif not isinstance(prop['languages'], list):
                    prop['languages'] = [x.strip()
                                         for x in prop['languages'].split(',')]
                prop['names'] = _flatten_name_list(prop.get('names'))


    def items(self) -> Iterable[Tuple[str, Dict[str, Any]]]:
        """ Return tuples of (country_code, property dict) as iterable.
        """
        return self._info.items()

    def get(self, country_code: str) -> Dict[str, Any]:
        """ Get country information for the country with the given country code.
        """
        return self._info.get(country_code, {})


_COUNTRY_INFO = _CountryInfo()


def setup_country_config(config: Configuration) -> None:
    """ Load country properties from the configuration file.
        Needs to be called before using any other functions in this
        file.
    """
    _COUNTRY_INFO.load(config)

@overload
def iterate() -> Iterable[Tuple[str, Dict[str, Any]]]:
    ...

@overload
def iterate(prop: str) -> Iterable[Tuple[str, Any]]:
    ...

def iterate(prop: Optional[str] = None) -> Iterable[Tuple[str, Dict[str, Any]]]:
    """ Iterate over country code and properties.

        When `prop` is None, all countries are returned with their complete
        set of properties.

        If `prop` is given, then only countries are returned where the
        given property is set. The second item of the tuple contains only
        the content of the given property.
    """
    if prop is None:
        return _COUNTRY_INFO.items()

    return ((c, p[prop]) for c, p in _COUNTRY_INFO.items() if prop in p)


def setup_country_tables(dsn: str, sql_dir: Path, ignore_partitions: bool = False) -> None:
    """ Create and populate the tables with basic static data that provides
        the background for geocoding. Data is assumed to not yet exist.
    """
    db_utils.execute_file(dsn, sql_dir / 'country_osm_grid.sql.gz')

    params = []
    for ccode, props in _COUNTRY_INFO.items():
        if ccode is not None and props is not None:
            if ignore_partitions:
                partition = 0
            else:
                partition = props.get('partition', 0)
            lang = props['languages'][0] if len(props['languages']) == 1 else None

            params.append((ccode, props['names'], lang, partition))
    with connect(dsn) as conn:
        with conn.cursor() as cur:
            psycopg2.extras.register_hstore(cur)
            cur.execute(
                """ CREATE TABLE public.country_name (
                        country_code character varying(2),
                        name public.hstore,
                        derived_name public.hstore,
                        country_default_language_code text,
                        partition integer
                    ); """)
            cur.execute_values(
                """ INSERT INTO public.country_name
                    (country_code, name, country_default_language_code, partition) VALUES %s
                """, params)
        conn.commit()


def create_country_names(conn: Connection, tokenizer: AbstractTokenizer,
                         languages: Optional[Container[str]] = None) -> None:
    """ Add default country names to search index. `languages` is a comma-
        separated list of language codes as used in OSM. If `languages` is not
        empty then only name translations for the given languages are added
        to the index.
    """
    def _include_key(key: str) -> bool:
        return ':' not in key or not languages or \
               key[key.index(':') + 1:] in languages

    with conn.cursor() as cur:
        psycopg2.extras.register_hstore(cur)
        cur.execute("""SELECT country_code, name FROM country_name
                       WHERE country_code is not null""")

        with tokenizer.name_analyzer() as analyzer:
            for code, name in cur:
                names = {'countrycode': code}

                # country names (only in languages as provided)
                if name:
                    names.update({k : v for k, v in name.items() if _include_key(k)})

                analyzer.add_country_names(code, names)

    conn.commit()
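
For illustration (not part of this commit): how _flatten_name_list above turns the nested name lists from country_settings.yaml into flat OSM-style keys, with 'default' collapsing onto the bare prefix (assumes the function above is in scope):

nested = {'name': {'default': 'Deutschland', 'en': 'Germany'}}
assert _flatten_name_list(nested) == {'name': 'Deutschland', 'name:en': 'Germany'}
assert _flatten_name_list(None) == {}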
86
src/nominatim_db/data/place_info.py
Normal file
@@ -0,0 +1,86 @@
|
||||
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Wrapper around place information the indexer gets from the database and hands to
the tokenizer.
"""
from typing import Optional, Mapping, Any, Tuple


class PlaceInfo:
    """ This data class contains all information the tokenizer can access
        about a place.
    """

    def __init__(self, info: Mapping[str, Any]) -> None:
        self._info = info


    @property
    def name(self) -> Optional[Mapping[str, str]]:
        """ A dictionary with the names of the place. Keys and values represent
            the full key and value of the corresponding OSM tag. Which tags
            are saved as names is determined by the import style.
            The property may be None if the place has no names.
        """
        return self._info.get('name')


    @property
    def address(self) -> Optional[Mapping[str, str]]:
        """ A dictionary with the address elements of the place. The key
            usually corresponds to the suffix part of the key of an OSM
            'addr:*' or 'isin:*' tag. There are also some special keys like
            `country` or `country_code` which merge OSM keys that contain
            the same information. See [Import Styles][1] for details.

            The property may be None if the place has no address information.

            [1]: ../customize/Import-Styles.md
        """
        return self._info.get('address')


    @property
    def country_code(self) -> Optional[str]:
        """ The country code of the country the place is in. Guaranteed
            to be a two-letter lower-case string. If the place is not inside
            any country, the property is set to None.
        """
        return self._info.get('country_code')


    @property
    def rank_address(self) -> int:
        """ The [address rank][1] before any rank correction is applied.

            [1]: ../customize/Ranking.md#address-rank
        """
        return self._info.get('rank_address', 0)


    @property
    def centroid(self) -> Optional[Tuple[float, float]]:
        """ A center point of the place in WGS84. May be None when the
            geometry of the place is unknown.
        """
        x, y = self._info.get('centroid_x'), self._info.get('centroid_y')
        return None if x is None or y is None else (x, y)


    def is_a(self, key: str, value: str) -> bool:
        """ Return True when the place's primary tag corresponds to the given
            key and value.
        """
        return self._info.get('class') == key and self._info.get('type') == value


    def is_country(self) -> bool:
        """ Return True when the place is a valid country boundary.
        """
        return self.rank_address == 4 \
               and self.is_a('boundary', 'administrative') \
               and self.country_code is not None
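For illustration, a minimal sketch of how a tokenizer might consume a PlaceInfo, built from a hand-written info mapping (all values hypothetical; in production the mapping comes straight from a placex row):

```python
info = PlaceInfo({'name': {'name': 'Berlin'},
                  'address': {'country': 'Deutschland'},
                  'country_code': 'de',
                  'rank_address': 4,
                  'class': 'boundary', 'type': 'administrative',
                  'centroid_x': 13.4, 'centroid_y': 52.5})

assert info.is_a('boundary', 'administrative')
assert info.is_country()              # rank 4, admin boundary, country code set
assert info.centroid == (13.4, 52.5)  # (x, y), i.e. (lon, lat) in WGS84
```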
78
src/nominatim_db/data/place_name.py
Normal file
@@ -0,0 +1,78 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Data class for a single name of a place.
"""
from typing import Optional, Dict, Mapping


class PlaceName:
    """ Each name and address part of a place is encapsulated in an object of
        this class. It saves not only the name proper but also describes the
        kind of name with two properties:

        * `kind` describes the name of the OSM key used without any suffixes
          (i.e. the key with the part after the colon removed)
        * `suffix` contains the suffix of the OSM tag, if any. The suffix
          is the part of the key after the first colon.

        In addition to that, a name may have arbitrary additional attributes.
        How attributes are used depends on the sanitizers and token analysers.
        The exception is the 'analyzer' attribute. This attribute determines
        which token analysis module will be used to finalize the treatment of
        names.
    """

    def __init__(self, name: str, kind: str, suffix: Optional[str]):
        self.name = name
        self.kind = kind
        self.suffix = suffix
        self.attr: Dict[str, str] = {}


    def __repr__(self) -> str:
        return f"PlaceName(name={self.name!r},kind={self.kind!r},suffix={self.suffix!r})"


    def clone(self, name: Optional[str] = None,
              kind: Optional[str] = None,
              suffix: Optional[str] = None,
              attr: Optional[Mapping[str, str]] = None) -> 'PlaceName':
        """ Create a deep copy of the place name, optionally with the
            given parameters replaced. In the attribute list only the given
            keys are updated. The list is not replaced completely.
            In particular, the function cannot be used to remove an
            attribute from a place name.
        """
        newobj = PlaceName(name or self.name,
                           kind or self.kind,
                           suffix or self.suffix)

        newobj.attr.update(self.attr)
        if attr:
            newobj.attr.update(attr)

        return newobj


    def set_attr(self, key: str, value: str) -> None:
        """ Add the given property to the name. If the property was already
            set, then the value is overwritten.
        """
        self.attr[key] = value


    def get_attr(self, key: str, default: Optional[str] = None) -> Optional[str]:
        """ Return the given property or the value of 'default' if it
            is not set.
        """
        return self.attr.get(key, default)


    def has_attr(self, key: str) -> bool:
        """ Check if the given attribute is set.
        """
        return key in self.attr
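A short sketch of the clone semantics described in the docstring above (names and attributes are illustrative):

```python
n = PlaceName('Hauptstraße', 'name', 'de')
n.set_attr('analyzer', '@street')

c = n.clone(name='Hauptstrasse', attr={'variant': 'ascii'})
print(c.name, c.kind, c.suffix)   # Hauptstrasse name de
print(c.get_attr('analyzer'))     # @street  -- existing attributes are kept
print(c.get_attr('variant'))      # ascii    -- new keys are merged in
print(n.has_attr('variant'))      # False    -- the original is untouched
```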
114
src/nominatim_db/data/postcode_format.py
Normal file
@@ -0,0 +1,114 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Functions for formatting postcodes according to their country-specific
format.
"""
from typing import Any, Mapping, Optional, Set, Match
import re

from nominatim_core.errors import UsageError
from . import country_info


class CountryPostcodeMatcher:
    """ Matches and formats a postcode according to a format definition
        of the given country.
    """
    def __init__(self, country_code: str, config: Mapping[str, Any]) -> None:
        if 'pattern' not in config:
            raise UsageError("Field 'pattern' required for 'postcode' "
                             f"for country '{country_code}'")

        pc_pattern = config['pattern'].replace('d', '[0-9]').replace('l', '[A-Z]')

        self.norm_pattern = re.compile(f'\\s*(?:{country_code.upper()}[ -]?)?({pc_pattern})\\s*')
        self.pattern = re.compile(pc_pattern)

        self.output = config.get('output', r'\g<0>')


    def match(self, postcode: str) -> Optional[Match[str]]:
        """ Match the given postcode against the postcode pattern for this
            matcher. Returns a `re.Match` object if the match was successful
            and None otherwise.
        """
        # Upper-case, strip spaces and leading country code.
        normalized = self.norm_pattern.fullmatch(postcode.upper())

        if normalized:
            return self.pattern.fullmatch(normalized.group(1))

        return None


    def normalize(self, match: Match[str]) -> str:
        """ Return the default format of the postcode for the given match.
            `match` must be a `re.Match` object previously returned by
            `match()`.
        """
        return match.expand(self.output)


class PostcodeFormatter:
    """ Container for the different postcode formats of the world and
        access functions.
    """
    def __init__(self) -> None:
        # Objects without a country code can't have a postcode by definition.
        self.country_without_postcode: Set[Optional[str]] = {None}
        self.country_matcher = {}
        self.default_matcher = CountryPostcodeMatcher('', {'pattern': '.*'})

        for ccode, prop in country_info.iterate('postcode'):
            if prop is False:
                self.country_without_postcode.add(ccode)
            elif isinstance(prop, dict):
                self.country_matcher[ccode] = CountryPostcodeMatcher(ccode, prop)
            else:
                raise UsageError(f"Invalid entry 'postcode' for country '{ccode}'")


    def set_default_pattern(self, pattern: str) -> None:
        """ Set the postcode match pattern to use when a country does not
            have a specific pattern.
        """
        self.default_matcher = CountryPostcodeMatcher('', {'pattern': pattern})


    def get_matcher(self, country_code: Optional[str]) -> Optional[CountryPostcodeMatcher]:
        """ Return the CountryPostcodeMatcher for the given country.
            Returns None if the country doesn't have a postcode and the
            default matcher if there is no specific matcher configured for
            the country.
        """
        if country_code in self.country_without_postcode:
            return None

        assert country_code is not None

        return self.country_matcher.get(country_code, self.default_matcher)


    def match(self, country_code: Optional[str], postcode: str) -> Optional[Match[str]]:
        """ Match the given postcode against the postcode pattern for the
            given country. Returns a `re.Match` object if the country has a
            pattern and the match was successful or None if the match failed.
        """
        if country_code in self.country_without_postcode:
            return None

        assert country_code is not None

        return self.country_matcher.get(country_code, self.default_matcher).match(postcode)


    def normalize(self, country_code: str, match: Match[str]) -> str:
        """ Return the default format of the postcode for the given match.
            `match` must be a `re.Match` object previously returned by
            `match()`.
        """
        return self.country_matcher.get(country_code, self.default_matcher).normalize(match)
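To make the 'd'/'l' pattern expansion concrete, a small sketch with a hypothetical configuration for five-digit postcodes in Germany:

```python
matcher = CountryPostcodeMatcher('de', {'pattern': 'ddddd'})

m = matcher.match(' DE-12345 ')    # country prefix and whitespace are stripped
print(m.group(0) if m else None)   # 12345
print(matcher.normalize(m))        # 12345 -- default output is the full match

# An 'output' template can reformat matched groups, e.g. a hypothetical
# {'pattern': '(dddd) (ll)', 'output': r'\1 \2'} for Dutch-style postcodes.
```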
0
src/nominatim_db/indexer/__init__.py
Normal file
242
src/nominatim_db/indexer/indexer.py
Normal file
@@ -0,0 +1,242 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Main workhorse for indexing the database, i.e. computing addresses.
"""
from typing import Optional, Any, cast
import logging
import time

import psycopg2.extras

from nominatim_core.typing import DictCursorResults
from nominatim_core.db.async_connection import DBConnection, WorkerPool
from nominatim_core.db.connection import connect, Connection, Cursor
from ..tokenizer.base import AbstractTokenizer
from .progress import ProgressLogger
from . import runners

LOG = logging.getLogger()


class PlaceFetcher:
    """ Asynchronous connection that fetches place details for processing.
    """
    def __init__(self, dsn: str, setup_conn: Connection) -> None:
        self.wait_time = 0.0
        self.current_ids: Optional[DictCursorResults] = None
        self.conn: Optional[DBConnection] = DBConnection(dsn,
                                               cursor_factory=psycopg2.extras.DictCursor)

        with setup_conn.cursor() as cur:
            # Need to fetch those manually because register_hstore cannot
            # fetch them on the asynchronous connection below.
            hstore_oid = cur.scalar("SELECT 'hstore'::regtype::oid")
            hstore_array_oid = cur.scalar("SELECT 'hstore[]'::regtype::oid")

        psycopg2.extras.register_hstore(self.conn.conn, oid=hstore_oid,
                                        array_oid=hstore_array_oid)

    def close(self) -> None:
        """ Close the underlying asynchronous connection.
        """
        if self.conn:
            self.conn.close()
            self.conn = None


    def fetch_next_batch(self, cur: Cursor, runner: runners.Runner) -> bool:
        """ Send a request for the next batch of places.
            If details for the places are required, they will be fetched
            asynchronously.

            Returns True if there is still data available.
        """
        ids = cast(Optional[DictCursorResults], cur.fetchmany(100))

        if not ids:
            self.current_ids = None
            return False

        assert self.conn is not None
        self.current_ids = runner.get_place_details(self.conn, ids)

        return True

    def get_batch(self) -> DictCursorResults:
        """ Get the next batch of data, previously requested with
            `fetch_next_batch`.
        """
        assert self.conn is not None
        assert self.conn.cursor is not None

        if self.current_ids is not None and not self.current_ids:
            tstart = time.time()
            self.conn.wait()
            self.wait_time += time.time() - tstart
            self.current_ids = cast(Optional[DictCursorResults],
                                    self.conn.cursor.fetchall())

        return self.current_ids if self.current_ids is not None else []

    def __enter__(self) -> 'PlaceFetcher':
        return self


    def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None:
        assert self.conn is not None
        self.conn.wait()
        self.close()


class Indexer:
    """ Main indexing routine.
    """

    def __init__(self, dsn: str, tokenizer: AbstractTokenizer, num_threads: int):
        self.dsn = dsn
        self.tokenizer = tokenizer
        self.num_threads = num_threads


    def has_pending(self) -> bool:
        """ Check if any data still needs indexing.
            This function must only be used after the import has finished.
            Otherwise it will be very expensive.
        """
        with connect(self.dsn) as conn:
            with conn.cursor() as cur:
                cur.execute("SELECT 'a' FROM placex WHERE indexed_status > 0 LIMIT 1")
                return cur.rowcount > 0


    def index_full(self, analyse: bool = True) -> None:
        """ Index the complete database. This will first index boundaries
            followed by all other objects. When `analyse` is True, then the
            database will be analysed at the appropriate places to
            ensure that database statistics are updated.
        """
        with connect(self.dsn) as conn:
            conn.autocommit = True

            def _analyze() -> None:
                if analyse:
                    with conn.cursor() as cur:
                        cur.execute('ANALYZE')

            if self.index_by_rank(0, 4) > 0:
                _analyze()

            if self.index_boundaries(0, 30) > 100:
                _analyze()

            if self.index_by_rank(5, 25) > 100:
                _analyze()

            if self.index_by_rank(26, 30) > 1000:
                _analyze()

            if self.index_postcodes() > 100:
                _analyze()


    def index_boundaries(self, minrank: int, maxrank: int) -> int:
        """ Index only administrative boundaries within the given rank range.
        """
        total = 0
        LOG.warning("Starting indexing boundaries using %s threads",
                    self.num_threads)

        with self.tokenizer.name_analyzer() as analyzer:
            for rank in range(max(minrank, 4), min(maxrank, 26)):
                total += self._index(runners.BoundaryRunner(rank, analyzer))

        return total

    def index_by_rank(self, minrank: int, maxrank: int) -> int:
        """ Index all entries of placex in the given rank range (inclusive)
            in order of their address rank.

            When rank 30 is requested, then interpolations and
            places with address rank 0 will be indexed as well.
        """
        total = 0
        maxrank = min(maxrank, 30)
        LOG.warning("Starting indexing rank (%i to %i) using %i threads",
                    minrank, maxrank, self.num_threads)

        with self.tokenizer.name_analyzer() as analyzer:
            for rank in range(max(1, minrank), maxrank + 1):
                total += self._index(runners.RankRunner(rank, analyzer), 20 if rank == 30 else 1)

            if maxrank == 30:
                total += self._index(runners.RankRunner(0, analyzer))
                total += self._index(runners.InterpolationRunner(analyzer), 20)

        return total


    def index_postcodes(self) -> int:
        """ Index the entries of the location_postcode table.
        """
        LOG.warning("Starting indexing postcodes using %s threads", self.num_threads)

        return self._index(runners.PostcodeRunner(), 20)


    def update_status_table(self) -> None:
        """ Update the status in the status table to 'indexed'.
        """
        with connect(self.dsn) as conn:
            with conn.cursor() as cur:
                cur.execute('UPDATE import_status SET indexed = true')

            conn.commit()

    def _index(self, runner: runners.Runner, batch: int = 1) -> int:
        """ Index a single rank or table. `runner` describes the SQL to use
            for indexing. `batch` describes the number of objects that
            should be processed with a single SQL statement.
        """
        LOG.warning("Starting %s (using batch size %s)", runner.name(), batch)

        with connect(self.dsn) as conn:
            psycopg2.extras.register_hstore(conn)
            with conn.cursor() as cur:
                total_tuples = cur.scalar(runner.sql_count_objects())
                LOG.debug("Total number of rows: %i", total_tuples)

            conn.commit()

            progress = ProgressLogger(runner.name(), total_tuples)

            if total_tuples > 0:
                with conn.cursor(name='places') as cur:
                    cur.execute(runner.sql_get_objects())

                    with PlaceFetcher(self.dsn, conn) as fetcher:
                        with WorkerPool(self.dsn, self.num_threads) as pool:
                            has_more = fetcher.fetch_next_batch(cur, runner)
                            while has_more:
                                places = fetcher.get_batch()

                                # asynchronously get the next batch
                                has_more = fetcher.fetch_next_batch(cur, runner)

                                # ... and insert the current batch
                                for idx in range(0, len(places), batch):
                                    part = places[idx:idx + batch]
                                    LOG.debug("Processing places: %s", str(part))
                                    runner.index_places(pool.next_free_worker(), part)
                                    progress.add(len(part))

                            LOG.info("Wait time: fetcher: %.2fs, pool: %.2fs",
                                     fetcher.wait_time, pool.wait_time)

                conn.commit()

        return progress.done()
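The loop in `_index` overlaps database I/O with processing: while one batch is being analysed and written back, the request for the next batch is already in flight. A minimal sketch of that prefetching pattern, detached from the database (`fetch` and `process` are hypothetical stand-ins):

```python
def run(fetch, process):
    # Keep one request in flight while the previous batch is processed.
    pending = fetch()
    while pending is not None:
        current = pending
        pending = fetch()            # asynchronously get the next batch
        process(current)             # ... while handling the current one

batches = iter([[1, 2], [3, 4], [5]])
run(lambda: next(batches, None), lambda b: print('processing', b))
```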
74
src/nominatim_db/indexer/progress.py
Normal file
@@ -0,0 +1,74 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Helpers for progress logging.
"""
import logging
from datetime import datetime

LOG = logging.getLogger()

INITIAL_PROGRESS = 10


class ProgressLogger:
    """ Tracks and prints progress for the indexing process.
        `name` is the name of the indexing step being tracked.
        `total` is the total number of items that need processing.
        `log_interval` denotes the interval in seconds at which progress
        should be reported.
    """

    def __init__(self, name: str, total: int, log_interval: int = 1) -> None:
        self.name = name
        self.total_places = total
        self.done_places = 0
        self.rank_start_time = datetime.now()
        self.log_interval = log_interval
        self.next_info = INITIAL_PROGRESS if LOG.isEnabledFor(logging.WARNING) else total + 1

    def add(self, num: int = 1) -> None:
        """ Mark `num` places as processed. Print a log message if logging
            is enabled at warning level and the log interval has passed.
        """
        self.done_places += num

        if self.done_places < self.next_info:
            return

        now = datetime.now()
        done_time = (now - self.rank_start_time).total_seconds()

        if done_time < 2:
            self.next_info = self.done_places + INITIAL_PROGRESS
            return

        places_per_sec = self.done_places / done_time
        eta = (self.total_places - self.done_places) / places_per_sec

        LOG.warning("Done %d in %d @ %.3f per second - %s ETA (seconds): %.2f",
                    self.done_places, int(done_time),
                    places_per_sec, self.name, eta)

        self.next_info += int(places_per_sec) * self.log_interval

    def done(self) -> int:
        """ Print final statistics about the progress and return the
            number of processed places.
        """
        rank_end_time = datetime.now()

        if rank_end_time == self.rank_start_time:
            diff_seconds = 0.0
            places_per_sec = float(self.done_places)
        else:
            diff_seconds = (rank_end_time - self.rank_start_time).total_seconds()
            places_per_sec = self.done_places / diff_seconds

        LOG.warning("Done %d/%d in %d @ %.3f per second - FINISHED %s\n",
                    self.done_places, self.total_places, int(diff_seconds),
                    places_per_sec, self.name)

        return self.done_places
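A quick usage sketch (numbers are illustrative). Nothing is printed until at least INITIAL_PROGRESS items are done and two seconds have elapsed, so short runs stay quiet:

```python
import logging
logging.basicConfig(level=logging.WARNING)

progress = ProgressLogger('rank 26', total=50000)
for _ in range(50):
    # ... process a batch of 1000 places ...
    progress.add(1000)
progress.done()   # logs "Done 50000/50000 in ... - FINISHED rank 26"
```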
196
src/nominatim_db/indexer/runners.py
Normal file
@@ -0,0 +1,196 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Mix-ins that provide the actual commands for the various indexing tasks
of the indexer.
"""
from typing import Any, List
import functools

from psycopg2 import sql as pysql
import psycopg2.extras

from nominatim_core.typing import Query, DictCursorResult, DictCursorResults, Protocol
from nominatim_core.db.async_connection import DBConnection
from ..data.place_info import PlaceInfo
from ..tokenizer.base import AbstractAnalyzer

# pylint: disable=C0111

def _mk_valuelist(template: str, num: int) -> pysql.Composed:
    return pysql.SQL(',').join([pysql.SQL(template)] * num)

def _analyze_place(place: DictCursorResult, analyzer: AbstractAnalyzer) -> psycopg2.extras.Json:
    return psycopg2.extras.Json(analyzer.process_place(PlaceInfo(place)))


class Runner(Protocol):
    def name(self) -> str: ...
    def sql_count_objects(self) -> Query: ...
    def sql_get_objects(self) -> Query: ...
    def get_place_details(self, worker: DBConnection,
                          ids: DictCursorResults) -> DictCursorResults: ...
    def index_places(self, worker: DBConnection, places: DictCursorResults) -> None: ...


class AbstractPlacexRunner:
    """ Provides the SQL commands for indexing the placex table.
    """
    SELECT_SQL = pysql.SQL('SELECT place_id FROM placex ')
    UPDATE_LINE = "(%s, %s::hstore, %s::hstore, %s::int, %s::jsonb)"

    def __init__(self, rank: int, analyzer: AbstractAnalyzer) -> None:
        self.rank = rank
        self.analyzer = analyzer


    @functools.lru_cache(maxsize=1)
    def _index_sql(self, num_places: int) -> pysql.Composed:
        return pysql.SQL(
            """ UPDATE placex
                SET indexed_status = 0, address = v.addr, token_info = v.ti,
                    name = v.name, linked_place_id = v.linked_place_id
                FROM (VALUES {}) as v(id, name, addr, linked_place_id, ti)
                WHERE place_id = v.id
            """).format(_mk_valuelist(AbstractPlacexRunner.UPDATE_LINE, num_places))


    def get_place_details(self, worker: DBConnection, ids: DictCursorResults) -> DictCursorResults:
        worker.perform("""SELECT place_id, extra.*
                          FROM placex, LATERAL placex_indexing_prepare(placex) as extra
                          WHERE place_id IN %s""",
                       (tuple((p[0] for p in ids)), ))

        return []


    def index_places(self, worker: DBConnection, places: DictCursorResults) -> None:
        values: List[Any] = []
        for place in places:
            for field in ('place_id', 'name', 'address', 'linked_place_id'):
                values.append(place[field])
            values.append(_analyze_place(place, self.analyzer))

        worker.perform(self._index_sql(len(places)), values)


class RankRunner(AbstractPlacexRunner):
    """ Provides the SQL commands for indexing one rank within the placex table.
    """

    def name(self) -> str:
        return f"rank {self.rank}"

    def sql_count_objects(self) -> pysql.Composed:
        return pysql.SQL("""SELECT count(*) FROM placex
                            WHERE rank_address = {} and indexed_status > 0
                         """).format(pysql.Literal(self.rank))

    def sql_get_objects(self) -> pysql.Composed:
        return self.SELECT_SQL + pysql.SQL(
            """WHERE indexed_status > 0 and rank_address = {}
               ORDER BY geometry_sector
            """).format(pysql.Literal(self.rank))


class BoundaryRunner(AbstractPlacexRunner):
    """ Provides the SQL commands for indexing the administrative boundaries
        of a certain rank.
    """

    def name(self) -> str:
        return f"boundaries rank {self.rank}"

    def sql_count_objects(self) -> pysql.Composed:
        return pysql.SQL("""SELECT count(*) FROM placex
                            WHERE indexed_status > 0
                              AND rank_search = {}
                              AND class = 'boundary' and type = 'administrative'
                         """).format(pysql.Literal(self.rank))

    def sql_get_objects(self) -> pysql.Composed:
        return self.SELECT_SQL + pysql.SQL(
            """WHERE indexed_status > 0 and rank_search = {}
                     and class = 'boundary' and type = 'administrative'
               ORDER BY partition, admin_level
            """).format(pysql.Literal(self.rank))


class InterpolationRunner:
    """ Provides the SQL commands for indexing the address interpolation table
        location_property_osmline.
    """

    def __init__(self, analyzer: AbstractAnalyzer) -> None:
        self.analyzer = analyzer


    def name(self) -> str:
        return "interpolation lines (location_property_osmline)"

    def sql_count_objects(self) -> str:
        return """SELECT count(*) FROM location_property_osmline
                  WHERE indexed_status > 0"""

    def sql_get_objects(self) -> str:
        return """SELECT place_id
                  FROM location_property_osmline
                  WHERE indexed_status > 0
                  ORDER BY geometry_sector"""


    def get_place_details(self, worker: DBConnection, ids: DictCursorResults) -> DictCursorResults:
        worker.perform("""SELECT place_id, get_interpolation_address(address, osm_id) as address
                          FROM location_property_osmline WHERE place_id IN %s""",
                       (tuple((p[0] for p in ids)), ))
        return []


    @functools.lru_cache(maxsize=1)
    def _index_sql(self, num_places: int) -> pysql.Composed:
        return pysql.SQL("""UPDATE location_property_osmline
                            SET indexed_status = 0, address = v.addr, token_info = v.ti
                            FROM (VALUES {}) as v(id, addr, ti)
                            WHERE place_id = v.id
                         """).format(_mk_valuelist("(%s, %s::hstore, %s::jsonb)", num_places))


    def index_places(self, worker: DBConnection, places: DictCursorResults) -> None:
        values: List[Any] = []
        for place in places:
            values.extend((place[x] for x in ('place_id', 'address')))
            values.append(_analyze_place(place, self.analyzer))

        worker.perform(self._index_sql(len(places)), values)


class PostcodeRunner(Runner):
    """ Provides the SQL commands for indexing the location_postcode table.
    """

    def name(self) -> str:
        return "postcodes (location_postcode)"


    def sql_count_objects(self) -> str:
        return 'SELECT count(*) FROM location_postcode WHERE indexed_status > 0'


    def sql_get_objects(self) -> str:
        return """SELECT place_id FROM location_postcode
                  WHERE indexed_status > 0
                  ORDER BY country_code, postcode"""


    def get_place_details(self, worker: DBConnection, ids: DictCursorResults) -> DictCursorResults:
        return ids

    def index_places(self, worker: DBConnection, places: DictCursorResults) -> None:
        worker.perform(pysql.SQL("""UPDATE location_postcode SET indexed_status = 0
                                    WHERE place_id IN ({})""")
                       .format(pysql.SQL(',').join((pysql.Literal(i[0]) for i in places))))
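To see what `_mk_valuelist` contributes to the UPDATE statements above, a small sketch (printing the composed object shows its structure; rendering it to a string requires a live psycopg2 connection, so the expected shape is given as a comment):

```python
from psycopg2 import sql as pysql

def _mk_valuelist(template: str, num: int) -> pysql.Composed:
    return pysql.SQL(',').join([pysql.SQL(template)] * num)

# For a batch of 3 places the placex UPDATE receives a VALUES list like
#   (%s, %s::hstore, %s::hstore, %s::int, %s::jsonb),(...),(...)
# and index_places() supplies five parameters per place, in that order.
print(_mk_valuelist("(%s, %s::hstore, %s::hstore, %s::int, %s::jsonb)", 3))
```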
0
src/nominatim_db/tokenizer/__init__.py
Normal file
253
src/nominatim_db/tokenizer/base.py
Normal file
@@ -0,0 +1,253 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Abstract class definitions for tokenizers. These base classes are here
mainly for documentation purposes.
"""
from abc import ABC, abstractmethod
from typing import List, Tuple, Dict, Any, Optional, Iterable
from pathlib import Path

from nominatim_core.typing import Protocol
from nominatim_core.config import Configuration
from nominatim_core.db.connection import Connection
from ..data.place_info import PlaceInfo

class AbstractAnalyzer(ABC):
    """ The analyzer provides the functions for analysing names and building
        the token database.

        Analyzers are instantiated on a per-thread basis. Access to global data
        structures must be synchronised accordingly.
    """

    def __enter__(self) -> 'AbstractAnalyzer':
        return self


    def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None:
        self.close()


    @abstractmethod
    def close(self) -> None:
        """ Free all resources used by the analyzer.
        """


    @abstractmethod
    def get_word_token_info(self, words: List[str]) -> List[Tuple[str, str, int]]:
        """ Return token information for the given list of words.

            The function is used for testing and debugging only
            and does not need to be particularly efficient.

            Arguments:
                words: A list of words to look up the tokens for.
                       If a word starts with #, it is assumed to be a full name,
                       otherwise it is a partial term.

            Returns:
                The function returns the list of all tuples that could be
                found for the given words. Each list entry is a tuple of
                (original word, word token, word id).
        """


    @abstractmethod
    def normalize_postcode(self, postcode: str) -> str:
        """ Convert the postcode to its standardized form.

            This function must yield exactly the same result as the SQL function
            `token_normalized_postcode()`.

            Arguments:
                postcode: The postcode to be normalized.

            Returns:
                The given postcode after normalization.
        """


    @abstractmethod
    def update_postcodes_from_db(self) -> None:
        """ Update the tokenizer's postcode tokens from the current content
            of the `location_postcode` table.
        """


    @abstractmethod
    def update_special_phrases(self,
                               phrases: Iterable[Tuple[str, str, str, str]],
                               should_replace: bool) -> None:
        """ Update the tokenizer's special phrase tokens from the given
            list of special phrases.

            Arguments:
                phrases: The new list of special phrases. Each entry is
                         a tuple of (phrase, class, type, operator).
                should_replace: If true, replace the current list of phrases.
                                When false, just add the given phrases to the
                                ones that already exist.
        """


    @abstractmethod
    def add_country_names(self, country_code: str, names: Dict[str, str]) -> None:
        """ Add the given names to the tokenizer's list of country tokens.

            Arguments:
                country_code: two-letter country code for the country the names
                              refer to.
                names: Dictionary of name type to name.
        """


    @abstractmethod
    def process_place(self, place: PlaceInfo) -> Any:
        """ Extract tokens for the given place and compute the
            information to be handed to the PL/pgSQL processor for building
            the search index.

            Arguments:
                place: Place information retrieved from the database.

            Returns:
                A JSON-serialisable structure that will be handed into
                the database via the `token_info` field.
        """


class AbstractTokenizer(ABC):
    """ The tokenizer is the central instance for name processing in
        the system. There will only be a single instance of the tokenizer
        active at any time.
    """

    @abstractmethod
    def init_new_db(self, config: Configuration, init_db: bool = True) -> None:
        """ Set up a new tokenizer for the database.

            The function should copy all necessary data into the project
            directory or save it in the property table to make sure that
            the tokenizer remains stable over updates.

            Arguments:
                config: Read-only object with configuration options.

                init_db: When set to False, then initialisation of database
                    tables should be skipped. This option is only required for
                    migration purposes and can be safely ignored by custom
                    tokenizers.
        """


    @abstractmethod
    def init_from_project(self, config: Configuration) -> None:
        """ Initialise the tokenizer from an existing database setup.

            The function should load all previously saved configuration from
            the project directory and/or the property table.

            Arguments:
                config: Read-only object with configuration options.
        """


    @abstractmethod
    def finalize_import(self, config: Configuration) -> None:
        """ This function is called at the very end of an import when all
            data has been imported and indexed. The tokenizer may create
            at this point any additional indexes and data structures needed
            during query time.

            Arguments:
                config: Read-only object with configuration options.
        """


    @abstractmethod
    def update_sql_functions(self, config: Configuration) -> None:
        """ Update the SQL part of the tokenizer. This function is called
            automatically on migrations or may be called explicitly by the
            user through the `nominatim refresh --functions` command.

            The tokenizer must only update the code of the tokenizer. The
            data structures or data itself must not be changed by this function.

            Arguments:
                config: Read-only object with configuration options.
        """


    @abstractmethod
    def check_database(self, config: Configuration) -> Optional[str]:
        """ Check that the database is set up correctly and ready for being
            queried.

            Arguments:
                config: Read-only object with configuration options.

            Returns:
                If an issue was found, return an error message with the
                description of the issue as well as hints for the user on
                how to resolve the issue. If everything is okay, return `None`.
        """


    @abstractmethod
    def update_statistics(self, config: Configuration, threads: int = 1) -> None:
        """ Recompute any tokenizer statistics necessary for efficient lookup.
            This function is meant to be called from time to time by the user
            to improve performance. However, the tokenizer must not depend on
            it being called in order to work.
        """


    @abstractmethod
    def update_word_tokens(self) -> None:
        """ Do house-keeping on the tokenizer's internal data structures.
            Remove unused word tokens, resort data, etc.
        """


    @abstractmethod
    def name_analyzer(self) -> AbstractAnalyzer:
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

            ```
            with tokenizer.name_analyzer() as analyzer:
                analyzer.tokenize()
            ```

            When used outside the with construct, the caller must ensure to
            call the close() function before the analyzer is destroyed.
        """


    @abstractmethod
    def most_frequent_words(self, conn: Connection, num: int) -> List[str]:
        """ Return a list of the most frequent full words in the database.

            Arguments:
                conn: Open connection to the database which may be used to
                      retrieve the words.
                num: Maximum number of words to return.
        """


class TokenizerModule(Protocol):
    """ Interface that must be exported by modules that implement their
        own tokenizer.
    """

    def create(self, dsn: str, data_dir: Path) -> AbstractTokenizer:
        """ Factory for new tokenizers.
        """
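A minimal skeleton of a module satisfying the TokenizerModule protocol above (module and class names are hypothetical; only the `create` entry point is mandated by the protocol, and the abstract methods still need real implementations):

```python
# my_tokenizer.py -- hypothetical custom tokenizer module
from pathlib import Path

from nominatim_db.tokenizer.base import AbstractTokenizer


class MyTokenizer(AbstractTokenizer):
    """ Skeleton only: init_new_db, name_analyzer and the other
        abstract methods of AbstractTokenizer must be implemented.
    """
    def __init__(self, dsn: str, data_dir: Path) -> None:
        self.dsn = dsn
        self.data_dir = data_dir


def create(dsn: str, data_dir: Path) -> AbstractTokenizer:
    """ Factory entry point required by the TokenizerModule protocol. """
    return MyTokenizer(dsn, data_dir)
```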
102
src/nominatim_db/tokenizer/factory.py
Normal file
@@ -0,0 +1,102 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Functions for creating a tokenizer or initialising the right one for an
existing database.

A tokenizer is something that is bound to the lifetime of a database. It
can be chosen and configured before the initial import but then needs to
be used consistently when querying and updating the database.

This module provides the functions to create and configure a new tokenizer
as well as to instantiate the appropriate tokenizer for updating an existing
database.

A tokenizer usually also includes PHP code for querying. The appropriate PHP
normalizer module is installed when the tokenizer is created.
"""
from typing import Optional
import logging
import importlib
from pathlib import Path

from nominatim_core.errors import UsageError
from nominatim_core.db import properties
from nominatim_core.db.connection import connect
from nominatim_core.config import Configuration
from ..tokenizer.base import AbstractTokenizer, TokenizerModule

LOG = logging.getLogger()

def _import_tokenizer(name: str) -> TokenizerModule:
    """ Load the tokenizer module with the given name from this package.
    """
    src_file = Path(__file__).parent / (name + '_tokenizer.py')
    if not src_file.is_file():
        LOG.fatal("No tokenizer named '%s' available. "
                  "Check the setting of NOMINATIM_TOKENIZER.", name)
        raise UsageError('Tokenizer not found')

    return importlib.import_module('nominatim_db.tokenizer.' + name + '_tokenizer')


def create_tokenizer(config: Configuration, init_db: bool = True,
                     module_name: Optional[str] = None) -> AbstractTokenizer:
    """ Create a new tokenizer as defined by the given configuration.

        The tokenizer data and code are copied into the 'tokenizer' directory
        of the project directory and the tokenizer is loaded from its new location.
    """
    if module_name is None:
        module_name = config.TOKENIZER

    # Create the directory for the tokenizer data.
    assert config.project_dir is not None
    basedir = config.project_dir / 'tokenizer'
    if not basedir.exists():
        basedir.mkdir()
    elif not basedir.is_dir():
        LOG.fatal("Tokenizer directory '%s' cannot be created.", basedir)
        raise UsageError("Tokenizer setup failed.")

    # Import and initialize the tokenizer.
    tokenizer_module = _import_tokenizer(module_name)

    tokenizer = tokenizer_module.create(config.get_libpq_dsn(), basedir)
    tokenizer.init_new_db(config, init_db=init_db)

    with connect(config.get_libpq_dsn()) as conn:
        properties.set_property(conn, 'tokenizer', module_name)

    return tokenizer


def get_tokenizer_for_db(config: Configuration) -> AbstractTokenizer:
    """ Instantiate a tokenizer for an existing database.

        The function looks up the appropriate tokenizer in the database
        and initialises it.
    """
    assert config.project_dir is not None
    basedir = config.project_dir / 'tokenizer'
    if not basedir.is_dir():
        # Directory will be repopulated by the tokenizer below.
        basedir.mkdir()

    with connect(config.get_libpq_dsn()) as conn:
        name = properties.get_property(conn, 'tokenizer')

    if name is None:
        LOG.fatal("Tokenizer was not set up properly. Database property missing.")
        raise UsageError('Cannot initialize tokenizer.')

    tokenizer_module = _import_tokenizer(name)

    tokenizer = tokenizer_module.create(config.get_libpq_dsn(), basedir)
    tokenizer.init_from_project(config)

    return tokenizer
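The intended call pattern, sketched under the assumption of an already loaded Configuration object; import time and update/query time differ only in the entry point:

```python
# At import time: choose the tokenizer and persist the choice.
tokenizer = create_tokenizer(config)     # uses config.TOKENIZER, e.g. 'icu'

# Later runs against the same database: the stored 'tokenizer' property
# guarantees that exactly the same module is loaded again.
tokenizer = get_tokenizer_for_db(config)
with tokenizer.name_analyzer() as analyzer:
    token_info = analyzer.process_place(place)   # `place` is a PlaceInfo
```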
196
src/nominatim_db/tokenizer/icu_rule_loader.py
Normal file
@@ -0,0 +1,196 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Helper class to create ICU rules from a configuration file.
"""
from typing import Mapping, Any, Dict, Optional
import io
import json
import logging

from icu import Transliterator

from nominatim_core.config import flatten_config_list, Configuration
from nominatim_core.db.properties import set_property, get_property
from nominatim_core.db.connection import Connection
from nominatim_core.errors import UsageError
from .place_sanitizer import PlaceSanitizer
from .icu_token_analysis import ICUTokenAnalysis
from .token_analysis.base import AnalysisModule, Analyzer
from ..data import country_info

LOG = logging.getLogger()

DBCFG_IMPORT_NORM_RULES = "tokenizer_import_normalisation"
DBCFG_IMPORT_TRANS_RULES = "tokenizer_import_transliteration"
DBCFG_IMPORT_ANALYSIS_RULES = "tokenizer_import_analysis_rules"


def _get_section(rules: Mapping[str, Any], section: str) -> Any:
    """ Get the section named 'section' from the rules. If the section does
        not exist, raise a usage error with a meaningful message.
    """
    if section not in rules:
        LOG.fatal("Section '%s' not found in tokenizer config.", section)
        raise UsageError("Syntax error in tokenizer configuration file.")

    return rules[section]


class ICURuleLoader:
    """ Compiler for ICU rules from a tokenizer configuration file.
    """

    def __init__(self, config: Configuration) -> None:
        self.config = config
        rules = config.load_sub_configuration('icu_tokenizer.yaml',
                                              config='TOKENIZER_CONFIG')

        # Make sure country information is available to analyzers and sanitizers.
        country_info.setup_country_config(config)

        self.normalization_rules = self._cfg_to_icu_rules(rules, 'normalization')
        self.transliteration_rules = self._cfg_to_icu_rules(rules, 'transliteration')
        self.analysis_rules = _get_section(rules, 'token-analysis')
        self._setup_analysis()

        # Load optional sanitizer rule set.
        self.sanitizer_rules = rules.get('sanitizers', [])


    def load_config_from_db(self, conn: Connection) -> None:
        """ Get previously saved parts of the configuration from the
            database.
        """
        rules = get_property(conn, DBCFG_IMPORT_NORM_RULES)
        if rules is not None:
            self.normalization_rules = rules

        rules = get_property(conn, DBCFG_IMPORT_TRANS_RULES)
        if rules is not None:
            self.transliteration_rules = rules

        rules = get_property(conn, DBCFG_IMPORT_ANALYSIS_RULES)
        if rules:
            self.analysis_rules = json.loads(rules)
        else:
            self.analysis_rules = []
        self._setup_analysis()


    def save_config_to_db(self, conn: Connection) -> None:
        """ Save the parts of the configuration that must not change
            over the lifetime of the database into the database.
        """
        set_property(conn, DBCFG_IMPORT_NORM_RULES, self.normalization_rules)
        set_property(conn, DBCFG_IMPORT_TRANS_RULES, self.transliteration_rules)
        set_property(conn, DBCFG_IMPORT_ANALYSIS_RULES, json.dumps(self.analysis_rules))


    def make_sanitizer(self) -> PlaceSanitizer:
        """ Create a place sanitizer from the configured rules.
        """
        return PlaceSanitizer(self.sanitizer_rules, self.config)


    def make_token_analysis(self) -> ICUTokenAnalysis:
        """ Create a token analyser from the previously loaded rules.
        """
        return ICUTokenAnalysis(self.normalization_rules,
                                self.transliteration_rules, self.analysis)


    def get_search_rules(self) -> str:
        """ Return the ICU rules to be used during search.
            The rules combine normalization and transliteration.
        """
        # First apply the normalization rules.
        rules = io.StringIO()
        rules.write(self.normalization_rules)

        # Then add transliteration.
        rules.write(self.transliteration_rules)
        return rules.getvalue()


    def get_normalization_rules(self) -> str:
        """ Return rules for normalisation of a term.
        """
        return self.normalization_rules


    def get_transliteration_rules(self) -> str:
        """ Return the rules for converting a string into its ascii representation.
        """
        return self.transliteration_rules


    def _setup_analysis(self) -> None:
        """ Process the rules used for creating the various token analyzers.
        """
        self.analysis: Dict[Optional[str], TokenAnalyzerRule] = {}

        if not isinstance(self.analysis_rules, list):
            raise UsageError("Configuration section 'token-analysis' must be a list.")

        norm = Transliterator.createFromRules("rule_loader_normalization",
                                              self.normalization_rules)
        trans = Transliterator.createFromRules("rule_loader_transliteration",
                                               self.transliteration_rules)

        for section in self.analysis_rules:
            name = section.get('id', None)
            if name in self.analysis:
                if name is None:
                    LOG.fatal("ICU tokenizer configuration has two default token analyzers.")
                else:
                    LOG.fatal("ICU tokenizer configuration has two token "
                              "analyzers with id '%s'.", name)
                raise UsageError("Syntax error in ICU tokenizer config.")
            self.analysis[name] = TokenAnalyzerRule(section, norm, trans,
                                                    self.config)


    @staticmethod
    def _cfg_to_icu_rules(rules: Mapping[str, Any], section: str) -> str:
        """ Load an ICU ruleset from the given section. If the section is a
            simple string, it is interpreted as a file name and the rules are
            loaded verbatim from the given file. The filename is expected to be
            relative to the tokenizer rule file. If the section is a list then
            each line is assumed to be a rule. All rules are concatenated and returned.
        """
        content = _get_section(rules, section)

        if content is None:
            return ''

        return ';'.join(flatten_config_list(content, section)) + ';'


class TokenAnalyzerRule:
    """ Factory for a single analysis module. The class saves the configuration
        and creates a new token analyzer on request.
    """

    def __init__(self, rules: Mapping[str, Any],
                 normalizer: Any, transliterator: Any,
                 config: Configuration) -> None:
        analyzer_name = _get_section(rules, 'analyzer')
        if not analyzer_name or not isinstance(analyzer_name, str):
            raise UsageError("'analyzer' parameter needs to be a simple string")

        self._analysis_mod: AnalysisModule = \
            config.load_plugin_module(analyzer_name, 'nominatim_db.tokenizer.token_analysis')

        self.config = self._analysis_mod.configure(rules, normalizer,
                                                   transliterator)


    def create(self, normalizer: Any, transliterator: Any) -> Analyzer:
        """ Create a new analyser instance for the given rule.
        """
        return self._analysis_mod.create(normalizer, transliterator, self.config)
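How `_cfg_to_icu_rules` turns a configuration section into a single ICU rule string, sketched with a hand-built dict standing in for the parsed YAML (the rules shown are illustrative):

```python
rules = {'normalization': [':: lower ()',
                           ':: NFD ()',
                           '[[:Nonspacing Mark:]] >']}

# flatten_config_list flattens nested lists; the entries are then joined
# with ';' and terminated with ';', the form libICU expects.
print(ICURuleLoader._cfg_to_icu_rules(rules, 'normalization'))
# :: lower ();:: NFD ();[[:Nonspacing Mark:]] >;
```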
43
src/nominatim_db/tokenizer/icu_token_analysis.py
Normal file
@@ -0,0 +1,43 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Container class collecting all components required to transform an OSM name
into a Nominatim token.
"""
from typing import Mapping, Optional, TYPE_CHECKING
from icu import Transliterator

from .token_analysis.base import Analyzer

if TYPE_CHECKING:
    from typing import Any
    from .icu_rule_loader import TokenAnalyzerRule  # pylint: disable=cyclic-import

class ICUTokenAnalysis:
    """ Container class collecting the transliterators and token analysis
        modules for a single Analyser instance.
    """

    def __init__(self, norm_rules: str, trans_rules: str,
                 analysis_rules: Mapping[Optional[str], 'TokenAnalyzerRule']):
        self.normalizer = Transliterator.createFromRules("icu_normalization",
                                                         norm_rules)
        trans_rules += ";[:Space:]+ > ' '"
        self.to_ascii = Transliterator.createFromRules("icu_to_ascii",
                                                       trans_rules)
        self.search = Transliterator.createFromRules("icu_search",
                                                     norm_rules + trans_rules)

        self.analysis = {name: arules.create(self.normalizer, self.to_ascii)
                         for name, arules in analysis_rules.items()}


    def get_analyzer(self, name: Optional[str]) -> Analyzer:
        """ Return the given named analyzer. If no analyzer with that
            name exists, return the default analyzer.
        """
        return self.analysis.get(name) or self.analysis[None]
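The analyzer lookup falls back to the default entry, i.e. the rule configured without an `id`. A tiny sketch of that behaviour with plain strings standing in for Analyzer objects:

```python
analysis = {None: 'default-analyzer', '@housenumber': 'housenumber-analyzer'}

def get_analyzer(name):
    return analysis.get(name) or analysis[None]

print(get_analyzer('@housenumber'))  # housenumber-analyzer
print(get_analyzer('@postcode'))     # default-analyzer -- no such id configured
```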
952
src/nominatim_db/tokenizer/icu_tokenizer.py
Normal file
@@ -0,0 +1,952 @@
# SPDX-License-Identifier: GPL-3.0-or-later
|
||||
#
|
||||
# This file is part of Nominatim. (https://nominatim.org)
|
||||
#
|
||||
# Copyright (C) 2024 by the Nominatim developer community.
|
||||
# For a full list of authors see the git log.
|
||||
"""
|
||||
Tokenizer implementing normalisation as used before Nominatim 4 but using
|
||||
libICU instead of the PostgreSQL module.
|
||||
"""
|
||||
from typing import Optional, Sequence, List, Tuple, Mapping, Any, cast, \
|
||||
Dict, Set, Iterable
|
||||
import itertools
|
||||
import json
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from textwrap import dedent
|
||||
|
||||
from nominatim_core.db.connection import connect, Connection, Cursor
|
||||
from nominatim_core.config import Configuration
|
||||
from nominatim_core.db.utils import CopyBuffer
|
||||
from nominatim_core.db.sql_preprocessor import SQLPreprocessor
|
||||
from ..data.place_info import PlaceInfo
|
||||
from ..data.place_name import PlaceName
|
||||
from .icu_rule_loader import ICURuleLoader
|
||||
from .place_sanitizer import PlaceSanitizer
|
||||
from .icu_token_analysis import ICUTokenAnalysis
|
||||
from .base import AbstractAnalyzer, AbstractTokenizer
|
||||
|
||||
DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"
|
||||
|
||||
LOG = logging.getLogger()
|
||||
|
||||
WORD_TYPES =(('country_names', 'C'),
|
||||
('postcodes', 'P'),
|
||||
('full_word', 'W'),
|
||||
('housenumbers', 'H'))
|
||||
|
||||
def create(dsn: str, data_dir: Path) -> 'ICUTokenizer':
|
||||
""" Create a new instance of the tokenizer provided by this module.
|
||||
"""
|
||||
return ICUTokenizer(dsn, data_dir)
|
||||
|
||||
|
||||
class ICUTokenizer(AbstractTokenizer):
|
||||
""" This tokenizer uses libICU to convert names and queries to ASCII.
|
||||
Otherwise it uses the same algorithms and data structures as the
|
||||
normalization routines in Nominatim 3.
|
||||
"""
|
||||
|
||||
def __init__(self, dsn: str, data_dir: Path) -> None:
|
||||
self.dsn = dsn
|
||||
self.data_dir = data_dir
|
||||
self.loader: Optional[ICURuleLoader] = None
|
||||
|
||||
|
||||
def init_new_db(self, config: Configuration, init_db: bool = True) -> None:
|
||||
""" Set up a new tokenizer for the database.
|
||||
|
||||
This copies all necessary data in the project directory to make
|
||||
sure the tokenizer remains stable even over updates.
|
||||
"""
|
||||
self.loader = ICURuleLoader(config)
|
||||
|
||||
self._install_php(config.lib_dir.php, overwrite=True)
|
||||
self._save_config()
|
||||
|
||||
if init_db:
|
||||
self.update_sql_functions(config)
|
||||
self._setup_db_tables(config)
|
||||
self._create_base_indices(config, 'word')
|
||||
|
||||
|
||||
def init_from_project(self, config: Configuration) -> None:
|
||||
""" Initialise the tokenizer from the project directory.
|
||||
"""
|
||||
self.loader = ICURuleLoader(config)
|
||||
|
||||
with connect(self.dsn) as conn:
|
||||
self.loader.load_config_from_db(conn)
|
||||
|
||||
self._install_php(config.lib_dir.php, overwrite=False)
|
||||
|
||||
|
||||
def finalize_import(self, config: Configuration) -> None:
|
||||
""" Do any required postprocessing to make the tokenizer data ready
|
||||
for use.
|
||||
"""
|
||||
self._create_lookup_indices(config, 'word')
|
||||
|
||||
|
||||
def update_sql_functions(self, config: Configuration) -> None:
|
||||
""" Reimport the SQL functions for this tokenizer.
|
||||
"""
|
||||
with connect(self.dsn) as conn:
|
||||
sqlp = SQLPreprocessor(conn, config)
|
||||
sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')
|
||||
|
||||
|
||||
def check_database(self, config: Configuration) -> None:
|
||||
""" Check that the tokenizer is set up correctly.
|
||||
"""
|
||||
# Will throw an error if there is an issue.
|
||||
self.init_from_project(config)
|
||||
|
||||
|
||||
    def update_statistics(self, config: Configuration, threads: int = 2) -> None:
        """ Recompute frequencies for all name words.
        """
        with connect(self.dsn) as conn:
            if not conn.table_exists('search_name'):
                return

            with conn.cursor() as cur:
                cur.execute('ANALYSE search_name')
                if threads > 1:
                    cur.execute('SET max_parallel_workers_per_gather TO %s',
                                (min(threads, 6),))

                if conn.server_version_tuple() < (12, 0):
                    LOG.info('Computing word frequencies')
                    cur.drop_table('word_frequencies')
                    cur.drop_table('addressword_frequencies')
                    cur.execute("""CREATE TEMP TABLE word_frequencies AS
                                     SELECT unnest(name_vector) as id, count(*)
                                     FROM search_name GROUP BY id""")
                    cur.execute('CREATE INDEX ON word_frequencies(id)')
                    cur.execute("""CREATE TEMP TABLE addressword_frequencies AS
                                     SELECT unnest(nameaddress_vector) as id, count(*)
                                     FROM search_name GROUP BY id""")
                    cur.execute('CREATE INDEX ON addressword_frequencies(id)')
                    cur.execute("""CREATE OR REPLACE FUNCTION word_freq_update(wid INTEGER,
                                                                               INOUT info JSONB)
                                   AS $$
                                   DECLARE rec RECORD;
                                   BEGIN
                                   IF info is null THEN
                                     info = '{}'::jsonb;
                                   END IF;
                                   FOR rec IN SELECT count FROM word_frequencies WHERE id = wid
                                   LOOP
                                     info = info || jsonb_build_object('count', rec.count);
                                   END LOOP;
                                   FOR rec IN SELECT count FROM addressword_frequencies WHERE id = wid
                                   LOOP
                                     info = info || jsonb_build_object('addr_count', rec.count);
                                   END LOOP;
                                   IF info = '{}'::jsonb THEN
                                     info = null;
                                   END IF;
                                   END;
                                   $$ LANGUAGE plpgsql IMMUTABLE;
                                """)
                    LOG.info('Update word table with recomputed frequencies')
                    cur.drop_table('tmp_word')
                    cur.execute("""CREATE TABLE tmp_word AS
                                    SELECT word_id, word_token, type, word,
                                           word_freq_update(word_id, info) as info
                                    FROM word
                                """)
                    cur.drop_table('word_frequencies')
                    cur.drop_table('addressword_frequencies')
                else:
                    LOG.info('Computing word frequencies')
                    cur.drop_table('word_frequencies')
                    cur.execute("""
                      CREATE TEMP TABLE word_frequencies AS
                      WITH word_freq AS MATERIALIZED (
                               SELECT unnest(name_vector) as id, count(*)
                               FROM search_name GROUP BY id),
                           addr_freq AS MATERIALIZED (
                               SELECT unnest(nameaddress_vector) as id, count(*)
                               FROM search_name GROUP BY id)
                      SELECT coalesce(a.id, w.id) as id,
                             (CASE WHEN w.count is null THEN '{}'::JSONB
                                   ELSE jsonb_build_object('count', w.count) END
                              ||
                              CASE WHEN a.count is null THEN '{}'::JSONB
                                   ELSE jsonb_build_object('addr_count', a.count) END) as info
                      FROM word_freq w FULL JOIN addr_freq a ON a.id = w.id;
                      """)
                    cur.execute('CREATE UNIQUE INDEX ON word_frequencies(id) INCLUDE(info)')
                    cur.execute('ANALYSE word_frequencies')
                    LOG.info('Update word table with recomputed frequencies')
                    cur.drop_table('tmp_word')
                    cur.execute("""CREATE TABLE tmp_word AS
                                    SELECT word_id, word_token, type, word,
                                           (CASE WHEN wf.info is null THEN word.info
                                                 ELSE coalesce(word.info, '{}'::jsonb) || wf.info
                                            END) as info
                                    FROM word LEFT JOIN word_frequencies wf
                                         ON word.word_id = wf.id
                                """)
                    cur.drop_table('word_frequencies')

            with conn.cursor() as cur:
                cur.execute('SET max_parallel_workers_per_gather TO 0')

            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_string(conn,
                            'GRANT SELECT ON tmp_word TO "{{config.DATABASE_WEBUSER}}"')
            conn.commit()
        self._create_base_indices(config, 'tmp_word')
        self._create_lookup_indices(config, 'tmp_word')
        self._move_temporary_word_table('tmp_word')

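Both branches above end up writing the same JSONB `info` value per word. A
standalone sketch of the merge rule (plain Python, with a dict standing in for
JSONB; the function name and sample counts are illustrative):

```python
from typing import Dict, Optional

def merge_frequency_info(count: Optional[int],
                         addr_count: Optional[int]) -> Optional[Dict[str, int]]:
    # Mirrors word_freq_update(): combine name and address counts into
    # one info object; NULL (None) when neither frequency is known.
    info: Dict[str, int] = {}
    if count is not None:
        info['count'] = count
    if addr_count is not None:
        info['addr_count'] = addr_count
    return info or None

assert merge_frequency_info(5, 2) == {'count': 5, 'addr_count': 2}
assert merge_frequency_info(None, None) is None
```
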
    def _cleanup_housenumbers(self) -> None:
        """ Remove unused house numbers.
        """
        with connect(self.dsn) as conn:
            if not conn.table_exists('search_name'):
                return
            with conn.cursor(name="hnr_counter") as cur:
                cur.execute("""SELECT DISTINCT word_id, coalesce(info->>'lookup', word_token)
                               FROM word
                               WHERE type = 'H'
                                 AND NOT EXISTS(SELECT * FROM search_name
                                                WHERE ARRAY[word.word_id] && name_vector)
                                 AND (char_length(coalesce(word, word_token)) > 6
                                      OR coalesce(word, word_token) not similar to '\\d+')
                            """)
                candidates = {token: wid for wid, token in cur}
            with conn.cursor(name="hnr_counter") as cur:
                cur.execute("""SELECT housenumber FROM placex
                               WHERE housenumber is not null
                                     AND (char_length(housenumber) > 6
                                          OR housenumber not similar to '\\d+')
                            """)
                for row in cur:
                    for hnr in row[0].split(';'):
                        candidates.pop(hnr, None)
            LOG.info("There are %s outdated housenumbers.", len(candidates))
            LOG.debug("Outdated housenumbers: %s", candidates.keys())
            if candidates:
                with conn.cursor() as cur:
                    cur.execute("""DELETE FROM word WHERE word_id = any(%s)""",
                                (list(candidates.values()), ))
                conn.commit()


    def update_word_tokens(self) -> None:
        """ Remove unused tokens.
        """
        LOG.warning("Cleaning up housenumber tokens.")
        self._cleanup_housenumbers()
        LOG.warning("Tokenizer house-keeping done.")

    def name_analyzer(self) -> 'ICUNameAnalyzer':
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

            ```
            with tokenizer.name_analyzer() as analyzer:
                analyzer.tokenize()
            ```

            When used outside the with construct, the caller must ensure
            that close() is called before the analyzer is destroyed.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        assert self.loader is not None
        return ICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
                               self.loader.make_token_analysis())

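A minimal usage sketch for the analyzer life cycle, assuming an initialized
ICU tokenizer; the helper name and word list are placeholders, and running it
requires a set-up database:

```python
from typing import List, Tuple

def lookup_tokens(tokenizer, words: List[str]) -> List[Tuple[str, str, int]]:
    # The analyzer is a context manager; close() runs on exit.
    with tokenizer.name_analyzer() as analyzer:
        return analyzer.get_word_token_info(words)
```
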
    def most_frequent_words(self, conn: Connection, num: int) -> List[str]:
        """ Return a list of the `num` most frequent full words
            in the database.
        """
        with conn.cursor() as cur:
            cur.execute("""SELECT word, sum((info->>'count')::int) as count
                             FROM word WHERE type = 'W'
                             GROUP BY word
                             ORDER BY count DESC LIMIT %s""", (num,))
            return list(s[0].split('@')[0] for s in cur)

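Full words of type 'W' may carry an '@<analyzer>' suffix (see
_compute_name_tokens in ICUNameAnalyzer below); the result list strips it.
Illustrated standalone with made-up rows:

```python
rows = [('main street@de',), ('station',)]
assert [s[0].split('@')[0] for s in rows] == ['main street', 'station']
```
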
    def _install_php(self, phpdir: Optional[Path], overwrite: bool = True) -> None:
        """ Install the php script for the tokenizer.
        """
        if phpdir is not None:
            assert self.loader is not None
            php_file = self.data_dir / "tokenizer.php"

            if not php_file.exists() or overwrite:
                php_file.write_text(dedent(f"""\
                    <?php
                    @define('CONST_Max_Word_Frequency', 10000000);
                    @define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
                    @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
                    require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""), encoding='utf-8')


    def _save_config(self) -> None:
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        assert self.loader is not None
        with connect(self.dsn) as conn:
            self.loader.save_config_to_db(conn)


    def _setup_db_tables(self, config: Configuration) -> None:
        """ Set up the word table and fill it with pre-computed word
            frequencies.
        """
        with connect(self.dsn) as conn:
            with conn.cursor() as cur:
                cur.drop_table('word')
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_string(conn, """
                CREATE TABLE word (
                      word_id INTEGER,
                      word_token text NOT NULL,
                      type text NOT NULL,
                      word text,
                      info jsonb
                    ) {{db.tablespace.search_data}};
                GRANT SELECT ON word TO "{{config.DATABASE_WEBUSER}}";

                DROP SEQUENCE IF EXISTS seq_word;
                CREATE SEQUENCE seq_word start 1;
                GRANT SELECT ON seq_word to "{{config.DATABASE_WEBUSER}}";
            """)
            conn.commit()

    def _create_base_indices(self, config: Configuration, table_name: str) -> None:
        """ Create the standard indexes for the given word table.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_string(conn,
                            """CREATE INDEX idx_{{table_name}}_word_token ON {{table_name}}
                               USING BTREE (word_token) {{db.tablespace.search_index}}""",
                            table_name=table_name)
            for name, ctype in WORD_TYPES:
                sqlp.run_string(conn,
                                """CREATE INDEX idx_{{table_name}}_{{idx_name}} ON {{table_name}}
                                   USING BTREE (word) {{db.tablespace.address_index}}
                                   WHERE type = '{{column_type}}'
                                """,
                                table_name=table_name, idx_name=name,
                                column_type=ctype)
            conn.commit()

    def _create_lookup_indices(self, config: Configuration, table_name: str) -> None:
        """ Create additional indexes used when running the API.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            # Index required for details lookup.
            sqlp.run_string(conn, """
                CREATE INDEX IF NOT EXISTS idx_{{table_name}}_word_id
                  ON {{table_name}} USING BTREE (word_id) {{db.tablespace.search_index}}
            """,
                            table_name=table_name)
            conn.commit()


    def _move_temporary_word_table(self, old: str) -> None:
        """ Rename all tables and indexes used by the tokenizer.
        """
        with connect(self.dsn) as conn:
            with conn.cursor() as cur:
                cur.drop_table('word')
                cur.execute(f"ALTER TABLE {old} RENAME TO word")
                for idx in ('word_token', 'word_id'):
                    cur.execute(f"""ALTER INDEX idx_{old}_{idx}
                                    RENAME TO idx_word_{idx}""")
                for name, _ in WORD_TYPES:
                    cur.execute(f"""ALTER INDEX idx_{old}_{name}
                                    RENAME TO idx_word_{name}""")
            conn.commit()


class ICUNameAnalyzer(AbstractAnalyzer):
    """ The ICU analyzer uses the ICU library for splitting names.

        Each instance opens a connection to the database to request the
        normalization.
    """

    def __init__(self, dsn: str, sanitizer: PlaceSanitizer,
                 token_analysis: ICUTokenAnalysis) -> None:
        self.conn: Optional[Connection] = connect(dsn).connection
        self.conn.autocommit = True
        self.sanitizer = sanitizer
        self.token_analysis = token_analysis

        self._cache = _TokenCache()


    def close(self) -> None:
        """ Free all resources used by the analyzer.
        """
        if self.conn:
            self.conn.close()
            self.conn = None


    def _search_normalized(self, name: str) -> str:
        """ Return the search token transliteration of the given name.
        """
        return cast(str, self.token_analysis.search.transliterate(name)).strip()


    def _normalized(self, name: str) -> str:
        """ Return the normalized version of the given name with all
            non-relevant information removed.
        """
        return cast(str, self.token_analysis.normalizer.transliterate(name)).strip()

    def get_word_token_info(self, words: Sequence[str]) -> List[Tuple[str, str, int]]:
        """ Return token information for the given list of words.
            If a word starts with '#' it is assumed to be a full name,
            otherwise it is taken to be a partial name.

            The function returns a list of tuples with
            (original word, word token, word id).

            The function is used for testing and debugging only
            and is not guaranteed to be efficient.
        """
        assert self.conn is not None
        full_tokens = {}
        partial_tokens = {}
        for word in words:
            if word.startswith('#'):
                full_tokens[word] = self._search_normalized(word[1:])
            else:
                partial_tokens[word] = self._search_normalized(word)

        with self.conn.cursor() as cur:
            cur.execute("""SELECT word_token, word_id
                            FROM word WHERE word_token = ANY(%s) and type = 'W'
                        """, (list(full_tokens.values()),))
            full_ids = {r[0]: r[1] for r in cur}
            cur.execute("""SELECT word_token, word_id
                            FROM word WHERE word_token = ANY(%s) and type = 'w'""",
                        (list(partial_tokens.values()),))
            part_ids = {r[0]: r[1] for r in cur}

        return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
               + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]

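The '#' convention, shown standalone (pure Python, illustrative input):

```python
def classify(words):
    # '#'-prefixed entries are looked up as full names (type 'W'),
    # everything else as partial names (type 'w').
    full = [w[1:] for w in words if w.startswith('#')]
    partial = [w for w in words if not w.startswith('#')]
    return full, partial

assert classify(['#Main Street', 'main']) == (['Main Street'], ['main'])
```
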
    def normalize_postcode(self, postcode: str) -> str:
        """ Convert the postcode to a standardized form.

            This function must yield exactly the same result as the SQL function
            'token_normalized_postcode()'.
        """
        return postcode.strip().upper()

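The contract is deliberately trivial so the Python and SQL implementations
cannot diverge; sample values (illustrative):

```python
for raw, expected in {' ab1 2cd ': 'AB1 2CD', '75001': '75001'}.items():
    assert raw.strip().upper() == expected
```
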
    def update_postcodes_from_db(self) -> None:
        """ Update postcode tokens in the word table from the location_postcode
            table.
        """
        assert self.conn is not None
        analyzer = self.token_analysis.analysis.get('@postcode')

        with self.conn.cursor() as cur:
            # First get all postcode names currently in the word table.
            cur.execute("SELECT DISTINCT word FROM word WHERE type = 'P'")
            word_entries = set((entry[0] for entry in cur))

            # Then compute the required postcode names from the postcode table.
            needed_entries = set()
            cur.execute("SELECT country_code, postcode FROM location_postcode")
            for cc, postcode in cur:
                info = PlaceInfo({'country_code': cc,
                                  'class': 'place', 'type': 'postcode',
                                  'address': {'postcode': postcode}})
                address = self.sanitizer.process_names(info)[1]
                for place in address:
                    if place.kind == 'postcode':
                        if analyzer is None:
                            postcode_name = place.name.strip().upper()
                            variant_base = None
                        else:
                            postcode_name = analyzer.get_canonical_id(place)
                            variant_base = place.get_attr("variant")

                        if variant_base:
                            needed_entries.add(f'{postcode_name}@{variant_base}')
                        else:
                            needed_entries.add(postcode_name)
                        break

        # Now update the word table.
        self._delete_unused_postcode_words(word_entries - needed_entries)
        self._add_missing_postcode_words(needed_entries - word_entries)

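The update boils down to a set reconciliation between the word table and
location_postcode; sketched with illustrative entries:

```python
word_entries = {'10117', '99999'}           # currently in the word table
needed_entries = {'10117', '10405@BERLIN'}  # derived from location_postcode

assert word_entries - needed_entries == {'99999'}         # to delete
assert needed_entries - word_entries == {'10405@BERLIN'}  # to add
```
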
    def _delete_unused_postcode_words(self, tokens: Iterable[str]) -> None:
        assert self.conn is not None
        if tokens:
            with self.conn.cursor() as cur:
                cur.execute("DELETE FROM word WHERE type = 'P' and word = any(%s)",
                            (list(tokens), ))

    def _add_missing_postcode_words(self, tokens: Iterable[str]) -> None:
        assert self.conn is not None
        if not tokens:
            return

        analyzer = self.token_analysis.analysis.get('@postcode')
        terms = []

        for postcode_name in tokens:
            if '@' in postcode_name:
                term, variant = postcode_name.split('@', 1)
                term = self._search_normalized(term)
                if analyzer is None:
                    variants = [term]
                else:
                    variants = analyzer.compute_variants(variant)
                    if term not in variants:
                        variants.append(term)
            else:
                variants = [self._search_normalized(postcode_name)]
            terms.append((postcode_name, variants))

        if terms:
            with self.conn.cursor() as cur:
                cur.execute_values("""SELECT create_postcode_word(pc, var)
                                      FROM (VALUES %s) AS v(pc, var)""",
                                   terms)

    def update_special_phrases(self, phrases: Iterable[Tuple[str, str, str, str]],
                               should_replace: bool) -> None:
        """ Replace the search index for special phrases with the new phrases.
            If `should_replace` is True, then the previous set of phrases is
            completely replaced. Otherwise the phrases are added to the
            already existing ones.
        """
        assert self.conn is not None
        norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
                            for p in phrases))

        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("SELECT word, info FROM word WHERE type = 'S'")
            for word, info in cur:
                existing_phrases.add((word, info['class'], info['type'],
                                      info.get('op') or '-'))

            added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
            if should_replace:
                deleted = self._remove_special_phrases(cur, norm_phrases,
                                                       existing_phrases)
            else:
                deleted = 0

        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), added, deleted)

    def _add_special_phrases(self, cursor: Cursor,
                             new_phrases: Set[Tuple[str, str, str, str]],
                             existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
        """ Add all phrases to the database that are not yet there.
        """
        to_add = new_phrases - existing_phrases

        added = 0
        with CopyBuffer() as copystr:
            for word, cls, typ, oper in to_add:
                term = self._search_normalized(word)
                if term:
                    copystr.add(term, 'S', word,
                                json.dumps({'class': cls, 'type': typ,
                                            'op': oper if oper in ('in', 'near') else None}))
                    added += 1

            copystr.copy_out(cursor, 'word',
                             columns=['word_token', 'type', 'word', 'info'])

        return added


    def _remove_special_phrases(self, cursor: Cursor,
                                new_phrases: Set[Tuple[str, str, str, str]],
                                existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
        """ Remove all phrases from the database that are no longer in the
            new phrase list.
        """
        to_delete = existing_phrases - new_phrases

        if to_delete:
            cursor.execute_values(
                """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
                    WHERE type = 'S' and word = name
                          and info->>'class' = in_class and info->>'type' = in_type
                          and ((op = '-' and info->>'op' is null) or op = info->>'op')
                """, to_delete)

        return len(to_delete)

    def add_country_names(self, country_code: str, names: Mapping[str, str]) -> None:
        """ Add default names for the given country to the search index.
        """
        # Make sure any name preprocessing for country names applies.
        info = PlaceInfo({'name': names, 'country_code': country_code,
                          'rank_address': 4, 'class': 'boundary',
                          'type': 'administrative'})
        self._add_country_full_names(country_code,
                                     self.sanitizer.process_names(info)[0],
                                     internal=True)


    def _add_country_full_names(self, country_code: str, names: Sequence[PlaceName],
                                internal: bool = False) -> None:
        """ Add names for the given country from an already sanitized
            name list.
        """
        assert self.conn is not None
        word_tokens = set()
        for name in names:
            norm_name = self._search_normalized(name.name)
            if norm_name:
                word_tokens.add(norm_name)

        with self.conn.cursor() as cur:
            # Get existing names
            cur.execute("""SELECT word_token, coalesce(info ? 'internal', false) as is_internal
                             FROM word
                             WHERE type = 'C' and word = %s""",
                        (country_code, ))
            # internal/external names
            existing_tokens: Dict[bool, Set[str]] = {True: set(), False: set()}
            for word in cur:
                existing_tokens[word[1]].add(word[0])

            # Delete names that no longer exist.
            gone_tokens = existing_tokens[internal] - word_tokens
            if internal:
                gone_tokens.update(existing_tokens[False] & word_tokens)
            if gone_tokens:
                cur.execute("""DELETE FROM word
                               USING unnest(%s) as token
                               WHERE type = 'C' and word = %s
                                     and word_token = token""",
                            (list(gone_tokens), country_code))

            # Only add those names that are not yet in the list.
            new_tokens = word_tokens - existing_tokens[True]
            if not internal:
                new_tokens -= existing_tokens[False]
            if new_tokens:
                if internal:
                    sql = """INSERT INTO word (word_token, type, word, info)
                               (SELECT token, 'C', %s, '{"internal": "yes"}'
                                  FROM unnest(%s) as token)
                           """
                else:
                    sql = """INSERT INTO word (word_token, type, word)
                               (SELECT token, 'C', %s
                                  FROM unnest(%s) as token)
                           """
                cur.execute(sql, (country_code, list(new_tokens)))

    def process_place(self, place: PlaceInfo) -> Mapping[str, Any]:
        """ Determine tokenizer information about the given place.

            Returns a JSON-serializable structure that will be handed into
            the database via the token_info field.
        """
        token_info = _TokenInfo()

        names, address = self.sanitizer.process_names(place)

        if names:
            token_info.set_names(*self._compute_name_tokens(names))

            if place.is_country():
                assert place.country_code is not None
                self._add_country_full_names(place.country_code, names)

        if address:
            self._process_place_address(token_info, address)

        return token_info.to_dict()


    def _process_place_address(self, token_info: '_TokenInfo',
                               address: Sequence[PlaceName]) -> None:
        for item in address:
            if item.kind == 'postcode':
                token_info.set_postcode(self._add_postcode(item))
            elif item.kind == 'housenumber':
                token_info.add_housenumber(*self._compute_housenumber_token(item))
            elif item.kind == 'street':
                token_info.add_street(self._retrieve_full_tokens(item.name))
            elif item.kind == 'place':
                if not item.suffix:
                    token_info.add_place(itertools.chain(*self._compute_name_tokens([item])))
            elif not item.kind.startswith('_') and not item.suffix and \
                 item.kind not in ('country', 'full', 'inclusion'):
                token_info.add_address_term(item.kind,
                                            itertools.chain(*self._compute_name_tokens([item])))

    def _compute_housenumber_token(self, hnr: PlaceName) -> Tuple[Optional[int], Optional[str]]:
        """ Normalize the housenumber and return the word token and the
            canonical form.
        """
        assert self.conn is not None
        analyzer = self.token_analysis.analysis.get('@housenumber')
        result: Tuple[Optional[int], Optional[str]] = (None, None)

        if analyzer is None:
            # When no custom analyzer is set, simply normalize and transliterate.
            norm_name = self._search_normalized(hnr.name)
            if norm_name:
                result = self._cache.housenumbers.get(norm_name, result)
                if result[0] is None:
                    with self.conn.cursor() as cur:
                        hid = cur.scalar("SELECT getorcreate_hnr_id(%s)", (norm_name, ))

                        result = hid, norm_name
                        self._cache.housenumbers[norm_name] = result
        else:
            # Otherwise use the analyzer to determine the canonical name.
            # Per convention we use the first variant as the 'lookup name', the
            # name that gets saved in the housenumber field of the place.
            word_id = analyzer.get_canonical_id(hnr)
            if word_id:
                result = self._cache.housenumbers.get(word_id, result)
                if result[0] is None:
                    variants = analyzer.compute_variants(word_id)
                    if variants:
                        with self.conn.cursor() as cur:
                            hid = cur.scalar("SELECT create_analyzed_hnr_id(%s, %s)",
                                             (word_id, list(variants)))
                            result = hid, variants[0]
                            self._cache.housenumbers[word_id] = result

        return result


    def _retrieve_full_tokens(self, name: str) -> List[int]:
        """ Get the full name token for the given name, if it exists.
            The name is only retrieved for the standard analyzer.
        """
        assert self.conn is not None
        norm_name = self._search_normalized(name)

        # return cached if possible
        if norm_name in self._cache.fulls:
            return self._cache.fulls[norm_name]

        with self.conn.cursor() as cur:
            cur.execute("SELECT word_id FROM word WHERE word_token = %s and type = 'W'",
                        (norm_name, ))
            full = [row[0] for row in cur]

        self._cache.fulls[norm_name] = full

        return full

    def _compute_name_tokens(self, names: Sequence[PlaceName]) -> Tuple[Set[int], Set[int]]:
        """ Compute the full name and partial name tokens for the given
            list of names.
        """
        assert self.conn is not None
        full_tokens: Set[int] = set()
        partial_tokens: Set[int] = set()

        for name in names:
            analyzer_id = name.get_attr('analyzer')
            analyzer = self.token_analysis.get_analyzer(analyzer_id)
            word_id = analyzer.get_canonical_id(name)
            if analyzer_id is None:
                token_id = word_id
            else:
                token_id = f'{word_id}@{analyzer_id}'

            full, part = self._cache.names.get(token_id, (None, None))
            if full is None:
                variants = analyzer.compute_variants(word_id)
                if not variants:
                    continue

                with self.conn.cursor() as cur:
                    cur.execute("SELECT * FROM getorcreate_full_word(%s, %s)",
                                (token_id, variants))
                    full, part = cast(Tuple[int, List[int]], cur.fetchone())

                self._cache.names[token_id] = (full, part)

            assert part is not None

            full_tokens.add(full)
            partial_tokens.update(part)

        return full_tokens, partial_tokens

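The cache key composed above keeps tokens from different analyzers apart;
a sketch with illustrative ids:

```python
def cache_key(word_id: str, analyzer_id):
    # Default analyzer: canonical id as-is; named analyzers get an
    # '@<analyzer id>' suffix so their tokens cannot collide.
    return word_id if analyzer_id is None else f'{word_id}@{analyzer_id}'

assert cache_key('main street', None) == 'main street'
assert cache_key('main street', 'de') == 'main street@de'
```
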
    def _add_postcode(self, item: PlaceName) -> Optional[str]:
        """ Make sure the normalized postcode is present in the word table.
        """
        assert self.conn is not None
        analyzer = self.token_analysis.analysis.get('@postcode')

        if analyzer is None:
            postcode_name = item.name.strip().upper()
            variant_base = None
        else:
            postcode_name = analyzer.get_canonical_id(item)
            variant_base = item.get_attr("variant")

        if variant_base:
            postcode = f'{postcode_name}@{variant_base}'
        else:
            postcode = postcode_name

        if postcode not in self._cache.postcodes:
            term = self._search_normalized(postcode_name)
            if not term:
                return None

            variants = {term}
            if analyzer is not None and variant_base:
                variants.update(analyzer.compute_variants(variant_base))

            with self.conn.cursor() as cur:
                cur.execute("SELECT create_postcode_word(%s, %s)",
                            (postcode, list(variants)))
            self._cache.postcodes.add(postcode)

        return postcode_name


class _TokenInfo:
    """ Collect token information to be sent back to the database.
    """
    def __init__(self) -> None:
        self.names: Optional[str] = None
        self.housenumbers: Set[str] = set()
        self.housenumber_tokens: Set[int] = set()
        self.street_tokens: Optional[Set[int]] = None
        self.place_tokens: Set[int] = set()
        self.address_tokens: Dict[str, str] = {}
        self.postcode: Optional[str] = None


    def _mk_array(self, tokens: Iterable[Any]) -> str:
        return f"{{{','.join((str(s) for s in tokens))}}}"

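_mk_array renders a PostgreSQL array literal. Note that an empty input yields
'{}', which is why add_address_term below only keeps arrays longer than two
characters:

```python
def mk_array(tokens):
    return f"{{{','.join(str(s) for s in tokens)}}}"

assert mk_array([1, 2, 3]) == '{1,2,3}'
assert mk_array([]) == '{}'
```
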
    def to_dict(self) -> Dict[str, Any]:
        """ Return the token information in database importable format.
        """
        out: Dict[str, Any] = {}

        if self.names:
            out['names'] = self.names

        if self.housenumbers:
            out['hnr'] = ';'.join(self.housenumbers)
            out['hnr_tokens'] = self._mk_array(self.housenumber_tokens)

        if self.street_tokens is not None:
            out['street'] = self._mk_array(self.street_tokens)

        if self.place_tokens:
            out['place'] = self._mk_array(self.place_tokens)

        if self.address_tokens:
            out['addr'] = self.address_tokens

        if self.postcode:
            out['postcode'] = self.postcode

        return out


    def set_names(self, fulls: Iterable[int], partials: Iterable[int]) -> None:
        """ Add token information for the normalised names.
        """
        self.names = self._mk_array(itertools.chain(fulls, partials))


    def add_housenumber(self, token: Optional[int], hnr: Optional[str]) -> None:
        """ Add token information for a single normalised housenumber.
        """
        if token:
            assert hnr is not None
            self.housenumbers.add(hnr)
            self.housenumber_tokens.add(token)


    def add_street(self, tokens: Iterable[int]) -> None:
        """ Add addr:street match terms.
        """
        if self.street_tokens is None:
            self.street_tokens = set()
        self.street_tokens.update(tokens)


    def add_place(self, tokens: Iterable[int]) -> None:
        """ Add addr:place search and match terms.
        """
        self.place_tokens.update(tokens)


    def add_address_term(self, key: str, partials: Iterable[int]) -> None:
        """ Add additional address terms.
        """
        array = self._mk_array(partials)
        if len(array) > 2:
            self.address_tokens[key] = array

    def set_postcode(self, postcode: Optional[str]) -> None:
        """ Set the postcode to the given one.
        """
        self.postcode = postcode


class _TokenCache:
    """ Cache for token information to avoid repeated database queries.

        This cache is not thread-safe and needs to be instantiated per
        analyzer.
    """
    def __init__(self) -> None:
        self.names: Dict[str, Tuple[int, List[int]]] = {}
        self.partials: Dict[str, int] = {}
        self.fulls: Dict[str, List[int]] = {}
        self.postcodes: Set[str] = set()
        self.housenumbers: Dict[str, Tuple[Optional[int], Optional[str]]] = {}
681
src/nominatim_db/tokenizer/legacy_tokenizer.py
Normal file
@@ -0,0 +1,681 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Tokenizer implementing normalisation as used before Nominatim 4.
"""
from typing import Optional, Sequence, List, Tuple, Mapping, Any, Callable, \
                   cast, Dict, Set, Iterable
from collections import OrderedDict
import logging
from pathlib import Path
import re
import shutil
from textwrap import dedent

from icu import Transliterator
import psycopg2
import psycopg2.extras

from nominatim_core.errors import UsageError
from nominatim_core.db.connection import connect, Connection
from nominatim_core.config import Configuration
from nominatim_core.db import properties
from nominatim_core.db import utils as db_utils
from nominatim_core.db.sql_preprocessor import SQLPreprocessor
from ..data.place_info import PlaceInfo
from .base import AbstractAnalyzer, AbstractTokenizer

DBCFG_NORMALIZATION = "tokenizer_normalization"
DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"

LOG = logging.getLogger()

def create(dsn: str, data_dir: Path) -> 'LegacyTokenizer':
    """ Create a new instance of the tokenizer provided by this module.
    """
    return LegacyTokenizer(dsn, data_dir)


def _install_module(config_module_path: str, src_dir: Path, module_dir: Path) -> str:
    """ Copies the PostgreSQL normalisation module into the project
        directory if necessary. For historical reasons the module is
        saved in the '/module' subdirectory and not with the other tokenizer
        data.

        The function detects when the installation is run from the
        build directory. It doesn't touch the module in that case.
    """
    # Custom module locations are simply used as is.
    if config_module_path:
        LOG.info("Using custom path for database module at '%s'", config_module_path)
        return config_module_path

    # Compatibility mode for builddir installations.
    if module_dir.exists() and src_dir.samefile(module_dir):
        LOG.info('Running from build directory. Leaving database module as is.')
        return str(module_dir)

    # In any other case install the module in the project directory.
    if not module_dir.exists():
        module_dir.mkdir()

    destfile = module_dir / 'nominatim.so'
    shutil.copy(str(src_dir / 'nominatim.so'), str(destfile))
    destfile.chmod(0o755)

    LOG.info('Database module installed at %s', str(destfile))

    return str(module_dir)


def _check_module(module_dir: str, conn: Connection) -> None:
    """ Try to use the PostgreSQL module to confirm that it is correctly
        installed and accessible from PostgreSQL.
    """
    with conn.cursor() as cur:
        try:
            cur.execute("""CREATE FUNCTION nominatim_test_import_func(text)
                           RETURNS text AS %s, 'transliteration'
                           LANGUAGE c IMMUTABLE STRICT;
                           DROP FUNCTION nominatim_test_import_func(text)
                        """, (f'{module_dir}/nominatim.so', ))
        except psycopg2.DatabaseError as err:
            LOG.fatal("Error accessing database module: %s", err)
            raise UsageError("Database module cannot be accessed.") from err


class LegacyTokenizer(AbstractTokenizer):
    """ The legacy tokenizer uses a special PostgreSQL module to normalize
        names and queries. The tokenizer thus implements normalization through
        calls to the database.
    """

    def __init__(self, dsn: str, data_dir: Path) -> None:
        self.dsn = dsn
        self.data_dir = data_dir
        self.normalization: Optional[str] = None


    def init_new_db(self, config: Configuration, init_db: bool = True) -> None:
        """ Set up a new tokenizer for the database.

            This copies all necessary data into the project directory to make
            sure the tokenizer remains stable even over updates.
        """
        assert config.project_dir is not None
        module_dir = _install_module(config.DATABASE_MODULE_PATH,
                                     config.lib_dir.module,
                                     config.project_dir / 'module')

        self.normalization = config.TERM_NORMALIZATION

        self._install_php(config, overwrite=True)

        with connect(self.dsn) as conn:
            _check_module(module_dir, conn)
            self._save_config(conn, config)
            conn.commit()

        if init_db:
            self.update_sql_functions(config)
            self._init_db_tables(config)

    def init_from_project(self, config: Configuration) -> None:
        """ Initialise the tokenizer from the project directory.
        """
        assert config.project_dir is not None

        with connect(self.dsn) as conn:
            self.normalization = properties.get_property(conn, DBCFG_NORMALIZATION)

        if not (config.project_dir / 'module' / 'nominatim.so').exists():
            _install_module(config.DATABASE_MODULE_PATH,
                            config.lib_dir.module,
                            config.project_dir / 'module')

        self._install_php(config, overwrite=False)


    def finalize_import(self, config: Configuration) -> None:
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')


    def update_sql_functions(self, config: Configuration) -> None:
        """ Reimport the SQL functions for this tokenizer.
        """
        assert config.project_dir is not None

        with connect(self.dsn) as conn:
            max_word_freq = properties.get_property(conn, DBCFG_MAXWORDFREQ)
            modulepath = config.DATABASE_MODULE_PATH or \
                         str((config.project_dir / 'module').resolve())
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer.sql',
                              max_word_freq=max_word_freq,
                              modulepath=modulepath)

    def check_database(self, _: Configuration) -> Optional[str]:
        """ Check that the tokenizer is set up correctly.
        """
        hint = """\
             The PostgreSQL extension nominatim.so was not correctly loaded.

             Error: {error}

             Hints:
             * Check the output of the CMake/make installation step
             * Does nominatim.so exist?
             * Does nominatim.so exist on the database server?
             * Can nominatim.so be accessed by the database user?
             """
        with connect(self.dsn) as conn:
            with conn.cursor() as cur:
                try:
                    out = cur.scalar("SELECT make_standard_name('a')")
                except psycopg2.Error as err:
                    return hint.format(error=str(err))

        if out != 'a':
            return hint.format(error='Unexpected result for make_standard_name()')

        return None

    def migrate_database(self, config: Configuration) -> None:
        """ Initialise the project directory of an existing database for
            use with this tokenizer.

            This is a special migration function for updating existing databases
            to new software versions.
        """
        assert config.project_dir is not None

        self.normalization = config.TERM_NORMALIZATION
        module_dir = _install_module(config.DATABASE_MODULE_PATH,
                                     config.lib_dir.module,
                                     config.project_dir / 'module')

        with connect(self.dsn) as conn:
            _check_module(module_dir, conn)
            self._save_config(conn, config)


    def update_statistics(self, config: Configuration, threads: int = 1) -> None:
        """ Recompute the frequency of full words.
        """
        with connect(self.dsn) as conn:
            if conn.table_exists('search_name'):
                with conn.cursor() as cur:
                    cur.drop_table("word_frequencies")
                    LOG.info("Computing word frequencies")
                    cur.execute("""CREATE TEMP TABLE word_frequencies AS
                                     SELECT unnest(name_vector) as id, count(*)
                                     FROM search_name GROUP BY id""")
                    cur.execute("CREATE INDEX ON word_frequencies(id)")
                    LOG.info("Update word table with recomputed frequencies")
                    cur.execute("""UPDATE word SET search_name_count = count
                                   FROM word_frequencies
                                   WHERE word_token like ' %' and word_id = id""")
                    cur.drop_table("word_frequencies")
            conn.commit()

    def update_word_tokens(self) -> None:
        """ No house-keeping implemented for the legacy tokenizer.
        """
        LOG.info("No tokenizer clean-up available.")


    def name_analyzer(self) -> 'LegacyNameAnalyzer':
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

            ```
            with tokenizer.name_analyzer() as analyzer:
                analyzer.tokenize()
            ```

            When used outside the with construct, the caller must ensure
            that close() is called before the analyzer is destroyed.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        normalizer = Transliterator.createFromRules("phrase normalizer",
                                                    self.normalization)
        return LegacyNameAnalyzer(self.dsn, normalizer)

    def most_frequent_words(self, conn: Connection, num: int) -> List[str]:
        """ Return a list of the `num` most frequent full words
            in the database.
        """
        with conn.cursor() as cur:
            cur.execute(""" SELECT word FROM word WHERE word is not null
                              ORDER BY search_name_count DESC LIMIT %s""", (num,))
            return list(s[0] for s in cur)


    def _install_php(self, config: Configuration, overwrite: bool = True) -> None:
        """ Install the php script for the tokenizer.
        """
        if config.lib_dir.php is not None:
            php_file = self.data_dir / "tokenizer.php"

            if not php_file.exists() or overwrite:
                php_file.write_text(dedent(f"""\
                    <?php
                    @define('CONST_Max_Word_Frequency', {config.MAX_WORD_FREQUENCY});
                    @define('CONST_Term_Normalization_Rules', "{config.TERM_NORMALIZATION}");
                    require_once('{config.lib_dir.php}/tokenizer/legacy_tokenizer.php');
                    """), encoding='utf-8')


    def _init_db_tables(self, config: Configuration) -> None:
        """ Set up the word table and fill it with pre-computed word
            frequencies.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_tables.sql')
            conn.commit()

        LOG.warning("Precomputing word tokens")
        db_utils.execute_file(self.dsn, config.lib_dir.data / 'words.sql')


    def _save_config(self, conn: Connection, config: Configuration) -> None:
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        assert self.normalization is not None

        properties.set_property(conn, DBCFG_NORMALIZATION, self.normalization)
        properties.set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)


class LegacyNameAnalyzer(AbstractAnalyzer):
    """ The legacy analyzer uses the special PostgreSQL module for
        splitting names.

        Each instance opens a connection to the database to request the
        normalization.
    """

    def __init__(self, dsn: str, normalizer: Any):
        self.conn: Optional[Connection] = connect(dsn).connection
        self.conn.autocommit = True
        self.normalizer = normalizer
        psycopg2.extras.register_hstore(self.conn)

        self._cache = _TokenCache(self.conn)


    def close(self) -> None:
        """ Free all resources used by the analyzer.
        """
        if self.conn:
            self.conn.close()
            self.conn = None

    def get_word_token_info(self, words: Sequence[str]) -> List[Tuple[str, str, int]]:
        """ Return token information for the given list of words.
            If a word starts with '#' it is assumed to be a full name,
            otherwise it is taken to be a partial name.

            The function returns a list of tuples with
            (original word, word token, word id).

            The function is used for testing and debugging only
            and is not guaranteed to be efficient.
        """
        assert self.conn is not None
        with self.conn.cursor() as cur:
            cur.execute("""SELECT t.term, word_token, word_id
                           FROM word, (SELECT unnest(%s::TEXT[]) as term) t
                           WHERE word_token = (CASE
                                   WHEN left(t.term, 1) = '#' THEN
                                     ' ' || make_standard_name(substring(t.term from 2))
                                   ELSE
                                     make_standard_name(t.term)
                                   END)
                                 and class is null and country_code is null""",
                        (words, ))

            return [(r[0], r[1], r[2]) for r in cur]

    def normalize(self, phrase: str) -> str:
        """ Normalize the given phrase, i.e. remove all properties that
            are irrelevant for search.
        """
        return cast(str, self.normalizer.transliterate(phrase))


    def normalize_postcode(self, postcode: str) -> str:
        """ Convert the postcode to a standardized form.

            This function must yield exactly the same result as the SQL function
            'token_normalized_postcode()'.
        """
        return postcode.strip().upper()


    def update_postcodes_from_db(self) -> None:
        """ Update postcode tokens in the word table from the location_postcode
            table.
        """
        assert self.conn is not None

        with self.conn.cursor() as cur:
            # This finds us the rows in location_postcode and word that are
            # missing in the other table.
            cur.execute("""SELECT * FROM
                            (SELECT pc, word FROM
                              (SELECT distinct(postcode) as pc FROM location_postcode) p
                              FULL JOIN
                              (SELECT word FROM word
                                WHERE class ='place' and type = 'postcode') w
                              ON pc = word) x
                           WHERE pc is null or word is null""")

            to_delete = []
            to_add = []

            for postcode, word in cur:
                if postcode is None:
                    to_delete.append(word)
                else:
                    to_add.append(postcode)

            if to_delete:
                cur.execute("""DELETE FROM word
                               WHERE class ='place' and type = 'postcode'
                                     and word = any(%s)
                            """, (to_delete, ))
            if to_add:
                cur.execute("""SELECT count(create_postcode_id(pc))
                               FROM unnest(%s) as pc
                            """, (to_add, ))

    def update_special_phrases(self, phrases: Iterable[Tuple[str, str, str, str]],
                               should_replace: bool) -> None:
        """ Replace the search index for special phrases with the new phrases.
        """
        assert self.conn is not None

        norm_phrases = set(((self.normalize(p[0]), p[1], p[2], p[3])
                            for p in phrases))

        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("""SELECT word, class, type, operator FROM word
                           WHERE class != 'place'
                                 OR (type != 'house' AND type != 'postcode')""")
            for label, cls, typ, oper in cur:
                existing_phrases.add((label, cls, typ, oper or '-'))

            to_add = norm_phrases - existing_phrases
            to_delete = existing_phrases - norm_phrases

            if to_add:
                cur.execute_values(
                    """ INSERT INTO word (word_id, word_token, word, class, type,
                                          search_name_count, operator)
                        (SELECT nextval('seq_word'), ' ' || make_standard_name(name), name,
                                class, type, 0,
                                CASE WHEN op in ('in', 'near') THEN op ELSE null END
                           FROM (VALUES %s) as v(name, class, type, op))""",
                    to_add)

            if to_delete and should_replace:
                cur.execute_values(
                    """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
                        WHERE word = name and class = in_class and type = in_type
                              and ((op = '-' and operator is null) or op = operator)""",
                    to_delete)

        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), len(to_add), len(to_delete))

    def add_country_names(self, country_code: str, names: Mapping[str, str]) -> None:
        """ Add names for the given country to the search index.
        """
        assert self.conn is not None

        with self.conn.cursor() as cur:
            cur.execute(
                """INSERT INTO word (word_id, word_token, country_code)
                   (SELECT nextval('seq_word'), lookup_token, %s
                      FROM (SELECT DISTINCT ' ' || make_standard_name(n) as lookup_token
                              FROM unnest(%s)n) y
                      WHERE NOT EXISTS(SELECT * FROM word
                                       WHERE word_token = lookup_token and country_code = %s))
                """, (country_code, list(names.values()), country_code))


    def process_place(self, place: PlaceInfo) -> Mapping[str, Any]:
        """ Determine tokenizer information about the given place.

            Returns a JSON-serialisable structure that will be handed into
            the database via the token_info field.
        """
        assert self.conn is not None

        token_info = _TokenInfo(self._cache)

        names = place.name

        if names:
            token_info.add_names(self.conn, names)

            if place.is_country():
                assert place.country_code is not None
                self.add_country_names(place.country_code, names)

        address = place.address
        if address:
            self._process_place_address(token_info, address)

        return token_info.data

    def _process_place_address(self, token_info: '_TokenInfo', address: Mapping[str, str]) -> None:
        assert self.conn is not None
        hnrs = []
        addr_terms = []

        for key, value in address.items():
            if key == 'postcode':
                # Make sure the normalized postcode is present in the word table.
                if re.search(r'[:,;]', value) is None:
                    norm_pc = self.normalize_postcode(value)
                    token_info.set_postcode(norm_pc)
                    self._cache.add_postcode(self.conn, norm_pc)
            elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
                hnrs.append(value)
            elif key == 'street':
                token_info.add_street(self.conn, value)
            elif key == 'place':
                token_info.add_place(self.conn, value)
            elif not key.startswith('_') \
                 and key not in ('country', 'full', 'inclusion'):
                addr_terms.append((key, value))

        if hnrs:
            token_info.add_housenumbers(self.conn, hnrs)

        if addr_terms:
            token_info.add_address_terms(self.conn, addr_terms)


class _TokenInfo:
    """ Collect token information to be sent back to the database.
    """
    def __init__(self, cache: '_TokenCache') -> None:
        self.cache = cache
        self.data: Dict[str, Any] = {}


    def add_names(self, conn: Connection, names: Mapping[str, str]) -> None:
        """ Add token information for the names of the place.
        """
        with conn.cursor() as cur:
            # Create the token IDs for all names.
            self.data['names'] = cur.scalar("SELECT make_keywords(%s)::text",
                                            (names, ))


    def add_housenumbers(self, conn: Connection, hnrs: Sequence[str]) -> None:
        """ Extract housenumber information from the address.
        """
        if len(hnrs) == 1:
            token = self.cache.get_housenumber(hnrs[0])
            if token is not None:
                self.data['hnr_tokens'] = token
                self.data['hnr'] = hnrs[0]
                return

        # split numbers if necessary
        simple_list: List[str] = []
        for hnr in hnrs:
            simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))

        if len(simple_list) > 1:
            simple_list = list(set(simple_list))

        with conn.cursor() as cur:
            cur.execute("SELECT * FROM create_housenumbers(%s)", (simple_list, ))
            result = cur.fetchone()
            assert result is not None
            self.data['hnr_tokens'], self.data['hnr'] = result

    def set_postcode(self, postcode: str) -> None:
        """ Set or replace the postcode token with the given value.
        """
        self.data['postcode'] = postcode

    def add_street(self, conn: Connection, street: str) -> None:
        """ Add addr:street match terms.
        """
        def _get_street(name: str) -> Optional[str]:
            with conn.cursor() as cur:
                return cast(Optional[str],
                            cur.scalar("SELECT word_ids_from_name(%s)::text", (name, )))

        tokens = self.cache.streets.get(street, _get_street)
        self.data['street'] = tokens or '{}'


    def add_place(self, conn: Connection, place: str) -> None:
        """ Add addr:place search and match terms.
        """
        def _get_place(name: str) -> Tuple[List[int], List[int]]:
            with conn.cursor() as cur:
                cur.execute("""SELECT make_keywords(hstore('name' , %s))::text,
                                      word_ids_from_name(%s)::text""",
                            (name, name))
                return cast(Tuple[List[int], List[int]], cur.fetchone())

        self.data['place_search'], self.data['place_match'] = \
            self.cache.places.get(place, _get_place)


    def add_address_terms(self, conn: Connection, terms: Sequence[Tuple[str, str]]) -> None:
        """ Add additional address terms.
        """
        def _get_address_term(name: str) -> Tuple[List[int], List[int]]:
            with conn.cursor() as cur:
                cur.execute("""SELECT addr_ids_from_name(%s)::text,
                                      word_ids_from_name(%s)::text""",
                            (name, name))
                return cast(Tuple[List[int], List[int]], cur.fetchone())

        tokens = {}
        for key, value in terms:
            items = self.cache.address_terms.get(value, _get_address_term)
            if items[0] or items[1]:
                tokens[key] = items

        if tokens:
            self.data['addr'] = tokens


class _LRU:
    """ Least recently used cache that accepts a generator function to
        produce the item when there is a cache miss.
    """

    def __init__(self, maxsize: int = 128):
        self.data: 'OrderedDict[str, Any]' = OrderedDict()
        self.maxsize = maxsize


    def get(self, key: str, generator: Callable[[str], Any]) -> Any:
        """ Get the item with the given key from the cache. If nothing
            is found in the cache, generate the value through the
            generator function and store it in the cache.
        """
        value = self.data.get(key)
        if value is not None:
            self.data.move_to_end(key)
        else:
            value = generator(key)
            if len(self.data) >= self.maxsize:
                self.data.popitem(last=False)
            self.data[key] = value

        return value

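Usage sketch for the cache above (assumes _LRU from this module is in scope;
the generator runs only on a miss and the least recently used entry is
evicted first):

```python
calls = []
def expensive(key: str) -> str:
    calls.append(key)
    return key.upper()

cache = _LRU(maxsize=2)
assert cache.get('a', expensive) == 'A'
assert cache.get('a', expensive) == 'A'   # second call served from cache
assert calls == ['a']
cache.get('b', expensive)
cache.get('c', expensive)                 # evicts 'a', the oldest entry
assert 'a' not in cache.data
```
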
class _TokenCache:
    """ Cache for token information to avoid repeated database queries.

        This cache is not thread-safe and needs to be instantiated per
        analyzer.
    """
    def __init__(self, conn: Connection):
        # various LRU caches
        self.streets = _LRU(maxsize=256)
        self.places = _LRU(maxsize=128)
        self.address_terms = _LRU(maxsize=1024)

        # Look up housenumbers up to 100 and cache them
        with conn.cursor() as cur:
            cur.execute("""SELECT i, ARRAY[getorcreate_housenumber_id(i::text)]::text
                           FROM generate_series(1, 100) as i""")
            self._cached_housenumbers: Dict[str, str] = {str(r[0]): r[1] for r in cur}

        # For postcodes remember the ones that have already been added
        self.postcodes: Set[str] = set()

    def get_housenumber(self, number: str) -> Optional[str]:
        """ Get a housenumber token from the cache.
        """
        return self._cached_housenumbers.get(number)


    def add_postcode(self, conn: Connection, postcode: str) -> None:
        """ Make sure the given postcode is in the database.
        """
        if postcode not in self.postcodes:
            with conn.cursor() as cur:
                cur.execute('SELECT create_postcode_id(%s)', (postcode, ))
            self.postcodes.add(postcode)
53
src/nominatim_db/tokenizer/place_sanitizer.py
Normal file
@@ -0,0 +1,53 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Handler for cleaning name and address tags in place information before it
is handed to the token analysis.
"""
from typing import Optional, List, Mapping, Sequence, Callable, Any, Tuple

from nominatim_core.errors import UsageError
from nominatim_core.config import Configuration
from .sanitizers.config import SanitizerConfig
from .sanitizers.base import SanitizerHandler, ProcessInfo
from ..data.place_name import PlaceName
from ..data.place_info import PlaceInfo


class PlaceSanitizer:
    """ Controller class which applies sanitizer functions on the place
        names and address before they are used by the token analysers.
    """

    def __init__(self, rules: Optional[Sequence[Mapping[str, Any]]],
                 config: Configuration) -> None:
        self.handlers: List[Callable[[ProcessInfo], None]] = []

        if rules:
            for func in rules:
                if 'step' not in func:
                    raise UsageError("Sanitizer rule is missing the 'step' attribute.")
                if not isinstance(func['step'], str):
                    raise UsageError("'step' attribute must be a simple string.")

                module: SanitizerHandler = \
                    config.load_plugin_module(func['step'], 'nominatim.tokenizer.sanitizers')

                self.handlers.append(module.create(SanitizerConfig(func)))

    def process_names(self, place: PlaceInfo) -> Tuple[List[PlaceName], List[PlaceName]]:
        """ Extract a sanitized list of names and address parts from the
            given place. The function returns a tuple
            (list of names, list of address names).
        """
        obj = ProcessInfo(place)

        for func in self.handlers:
            func(obj)

        return obj.names, obj.address
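The sanitizer is no more than a list of handlers applied in order to one
mutable object; a minimal standalone sketch of that control flow (class and
values are illustrative):

```python
class _Obj:
    def __init__(self):
        self.names = ['Main St']
        self.address = []

handlers = [lambda o: o.names.append('Main Street'),
            lambda o: o.address.append('12')]

obj = _Obj()
for func in handlers:
    func(obj)

assert obj.names == ['Main St', 'Main Street']
assert obj.address == ['12']
```
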
0
src/nominatim_db/tokenizer/sanitizers/__init__.py
Normal file
64
src/nominatim_db/tokenizer/sanitizers/base.py
Normal file
@@ -0,0 +1,64 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Common data types and protocols for sanitizers.
"""
from typing import Optional, List, Mapping, Callable

from nominatim_core.typing import Protocol, Final
from ...data.place_info import PlaceInfo
from ...data.place_name import PlaceName
from .config import SanitizerConfig

class ProcessInfo:
    """ Container class for information handed into the handler functions.
        The 'names' and 'address' members are mutable. A handler must change
        them by either modifying the lists in place or replacing the old
        content with a new list.
    """

    def __init__(self, place: PlaceInfo):
        self.place: Final = place
        self.names = self._convert_name_dict(place.name)
        self.address = self._convert_name_dict(place.address)

    @staticmethod
    def _convert_name_dict(names: Optional[Mapping[str, str]]) -> List[PlaceName]:
        """ Convert a dictionary of names into a list of PlaceNames.
            The dictionary key is split into the primary part of the key
            and the suffix (the part after an optional colon).
        """
        out = []

        if names:
            for key, value in names.items():
                parts = key.split(':', 1)
                out.append(PlaceName(value.strip(),
                                     parts[0].strip(),
                                     parts[1].strip() if len(parts) > 1 else None))

        return out

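The suffix split used above, shown standalone: 'name:en' yields the primary
kind 'name' with suffix 'en', while a bare key has no suffix:

```python
def split_key(key: str):
    parts = key.split(':', 1)
    return parts[0].strip(), parts[1].strip() if len(parts) > 1 else None

assert split_key('name:en') == ('name', 'en')
assert split_key('name') == ('name', None)
```
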
class SanitizerHandler(Protocol):
|
||||
""" Protocol for sanitizer modules.
|
||||
"""
|
||||
|
||||
def create(self, config: SanitizerConfig) -> Callable[[ProcessInfo], None]:
|
||||
"""
|
||||
Create a function for sanitizing a place.
|
||||
|
||||
Arguments:
|
||||
config: A dictionary with the additional configuration options
|
||||
specified in the tokenizer configuration
|
||||
|
||||
Return:
|
||||
The result must be a callable that takes a place description
|
||||
and transforms name and address as required.
|
||||
"""
|
||||
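Two points worth illustrating: _convert_name_dict() maps a tag like 'name:en' to a PlaceName with kind 'name' and suffix 'en' (a plain 'name' key gets suffix None), and a sanitizer module only has to provide the module-level create() function from the protocol. A minimal invented module might look like this (the uppercasing step is purely an example, not part of this commit):

# Hypothetical sanitizer module satisfying the SanitizerHandler protocol.
from typing import Callable

from .base import ProcessInfo
from .config import SanitizerConfig


def create(_: SanitizerConfig) -> Callable[[ProcessInfo], None]:
    """ Create a function that uppercases all name values. """
    def _process(obj: ProcessInfo) -> None:
        for name in obj.names:
            # ProcessInfo allows modifying the PlaceName objects in place.
            name.name = name.name.upper()
    return _process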
80
src/nominatim_db/tokenizer/sanitizers/clean_housenumbers.py
Normal file
@@ -0,0 +1,80 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Sanitizer that preprocesses address tags for house numbers. The sanitizer
makes it possible to

* define which tags are to be considered house numbers (see 'filter-kind')
* split house number lists into individual numbers (see 'delimiters')

Arguments:
    delimiters: Define the set of characters to be used for
                splitting a list of house numbers into parts. (default: ',;')
    filter-kind: Define the address tags that are considered to be a
                 house number. Either takes a single string or a list of strings,
                 where each string is a regular expression. An address item
                 is considered a house number if the 'kind' fully matches any
                 of the given regular expressions. (default: 'housenumber')
    convert-to-name: Define house numbers that should be treated as a name
                     instead of a house number. Either takes a single string
                     or a list of strings, where each string is a regular
                     expression that must match the full house number value.
"""
from typing import Callable, Iterator, List

from ...data.place_name import PlaceName
from .base import ProcessInfo
from .config import SanitizerConfig


class _HousenumberSanitizer:

    def __init__(self, config: SanitizerConfig) -> None:
        self.filter_kind = config.get_filter('filter-kind', ['housenumber'])
        self.split_regexp = config.get_delimiter()

        self.filter_name = config.get_filter('convert-to-name', 'FAIL_ALL')


    def __call__(self, obj: ProcessInfo) -> None:
        if not obj.address:
            return

        new_address: List[PlaceName] = []
        for item in obj.address:
            if self.filter_kind(item.kind):
                if self.filter_name(item.name):
                    obj.names.append(item.clone(kind='housenumber'))
                else:
                    new_address.extend(item.clone(kind='housenumber', name=n)
                                       for n in self.sanitize(item.name))
            else:
                # Don't touch other address items.
                new_address.append(item)

        obj.address = new_address


    def sanitize(self, value: str) -> Iterator[str]:
        """ Extract housenumbers in a regularized format from an OSM value.

            The function works as a generator that yields all valid
            housenumbers that can be created from the value.
        """
        for hnr in self.split_regexp.split(value):
            if hnr:
                yield from self._regularize(hnr)


    def _regularize(self, hnr: str) -> Iterator[str]:
        yield hnr


def create(config: SanitizerConfig) -> Callable[[ProcessInfo], None]:
    """ Create a housenumber processing function.
    """

    return _HousenumberSanitizer(config)
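To make the arguments above concrete, here is an invented run of the sanitizer. SanitizerConfig is constructed from the rule dict, as PlaceSanitizer does; the address payload is an example only:

# Hypothetical example: split a house number list on ';' and ','.
step = create(SanitizerConfig({'step': 'clean-housenumbers',
                               'delimiters': ';,'}))

info = ProcessInfo(PlaceInfo({'address': {'housenumber': '4;6,8'}}))
step(info)
# info.address now contains three items of kind 'housenumber':
# '4', '6' and '8'.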
80
src/nominatim_db/tokenizer/sanitizers/clean_postcodes.py
Normal file
@@ -0,0 +1,80 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Sanitizer that filters postcodes by their officially allowed pattern.

Arguments:
    convert-to-address: If set to 'yes' (the default), then postcodes that do
                        not conform with their country-specific pattern are
                        converted to an address component. That means that
                        the postcode does not take part when computing the
                        postcode centroids of a country but is still searchable.
                        When set to 'no', non-conforming postcodes are not
                        searchable either.
    default-pattern: Pattern to use when no pattern is available for the
                     country in question. Warning: this pattern will not be
                     used for objects that have no country assigned. These
                     are always assumed to have no postcode.
"""
from typing import Callable, Optional, Tuple

from ...data.postcode_format import PostcodeFormatter
from .base import ProcessInfo
from .config import SanitizerConfig


class _PostcodeSanitizer:

    def __init__(self, config: SanitizerConfig) -> None:
        self.convert_to_address = config.get_bool('convert-to-address', True)
        self.matcher = PostcodeFormatter()

        default_pattern = config.get('default-pattern')
        if isinstance(default_pattern, str):
            self.matcher.set_default_pattern(default_pattern)


    def __call__(self, obj: ProcessInfo) -> None:
        if not obj.address:
            return

        postcodes = [(i, o) for i, o in enumerate(obj.address) if o.kind == 'postcode']

        # Process in reverse order, so that popping an entry does not shift
        # the positions of the postcodes that are still to be processed.
        for pos, postcode in reversed(postcodes):
            formatted = self.scan(postcode.name, obj.place.country_code)

            if formatted is None:
                if self.convert_to_address:
                    postcode.kind = 'unofficial_postcode'
                else:
                    obj.address.pop(pos)
            else:
                postcode.name = formatted[0]
                postcode.set_attr('variant', formatted[1])


    def scan(self, postcode: str, country: Optional[str]) -> Optional[Tuple[str, str]]:
        """ Check the postcode for correct formatting and return the
            normalized version. Returns None if the postcode does not
            correspond to the official format of the given country.
        """
        match = self.matcher.match(country, postcode)
        if match is None:
            return None

        assert country is not None

        return (self.matcher.normalize(country, match),
                ' '.join(p for p in match.groups() if p is not None))


def create(config: SanitizerConfig) -> Callable[[ProcessInfo], None]:
    """ Create a function that filters postcodes by their officially allowed pattern.
    """

    return _PostcodeSanitizer(config)
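A sketch of the two paths through this sanitizer, with invented data and assuming the stock country pattern for 'de' (five digits):

# Hypothetical example runs with the default configuration.
step = create(SanitizerConfig({'step': 'clean-postcodes'}))

ok = ProcessInfo(PlaceInfo({'address': {'postcode': '12345'},
                            'country_code': 'de'}))
step(ok)    # conforming: name is normalized, kind stays 'postcode'

bad = ProcessInfo(PlaceInfo({'address': {'postcode': 'ABC'},
                             'country_code': 'de'}))
step(bad)   # non-conforming: re-tagged as kind 'unofficial_postcode'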
46
src/nominatim_db/tokenizer/sanitizers/clean_tiger_tags.py
Normal file
@@ -0,0 +1,46 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Sanitizer that preprocesses tags from the TIGER import.

It makes the following changes:

* remove the state reference from tiger:county
"""
from typing import Callable
import re

from .base import ProcessInfo
from .config import SanitizerConfig

COUNTY_MATCH = re.compile('(.*), [A-Z][A-Z]')


def _clean_tiger_county(obj: ProcessInfo) -> None:
    """ Remove the state reference from tiger:county tags.

        This transforms a name like 'Hamilton, AL' into 'Hamilton'.
        If no state reference is detected at the end, the name is left as is.
    """
    if not obj.address:
        return

    for item in obj.address:
        if item.kind == 'tiger' and item.suffix == 'county':
            m = COUNTY_MATCH.fullmatch(item.name)
            if m:
                item.name = m[1]
            # Switch kind and suffix: the key split in ProcessInfo left
            # 'tiger:county' as kind='tiger', suffix='county'.
            item.kind = 'county'
            item.suffix = 'tiger'

            return


def create(_: SanitizerConfig) -> Callable[[ProcessInfo], None]:
    """ Create a function that preprocesses tags from the TIGER import.
    """
    return _clean_tiger_county
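A quick invented check of the docstring's example:

# Hypothetical example: a TIGER county tag before and after sanitizing.
step = create(SanitizerConfig({'step': 'clean-tiger-tags'}))

info = ProcessInfo(PlaceInfo({'address': {'tiger:county': 'Hamilton, AL'}}))
step(info)
# ProcessInfo split 'tiger:county' into kind='tiger', suffix='county';
# the sanitizer strips ', AL' and swaps to kind='county', suffix='tiger'.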
Some files were not shown because too many files have changed in this diff.