split code into submodules

Sarah Hoffmann
2024-05-16 11:55:17 +02:00
parent 0fb4fe8e4d
commit 6e89310a92
137 changed files with 757 additions and 716 deletions

View File

@@ -0,0 +1,10 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Module with functions for importing and updating Nominatim databases
as well as general maintenance helpers.
"""

View File

@@ -0,0 +1,68 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Function to add additional OSM data from a file or the API into the database.
"""
from typing import Any, MutableMapping
from pathlib import Path
import logging
import urllib.parse
from nominatim_core.db.connection import connect
from nominatim_core.utils.url_utils import get_url
from .exec_utils import run_osm2pgsql
LOG = logging.getLogger()
def _run_osm2pgsql(dsn: str, options: MutableMapping[str, Any]) -> None:
run_osm2pgsql(options)
# Handle deletions
with connect(dsn) as conn:
with conn.cursor() as cur:
cur.execute('SELECT flush_deleted_places()')
conn.commit()
def add_data_from_file(dsn: str, fname: str, options: MutableMapping[str, Any]) -> int:
""" Adds data from a OSM file to the database. The file may be a normal
OSM file or a diff file in all formats supported by libosmium.
"""
options['import_file'] = Path(fname)
options['append'] = True
_run_osm2pgsql(dsn, options)
# No status update. We don't know where the file came from.
return 0
def add_osm_object(dsn: str, osm_type: str, osm_id: int, use_main_api: bool,
options: MutableMapping[str, Any]) -> int:
""" Add or update a single OSM object from the latest version of the
API.
"""
if use_main_api:
base_url = f'https://www.openstreetmap.org/api/0.6/{osm_type}/{osm_id}'
if osm_type in ('way', 'relation'):
base_url += '/full'
else:
# use Overpass API
if osm_type == 'node':
data = f'node({osm_id});out meta;'
elif osm_type == 'way':
data = f'(way({osm_id});>;);out meta;'
else:
data = f'(rel(id:{osm_id});>;);out meta;'
base_url = 'https://overpass-api.de/api/interpreter?' \
+ urllib.parse.urlencode({'data': data})
options['append'] = True
options['import_data'] = get_url(base_url).encode('utf-8')
_run_osm2pgsql(dsn, options)
return 0
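# Illustrative usage sketch (not part of the commit): 'options' is the
# osm2pgsql settings mapping consumed by exec_utils.run_osm2pgsql; the
# file name and object id are assumptions for the example.
#
#   add_data_from_file(dsn, 'changes.osc.gz', options)
#   add_osm_object(dsn, 'relation', 1234, use_main_api=True, options=options)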

View File

@@ -0,0 +1,106 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Functions for database analysis and maintenance.
"""
from typing import Optional, Tuple, Any, cast
import logging
from psycopg2.extras import Json, register_hstore
from psycopg2 import DataError
from nominatim_core.typing import DictCursorResult
from nominatim_core.config import Configuration
from nominatim_core.db.connection import connect, Cursor
from nominatim_core.errors import UsageError
from ..tokenizer import factory as tokenizer_factory
from ..data.place_info import PlaceInfo
LOG = logging.getLogger()
def _get_place_info(cursor: Cursor, osm_id: Optional[str],
place_id: Optional[int]) -> DictCursorResult:
sql = """SELECT place_id, extra.*
FROM placex, LATERAL placex_indexing_prepare(placex) as extra
"""
values: Tuple[Any, ...]
if osm_id:
osm_type = osm_id[0].upper()
if osm_type not in 'NWR' or not osm_id[1:].isdigit():
LOG.fatal('OSM ID must be of form <N|W|R><id>. Got: %s', osm_id)
raise UsageError("OSM ID parameter badly formatted")
sql += ' WHERE placex.osm_type = %s AND placex.osm_id = %s'
values = (osm_type, int(osm_id[1:]))
elif place_id is not None:
sql += ' WHERE placex.place_id = %s'
values = (place_id, )
else:
LOG.fatal("No OSM object given to index.")
raise UsageError("OSM object not found")
cursor.execute(sql + ' LIMIT 1', values)
if cursor.rowcount < 1:
LOG.fatal("OSM object %s not found in database.", osm_id)
raise UsageError("OSM object not found")
return cast(DictCursorResult, cursor.fetchone())
def analyse_indexing(config: Configuration, osm_id: Optional[str] = None,
place_id: Optional[int] = None) -> None:
""" Analyse indexing of a single Nominatim object.
"""
with connect(config.get_libpq_dsn()) as conn:
register_hstore(conn)
with conn.cursor() as cur:
place = _get_place_info(cur, osm_id, place_id)
cur.execute("update placex set indexed_status = 2 where place_id = %s",
(place['place_id'], ))
cur.execute("""SET auto_explain.log_min_duration = '0';
SET auto_explain.log_analyze = 'true';
SET auto_explain.log_nested_statements = 'true';
LOAD 'auto_explain';
SET client_min_messages = LOG;
SET log_min_messages = FATAL""")
tokenizer = tokenizer_factory.get_tokenizer_for_db(config)
with tokenizer.name_analyzer() as analyzer:
cur.execute("""UPDATE placex
SET indexed_status = 0, address = %s, token_info = %s,
name = %s, linked_place_id = %s
WHERE place_id = %s""",
(place['address'],
Json(analyzer.process_place(PlaceInfo(place))),
place['name'], place['linked_place_id'], place['place_id']))
# we do not want to keep the results
conn.rollback()
for msg in conn.notices:
print(msg)
def clean_deleted_relations(config: Configuration, age: str) -> None:
""" Clean deleted relations older than a given age
"""
with connect(config.get_libpq_dsn()) as conn:
with conn.cursor() as cur:
try:
cur.execute("""SELECT place_force_delete(p.place_id)
FROM import_polygon_delete d, placex p
WHERE p.osm_type = d.osm_type AND p.osm_id = d.osm_id
AND age(p.indexed_date) > %s::interval""",
(age, ))
except DataError as exc:
raise UsageError('Invalid PostgreSQL time interval format') from exc
conn.commit()
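# Illustrative usage sketch: 'age' may be any string that PostgreSQL accepts
# as an interval, e.g.
#
#   clean_deleted_relations(config, age='1 month')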

View File

@@ -0,0 +1,350 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Collection of functions that check if the database is complete and functional.
"""
from typing import Callable, Optional, Any, Union, Tuple, Mapping, List
from enum import Enum
from textwrap import dedent
from nominatim_core.config import Configuration
from nominatim_core.db.connection import connect, Connection
from nominatim_core.db import properties
from nominatim_core.errors import UsageError
from ..tokenizer import factory as tokenizer_factory
from . import freeze
from ..version import NOMINATIM_VERSION, parse_version
CHECKLIST = []
class CheckState(Enum):
""" Possible states of a check. FATAL stops check execution entirely.
"""
OK = 0
FAIL = 1
FATAL = 2
NOT_APPLICABLE = 3
WARN = 4
CheckResult = Union[CheckState, Tuple[CheckState, Mapping[str, Any]]]
CheckFunc = Callable[[Connection, Configuration], CheckResult]
def _check(hint: Optional[str] = None) -> Callable[[CheckFunc], CheckFunc]:
""" Decorator for checks. It adds the function to the list of
checks to execute and adds the code for printing progress messages.
"""
def decorator(func: CheckFunc) -> CheckFunc:
title = (func.__doc__ or '').split('\n', 1)[0].strip()
def run_check(conn: Connection, config: Configuration) -> CheckState:
print(title, end=' ... ')
ret = func(conn, config)
if isinstance(ret, tuple):
ret, params = ret
else:
params = {}
if ret == CheckState.OK:
print('\033[92mOK\033[0m')
elif ret == CheckState.WARN:
print('\033[93mWARNING\033[0m')
if hint:
print('')
print(dedent(hint.format(**params)))
elif ret == CheckState.NOT_APPLICABLE:
print('not applicable')
else:
print('\033[31mFailed\033[0m')
if hint:
print(dedent(hint.format(**params)))
return ret
CHECKLIST.append(run_check)
return run_check
return decorator
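# A minimal sketch of how a check is declared with the decorator above
# (hypothetical check, not part of the actual checklist):
#
#   @_check(hint="Table {table} is missing. Re-run the import.")
#   def check_example_table(conn: Connection, _: Configuration) -> CheckResult:
#       """ Checking for example table
#       """
#       if conn.table_exists('example'):
#           return CheckState.OK
#       return CheckState.FAIL, dict(table='example')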
class _BadConnection:
def __init__(self, msg: str) -> None:
self.msg = msg
def close(self) -> None:
""" Dummy function to provide the implementation.
"""
def check_database(config: Configuration) -> int:
""" Run a number of checks on the database and return the status.
"""
try:
conn = connect(config.get_libpq_dsn()).connection
except UsageError as err:
conn = _BadConnection(str(err)) # type: ignore[assignment]
overall_result = 0
for check in CHECKLIST:
ret = check(conn, config)
if ret == CheckState.FATAL:
conn.close()
return 1
if ret in (CheckState.FATAL, CheckState.FAIL):
overall_result = 1
conn.close()
return overall_result
def _get_indexes(conn: Connection) -> List[str]:
indexes = ['idx_place_addressline_address_place_id',
'idx_placex_rank_search',
'idx_placex_rank_address',
'idx_placex_parent_place_id',
'idx_placex_geometry_reverse_lookuppolygon',
'idx_placex_geometry_placenode',
'idx_osmline_parent_place_id',
'idx_osmline_parent_osm_id',
'idx_postcode_id',
'idx_postcode_postcode'
]
if conn.table_exists('search_name'):
indexes.extend(('idx_search_name_nameaddress_vector',
'idx_search_name_name_vector',
'idx_search_name_centroid'))
if conn.server_version_tuple() >= (11, 0, 0):
indexes.extend(('idx_placex_housenumber',
'idx_osmline_parent_osm_id_with_hnr'))
if conn.table_exists('place'):
indexes.extend(('idx_location_area_country_place_id',
'idx_place_osm_unique',
'idx_placex_rank_address_sector',
'idx_placex_rank_boundaries_sector'))
return indexes
# CHECK FUNCTIONS
#
# Functions are executed in the order they appear here.
@_check(hint="""\
{error}
Hints:
* Is the database server started?
* Check the NOMINATIM_DATABASE_DSN variable in your local .env
* Try connecting to the database with the same settings
Project directory: {config.project_dir}
Current setting of NOMINATIM_DATABASE_DSN: {config.DATABASE_DSN}
""")
def check_connection(conn: Any, config: Configuration) -> CheckResult:
""" Checking database connection
"""
if isinstance(conn, _BadConnection):
return CheckState.FATAL, dict(error=conn.msg, config=config)
return CheckState.OK
@_check(hint="""\
Database version ({db_version}) doesn't match Nominatim version ({nom_version})
Hints:
* Are you connecting to the correct database?
{instruction}
Check the Migration chapter of the Administration Guide.
Project directory: {config.project_dir}
Current setting of NOMINATIM_DATABASE_DSN: {config.DATABASE_DSN}
""")
def check_database_version(conn: Connection, config: Configuration) -> CheckResult:
""" Checking database_version matches Nominatim software version
"""
if conn.table_exists('nominatim_properties'):
db_version_str = properties.get_property(conn, 'database_version')
else:
db_version_str = None
if db_version_str is not None:
db_version = parse_version(db_version_str)
if db_version == NOMINATIM_VERSION:
return CheckState.OK
instruction = (
'Run migrations: nominatim admin --migrate'
if db_version < NOMINATIM_VERSION
else 'You need to upgrade the Nominatim software.'
)
else:
instruction = ''
return CheckState.FATAL, dict(db_version=db_version_str,
nom_version=NOMINATIM_VERSION,
instruction=instruction,
config=config)
@_check(hint="""\
placex table not found
Hints:
* Are you connecting to the correct database?
* Did the import process finish without errors?
Project directory: {config.project_dir}
Current setting of NOMINATIM_DATABASE_DSN: {config.DATABASE_DSN}
""")
def check_placex_table(conn: Connection, config: Configuration) -> CheckResult:
""" Checking for placex table
"""
if conn.table_exists('placex'):
return CheckState.OK
return CheckState.FATAL, dict(config=config)
@_check(hint="""placex table has no data. Did the import finish successfully?""")
def check_placex_size(conn: Connection, _: Configuration) -> CheckResult:
""" Checking for placex content
"""
with conn.cursor() as cur:
cnt = cur.scalar('SELECT count(*) FROM (SELECT * FROM placex LIMIT 100) x')
return CheckState.OK if cnt > 0 else CheckState.FATAL
@_check(hint="""{msg}""")
def check_tokenizer(_: Connection, config: Configuration) -> CheckResult:
""" Checking that tokenizer works
"""
try:
tokenizer = tokenizer_factory.get_tokenizer_for_db(config)
except UsageError:
return CheckState.FAIL, dict(msg="""\
Cannot load tokenizer. Did the import finish successfully?""")
result = tokenizer.check_database(config)
if result is None:
return CheckState.OK
return CheckState.FAIL, dict(msg=result)
@_check(hint="""\
Wikipedia/Wikidata importance tables missing.
Quality of search results may be degraded. Reverse geocoding is unaffected.
See https://nominatim.org/release-docs/latest/admin/Import/#wikipediawikidata-rankings
""")
def check_existance_wikipedia(conn: Connection, _: Configuration) -> CheckResult:
""" Checking for wikipedia/wikidata data
"""
if not conn.table_exists('search_name') or not conn.table_exists('place'):
return CheckState.NOT_APPLICABLE
with conn.cursor() as cur:
if conn.table_exists('wikimedia_importance'):
cnt = cur.scalar('SELECT count(*) FROM wikimedia_importance')
else:
cnt = cur.scalar('SELECT count(*) FROM wikipedia_article')
return CheckState.WARN if cnt == 0 else CheckState.OK
@_check(hint="""\
The indexing didn't finish. {count} entries are not yet indexed.
To index the remaining entries, run: {index_cmd}
""")
def check_indexing(conn: Connection, _: Configuration) -> CheckResult:
""" Checking indexing status
"""
with conn.cursor() as cur:
cnt = cur.scalar('SELECT count(*) FROM placex WHERE indexed_status > 0')
if cnt == 0:
return CheckState.OK
if freeze.is_frozen(conn):
index_cmd="""\
Database is marked frozen, it cannot be updated.
Low counts of unindexed places are fine."""
return CheckState.WARN, dict(count=cnt, index_cmd=index_cmd)
if conn.index_exists('idx_placex_rank_search'):
# Likely just an interrupted update.
index_cmd = 'nominatim index'
else:
# Looks like the import process got interrupted.
index_cmd = 'nominatim import --continue indexing'
return CheckState.FAIL, dict(count=cnt, index_cmd=index_cmd)
@_check(hint="""\
The following indexes are missing:
{indexes}
Rerun the index creation with: nominatim import --continue db-postprocess
""")
def check_database_indexes(conn: Connection, _: Configuration) -> CheckResult:
""" Checking that database indexes are complete
"""
missing = []
for index in _get_indexes(conn):
if not conn.index_exists(index):
missing.append(index)
if missing:
return CheckState.FAIL, dict(indexes='\n '.join(missing))
return CheckState.OK
@_check(hint="""\
At least one index is invalid. That can happen, e.g. when index creation was
disrupted and later restarted. You should delete the affected indices
and recreate them.
Invalid indexes:
{indexes}
""")
def check_database_index_valid(conn: Connection, _: Configuration) -> CheckResult:
""" Checking that all database indexes are valid
"""
with conn.cursor() as cur:
cur.execute(""" SELECT relname FROM pg_class, pg_index
WHERE pg_index.indisvalid = false
AND pg_index.indexrelid = pg_class.oid""")
broken = [c[0] for c in cur]
if broken:
return CheckState.FAIL, dict(indexes='\n '.join(broken))
return CheckState.OK
@_check(hint="""\
{error}
Run TIGER import again: nominatim add-data --tiger-data <DIR>
""")
def check_tiger_table(conn: Connection, config: Configuration) -> CheckResult:
""" Checking TIGER external data table.
"""
if not config.get_bool('USE_US_TIGER_DATA'):
return CheckState.NOT_APPLICABLE
if not conn.table_exists('location_property_tiger'):
return CheckState.FAIL, dict(error='TIGER data table not found.')
with conn.cursor() as cur:
if cur.scalar('SELECT count(*) FROM location_property_tiger') == 0:
return CheckState.FAIL, dict(error='TIGER data table is empty.')
return CheckState.OK

View File

@@ -0,0 +1,166 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Collection of host system information including software versions, memory,
storage, and database configuration.
"""
import os
import subprocess
import sys
from pathlib import Path
from typing import List, Optional, Tuple, Union
import psutil
from psycopg2.extensions import make_dsn, parse_dsn
from nominatim_core.config import Configuration
from nominatim_core.db.connection import connect
from ..version import NOMINATIM_VERSION
def convert_version(ver_tup: Tuple[int, int]) -> str:
"""converts tuple version (ver_tup) to a string representation"""
return ".".join(map(str, ver_tup))
def friendly_memory_string(mem: float) -> str:
"""Create a user friendly string for the amount of memory specified as mem"""
mem_magnitude = ("bytes", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")
mag = 0
# determine order of magnitude
while mem > 1000:
mem /= 1000
mag += 1
return f"{mem:.1f} {mem_magnitude[mag]}"
def run_command(cmd: Union[str, List[str]]) -> str:
"""Runs a command using the shell and returns the output from stdout"""
try:
if sys.version_info < (3, 7):
cap_out = subprocess.run(cmd, stdout=subprocess.PIPE, check=False)
else:
cap_out = subprocess.run(cmd, capture_output=True, check=False)
return cap_out.stdout.decode("utf-8")
except FileNotFoundError:
# non-Linux systems should end up here
return f"Unknown (unable to find the '{cmd}' command)"
def os_name_info() -> str:
"""Obtain Operating System Name (and possibly the version)"""
os_info = None
# man page os-release(5) details meaning of the fields
if Path("/etc/os-release").is_file():
os_info = from_file_find_line_portion(
"/etc/os-release", "PRETTY_NAME", "=")
# alternative location
elif Path("/usr/lib/os-release").is_file():
os_info = from_file_find_line_portion(
"/usr/lib/os-release", "PRETTY_NAME", "="
)
# fallback on Python's os name
if os_info is None or os_info == "":
os_info = os.name
# if the above is insufficient, take a look at neofetch's approach to OS detection
return os_info
# Note: Intended to be used on informational files like /proc
def from_file_find_line_portion(
filename: str, start: str, sep: str, fieldnum: int = 1
) -> Optional[str]:
"""open filename, finds the line starting with the 'start' string.
Splits the line using separator and returns a "fieldnum" from the split."""
with open(filename, encoding='utf8') as file:
result = ""
for line in file:
if line.startswith(start):
result = line.split(sep)[fieldnum].strip()
return result
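# For example, on a typical Linux system
# from_file_find_line_portion('/etc/os-release', 'PRETTY_NAME', '=')
# returns the value as it appears in the file, e.g. '"Ubuntu 22.04 LTS"'.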
def get_postgresql_config(version: int) -> str:
"""Retrieve postgres configuration file"""
try:
with open(f"/etc/postgresql/{version}/main/postgresql.conf", encoding='utf8') as file:
db_config = file.read()
return db_config
except IOError:
return f"**Could not read '/etc/postgresql/{version}/main/postgresql.conf'**"
def report_system_information(config: Configuration) -> None:
"""Generate a report about the host system including software versions, memory,
storage, and database configuration."""
with connect(make_dsn(config.get_libpq_dsn(), dbname='postgres')) as conn:
postgresql_ver: str = convert_version(conn.server_version_tuple())
with conn.cursor() as cur:
num = cur.scalar("SELECT count(*) FROM pg_catalog.pg_database WHERE datname=%s",
(parse_dsn(config.get_libpq_dsn())['dbname'], ))
nominatim_db_exists = num == 1 if isinstance(num, int) else False
if nominatim_db_exists:
with connect(config.get_libpq_dsn()) as conn:
postgis_ver: str = convert_version(conn.postgis_version_tuple())
else:
postgis_ver = "Unable to connect to database"
postgresql_config: str = get_postgresql_config(int(float(postgresql_ver)))
# Note: psutil.disk_partitions() is similar to run_command("lsblk")
# Note: run_command("systemd-detect-virt") only works on Linux, on other OSes
# should give a message: "Unknown (unable to find the 'systemd-detect-virt' command)"
# Generates the Markdown report.
report = f"""
**Instructions**
Use this information in your issue report at https://github.com/osm-search/Nominatim/issues
Redirect the output to a file:
$ ./collect_os_info.py > report.md
**Software Environment:**
- Python version: {sys.version}
- Nominatim version: {NOMINATIM_VERSION!s}
- PostgreSQL version: {postgresql_ver}
- PostGIS version: {postgis_ver}
- OS: {os_name_info()}
**Hardware Configuration:**
- RAM: {friendly_memory_string(psutil.virtual_memory().total)}
- number of CPUs: {psutil.cpu_count(logical=False)}
- bare metal/AWS/other cloud service (per systemd-detect-virt(1)): {run_command("systemd-detect-virt")}
- type and size of disks:
**`df -h` - report file system disk space usage:**
```
{run_command(["df", "-h"])}
```
**lsblk - list block devices:**
```
{run_command("lsblk")}
```
**Postgresql Configuration:**
```
{postgresql_config}
```
**Notes**
Please add any notes about anything above that is incorrect.
"""
print(report)

View File

@@ -0,0 +1,265 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Exporting a Nominatim database to SQLite.
"""
from typing import Set, Any
import datetime as dt
import logging
from pathlib import Path
import sqlalchemy as sa
import nominatim_api as napi
from nominatim_api.search.query_analyzer_factory import make_query_analyzer
from nominatim_core.typing import SaSelect, SaRow
from nominatim_core.db.sqlalchemy_types import Geometry, IntArray
LOG = logging.getLogger()
async def convert(project_dir: Path, outfile: Path, options: Set[str]) -> None:
""" Export an existing database to sqlite. The resulting database
will be usable against the Python frontend of Nominatim.
"""
api = napi.NominatimAPIAsync(project_dir)
try:
outapi = napi.NominatimAPIAsync(project_dir,
{'NOMINATIM_DATABASE_DSN': f"sqlite:dbname={outfile}",
'NOMINATIM_DATABASE_RW': '1'})
try:
async with api.begin() as src, outapi.begin() as dest:
writer = SqliteWriter(src, dest, options)
await writer.write()
finally:
await outapi.close()
finally:
await api.close()
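# Illustrative usage sketch, assuming the current directory is a project
# directory and the search tables should be included:
#
#   import asyncio
#   asyncio.run(convert(Path('.'), Path('nominatim.sqlite'), {'search'}))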
class SqliteWriter:
""" Worker class which creates a new SQLite database.
"""
def __init__(self, src: napi.SearchConnection,
dest: napi.SearchConnection, options: Set[str]) -> None:
self.src = src
self.dest = dest
self.options = options
async def write(self) -> None:
""" Create the database structure and copy the data from
the source database to the destination.
"""
LOG.warning('Setting up spatialite')
await self.dest.execute(sa.select(sa.func.InitSpatialMetaData(True, 'WGS84')))
await self.create_tables()
await self.copy_data()
if 'search' in self.options:
await self.create_word_table()
await self.create_indexes()
async def create_tables(self) -> None:
""" Set up the database tables.
"""
LOG.warning('Setting up tables')
if 'search' not in self.options:
self.dest.t.meta.remove(self.dest.t.search_name)
else:
await self.create_class_tables()
await self.dest.connection.run_sync(self.dest.t.meta.create_all)
# Convert all Geometry columns to Spatialite geometries
for table in self.dest.t.meta.sorted_tables:
for col in table.c:
if isinstance(col.type, Geometry):
await self.dest.execute(sa.select(
sa.func.RecoverGeometryColumn(table.name, col.name, 4326,
col.type.subtype.upper(), 'XY')))
async def create_class_tables(self) -> None:
""" Set up the table that serve class/type-specific geometries.
"""
sql = sa.text("""SELECT tablename FROM pg_tables
WHERE tablename LIKE 'place_classtype_%'""")
for res in await self.src.execute(sql):
for db in (self.src, self.dest):
sa.Table(res[0], db.t.meta,
sa.Column('place_id', sa.BigInteger),
sa.Column('centroid', Geometry))
async def create_word_table(self) -> None:
""" Create the word table.
This table needs the property information to determine the
correct format. Therefore it needs to be created after all other
data has been copied.
"""
await make_query_analyzer(self.src)
await make_query_analyzer(self.dest)
src = self.src.t.meta.tables['word']
dest = self.dest.t.meta.tables['word']
await self.dest.connection.run_sync(dest.create)
LOG.warning("Copying word table")
async_result = await self.src.connection.stream(sa.select(src))
async for partition in async_result.partitions(10000):
data = [{k: getattr(r, k) for k in r._fields} for r in partition]
await self.dest.execute(dest.insert(), data)
await self.dest.connection.run_sync(sa.Index('idx_word_woken', dest.c.word_token).create)
async def copy_data(self) -> None:
""" Copy data for all registered tables.
"""
def _getfield(row: SaRow, key: str) -> Any:
value = getattr(row, key)
if isinstance(value, dt.datetime):
if value.tzinfo is not None:
value = value.astimezone(dt.timezone.utc)
return value
for table in self.dest.t.meta.sorted_tables:
LOG.warning("Copying '%s'", table.name)
async_result = await self.src.connection.stream(self.select_from(table.name))
async for partition in async_result.partitions(10000):
data = [{('class_' if k == 'class' else k): _getfield(r, k)
for k in r._fields}
for r in partition]
await self.dest.execute(table.insert(), data)
# Set up a minimal copy of pg_tables used to look up the class tables later.
pg_tables = sa.Table('pg_tables', self.dest.t.meta,
sa.Column('schemaname', sa.Text, default='public'),
sa.Column('tablename', sa.Text))
await self.dest.connection.run_sync(pg_tables.create)
data = [{'tablename': t} for t in self.dest.t.meta.tables]
await self.dest.execute(pg_tables.insert().values(data))
async def create_indexes(self) -> None:
""" Add indexes necessary for the frontend.
"""
# reverse place node lookup needs an extra table to simulate a
# partial index with adaptive buffering.
await self.dest.execute(sa.text(
""" CREATE TABLE placex_place_node_areas AS
SELECT place_id, ST_Expand(geometry,
14.0 * exp(-0.2 * rank_search) - 0.03) as geometry
FROM placex
WHERE rank_address between 5 and 25
and osm_type = 'N'
and linked_place_id is NULL """))
await self.dest.execute(sa.select(
sa.func.RecoverGeometryColumn('placex_place_node_areas', 'geometry',
4326, 'GEOMETRY', 'XY')))
await self.dest.execute(sa.select(sa.func.CreateSpatialIndex(
'placex_place_node_areas', 'geometry')))
# Remaining indexes.
await self.create_spatial_index('country_grid', 'geometry')
await self.create_spatial_index('placex', 'geometry')
await self.create_spatial_index('osmline', 'linegeo')
await self.create_spatial_index('tiger', 'linegeo')
await self.create_index('placex', 'place_id')
await self.create_index('placex', 'parent_place_id')
await self.create_index('placex', 'rank_address')
await self.create_index('addressline', 'place_id')
await self.create_index('postcode', 'place_id')
await self.create_index('osmline', 'place_id')
await self.create_index('tiger', 'place_id')
if 'search' in self.options:
await self.create_spatial_index('postcode', 'geometry')
await self.create_spatial_index('search_name', 'centroid')
await self.create_index('search_name', 'place_id')
await self.create_index('osmline', 'parent_place_id')
await self.create_index('tiger', 'parent_place_id')
await self.create_search_index()
for t in self.dest.t.meta.tables:
if t.startswith('place_classtype_'):
await self.dest.execute(sa.select(
sa.func.CreateSpatialIndex(t, 'centroid')))
async def create_spatial_index(self, table: str, column: str) -> None:
""" Create a spatial index on the given table and column.
"""
await self.dest.execute(sa.select(
sa.func.CreateSpatialIndex(getattr(self.dest.t, table).name, column)))
async def create_index(self, table_name: str, column: str) -> None:
""" Create a simple index on the given table and column.
"""
table = getattr(self.dest.t, table_name)
await self.dest.connection.run_sync(
sa.Index(f"idx_{table}_{column}", getattr(table.c, column)).create)
async def create_search_index(self) -> None:
""" Create the tables and indexes needed for word lookup.
"""
LOG.warning("Creating reverse search table")
rsn = sa.Table('reverse_search_name', self.dest.t.meta,
sa.Column('word', sa.Integer()),
sa.Column('column', sa.Text()),
sa.Column('places', IntArray))
await self.dest.connection.run_sync(rsn.create)
tsrc = self.src.t.search_name
for column in ('name_vector', 'nameaddress_vector'):
sql = sa.select(sa.func.unnest(getattr(tsrc.c, column)).label('word'),
sa.func.ArrayAgg(tsrc.c.place_id).label('places'))\
.group_by('word')
async_result = await self.src.connection.stream(sql)
async for partition in async_result.partitions(100):
data = []
for row in partition:
row.places.sort()
data.append({'word': row.word,
'column': column,
'places': row.places})
await self.dest.execute(rsn.insert(), data)
await self.dest.connection.run_sync(
sa.Index('idx_reverse_search_name_word', rsn.c.word).create)
def select_from(self, table: str) -> SaSelect:
""" Create the SQL statement to select the source columns and rows.
"""
columns = self.src.t.meta.tables[table].c
if table == 'placex':
# SQLite struggles with Geometries that are larger than 5MB,
# so simplify those.
return sa.select(*(c for c in columns if not isinstance(c.type, Geometry)),
sa.func.ST_AsText(columns.centroid).label('centroid'),
sa.func.ST_AsText(
sa.case((sa.func.ST_MemSize(columns.geometry) < 5000000,
columns.geometry),
else_=sa.func.ST_SimplifyPreserveTopology(
columns.geometry, 0.0001)
)).label('geometry'))
sql = sa.select(*(sa.func.ST_AsText(c).label(c.name)
if isinstance(c.type, Geometry) else c for c in columns))
return sql

View File

@@ -0,0 +1,272 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Functions for setting up and importing a new Nominatim database.
"""
from typing import Tuple, Optional, Union, Sequence, MutableMapping, Any
import logging
import os
import selectors
import subprocess
from pathlib import Path
import psutil
from psycopg2 import sql as pysql
from nominatim_core.errors import UsageError
from nominatim_core.config import Configuration
from nominatim_core.db.connection import connect, get_pg_env, Connection
from nominatim_core.db.async_connection import DBConnection
from nominatim_core.db.sql_preprocessor import SQLPreprocessor
from .exec_utils import run_osm2pgsql
from ..version import POSTGRESQL_REQUIRED_VERSION, POSTGIS_REQUIRED_VERSION
LOG = logging.getLogger()
def _require_version(module: str, actual: Tuple[int, int], expected: Tuple[int, int]) -> None:
""" Compares the version for the given module and raises an exception
if the actual version is too old.
"""
if actual < expected:
LOG.fatal('Minimum supported version of %s is %d.%d. '
'Found version %d.%d.',
module, expected[0], expected[1], actual[0], actual[1])
raise UsageError(f'{module} is too old.')
def _require_loaded(extension_name: str, conn: Connection) -> None:
""" Check that the given extension is loaded. """
if not conn.extension_loaded(extension_name):
LOG.fatal('Required module %s is not loaded.', extension_name)
raise UsageError(f'{extension_name} is not loaded.')
def check_existing_database_plugins(dsn: str) -> None:
""" Check that the database has the required plugins installed."""
with connect(dsn) as conn:
_require_version('PostgreSQL server',
conn.server_version_tuple(),
POSTGRESQL_REQUIRED_VERSION)
_require_version('PostGIS',
conn.postgis_version_tuple(),
POSTGIS_REQUIRED_VERSION)
_require_loaded('hstore', conn)
def setup_database_skeleton(dsn: str, rouser: Optional[str] = None) -> None:
""" Create a new database for Nominatim and populate it with the
essential extensions.
The function fails when the database already exists or the PostgreSQL or
PostGIS versions are too old.
Uses `createdb` to create the database.
If 'rouser' is given, the function also checks that a user
with that name exists.
The caller must have superuser rights.
"""
proc = subprocess.run(['createdb'], env=get_pg_env(dsn), check=False)
if proc.returncode != 0:
raise UsageError('Creating new database failed.')
with connect(dsn) as conn:
_require_version('PostgreSQL server',
conn.server_version_tuple(),
POSTGRESQL_REQUIRED_VERSION)
if rouser is not None:
with conn.cursor() as cur:
cnt = cur.scalar('SELECT count(*) FROM pg_user where usename = %s',
(rouser, ))
if cnt == 0:
LOG.fatal("Web user '%s' does not exist. Create it with:\n"
"\n createuser %s", rouser, rouser)
raise UsageError('Missing read-only user.')
# Create extensions.
with conn.cursor() as cur:
cur.execute('CREATE EXTENSION IF NOT EXISTS hstore')
cur.execute('CREATE EXTENSION IF NOT EXISTS postgis')
postgis_version = conn.postgis_version_tuple()
if postgis_version[0] >= 3:
cur.execute('CREATE EXTENSION IF NOT EXISTS postgis_raster')
conn.commit()
_require_version('PostGIS',
conn.postgis_version_tuple(),
POSTGIS_REQUIRED_VERSION)
def import_osm_data(osm_files: Union[Path, Sequence[Path]],
options: MutableMapping[str, Any],
drop: bool = False, ignore_errors: bool = False) -> None:
""" Import the given OSM files. 'options' contains the list of
default settings for osm2pgsql.
"""
options['import_file'] = osm_files
options['append'] = False
options['threads'] = 1
if not options['flatnode_file'] and options['osm2pgsql_cache'] == 0:
# Make some educated guesses about cache size based on the size
# of the import file and the available memory.
mem = psutil.virtual_memory()
fsize = 0
if isinstance(osm_files, list):
for fname in osm_files:
fsize += os.stat(str(fname)).st_size
else:
fsize = os.stat(str(osm_files)).st_size
options['osm2pgsql_cache'] = int(min((mem.available + mem.cached) * 0.75,
fsize * 2) / 1024 / 1024) + 1
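# Worked example with illustrative numbers: given 8GB of available+cached
# memory and a 1GB OSM file, this picks min(8GB * 0.75, 2 * 1GB), i.e.
# roughly 2GB of node cache.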
run_osm2pgsql(options)
with connect(options['dsn']) as conn:
if not ignore_errors:
with conn.cursor() as cur:
cur.execute('SELECT * FROM place LIMIT 1')
if cur.rowcount == 0:
raise UsageError('No data imported by osm2pgsql.')
if drop:
conn.drop_table('planet_osm_nodes')
if drop and options['flatnode_file']:
Path(options['flatnode_file']).unlink()
def create_tables(conn: Connection, config: Configuration, reverse_only: bool = False) -> None:
""" Create the set of basic tables.
When `reverse_only` is True, then the main table for searching will
be skipped and only reverse search is possible.
"""
sql = SQLPreprocessor(conn, config)
sql.env.globals['db']['reverse_only'] = reverse_only
sql.run_sql_file(conn, 'tables.sql')
def create_table_triggers(conn: Connection, config: Configuration) -> None:
""" Create the triggers for the tables. The trigger functions must already
have been imported with refresh.create_functions().
"""
sql = SQLPreprocessor(conn, config)
sql.run_sql_file(conn, 'table-triggers.sql')
def create_partition_tables(conn: Connection, config: Configuration) -> None:
""" Create tables that have explicit partitioning.
"""
sql = SQLPreprocessor(conn, config)
sql.run_sql_file(conn, 'partition-tables.src.sql')
def truncate_data_tables(conn: Connection) -> None:
""" Truncate all data tables to prepare for a fresh load.
"""
with conn.cursor() as cur:
cur.execute('TRUNCATE placex')
cur.execute('TRUNCATE place_addressline')
cur.execute('TRUNCATE location_area')
cur.execute('TRUNCATE location_area_country')
cur.execute('TRUNCATE location_property_tiger')
cur.execute('TRUNCATE location_property_osmline')
cur.execute('TRUNCATE location_postcode')
if conn.table_exists('search_name'):
cur.execute('TRUNCATE search_name')
cur.execute('DROP SEQUENCE IF EXISTS seq_place')
cur.execute('CREATE SEQUENCE seq_place start 100000')
cur.execute("""SELECT tablename FROM pg_tables
WHERE tablename LIKE 'location_road_%'""")
for table in [r[0] for r in list(cur)]:
cur.execute('TRUNCATE ' + table)
conn.commit()
_COPY_COLUMNS = pysql.SQL(',').join(map(pysql.Identifier,
('osm_type', 'osm_id', 'class', 'type',
'name', 'admin_level', 'address',
'extratags', 'geometry')))
def load_data(dsn: str, threads: int) -> None:
""" Copy data into the word and placex table.
"""
sel = selectors.DefaultSelector()
# Copy data from place to placex in <threads - 1> chunks.
place_threads = max(1, threads - 1)
for imod in range(place_threads):
conn = DBConnection(dsn)
conn.connect()
conn.perform(
pysql.SQL("""INSERT INTO placex ({columns})
SELECT {columns} FROM place
WHERE osm_id % {total} = {mod}
AND NOT (class='place' and (type='houses' or type='postcode'))
AND ST_IsValid(geometry)
""").format(columns=_COPY_COLUMNS,
total=pysql.Literal(place_threads),
mod=pysql.Literal(imod)))
sel.register(conn, selectors.EVENT_READ, conn)
# Address interpolations go into another table.
conn = DBConnection(dsn)
conn.connect()
conn.perform("""INSERT INTO location_property_osmline (osm_id, address, linegeo)
SELECT osm_id, address, geometry FROM place
WHERE class='place' and type='houses' and osm_type='W'
and ST_GeometryType(geometry) = 'ST_LineString'
""")
sel.register(conn, selectors.EVENT_READ, conn)
# Now wait for all of them to finish.
todo = place_threads + 1
while todo > 0:
for key, _ in sel.select(1):
conn = key.data
sel.unregister(conn)
conn.wait()
conn.close()
todo -= 1
print('.', end='', flush=True)
print('\n')
with connect(dsn) as syn_conn:
with syn_conn.cursor() as cur:
cur.execute('ANALYSE')
def create_search_indices(conn: Connection, config: Configuration,
drop: bool = False, threads: int = 1) -> None:
""" Create tables that have explicit partitioning.
"""
# If a previous index creation failed and left invalid indices behind,
# drop them first, so that they will be recreated.
with conn.cursor() as cur:
cur.execute("""SELECT relname FROM pg_class, pg_index
WHERE pg_index.indisvalid = false
AND pg_index.indexrelid = pg_class.oid""")
bad_indices = [row[0] for row in list(cur)]
for idx in bad_indices:
LOG.info("Drop invalid index %s.", idx)
cur.execute(pysql.SQL('DROP INDEX {}').format(pysql.Identifier(idx)))
conn.commit()
sql = SQLPreprocessor(conn, config)
sql.run_parallel_sql_file(config.get_libpq_dsn(),
'indices.sql', min(8, threads), drop=drop)

View File

@@ -0,0 +1,84 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Helper functions for executing external programs.
"""
from typing import Any, Mapping
import logging
import os
import subprocess
import shutil
from nominatim_core.typing import StrPath
from nominatim_core.db.connection import get_pg_env
LOG = logging.getLogger()
def run_php_server(server_address: str, base_dir: StrPath) -> None:
""" Run the built-in server from the given directory.
"""
subprocess.run(['/usr/bin/env', 'php', '-S', server_address],
cwd=str(base_dir), check=True)
def run_osm2pgsql(options: Mapping[str, Any]) -> None:
""" Run osm2pgsql with the given options.
"""
env = get_pg_env(options['dsn'])
osm2pgsql_cmd = options['osm2pgsql']
if osm2pgsql_cmd is None:
osm2pgsql_cmd = shutil.which('osm2pgsql')
if osm2pgsql_cmd is None:
raise RuntimeError('osm2pgsql executable not found. Please install osm2pgsql first.')
cmd = [str(osm2pgsql_cmd),
'--slim',
'--log-progress', 'true',
'--number-processes', '1' if options['append'] else str(options['threads']),
'--cache', str(options['osm2pgsql_cache']),
'--style', str(options['osm2pgsql_style'])
]
if str(options['osm2pgsql_style']).endswith('.lua'):
env['LUA_PATH'] = ';'.join((str(options['osm2pgsql_style_path'] / '?.lua'),
os.environ.get('LUAPATH', ';')))
cmd.extend(('--output', 'flex'))
else:
cmd.extend(('--output', 'gazetteer', '--hstore', '--latlon'))
cmd.append('--append' if options['append'] else '--create')
if options['flatnode_file']:
cmd.extend(('--flat-nodes', options['flatnode_file']))
for key, param in (('slim_data', '--tablespace-slim-data'),
('slim_index', '--tablespace-slim-index'),
('main_data', '--tablespace-main-data'),
('main_index', '--tablespace-main-index')):
if options['tablespaces'][key]:
cmd.extend((param, options['tablespaces'][key]))
if options['tablespaces']['main_data']:
env['NOMINATIM_TABLESPACE_PLACE_DATA'] = options['tablespaces']['main_data']
if options['tablespaces']['main_index']:
env['NOMINATIM_TABLESPACE_PLACE_INDEX'] = options['tablespaces']['main_index']
if options.get('disable_jit', False):
env['PGOPTIONS'] = '-c jit=off -c max_parallel_workers_per_gather=0'
if 'import_data' in options:
cmd.extend(('-r', 'xml', '-'))
elif isinstance(options['import_file'], list):
for fname in options['import_file']:
cmd.append(str(fname))
else:
cmd.append(str(options['import_file']))
subprocess.run(cmd, cwd=options.get('cwd', '.'),
input=options.get('import_data'),
env=env, check=True)
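# A minimal sketch of the options mapping this function consumes; the values
# shown are assumptions for illustration (paths are pathlib.Path objects):
#
#   options = {'dsn': 'dbname=nominatim', 'osm2pgsql': None, 'append': False,
#              'threads': 1, 'osm2pgsql_cache': 1000,
#              'osm2pgsql_style': 'import-full.lua',
#              'osm2pgsql_style_path': Path('/usr/share/nominatim/lib-lua'),
#              'flatnode_file': '', 'import_file': Path('planet.osm.pbf'),
#              'tablespaces': {'slim_data': '', 'slim_index': '',
#                              'main_data': '', 'main_index': ''}}
#   run_osm2pgsql(options)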

View File

@@ -0,0 +1,58 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Functions for removing unnecessary data from the database.
"""
from typing import Optional
from pathlib import Path
from psycopg2 import sql as pysql
from nominatim_core.db.connection import Connection
UPDATE_TABLES = [
'address_levels',
'gb_postcode',
'import_osmosis_log',
'import_polygon_%',
'location_area%',
'location_road%',
'place',
'planet_osm_%',
'search_name_%',
'us_postcode',
'wikipedia_%'
]
def drop_update_tables(conn: Connection) -> None:
""" Drop all tables only necessary for updating the database from
OSM replication data.
"""
parts = (pysql.SQL("(tablename LIKE {})").format(pysql.Literal(t)) for t in UPDATE_TABLES)
with conn.cursor() as cur:
cur.execute(pysql.SQL("SELECT tablename FROM pg_tables WHERE ")
+ pysql.SQL(' or ').join(parts))
tables = [r[0] for r in cur]
for table in tables:
cur.drop_table(table, cascade=True)
conn.commit()
def drop_flatnode_file(fpath: Optional[Path]) -> None:
""" Remove the flatnode file if it exists.
"""
if fpath and fpath.exists():
fpath.unlink()
def is_frozen(conn: Connection) -> bool:
""" Returns true if database is in a frozen state
"""
return conn.table_exists('place') is False

View File

@@ -0,0 +1,405 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Functions for database migration to newer software versions.
"""
from typing import List, Tuple, Callable, Any
import logging
from psycopg2 import sql as pysql
from nominatim_core.errors import UsageError
from nominatim_core.config import Configuration
from nominatim_core.db import properties
from nominatim_core.db.connection import connect, Connection
from ..version import NominatimVersion, NOMINATIM_VERSION, parse_version
from ..tokenizer import factory as tokenizer_factory
from . import refresh
LOG = logging.getLogger()
_MIGRATION_FUNCTIONS : List[Tuple[NominatimVersion, Callable[..., None]]] = []
def migrate(config: Configuration, paths: Any) -> int:
""" Check for the current database version and execute migrations,
if necessary.
"""
with connect(config.get_libpq_dsn()) as conn:
if conn.table_exists('nominatim_properties'):
db_version_str = properties.get_property(conn, 'database_version')
else:
db_version_str = None
if db_version_str is not None:
db_version = parse_version(db_version_str)
if db_version == NOMINATIM_VERSION:
LOG.warning("Database already at latest version (%s)", db_version_str)
return 0
LOG.info("Detected database version: %s", db_version_str)
else:
db_version = _guess_version(conn)
for version, func in _MIGRATION_FUNCTIONS:
if db_version < version or \
(db_version == (3, 5, 0, 99) and version == (3, 5, 0, 99)):
title = func.__doc__ or ''
LOG.warning("Running: %s (%s)", title.split('\n', 1)[0], version)
kwargs = dict(conn=conn, config=config, paths=paths)
func(**kwargs)
conn.commit()
LOG.warning('Updating SQL functions.')
refresh.create_functions(conn, config)
tokenizer = tokenizer_factory.get_tokenizer_for_db(config)
tokenizer.update_sql_functions(config)
properties.set_property(conn, 'database_version', str(NOMINATIM_VERSION))
conn.commit()
return 0
def _guess_version(conn: Connection) -> NominatimVersion:
""" Guess a database version when there is no property table yet.
Only migrations for 3.6 and later are supported, so bail out
when the version seems older.
"""
with conn.cursor() as cur:
# In version 3.6, the country_name table was updated. Check for that.
cnt = cur.scalar("""SELECT count(*) FROM
(SELECT svals(name) FROM country_name
WHERE country_code = 'gb')x;
""")
if cnt < 100:
LOG.fatal('It looks like your database was imported with a version '
'prior to 3.6.0. Automatic migration not possible.')
raise UsageError('Migration not possible.')
return NominatimVersion(3, 5, 0, 99)
def _migration(major: int, minor: int, patch: int = 0,
dbpatch: int = 0) -> Callable[[Callable[..., None]], Callable[..., None]]:
""" Decorator for a single migration step. The parameters describe the
version after which the migration is applicable, i.e. before changing
from the given version to the next, the migration is required.
All migrations are run in the order in which they are defined in this
file. Do not run global SQL scripts for migrations as you cannot be sure
that these scripts do the same in later versions.
The SQL functions will always be reimported in full at the end of the
migration process, so migration functions may leave them in a
temporary state.
"""
def decorator(func: Callable[..., None]) -> Callable[..., None]:
version = NominatimVersion(major, minor, patch, dbpatch)
_MIGRATION_FUNCTIONS.append((version, func))
return func
return decorator
@_migration(3, 5, 0, 99)
def import_status_timestamp_change(conn: Connection, **_: Any) -> None:
""" Add timezone to timestamp in status table.
The import_status table has been changed to include timezone information
with the time stamp.
"""
with conn.cursor() as cur:
cur.execute("""ALTER TABLE import_status ALTER COLUMN lastimportdate
TYPE timestamp with time zone;""")
@_migration(3, 5, 0, 99)
def add_nominatim_property_table(conn: Connection, config: Configuration, **_: Any) -> None:
""" Add nominatim_property table.
"""
if not conn.table_exists('nominatim_properties'):
with conn.cursor() as cur:
cur.execute(pysql.SQL("""CREATE TABLE nominatim_properties (
property TEXT,
value TEXT);
GRANT SELECT ON TABLE nominatim_properties TO {};
""").format(pysql.Identifier(config.DATABASE_WEBUSER)))
@_migration(3, 6, 0, 0)
def change_housenumber_transliteration(conn: Connection, **_: Any) -> None:
""" Transliterate housenumbers.
The database schema switched from saving raw housenumbers in
placex.housenumber to saving transliterated ones.
Note: the function create_housenumber_id() has been dropped in later
versions.
"""
with conn.cursor() as cur:
cur.execute("""CREATE OR REPLACE FUNCTION create_housenumber_id(housenumber TEXT)
RETURNS TEXT AS $$
DECLARE
normtext TEXT;
BEGIN
SELECT array_to_string(array_agg(trans), ';')
INTO normtext
FROM (SELECT lookup_word as trans,
getorcreate_housenumber_id(lookup_word)
FROM (SELECT make_standard_name(h) as lookup_word
FROM regexp_split_to_table(housenumber, '[,;]') h) x) y;
return normtext;
END;
$$ LANGUAGE plpgsql STABLE STRICT;""")
cur.execute("DELETE FROM word WHERE class = 'place' and type = 'house'")
cur.execute("""UPDATE placex
SET housenumber = create_housenumber_id(housenumber)
WHERE housenumber is not null""")
@_migration(3, 7, 0, 0)
def switch_placenode_geometry_index(conn: Connection, **_: Any) -> None:
""" Replace idx_placex_geometry_reverse_placeNode index.
Make the index slightly more permissive, so that it can also be used
when matching up boundaries and place nodes. It makes the
idx_placex_adminname index unnecessary.
"""
with conn.cursor() as cur:
cur.execute(""" CREATE INDEX IF NOT EXISTS idx_placex_geometry_placenode ON placex
USING GIST (geometry)
WHERE osm_type = 'N' and rank_search < 26
and class = 'place' and type != 'postcode'
and linked_place_id is null""")
cur.execute(""" DROP INDEX IF EXISTS idx_placex_adminname """)
@_migration(3, 7, 0, 1)
def install_legacy_tokenizer(conn: Connection, config: Configuration, **_: Any) -> None:
""" Setup legacy tokenizer.
If no other tokenizer has been configured yet, then create the
configuration for the backwards-compatible legacy tokenizer
"""
if properties.get_property(conn, 'tokenizer') is None:
with conn.cursor() as cur:
for table in ('placex', 'location_property_osmline'):
has_column = cur.scalar("""SELECT count(*) FROM information_schema.columns
WHERE table_name = %s
and column_name = 'token_info'""",
(table, ))
if has_column == 0:
cur.execute(pysql.SQL('ALTER TABLE {} ADD COLUMN token_info JSONB')
.format(pysql.Identifier(table)))
tokenizer = tokenizer_factory.create_tokenizer(config, init_db=False,
module_name='legacy')
tokenizer.migrate_database(config) # type: ignore[attr-defined]
@_migration(4, 0, 99, 0)
def create_tiger_housenumber_index(conn: Connection, **_: Any) -> None:
""" Create idx_location_property_tiger_parent_place_id with included
house number.
The inclusion is needed for efficient lookup of housenumbers in
full address searches.
"""
if conn.server_version_tuple() >= (11, 0, 0):
with conn.cursor() as cur:
cur.execute(""" CREATE INDEX IF NOT EXISTS
idx_location_property_tiger_housenumber_migrated
ON location_property_tiger
USING btree(parent_place_id)
INCLUDE (startnumber, endnumber) """)
@_migration(4, 0, 99, 1)
def create_interpolation_index_on_place(conn: Connection, **_: Any) -> None:
""" Create idx_place_interpolations for lookup of interpolation lines
on updates.
"""
with conn.cursor() as cur:
cur.execute("""CREATE INDEX IF NOT EXISTS idx_place_interpolations
ON place USING gist(geometry)
WHERE osm_type = 'W' and address ? 'interpolation'""")
@_migration(4, 0, 99, 2)
def add_step_column_for_interpolation(conn: Connection, **_: Any) -> None:
""" Add a new column 'step' to the interpolations table.
Also converts the data into the stricter format which requires that
startnumbers comply with the odd/even requirements.
"""
if conn.table_has_column('location_property_osmline', 'step'):
return
with conn.cursor() as cur:
# Mark invalid all interpolations with no intermediate numbers.
cur.execute("""UPDATE location_property_osmline SET startnumber = null
WHERE endnumber - startnumber <= 1 """)
# Align the start numbers where odd/even does not match.
cur.execute("""UPDATE location_property_osmline
SET startnumber = startnumber + 1,
linegeo = ST_LineSubString(linegeo,
1.0 / (endnumber - startnumber)::float,
1)
WHERE (interpolationtype = 'odd' and startnumber % 2 = 0)
or (interpolationtype = 'even' and startnumber % 2 = 1)
""")
# Mark invalid odd/even interpolations with no intermediate numbers.
cur.execute("""UPDATE location_property_osmline SET startnumber = null
WHERE interpolationtype in ('odd', 'even')
and endnumber - startnumber = 2""")
# Finally add the new column and populate it.
cur.execute("ALTER TABLE location_property_osmline ADD COLUMN step SMALLINT")
cur.execute("""UPDATE location_property_osmline
SET step = CASE WHEN interpolationtype = 'all'
THEN 1 ELSE 2 END
""")
@_migration(4, 0, 99, 3)
def add_step_column_for_tiger(conn: Connection, **_: Any) -> None:
""" Add a new column 'step' to the tiger data table.
"""
if conn.table_has_column('location_property_tiger', 'step'):
return
with conn.cursor() as cur:
cur.execute("ALTER TABLE location_property_tiger ADD COLUMN step SMALLINT")
cur.execute("""UPDATE location_property_tiger
SET step = CASE WHEN interpolationtype = 'all'
THEN 1 ELSE 2 END
""")
@_migration(4, 0, 99, 4)
def add_derived_name_column_for_country_names(conn: Connection, **_: Any) -> None:
""" Add a new column 'derived_name' which in the future takes the
country names as imported from OSM data.
"""
if not conn.table_has_column('country_name', 'derived_name'):
with conn.cursor() as cur:
cur.execute("ALTER TABLE country_name ADD COLUMN derived_name public.HSTORE")
@_migration(4, 0, 99, 5)
def mark_internal_country_names(conn: Connection, config: Configuration, **_: Any) -> None:
""" Names from the country table should be marked as internal to prevent
them from being deleted. Only necessary for ICU tokenizer.
"""
import psycopg2.extras # pylint: disable=import-outside-toplevel
tokenizer = tokenizer_factory.get_tokenizer_for_db(config)
with tokenizer.name_analyzer() as analyzer:
with conn.cursor() as cur:
psycopg2.extras.register_hstore(cur)
cur.execute("SELECT country_code, name FROM country_name")
for country_code, names in cur:
if not names:
names = {}
names['countrycode'] = country_code
analyzer.add_country_names(country_code, names)
@_migration(4, 1, 99, 0)
def add_place_deletion_todo_table(conn: Connection, **_: Any) -> None:
""" Add helper table for deleting data on updates.
The table is only necessary when updates are possible, i.e.
the database is not in freeze mode.
"""
if conn.table_exists('place'):
with conn.cursor() as cur:
cur.execute("""CREATE TABLE IF NOT EXISTS place_to_be_deleted (
osm_type CHAR(1),
osm_id BIGINT,
class TEXT,
type TEXT,
deferred BOOLEAN)""")
@_migration(4, 1, 99, 1)
def split_pending_index(conn: Connection, **_: Any) -> None:
""" Reorganise indexes for pending updates.
"""
if conn.table_exists('place'):
with conn.cursor() as cur:
cur.execute("""CREATE INDEX IF NOT EXISTS idx_placex_rank_address_sector
ON placex USING BTREE (rank_address, geometry_sector)
WHERE indexed_status > 0""")
cur.execute("""CREATE INDEX IF NOT EXISTS idx_placex_rank_boundaries_sector
ON placex USING BTREE (rank_search, geometry_sector)
WHERE class = 'boundary' and type = 'administrative'
and indexed_status > 0""")
cur.execute("DROP INDEX IF EXISTS idx_placex_pendingsector")
@_migration(4, 2, 99, 0)
def enable_forward_dependencies(conn: Connection, **_: Any) -> None:
""" Create indexes for updates with forward dependency tracking (long-running).
"""
if conn.table_exists('planet_osm_ways'):
with conn.cursor() as cur:
cur.execute("""SELECT * FROM pg_indexes
WHERE tablename = 'planet_osm_ways'
and indexdef LIKE '%nodes%'""")
if cur.rowcount == 0:
cur.execute("""CREATE OR REPLACE FUNCTION public.planet_osm_index_bucket(bigint[])
RETURNS bigint[]
LANGUAGE sql IMMUTABLE
AS $function$
SELECT ARRAY(SELECT DISTINCT unnest($1) >> 5)
$function$""")
cur.execute("""CREATE INDEX planet_osm_ways_nodes_bucket_idx
ON planet_osm_ways
USING gin (planet_osm_index_bucket(nodes))
WITH (fastupdate=off)""")
cur.execute("""CREATE INDEX planet_osm_rels_parts_idx
ON planet_osm_rels USING gin (parts)
WITH (fastupdate=off)""")
cur.execute("ANALYZE planet_osm_ways")
@_migration(4, 2, 99, 1)
def add_improved_geometry_reverse_placenode_index(conn: Connection, **_: Any) -> None:
""" Create improved index for reverse lookup of place nodes.
"""
with conn.cursor() as cur:
cur.execute("""CREATE INDEX IF NOT EXISTS idx_placex_geometry_reverse_lookupPlaceNode
ON placex
USING gist (ST_Buffer(geometry, reverse_place_diameter(rank_search)))
WHERE rank_address between 4 and 25 AND type != 'postcode'
AND name is not null AND linked_place_id is null AND osm_type = 'N'
""")
@_migration(4, 4, 99, 0)
def create_postcode_area_lookup_index(conn: Connection, **_: Any) -> None:
""" Create index needed for looking up postcode areas from postocde points.
"""
with conn.cursor() as cur:
cur.execute("""CREATE INDEX IF NOT EXISTS idx_placex_postcode_areas
ON placex USING BTREE (country_code, postcode)
WHERE osm_type = 'R' AND class = 'boundary' AND type = 'postal_code'
""")
@_migration(4, 4, 99, 1)
def create_postcode_parent_index(conn: Connection, **_: Any) -> None:
""" Create index needed for updating postcodes when a parent changes.
"""
if conn.table_exists('planet_osm_ways'):
with conn.cursor() as cur:
cur.execute("""CREATE INDEX IF NOT EXISTS
idx_location_postcode_parent_place_id
ON location_postcode USING BTREE (parent_place_id)""")

View File

@@ -0,0 +1,234 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Functions for importing, updating and otherwise maintaining the table
of artificial postcode centroids.
"""
from typing import Optional, Tuple, Dict, List, TextIO
from collections import defaultdict
from pathlib import Path
import csv
import gzip
import logging
from math import isfinite
from psycopg2 import sql as pysql
from nominatim_core.db.connection import connect, Connection
from nominatim_core.utils.centroid import PointsCentroid
from ..data.postcode_format import PostcodeFormatter, CountryPostcodeMatcher
from ..tokenizer.base import AbstractAnalyzer, AbstractTokenizer
LOG = logging.getLogger()
def _to_float(numstr: str, max_value: float) -> float:
""" Convert the number in string into a float. The number is expected
to be in the range of [-max_value, max_value]. Otherwise rises a
ValueError.
"""
num = float(numstr)
if not isfinite(num) or num <= -max_value or num >= max_value:
raise ValueError()
return num
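# For example, _to_float('47.5', 180.0) returns 47.5, while _to_float('181', 180.0)
# and _to_float('nan', 180.0) both raise a ValueError.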
class _PostcodeCollector:
""" Collector for postcodes of a single country.
"""
def __init__(self, country: str, matcher: Optional[CountryPostcodeMatcher]):
self.country = country
self.matcher = matcher
self.collected: Dict[str, PointsCentroid] = defaultdict(PointsCentroid)
self.normalization_cache: Optional[Tuple[str, Optional[str]]] = None
def add(self, postcode: str, x: float, y: float) -> None:
""" Add the given postcode to the collection cache. If the postcode
already existed, it is overwritten with the new centroid.
"""
if self.matcher is not None:
normalized: Optional[str]
if self.normalization_cache and self.normalization_cache[0] == postcode:
normalized = self.normalization_cache[1]
else:
match = self.matcher.match(postcode)
normalized = self.matcher.normalize(match) if match else None
self.normalization_cache = (postcode, normalized)
if normalized:
self.collected[normalized] += (x, y)
def commit(self, conn: Connection, analyzer: AbstractAnalyzer, project_dir: Path) -> None:
""" Update postcodes for the country from the postcodes selected so far
as well as any externally supplied postcodes.
"""
self._update_from_external(analyzer, project_dir)
to_add, to_delete, to_update = self._compute_changes(conn)
LOG.info("Processing country '%s' (%s added, %s deleted, %s updated).",
self.country, len(to_add), len(to_delete), len(to_update))
with conn.cursor() as cur:
if to_add:
cur.execute_values(
"""INSERT INTO location_postcode
(place_id, indexed_status, country_code,
postcode, geometry) VALUES %s""",
to_add,
template=pysql.SQL("""(nextval('seq_place'), 1, {},
%s, 'SRID=4326;POINT(%s %s)')
""").format(pysql.Literal(self.country)))
if to_delete:
cur.execute("""DELETE FROM location_postcode
WHERE country_code = %s and postcode = any(%s)
""", (self.country, to_delete))
if to_update:
cur.execute_values(
pysql.SQL("""UPDATE location_postcode
SET indexed_status = 2,
geometry = ST_SetSRID(ST_Point(v.x, v.y), 4326)
FROM (VALUES %s) AS v (pc, x, y)
WHERE country_code = {} and postcode = pc
""").format(pysql.Literal(self.country)), to_update)
def _compute_changes(self, conn: Connection) \
-> Tuple[List[Tuple[str, float, float]], List[str], List[Tuple[str, float, float]]]:
""" Compute which postcodes from the collected postcodes have to be
added or modified and which from the location_postcode table
have to be deleted.
"""
to_update = []
to_delete = []
with conn.cursor() as cur:
cur.execute("""SELECT postcode, ST_X(geometry), ST_Y(geometry)
FROM location_postcode
WHERE country_code = %s""",
(self.country, ))
for postcode, x, y in cur:
pcobj = self.collected.pop(postcode, None)
if pcobj:
newx, newy = pcobj.centroid()
                    # Use the absolute difference so that moves in either
                    # direction trigger an update.
                    if abs(x - newx) > 0.0000001 or abs(y - newy) > 0.0000001:
to_update.append((postcode, newx, newy))
else:
to_delete.append(postcode)
to_add = [(k, *v.centroid()) for k, v in self.collected.items()]
self.collected = defaultdict(PointsCentroid)
return to_add, to_delete, to_update
def _update_from_external(self, analyzer: AbstractAnalyzer, project_dir: Path) -> None:
""" Look for an external postcode file for the active country in
the project directory and add missing postcodes when found.
"""
csvfile = self._open_external(project_dir)
if csvfile is None:
return
try:
reader = csv.DictReader(csvfile)
for row in reader:
if 'postcode' not in row or 'lat' not in row or 'lon' not in row:
LOG.warning("Bad format for external postcode file for country '%s'."
" Ignored.", self.country)
return
postcode = analyzer.normalize_postcode(row['postcode'])
if postcode not in self.collected:
try:
                    # Do the float conversion separately; it might throw.
centroid = (_to_float(row['lon'], 180),
_to_float(row['lat'], 90))
self.collected[postcode] += centroid
except ValueError:
LOG.warning("Bad coordinates %s, %s in %s country postcode file.",
row['lat'], row['lon'], self.country)
finally:
csvfile.close()
def _open_external(self, project_dir: Path) -> Optional[TextIO]:
fname = project_dir / f'{self.country}_postcodes.csv'
if fname.is_file():
LOG.info("Using external postcode file '%s'.", fname)
return open(fname, 'r', encoding='utf-8')
fname = project_dir / f'{self.country}_postcodes.csv.gz'
if fname.is_file():
LOG.info("Using external postcode file '%s'.", fname)
return gzip.open(fname, 'rt')
return None
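# For reference, a minimal external postcode file (a hypothetical
# de_postcodes.csv) only needs a header with postcode, lat and lon; any
# additional columns are ignored by the DictReader above:
#
#     postcode,lat,lon
#     10117,52.5170,13.3889
#     80331,48.1371,11.5754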
def update_postcodes(dsn: str, project_dir: Path, tokenizer: AbstractTokenizer) -> None:
""" Update the table of artificial postcodes.
        Computes artificial postcode centroids from the placex table,
        optionally enhances them with external data and then updates the
        postcodes in the table 'location_postcode'.
"""
matcher = PostcodeFormatter()
with tokenizer.name_analyzer() as analyzer:
with connect(dsn) as conn:
# First get the list of countries that currently have postcodes.
# (Doing this before starting to insert, so it is fast on import.)
with conn.cursor() as cur:
cur.execute("SELECT DISTINCT country_code FROM location_postcode")
todo_countries = set((row[0] for row in cur))
# Recompute the list of valid postcodes from placex.
with conn.cursor(name="placex_postcodes") as cur:
cur.execute("""
SELECT cc, pc, ST_X(centroid), ST_Y(centroid)
FROM (SELECT
COALESCE(plx.country_code,
get_country_code(ST_Centroid(pl.geometry))) as cc,
pl.address->'postcode' as pc,
COALESCE(plx.centroid, ST_Centroid(pl.geometry)) as centroid
FROM place AS pl LEFT OUTER JOIN placex AS plx
ON pl.osm_id = plx.osm_id AND pl.osm_type = plx.osm_type
WHERE pl.address ? 'postcode' AND pl.geometry IS NOT null) xx
WHERE pc IS NOT null AND cc IS NOT null
ORDER BY cc, pc""")
collector = None
for country, postcode, x, y in cur:
if collector is None or country != collector.country:
if collector is not None:
collector.commit(conn, analyzer, project_dir)
collector = _PostcodeCollector(country, matcher.get_matcher(country))
todo_countries.discard(country)
collector.add(postcode, x, y)
if collector is not None:
collector.commit(conn, analyzer, project_dir)
# Now handle any countries that are only in the postcode table.
for country in todo_countries:
fmt = matcher.get_matcher(country)
_PostcodeCollector(country, fmt).commit(conn, analyzer, project_dir)
conn.commit()
analyzer.update_postcodes_from_db()
def can_compute(dsn: str) -> bool:
"""
Check that the place table exists so that
postcodes can be computed.
"""
with connect(dsn) as conn:
return conn.table_exists('place')
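# Editor's sketch of a minimal driver for this module; the DSN and project
# directory are assumptions, and the tokenizer would come from the tokenizer
# factory used elsewhere in this package.
def _example_refresh_postcodes(tokenizer: AbstractTokenizer) -> None:
    dsn = 'dbname=nominatim'                      # hypothetical DSN
    project_dir = Path('/srv/nominatim-project')  # hypothetical project dir
    if can_compute(dsn):
        update_postcodes(dsn, project_dir, tokenizer)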

View File

@@ -0,0 +1,346 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Functions for bringing auxiliary data in the database up-to-date.
"""
from typing import MutableSequence, Tuple, Any, Type, Mapping, Sequence, List, cast
import csv
import gzip
import logging
from textwrap import dedent
from pathlib import Path
from psycopg2 import sql as pysql
from nominatim_core.config import Configuration
from nominatim_core.db.connection import Connection, connect
from nominatim_core.db.utils import execute_file, CopyBuffer
from nominatim_core.db.sql_preprocessor import SQLPreprocessor
from ..version import NOMINATIM_VERSION
LOG = logging.getLogger()
OSM_TYPE = {'N': 'node', 'W': 'way', 'R': 'relation'}
def _add_address_level_rows_from_entry(rows: MutableSequence[Tuple[Any, ...]],
entry: Mapping[str, Any]) -> None:
""" Converts a single entry from the JSON format for address rank
descriptions into a flat format suitable for inserting into a
PostgreSQL table and adds these lines to `rows`.
"""
countries = entry.get('countries') or (None, )
for key, values in entry['tags'].items():
for value, ranks in values.items():
if isinstance(ranks, list):
rank_search, rank_address = ranks
else:
rank_search = rank_address = ranks
if not value:
value = None
for country in countries:
rows.append((country, key, value, rank_search, rank_address))
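# To make the flattening concrete: a hypothetical configuration entry and the
# rows it expands to (one row per country/tag/value combination).
def _example_address_level_rows() -> List[Tuple[Any, ...]]:
    rows: List[Tuple[Any, ...]] = []
    _add_address_level_rows_from_entry(
        rows, {'countries': ['de'],
               'tags': {'place': {'city': [16, 16], 'hamlet': 20}}})
    # rows == [('de', 'place', 'city', 16, 16),
    #          ('de', 'place', 'hamlet', 20, 20)]
    return rows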
def load_address_levels(conn: Connection, table: str, levels: Sequence[Mapping[str, Any]]) -> None:
""" Replace the `address_levels` table with the contents of `levels'.
A new table is created any previously existing table is dropped.
The table has the following columns:
country, class, type, rank_search, rank_address
"""
rows: List[Tuple[Any, ...]] = []
for entry in levels:
_add_address_level_rows_from_entry(rows, entry)
with conn.cursor() as cur:
cur.drop_table(table)
cur.execute(pysql.SQL("""CREATE TABLE {} (
country_code varchar(2),
class TEXT,
type TEXT,
rank_search SMALLINT,
rank_address SMALLINT)
""").format(pysql.Identifier(table)))
cur.execute_values(pysql.SQL("INSERT INTO {} VALUES %s")
.format(pysql.Identifier(table)), rows)
cur.execute(pysql.SQL('CREATE UNIQUE INDEX ON {} (country_code, class, type)')
.format(pysql.Identifier(table)))
conn.commit()
def load_address_levels_from_config(conn: Connection, config: Configuration) -> None:
""" Replace the `address_levels` table with the content as
defined in the given configuration. Uses the parameter
NOMINATIM_ADDRESS_LEVEL_CONFIG to determine the location of the
configuration file.
"""
cfg = config.load_sub_configuration('', config='ADDRESS_LEVEL_CONFIG')
load_address_levels(conn, 'address_levels', cfg)
def create_functions(conn: Connection, config: Configuration,
enable_diff_updates: bool = True,
enable_debug: bool = False) -> None:
""" (Re)create the PL/pgSQL functions.
"""
sql = SQLPreprocessor(conn, config)
sql.run_sql_file(conn, 'functions.sql',
disable_diff_updates=not enable_diff_updates,
debug=enable_debug)
WEBSITE_SCRIPTS = (
'deletable.php',
'details.php',
'lookup.php',
'polygons.php',
'reverse.php',
'search.php',
'status.php'
)
# constants needed by PHP scripts: PHP name, config name, type
PHP_CONST_DEFS = (
('Database_DSN', 'DATABASE_DSN', str),
('Default_Language', 'DEFAULT_LANGUAGE', str),
('Log_DB', 'LOG_DB', bool),
('Log_File', 'LOG_FILE', Path),
('NoAccessControl', 'CORS_NOACCESSCONTROL', bool),
('Places_Max_ID_count', 'LOOKUP_MAX_COUNT', int),
('PolygonOutput_MaximumTypes', 'POLYGON_OUTPUT_MAX_TYPES', int),
('Search_BatchMode', 'SEARCH_BATCH_MODE', bool),
('Search_NameOnlySearchFrequencyThreshold', 'SEARCH_NAME_ONLY_THRESHOLD', str),
('Use_US_Tiger_Data', 'USE_US_TIGER_DATA', bool),
('MapIcon_URL', 'MAPICON_URL', str),
('Search_WithinCountries', 'SEARCH_WITHIN_COUNTRIES', bool),
)
def import_wikipedia_articles(dsn: str, data_path: Path, ignore_errors: bool = False) -> int:
""" Replaces the wikipedia importance tables with new data.
The import is run in a single transaction so that the new data
        is replaced seamlessly.
Returns 0 if all was well and 1 if the importance file could not
be found. Throws an exception if there was an error reading the file.
"""
if import_importance_csv(dsn, data_path / 'wikimedia-importance.csv.gz') == 0 \
or import_importance_sql(dsn, data_path / 'wikimedia-importance.sql.gz',
ignore_errors) == 0:
return 0
return 1
def import_importance_csv(dsn: str, data_file: Path) -> int:
""" Replace wikipedia importance table with data from a
single CSV file.
        The file must be a gzipped, tab-separated CSV file with the following columns:
language, title, importance, wikidata_id
Other columns may be present but will be ignored.
"""
if not data_file.exists():
return 1
    # Only import the first occurrence of a wikidata ID.
# This keeps indexes and table small.
wd_done = set()
with connect(dsn) as conn:
with conn.cursor() as cur:
cur.drop_table('wikipedia_article')
cur.drop_table('wikipedia_redirect')
cur.drop_table('wikimedia_importance')
cur.execute("""CREATE TABLE wikimedia_importance (
language TEXT NOT NULL,
title TEXT NOT NULL,
importance double precision NOT NULL,
wikidata TEXT
) """)
with gzip.open(str(data_file), 'rt') as fd, CopyBuffer() as buf:
for row in csv.DictReader(fd, delimiter='\t', quotechar='|'):
wd_id = int(row['wikidata_id'][1:])
buf.add(row['language'], row['title'], row['importance'],
None if wd_id in wd_done else row['wikidata_id'])
wd_done.add(wd_id)
if buf.size() > 10000000:
with conn.cursor() as cur:
buf.copy_out(cur, 'wikimedia_importance',
columns=['language', 'title', 'importance',
'wikidata'])
with conn.cursor() as cur:
buf.copy_out(cur, 'wikimedia_importance',
columns=['language', 'title', 'importance', 'wikidata'])
with conn.cursor() as cur:
cur.execute("""CREATE INDEX IF NOT EXISTS idx_wikimedia_importance_title
ON wikimedia_importance (title)""")
cur.execute("""CREATE INDEX IF NOT EXISTS idx_wikimedia_importance_wikidata
ON wikimedia_importance (wikidata)
WHERE wikidata is not null""")
conn.commit()
return 0
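# Despite the .csv suffix, the reader above expects tab-separated values with
# '|' as the quote character. A hypothetical line after the header (columns
# separated by literal tab characters, shown here as <TAB>):
#
#     language<TAB>title<TAB>importance<TAB>wikidata_id
#     en<TAB>Berlin<TAB>0.8123<TAB>Q64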
def import_importance_sql(dsn: str, data_file: Path, ignore_errors: bool) -> int:
""" Replace wikipedia importance table with data from an SQL file.
"""
if not data_file.exists():
return 1
pre_code = """BEGIN;
DROP TABLE IF EXISTS "wikipedia_article";
DROP TABLE IF EXISTS "wikipedia_redirect";
DROP TABLE IF EXISTS "wikipedia_importance";
"""
post_code = "COMMIT"
execute_file(dsn, data_file, ignore_errors=ignore_errors,
pre_code=pre_code, post_code=post_code)
return 0
def import_secondary_importance(dsn: str, data_path: Path, ignore_errors: bool = False) -> int:
""" Replaces the secondary importance raster data table with new data.
Returns 0 if all was well and 1 if the raster SQL file could not
be found. Throws an exception if there was an error reading the file.
"""
datafile = data_path / 'secondary_importance.sql.gz'
if not datafile.exists():
return 1
with connect(dsn) as conn:
postgis_version = conn.postgis_version_tuple()
if postgis_version[0] < 3:
LOG.error('PostGIS version is too old for using OSM raster data.')
return 2
execute_file(dsn, datafile, ignore_errors=ignore_errors)
return 0
def recompute_importance(conn: Connection) -> None:
""" Recompute wikipedia links and importance for all entries in placex.
        This is a long-running operation that must not be executed in
parallel with updates.
"""
with conn.cursor() as cur:
cur.execute('ALTER TABLE placex DISABLE TRIGGER ALL')
cur.execute("""
UPDATE placex SET (wikipedia, importance) =
(SELECT wikipedia, importance
FROM compute_importance(extratags, country_code, rank_search, centroid))
""")
cur.execute("""
UPDATE placex s SET wikipedia = d.wikipedia, importance = d.importance
FROM placex d
WHERE s.place_id = d.linked_place_id and d.wikipedia is not null
and (s.wikipedia is null or s.importance < d.importance);
""")
cur.execute('ALTER TABLE placex ENABLE TRIGGER ALL')
conn.commit()
def _quote_php_variable(var_type: Type[Any], config: Configuration,
conf_name: str) -> str:
if var_type == bool:
return 'true' if config.get_bool(conf_name) else 'false'
if var_type == int:
return cast(str, getattr(config, conf_name))
if not getattr(config, conf_name):
return 'false'
if var_type == Path:
value = str(config.get_path(conf_name) or '')
else:
value = getattr(config, conf_name)
quoted = value.replace("'", "\\'")
return f"'{quoted}'"
def setup_website(basedir: Path, config: Configuration, conn: Connection) -> None:
""" Create the website script stubs.
"""
if config.lib_dir.php is None:
LOG.info("Python frontend does not require website setup. Skipping.")
return
if not basedir.exists():
LOG.info('Creating website directory.')
basedir.mkdir()
assert config.project_dir is not None
basedata = dedent(f"""\
<?php
@define('CONST_Debug', $_GET['debug'] ?? false);
@define('CONST_LibDir', '{config.lib_dir.php}');
@define('CONST_TokenizerDir', '{config.project_dir / 'tokenizer'}');
@define('CONST_NominatimVersion', '{NOMINATIM_VERSION!s}');
""")
for php_name, conf_name, var_type in PHP_CONST_DEFS:
varout = _quote_php_variable(var_type, config, conf_name)
basedata += f"@define('CONST_{php_name}', {varout});\n"
template = "\nrequire_once(CONST_LibDir.'/website/{}');\n"
search_name_table_exists = bool(conn and conn.table_exists('search_name'))
for script in WEBSITE_SCRIPTS:
if not search_name_table_exists and script == 'search.php':
out = template.format('reverse-only-search.php')
else:
out = template.format(script)
(basedir / script).write_text(basedata + out, 'utf-8')
def invalidate_osm_object(osm_type: str, osm_id: int, conn: Connection,
recursive: bool = True) -> None:
""" Mark the given OSM object for reindexing. When 'recursive' is set
to True (the default), then all dependent objects are marked for
reindexing as well.
        'osm_type' must be one of 'N' (node), 'W' (way) or 'R' (relation).
If the given object does not exist, then nothing happens.
"""
assert osm_type in ('N', 'R', 'W')
LOG.warning("Invalidating OSM %s %s%s.",
OSM_TYPE[osm_type], osm_id,
' and its dependent places' if recursive else '')
with conn.cursor() as cur:
if recursive:
sql = """SELECT place_force_update(place_id)
FROM placex WHERE osm_type = %s and osm_id = %s"""
else:
sql = """UPDATE placex SET indexed_status = 2
WHERE osm_type = %s and osm_id = %s"""
cur.execute(sql, (osm_type, osm_id))
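# Editor's sketch of a call site; the DSN and way ID are made up. Note that
# the function itself does not commit, so the caller is responsible for that.
def _example_invalidate(dsn: str = 'dbname=nominatim') -> None:
    with connect(dsn) as conn:
        invalidate_osm_object('W', 23456789, conn)
        conn.commit()   # commit the indexed_status changes made above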

View File

@@ -0,0 +1,206 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Functions for updating a database from a replication source.
"""
from typing import ContextManager, MutableMapping, Any, Generator, cast, Iterator
from contextlib import contextmanager
import datetime as dt
from enum import Enum
import logging
import time
import types
import urllib.request as urlrequest
import requests
from nominatim_core.errors import UsageError
from nominatim_core.db import status
from nominatim_core.db.connection import Connection, connect
from .exec_utils import run_osm2pgsql
try:
from osmium.replication.server import ReplicationServer
from osmium import WriteHandler
from osmium import version as pyo_version
except ImportError as exc:
logging.getLogger().critical("pyosmium not installed. Replication functions not available.\n"
"To install pyosmium via pip: pip3 install osmium")
raise UsageError("replication tools not available") from exc
LOG = logging.getLogger()
def init_replication(conn: Connection, base_url: str,
socket_timeout: int = 60) -> None:
""" Set up replication for the server at the given base URL.
"""
LOG.info("Using replication source: %s", base_url)
date = status.compute_database_date(conn)
# margin of error to make sure we get all data
date -= dt.timedelta(hours=3)
with _make_replication_server(base_url, socket_timeout) as repl:
seq = repl.timestamp_to_sequence(date)
if seq is None:
LOG.fatal("Cannot reach the configured replication service '%s'.\n"
"Does the URL point to a directory containing OSM update data?",
base_url)
raise UsageError("Failed to reach replication service")
status.set_status(conn, date=date, seq=seq)
LOG.warning("Updates initialised at sequence %s (%s)", seq, date)
def check_for_updates(conn: Connection, base_url: str,
socket_timeout: int = 60) -> int:
""" Check if new data is available from the replication service at the
given base URL.
"""
_, seq, _ = status.get_status(conn)
if seq is None:
LOG.error("Replication not set up. "
"Please run 'nominatim replication --init' first.")
return 254
with _make_replication_server(base_url, socket_timeout) as repl:
state = repl.get_state_info()
if state is None:
LOG.error("Cannot get state for URL %s.", base_url)
return 253
if state.sequence <= seq:
LOG.warning("Database is up to date.")
return 2
LOG.warning("New data available (%i => %i).", seq, state.sequence)
return 0
class UpdateState(Enum):
""" Possible states after an update has run.
"""
UP_TO_DATE = 0
MORE_PENDING = 2
NO_CHANGES = 3
def update(dsn: str, options: MutableMapping[str, Any],
socket_timeout: int = 60) -> UpdateState:
""" Update database from the next batch of data. Returns the state of
updates according to `UpdateState`.
"""
with connect(dsn) as conn:
startdate, startseq, indexed = status.get_status(conn)
conn.commit()
if startseq is None:
LOG.error("Replication not set up. "
"Please run 'nominatim replication --init' first.")
raise UsageError("Replication not set up.")
assert startdate is not None
if not indexed and options['indexed_only']:
LOG.info("Skipping update. There is data that needs indexing.")
return UpdateState.MORE_PENDING
last_since_update = dt.datetime.now(dt.timezone.utc) - startdate
update_interval = dt.timedelta(seconds=options['update_interval'])
if last_since_update < update_interval:
duration = (update_interval - last_since_update).seconds
LOG.warning("Sleeping for %s sec before next update.", duration)
time.sleep(duration)
if options['import_file'].exists():
options['import_file'].unlink()
# Read updates into file.
with _make_replication_server(options['base_url'], socket_timeout) as repl:
outhandler = WriteHandler(str(options['import_file']))
endseq = repl.apply_diffs(outhandler, startseq + 1,
max_size=options['max_diff_size'] * 1024)
outhandler.close()
if endseq is None:
return UpdateState.NO_CHANGES
with connect(dsn) as conn:
run_osm2pgsql_updates(conn, options)
# Write the current status to the file
endstate = repl.get_state_info(endseq)
status.set_status(conn, endstate.timestamp if endstate else None,
seq=endseq, indexed=False)
conn.commit()
return UpdateState.UP_TO_DATE
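# Editor's sketch of a driving loop around update(); `options` is the same
# osm2pgsql option mapping used above (keys 'base_url', 'update_interval',
# 'import_file', 'max_diff_size', 'indexed_only', ...).
def _example_catch_up(dsn: str, options: MutableMapping[str, Any]) -> None:
    # Keep applying diffs until the replication source has nothing new.
    while True:
        state = update(dsn, options)
        if state is UpdateState.NO_CHANGES:
            break   # nothing new upstream; try again later
        # For UP_TO_DATE/MORE_PENDING the real CLI would run indexing here
        # before fetching the next batch.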
def run_osm2pgsql_updates(conn: Connection, options: MutableMapping[str, Any]) -> None:
""" Run osm2pgsql in append mode.
"""
# Remove any stale deletion marks.
with conn.cursor() as cur:
cur.execute('TRUNCATE place_to_be_deleted')
conn.commit()
# Consume updates with osm2pgsql.
options['append'] = True
options['disable_jit'] = conn.server_version_tuple() >= (11, 0)
run_osm2pgsql(options)
# Handle deletions
with conn.cursor() as cur:
cur.execute('SELECT flush_deleted_places()')
conn.commit()
def _make_replication_server(url: str, timeout: int) -> ContextManager[ReplicationServer]:
""" Returns a ReplicationServer in form of a context manager.
Creates a light wrapper around older versions of pyosmium that did
not support the context manager interface.
"""
if hasattr(ReplicationServer, '__enter__'):
# Patches the open_url function for pyosmium >= 3.2
# where the socket timeout is no longer respected.
def patched_open_url(self: ReplicationServer, url: urlrequest.Request) -> Any:
""" Download a resource from the given URL and return a byte sequence
of the content.
"""
headers = {"User-Agent" : f"Nominatim (pyosmium/{pyo_version.pyosmium_release})"}
if self.session is not None:
return self.session.get(url.get_full_url(),
headers=headers, timeout=timeout or None,
stream=True)
@contextmanager
def _get_url_with_session() -> Iterator[requests.Response]:
with requests.Session() as session:
request = session.get(url.get_full_url(),
headers=headers, timeout=timeout or None,
stream=True)
yield request
return _get_url_with_session()
repl = ReplicationServer(url)
setattr(repl, 'open_url', types.MethodType(patched_open_url, repl))
return cast(ContextManager[ReplicationServer], repl)
@contextmanager
def get_cm() -> Generator[ReplicationServer, None, None]:
yield ReplicationServer(url)
return get_cm()

View File

@@ -0,0 +1,78 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Contains the class which handles statistics for the
import of special phrases.
"""
import logging
LOG = logging.getLogger()
class SpecialPhrasesImporterStatistics():
"""
Class handling statistics of the import
process of special phrases.
"""
def __init__(self) -> None:
        self._initialize_values()
    def _initialize_values(self) -> None:
"""
Set all counts for the global
import to 0.
"""
self.tables_created = 0
self.tables_deleted = 0
self.tables_ignored = 0
self.invalids = 0
def notify_one_phrase_invalid(self) -> None:
"""
Add +1 to the count of invalid entries
fetched from the wiki.
"""
self.invalids += 1
def notify_one_table_created(self) -> None:
"""
Add +1 to the count of created tables.
"""
self.tables_created += 1
def notify_one_table_deleted(self) -> None:
"""
Add +1 to the count of deleted tables.
"""
self.tables_deleted += 1
def notify_one_table_ignored(self) -> None:
"""
Add +1 to the count of ignored tables.
"""
self.tables_ignored += 1
def notify_import_done(self) -> None:
"""
Print stats for the whole import process
and reset all values.
"""
LOG.info('====================================================================')
LOG.info('Final statistics of the import:')
LOG.info('- %s phrases were invalid.', self.invalids)
if self.invalids > 0:
LOG.info(' Those invalid phrases have been skipped.')
        LOG.info('- %s tables were ignored as they already exist in the database',
self.tables_ignored)
LOG.info('- %s tables were created', self.tables_created)
LOG.info('- %s tables were deleted from the database', self.tables_deleted)
if self.tables_deleted > 0:
LOG.info(' They were deleted as they are not valid anymore.')
if self.invalids > 0:
LOG.warning('%s phrases were invalid and have been skipped during the whole process.',
self.invalids)
        self._initialize_values()

View File

@@ -0,0 +1,46 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Module containing the SPCsvLoader class.
The class allows loading phrases from a csv file.
"""
from typing import Iterable
import csv
import os
from nominatim_core.errors import UsageError
from .special_phrase import SpecialPhrase
class SPCsvLoader:
"""
Handles loading of special phrases from external csv file.
"""
def __init__(self, csv_path: str) -> None:
self.csv_path = csv_path
def generate_phrases(self) -> Iterable[SpecialPhrase]:
""" Open and parse the given csv file.
Create the corresponding SpecialPhrases.
"""
self._check_csv_validity()
with open(self.csv_path, encoding='utf-8') as fd:
reader = csv.DictReader(fd, delimiter=',')
for row in reader:
yield SpecialPhrase(row['phrase'], row['class'], row['type'], row['operator'])
def _check_csv_validity(self) -> None:
"""
Check that the csv file has the right extension.
"""
_, extension = os.path.splitext(self.csv_path)
if extension != '.csv':
raise UsageError(f'The file {self.csv_path} is not a csv file.')
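# For reference, a hypothetical input file accepted by this loader; the
# header must provide the four columns read above:
#
#     phrase,class,type,operator
#     Restaurant,amenity,restaurant,-
#     Hotel in,tourism,hotel,in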

View File

@@ -0,0 +1,274 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Module containing the class handling the import
of the special phrases.
Phrases are analyzed and imported into the database.
The phrases already present in the database which are no longer
valid are removed.
"""
from typing import Iterable, Tuple, Mapping, Sequence, Optional, Set
import logging
import re
from psycopg2.sql import Identifier, SQL
from nominatim_core.typing import Protocol
from nominatim_core.config import Configuration
from nominatim_core.db.connection import Connection
from .importer_statistics import SpecialPhrasesImporterStatistics
from .special_phrase import SpecialPhrase
from ...tokenizer.base import AbstractTokenizer
LOG = logging.getLogger()
def _classtype_table(phrase_class: str, phrase_type: str) -> str:
""" Return the name of the table for the given class and type.
"""
return f'place_classtype_{phrase_class}_{phrase_type}'
class SpecialPhraseLoader(Protocol):
""" Protocol for classes implementing a loader for special phrases.
"""
def generate_phrases(self) -> Iterable[SpecialPhrase]:
""" Generates all special phrase terms this loader can produce.
"""
class SPImporter():
# pylint: disable-msg=too-many-instance-attributes
"""
    Class handling the import of special phrases into the database.
    Takes an SP loader which loads the phrases from an external source.
"""
def __init__(self, config: Configuration, conn: Connection,
sp_loader: SpecialPhraseLoader) -> None:
self.config = config
self.db_connection = conn
self.sp_loader = sp_loader
self.statistics_handler = SpecialPhrasesImporterStatistics()
self.black_list, self.white_list = self._load_white_and_black_lists()
self.sanity_check_pattern = re.compile(r'^\w+$')
# This set will contain all existing phrases to be added.
# It contains tuples with the following format: (label, class, type, operator)
self.word_phrases: Set[Tuple[str, str, str, str]] = set()
        # This set will contain all existing place_classtype tables which
        # don't match any special phrases class/type on the wiki.
self.table_phrases_to_delete: Set[str] = set()
def import_phrases(self, tokenizer: AbstractTokenizer, should_replace: bool) -> None:
"""
Iterate through all SpecialPhrases extracted from the
loader and import them into the database.
        If should_replace is set to True, only the loaded phrases
        will be kept in the database. All other phrases already
in the database will be removed.
"""
        LOG.warning('Special phrases import starting')
self._fetch_existing_place_classtype_tables()
# Store pairs of class/type for further processing
class_type_pairs = set()
for phrase in self.sp_loader.generate_phrases():
result = self._process_phrase(phrase)
if result:
class_type_pairs.add(result)
self._create_classtype_table_and_indexes(class_type_pairs)
if should_replace:
self._remove_non_existent_tables_from_db()
self.db_connection.commit()
with tokenizer.name_analyzer() as analyzer:
analyzer.update_special_phrases(self.word_phrases, should_replace)
LOG.warning('Import done.')
self.statistics_handler.notify_import_done()
def _fetch_existing_place_classtype_tables(self) -> None:
"""
Fetch existing place_classtype tables.
Fill the table_phrases_to_delete set of the class.
"""
query = """
SELECT table_name
FROM information_schema.tables
WHERE table_schema='public'
AND table_name like 'place_classtype_%';
"""
with self.db_connection.cursor() as db_cursor:
db_cursor.execute(SQL(query))
for row in db_cursor:
self.table_phrases_to_delete.add(row[0])
def _load_white_and_black_lists(self) \
-> Tuple[Mapping[str, Sequence[str]], Mapping[str, Sequence[str]]]:
"""
        Load white and black lists from phrase-settings.json.
"""
settings = self.config.load_sub_configuration('phrase-settings.json')
return settings['blackList'], settings['whiteList']
def _check_sanity(self, phrase: SpecialPhrase) -> bool:
"""
Check sanity of given inputs in case somebody added garbage in the wiki.
        If a bad class/type is detected, the phrase is reported as invalid
        and skipped.
"""
        class_matches = self.sanity_check_pattern.findall(phrase.p_class)
        type_matches = self.sanity_check_pattern.findall(phrase.p_type)
        if not class_matches or not type_matches:
LOG.warning("Bad class/type: %s=%s. It will not be imported",
phrase.p_class, phrase.p_type)
return False
return True
def _process_phrase(self, phrase: SpecialPhrase) -> Optional[Tuple[str, str]]:
"""
        Processes the given phrase by checking the black and white lists
        and sanity.
Return the class/type pair corresponding to the phrase.
"""
# blacklisting: disallow certain class/type combinations
if phrase.p_class in self.black_list.keys() \
and phrase.p_type in self.black_list[phrase.p_class]:
return None
# whitelisting: if class is in whitelist, allow only tags in the list
if phrase.p_class in self.white_list.keys() \
and phrase.p_type not in self.white_list[phrase.p_class]:
return None
# sanity check, in case somebody added garbage in the wiki
if not self._check_sanity(phrase):
self.statistics_handler.notify_one_phrase_invalid()
return None
self.word_phrases.add((phrase.p_label, phrase.p_class,
phrase.p_type, phrase.p_operator))
return (phrase.p_class, phrase.p_type)
def _create_classtype_table_and_indexes(self,
class_type_pairs: Iterable[Tuple[str, str]]) -> None:
"""
Create table place_classtype for each given pair.
Also create indexes on place_id and centroid.
"""
LOG.warning('Create tables and indexes...')
sql_tablespace = self.config.TABLESPACE_AUX_DATA
if sql_tablespace:
sql_tablespace = ' TABLESPACE ' + sql_tablespace
with self.db_connection.cursor() as db_cursor:
db_cursor.execute("CREATE INDEX idx_placex_classtype ON placex (class, type)")
for pair in class_type_pairs:
phrase_class = pair[0]
phrase_type = pair[1]
table_name = _classtype_table(phrase_class, phrase_type)
if table_name in self.table_phrases_to_delete:
self.statistics_handler.notify_one_table_ignored()
                # Remove this table from the ones to delete as it matches a
                # class/type still existing in the special phrases of the wiki.
                self.table_phrases_to_delete.remove(table_name)
                # There is then no need to create the table and indexes.
continue
# Table creation
self._create_place_classtype_table(sql_tablespace, phrase_class, phrase_type)
# Indexes creation
self._create_place_classtype_indexes(sql_tablespace, phrase_class, phrase_type)
            # Grant read access to the web user.
self._grant_access_to_webuser(phrase_class, phrase_type)
self.statistics_handler.notify_one_table_created()
with self.db_connection.cursor() as db_cursor:
db_cursor.execute("DROP INDEX idx_placex_classtype")
def _create_place_classtype_table(self, sql_tablespace: str,
phrase_class: str, phrase_type: str) -> None:
"""
        Create the table place_classtype for the given phrase_class/phrase_type
        if it does not exist yet.
"""
table_name = _classtype_table(phrase_class, phrase_type)
with self.db_connection.cursor() as cur:
cur.execute(SQL("""CREATE TABLE IF NOT EXISTS {} {} AS
SELECT place_id AS place_id,
st_centroid(geometry) AS centroid
FROM placex
WHERE class = %s AND type = %s
""").format(Identifier(table_name), SQL(sql_tablespace)),
(phrase_class, phrase_type))
def _create_place_classtype_indexes(self, sql_tablespace: str,
phrase_class: str, phrase_type: str) -> None:
"""
Create indexes on centroid and place_id for the place_classtype table.
"""
index_prefix = f'idx_place_classtype_{phrase_class}_{phrase_type}_'
base_table = _classtype_table(phrase_class, phrase_type)
# Index on centroid
if not self.db_connection.index_exists(index_prefix + 'centroid'):
with self.db_connection.cursor() as db_cursor:
db_cursor.execute(SQL("CREATE INDEX {} ON {} USING GIST (centroid) {}")
.format(Identifier(index_prefix + 'centroid'),
Identifier(base_table),
SQL(sql_tablespace)))
# Index on place_id
if not self.db_connection.index_exists(index_prefix + 'place_id'):
with self.db_connection.cursor() as db_cursor:
db_cursor.execute(SQL("CREATE INDEX {} ON {} USING btree(place_id) {}")
.format(Identifier(index_prefix + 'place_id'),
Identifier(base_table),
SQL(sql_tablespace)))
def _grant_access_to_webuser(self, phrase_class: str, phrase_type: str) -> None:
"""
        Grant read access on the table place_classtype to the webuser.
"""
table_name = _classtype_table(phrase_class, phrase_type)
with self.db_connection.cursor() as db_cursor:
db_cursor.execute(SQL("""GRANT SELECT ON {} TO {}""")
.format(Identifier(table_name),
Identifier(self.config.DATABASE_WEBUSER)))
def _remove_non_existent_tables_from_db(self) -> None:
"""
        Remove special phrases which don't exist on the wiki anymore.
Delete the place_classtype tables.
"""
LOG.warning('Cleaning database...')
# Delete place_classtype tables corresponding to class/type which
# are not on the wiki anymore.
with self.db_connection.cursor() as db_cursor:
for table in self.table_phrases_to_delete:
self.statistics_handler.notify_one_table_deleted()
db_cursor.drop_table(table)
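# Editor's sketch of wiring up the importer; `connect` is assumed from
# nominatim_core.db.connection, the CSV loader from its sibling module, and
# the tokenizer from the tokenizer factory. The file name and flag are
# illustrative only.
def _example_csv_import(config: Configuration, tokenizer: AbstractTokenizer) -> None:
    from nominatim_core.db.connection import connect
    from .sp_csv_loader import SPCsvLoader   # assumed module name

    with connect(config.get_libpq_dsn()) as conn:
        importer = SPImporter(config, conn, SPCsvLoader('phrases.csv'))
        importer.import_phrases(tokenizer, should_replace=True)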

View File

@@ -0,0 +1,68 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Module containing the SPWikiLoader class.
"""
from typing import Iterable
import re
import logging
from nominatim_core.config import Configuration
from nominatim_core.utils.url_utils import get_url
from .special_phrase import SpecialPhrase
LOG = logging.getLogger()
def _get_wiki_content(lang: str) -> str:
"""
Request and return the wiki page's content
corresponding to special phrases for a given lang.
    Requested URL example:
https://wiki.openstreetmap.org/wiki/Special:Export/Nominatim/Special_Phrases/EN
"""
url = 'https://wiki.openstreetmap.org/wiki/Special:Export/Nominatim/Special_Phrases/' \
+ lang.upper()
return get_url(url)
class SPWikiLoader:
"""
Handles loading of special phrases from the wiki.
"""
def __init__(self, config: Configuration) -> None:
self.config = config
        # Compile the regex once here to avoid recompiling it for every page.
        self.occurrence_pattern = re.compile(
r'\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([\-YN])'
)
# Hack around a bug where building=yes was imported with quotes into the wiki
self.type_fix_pattern = re.compile(r'\"|&quot;')
self.languages = self.config.get_str_list('LANGUAGES') or \
['af', 'ar', 'br', 'ca', 'cs', 'de', 'en', 'es',
'et', 'eu', 'fa', 'fi', 'fr', 'gl', 'hr', 'hu',
'ia', 'is', 'it', 'ja', 'mk', 'nl', 'no', 'pl',
'ps', 'pt', 'ru', 'sk', 'sl', 'sv', 'uk', 'vi',
'lv', 'tr']
def generate_phrases(self) -> Iterable[SpecialPhrase]:
""" Download the wiki pages for the configured languages
and extract the phrases from the page.
"""
for lang in self.languages:
LOG.warning('Importing phrases for lang: %s...', lang)
loaded_xml = _get_wiki_content(lang)
# One match will be of format [label, class, type, operator, plural]
            matches = self.occurrence_pattern.findall(loaded_xml)
for match in matches:
yield SpecialPhrase(match[0],
match[1],
self.type_fix_pattern.sub('', match[2]),
match[3])
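# An illustrative wiki table row that the occurrence pattern above matches;
# the fifth group (the Y/N/- plural flag) is captured by the regex but not
# used when building the SpecialPhrase:
#
#     | Zoo || tourism || zoo || - || Y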

View File

@@ -0,0 +1,37 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Module containing the class SpecialPhrase.
This class is a model used to transfer a special phrase through
the loading and import process.
"""
from typing import Any
class SpecialPhrase:
"""
Model representing a special phrase.
"""
def __init__(self, p_label: str, p_class: str, p_type: str, p_operator: str) -> None:
self.p_label = p_label.strip()
self.p_class = p_class.strip()
self.p_type = p_type.strip()
        # Needed if some operators in the wiki are not written in English.
p_operator = p_operator.strip().lower()
self.p_operator = '-' if p_operator not in ('near', 'in') else p_operator
def __eq__(self, other: Any) -> bool:
if not isinstance(other, SpecialPhrase):
return False
return self.p_label == other.p_label \
and self.p_class == other.p_class \
and self.p_type == other.p_type \
and self.p_operator == other.p_operator
def __hash__(self) -> int:
return hash((self.p_label, self.p_class, self.p_type, self.p_operator))

View File

@@ -0,0 +1,149 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Functions for importing Tiger data and handling tar and directory files.
"""
from typing import Any, TextIO, List, Union, cast
import csv
import io
import logging
import os
import tarfile
from psycopg2.extras import Json
from nominatim_core.config import Configuration
from nominatim_core.db.connection import connect
from nominatim_core.db.async_connection import WorkerPool
from nominatim_core.db.sql_preprocessor import SQLPreprocessor
from nominatim_core.errors import UsageError
from ..data.place_info import PlaceInfo
from ..tokenizer.base import AbstractAnalyzer, AbstractTokenizer
from . import freeze
LOG = logging.getLogger()
class TigerInput:
""" Context manager that goes through Tiger input files which may
either be in a directory or gzipped together in a tar file.
"""
def __init__(self, data_dir: str) -> None:
self.tar_handle = None
self.files: List[Union[str, tarfile.TarInfo]] = []
if data_dir.endswith('.tar.gz'):
try:
self.tar_handle = tarfile.open(data_dir) # pylint: disable=consider-using-with
except tarfile.ReadError as err:
LOG.fatal("Cannot open '%s'. Is this a tar file?", data_dir)
raise UsageError("Cannot open Tiger data file.") from err
self.files = [i for i in self.tar_handle.getmembers() if i.name.endswith('.csv')]
LOG.warning("Found %d CSV files in tarfile with path %s", len(self.files), data_dir)
else:
files = os.listdir(data_dir)
self.files = [os.path.join(data_dir, i) for i in files if i.endswith('.csv')]
LOG.warning("Found %d CSV files in path %s", len(self.files), data_dir)
if not self.files:
LOG.warning("Tiger data import selected but no files found at %s", data_dir)
def __enter__(self) -> 'TigerInput':
return self
def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
if self.tar_handle:
self.tar_handle.close()
self.tar_handle = None
def next_file(self) -> TextIO:
""" Return a file handle to the next file to be processed.
Raises an IndexError if there is no file left.
"""
fname = self.files.pop(0)
if self.tar_handle is not None:
extracted = self.tar_handle.extractfile(fname)
assert extracted is not None
return io.TextIOWrapper(extracted)
return open(cast(str, fname), encoding='utf-8')
def __len__(self) -> int:
return len(self.files)
def handle_threaded_sql_statements(pool: WorkerPool, fd: TextIO,
analyzer: AbstractAnalyzer) -> None:
""" Handles sql statement with multiplexing
"""
lines = 0
    # Use a pool of database connections to execute the statements.
sql = "SELECT tiger_line_import(%s, %s, %s, %s, %s, %s)"
for row in csv.DictReader(fd, delimiter=';'):
try:
address = dict(street=row['street'], postcode=row['postcode'])
args = ('SRID=4326;' + row['geometry'],
int(row['from']), int(row['to']), row['interpolation'],
Json(analyzer.process_place(PlaceInfo({'address': address}))),
analyzer.normalize_postcode(row['postcode']))
except ValueError:
continue
pool.next_free_worker().perform(sql, args=args)
lines += 1
if lines == 1000:
print('.', end='', flush=True)
lines = 0
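# A hypothetical line from a Tiger CSV as consumed above; fields are
# ';'-separated and the header must provide street, postcode, geometry,
# from, to and interpolation:
#
#     from;to;interpolation;street;postcode;geometry
#     100;198;even;Main Street;12345;LINESTRING(-86.466 32.428,-86.466 32.429)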
def add_tiger_data(data_dir: str, config: Configuration, threads: int,
tokenizer: AbstractTokenizer) -> int:
""" Import tiger data from directory or tar file `data dir`.
"""
dsn = config.get_libpq_dsn()
with connect(dsn) as conn:
is_frozen = freeze.is_frozen(conn)
conn.close()
if is_frozen:
raise UsageError("Tiger cannot be imported when database frozen (Github issue #3048)")
with TigerInput(data_dir) as tar:
if not tar:
return 1
with connect(dsn) as conn:
sql = SQLPreprocessor(conn, config)
sql.run_sql_file(conn, 'tiger_import_start.sql')
        # Read the files and, for each file, hand the per-line SQL
        # statements to <threads - 1> worker connections.
place_threads = max(1, threads - 1)
with WorkerPool(dsn, place_threads, ignore_sql_errors=True) as pool:
with tokenizer.name_analyzer() as analyzer:
while tar:
with tar.next_file() as fd:
handle_threaded_sql_statements(pool, fd, analyzer)
print('\n')
LOG.warning("Creating indexes on Tiger data")
with connect(dsn) as conn:
sql = SQLPreprocessor(conn, config)
sql.run_sql_file(conn, 'tiger_import_finish.sql')
return 0