Mirror of https://github.com/osm-search/Nominatim.git
Synced 2026-03-06 18:14:16 +00:00

Commit: split code into submodules
src/nominatim_db/tools/__init__.py (new file, 10 lines)
@@ -0,0 +1,10 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Module with functions for importing and updating Nominatim databases,
as well as general maintenance helpers.
"""
src/nominatim_db/tools/add_osm_data.py (new file, 68 lines)
@@ -0,0 +1,68 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Function to add additional OSM data from a file or the API into the database.
"""
from typing import Any, MutableMapping
from pathlib import Path
import logging
import urllib

from nominatim_core.db.connection import connect
from nominatim_core.utils.url_utils import get_url
from .exec_utils import run_osm2pgsql

LOG = logging.getLogger()

def _run_osm2pgsql(dsn: str, options: MutableMapping[str, Any]) -> None:
    run_osm2pgsql(options)

    # Handle deletions
    with connect(dsn) as conn:
        with conn.cursor() as cur:
            cur.execute('SELECT flush_deleted_places()')
        conn.commit()


def add_data_from_file(dsn: str, fname: str, options: MutableMapping[str, Any]) -> int:
    """ Adds data from an OSM file to the database. The file may be a normal
        OSM file or a diff file in all formats supported by libosmium.
    """
    options['import_file'] = Path(fname)
    options['append'] = True
    _run_osm2pgsql(dsn, options)

    # No status update. We don't know where the file came from.
    return 0


def add_osm_object(dsn: str, osm_type: str, osm_id: int, use_main_api: bool,
                   options: MutableMapping[str, Any]) -> int:
    """ Add or update a single OSM object from the latest version of the
        API.
    """
    if use_main_api:
        base_url = f'https://www.openstreetmap.org/api/0.6/{osm_type}/{osm_id}'
        if osm_type in ('way', 'relation'):
            base_url += '/full'
    else:
        # use Overpass API
        if osm_type == 'node':
            data = f'node({osm_id});out meta;'
        elif osm_type == 'way':
            data = f'(way({osm_id});>;);out meta;'
        else:
            data = f'(rel(id:{osm_id});>;);out meta;'
        base_url = 'https://overpass-api.de/api/interpreter?' \
                   + urllib.parse.urlencode({'data': data})

    options['append'] = True
    options['import_data'] = get_url(base_url).encode('utf-8')

    _run_osm2pgsql(dsn, options)

    return 0
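For orientation, a minimal usage sketch (not part of the commit). The DSN and option values are illustrative assumptions; the dict mirrors the osm2pgsql settings read by run_osm2pgsql() in exec_utils.py:

    from pathlib import Path
    from nominatim_db.tools.add_osm_data import add_data_from_file, add_osm_object

    dsn = 'dbname=nominatim'                     # placeholder connection string
    options = {'dsn': dsn, 'osm2pgsql': None, 'osm2pgsql_cache': 1000,
               'osm2pgsql_style': Path('import.lua'), 'osm2pgsql_style_path': Path('.'),
               'threads': 1, 'flatnode_file': '',
               'tablespaces': {'slim_data': '', 'slim_index': '',
                               'main_data': '', 'main_index': ''}}

    add_data_from_file(dsn, 'changes.osc.gz', options)     # append a diff file
    add_osm_object(dsn, 'relation', 62422, True, options)  # fetch one object from the main API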
src/nominatim_db/tools/admin.py (new file, 106 lines)
@@ -0,0 +1,106 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Functions for database analysis and maintenance.
"""
from typing import Optional, Tuple, Any, cast
import logging

from psycopg2.extras import Json, register_hstore
from psycopg2 import DataError

from nominatim_core.typing import DictCursorResult
from nominatim_core.config import Configuration
from nominatim_core.db.connection import connect, Cursor
from nominatim_core.errors import UsageError
from ..tokenizer import factory as tokenizer_factory
from ..data.place_info import PlaceInfo

LOG = logging.getLogger()

def _get_place_info(cursor: Cursor, osm_id: Optional[str],
                    place_id: Optional[int]) -> DictCursorResult:
    sql = """SELECT place_id, extra.*
             FROM placex, LATERAL placex_indexing_prepare(placex) as extra
          """

    values: Tuple[Any, ...]
    if osm_id:
        osm_type = osm_id[0].upper()
        if osm_type not in 'NWR' or not osm_id[1:].isdigit():
            LOG.fatal('OSM ID must be of form <N|W|R><id>. Got: %s', osm_id)
            raise UsageError("OSM ID parameter badly formatted")

        sql += ' WHERE placex.osm_type = %s AND placex.osm_id = %s'
        values = (osm_type, int(osm_id[1:]))
    elif place_id is not None:
        sql += ' WHERE placex.place_id = %s'
        values = (place_id, )
    else:
        LOG.fatal("No OSM object given to index.")
        raise UsageError("OSM object not found")

    cursor.execute(sql + ' LIMIT 1', values)

    if cursor.rowcount < 1:
        LOG.fatal("OSM object %s not found in database.", osm_id)
        raise UsageError("OSM object not found")

    return cast(DictCursorResult, cursor.fetchone())


def analyse_indexing(config: Configuration, osm_id: Optional[str] = None,
                     place_id: Optional[int] = None) -> None:
    """ Analyse indexing of a single Nominatim object.
    """
    with connect(config.get_libpq_dsn()) as conn:
        register_hstore(conn)
        with conn.cursor() as cur:
            place = _get_place_info(cur, osm_id, place_id)

            cur.execute("update placex set indexed_status = 2 where place_id = %s",
                        (place['place_id'], ))

            cur.execute("""SET auto_explain.log_min_duration = '0';
                           SET auto_explain.log_analyze = 'true';
                           SET auto_explain.log_nested_statements = 'true';
                           LOAD 'auto_explain';
                           SET client_min_messages = LOG;
                           SET log_min_messages = FATAL""")

            tokenizer = tokenizer_factory.get_tokenizer_for_db(config)

            with tokenizer.name_analyzer() as analyzer:
                cur.execute("""UPDATE placex
                               SET indexed_status = 0, address = %s, token_info = %s,
                               name = %s, linked_place_id = %s
                               WHERE place_id = %s""",
                            (place['address'],
                             Json(analyzer.process_place(PlaceInfo(place))),
                             place['name'], place['linked_place_id'], place['place_id']))

        # we do not want to keep the results
        conn.rollback()

        for msg in conn.notices:
            print(msg)


def clean_deleted_relations(config: Configuration, age: str) -> None:
    """ Clean deleted relations older than a given age
    """
    with connect(config.get_libpq_dsn()) as conn:
        with conn.cursor() as cur:
            try:
                cur.execute("""SELECT place_force_delete(p.place_id)
                               FROM import_polygon_delete d, placex p
                               WHERE p.osm_type = d.osm_type AND p.osm_id = d.osm_id
                               AND age(p.indexed_date) > %s::interval""",
                            (age, ))
            except DataError as exc:
                raise UsageError('Invalid PostgreSQL time interval format') from exc
        conn.commit()
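A brief usage sketch (not part of the commit), with hedged assumptions: a Configuration built from the environment and a made-up OSM id:

    from nominatim_core.config import Configuration
    from nominatim_db.tools.admin import analyse_indexing, clean_deleted_relations

    config = Configuration(None)                    # assumption: settings come from .env/environment
    analyse_indexing(config, osm_id='N123456')      # replay indexing for one object with auto_explain output
    clean_deleted_relations(config, age='1 month')  # age is any PostgreSQL interval string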
src/nominatim_db/tools/check_database.py (new file, 350 lines)
@@ -0,0 +1,350 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Collection of functions that check if the database is complete and functional.
"""
from typing import Callable, Optional, Any, Union, Tuple, Mapping, List
from enum import Enum
from textwrap import dedent

from nominatim_core.config import Configuration
from nominatim_core.db.connection import connect, Connection
from nominatim_core.db import properties
from nominatim_core.errors import UsageError
from ..tokenizer import factory as tokenizer_factory
from . import freeze
from ..version import NOMINATIM_VERSION, parse_version

CHECKLIST = []

class CheckState(Enum):
    """ Possible states of a check. FATAL stops check execution entirely.
    """
    OK = 0
    FAIL = 1
    FATAL = 2
    NOT_APPLICABLE = 3
    WARN = 4

CheckResult = Union[CheckState, Tuple[CheckState, Mapping[str, Any]]]
CheckFunc = Callable[[Connection, Configuration], CheckResult]

def _check(hint: Optional[str] = None) -> Callable[[CheckFunc], CheckFunc]:
    """ Decorator for checks. It adds the function to the list of
        checks to execute and adds the code for printing progress messages.
    """
    def decorator(func: CheckFunc) -> CheckFunc:
        title = (func.__doc__ or '').split('\n', 1)[0].strip()

        def run_check(conn: Connection, config: Configuration) -> CheckState:
            print(title, end=' ... ')
            ret = func(conn, config)
            if isinstance(ret, tuple):
                ret, params = ret
            else:
                params = {}
            if ret == CheckState.OK:
                print('\033[92mOK\033[0m')
            elif ret == CheckState.WARN:
                print('\033[93mWARNING\033[0m')
                if hint:
                    print('')
                    print(dedent(hint.format(**params)))
            elif ret == CheckState.NOT_APPLICABLE:
                print('not applicable')
            else:
                print('\x1B[31mFailed\033[0m')
                if hint:
                    print(dedent(hint.format(**params)))
            return ret

        CHECKLIST.append(run_check)
        return run_check

    return decorator

class _BadConnection:

    def __init__(self, msg: str) -> None:
        self.msg = msg

    def close(self) -> None:
        """ Dummy function to provide the implementation.
        """

def check_database(config: Configuration) -> int:
    """ Run a number of checks on the database and return the status.
    """
    try:
        conn = connect(config.get_libpq_dsn()).connection
    except UsageError as err:
        conn = _BadConnection(str(err))  # type: ignore[assignment]

    overall_result = 0
    for check in CHECKLIST:
        ret = check(conn, config)
        if ret == CheckState.FATAL:
            conn.close()
            return 1
        if ret in (CheckState.FATAL, CheckState.FAIL):
            overall_result = 1

    conn.close()
    return overall_result


def _get_indexes(conn: Connection) -> List[str]:
    indexes = ['idx_place_addressline_address_place_id',
               'idx_placex_rank_search',
               'idx_placex_rank_address',
               'idx_placex_parent_place_id',
               'idx_placex_geometry_reverse_lookuppolygon',
               'idx_placex_geometry_placenode',
               'idx_osmline_parent_place_id',
               'idx_osmline_parent_osm_id',
               'idx_postcode_id',
               'idx_postcode_postcode'
               ]
    if conn.table_exists('search_name'):
        indexes.extend(('idx_search_name_nameaddress_vector',
                        'idx_search_name_name_vector',
                        'idx_search_name_centroid'))
        if conn.server_version_tuple() >= (11, 0, 0):
            indexes.extend(('idx_placex_housenumber',
                            'idx_osmline_parent_osm_id_with_hnr'))
    if conn.table_exists('place'):
        indexes.extend(('idx_location_area_country_place_id',
                        'idx_place_osm_unique',
                        'idx_placex_rank_address_sector',
                        'idx_placex_rank_boundaries_sector'))

    return indexes


# CHECK FUNCTIONS
#
# Functions are executed in the order they appear here.

@_check(hint="""\
             {error}

             Hints:
             * Is the database server started?
             * Check the NOMINATIM_DATABASE_DSN variable in your local .env
             * Try connecting to the database with the same settings

             Project directory: {config.project_dir}
             Current setting of NOMINATIM_DATABASE_DSN: {config.DATABASE_DSN}
             """)
def check_connection(conn: Any, config: Configuration) -> CheckResult:
    """ Checking database connection
    """
    if isinstance(conn, _BadConnection):
        return CheckState.FATAL, dict(error=conn.msg, config=config)

    return CheckState.OK

@_check(hint="""\
             Database version ({db_version}) doesn't match Nominatim version ({nom_version})

             Hints:
             * Are you connecting to the correct database?

             {instruction}

             Check the Migration chapter of the Administration Guide.

             Project directory: {config.project_dir}
             Current setting of NOMINATIM_DATABASE_DSN: {config.DATABASE_DSN}
             """)
def check_database_version(conn: Connection, config: Configuration) -> CheckResult:
    """ Checking database_version matches Nominatim software version
    """

    if conn.table_exists('nominatim_properties'):
        db_version_str = properties.get_property(conn, 'database_version')
    else:
        db_version_str = None

    if db_version_str is not None:
        db_version = parse_version(db_version_str)

        if db_version == NOMINATIM_VERSION:
            return CheckState.OK

        instruction = (
            'Run migrations: nominatim admin --migrate'
            if db_version < NOMINATIM_VERSION
            else 'You need to upgrade the Nominatim software.'
        )
    else:
        instruction = ''

    return CheckState.FATAL, dict(db_version=db_version_str,
                                  nom_version=NOMINATIM_VERSION,
                                  instruction=instruction,
                                  config=config)

@_check(hint="""\
             placex table not found

             Hints:
             * Are you connecting to the correct database?
             * Did the import process finish without errors?

             Project directory: {config.project_dir}
             Current setting of NOMINATIM_DATABASE_DSN: {config.DATABASE_DSN}
             """)
def check_placex_table(conn: Connection, config: Configuration) -> CheckResult:
    """ Checking for placex table
    """
    if conn.table_exists('placex'):
        return CheckState.OK

    return CheckState.FATAL, dict(config=config)


@_check(hint="""placex table has no data. Did the import finish successfully?""")
def check_placex_size(conn: Connection, _: Configuration) -> CheckResult:
    """ Checking for placex content
    """
    with conn.cursor() as cur:
        cnt = cur.scalar('SELECT count(*) FROM (SELECT * FROM placex LIMIT 100) x')

    return CheckState.OK if cnt > 0 else CheckState.FATAL


@_check(hint="""{msg}""")
def check_tokenizer(_: Connection, config: Configuration) -> CheckResult:
    """ Checking that tokenizer works
    """
    try:
        tokenizer = tokenizer_factory.get_tokenizer_for_db(config)
    except UsageError:
        return CheckState.FAIL, dict(msg="""\
            Cannot load tokenizer. Did the import finish successfully?""")

    result = tokenizer.check_database(config)

    if result is None:
        return CheckState.OK

    return CheckState.FAIL, dict(msg=result)


@_check(hint="""\
             Wikipedia/Wikidata importance tables missing.
             Quality of search results may be degraded. Reverse geocoding is unaffected.
             See https://nominatim.org/release-docs/latest/admin/Import/#wikipediawikidata-rankings
             """)
def check_existance_wikipedia(conn: Connection, _: Configuration) -> CheckResult:
    """ Checking for wikipedia/wikidata data
    """
    if not conn.table_exists('search_name') or not conn.table_exists('place'):
        return CheckState.NOT_APPLICABLE

    with conn.cursor() as cur:
        if conn.table_exists('wikimedia_importance'):
            cnt = cur.scalar('SELECT count(*) FROM wikimedia_importance')
        else:
            cnt = cur.scalar('SELECT count(*) FROM wikipedia_article')

    return CheckState.WARN if cnt == 0 else CheckState.OK


@_check(hint="""\
             The indexing didn't finish. {count} entries are not yet indexed.

             To index the remaining entries, run: {index_cmd}
             """)
def check_indexing(conn: Connection, _: Configuration) -> CheckResult:
    """ Checking indexing status
    """
    with conn.cursor() as cur:
        cnt = cur.scalar('SELECT count(*) FROM placex WHERE indexed_status > 0')

    if cnt == 0:
        return CheckState.OK

    if freeze.is_frozen(conn):
        index_cmd = """\
            Database is marked frozen, it cannot be updated.
            Low counts of unindexed places are fine."""
        return CheckState.WARN, dict(count=cnt, index_cmd=index_cmd)

    if conn.index_exists('idx_placex_rank_search'):
        # Likely just an interrupted update.
        index_cmd = 'nominatim index'
    else:
        # Looks like the import process got interrupted.
        index_cmd = 'nominatim import --continue indexing'

    return CheckState.FAIL, dict(count=cnt, index_cmd=index_cmd)


@_check(hint="""\
             The following indexes are missing:
             {indexes}

             Rerun the index creation with: nominatim import --continue db-postprocess
             """)
def check_database_indexes(conn: Connection, _: Configuration) -> CheckResult:
    """ Checking that database indexes are complete
    """
    missing = []
    for index in _get_indexes(conn):
        if not conn.index_exists(index):
            missing.append(index)

    if missing:
        return CheckState.FAIL, dict(indexes='\n '.join(missing))

    return CheckState.OK


@_check(hint="""\
             At least one index is invalid. That can happen, e.g. when index creation was
             disrupted and later restarted. You should delete the affected indices
             and recreate them.

             Invalid indexes:
             {indexes}
             """)
def check_database_index_valid(conn: Connection, _: Configuration) -> CheckResult:
    """ Checking that all database indexes are valid
    """
    with conn.cursor() as cur:
        cur.execute(""" SELECT relname FROM pg_class, pg_index
                        WHERE pg_index.indisvalid = false
                        AND pg_index.indexrelid = pg_class.oid""")

        broken = [c[0] for c in cur]

    if broken:
        return CheckState.FAIL, dict(indexes='\n '.join(broken))

    return CheckState.OK


@_check(hint="""\
             {error}
             Run TIGER import again: nominatim add-data --tiger-data <DIR>
             """)
def check_tiger_table(conn: Connection, config: Configuration) -> CheckResult:
    """ Checking TIGER external data table.
    """
    if not config.get_bool('USE_US_TIGER_DATA'):
        return CheckState.NOT_APPLICABLE

    if not conn.table_exists('location_property_tiger'):
        return CheckState.FAIL, dict(error='TIGER data table not found.')

    with conn.cursor() as cur:
        if cur.scalar('SELECT count(*) FROM location_property_tiger') == 0:
            return CheckState.FAIL, dict(error='TIGER data table is empty.')

    return CheckState.OK
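Because checks self-register through the decorator, adding one is only a matter of defining a new function in this file. An illustrative sketch (not part of the commit; the table name is hypothetical):

    @_check(hint="""\
                 Auxiliary table {table} is missing. Re-run the relevant import step.
                 """)
    def check_aux_table(conn: Connection, _: Configuration) -> CheckResult:
        """ Checking for auxiliary table
        """
        if conn.table_exists('aux_table'):       # hypothetical table name
            return CheckState.OK

        return CheckState.FAIL, dict(table='aux_table')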
src/nominatim_db/tools/collect_os_info.py (new file, 166 lines)
@@ -0,0 +1,166 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Collection of host system information including software versions, memory,
storage, and database configuration.
"""
import os
import subprocess
import sys
from pathlib import Path
from typing import List, Optional, Tuple, Union

import psutil
from psycopg2.extensions import make_dsn, parse_dsn

from nominatim_core.config import Configuration
from nominatim_core.db.connection import connect
from ..version import NOMINATIM_VERSION


def convert_version(ver_tup: Tuple[int, int]) -> str:
    """converts tuple version (ver_tup) to a string representation"""
    return ".".join(map(str, ver_tup))


def friendly_memory_string(mem: float) -> str:
    """Create a user friendly string for the amount of memory specified as mem"""
    mem_magnitude = ("bytes", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")
    mag = 0
    # determine order of magnitude
    while mem > 1000:
        mem /= 1000
        mag += 1

    return f"{mem:.1f} {mem_magnitude[mag]}"


def run_command(cmd: Union[str, List[str]]) -> str:
    """Runs a command using the shell and returns the output from stdout"""
    try:
        if sys.version_info < (3, 7):
            cap_out = subprocess.run(cmd, stdout=subprocess.PIPE, check=False)
        else:
            cap_out = subprocess.run(cmd, capture_output=True, check=False)
        return cap_out.stdout.decode("utf-8")
    except FileNotFoundError:
        # non-Linux system should end up here
        return f"Unknown (unable to find the '{cmd}' command)"


def os_name_info() -> str:
    """Obtain Operating System Name (and possibly the version)"""
    os_info = None
    # man page os-release(5) details meaning of the fields
    if Path("/etc/os-release").is_file():
        os_info = from_file_find_line_portion(
            "/etc/os-release", "PRETTY_NAME", "=")
    # alternative location
    elif Path("/usr/lib/os-release").is_file():
        os_info = from_file_find_line_portion(
            "/usr/lib/os-release", "PRETTY_NAME", "="
        )

    # fallback on Python's os name
    if os_info is None or os_info == "":
        os_info = os.name

    # if the above is insufficient, take a look at neofetch's approach to OS detection
    return os_info


# Note: Intended to be used on informational files like /proc
def from_file_find_line_portion(
    filename: str, start: str, sep: str, fieldnum: int = 1
) -> Optional[str]:
    """open filename, finds the line starting with the 'start' string.
    Splits the line using separator and returns a "fieldnum" from the split."""
    with open(filename, encoding='utf8') as file:
        result = ""
        for line in file:
            if line.startswith(start):
                result = line.split(sep)[fieldnum].strip()
    return result


def get_postgresql_config(version: int) -> str:
    """Retrieve postgres configuration file"""
    try:
        with open(f"/etc/postgresql/{version}/main/postgresql.conf", encoding='utf8') as file:
            db_config = file.read()
            file.close()
            return db_config
    except IOError:
        return f"**Could not read '/etc/postgresql/{version}/main/postgresql.conf'**"


def report_system_information(config: Configuration) -> None:
    """Generate a report about the host system including software versions, memory,
    storage, and database configuration."""

    with connect(make_dsn(config.get_libpq_dsn(), dbname='postgres')) as conn:
        postgresql_ver: str = convert_version(conn.server_version_tuple())

        with conn.cursor() as cur:
            num = cur.scalar("SELECT count(*) FROM pg_catalog.pg_database WHERE datname=%s",
                             (parse_dsn(config.get_libpq_dsn())['dbname'], ))
            nominatim_db_exists = num == 1 if isinstance(num, int) else False

    if nominatim_db_exists:
        with connect(config.get_libpq_dsn()) as conn:
            postgis_ver: str = convert_version(conn.postgis_version_tuple())
    else:
        postgis_ver = "Unable to connect to database"

    postgresql_config: str = get_postgresql_config(int(float(postgresql_ver)))

    # Note: psutil.disk_partitions() is similar to run_command("lsblk")

    # Note: run_command("systemd-detect-virt") only works on Linux, on other OSes
    # should give a message: "Unknown (unable to find the 'systemd-detect-virt' command)"

    # Generates the Markdown report.

    report = f"""
**Instructions**
Use this information in your issue report at https://github.com/osm-search/Nominatim/issues
Redirect the output to a file:
$ ./collect_os_info.py > report.md


**Software Environment:**
- Python version: {sys.version}
- Nominatim version: {NOMINATIM_VERSION!s}
- PostgreSQL version: {postgresql_ver}
- PostGIS version: {postgis_ver}
- OS: {os_name_info()}


**Hardware Configuration:**
- RAM: {friendly_memory_string(psutil.virtual_memory().total)}
- number of CPUs: {psutil.cpu_count(logical=False)}
- bare metal/AWS/other cloud service (per systemd-detect-virt(1)): {run_command("systemd-detect-virt")}
- type and size of disks:
**`df -h` - df - report file system disk space usage: **
```
{run_command(["df", "-h"])}
```

**lsblk - list block devices: **
```
{run_command("lsblk")}
```


**Postgresql Configuration:**
```
{postgresql_config}
```
**Notes**
Please add any notes about anything above that is incorrect.
"""
    print(report)
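Two of the helpers are handy on their own; a quick sketch of their output (not part of the commit):

    from nominatim_db.tools.collect_os_info import convert_version, friendly_memory_string

    convert_version((16, 2))            # -> '16.2'
    friendly_memory_string(8 * 10**9)   # -> '8.0 GB' (decimal magnitudes, not binary)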
src/nominatim_db/tools/convert_sqlite.py (new file, 265 lines)
@@ -0,0 +1,265 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Exporting a Nominatim database to SQLite.
"""
from typing import Set, Any
import datetime as dt
import logging
from pathlib import Path

import sqlalchemy as sa

import nominatim_api as napi
from nominatim_api.search.query_analyzer_factory import make_query_analyzer
from nominatim_core.typing import SaSelect, SaRow
from nominatim_core.db.sqlalchemy_types import Geometry, IntArray

LOG = logging.getLogger()

async def convert(project_dir: Path, outfile: Path, options: Set[str]) -> None:
    """ Export an existing database to sqlite. The resulting database
        will be usable against the Python frontend of Nominatim.
    """
    api = napi.NominatimAPIAsync(project_dir)

    try:
        outapi = napi.NominatimAPIAsync(project_dir,
                                        {'NOMINATIM_DATABASE_DSN': f"sqlite:dbname={outfile}",
                                         'NOMINATIM_DATABASE_RW': '1'})

        try:
            async with api.begin() as src, outapi.begin() as dest:
                writer = SqliteWriter(src, dest, options)
                await writer.write()
        finally:
            await outapi.close()
    finally:
        await api.close()


class SqliteWriter:
    """ Worker class which creates a new SQLite database.
    """

    def __init__(self, src: napi.SearchConnection,
                 dest: napi.SearchConnection, options: Set[str]) -> None:
        self.src = src
        self.dest = dest
        self.options = options


    async def write(self) -> None:
        """ Create the database structure and copy the data from
            the source database to the destination.
        """
        LOG.warning('Setting up spatialite')
        await self.dest.execute(sa.select(sa.func.InitSpatialMetaData(True, 'WGS84')))

        await self.create_tables()
        await self.copy_data()
        if 'search' in self.options:
            await self.create_word_table()
        await self.create_indexes()


    async def create_tables(self) -> None:
        """ Set up the database tables.
        """
        LOG.warning('Setting up tables')
        if 'search' not in self.options:
            self.dest.t.meta.remove(self.dest.t.search_name)
        else:
            await self.create_class_tables()

        await self.dest.connection.run_sync(self.dest.t.meta.create_all)

        # Convert all Geometry columns to Spatialite geometries
        for table in self.dest.t.meta.sorted_tables:
            for col in table.c:
                if isinstance(col.type, Geometry):
                    await self.dest.execute(sa.select(
                        sa.func.RecoverGeometryColumn(table.name, col.name, 4326,
                                                      col.type.subtype.upper(), 'XY')))


    async def create_class_tables(self) -> None:
        """ Set up the tables that serve class/type-specific geometries.
        """
        sql = sa.text("""SELECT tablename FROM pg_tables
                         WHERE tablename LIKE 'place_classtype_%'""")
        for res in await self.src.execute(sql):
            for db in (self.src, self.dest):
                sa.Table(res[0], db.t.meta,
                         sa.Column('place_id', sa.BigInteger),
                         sa.Column('centroid', Geometry))


    async def create_word_table(self) -> None:
        """ Create the word table.
            This table needs the property information to determine the
            correct format. Therefore it needs to be done after all other
            data has been copied.
        """
        await make_query_analyzer(self.src)
        await make_query_analyzer(self.dest)
        src = self.src.t.meta.tables['word']
        dest = self.dest.t.meta.tables['word']

        await self.dest.connection.run_sync(dest.create)

        LOG.warning("Copying word table")
        async_result = await self.src.connection.stream(sa.select(src))

        async for partition in async_result.partitions(10000):
            data = [{k: getattr(r, k) for k in r._fields} for r in partition]
            await self.dest.execute(dest.insert(), data)

        await self.dest.connection.run_sync(sa.Index('idx_word_woken', dest.c.word_token).create)


    async def copy_data(self) -> None:
        """ Copy data for all registered tables.
        """
        def _getfield(row: SaRow, key: str) -> Any:
            value = getattr(row, key)
            if isinstance(value, dt.datetime):
                if value.tzinfo is not None:
                    value = value.astimezone(dt.timezone.utc)
            return value

        for table in self.dest.t.meta.sorted_tables:
            LOG.warning("Copying '%s'", table.name)
            async_result = await self.src.connection.stream(self.select_from(table.name))

            async for partition in async_result.partitions(10000):
                data = [{('class_' if k == 'class' else k): _getfield(r, k)
                         for k in r._fields}
                        for r in partition]
                await self.dest.execute(table.insert(), data)

        # Set up a minimal copy of pg_tables used to look up the class tables later.
        pg_tables = sa.Table('pg_tables', self.dest.t.meta,
                             sa.Column('schemaname', sa.Text, default='public'),
                             sa.Column('tablename', sa.Text))
        await self.dest.connection.run_sync(pg_tables.create)
        data = [{'tablename': t} for t in self.dest.t.meta.tables]
        await self.dest.execute(pg_tables.insert().values(data))


    async def create_indexes(self) -> None:
        """ Add indexes necessary for the frontend.
        """
        # reverse place node lookup needs an extra table to simulate a
        # partial index with adaptive buffering.
        await self.dest.execute(sa.text(
            """ CREATE TABLE placex_place_node_areas AS
                  SELECT place_id, ST_Expand(geometry,
                                             14.0 * exp(-0.2 * rank_search) - 0.03) as geometry
                  FROM placex
                  WHERE rank_address between 5 and 25
                        and osm_type = 'N'
                        and linked_place_id is NULL """))
        await self.dest.execute(sa.select(
            sa.func.RecoverGeometryColumn('placex_place_node_areas', 'geometry',
                                          4326, 'GEOMETRY', 'XY')))
        await self.dest.execute(sa.select(sa.func.CreateSpatialIndex(
            'placex_place_node_areas', 'geometry')))

        # Remaining indexes.
        await self.create_spatial_index('country_grid', 'geometry')
        await self.create_spatial_index('placex', 'geometry')
        await self.create_spatial_index('osmline', 'linegeo')
        await self.create_spatial_index('tiger', 'linegeo')
        await self.create_index('placex', 'place_id')
        await self.create_index('placex', 'parent_place_id')
        await self.create_index('placex', 'rank_address')
        await self.create_index('addressline', 'place_id')
        await self.create_index('postcode', 'place_id')
        await self.create_index('osmline', 'place_id')
        await self.create_index('tiger', 'place_id')

        if 'search' in self.options:
            await self.create_spatial_index('postcode', 'geometry')
            await self.create_spatial_index('search_name', 'centroid')
            await self.create_index('search_name', 'place_id')
            await self.create_index('osmline', 'parent_place_id')
            await self.create_index('tiger', 'parent_place_id')
            await self.create_search_index()

            for t in self.dest.t.meta.tables:
                if t.startswith('place_classtype_'):
                    await self.dest.execute(sa.select(
                        sa.func.CreateSpatialIndex(t, 'centroid')))


    async def create_spatial_index(self, table: str, column: str) -> None:
        """ Create a spatial index on the given table and column.
        """
        await self.dest.execute(sa.select(
            sa.func.CreateSpatialIndex(getattr(self.dest.t, table).name, column)))


    async def create_index(self, table_name: str, column: str) -> None:
        """ Create a simple index on the given table and column.
        """
        table = getattr(self.dest.t, table_name)
        await self.dest.connection.run_sync(
            sa.Index(f"idx_{table}_{column}", getattr(table.c, column)).create)


    async def create_search_index(self) -> None:
        """ Create the tables and indexes needed for word lookup.
        """
        LOG.warning("Creating reverse search table")
        rsn = sa.Table('reverse_search_name', self.dest.t.meta,
                       sa.Column('word', sa.Integer()),
                       sa.Column('column', sa.Text()),
                       sa.Column('places', IntArray))
        await self.dest.connection.run_sync(rsn.create)

        tsrc = self.src.t.search_name
        for column in ('name_vector', 'nameaddress_vector'):
            sql = sa.select(sa.func.unnest(getattr(tsrc.c, column)).label('word'),
                            sa.func.ArrayAgg(tsrc.c.place_id).label('places'))\
                    .group_by('word')

            async_result = await self.src.connection.stream(sql)
            async for partition in async_result.partitions(100):
                data = []
                for row in partition:
                    row.places.sort()
                    data.append({'word': row.word,
                                 'column': column,
                                 'places': row.places})
                await self.dest.execute(rsn.insert(), data)

        await self.dest.connection.run_sync(
            sa.Index('idx_reverse_search_name_word', rsn.c.word).create)


    def select_from(self, table: str) -> SaSelect:
        """ Create the SQL statement to select the source columns and rows.
        """
        columns = self.src.t.meta.tables[table].c

        if table == 'placex':
            # SQLite struggles with Geometries that are larger than 5MB,
            # so simplify those.
            return sa.select(*(c for c in columns if not isinstance(c.type, Geometry)),
                             sa.func.ST_AsText(columns.centroid).label('centroid'),
                             sa.func.ST_AsText(
                                 sa.case((sa.func.ST_MemSize(columns.geometry) < 5000000,
                                          columns.geometry),
                                         else_=sa.func.ST_SimplifyPreserveTopology(
                                             columns.geometry, 0.0001)
                                         )).label('geometry'))

        sql = sa.select(*(sa.func.ST_AsText(c).label(c.name)
                          if isinstance(c.type, Geometry) else c for c in columns))

        return sql
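convert() is a coroutine, so callers have to drive it with an event loop. A minimal sketch (not part of the commit; the paths are placeholders):

    import asyncio
    from pathlib import Path
    from nominatim_db.tools.convert_sqlite import convert

    # Include the search tables; pass an empty set for a reverse-only export.
    asyncio.run(convert(Path('/srv/nominatim-project'), Path('nominatim.sqlite'), {'search'}))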
src/nominatim_db/tools/database_import.py (new file, 272 lines)
@@ -0,0 +1,272 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Functions for setting up and importing a new Nominatim database.
"""
from typing import Tuple, Optional, Union, Sequence, MutableMapping, Any
import logging
import os
import selectors
import subprocess
from pathlib import Path

import psutil
from psycopg2 import sql as pysql

from nominatim_core.errors import UsageError
from nominatim_core.config import Configuration
from nominatim_core.db.connection import connect, get_pg_env, Connection
from nominatim_core.db.async_connection import DBConnection
from nominatim_core.db.sql_preprocessor import SQLPreprocessor
from .exec_utils import run_osm2pgsql
from ..version import POSTGRESQL_REQUIRED_VERSION, POSTGIS_REQUIRED_VERSION

LOG = logging.getLogger()

def _require_version(module: str, actual: Tuple[int, int], expected: Tuple[int, int]) -> None:
    """ Compares the version for the given module and raises an exception
        if the actual version is too old.
    """
    if actual < expected:
        LOG.fatal('Minimum supported version of %s is %d.%d. '
                  'Found version %d.%d.',
                  module, expected[0], expected[1], actual[0], actual[1])
        raise UsageError(f'{module} is too old.')


def _require_loaded(extension_name: str, conn: Connection) -> None:
    """ Check that the given extension is loaded. """
    if not conn.extension_loaded(extension_name):
        LOG.fatal('Required module %s is not loaded.', extension_name)
        raise UsageError(f'{extension_name} is not loaded.')


def check_existing_database_plugins(dsn: str) -> None:
    """ Check that the database has the required plugins installed."""
    with connect(dsn) as conn:
        _require_version('PostgreSQL server',
                         conn.server_version_tuple(),
                         POSTGRESQL_REQUIRED_VERSION)
        _require_version('PostGIS',
                         conn.postgis_version_tuple(),
                         POSTGIS_REQUIRED_VERSION)
        _require_loaded('hstore', conn)


def setup_database_skeleton(dsn: str, rouser: Optional[str] = None) -> None:
    """ Create a new database for Nominatim and populate it with the
        essential extensions.

        The function fails when the database already exists or PostgreSQL or
        PostGIS versions are too old.

        Uses `createdb` to create the database.

        If 'rouser' is given, then the function also checks that the user
        with that given name exists.

        Requires superuser rights by the caller.
    """
    proc = subprocess.run(['createdb'], env=get_pg_env(dsn), check=False)

    if proc.returncode != 0:
        raise UsageError('Creating new database failed.')

    with connect(dsn) as conn:
        _require_version('PostgreSQL server',
                         conn.server_version_tuple(),
                         POSTGRESQL_REQUIRED_VERSION)

        if rouser is not None:
            with conn.cursor() as cur:
                cnt = cur.scalar('SELECT count(*) FROM pg_user where usename = %s',
                                 (rouser, ))
                if cnt == 0:
                    LOG.fatal("Web user '%s' does not exist. Create it with:\n"
                              "\n createuser %s", rouser, rouser)
                    raise UsageError('Missing read-only user.')

        # Create extensions.
        with conn.cursor() as cur:
            cur.execute('CREATE EXTENSION IF NOT EXISTS hstore')
            cur.execute('CREATE EXTENSION IF NOT EXISTS postgis')

            postgis_version = conn.postgis_version_tuple()
            if postgis_version[0] >= 3:
                cur.execute('CREATE EXTENSION IF NOT EXISTS postgis_raster')

        conn.commit()

        _require_version('PostGIS',
                         conn.postgis_version_tuple(),
                         POSTGIS_REQUIRED_VERSION)


def import_osm_data(osm_files: Union[Path, Sequence[Path]],
                    options: MutableMapping[str, Any],
                    drop: bool = False, ignore_errors: bool = False) -> None:
    """ Import the given OSM files. 'options' contains the list of
        default settings for osm2pgsql.
    """
    options['import_file'] = osm_files
    options['append'] = False
    options['threads'] = 1

    if not options['flatnode_file'] and options['osm2pgsql_cache'] == 0:
        # Make some educated guesses about cache size based on the size
        # of the import file and the available memory.
        mem = psutil.virtual_memory()
        fsize = 0
        if isinstance(osm_files, list):
            for fname in osm_files:
                fsize += os.stat(str(fname)).st_size
        else:
            fsize = os.stat(str(osm_files)).st_size
        options['osm2pgsql_cache'] = int(min((mem.available + mem.cached) * 0.75,
                                             fsize * 2) / 1024 / 1024) + 1

    run_osm2pgsql(options)

    with connect(options['dsn']) as conn:
        if not ignore_errors:
            with conn.cursor() as cur:
                cur.execute('SELECT * FROM place LIMIT 1')
                if cur.rowcount == 0:
                    raise UsageError('No data imported by osm2pgsql.')

        if drop:
            conn.drop_table('planet_osm_nodes')

    if drop and options['flatnode_file']:
        Path(options['flatnode_file']).unlink()


def create_tables(conn: Connection, config: Configuration, reverse_only: bool = False) -> None:
    """ Create the set of basic tables.
        When `reverse_only` is True, then the main table for searching will
        be skipped and only reverse search is possible.
    """
    sql = SQLPreprocessor(conn, config)
    sql.env.globals['db']['reverse_only'] = reverse_only

    sql.run_sql_file(conn, 'tables.sql')


def create_table_triggers(conn: Connection, config: Configuration) -> None:
    """ Create the triggers for the tables. The trigger functions must already
        have been imported with refresh.create_functions().
    """
    sql = SQLPreprocessor(conn, config)
    sql.run_sql_file(conn, 'table-triggers.sql')


def create_partition_tables(conn: Connection, config: Configuration) -> None:
    """ Create tables that have explicit partitioning.
    """
    sql = SQLPreprocessor(conn, config)
    sql.run_sql_file(conn, 'partition-tables.src.sql')


def truncate_data_tables(conn: Connection) -> None:
    """ Truncate all data tables to prepare for a fresh load.
    """
    with conn.cursor() as cur:
        cur.execute('TRUNCATE placex')
        cur.execute('TRUNCATE place_addressline')
        cur.execute('TRUNCATE location_area')
        cur.execute('TRUNCATE location_area_country')
        cur.execute('TRUNCATE location_property_tiger')
        cur.execute('TRUNCATE location_property_osmline')
        cur.execute('TRUNCATE location_postcode')
        if conn.table_exists('search_name'):
            cur.execute('TRUNCATE search_name')
        cur.execute('DROP SEQUENCE IF EXISTS seq_place')
        cur.execute('CREATE SEQUENCE seq_place start 100000')

        cur.execute("""SELECT tablename FROM pg_tables
                       WHERE tablename LIKE 'location_road_%'""")

        for table in [r[0] for r in list(cur)]:
            cur.execute('TRUNCATE ' + table)

    conn.commit()


_COPY_COLUMNS = pysql.SQL(',').join(map(pysql.Identifier,
                                        ('osm_type', 'osm_id', 'class', 'type',
                                         'name', 'admin_level', 'address',
                                         'extratags', 'geometry')))


def load_data(dsn: str, threads: int) -> None:
    """ Copy data into the word and placex table.
    """
    sel = selectors.DefaultSelector()
    # Then copy data from place to placex in <threads - 1> chunks.
    place_threads = max(1, threads - 1)
    for imod in range(place_threads):
        conn = DBConnection(dsn)
        conn.connect()
        conn.perform(
            pysql.SQL("""INSERT INTO placex ({columns})
                         SELECT {columns} FROM place
                         WHERE osm_id % {total} = {mod}
                           AND NOT (class='place' and (type='houses' or type='postcode'))
                           AND ST_IsValid(geometry)
                      """).format(columns=_COPY_COLUMNS,
                                  total=pysql.Literal(place_threads),
                                  mod=pysql.Literal(imod)))
        sel.register(conn, selectors.EVENT_READ, conn)

    # Address interpolations go into another table.
    conn = DBConnection(dsn)
    conn.connect()
    conn.perform("""INSERT INTO location_property_osmline (osm_id, address, linegeo)
                    SELECT osm_id, address, geometry FROM place
                    WHERE class='place' and type='houses' and osm_type='W'
                          and ST_GeometryType(geometry) = 'ST_LineString'
                 """)
    sel.register(conn, selectors.EVENT_READ, conn)

    # Now wait for all of them to finish.
    todo = place_threads + 1
    while todo > 0:
        for key, _ in sel.select(1):
            conn = key.data
            sel.unregister(conn)
            conn.wait()
            conn.close()
            todo -= 1
        print('.', end='', flush=True)
    print('\n')

    with connect(dsn) as syn_conn:
        with syn_conn.cursor() as cur:
            cur.execute('ANALYSE')


def create_search_indices(conn: Connection, config: Configuration,
                          drop: bool = False, threads: int = 1) -> None:
    """ Create the indexes needed for searching.
    """

    # If index creation failed and left an index invalid, they need to be
    # cleaned out first, so that the script recreates them.
    with conn.cursor() as cur:
        cur.execute("""SELECT relname FROM pg_class, pg_index
                       WHERE pg_index.indisvalid = false
                       AND pg_index.indexrelid = pg_class.oid""")
        bad_indices = [row[0] for row in list(cur)]
        for idx in bad_indices:
            LOG.info("Drop invalid index %s.", idx)
            cur.execute(pysql.SQL('DROP INDEX {}').format(pysql.Identifier(idx)))
    conn.commit()

    sql = SQLPreprocessor(conn, config)

    sql.run_parallel_sql_file(config.get_libpq_dsn(),
                              'indices.sql', min(8, threads), drop=drop)
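Taken together, a fresh import roughly chains these functions. A condensed sketch (not part of the commit), with a placeholder DSN and an environment-based configuration as assumptions:

    from nominatim_core.config import Configuration
    from nominatim_core.db.connection import connect
    from nominatim_db.tools import database_import

    dsn = 'dbname=nominatim'          # placeholder
    config = Configuration(None)      # assumption: settings come from the environment

    database_import.setup_database_skeleton(dsn, rouser='www-data')
    # ... import_osm_data() runs here with the osm2pgsql options dict (see exec_utils.py) ...
    with connect(dsn) as conn:
        database_import.create_tables(conn, config)
        database_import.create_partition_tables(conn, config)
    database_import.load_data(dsn, threads=4)
    with connect(dsn) as conn:
        database_import.create_search_indices(conn, config, threads=4)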
src/nominatim_db/tools/exec_utils.py (new file, 84 lines)
@@ -0,0 +1,84 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Helper functions for executing external programs.
"""
from typing import Any, Mapping
import logging
import os
import subprocess
import shutil

from nominatim_core.typing import StrPath
from nominatim_core.db.connection import get_pg_env

LOG = logging.getLogger()

def run_php_server(server_address: str, base_dir: StrPath) -> None:
    """ Run the built-in server from the given directory.
    """
    subprocess.run(['/usr/bin/env', 'php', '-S', server_address],
                   cwd=str(base_dir), check=True)


def run_osm2pgsql(options: Mapping[str, Any]) -> None:
    """ Run osm2pgsql with the given options.
    """
    env = get_pg_env(options['dsn'])

    osm2pgsql_cmd = options['osm2pgsql']
    if osm2pgsql_cmd is None:
        osm2pgsql_cmd = shutil.which('osm2pgsql')
        if osm2pgsql_cmd is None:
            raise RuntimeError('osm2pgsql executable not found. Please install osm2pgsql first.')

    cmd = [str(osm2pgsql_cmd),
           '--slim',
           '--log-progress', 'true',
           '--number-processes', '1' if options['append'] else str(options['threads']),
           '--cache', str(options['osm2pgsql_cache']),
           '--style', str(options['osm2pgsql_style'])
           ]

    if str(options['osm2pgsql_style']).endswith('.lua'):
        env['LUA_PATH'] = ';'.join((str(options['osm2pgsql_style_path'] / '?.lua'),
                                    os.environ.get('LUAPATH', ';')))
        cmd.extend(('--output', 'flex'))
    else:
        cmd.extend(('--output', 'gazetteer', '--hstore', '--latlon'))

    cmd.append('--append' if options['append'] else '--create')

    if options['flatnode_file']:
        cmd.extend(('--flat-nodes', options['flatnode_file']))

    for key, param in (('slim_data', '--tablespace-slim-data'),
                       ('slim_index', '--tablespace-slim-index'),
                       ('main_data', '--tablespace-main-data'),
                       ('main_index', '--tablespace-main-index')):
        if options['tablespaces'][key]:
            cmd.extend((param, options['tablespaces'][key]))

    if options['tablespaces']['main_data']:
        env['NOMINATIM_TABLESPACE_PLACE_DATA'] = options['tablespaces']['main_data']
    if options['tablespaces']['main_index']:
        env['NOMINATIM_TABLESPACE_PLACE_INDEX'] = options['tablespaces']['main_index']

    if options.get('disable_jit', False):
        env['PGOPTIONS'] = '-c jit=off -c max_parallel_workers_per_gather=0'

    if 'import_data' in options:
        cmd.extend(('-r', 'xml', '-'))
    elif isinstance(options['import_file'], list):
        for fname in options['import_file']:
            cmd.append(str(fname))
    else:
        cmd.append(str(options['import_file']))

    subprocess.run(cmd, cwd=options.get('cwd', '.'),
                   input=options.get('import_data'),
                   env=env, check=True)
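run_osm2pgsql() is driven entirely by its options mapping. A hedged sketch of the keys it reads, with example values (not part of the commit; the real dict is assembled by the CLI layer):

    from pathlib import Path

    osm2pgsql_options = {
        'dsn': 'dbname=nominatim',
        'osm2pgsql': None,                  # None -> look up the binary on PATH
        'osm2pgsql_cache': 2000,            # cache size in MB
        'osm2pgsql_style': Path('import.lua'),
        'osm2pgsql_style_path': Path('/usr/share/nominatim/lua'),   # example path
        'threads': 4,
        'append': False,                    # False -> --create, True -> --append
        'flatnode_file': '',                # empty string disables --flat-nodes
        'tablespaces': {'slim_data': '', 'slim_index': '',
                        'main_data': '', 'main_index': ''},
        'import_file': Path('extract.osm.pbf'),
    }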
src/nominatim_db/tools/freeze.py (new file, 58 lines)
@@ -0,0 +1,58 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Functions for removing unnecessary data from the database.
"""
from typing import Optional
from pathlib import Path

from psycopg2 import sql as pysql

from nominatim_core.db.connection import Connection

UPDATE_TABLES = [
    'address_levels',
    'gb_postcode',
    'import_osmosis_log',
    'import_polygon_%',
    'location_area%',
    'location_road%',
    'place',
    'planet_osm_%',
    'search_name_%',
    'us_postcode',
    'wikipedia_%'
]

def drop_update_tables(conn: Connection) -> None:
    """ Drop all tables only necessary for updating the database from
        OSM replication data.
    """
    parts = (pysql.SQL("(tablename LIKE {})").format(pysql.Literal(t)) for t in UPDATE_TABLES)

    with conn.cursor() as cur:
        cur.execute(pysql.SQL("SELECT tablename FROM pg_tables WHERE ")
                    + pysql.SQL(' or ').join(parts))
        tables = [r[0] for r in cur]

        for table in tables:
            cur.drop_table(table, cascade=True)

    conn.commit()


def drop_flatnode_file(fpath: Optional[Path]) -> None:
    """ Remove the flatnode file if it exists.
    """
    if fpath and fpath.exists():
        fpath.unlink()

def is_frozen(conn: Connection) -> bool:
    """ Returns true if database is in a frozen state
    """

    return conn.table_exists('place') is False
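A short usage sketch (not part of the commit): freezing a database after a one-off import and checking the flag later. The DSN is a placeholder:

    from nominatim_core.db.connection import connect
    from nominatim_db.tools import freeze

    with connect('dbname=nominatim') as conn:
        freeze.drop_update_tables(conn)     # drops the update-only tables, including 'place'
        print(freeze.is_frozen(conn))       # True once 'place' is gone
    freeze.drop_flatnode_file(None)         # no flatnode file configured -> no-op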
src/nominatim_db/tools/migration.py (new file, 405 lines)
@@ -0,0 +1,405 @@
|
||||
# SPDX-License-Identifier: GPL-3.0-or-later
|
||||
#
|
||||
# This file is part of Nominatim. (https://nominatim.org)
|
||||
#
|
||||
# Copyright (C) 2024 by the Nominatim developer community.
|
||||
# For a full list of authors see the git log.
|
||||
"""
|
||||
Functions for database migration to newer software versions.
|
||||
"""
|
||||
from typing import List, Tuple, Callable, Any
|
||||
import logging
|
||||
|
||||
from psycopg2 import sql as pysql
|
||||
|
||||
from nominatim_core.errors import UsageError
|
||||
from nominatim_core.config import Configuration
|
||||
from nominatim_core.db import properties
|
||||
from nominatim_core.db.connection import connect, Connection
|
||||
from ..version import NominatimVersion, NOMINATIM_VERSION, parse_version
|
||||
from ..tokenizer import factory as tokenizer_factory
|
||||
from . import refresh
|
||||
|
||||
LOG = logging.getLogger()
|
||||
|
||||
_MIGRATION_FUNCTIONS : List[Tuple[NominatimVersion, Callable[..., None]]] = []
|
||||
|
||||
def migrate(config: Configuration, paths: Any) -> int:
|
||||
""" Check for the current database version and execute migrations,
|
||||
if necesssary.
|
||||
"""
|
||||
with connect(config.get_libpq_dsn()) as conn:
|
||||
if conn.table_exists('nominatim_properties'):
|
||||
db_version_str = properties.get_property(conn, 'database_version')
|
||||
else:
|
||||
db_version_str = None
|
||||
|
||||
if db_version_str is not None:
|
||||
db_version = parse_version(db_version_str)
|
||||
|
||||
if db_version == NOMINATIM_VERSION:
|
||||
LOG.warning("Database already at latest version (%s)", db_version_str)
|
||||
return 0
|
||||
|
||||
LOG.info("Detected database version: %s", db_version_str)
|
||||
else:
|
||||
db_version = _guess_version(conn)
|
||||
|
||||
|
||||
for version, func in _MIGRATION_FUNCTIONS:
|
||||
if db_version < version or \
|
||||
(db_version == (3, 5, 0, 99) and version == (3, 5, 0, 99)):
|
||||
title = func.__doc__ or ''
|
||||
LOG.warning("Running: %s (%s)", title.split('\n', 1)[0], version)
|
||||
kwargs = dict(conn=conn, config=config, paths=paths)
|
||||
func(**kwargs)
|
||||
conn.commit()
|
||||
|
||||
LOG.warning('Updating SQL functions.')
|
||||
refresh.create_functions(conn, config)
|
||||
tokenizer = tokenizer_factory.get_tokenizer_for_db(config)
|
||||
tokenizer.update_sql_functions(config)
|
||||
|
||||
properties.set_property(conn, 'database_version', str(NOMINATIM_VERSION))
|
||||
|
||||
conn.commit()
|
||||
|
||||
return 0
|
||||
|
||||
|
||||
def _guess_version(conn: Connection) -> NominatimVersion:
|
||||
""" Guess a database version when there is no property table yet.
|
||||
Only migrations for 3.6 and later are supported, so bail out
|
||||
when the version seems older.
|
||||
"""
|
||||
with conn.cursor() as cur:
|
||||
# In version 3.6, the country_name table was updated. Check for that.
|
||||
cnt = cur.scalar("""SELECT count(*) FROM
|
||||
(SELECT svals(name) FROM country_name
|
||||
WHERE country_code = 'gb')x;
|
||||
""")
|
||||
if cnt < 100:
|
||||
LOG.fatal('It looks like your database was imported with a version '
|
||||
'prior to 3.6.0. Automatic migration not possible.')
|
||||
raise UsageError('Migration not possible.')
|
||||
|
||||
return NominatimVersion(3, 5, 0, 99)
|
||||
|
||||
|
||||
|
||||
def _migration(major: int, minor: int, patch: int = 0,
|
||||
dbpatch: int = 0) -> Callable[[Callable[..., None]], Callable[..., None]]:
|
||||
""" Decorator for a single migration step. The parameters describe the
|
||||
version after which the migration is applicable, i.e before changing
|
||||
from the given version to the next, the migration is required.
|
||||
|
||||
All migrations are run in the order in which they are defined in this
|
||||
file. Do not run global SQL scripts for migrations as you cannot be sure
|
||||
that these scripts do the same in later versions.
|
||||
|
||||
Functions will always be reimported in full at the end of the migration
|
||||
process, so the migration functions may leave a temporary state behind
|
||||
there.
|
||||
"""
|
||||
def decorator(func: Callable[..., None]) -> Callable[..., None]:
|
||||
version = NominatimVersion(major, minor, patch, dbpatch)
|
||||
_MIGRATION_FUNCTIONS.append((version, func))
|
||||
return func
|
||||
|
||||
return decorator
|
||||
|
||||
|
||||
@_migration(3, 5, 0, 99)
|
||||
def import_status_timestamp_change(conn: Connection, **_: Any) -> None:
|
||||
""" Add timezone to timestamp in status table.
|
||||
|
||||
The import_status table has been changed to include timezone information
|
||||
with the time stamp.
|
||||
"""
|
||||
with conn.cursor() as cur:
|
||||
cur.execute("""ALTER TABLE import_status ALTER COLUMN lastimportdate
|
||||
TYPE timestamp with time zone;""")
|
||||
|
||||
|
||||
@_migration(3, 5, 0, 99)
|
||||
def add_nominatim_property_table(conn: Connection, config: Configuration, **_: Any) -> None:
|
||||
""" Add nominatim_property table.
|
||||
"""
|
||||
if not conn.table_exists('nominatim_properties'):
|
||||
with conn.cursor() as cur:
|
||||
cur.execute(pysql.SQL("""CREATE TABLE nominatim_properties (
|
||||
property TEXT,
|
||||
value TEXT);
|
||||
GRANT SELECT ON TABLE nominatim_properties TO {};
|
||||
""").format(pysql.Identifier(config.DATABASE_WEBUSER)))
|
||||
|
||||
@_migration(3, 6, 0, 0)
|
||||
def change_housenumber_transliteration(conn: Connection, **_: Any) -> None:
|
||||
""" Transliterate housenumbers.
|
||||
|
||||
The database schema switched from saving raw housenumbers in
|
||||
placex.housenumber to saving transliterated ones.
|
||||
|
||||
Note: the function create_housenumber_id() has been dropped in later
|
||||
versions.
|
||||
"""
|
||||
with conn.cursor() as cur:
|
||||
cur.execute("""CREATE OR REPLACE FUNCTION create_housenumber_id(housenumber TEXT)
|
||||
RETURNS TEXT AS $$
|
||||
DECLARE
|
||||
normtext TEXT;
|
||||
BEGIN
|
||||
SELECT array_to_string(array_agg(trans), ';')
|
||||
INTO normtext
|
||||
FROM (SELECT lookup_word as trans,
|
||||
getorcreate_housenumber_id(lookup_word)
|
||||
FROM (SELECT make_standard_name(h) as lookup_word
|
||||
FROM regexp_split_to_table(housenumber, '[,;]') h) x) y;
|
||||
return normtext;
|
||||
END;
|
||||
$$ LANGUAGE plpgsql STABLE STRICT;""")
|
||||
cur.execute("DELETE FROM word WHERE class = 'place' and type = 'house'")
|
||||
cur.execute("""UPDATE placex
|
||||
SET housenumber = create_housenumber_id(housenumber)
|
||||
WHERE housenumber is not null""")
|
||||
|
||||
|
||||
@_migration(3, 7, 0, 0)
|
||||
def switch_placenode_geometry_index(conn: Connection, **_: Any) -> None:
|
||||
""" Replace idx_placex_geometry_reverse_placeNode index.
|
||||
|
||||
Make the index slightly more permissive, so that it can also be used
|
||||
when matching up boundaries and place nodes. It makes the
|
||||
idx_placex_adminname index unnecessary.
|
||||
"""
|
||||
with conn.cursor() as cur:
|
||||
cur.execute(""" CREATE INDEX IF NOT EXISTS idx_placex_geometry_placenode ON placex
|
||||
USING GIST (geometry)
|
||||
WHERE osm_type = 'N' and rank_search < 26
|
||||
and class = 'place' and type != 'postcode'
|
||||
and linked_place_id is null""")
|
||||
cur.execute(""" DROP INDEX IF EXISTS idx_placex_adminname """)
|
||||
|
||||
|
||||
@_migration(3, 7, 0, 1)
|
||||
def install_legacy_tokenizer(conn: Connection, config: Configuration, **_: Any) -> None:
|
||||
""" Setup legacy tokenizer.
|
||||
|
||||
If no other tokenizer has been configured yet, then create the
|
||||
configuration for the backwards-compatible legacy tokenizer
|
||||
"""
|
||||
if properties.get_property(conn, 'tokenizer') is None:
|
||||
with conn.cursor() as cur:
|
||||
for table in ('placex', 'location_property_osmline'):
|
||||
has_column = cur.scalar("""SELECT count(*) FROM information_schema.columns
|
||||
WHERE table_name = %s
|
||||
and column_name = 'token_info'""",
|
||||
(table, ))
|
||||
if has_column == 0:
|
||||
cur.execute(pysql.SQL('ALTER TABLE {} ADD COLUMN token_info JSONB')
|
||||
.format(pysql.Identifier(table)))
|
||||
tokenizer = tokenizer_factory.create_tokenizer(config, init_db=False,
|
||||
module_name='legacy')
|
||||
|
||||
tokenizer.migrate_database(config) # type: ignore[attr-defined]
|
||||
|
||||
|
||||
@_migration(4, 0, 99, 0)
|
||||
def create_tiger_housenumber_index(conn: Connection, **_: Any) -> None:
|
||||
""" Create idx_location_property_tiger_parent_place_id with included
|
||||
house number.
|
||||
|
||||
The inclusion is needed for efficient lookup of housenumbers in
|
||||
full address searches.
|
||||
"""
|
||||
if conn.server_version_tuple() >= (11, 0, 0):
|
||||
with conn.cursor() as cur:
|
||||
cur.execute(""" CREATE INDEX IF NOT EXISTS
|
||||
idx_location_property_tiger_housenumber_migrated
|
||||
ON location_property_tiger
|
||||
USING btree(parent_place_id)
|
||||
INCLUDE (startnumber, endnumber) """)
|
||||
|
||||
|
||||
@_migration(4, 0, 99, 1)
|
||||
def create_interpolation_index_on_place(conn: Connection, **_: Any) -> None:
|
||||
""" Create idx_place_interpolations for lookup of interpolation lines
|
||||
on updates.
|
||||
"""
|
||||
with conn.cursor() as cur:
|
||||
cur.execute("""CREATE INDEX IF NOT EXISTS idx_place_interpolations
|
||||
ON place USING gist(geometry)
|
||||
WHERE osm_type = 'W' and address ? 'interpolation'""")
|
||||
|
||||
|
||||
@_migration(4, 0, 99, 2)
|
||||
def add_step_column_for_interpolation(conn: Connection, **_: Any) -> None:
|
||||
""" Add a new column 'step' to the interpolations table.
|
||||
|
||||
Also converts the data into the stricter format which requires that
|
||||
startnumbers comply with the odd/even requirements.
|
||||
"""
|
||||
if conn.table_has_column('location_property_osmline', 'step'):
|
||||
return
|
||||
|
||||
with conn.cursor() as cur:
|
||||
# Mark invalid all interpolations with no intermediate numbers.
|
||||
cur.execute("""UPDATE location_property_osmline SET startnumber = null
|
||||
WHERE endnumber - startnumber <= 1 """)
|
||||
# Align the start numbers where odd/even does not match.
|
||||
cur.execute("""UPDATE location_property_osmline
|
||||
SET startnumber = startnumber + 1,
|
||||
linegeo = ST_LineSubString(linegeo,
|
||||
1.0 / (endnumber - startnumber)::float,
|
||||
1)
|
||||
WHERE (interpolationtype = 'odd' and startnumber % 2 = 0)
|
||||
or (interpolationtype = 'even' and startnumber % 2 = 1)
|
||||
""")
|
||||
# Mark invalid odd/even interpolations with no intermediate numbers.
|
||||
cur.execute("""UPDATE location_property_osmline SET startnumber = null
|
||||
WHERE interpolationtype in ('odd', 'even')
|
||||
and endnumber - startnumber = 2""")
|
||||
# Finally add the new column and populate it.
|
||||
cur.execute("ALTER TABLE location_property_osmline ADD COLUMN step SMALLINT")
|
||||
cur.execute("""UPDATE location_property_osmline
|
||||
SET step = CASE WHEN interpolationtype = 'all'
|
||||
THEN 1 ELSE 2 END
|
||||
""")
|
||||
|
||||
|
||||
@_migration(4, 0, 99, 3)
|
||||
def add_step_column_for_tiger(conn: Connection, **_: Any) -> None:
|
||||
""" Add a new column 'step' to the tiger data table.
|
||||
"""
|
||||
if conn.table_has_column('location_property_tiger', 'step'):
|
||||
return
|
||||
|
||||
with conn.cursor() as cur:
|
||||
cur.execute("ALTER TABLE location_property_tiger ADD COLUMN step SMALLINT")
|
||||
cur.execute("""UPDATE location_property_tiger
|
||||
SET step = CASE WHEN interpolationtype = 'all'
|
||||
THEN 1 ELSE 2 END
|
||||
""")
|
||||
|
||||
|
||||
@_migration(4, 0, 99, 4)
|
||||
def add_derived_name_column_for_country_names(conn: Connection, **_: Any) -> None:
|
||||
""" Add a new column 'derived_name' which in the future takes the
|
||||
country names as imported from OSM data.
|
||||
"""
|
||||
if not conn.table_has_column('country_name', 'derived_name'):
|
||||
with conn.cursor() as cur:
|
||||
cur.execute("ALTER TABLE country_name ADD COLUMN derived_name public.HSTORE")
|
||||
|
||||
|
||||
@_migration(4, 0, 99, 5)
|
||||
def mark_internal_country_names(conn: Connection, config: Configuration, **_: Any) -> None:
|
||||
""" Names from the country table should be marked as internal to prevent
|
||||
them from being deleted. Only necessary for ICU tokenizer.
|
||||
"""
|
||||
import psycopg2.extras # pylint: disable=import-outside-toplevel
|
||||
|
||||
tokenizer = tokenizer_factory.get_tokenizer_for_db(config)
|
||||
with tokenizer.name_analyzer() as analyzer:
|
||||
with conn.cursor() as cur:
|
||||
psycopg2.extras.register_hstore(cur)
|
||||
cur.execute("SELECT country_code, name FROM country_name")
|
||||
|
||||
for country_code, names in cur:
|
||||
if not names:
|
||||
names = {}
|
||||
names['countrycode'] = country_code
|
||||
analyzer.add_country_names(country_code, names)
|
||||
|
||||
|
||||
@_migration(4, 1, 99, 0)
|
||||
def add_place_deletion_todo_table(conn: Connection, **_: Any) -> None:
|
||||
""" Add helper table for deleting data on updates.
|
||||
|
||||
The table is only necessary when updates are possible, i.e.
|
||||
the database is not in freeze mode.
|
||||
"""
|
||||
if conn.table_exists('place'):
|
||||
with conn.cursor() as cur:
|
||||
cur.execute("""CREATE TABLE IF NOT EXISTS place_to_be_deleted (
|
||||
osm_type CHAR(1),
|
||||
osm_id BIGINT,
|
||||
class TEXT,
|
||||
type TEXT,
|
||||
deferred BOOLEAN)""")
|
||||
|
||||
|
||||
@_migration(4, 1, 99, 1)
|
||||
def split_pending_index(conn: Connection, **_: Any) -> None:
|
||||
""" Reorganise indexes for pending updates.
|
||||
"""
|
||||
if conn.table_exists('place'):
|
||||
with conn.cursor() as cur:
|
||||
cur.execute("""CREATE INDEX IF NOT EXISTS idx_placex_rank_address_sector
|
||||
ON placex USING BTREE (rank_address, geometry_sector)
|
||||
WHERE indexed_status > 0""")
|
||||
cur.execute("""CREATE INDEX IF NOT EXISTS idx_placex_rank_boundaries_sector
|
||||
ON placex USING BTREE (rank_search, geometry_sector)
|
||||
WHERE class = 'boundary' and type = 'administrative'
|
||||
and indexed_status > 0""")
|
||||
cur.execute("DROP INDEX IF EXISTS idx_placex_pendingsector")
|
||||
|
||||
|
||||
@_migration(4, 2, 99, 0)
|
||||
def enable_forward_dependencies(conn: Connection, **_: Any) -> None:
|
||||
""" Create indexes for updates with forward dependency tracking (long-running).
|
||||
"""
|
||||
if conn.table_exists('planet_osm_ways'):
|
||||
with conn.cursor() as cur:
|
||||
cur.execute("""SELECT * FROM pg_indexes
|
||||
WHERE tablename = 'planet_osm_ways'
|
||||
and indexdef LIKE '%nodes%'""")
|
||||
if cur.rowcount == 0:
|
||||
cur.execute("""CREATE OR REPLACE FUNCTION public.planet_osm_index_bucket(bigint[])
|
||||
RETURNS bigint[]
|
||||
LANGUAGE sql IMMUTABLE
|
||||
AS $function$
|
||||
SELECT ARRAY(SELECT DISTINCT unnest($1) >> 5)
|
||||
$function$""")
|
||||
cur.execute("""CREATE INDEX planet_osm_ways_nodes_bucket_idx
|
||||
ON planet_osm_ways
|
||||
USING gin (planet_osm_index_bucket(nodes))
|
||||
WITH (fastupdate=off)""")
|
||||
cur.execute("""CREATE INDEX planet_osm_rels_parts_idx
|
||||
ON planet_osm_rels USING gin (parts)
|
||||
WITH (fastupdate=off)""")
|
||||
cur.execute("ANALYZE planet_osm_ways")
|
||||
|
||||
|
||||
@_migration(4, 2, 99, 1)
|
||||
def add_improved_geometry_reverse_placenode_index(conn: Connection, **_: Any) -> None:
|
||||
""" Create improved index for reverse lookup of place nodes.
|
||||
"""
|
||||
with conn.cursor() as cur:
|
||||
cur.execute("""CREATE INDEX IF NOT EXISTS idx_placex_geometry_reverse_lookupPlaceNode
|
||||
ON placex
|
||||
USING gist (ST_Buffer(geometry, reverse_place_diameter(rank_search)))
|
||||
WHERE rank_address between 4 and 25 AND type != 'postcode'
|
||||
AND name is not null AND linked_place_id is null AND osm_type = 'N'
|
||||
""")
|
||||
|
||||
@_migration(4, 4, 99, 0)
|
||||
def create_postcode_area_lookup_index(conn: Connection, **_: Any) -> None:
|
||||
""" Create index needed for looking up postcode areas from postocde points.
|
||||
"""
|
||||
with conn.cursor() as cur:
|
||||
cur.execute("""CREATE INDEX IF NOT EXISTS idx_placex_postcode_areas
|
||||
ON placex USING BTREE (country_code, postcode)
|
||||
WHERE osm_type = 'R' AND class = 'boundary' AND type = 'postal_code'
|
||||
""")
|
||||
|
||||
|
||||
@_migration(4, 4, 99, 1)
|
||||
def create_postcode_parent_index(conn: Connection, **_: Any) -> None:
|
||||
""" Create index needed for updating postcodes when a parent changes.
|
||||
"""
|
||||
if conn.table_exists('planet_osm_ways'):
|
||||
with conn.cursor() as cur:
|
||||
cur.execute("""CREATE INDEX IF NOT EXISTS
|
||||
idx_location_postcode_parent_place_id
|
||||
ON location_postcode USING BTREE (parent_place_id)""")
|
||||
234
src/nominatim_db/tools/postcodes.py
Normal file
@@ -0,0 +1,234 @@
|
||||
# SPDX-License-Identifier: GPL-3.0-or-later
|
||||
#
|
||||
# This file is part of Nominatim. (https://nominatim.org)
|
||||
#
|
||||
# Copyright (C) 2024 by the Nominatim developer community.
|
||||
# For a full list of authors see the git log.
|
||||
"""
|
||||
Functions for importing, updating and otherwise maintaining the table
|
||||
of artificial postcode centroids.
|
||||
"""
|
||||
from typing import Optional, Tuple, Dict, List, TextIO
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
import csv
|
||||
import gzip
|
||||
import logging
|
||||
from math import isfinite
|
||||
|
||||
from psycopg2 import sql as pysql
|
||||
|
||||
from nominatim_core.db.connection import connect, Connection
|
||||
from nominatim_core.utils.centroid import PointsCentroid
|
||||
from ..data.postcode_format import PostcodeFormatter, CountryPostcodeMatcher
|
||||
from ..tokenizer.base import AbstractAnalyzer, AbstractTokenizer
|
||||
|
||||
LOG = logging.getLogger()
|
||||
|
||||
def _to_float(numstr: str, max_value: float) -> float:
|
||||
""" Convert the number in string into a float. The number is expected
|
||||
to be in the range of [-max_value, max_value]. Otherwise raises a
|
||||
ValueError.
|
||||
"""
|
||||
num = float(numstr)
|
||||
if not isfinite(num) or num <= -max_value or num >= max_value:
|
||||
raise ValueError()
|
||||
|
||||
return num
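# --- Illustrative sketch, not part of the original file: how _to_float is meant
# --- to behave. The literal values below are made-up examples.
assert _to_float('8.54', 180.0) == 8.54
try:
    _to_float('200.0', 180.0)    # outside the allowed longitude range
except ValueError:
    pass                         # rejected as expected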
|
||||
|
||||
class _PostcodeCollector:
|
||||
""" Collector for postcodes of a single country.
|
||||
"""
|
||||
|
||||
def __init__(self, country: str, matcher: Optional[CountryPostcodeMatcher]):
|
||||
self.country = country
|
||||
self.matcher = matcher
|
||||
self.collected: Dict[str, PointsCentroid] = defaultdict(PointsCentroid)
|
||||
self.normalization_cache: Optional[Tuple[str, Optional[str]]] = None
|
||||
|
||||
|
||||
def add(self, postcode: str, x: float, y: float) -> None:
|
||||
""" Add the given postcode to the collection cache. If the postcode
|
||||
already existed, it is overwritten with the new centroid.
|
||||
"""
|
||||
if self.matcher is not None:
|
||||
normalized: Optional[str]
|
||||
if self.normalization_cache and self.normalization_cache[0] == postcode:
|
||||
normalized = self.normalization_cache[1]
|
||||
else:
|
||||
match = self.matcher.match(postcode)
|
||||
normalized = self.matcher.normalize(match) if match else None
|
||||
self.normalization_cache = (postcode, normalized)
|
||||
|
||||
if normalized:
|
||||
self.collected[normalized] += (x, y)
|
||||
|
||||
|
||||
def commit(self, conn: Connection, analyzer: AbstractAnalyzer, project_dir: Path) -> None:
|
||||
""" Update postcodes for the country from the postcodes selected so far
|
||||
as well as any externally supplied postcodes.
|
||||
"""
|
||||
self._update_from_external(analyzer, project_dir)
|
||||
to_add, to_delete, to_update = self._compute_changes(conn)
|
||||
|
||||
LOG.info("Processing country '%s' (%s added, %s deleted, %s updated).",
|
||||
self.country, len(to_add), len(to_delete), len(to_update))
|
||||
|
||||
with conn.cursor() as cur:
|
||||
if to_add:
|
||||
cur.execute_values(
|
||||
"""INSERT INTO location_postcode
|
||||
(place_id, indexed_status, country_code,
|
||||
postcode, geometry) VALUES %s""",
|
||||
to_add,
|
||||
template=pysql.SQL("""(nextval('seq_place'), 1, {},
|
||||
%s, 'SRID=4326;POINT(%s %s)')
|
||||
""").format(pysql.Literal(self.country)))
|
||||
if to_delete:
|
||||
cur.execute("""DELETE FROM location_postcode
|
||||
WHERE country_code = %s and postcode = any(%s)
|
||||
""", (self.country, to_delete))
|
||||
if to_update:
|
||||
cur.execute_values(
|
||||
pysql.SQL("""UPDATE location_postcode
|
||||
SET indexed_status = 2,
|
||||
geometry = ST_SetSRID(ST_Point(v.x, v.y), 4326)
|
||||
FROM (VALUES %s) AS v (pc, x, y)
|
||||
WHERE country_code = {} and postcode = pc
|
||||
""").format(pysql.Literal(self.country)), to_update)
|
||||
|
||||
|
||||
def _compute_changes(self, conn: Connection) \
|
||||
-> Tuple[List[Tuple[str, float, float]], List[str], List[Tuple[str, float, float]]]:
|
||||
""" Compute which postcodes from the collected postcodes have to be
|
||||
added or modified and which from the location_postcode table
|
||||
have to be deleted.
|
||||
"""
|
||||
to_update = []
|
||||
to_delete = []
|
||||
with conn.cursor() as cur:
|
||||
cur.execute("""SELECT postcode, ST_X(geometry), ST_Y(geometry)
|
||||
FROM location_postcode
|
||||
WHERE country_code = %s""",
|
||||
(self.country, ))
|
||||
for postcode, x, y in cur:
|
||||
pcobj = self.collected.pop(postcode, None)
|
||||
if pcobj:
|
||||
newx, newy = pcobj.centroid()
|
||||
if (x - newx) > 0.0000001 or (y - newy) > 0.0000001:
|
||||
to_update.append((postcode, newx, newy))
|
||||
else:
|
||||
to_delete.append(postcode)
|
||||
|
||||
to_add = [(k, *v.centroid()) for k, v in self.collected.items()]
|
||||
self.collected = defaultdict(PointsCentroid)
|
||||
|
||||
return to_add, to_delete, to_update
|
||||
|
||||
|
||||
def _update_from_external(self, analyzer: AbstractAnalyzer, project_dir: Path) -> None:
|
||||
""" Look for an external postcode file for the active country in
|
||||
the project directory and add missing postcodes when found.
|
||||
"""
|
||||
csvfile = self._open_external(project_dir)
|
||||
if csvfile is None:
|
||||
return
|
||||
|
||||
try:
|
||||
reader = csv.DictReader(csvfile)
|
||||
for row in reader:
|
||||
if 'postcode' not in row or 'lat' not in row or 'lon' not in row:
|
||||
LOG.warning("Bad format for external postcode file for country '%s'."
|
||||
" Ignored.", self.country)
|
||||
return
|
||||
postcode = analyzer.normalize_postcode(row['postcode'])
|
||||
if postcode not in self.collected:
|
||||
try:
|
||||
# Do float conversion separately, it might throw
|
||||
centroid = (_to_float(row['lon'], 180),
|
||||
_to_float(row['lat'], 90))
|
||||
self.collected[postcode] += centroid
|
||||
except ValueError:
|
||||
LOG.warning("Bad coordinates %s, %s in %s country postcode file.",
|
||||
row['lat'], row['lon'], self.country)
|
||||
|
||||
finally:
|
||||
csvfile.close()
|
||||
|
||||
|
||||
def _open_external(self, project_dir: Path) -> Optional[TextIO]:
|
||||
fname = project_dir / f'{self.country}_postcodes.csv'
|
||||
|
||||
if fname.is_file():
|
||||
LOG.info("Using external postcode file '%s'.", fname)
|
||||
return open(fname, 'r', encoding='utf-8')
|
||||
|
||||
fname = project_dir / f'{self.country}_postcodes.csv.gz'
|
||||
|
||||
if fname.is_file():
|
||||
LOG.info("Using external postcode file '%s'.", fname)
|
||||
return gzip.open(fname, 'rt')
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def update_postcodes(dsn: str, project_dir: Path, tokenizer: AbstractTokenizer) -> None:
|
||||
""" Update the table of artificial postcodes.
|
||||
|
||||
Computes artificial postcode centroids from the placex table,
|
||||
potentially enhances it with external data and then updates the
|
||||
postcodes in the table 'location_postcode'.
|
||||
"""
|
||||
matcher = PostcodeFormatter()
|
||||
with tokenizer.name_analyzer() as analyzer:
|
||||
with connect(dsn) as conn:
|
||||
# First get the list of countries that currently have postcodes.
|
||||
# (Doing this before starting to insert, so it is fast on import.)
|
||||
with conn.cursor() as cur:
|
||||
cur.execute("SELECT DISTINCT country_code FROM location_postcode")
|
||||
todo_countries = set((row[0] for row in cur))
|
||||
|
||||
# Recompute the list of valid postcodes from placex.
|
||||
with conn.cursor(name="placex_postcodes") as cur:
|
||||
cur.execute("""
|
||||
SELECT cc, pc, ST_X(centroid), ST_Y(centroid)
|
||||
FROM (SELECT
|
||||
COALESCE(plx.country_code,
|
||||
get_country_code(ST_Centroid(pl.geometry))) as cc,
|
||||
pl.address->'postcode' as pc,
|
||||
COALESCE(plx.centroid, ST_Centroid(pl.geometry)) as centroid
|
||||
FROM place AS pl LEFT OUTER JOIN placex AS plx
|
||||
ON pl.osm_id = plx.osm_id AND pl.osm_type = plx.osm_type
|
||||
WHERE pl.address ? 'postcode' AND pl.geometry IS NOT null) xx
|
||||
WHERE pc IS NOT null AND cc IS NOT null
|
||||
ORDER BY cc, pc""")
|
||||
|
||||
collector = None
|
||||
|
||||
for country, postcode, x, y in cur:
|
||||
if collector is None or country != collector.country:
|
||||
if collector is not None:
|
||||
collector.commit(conn, analyzer, project_dir)
|
||||
collector = _PostcodeCollector(country, matcher.get_matcher(country))
|
||||
todo_countries.discard(country)
|
||||
collector.add(postcode, x, y)
|
||||
|
||||
if collector is not None:
|
||||
collector.commit(conn, analyzer, project_dir)
|
||||
|
||||
# Now handle any countries that are only in the postcode table.
|
||||
for country in todo_countries:
|
||||
fmt = matcher.get_matcher(country)
|
||||
_PostcodeCollector(country, fmt).commit(conn, analyzer, project_dir)
|
||||
|
||||
conn.commit()
|
||||
|
||||
analyzer.update_postcodes_from_db()
|
||||
|
||||
def can_compute(dsn: str) -> bool:
|
||||
"""
|
||||
Check that the place table exists so that
|
||||
postcodes can be computed.
|
||||
"""
|
||||
with connect(dsn) as conn:
|
||||
return conn.table_exists('place')
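# --- Illustrative sketch, not part of the original file: a typical driver for
# --- this module. The function name is made up; dsn, project_dir and tokenizer
# --- are expected to be provided by the calling code.
def refresh_postcodes_example(dsn: str, project_dir: Path,
                              tokenizer: AbstractTokenizer) -> int:
    if not can_compute(dsn):
        LOG.error("Place table missing. Postcodes cannot be computed.")
        return 1
    update_postcodes(dsn, project_dir, tokenizer)
    return 0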
|
||||
346
src/nominatim_db/tools/refresh.py
Normal file
@@ -0,0 +1,346 @@
|
||||
# SPDX-License-Identifier: GPL-3.0-or-later
|
||||
#
|
||||
# This file is part of Nominatim. (https://nominatim.org)
|
||||
#
|
||||
# Copyright (C) 2024 by the Nominatim developer community.
|
||||
# For a full list of authors see the git log.
|
||||
"""
|
||||
Functions for bringing auxiliary data in the database up-to-date.
|
||||
"""
|
||||
from typing import MutableSequence, Tuple, Any, Type, Mapping, Sequence, List, cast
|
||||
import csv
|
||||
import gzip
|
||||
import logging
|
||||
from textwrap import dedent
|
||||
from pathlib import Path
|
||||
|
||||
from psycopg2 import sql as pysql
|
||||
|
||||
from nominatim_core.config import Configuration
|
||||
from nominatim_core.db.connection import Connection, connect
|
||||
from nominatim_core.db.utils import execute_file, CopyBuffer
|
||||
from nominatim_core.db.sql_preprocessor import SQLPreprocessor
|
||||
from ..version import NOMINATIM_VERSION
|
||||
|
||||
LOG = logging.getLogger()
|
||||
|
||||
OSM_TYPE = {'N': 'node', 'W': 'way', 'R': 'relation'}
|
||||
|
||||
def _add_address_level_rows_from_entry(rows: MutableSequence[Tuple[Any, ...]],
|
||||
entry: Mapping[str, Any]) -> None:
|
||||
""" Converts a single entry from the JSON format for address rank
|
||||
descriptions into a flat format suitable for inserting into a
|
||||
PostgreSQL table and adds these lines to `rows`.
|
||||
"""
|
||||
countries = entry.get('countries') or (None, )
|
||||
for key, values in entry['tags'].items():
|
||||
for value, ranks in values.items():
|
||||
if isinstance(ranks, list):
|
||||
rank_search, rank_address = ranks
|
||||
else:
|
||||
rank_search = rank_address = ranks
|
||||
if not value:
|
||||
value = None
|
||||
for country in countries:
|
||||
rows.append((country, key, value, rank_search, rank_address))
|
||||
|
||||
|
||||
def load_address_levels(conn: Connection, table: str, levels: Sequence[Mapping[str, Any]]) -> None:
|
||||
""" Replace the `address_levels` table with the contents of `levels'.
|
||||
|
||||
A new table is created; any previously existing table is dropped.
|
||||
The table has the following columns:
|
||||
country, class, type, rank_search, rank_address
|
||||
"""
|
||||
rows: List[Tuple[Any, ...]] = []
|
||||
for entry in levels:
|
||||
_add_address_level_rows_from_entry(rows, entry)
|
||||
|
||||
with conn.cursor() as cur:
|
||||
cur.drop_table(table)
|
||||
|
||||
cur.execute(pysql.SQL("""CREATE TABLE {} (
|
||||
country_code varchar(2),
|
||||
class TEXT,
|
||||
type TEXT,
|
||||
rank_search SMALLINT,
|
||||
rank_address SMALLINT)
|
||||
""").format(pysql.Identifier(table)))
|
||||
|
||||
cur.execute_values(pysql.SQL("INSERT INTO {} VALUES %s")
|
||||
.format(pysql.Identifier(table)), rows)
|
||||
|
||||
cur.execute(pysql.SQL('CREATE UNIQUE INDEX ON {} (country_code, class, type)')
|
||||
.format(pysql.Identifier(table)))
|
||||
|
||||
conn.commit()
|
||||
|
||||
|
||||
def load_address_levels_from_config(conn: Connection, config: Configuration) -> None:
|
||||
""" Replace the `address_levels` table with the content as
|
||||
defined in the given configuration. Uses the parameter
|
||||
NOMINATIM_ADDRESS_LEVEL_CONFIG to determine the location of the
|
||||
configuration file.
|
||||
"""
|
||||
cfg = config.load_sub_configuration('', config='ADDRESS_LEVEL_CONFIG')
|
||||
load_address_levels(conn, 'address_levels', cfg)
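# --- Illustrative sketch, not part of the original file: the entry structure the
# --- two loaders above understand. The tags and ranks are made-up examples, not
# --- the shipped configuration.
EXAMPLE_ADDRESS_LEVELS = [
    {   # no 'countries' key, so the entry applies everywhere
        'tags': {'place': {'city': [16, 16],   # [rank_search, rank_address]
                           'village': 19}}     # single rank used for both
    },
    {   'countries': ['de', 'at'],
        'tags': {'boundary': {'administrative': 14}}
    },
]
# with connect(dsn) as conn:        # dsn is an assumption
#     load_address_levels(conn, 'address_levels', EXAMPLE_ADDRESS_LEVELS)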
|
||||
|
||||
|
||||
def create_functions(conn: Connection, config: Configuration,
|
||||
enable_diff_updates: bool = True,
|
||||
enable_debug: bool = False) -> None:
|
||||
""" (Re)create the PL/pgSQL functions.
|
||||
"""
|
||||
sql = SQLPreprocessor(conn, config)
|
||||
|
||||
sql.run_sql_file(conn, 'functions.sql',
|
||||
disable_diff_updates=not enable_diff_updates,
|
||||
debug=enable_debug)
|
||||
|
||||
|
||||
|
||||
WEBSITE_SCRIPTS = (
|
||||
'deletable.php',
|
||||
'details.php',
|
||||
'lookup.php',
|
||||
'polygons.php',
|
||||
'reverse.php',
|
||||
'search.php',
|
||||
'status.php'
|
||||
)
|
||||
|
||||
# constants needed by PHP scripts: PHP name, config name, type
|
||||
PHP_CONST_DEFS = (
|
||||
('Database_DSN', 'DATABASE_DSN', str),
|
||||
('Default_Language', 'DEFAULT_LANGUAGE', str),
|
||||
('Log_DB', 'LOG_DB', bool),
|
||||
('Log_File', 'LOG_FILE', Path),
|
||||
('NoAccessControl', 'CORS_NOACCESSCONTROL', bool),
|
||||
('Places_Max_ID_count', 'LOOKUP_MAX_COUNT', int),
|
||||
('PolygonOutput_MaximumTypes', 'POLYGON_OUTPUT_MAX_TYPES', int),
|
||||
('Search_BatchMode', 'SEARCH_BATCH_MODE', bool),
|
||||
('Search_NameOnlySearchFrequencyThreshold', 'SEARCH_NAME_ONLY_THRESHOLD', str),
|
||||
('Use_US_Tiger_Data', 'USE_US_TIGER_DATA', bool),
|
||||
('MapIcon_URL', 'MAPICON_URL', str),
|
||||
('Search_WithinCountries', 'SEARCH_WITHIN_COUNTRIES', bool),
|
||||
)
|
||||
|
||||
|
||||
def import_wikipedia_articles(dsn: str, data_path: Path, ignore_errors: bool = False) -> int:
|
||||
""" Replaces the wikipedia importance tables with new data.
|
||||
The import is run in a single transaction so that the new data
|
||||
is replaced seamlessly.
|
||||
|
||||
Returns 0 if all was well and 1 if the importance file could not
|
||||
be found. Throws an exception if there was an error reading the file.
|
||||
"""
|
||||
if import_importance_csv(dsn, data_path / 'wikimedia-importance.csv.gz') == 0 \
|
||||
or import_importance_sql(dsn, data_path / 'wikimedia-importance.sql.gz',
|
||||
ignore_errors) == 0:
|
||||
return 0
|
||||
|
||||
return 1
|
||||
|
||||
|
||||
def import_importance_csv(dsn: str, data_file: Path) -> int:
|
||||
""" Replace wikipedia importance table with data from a
|
||||
single CSV file.
|
||||
|
||||
The file must be a gzipped CSV and have the following columns:
|
||||
language, title, importance, wikidata_id
|
||||
|
||||
Other columns may be present but will be ignored.
|
||||
"""
|
||||
if not data_file.exists():
|
||||
return 1
|
||||
|
||||
# Only import the first occurrence of a wikidata ID.
|
||||
# This keeps indexes and table small.
|
||||
wd_done = set()
|
||||
|
||||
with connect(dsn) as conn:
|
||||
with conn.cursor() as cur:
|
||||
cur.drop_table('wikipedia_article')
|
||||
cur.drop_table('wikipedia_redirect')
|
||||
cur.drop_table('wikimedia_importance')
|
||||
cur.execute("""CREATE TABLE wikimedia_importance (
|
||||
language TEXT NOT NULL,
|
||||
title TEXT NOT NULL,
|
||||
importance double precision NOT NULL,
|
||||
wikidata TEXT
|
||||
) """)
|
||||
|
||||
with gzip.open(str(data_file), 'rt') as fd, CopyBuffer() as buf:
|
||||
for row in csv.DictReader(fd, delimiter='\t', quotechar='|'):
|
||||
wd_id = int(row['wikidata_id'][1:])
|
||||
buf.add(row['language'], row['title'], row['importance'],
|
||||
None if wd_id in wd_done else row['wikidata_id'])
|
||||
wd_done.add(wd_id)
|
||||
|
||||
if buf.size() > 10000000:
|
||||
with conn.cursor() as cur:
|
||||
buf.copy_out(cur, 'wikimedia_importance',
|
||||
columns=['language', 'title', 'importance',
|
||||
'wikidata'])
|
||||
|
||||
with conn.cursor() as cur:
|
||||
buf.copy_out(cur, 'wikimedia_importance',
|
||||
columns=['language', 'title', 'importance', 'wikidata'])
|
||||
|
||||
with conn.cursor() as cur:
|
||||
cur.execute("""CREATE INDEX IF NOT EXISTS idx_wikimedia_importance_title
|
||||
ON wikimedia_importance (title)""")
|
||||
cur.execute("""CREATE INDEX IF NOT EXISTS idx_wikimedia_importance_wikidata
|
||||
ON wikimedia_importance (wikidata)
|
||||
WHERE wikidata is not null""")
|
||||
|
||||
conn.commit()
|
||||
|
||||
return 0
|
||||
|
||||
|
||||
def import_importance_sql(dsn: str, data_file: Path, ignore_errors: bool) -> int:
|
||||
""" Replace wikipedia importance table with data from an SQL file.
|
||||
"""
|
||||
if not data_file.exists():
|
||||
return 1
|
||||
|
||||
pre_code = """BEGIN;
|
||||
DROP TABLE IF EXISTS "wikipedia_article";
|
||||
DROP TABLE IF EXISTS "wikipedia_redirect";
|
||||
DROP TABLE IF EXISTS "wikipedia_importance";
|
||||
"""
|
||||
post_code = "COMMIT"
|
||||
execute_file(dsn, data_file, ignore_errors=ignore_errors,
|
||||
pre_code=pre_code, post_code=post_code)
|
||||
|
||||
return 0
|
||||
|
||||
|
||||
def import_secondary_importance(dsn: str, data_path: Path, ignore_errors: bool = False) -> int:
|
||||
""" Replaces the secondary importance raster data table with new data.
|
||||
|
||||
Returns 0 if all was well and 1 if the raster SQL file could not
|
||||
be found. Throws an exception if there was an error reading the file.
|
||||
"""
|
||||
datafile = data_path / 'secondary_importance.sql.gz'
|
||||
if not datafile.exists():
|
||||
return 1
|
||||
|
||||
with connect(dsn) as conn:
|
||||
postgis_version = conn.postgis_version_tuple()
|
||||
if postgis_version[0] < 3:
|
||||
LOG.error('PostGIS version is too old for using OSM raster data.')
|
||||
return 2
|
||||
|
||||
execute_file(dsn, datafile, ignore_errors=ignore_errors)
|
||||
|
||||
return 0
|
||||
|
||||
def recompute_importance(conn: Connection) -> None:
|
||||
""" Recompute wikipedia links and importance for all entries in placex.
|
||||
This is a long-running operation that must not be executed in
|
||||
parallel with updates.
|
||||
"""
|
||||
with conn.cursor() as cur:
|
||||
cur.execute('ALTER TABLE placex DISABLE TRIGGER ALL')
|
||||
cur.execute("""
|
||||
UPDATE placex SET (wikipedia, importance) =
|
||||
(SELECT wikipedia, importance
|
||||
FROM compute_importance(extratags, country_code, rank_search, centroid))
|
||||
""")
|
||||
cur.execute("""
|
||||
UPDATE placex s SET wikipedia = d.wikipedia, importance = d.importance
|
||||
FROM placex d
|
||||
WHERE s.place_id = d.linked_place_id and d.wikipedia is not null
|
||||
and (s.wikipedia is null or s.importance < d.importance);
|
||||
""")
|
||||
|
||||
cur.execute('ALTER TABLE placex ENABLE TRIGGER ALL')
|
||||
conn.commit()
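# --- Illustrative sketch, not part of the original file: reload the importance
# --- data and recompute placex. The function name and paths are assumptions.
def refresh_importance_example(dsn: str, data_path: Path) -> None:
    if import_wikipedia_articles(dsn, data_path) != 0:
        LOG.error("No wikipedia importance dump found in %s.", data_path)
        return
    with connect(dsn) as conn:
        recompute_importance(conn)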
|
||||
|
||||
|
||||
def _quote_php_variable(var_type: Type[Any], config: Configuration,
|
||||
conf_name: str) -> str:
|
||||
if var_type == bool:
|
||||
return 'true' if config.get_bool(conf_name) else 'false'
|
||||
|
||||
if var_type == int:
|
||||
return cast(str, getattr(config, conf_name))
|
||||
|
||||
if not getattr(config, conf_name):
|
||||
return 'false'
|
||||
|
||||
if var_type == Path:
|
||||
value = str(config.get_path(conf_name) or '')
|
||||
else:
|
||||
value = getattr(config, conf_name)
|
||||
|
||||
quoted = value.replace("'", "\\'")
|
||||
return f"'{quoted}'"
|
||||
|
||||
|
||||
def setup_website(basedir: Path, config: Configuration, conn: Connection) -> None:
|
||||
""" Create the website script stubs.
|
||||
"""
|
||||
if config.lib_dir.php is None:
|
||||
LOG.info("Python frontend does not require website setup. Skipping.")
|
||||
return
|
||||
|
||||
if not basedir.exists():
|
||||
LOG.info('Creating website directory.')
|
||||
basedir.mkdir()
|
||||
|
||||
assert config.project_dir is not None
|
||||
basedata = dedent(f"""\
|
||||
<?php
|
||||
|
||||
@define('CONST_Debug', $_GET['debug'] ?? false);
|
||||
@define('CONST_LibDir', '{config.lib_dir.php}');
|
||||
@define('CONST_TokenizerDir', '{config.project_dir / 'tokenizer'}');
|
||||
@define('CONST_NominatimVersion', '{NOMINATIM_VERSION!s}');
|
||||
|
||||
""")
|
||||
|
||||
for php_name, conf_name, var_type in PHP_CONST_DEFS:
|
||||
varout = _quote_php_variable(var_type, config, conf_name)
|
||||
|
||||
basedata += f"@define('CONST_{php_name}', {varout});\n"
|
||||
|
||||
template = "\nrequire_once(CONST_LibDir.'/website/{}');\n"
|
||||
|
||||
search_name_table_exists = bool(conn and conn.table_exists('search_name'))
|
||||
|
||||
for script in WEBSITE_SCRIPTS:
|
||||
if not search_name_table_exists and script == 'search.php':
|
||||
out = template.format('reverse-only-search.php')
|
||||
else:
|
||||
out = template.format(script)
|
||||
|
||||
(basedir / script).write_text(basedata + out, 'utf-8')
|
||||
|
||||
|
||||
def invalidate_osm_object(osm_type: str, osm_id: int, conn: Connection,
|
||||
recursive: bool = True) -> None:
|
||||
""" Mark the given OSM object for reindexing. When 'recursive' is set
|
||||
to True (the default), then all dependent objects are marked for
|
||||
reindexing as well.
|
||||
|
||||
'osm_type' must be one of 'N' (node), 'W' (way) or 'R' (relation).
|
||||
If the given object does not exist, then nothing happens.
|
||||
"""
|
||||
assert osm_type in ('N', 'R', 'W')
|
||||
|
||||
LOG.warning("Invalidating OSM %s %s%s.",
|
||||
OSM_TYPE[osm_type], osm_id,
|
||||
' and its dependent places' if recursive else '')
|
||||
|
||||
with conn.cursor() as cur:
|
||||
if recursive:
|
||||
sql = """SELECT place_force_update(place_id)
|
||||
FROM placex WHERE osm_type = %s and osm_id = %s"""
|
||||
else:
|
||||
sql = """UPDATE placex SET indexed_status = 2
|
||||
WHERE osm_type = %s and osm_id = %s"""
|
||||
|
||||
cur.execute(sql, (osm_type, osm_id))
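# --- Illustrative sketch, not part of the original file: force reindexing of a
# --- single relation and its dependent places. The OSM id is a made-up example.
def invalidate_example(dsn: str) -> None:
    with connect(dsn) as conn:
        invalidate_osm_object('R', 1234567, conn, recursive=True)
        conn.commit()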
|
||||
206
src/nominatim_db/tools/replication.py
Normal file
@@ -0,0 +1,206 @@
|
||||
# SPDX-License-Identifier: GPL-3.0-or-later
|
||||
#
|
||||
# This file is part of Nominatim. (https://nominatim.org)
|
||||
#
|
||||
# Copyright (C) 2024 by the Nominatim developer community.
|
||||
# For a full list of authors see the git log.
|
||||
"""
|
||||
Functions for updating a database from a replication source.
|
||||
"""
|
||||
from typing import ContextManager, MutableMapping, Any, Generator, cast, Iterator
|
||||
from contextlib import contextmanager
|
||||
import datetime as dt
|
||||
from enum import Enum
|
||||
import logging
|
||||
import time
|
||||
import types
|
||||
import urllib.request as urlrequest
|
||||
|
||||
import requests
|
||||
|
||||
from nominatim_core.errors import UsageError
|
||||
from nominatim_core.db import status
|
||||
from nominatim_core.db.connection import Connection, connect
|
||||
from .exec_utils import run_osm2pgsql
|
||||
|
||||
try:
|
||||
from osmium.replication.server import ReplicationServer
|
||||
from osmium import WriteHandler
|
||||
from osmium import version as pyo_version
|
||||
except ImportError as exc:
|
||||
logging.getLogger().critical("pyosmium not installed. Replication functions not available.\n"
|
||||
"To install pyosmium via pip: pip3 install osmium")
|
||||
raise UsageError("replication tools not available") from exc
|
||||
|
||||
LOG = logging.getLogger()
|
||||
|
||||
def init_replication(conn: Connection, base_url: str,
|
||||
socket_timeout: int = 60) -> None:
|
||||
""" Set up replication for the server at the given base URL.
|
||||
"""
|
||||
LOG.info("Using replication source: %s", base_url)
|
||||
date = status.compute_database_date(conn)
|
||||
|
||||
# margin of error to make sure we get all data
|
||||
date -= dt.timedelta(hours=3)
|
||||
|
||||
with _make_replication_server(base_url, socket_timeout) as repl:
|
||||
seq = repl.timestamp_to_sequence(date)
|
||||
|
||||
if seq is None:
|
||||
LOG.fatal("Cannot reach the configured replication service '%s'.\n"
|
||||
"Does the URL point to a directory containing OSM update data?",
|
||||
base_url)
|
||||
raise UsageError("Failed to reach replication service")
|
||||
|
||||
status.set_status(conn, date=date, seq=seq)
|
||||
|
||||
LOG.warning("Updates initialised at sequence %s (%s)", seq, date)
|
||||
|
||||
|
||||
def check_for_updates(conn: Connection, base_url: str,
|
||||
socket_timeout: int = 60) -> int:
|
||||
""" Check if new data is available from the replication service at the
|
||||
given base URL.
|
||||
"""
|
||||
_, seq, _ = status.get_status(conn)
|
||||
|
||||
if seq is None:
|
||||
LOG.error("Replication not set up. "
|
||||
"Please run 'nominatim replication --init' first.")
|
||||
return 254
|
||||
|
||||
with _make_replication_server(base_url, socket_timeout) as repl:
|
||||
state = repl.get_state_info()
|
||||
|
||||
if state is None:
|
||||
LOG.error("Cannot get state for URL %s.", base_url)
|
||||
return 253
|
||||
|
||||
if state.sequence <= seq:
|
||||
LOG.warning("Database is up to date.")
|
||||
return 2
|
||||
|
||||
LOG.warning("New data available (%i => %i).", seq, state.sequence)
|
||||
return 0
|
||||
|
||||
class UpdateState(Enum):
|
||||
""" Possible states after an update has run.
|
||||
"""
|
||||
|
||||
UP_TO_DATE = 0
|
||||
MORE_PENDING = 2
|
||||
NO_CHANGES = 3
|
||||
|
||||
|
||||
def update(dsn: str, options: MutableMapping[str, Any],
|
||||
socket_timeout: int = 60) -> UpdateState:
|
||||
""" Update database from the next batch of data. Returns the state of
|
||||
updates according to `UpdateState`.
|
||||
"""
|
||||
with connect(dsn) as conn:
|
||||
startdate, startseq, indexed = status.get_status(conn)
|
||||
conn.commit()
|
||||
|
||||
if startseq is None:
|
||||
LOG.error("Replication not set up. "
|
||||
"Please run 'nominatim replication --init' first.")
|
||||
raise UsageError("Replication not set up.")
|
||||
|
||||
assert startdate is not None
|
||||
|
||||
if not indexed and options['indexed_only']:
|
||||
LOG.info("Skipping update. There is data that needs indexing.")
|
||||
return UpdateState.MORE_PENDING
|
||||
|
||||
last_since_update = dt.datetime.now(dt.timezone.utc) - startdate
|
||||
update_interval = dt.timedelta(seconds=options['update_interval'])
|
||||
if last_since_update < update_interval:
|
||||
duration = (update_interval - last_since_update).seconds
|
||||
LOG.warning("Sleeping for %s sec before next update.", duration)
|
||||
time.sleep(duration)
|
||||
|
||||
if options['import_file'].exists():
|
||||
options['import_file'].unlink()
|
||||
|
||||
# Read updates into file.
|
||||
with _make_replication_server(options['base_url'], socket_timeout) as repl:
|
||||
outhandler = WriteHandler(str(options['import_file']))
|
||||
endseq = repl.apply_diffs(outhandler, startseq + 1,
|
||||
max_size=options['max_diff_size'] * 1024)
|
||||
outhandler.close()
|
||||
|
||||
if endseq is None:
|
||||
return UpdateState.NO_CHANGES
|
||||
|
||||
with connect(dsn) as conn:
|
||||
run_osm2pgsql_updates(conn, options)
|
||||
|
||||
# Write the current status to the file
|
||||
endstate = repl.get_state_info(endseq)
|
||||
status.set_status(conn, endstate.timestamp if endstate else None,
|
||||
seq=endseq, indexed=False)
|
||||
conn.commit()
|
||||
|
||||
return UpdateState.UP_TO_DATE
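# --- Illustrative sketch, not part of the original file: a simple catch-up loop
# --- on top of update(). All option values are assumptions mirroring the keys the
# --- function reads above; run_osm2pgsql() will typically need further keys
# --- (dsn, osm2pgsql binary, cache size) supplied by the CLI layer.
def catch_up_example(dsn: str) -> None:
    from pathlib import Path      # not imported by the module itself
    options = {'base_url': 'https://planet.openstreetmap.org/replication/minute',
               'update_interval': 75,
               'max_diff_size': 50,
               'import_file': Path('/tmp/nominatim-update.osc.gz'),
               'indexed_only': False}
    while update(dsn, options) == UpdateState.UP_TO_DATE:
        pass                      # keep applying batches until nothing new arrives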
|
||||
|
||||
|
||||
def run_osm2pgsql_updates(conn: Connection, options: MutableMapping[str, Any]) -> None:
|
||||
""" Run osm2pgsql in append mode.
|
||||
"""
|
||||
# Remove any stale deletion marks.
|
||||
with conn.cursor() as cur:
|
||||
cur.execute('TRUNCATE place_to_be_deleted')
|
||||
conn.commit()
|
||||
|
||||
# Consume updates with osm2pgsql.
|
||||
options['append'] = True
|
||||
options['disable_jit'] = conn.server_version_tuple() >= (11, 0)
|
||||
run_osm2pgsql(options)
|
||||
|
||||
# Handle deletions
|
||||
with conn.cursor() as cur:
|
||||
cur.execute('SELECT flush_deleted_places()')
|
||||
conn.commit()
|
||||
|
||||
|
||||
def _make_replication_server(url: str, timeout: int) -> ContextManager[ReplicationServer]:
|
||||
""" Returns a ReplicationServer in form of a context manager.
|
||||
|
||||
Creates a light wrapper around older versions of pyosmium that did
|
||||
not support the context manager interface.
|
||||
"""
|
||||
if hasattr(ReplicationServer, '__enter__'):
|
||||
# Patches the open_url function for pyosmium >= 3.2
|
||||
# where the socket timeout is no longer respected.
|
||||
def patched_open_url(self: ReplicationServer, url: urlrequest.Request) -> Any:
|
||||
""" Download a resource from the given URL and return a byte sequence
|
||||
of the content.
|
||||
"""
|
||||
headers = {"User-Agent" : f"Nominatim (pyosmium/{pyo_version.pyosmium_release})"}
|
||||
|
||||
if self.session is not None:
|
||||
return self.session.get(url.get_full_url(),
|
||||
headers=headers, timeout=timeout or None,
|
||||
stream=True)
|
||||
|
||||
@contextmanager
|
||||
def _get_url_with_session() -> Iterator[requests.Response]:
|
||||
with requests.Session() as session:
|
||||
request = session.get(url.get_full_url(),
|
||||
headers=headers, timeout=timeout or None,
|
||||
stream=True)
|
||||
yield request
|
||||
|
||||
return _get_url_with_session()
|
||||
|
||||
repl = ReplicationServer(url)
|
||||
setattr(repl, 'open_url', types.MethodType(patched_open_url, repl))
|
||||
|
||||
return cast(ContextManager[ReplicationServer], repl)
|
||||
|
||||
@contextmanager
|
||||
def get_cm() -> Generator[ReplicationServer, None, None]:
|
||||
yield ReplicationServer(url)
|
||||
|
||||
return get_cm()
|
||||
0
src/nominatim_db/tools/special_phrases/__init__.py
Normal file
78
src/nominatim_db/tools/special_phrases/importer_statistics.py
Normal file
@@ -0,0 +1,78 @@
|
||||
# SPDX-License-Identifier: GPL-3.0-or-later
|
||||
#
|
||||
# This file is part of Nominatim. (https://nominatim.org)
|
||||
#
|
||||
# Copyright (C) 2024 by the Nominatim developer community.
|
||||
# For a full list of authors see the git log.
|
||||
"""
|
||||
Contains the class which handles statistics for the
|
||||
import of special phrases.
|
||||
"""
|
||||
import logging
|
||||
LOG = logging.getLogger()
|
||||
|
||||
class SpecialPhrasesImporterStatistics():
|
||||
"""
|
||||
Class handling statistics of the import
|
||||
process of special phrases.
|
||||
"""
|
||||
def __init__(self) -> None:
|
||||
self._intialize_values()
|
||||
|
||||
def _intialize_values(self) -> None:
|
||||
"""
|
||||
Set all counts for the global
|
||||
import to 0.
|
||||
"""
|
||||
self.tables_created = 0
|
||||
self.tables_deleted = 0
|
||||
self.tables_ignored = 0
|
||||
self.invalids = 0
|
||||
|
||||
def notify_one_phrase_invalid(self) -> None:
|
||||
"""
|
||||
Add +1 to the count of invalid entries
|
||||
fetched from the wiki.
|
||||
"""
|
||||
self.invalids += 1
|
||||
|
||||
def notify_one_table_created(self) -> None:
|
||||
"""
|
||||
Add +1 to the count of created tables.
|
||||
"""
|
||||
self.tables_created += 1
|
||||
|
||||
def notify_one_table_deleted(self) -> None:
|
||||
"""
|
||||
Add +1 to the count of deleted tables.
|
||||
"""
|
||||
self.tables_deleted += 1
|
||||
|
||||
def notify_one_table_ignored(self) -> None:
|
||||
"""
|
||||
Add +1 to the count of ignored tables.
|
||||
"""
|
||||
self.tables_ignored += 1
|
||||
|
||||
def notify_import_done(self) -> None:
|
||||
"""
|
||||
Print stats for the whole import process
|
||||
and reset all values.
|
||||
"""
|
||||
LOG.info('====================================================================')
|
||||
LOG.info('Final statistics of the import:')
|
||||
LOG.info('- %s phrases were invalid.', self.invalids)
|
||||
if self.invalids > 0:
|
||||
LOG.info(' Those invalid phrases have been skipped.')
|
||||
LOG.info('- %s tables were ignored as they already exist on the database',
|
||||
self.tables_ignored)
|
||||
LOG.info('- %s tables were created', self.tables_created)
|
||||
LOG.info('- %s tables were deleted from the database', self.tables_deleted)
|
||||
if self.tables_deleted > 0:
|
||||
LOG.info(' They were deleted as they are not valid anymore.')
|
||||
|
||||
if self.invalids > 0:
|
||||
LOG.warning('%s phrases were invalid and have been skipped during the whole process.',
|
||||
self.invalids)
|
||||
|
||||
self._intialize_values()
|
||||
46
src/nominatim_db/tools/special_phrases/sp_csv_loader.py
Normal file
@@ -0,0 +1,46 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Module containing the SPCsvLoader class.

The class allows loading phrases from a CSV file.
"""
from typing import Iterable
import csv
import os

from nominatim_core.errors import UsageError
from .special_phrase import SpecialPhrase

class SPCsvLoader:
    """
    Handles loading of special phrases from an external CSV file.
    """
    def __init__(self, csv_path: str) -> None:
        self.csv_path = csv_path


    def generate_phrases(self) -> Iterable[SpecialPhrase]:
        """ Open and parse the given CSV file.
            Create the corresponding SpecialPhrases.
        """
        self._check_csv_validity()

        with open(self.csv_path, encoding='utf-8') as fd:
            reader = csv.DictReader(fd, delimiter=',')
            for row in reader:
                yield SpecialPhrase(row['phrase'], row['class'], row['type'], row['operator'])


    def _check_csv_validity(self) -> None:
        """
        Check that the CSV file has the right extension.
        """
        _, extension = os.path.splitext(self.csv_path)

        if extension != '.csv':
            raise UsageError(f'The file {self.csv_path} is not a csv file.')
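# --- Illustrative sketch, not part of the original file: the CSV layout the
# --- loader expects (header plus one row per phrase) and how it is consumed.
# --- The file name is a made-up example.
#
#   phrase,class,type,operator
#   Bakery,shop,bakery,-
#   Restaurants,amenity,restaurant,-
#
def print_phrases_example(csv_file: str = 'special_phrases.csv') -> None:
    for phrase in SPCsvLoader(csv_file).generate_phrases():
        print(phrase.p_label, phrase.p_class, phrase.p_type, phrase.p_operator)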
274
src/nominatim_db/tools/special_phrases/sp_importer.py
Normal file
@@ -0,0 +1,274 @@
|
||||
# SPDX-License-Identifier: GPL-3.0-or-later
|
||||
#
|
||||
# This file is part of Nominatim. (https://nominatim.org)
|
||||
#
|
||||
# Copyright (C) 2024 by the Nominatim developer community.
|
||||
# For a full list of authors see the git log.
|
||||
"""
|
||||
Module containing the class handling the import
|
||||
of the special phrases.
|
||||
|
||||
Phrases are analyzed and imported into the database.
|
||||
|
||||
The phrases already present in the database which are
|
||||
no longer valid are removed.
|
||||
"""
|
||||
from typing import Iterable, Tuple, Mapping, Sequence, Optional, Set
|
||||
import logging
|
||||
import re
|
||||
|
||||
from psycopg2.sql import Identifier, SQL
|
||||
|
||||
from nominatim_core.typing import Protocol
|
||||
from nominatim_core.config import Configuration
|
||||
from nominatim_core.db.connection import Connection
|
||||
from .importer_statistics import SpecialPhrasesImporterStatistics
|
||||
from .special_phrase import SpecialPhrase
|
||||
from ...tokenizer.base import AbstractTokenizer
|
||||
|
||||
LOG = logging.getLogger()
|
||||
|
||||
def _classtype_table(phrase_class: str, phrase_type: str) -> str:
|
||||
""" Return the name of the table for the given class and type.
|
||||
"""
|
||||
return f'place_classtype_{phrase_class}_{phrase_type}'
|
||||
|
||||
|
||||
class SpecialPhraseLoader(Protocol):
|
||||
""" Protocol for classes implementing a loader for special phrases.
|
||||
"""
|
||||
|
||||
def generate_phrases(self) -> Iterable[SpecialPhrase]:
|
||||
""" Generates all special phrase terms this loader can produce.
|
||||
"""
|
||||
|
||||
|
||||
class SPImporter():
|
||||
# pylint: disable-msg=too-many-instance-attributes
|
||||
"""
|
||||
Class handling the process of special phrases importation into the database.
|
||||
|
||||
Takes an SP loader which loads the phrases from an external source.
|
||||
"""
|
||||
def __init__(self, config: Configuration, conn: Connection,
|
||||
sp_loader: SpecialPhraseLoader) -> None:
|
||||
self.config = config
|
||||
self.db_connection = conn
|
||||
self.sp_loader = sp_loader
|
||||
self.statistics_handler = SpecialPhrasesImporterStatistics()
|
||||
self.black_list, self.white_list = self._load_white_and_black_lists()
|
||||
self.sanity_check_pattern = re.compile(r'^\w+$')
|
||||
# This set will contain all existing phrases to be added.
|
||||
# It contains tuples with the following format: (label, class, type, operator)
|
||||
self.word_phrases: Set[Tuple[str, str, str, str]] = set()
|
||||
# This set will contain all existing place_classtype tables which don't match any
|
||||
# special phrases class/type on the wiki.
|
||||
self.table_phrases_to_delete: Set[str] = set()
|
||||
|
||||
def import_phrases(self, tokenizer: AbstractTokenizer, should_replace: bool) -> None:
|
||||
"""
|
||||
Iterate through all SpecialPhrases extracted from the
|
||||
loader and import them into the database.
|
||||
|
||||
If should_replace is set to True only the loaded phrases
|
||||
will be kept in the database. All other phrases already
|
||||
in the database will be removed.
|
||||
"""
|
||||
LOG.warning('Special phrases importation starting')
|
||||
self._fetch_existing_place_classtype_tables()
|
||||
|
||||
# Store pairs of class/type for further processing
|
||||
class_type_pairs = set()
|
||||
|
||||
for phrase in self.sp_loader.generate_phrases():
|
||||
result = self._process_phrase(phrase)
|
||||
if result:
|
||||
class_type_pairs.add(result)
|
||||
|
||||
self._create_classtype_table_and_indexes(class_type_pairs)
|
||||
if should_replace:
|
||||
self._remove_non_existent_tables_from_db()
|
||||
self.db_connection.commit()
|
||||
|
||||
with tokenizer.name_analyzer() as analyzer:
|
||||
analyzer.update_special_phrases(self.word_phrases, should_replace)
|
||||
|
||||
LOG.warning('Import done.')
|
||||
self.statistics_handler.notify_import_done()
|
||||
|
||||
|
||||
def _fetch_existing_place_classtype_tables(self) -> None:
|
||||
"""
|
||||
Fetch existing place_classtype tables.
|
||||
Fill the table_phrases_to_delete set of the class.
|
||||
"""
|
||||
query = """
|
||||
SELECT table_name
|
||||
FROM information_schema.tables
|
||||
WHERE table_schema='public'
|
||||
AND table_name like 'place_classtype_%';
|
||||
"""
|
||||
with self.db_connection.cursor() as db_cursor:
|
||||
db_cursor.execute(SQL(query))
|
||||
for row in db_cursor:
|
||||
self.table_phrases_to_delete.add(row[0])
|
||||
|
||||
def _load_white_and_black_lists(self) \
|
||||
-> Tuple[Mapping[str, Sequence[str]], Mapping[str, Sequence[str]]]:
|
||||
"""
|
||||
Load white and black lists from phrases-settings.json.
|
||||
"""
|
||||
settings = self.config.load_sub_configuration('phrase-settings.json')
|
||||
|
||||
return settings['blackList'], settings['whiteList']
|
||||
|
||||
def _check_sanity(self, phrase: SpecialPhrase) -> bool:
|
||||
"""
|
||||
Check sanity of given inputs in case somebody added garbage in the wiki.
|
||||
If a bad class/type is detected the system will exit with an error.
|
||||
"""
|
||||
class_matchs = self.sanity_check_pattern.findall(phrase.p_class)
|
||||
type_matchs = self.sanity_check_pattern.findall(phrase.p_type)
|
||||
|
||||
if not class_matchs or not type_matchs:
|
||||
LOG.warning("Bad class/type: %s=%s. It will not be imported",
|
||||
phrase.p_class, phrase.p_type)
|
||||
return False
|
||||
return True
|
||||
|
||||
def _process_phrase(self, phrase: SpecialPhrase) -> Optional[Tuple[str, str]]:
|
||||
"""
|
||||
Processes the given phrase by checking black and white list
|
||||
and sanity.
|
||||
Return the class/type pair corresponding to the phrase.
|
||||
"""
|
||||
|
||||
# blacklisting: disallow certain class/type combinations
|
||||
if phrase.p_class in self.black_list.keys() \
|
||||
and phrase.p_type in self.black_list[phrase.p_class]:
|
||||
return None
|
||||
|
||||
# whitelisting: if class is in whitelist, allow only tags in the list
|
||||
if phrase.p_class in self.white_list.keys() \
|
||||
and phrase.p_type not in self.white_list[phrase.p_class]:
|
||||
return None
|
||||
|
||||
# sanity check, in case somebody added garbage in the wiki
|
||||
if not self._check_sanity(phrase):
|
||||
self.statistics_handler.notify_one_phrase_invalid()
|
||||
return None
|
||||
|
||||
self.word_phrases.add((phrase.p_label, phrase.p_class,
|
||||
phrase.p_type, phrase.p_operator))
|
||||
|
||||
return (phrase.p_class, phrase.p_type)
|
||||
|
||||
|
||||
def _create_classtype_table_and_indexes(self,
|
||||
class_type_pairs: Iterable[Tuple[str, str]]) -> None:
|
||||
"""
|
||||
Create table place_classtype for each given pair.
|
||||
Also create indexes on place_id and centroid.
|
||||
"""
|
||||
LOG.warning('Create tables and indexes...')
|
||||
|
||||
sql_tablespace = self.config.TABLESPACE_AUX_DATA
|
||||
if sql_tablespace:
|
||||
sql_tablespace = ' TABLESPACE ' + sql_tablespace
|
||||
|
||||
with self.db_connection.cursor() as db_cursor:
|
||||
db_cursor.execute("CREATE INDEX idx_placex_classtype ON placex (class, type)")
|
||||
|
||||
for pair in class_type_pairs:
|
||||
phrase_class = pair[0]
|
||||
phrase_type = pair[1]
|
||||
|
||||
table_name = _classtype_table(phrase_class, phrase_type)
|
||||
|
||||
if table_name in self.table_phrases_to_delete:
|
||||
self.statistics_handler.notify_one_table_ignored()
|
||||
# Remove this table from the ones to delete as it matches a
|
||||
# class/type still existing on the special phrases of the wiki.
|
||||
self.table_phrases_to_delete.remove(table_name)
|
||||
# So there is no need to create the table and indexes.
|
||||
continue
|
||||
|
||||
# Table creation
|
||||
self._create_place_classtype_table(sql_tablespace, phrase_class, phrase_type)
|
||||
|
||||
# Indexes creation
|
||||
self._create_place_classtype_indexes(sql_tablespace, phrase_class, phrase_type)
|
||||
|
||||
# Grant access on read to the web user.
|
||||
self._grant_access_to_webuser(phrase_class, phrase_type)
|
||||
|
||||
self.statistics_handler.notify_one_table_created()
|
||||
|
||||
with self.db_connection.cursor() as db_cursor:
|
||||
db_cursor.execute("DROP INDEX idx_placex_classtype")
|
||||
|
||||
|
||||
def _create_place_classtype_table(self, sql_tablespace: str,
|
||||
phrase_class: str, phrase_type: str) -> None:
|
||||
"""
|
||||
Create table place_classtype of the given phrase_class/phrase_type
|
||||
if it doesn't exist.
|
||||
"""
|
||||
table_name = _classtype_table(phrase_class, phrase_type)
|
||||
with self.db_connection.cursor() as cur:
|
||||
cur.execute(SQL("""CREATE TABLE IF NOT EXISTS {} {} AS
|
||||
SELECT place_id AS place_id,
|
||||
st_centroid(geometry) AS centroid
|
||||
FROM placex
|
||||
WHERE class = %s AND type = %s
|
||||
""").format(Identifier(table_name), SQL(sql_tablespace)),
|
||||
(phrase_class, phrase_type))
|
||||
|
||||
|
||||
def _create_place_classtype_indexes(self, sql_tablespace: str,
|
||||
phrase_class: str, phrase_type: str) -> None:
|
||||
"""
|
||||
Create indexes on centroid and place_id for the place_classtype table.
|
||||
"""
|
||||
index_prefix = f'idx_place_classtype_{phrase_class}_{phrase_type}_'
|
||||
base_table = _classtype_table(phrase_class, phrase_type)
|
||||
# Index on centroid
|
||||
if not self.db_connection.index_exists(index_prefix + 'centroid'):
|
||||
with self.db_connection.cursor() as db_cursor:
|
||||
db_cursor.execute(SQL("CREATE INDEX {} ON {} USING GIST (centroid) {}")
|
||||
.format(Identifier(index_prefix + 'centroid'),
|
||||
Identifier(base_table),
|
||||
SQL(sql_tablespace)))
|
||||
|
||||
# Index on place_id
|
||||
if not self.db_connection.index_exists(index_prefix + 'place_id'):
|
||||
with self.db_connection.cursor() as db_cursor:
|
||||
db_cursor.execute(SQL("CREATE INDEX {} ON {} USING btree(place_id) {}")
|
||||
.format(Identifier(index_prefix + 'place_id'),
|
||||
Identifier(base_table),
|
||||
SQL(sql_tablespace)))
|
||||
|
||||
|
||||
def _grant_access_to_webuser(self, phrase_class: str, phrase_type: str) -> None:
|
||||
"""
|
||||
Grant access on read to the table place_classtype for the webuser.
|
||||
"""
|
||||
table_name = _classtype_table(phrase_class, phrase_type)
|
||||
with self.db_connection.cursor() as db_cursor:
|
||||
db_cursor.execute(SQL("""GRANT SELECT ON {} TO {}""")
|
||||
.format(Identifier(table_name),
|
||||
Identifier(self.config.DATABASE_WEBUSER)))
|
||||
|
||||
def _remove_non_existent_tables_from_db(self) -> None:
|
||||
"""
|
||||
Remove special phrases which don't exist on the wiki anymore.
|
||||
Delete the place_classtype tables.
|
||||
"""
|
||||
LOG.warning('Cleaning database...')
|
||||
|
||||
# Delete place_classtype tables corresponding to class/type which
|
||||
# are not on the wiki anymore.
|
||||
with self.db_connection.cursor() as db_cursor:
|
||||
for table in self.table_phrases_to_delete:
|
||||
self.statistics_handler.notify_one_table_deleted()
|
||||
db_cursor.drop_table(table)
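
# Illustrative sketch, not part of the module above: roughly the SQL that the
# three helpers produce for a hypothetical class/type pair ('amenity',
# 'restaurant'), assuming _classtype_table() yields
# 'place_classtype_<class>_<type>', an empty tablespace clause and 'www-data'
# as the configured web user.
EXAMPLE_CLASSTYPE_SQL = """
CREATE TABLE IF NOT EXISTS place_classtype_amenity_restaurant AS
  SELECT place_id AS place_id, st_centroid(geometry) AS centroid
  FROM placex WHERE class = 'amenity' AND type = 'restaurant';
CREATE INDEX idx_place_classtype_amenity_restaurant_centroid
  ON place_classtype_amenity_restaurant USING GIST (centroid);
CREATE INDEX idx_place_classtype_amenity_restaurant_place_id
  ON place_classtype_amenity_restaurant USING btree(place_id);
GRANT SELECT ON place_classtype_amenity_restaurant TO "www-data";
"""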
68
src/nominatim_db/tools/special_phrases/sp_wiki_loader.py
Normal file
@@ -0,0 +1,68 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Module containing the SPWikiLoader class.
"""
from typing import Iterable
import re
import logging

from nominatim_core.config import Configuration
from nominatim_core.utils.url_utils import get_url
from .special_phrase import SpecialPhrase

LOG = logging.getLogger()

def _get_wiki_content(lang: str) -> str:
    """
    Request and return the content of the wiki page with the
    special phrases for the given language.

    Example of a requested URL:
    https://wiki.openstreetmap.org/wiki/Special:Export/Nominatim/Special_Phrases/EN
    """
    url = 'https://wiki.openstreetmap.org/wiki/Special:Export/Nominatim/Special_Phrases/' \
          + lang.upper()
    return get_url(url)

class SPWikiLoader:
    """
    Handles loading of special phrases from the wiki.
    """
    def __init__(self, config: Configuration) -> None:
        self.config = config
        # Compile the regex here to improve performance.
        self.occurence_pattern = re.compile(
            r'\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([\-YN])'
        )
        # Hack around a bug where building=yes was imported with quotes into the wiki
        self.type_fix_pattern = re.compile(r'\"|&quot;')

        self.languages = self.config.get_str_list('LANGUAGES') or \
                         ['af', 'ar', 'br', 'ca', 'cs', 'de', 'en', 'es',
                          'et', 'eu', 'fa', 'fi', 'fr', 'gl', 'hr', 'hu',
                          'ia', 'is', 'it', 'ja', 'mk', 'nl', 'no', 'pl',
                          'ps', 'pt', 'ru', 'sk', 'sl', 'sv', 'uk', 'vi',
                          'lv', 'tr']

    def generate_phrases(self) -> Iterable[SpecialPhrase]:
        """ Download the wiki pages for the configured languages
            and extract the phrases from the page.
        """
        for lang in self.languages:
            LOG.warning('Importing phrases for lang: %s...', lang)
            loaded_xml = _get_wiki_content(lang)

            # Each match has the format [label, class, type, operator, plural].
            matches = self.occurence_pattern.findall(loaded_xml)

            for match in matches:
                yield SpecialPhrase(match[0],
                                    match[1],
                                    self.type_fix_pattern.sub('', match[2]),
                                    match[3])
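
# Illustrative sketch, not part of the module above: how the occurence_pattern
# picks a phrase out of a wiki table row. The sample row below is hypothetical
# but follows the five-column format the regex expects
# (label || class || type || operator || plural flag).
import re

pattern = re.compile(
    r'\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([\-YN])')

row = '| Restaurant || amenity || restaurant || - || N'
label, klass, typ, operator, plural = pattern.findall(row)[0]
print(label.strip(), klass.strip(), typ.strip(), operator.strip(), plural)
# -> Restaurant amenity restaurant - N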
37
src/nominatim_db/tools/special_phrases/special_phrase.py
Normal file
@@ -0,0 +1,37 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Module containing the class SpecialPhrase.

This class is a model used to transfer a special phrase through
the loading and import process.
"""
from typing import Any

class SpecialPhrase:
    """
    Model representing a special phrase.
    """
    def __init__(self, p_label: str, p_class: str, p_type: str, p_operator: str) -> None:
        self.p_label = p_label.strip()
        self.p_class = p_class.strip()
        self.p_type = p_type.strip()
        # Needed because some operators on the wiki are not written in English.
        p_operator = p_operator.strip().lower()
        self.p_operator = '-' if p_operator not in ('near', 'in') else p_operator

    def __eq__(self, other: Any) -> bool:
        if not isinstance(other, SpecialPhrase):
            return False

        return self.p_label == other.p_label \
               and self.p_class == other.p_class \
               and self.p_type == other.p_type \
               and self.p_operator == other.p_operator

    def __hash__(self) -> int:
        return hash((self.p_label, self.p_class, self.p_type, self.p_operator))
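
# Illustrative sketch, not part of the module above: because SpecialPhrase
# implements __eq__ and __hash__, duplicate rows collected from several wiki
# pages can be deduplicated by putting them into a set. The phrases below are
# made-up examples.
phrases = {
    SpecialPhrase('Restaurant', 'amenity', 'restaurant', '-'),
    SpecialPhrase(' Restaurant ', 'amenity', 'restaurant', 'anywhere'),
}
assert len(phrases) == 1  # labels are stripped, unknown operators collapse to '-'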
149
src/nominatim_db/tools/tiger_data.py
Normal file
@@ -0,0 +1,149 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Functions for importing Tiger data and handling tarball and directory files.
"""
from typing import Any, TextIO, List, Union, cast
import csv
import io
import logging
import os
import tarfile

from psycopg2.extras import Json

from nominatim_core.config import Configuration
from nominatim_core.db.connection import connect
from nominatim_core.db.async_connection import WorkerPool
from nominatim_core.db.sql_preprocessor import SQLPreprocessor
from nominatim_core.errors import UsageError
from ..data.place_info import PlaceInfo
from ..tokenizer.base import AbstractAnalyzer, AbstractTokenizer
from . import freeze

LOG = logging.getLogger()

class TigerInput:
    """ Context manager that goes through Tiger input files which may
        either be in a directory or gzipped together in a tar file.
    """

    def __init__(self, data_dir: str) -> None:
        self.tar_handle = None
        self.files: List[Union[str, tarfile.TarInfo]] = []

        if data_dir.endswith('.tar.gz'):
            try:
                self.tar_handle = tarfile.open(data_dir) # pylint: disable=consider-using-with
            except tarfile.ReadError as err:
                LOG.fatal("Cannot open '%s'. Is this a tar file?", data_dir)
                raise UsageError("Cannot open Tiger data file.") from err

            self.files = [i for i in self.tar_handle.getmembers() if i.name.endswith('.csv')]
            LOG.warning("Found %d CSV files in tarfile with path %s", len(self.files), data_dir)
        else:
            files = os.listdir(data_dir)
            self.files = [os.path.join(data_dir, i) for i in files if i.endswith('.csv')]
            LOG.warning("Found %d CSV files in path %s", len(self.files), data_dir)

        if not self.files:
            LOG.warning("Tiger data import selected but no files found at %s", data_dir)


    def __enter__(self) -> 'TigerInput':
        return self


    def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
        if self.tar_handle:
            self.tar_handle.close()
            self.tar_handle = None


    def next_file(self) -> TextIO:
        """ Return a file handle to the next file to be processed.
            Raises an IndexError if there is no file left.
        """
        fname = self.files.pop(0)

        if self.tar_handle is not None:
            extracted = self.tar_handle.extractfile(fname)
            assert extracted is not None
            return io.TextIOWrapper(extracted)

        return open(cast(str, fname), encoding='utf-8')


    def __len__(self) -> int:
        return len(self.files)
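
# Illustrative sketch, not part of the module above: typical use of TigerInput
# as a context manager. The archive path below is hypothetical; the drain loop
# mirrors the one used by add_tiger_data further down.
with TigerInput('/data/tiger-2023.tar.gz') as tiger:
    while tiger:                      # __len__ makes the object falsy once empty
        with tiger.next_file() as fd:
            first_line = fd.readline()
            print(first_line.rstrip())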


def handle_threaded_sql_statements(pool: WorkerPool, fd: TextIO,
                                   analyzer: AbstractAnalyzer) -> None:
    """ Handles SQL statements with multiplexing.
    """
    lines = 0
    # Using a pool of database connections to execute the SQL statements.

    sql = "SELECT tiger_line_import(%s, %s, %s, %s, %s, %s)"

    for row in csv.DictReader(fd, delimiter=';'):
        try:
            address = dict(street=row['street'], postcode=row['postcode'])
            args = ('SRID=4326;' + row['geometry'],
                    int(row['from']), int(row['to']), row['interpolation'],
                    Json(analyzer.process_place(PlaceInfo({'address': address}))),
                    analyzer.normalize_postcode(row['postcode']))
        except ValueError:
            continue
        pool.next_free_worker().perform(sql, args=args)

        lines += 1
        if lines == 1000:
            print('.', end='', flush=True)
            lines = 0
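
# Illustrative sketch, not part of the module above: what csv.DictReader hands
# to the loop in handle_threaded_sql_statements for one made-up
# semicolon-separated Tiger row. Only the columns the function actually reads
# are shown; real input files may carry more.
import csv
import io

sample = io.StringIO(
    "from;to;interpolation;street;postcode;geometry\n"
    "100;198;even;Main St;62701;LINESTRING(-89.65 39.80,-89.64 39.80)\n")
row = next(csv.DictReader(sample, delimiter=';'))
print(int(row['from']), int(row['to']), row['interpolation'], row['postcode'])
# -> 100 198 even 62701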


def add_tiger_data(data_dir: str, config: Configuration, threads: int,
                   tokenizer: AbstractTokenizer) -> int:
    """ Import Tiger data from the directory or tar file `data_dir`.
    """
    dsn = config.get_libpq_dsn()

    with connect(dsn) as conn:
        is_frozen = freeze.is_frozen(conn)
        conn.close()

    if is_frozen:
        raise UsageError("Tiger cannot be imported when the database is frozen (GitHub issue #3048)")

    with TigerInput(data_dir) as tar:
        if not tar:
            return 1

        with connect(dsn) as conn:
            sql = SQLPreprocessor(conn, config)
            sql.run_sql_file(conn, 'tiger_import_start.sql')

        # Process the files, distributing the per-line SQL queries over
        # <threads - 1> worker connections.
        place_threads = max(1, threads - 1)

        with WorkerPool(dsn, place_threads, ignore_sql_errors=True) as pool:
            with tokenizer.name_analyzer() as analyzer:
                while tar:
                    with tar.next_file() as fd:
                        handle_threaded_sql_statements(pool, fd, analyzer)

        print('\n')

    LOG.warning("Creating indexes on Tiger data")
    with connect(dsn) as conn:
        sql = SQLPreprocessor(conn, config)
        sql.run_sql_file(conn, 'tiger_import_finish.sql')

    return 0
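
# Illustrative sketch, not part of the module above: the steps add_tiger_data
# runs through, spelled out as comments. All names refer to the definitions
# above; the wrapper merely delegates.
def _tiger_import_outline(data_dir: str, config: Configuration, threads: int,
                          tokenizer: AbstractTokenizer) -> int:
    # 1. abort if freeze.is_frozen() reports a frozen database
    # 2. collect the CSV files with TigerInput(data_dir)
    # 3. run tiger_import_start.sql through the SQLPreprocessor
    # 4. stream every file into handle_threaded_sql_statements() over a
    #    WorkerPool with max(1, threads - 1) connections
    # 5. run tiger_import_finish.sql to build the indexes
    return add_tiger_data(data_dir, config, threads, tokenizer)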