port replication initialisation to Python

This commit is contained in:
Sarah Hoffmann
2021-01-26 22:45:24 +01:00
parent 5b46fcad8e
commit d78f0ba804
13 changed files with 402 additions and 75 deletions

View File

@@ -116,64 +116,13 @@ $sBaseURL = getSetting('REPLICATION_URL');
if ($aResult['init-updates']) { if ($aResult['init-updates']) {
// sanity check that the replication URL is correct $oCmd = (clone($oNominatimCmd))->addParams('replication', '--init');
$sBaseState = file_get_contents($sBaseURL.'/state.txt');
if ($sBaseState === false) { if ($aResult['no-update-functions']) {
echo "\nCannot find state.txt file at the configured replication URL.\n"; $oCmd->addParams('--no-update-functions');
echo "Does the URL point to a directory containing OSM update data?\n\n";
fail('replication URL not reachable.');
}
// sanity check for pyosmium-get-changes
if (!$sPyosmiumBin) {
echo "\nNOMINATIM_PYOSMIUM_BINARY not configured.\n";
echo "You need to install pyosmium and set up the path to pyosmium-get-changes\n";
echo "in your local .env file.\n\n";
fail('NOMINATIM_PYOSMIUM_BINARY not configured');
} }
$aOutput = 0; $oCmd->run();
$oCMD = new \Nominatim\Shell($sPyosmiumBin, '--help');
exec($oCMD->escapedCmd(), $aOutput, $iRet);
if ($iRet != 0) {
echo "Cannot execute pyosmium-get-changes.\n";
echo "Make sure you have pyosmium installed correctly\n";
echo "and have set up NOMINATIM_PYOSMIUM_BINARY to point to pyosmium-get-changes.\n";
fail('pyosmium-get-changes not found or not usable');
}
if (!$aResult['no-update-functions']) {
(clone($oNominatimCmd))->addParams('refresh', '--functions')->run();
}
$sDatabaseDate = getDatabaseDate($oDB);
if (!$sDatabaseDate) {
fail('Cannot determine date of database.');
}
$sWindBack = strftime('%Y-%m-%dT%H:%M:%SZ', strtotime($sDatabaseDate) - (3*60*60));
// get the appropriate state id
$aOutput = 0;
$oCMD = (new \Nominatim\Shell($sPyosmiumBin))
->addParams('--start-date', $sWindBack)
->addParams('--server', $sBaseURL);
exec($oCMD->escapedCmd(), $aOutput, $iRet);
if ($iRet != 0 || $aOutput[0] == 'None') {
fail('Error running pyosmium tools');
}
$oDB->exec('TRUNCATE import_status');
$sSQL = "INSERT INTO import_status (lastimportdate, sequence_id, indexed) VALUES('";
$sSQL .= $sDatabaseDate."',".$aOutput[0].', true)';
try {
$oDB->exec($sSQL);
} catch (\Nominatim\DatabaseError $e) {
fail('Could not enter sequence into database.');
}
echo "Done. Database updates will start at sequence $aOutput[0] ($sWindBack)\n";
} }
if ($aResult['check-for-updates']) { if ($aResult['check-for-updates']) {

View File

@@ -234,12 +234,29 @@ class UpdateReplication:
@staticmethod @staticmethod
def run(args): def run(args):
try:
import osmium # pylint: disable=W0611
except ModuleNotFoundError:
LOG.fatal("pyosmium not installed. Replication functions not available.\n"
"To install pyosmium via pip: pip3 install osmium")
return 1
from .tools import replication, refresh
conn = connect(args.config.get_libpq_dsn())
params = ['update.php'] params = ['update.php']
if args.init: if args.init:
params.append('--init-updates') LOG.warning("Initialising replication updates")
if not args.update_functions: replication.init_replication(conn, args.config.REPLICATION_URL)
params.append('--no-update-functions') if args.update_functions:
elif args.check_for_updates: LOG.warning("Create functions")
refresh.create_functions(conn, args.config, args.data_dir,
True, False)
conn.close()
return 0
if args.check_for_updates:
params.append('--check-for-updates') params.append('--check-for-updates')
else: else:
if args.once: if args.once:

View File

@@ -0,0 +1,58 @@
"""
Specialised connection and cursor functions.
"""
import logging
import psycopg2
import psycopg2.extensions
import psycopg2.extras
class _Cursor(psycopg2.extras.DictCursor):
""" A cursor returning dict-like objects and providing specialised
execution functions.
"""
def execute(self, query, args=None): # pylint: disable=W0221
""" Query execution that logs the SQL query when debugging is enabled.
"""
logger = logging.getLogger()
logger.debug(self.mogrify(query, args).decode('utf-8'))
super().execute(query, args)
def scalar(self, sql, args=None):
""" Execute query that returns a single value. The value is returned.
If the query yields more than one row, a ValueError is raised.
"""
self.execute(sql, args)
if self.rowcount != 1:
raise ValueError("Query did not return a single row.")
return self.fetchone()[0]
class _Connection(psycopg2.extensions.connection):
""" A connection that provides the specialised cursor by default and
adds convenience functions for administrating the database.
"""
def cursor(self, cursor_factory=_Cursor, **kwargs):
""" Return a new cursor. By default the specialised cursor is returned.
"""
return super().cursor(cursor_factory=cursor_factory, **kwargs)
def table_exists(self, table):
""" Check that a table with the given name exists in the database.
"""
with self.cursor() as cur:
num = cur.scalar("""SELECT count(*) FROM pg_tables
WHERE tablename = %s""", (table, ))
return num == 1
def connect(dsn):
""" Open a connection to the database using the specialised connection
factory.
"""
return psycopg2.connect(dsn, connection_factory=_Connection)

50
nominatim/db/status.py Normal file
View File

@@ -0,0 +1,50 @@
"""
Access and helper functions for the status table.
"""
import datetime as dt
import logging
import re
from ..tools.exec_utils import get_url
LOG = logging.getLogger()
def compute_database_date(conn):
""" Determine the date of the database from the newest object in the
data base.
"""
# First, find the node with the highest ID in the database
with conn.cursor() as cur:
osmid = cur.scalar("SELECT max(osm_id) FROM place WHERE osm_type='N'")
if osmid is None:
LOG.fatal("No data found in the database.")
raise RuntimeError("No data found in the database.")
LOG.info("Using node id %d for timestamp lookup", osmid)
# Get the node from the API to find the timestamp when it was created.
node_url = 'https://www.openstreetmap.org/api/0.6/node/{}/1'.format(osmid)
data = get_url(node_url)
match = re.search(r'timestamp="((\d{4})-(\d{2})-(\d{2})T(\d{2}):(\d{2}):(\d{2}))Z"', data)
if match is None:
LOG.fatal("The node data downloaded from the API does not contain valid data.\n"
"URL used: %s", node_url)
raise RuntimeError("Bad API data.")
LOG.debug("Found timestamp %s", match[1])
return dt.datetime.fromisoformat(match[1]).replace(tzinfo=dt.timezone.utc)
def set_status(conn, date, seq=None, indexed=True):
""" Replace the current status with the given status.
"""
assert date.tzinfo == dt.timezone.utc
with conn.cursor() as cur:
cur.execute("TRUNCATE TABLE import_status")
cur.execute("""INSERT INTO import_status (lastimportdate, sequence_id, indexed)
VALUES (%s, %s, %s)""", (date, seq, indexed))
conn.commit()

View File

@@ -3,8 +3,13 @@ Helper functions for executing external programs.
""" """
import logging import logging
import subprocess import subprocess
import urllib.request as urlrequest
from urllib.parse import urlencode from urllib.parse import urlencode
from ..version import NOMINATIM_VERSION
LOG = logging.getLogger()
def run_legacy_script(script, *args, nominatim_env=None, throw_on_fail=False): def run_legacy_script(script, *args, nominatim_env=None, throw_on_fail=False):
""" Run a Nominatim PHP script with the given arguments. """ Run a Nominatim PHP script with the given arguments.
@@ -80,3 +85,16 @@ def run_api_script(endpoint, project_dir, extra_env=None, phpcgi_bin=None,
print(result[content_start + 4:].replace('\\n', '\n')) print(result[content_start + 4:].replace('\\n', '\n'))
return 0 return 0
def get_url(url):
""" Get the contents from the given URL and return it as a UTF-8 string.
"""
headers = {"User-Agent" : "Nominatim/" + NOMINATIM_VERSION}
try:
with urlrequest.urlopen(urlrequest.Request(url, headers=headers)) as response:
return response.read().decode('utf-8')
except:
LOG.fatal('Failed to load URL: %s', url)
raise

View File

@@ -0,0 +1,34 @@
"""
Functions for updating a database from a replication source.
"""
import datetime
import logging
from osmium.replication.server import ReplicationServer
from ..db import status
LOG = logging.getLogger()
def init_replication(conn, base_url):
""" Set up replication for the server at the given base URL.
"""
LOG.info("Using replication source: %s", base_url)
date = status.compute_database_date(conn)
# margin of error to make sure we get all data
date -= datetime.timedelta(hours=3)
repl = ReplicationServer(base_url)
seq = repl.timestamp_to_sequence(date)
if seq is None:
LOG.fatal("Cannot reach the configured replication service '%s'.\n"
"Does the URL point to a directory containing OSM update data?",
base_url)
raise RuntimeError("Failed to reach replication service")
status.set_status(conn, date=date, seq=seq)
LOG.warning("Updates intialised at sequence %s (%s)", seq, date)

5
nominatim/version.py Normal file
View File

@@ -0,0 +1,5 @@
"""
Version information for Nominatim.
"""
NOMINATIM_VERSION = "3.6.0"

View File

@@ -57,6 +57,9 @@ NOMINATIM_HTTP_PROXY_HOST=proxy.mydomain.com
NOMINATIM_HTTP_PROXY_PORT=3128 NOMINATIM_HTTP_PROXY_PORT=3128
NOMINATIM_HTTP_PROXY_LOGIN= NOMINATIM_HTTP_PROXY_LOGIN=
NOMINATIM_HTTP_PROXY_PASSWORD= NOMINATIM_HTTP_PROXY_PASSWORD=
# Also set these standard environment variables.
# HTTP_PROXY="http://user:pass@10.10.1.10:1080"
# HTTPS_PROXY="http://user:pass@10.10.1.10:1080"
# Location of the osm2pgsql binary. # Location of the osm2pgsql binary.
# When empty, osm2pgsql is expected to reside in the osm2pgsql directory in # When empty, osm2pgsql is expected to reside in the osm2pgsql directory in

View File

@@ -1,6 +1,6 @@
drop table if exists import_status; drop table if exists import_status;
CREATE TABLE import_status ( CREATE TABLE import_status (
lastimportdate timestamp NOT NULL, lastimportdate timestamp with time zone NOT NULL,
sequence_id integer, sequence_id integer,
indexed boolean indexed boolean
); );

View File

@@ -1,3 +1,4 @@
import itertools
import sys import sys
from pathlib import Path from pathlib import Path
@@ -11,6 +12,7 @@ SRC_DIR = Path(__file__) / '..' / '..' / '..'
sys.path.insert(0, str(SRC_DIR.resolve())) sys.path.insert(0, str(SRC_DIR.resolve()))
from nominatim.config import Configuration from nominatim.config import Configuration
from nominatim.db import connection
class _TestingCursor(psycopg2.extras.DictCursor): class _TestingCursor(psycopg2.extras.DictCursor):
""" Extension to the DictCursor class that provides execution """ Extension to the DictCursor class that provides execution
@@ -40,27 +42,42 @@ def temp_db(monkeypatch):
exported into NOMINATIM_DATABASE_DSN. exported into NOMINATIM_DATABASE_DSN.
""" """
name = 'test_nominatim_python_unittest' name = 'test_nominatim_python_unittest'
with psycopg2.connect(database='postgres') as conn: conn = psycopg2.connect(database='postgres')
conn.set_isolation_level(0)
with conn.cursor() as cur: conn.set_isolation_level(0)
cur.execute('DROP DATABASE IF EXISTS {}'.format(name)) with conn.cursor() as cur:
cur.execute('CREATE DATABASE {}'.format(name)) cur.execute('DROP DATABASE IF EXISTS {}'.format(name))
cur.execute('CREATE DATABASE {}'.format(name))
conn.close()
monkeypatch.setenv('NOMINATIM_DATABASE_DSN' , 'dbname=' + name) monkeypatch.setenv('NOMINATIM_DATABASE_DSN' , 'dbname=' + name)
yield name yield name
with psycopg2.connect(database='postgres') as conn: conn = psycopg2.connect(database='postgres')
conn.set_isolation_level(0)
with conn.cursor() as cur:
cur.execute('DROP DATABASE IF EXISTS {}'.format(name))
conn.set_isolation_level(0)
with conn.cursor() as cur:
cur.execute('DROP DATABASE IF EXISTS {}'.format(name))
conn.close()
@pytest.fixture
def temp_db_with_extensions(temp_db):
conn = psycopg2.connect(database=temp_db)
with conn.cursor() as cur:
cur.execute('CREATE EXTENSION hstore; CREATE EXTENSION postgis;')
conn.commit()
conn.close()
return temp_db
@pytest.fixture @pytest.fixture
def temp_db_conn(temp_db): def temp_db_conn(temp_db):
""" Connection to the test database. """ Connection to the test database.
""" """
conn = psycopg2.connect(database=temp_db) conn = connection.connect('dbname=' + temp_db)
yield conn yield conn
conn.close() conn.close()
@@ -80,3 +97,50 @@ def temp_db_cursor(temp_db):
@pytest.fixture @pytest.fixture
def def_config(): def def_config():
return Configuration(None, SRC_DIR.resolve() / 'settings') return Configuration(None, SRC_DIR.resolve() / 'settings')
@pytest.fixture
def status_table(temp_db_conn):
""" Create an empty version of the status table.
"""
with temp_db_conn.cursor() as cur:
cur.execute("""CREATE TABLE import_status (
lastimportdate timestamp with time zone NOT NULL,
sequence_id integer,
indexed boolean
)""")
temp_db_conn.commit()
@pytest.fixture
def place_table(temp_db_with_extensions, temp_db_conn):
""" Create an empty version of the place table.
"""
with temp_db_conn.cursor() as cur:
cur.execute("""CREATE TABLE place (
osm_id int8 NOT NULL,
osm_type char(1) NOT NULL,
class text NOT NULL,
type text NOT NULL,
name hstore,
admin_level smallint,
address hstore,
extratags hstore,
geometry Geometry(Geometry,4326) NOT NULL)""")
temp_db_conn.commit()
@pytest.fixture
def place_row(place_table, temp_db_cursor):
""" A factory for rows in the place table. The table is created as a
prerequisite to the fixture.
"""
idseq = itertools.count(1001)
def _insert(osm_type='N', osm_id=None, cls='amenity', typ='cafe', names=None,
admin_level=None, address=None, extratags=None, geom=None):
temp_db_cursor.execute("INSERT INTO place VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)",
(osm_id or next(idseq), osm_type, cls, typ, names,
admin_level, address, extratags,
geom or 'SRID=4326;POINT(0 0 )'))
return _insert

View File

@@ -7,6 +7,7 @@ import pytest
import nominatim.cli import nominatim.cli
import nominatim.indexer.indexer import nominatim.indexer.indexer
import nominatim.tools.refresh import nominatim.tools.refresh
import nominatim.tools.replication
def call_nominatim(*args): def call_nominatim(*args):
return nominatim.cli.nominatim(module_dir='build/module', return nominatim.cli.nominatim(module_dir='build/module',
@@ -56,7 +57,6 @@ def test_cli_help(capsys):
(('import', '--continue', 'load-data'), 'setup'), (('import', '--continue', 'load-data'), 'setup'),
(('freeze',), 'setup'), (('freeze',), 'setup'),
(('special-phrases',), 'specialphrases'), (('special-phrases',), 'specialphrases'),
(('replication',), 'update'),
(('add-data', '--tiger-data', 'tiger'), 'setup'), (('add-data', '--tiger-data', 'tiger'), 'setup'),
(('add-data', '--file', 'foo.osm'), 'update'), (('add-data', '--file', 'foo.osm'), 'update'),
(('check-database',), 'check_import_finished'), (('check-database',), 'check_import_finished'),
@@ -102,7 +102,7 @@ def test_index_command(monkeypatch, temp_db_cursor, params, do_bnds, do_ranks):
('importance', ('update.php', '--recompute-importance')), ('importance', ('update.php', '--recompute-importance')),
('website', ('setup.php', '--setup-website')), ('website', ('setup.php', '--setup-website')),
]) ])
def test_refresh_legacy_command(mock_run_legacy, command, params): def test_refresh_legacy_command(mock_run_legacy, temp_db, command, params):
assert 0 == call_nominatim('refresh', '--' + command) assert 0 == call_nominatim('refresh', '--' + command)
assert mock_run_legacy.called == 1 assert mock_run_legacy.called == 1
@@ -115,7 +115,7 @@ def test_refresh_legacy_command(mock_run_legacy, command, params):
('address-levels', 'load_address_levels_from_file'), ('address-levels', 'load_address_levels_from_file'),
('functions', 'create_functions'), ('functions', 'create_functions'),
]) ])
def test_refresh_command(monkeypatch, command, func): def test_refresh_command(monkeypatch, temp_db, command, func):
func_mock = MockParamCapture() func_mock = MockParamCapture()
monkeypatch.setattr(nominatim.tools.refresh, func, func_mock) monkeypatch.setattr(nominatim.tools.refresh, func, func_mock)
@@ -123,12 +123,26 @@ def test_refresh_command(monkeypatch, command, func):
assert func_mock.called == 1 assert func_mock.called == 1
def test_refresh_importance_computed_after_wiki_import(mock_run_legacy):
def test_refresh_importance_computed_after_wiki_import(mock_run_legacy, temp_db):
assert 0 == call_nominatim('refresh', '--importance', '--wiki-data') assert 0 == call_nominatim('refresh', '--importance', '--wiki-data')
assert mock_run_legacy.called == 2 assert mock_run_legacy.called == 2
assert mock_run_legacy.last_args == ('update.php', '--recompute-importance') assert mock_run_legacy.last_args == ('update.php', '--recompute-importance')
@pytest.mark.parametrize("params,func", [
(('--init', '--no-update-functions'), 'init_replication')
])
def test_replication_command(monkeypatch, temp_db, params, func):
func_mock = MockParamCapture()
monkeypatch.setattr(nominatim.tools.replication, func, func_mock)
assert 0 == call_nominatim('replication', *params)
assert func_mock.called == 1
@pytest.mark.parametrize("params", [ @pytest.mark.parametrize("params", [
('search', '--query', 'new'), ('search', '--query', 'new'),
('reverse', '--lat', '0', '--lon', '0'), ('reverse', '--lat', '0', '--lon', '0'),

View File

@@ -0,0 +1,74 @@
"""
Tests for status table manipulation.
"""
import datetime as dt
import pytest
import nominatim.db.status
def test_compute_database_date_place_empty(status_table, place_table, temp_db_conn):
with pytest.raises(RuntimeError):
nominatim.db.status.compute_database_date(temp_db_conn)
OSM_NODE_DATA = """\
<osm version="0.6" generator="OpenStreetMap server" copyright="OpenStreetMap and contributors" attribution="http://www.openstreetmap.org/copyright" license="http://opendatacommons.org/licenses/odbl/1-0/">
<node id="45673" visible="true" version="1" changeset="2047" timestamp="2006-01-27T22:09:10Z" user="Foo" uid="111" lat="48.7586670" lon="8.1343060">
</node>
</osm>
"""
def test_compute_database_date_valid(monkeypatch, status_table, place_row, temp_db_conn):
place_row(osm_type='N', osm_id=45673)
requested_url = []
def mock_url(url):
requested_url.append(url)
return OSM_NODE_DATA
monkeypatch.setattr(nominatim.db.status, "get_url", mock_url)
date = nominatim.db.status.compute_database_date(temp_db_conn)
assert requested_url == ['https://www.openstreetmap.org/api/0.6/node/45673/1']
assert date == dt.datetime.fromisoformat('2006-01-27T22:09:10').replace(tzinfo=dt.timezone.utc)
def test_compute_database_broken_api(monkeypatch, status_table, place_row, temp_db_conn):
place_row(osm_type='N', osm_id=45673)
requested_url = []
def mock_url(url):
requested_url.append(url)
return '<osm version="0.6" generator="OpenStre'
monkeypatch.setattr(nominatim.db.status, "get_url", mock_url)
with pytest.raises(RuntimeError):
date = nominatim.db.status.compute_database_date(temp_db_conn)
def test_set_status_empty_table(status_table, temp_db_conn, temp_db_cursor):
date = dt.datetime.fromordinal(1000000).replace(tzinfo=dt.timezone.utc)
nominatim.db.status.set_status(temp_db_conn, date=date)
temp_db_cursor.execute("SELECT * FROM import_status")
assert temp_db_cursor.rowcount == 1
assert temp_db_cursor.fetchone() == [date, None, True]
def test_set_status_filled_table(status_table, temp_db_conn, temp_db_cursor):
date = dt.datetime.fromordinal(1000000).replace(tzinfo=dt.timezone.utc)
nominatim.db.status.set_status(temp_db_conn, date=date)
assert 1 == temp_db_cursor.scalar("SELECT count(*) FROM import_status")
date = dt.datetime.fromordinal(1000100).replace(tzinfo=dt.timezone.utc)
nominatim.db.status.set_status(temp_db_conn, date=date, seq=456, indexed=False)
temp_db_cursor.execute("SELECT * FROM import_status")
assert temp_db_cursor.rowcount == 1
assert temp_db_cursor.fetchone() == [date, 456, False]

View File

@@ -0,0 +1,41 @@
"""
Tests for replication functionality.
"""
import datetime as dt
import pytest
import nominatim.tools.replication
OSM_NODE_DATA = """\
<osm version="0.6" generator="OpenStreetMap server" copyright="OpenStreetMap and contributors" attribution="http://www.openstreetmap.org/copyright" license="http://opendatacommons.org/licenses/odbl/1-0/">
<node id="100" visible="true" version="1" changeset="2047" timestamp="2006-01-27T22:09:10Z" user="Foo" uid="111" lat="48.7586670" lon="8.1343060">
</node>
</osm>
"""
def test_init_replication_bad_base_url(monkeypatch, status_table, place_row, temp_db_conn, temp_db_cursor):
place_row(osm_type='N', osm_id=100)
monkeypatch.setattr(nominatim.db.status, "get_url", lambda u : OSM_NODE_DATA)
with pytest.raises(RuntimeError, match="Failed to reach replication service"):
nominatim.tools.replication.init_replication(temp_db_conn, 'https://test.io')
def test_init_replication_success(monkeypatch, status_table, place_row, temp_db_conn, temp_db_cursor):
place_row(osm_type='N', osm_id=100)
monkeypatch.setattr(nominatim.db.status, "get_url", lambda u : OSM_NODE_DATA)
monkeypatch.setattr(nominatim.tools.replication.ReplicationServer,
"timestamp_to_sequence",
lambda self, date: 234)
nominatim.tools.replication.init_replication(temp_db_conn, 'https://test.io')
temp_db_cursor.execute("SELECT * FROM import_status")
expected_date = dt.datetime.fromisoformat('2006-01-27T19:09:10').replace(tzinfo=dt.timezone.utc)
assert temp_db_cursor.rowcount == 1
assert temp_db_cursor.fetchone() == [expected_date, 234, True]