port replication initialisation to Python

This commit is contained in:
Sarah Hoffmann
2021-01-26 22:45:24 +01:00
parent 5b46fcad8e
commit d78f0ba804
13 changed files with 402 additions and 75 deletions

View File

@@ -116,64 +116,13 @@ $sBaseURL = getSetting('REPLICATION_URL');
if ($aResult['init-updates']) {
// sanity check that the replication URL is correct
$sBaseState = file_get_contents($sBaseURL.'/state.txt');
if ($sBaseState === false) {
echo "\nCannot find state.txt file at the configured replication URL.\n";
echo "Does the URL point to a directory containing OSM update data?\n\n";
fail('replication URL not reachable.');
}
// sanity check for pyosmium-get-changes
if (!$sPyosmiumBin) {
echo "\nNOMINATIM_PYOSMIUM_BINARY not configured.\n";
echo "You need to install pyosmium and set up the path to pyosmium-get-changes\n";
echo "in your local .env file.\n\n";
fail('NOMINATIM_PYOSMIUM_BINARY not configured');
$oCmd = (clone($oNominatimCmd))->addParams('replication', '--init');
if ($aResult['no-update-functions']) {
$oCmd->addParams('--no-update-functions');
}
$aOutput = 0;
$oCMD = new \Nominatim\Shell($sPyosmiumBin, '--help');
exec($oCMD->escapedCmd(), $aOutput, $iRet);
if ($iRet != 0) {
echo "Cannot execute pyosmium-get-changes.\n";
echo "Make sure you have pyosmium installed correctly\n";
echo "and have set up NOMINATIM_PYOSMIUM_BINARY to point to pyosmium-get-changes.\n";
fail('pyosmium-get-changes not found or not usable');
}
if (!$aResult['no-update-functions']) {
(clone($oNominatimCmd))->addParams('refresh', '--functions')->run();
}
$sDatabaseDate = getDatabaseDate($oDB);
if (!$sDatabaseDate) {
fail('Cannot determine date of database.');
}
$sWindBack = strftime('%Y-%m-%dT%H:%M:%SZ', strtotime($sDatabaseDate) - (3*60*60));
// get the appropriate state id
$aOutput = 0;
$oCMD = (new \Nominatim\Shell($sPyosmiumBin))
->addParams('--start-date', $sWindBack)
->addParams('--server', $sBaseURL);
exec($oCMD->escapedCmd(), $aOutput, $iRet);
if ($iRet != 0 || $aOutput[0] == 'None') {
fail('Error running pyosmium tools');
}
$oDB->exec('TRUNCATE import_status');
$sSQL = "INSERT INTO import_status (lastimportdate, sequence_id, indexed) VALUES('";
$sSQL .= $sDatabaseDate."',".$aOutput[0].', true)';
try {
$oDB->exec($sSQL);
} catch (\Nominatim\DatabaseError $e) {
fail('Could not enter sequence into database.');
}
echo "Done. Database updates will start at sequence $aOutput[0] ($sWindBack)\n";
$oCmd->run();
}
if ($aResult['check-for-updates']) {

View File

@@ -234,12 +234,29 @@ class UpdateReplication:
@staticmethod
def run(args):
try:
import osmium # pylint: disable=W0611
except ModuleNotFoundError:
LOG.fatal("pyosmium not installed. Replication functions not available.\n"
"To install pyosmium via pip: pip3 install osmium")
return 1
from .tools import replication, refresh
conn = connect(args.config.get_libpq_dsn())
params = ['update.php']
if args.init:
params.append('--init-updates')
if not args.update_functions:
params.append('--no-update-functions')
elif args.check_for_updates:
LOG.warning("Initialising replication updates")
replication.init_replication(conn, args.config.REPLICATION_URL)
if args.update_functions:
LOG.warning("Create functions")
refresh.create_functions(conn, args.config, args.data_dir,
True, False)
conn.close()
return 0
if args.check_for_updates:
params.append('--check-for-updates')
else:
if args.once:

View File

@@ -0,0 +1,58 @@
"""
Specialised connection and cursor functions.
"""
import logging
import psycopg2
import psycopg2.extensions
import psycopg2.extras
class _Cursor(psycopg2.extras.DictCursor):
""" A cursor returning dict-like objects and providing specialised
execution functions.
"""
def execute(self, query, args=None): # pylint: disable=W0221
""" Query execution that logs the SQL query when debugging is enabled.
"""
logger = logging.getLogger()
logger.debug(self.mogrify(query, args).decode('utf-8'))
super().execute(query, args)
def scalar(self, sql, args=None):
""" Execute query that returns a single value. The value is returned.
If the query yields more than one row, a ValueError is raised.
"""
self.execute(sql, args)
if self.rowcount != 1:
raise ValueError("Query did not return a single row.")
return self.fetchone()[0]
class _Connection(psycopg2.extensions.connection):
""" A connection that provides the specialised cursor by default and
adds convenience functions for administrating the database.
"""
def cursor(self, cursor_factory=_Cursor, **kwargs):
""" Return a new cursor. By default the specialised cursor is returned.
"""
return super().cursor(cursor_factory=cursor_factory, **kwargs)
def table_exists(self, table):
""" Check that a table with the given name exists in the database.
"""
with self.cursor() as cur:
num = cur.scalar("""SELECT count(*) FROM pg_tables
WHERE tablename = %s""", (table, ))
return num == 1
def connect(dsn):
""" Open a connection to the database using the specialised connection
factory.
"""
return psycopg2.connect(dsn, connection_factory=_Connection)

50
nominatim/db/status.py Normal file
View File

@@ -0,0 +1,50 @@
"""
Access and helper functions for the status table.
"""
import datetime as dt
import logging
import re
from ..tools.exec_utils import get_url
LOG = logging.getLogger()
def compute_database_date(conn):
""" Determine the date of the database from the newest object in the
data base.
"""
# First, find the node with the highest ID in the database
with conn.cursor() as cur:
osmid = cur.scalar("SELECT max(osm_id) FROM place WHERE osm_type='N'")
if osmid is None:
LOG.fatal("No data found in the database.")
raise RuntimeError("No data found in the database.")
LOG.info("Using node id %d for timestamp lookup", osmid)
# Get the node from the API to find the timestamp when it was created.
node_url = 'https://www.openstreetmap.org/api/0.6/node/{}/1'.format(osmid)
data = get_url(node_url)
match = re.search(r'timestamp="((\d{4})-(\d{2})-(\d{2})T(\d{2}):(\d{2}):(\d{2}))Z"', data)
if match is None:
LOG.fatal("The node data downloaded from the API does not contain valid data.\n"
"URL used: %s", node_url)
raise RuntimeError("Bad API data.")
LOG.debug("Found timestamp %s", match[1])
return dt.datetime.fromisoformat(match[1]).replace(tzinfo=dt.timezone.utc)
def set_status(conn, date, seq=None, indexed=True):
""" Replace the current status with the given status.
"""
assert date.tzinfo == dt.timezone.utc
with conn.cursor() as cur:
cur.execute("TRUNCATE TABLE import_status")
cur.execute("""INSERT INTO import_status (lastimportdate, sequence_id, indexed)
VALUES (%s, %s, %s)""", (date, seq, indexed))
conn.commit()

View File

@@ -3,8 +3,13 @@ Helper functions for executing external programs.
"""
import logging
import subprocess
import urllib.request as urlrequest
from urllib.parse import urlencode
from ..version import NOMINATIM_VERSION
LOG = logging.getLogger()
def run_legacy_script(script, *args, nominatim_env=None, throw_on_fail=False):
""" Run a Nominatim PHP script with the given arguments.
@@ -80,3 +85,16 @@ def run_api_script(endpoint, project_dir, extra_env=None, phpcgi_bin=None,
print(result[content_start + 4:].replace('\\n', '\n'))
return 0
def get_url(url):
""" Get the contents from the given URL and return it as a UTF-8 string.
"""
headers = {"User-Agent" : "Nominatim/" + NOMINATIM_VERSION}
try:
with urlrequest.urlopen(urlrequest.Request(url, headers=headers)) as response:
return response.read().decode('utf-8')
except:
LOG.fatal('Failed to load URL: %s', url)
raise

View File

@@ -0,0 +1,34 @@
"""
Functions for updating a database from a replication source.
"""
import datetime
import logging
from osmium.replication.server import ReplicationServer
from ..db import status
LOG = logging.getLogger()
def init_replication(conn, base_url):
""" Set up replication for the server at the given base URL.
"""
LOG.info("Using replication source: %s", base_url)
date = status.compute_database_date(conn)
# margin of error to make sure we get all data
date -= datetime.timedelta(hours=3)
repl = ReplicationServer(base_url)
seq = repl.timestamp_to_sequence(date)
if seq is None:
LOG.fatal("Cannot reach the configured replication service '%s'.\n"
"Does the URL point to a directory containing OSM update data?",
base_url)
raise RuntimeError("Failed to reach replication service")
status.set_status(conn, date=date, seq=seq)
LOG.warning("Updates intialised at sequence %s (%s)", seq, date)

5
nominatim/version.py Normal file
View File

@@ -0,0 +1,5 @@
"""
Version information for Nominatim.
"""
NOMINATIM_VERSION = "3.6.0"

View File

@@ -57,6 +57,9 @@ NOMINATIM_HTTP_PROXY_HOST=proxy.mydomain.com
NOMINATIM_HTTP_PROXY_PORT=3128
NOMINATIM_HTTP_PROXY_LOGIN=
NOMINATIM_HTTP_PROXY_PASSWORD=
# Also set these standard environment variables.
# HTTP_PROXY="http://user:pass@10.10.1.10:1080"
# HTTPS_PROXY="http://user:pass@10.10.1.10:1080"
# Location of the osm2pgsql binary.
# When empty, osm2pgsql is expected to reside in the osm2pgsql directory in

View File

@@ -1,6 +1,6 @@
drop table if exists import_status;
CREATE TABLE import_status (
lastimportdate timestamp NOT NULL,
lastimportdate timestamp with time zone NOT NULL,
sequence_id integer,
indexed boolean
);

View File

@@ -1,3 +1,4 @@
import itertools
import sys
from pathlib import Path
@@ -11,6 +12,7 @@ SRC_DIR = Path(__file__) / '..' / '..' / '..'
sys.path.insert(0, str(SRC_DIR.resolve()))
from nominatim.config import Configuration
from nominatim.db import connection
class _TestingCursor(psycopg2.extras.DictCursor):
""" Extension to the DictCursor class that provides execution
@@ -40,27 +42,42 @@ def temp_db(monkeypatch):
exported into NOMINATIM_DATABASE_DSN.
"""
name = 'test_nominatim_python_unittest'
with psycopg2.connect(database='postgres') as conn:
conn.set_isolation_level(0)
with conn.cursor() as cur:
cur.execute('DROP DATABASE IF EXISTS {}'.format(name))
cur.execute('CREATE DATABASE {}'.format(name))
conn = psycopg2.connect(database='postgres')
conn.set_isolation_level(0)
with conn.cursor() as cur:
cur.execute('DROP DATABASE IF EXISTS {}'.format(name))
cur.execute('CREATE DATABASE {}'.format(name))
conn.close()
monkeypatch.setenv('NOMINATIM_DATABASE_DSN' , 'dbname=' + name)
yield name
with psycopg2.connect(database='postgres') as conn:
conn.set_isolation_level(0)
with conn.cursor() as cur:
cur.execute('DROP DATABASE IF EXISTS {}'.format(name))
conn = psycopg2.connect(database='postgres')
conn.set_isolation_level(0)
with conn.cursor() as cur:
cur.execute('DROP DATABASE IF EXISTS {}'.format(name))
conn.close()
@pytest.fixture
def temp_db_with_extensions(temp_db):
conn = psycopg2.connect(database=temp_db)
with conn.cursor() as cur:
cur.execute('CREATE EXTENSION hstore; CREATE EXTENSION postgis;')
conn.commit()
conn.close()
return temp_db
@pytest.fixture
def temp_db_conn(temp_db):
""" Connection to the test database.
"""
conn = psycopg2.connect(database=temp_db)
conn = connection.connect('dbname=' + temp_db)
yield conn
conn.close()
@@ -80,3 +97,50 @@ def temp_db_cursor(temp_db):
@pytest.fixture
def def_config():
return Configuration(None, SRC_DIR.resolve() / 'settings')
@pytest.fixture
def status_table(temp_db_conn):
""" Create an empty version of the status table.
"""
with temp_db_conn.cursor() as cur:
cur.execute("""CREATE TABLE import_status (
lastimportdate timestamp with time zone NOT NULL,
sequence_id integer,
indexed boolean
)""")
temp_db_conn.commit()
@pytest.fixture
def place_table(temp_db_with_extensions, temp_db_conn):
""" Create an empty version of the place table.
"""
with temp_db_conn.cursor() as cur:
cur.execute("""CREATE TABLE place (
osm_id int8 NOT NULL,
osm_type char(1) NOT NULL,
class text NOT NULL,
type text NOT NULL,
name hstore,
admin_level smallint,
address hstore,
extratags hstore,
geometry Geometry(Geometry,4326) NOT NULL)""")
temp_db_conn.commit()
@pytest.fixture
def place_row(place_table, temp_db_cursor):
""" A factory for rows in the place table. The table is created as a
prerequisite to the fixture.
"""
idseq = itertools.count(1001)
def _insert(osm_type='N', osm_id=None, cls='amenity', typ='cafe', names=None,
admin_level=None, address=None, extratags=None, geom=None):
temp_db_cursor.execute("INSERT INTO place VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)",
(osm_id or next(idseq), osm_type, cls, typ, names,
admin_level, address, extratags,
geom or 'SRID=4326;POINT(0 0 )'))
return _insert

View File

@@ -7,6 +7,7 @@ import pytest
import nominatim.cli
import nominatim.indexer.indexer
import nominatim.tools.refresh
import nominatim.tools.replication
def call_nominatim(*args):
return nominatim.cli.nominatim(module_dir='build/module',
@@ -56,7 +57,6 @@ def test_cli_help(capsys):
(('import', '--continue', 'load-data'), 'setup'),
(('freeze',), 'setup'),
(('special-phrases',), 'specialphrases'),
(('replication',), 'update'),
(('add-data', '--tiger-data', 'tiger'), 'setup'),
(('add-data', '--file', 'foo.osm'), 'update'),
(('check-database',), 'check_import_finished'),
@@ -102,7 +102,7 @@ def test_index_command(monkeypatch, temp_db_cursor, params, do_bnds, do_ranks):
('importance', ('update.php', '--recompute-importance')),
('website', ('setup.php', '--setup-website')),
])
def test_refresh_legacy_command(mock_run_legacy, command, params):
def test_refresh_legacy_command(mock_run_legacy, temp_db, command, params):
assert 0 == call_nominatim('refresh', '--' + command)
assert mock_run_legacy.called == 1
@@ -115,7 +115,7 @@ def test_refresh_legacy_command(mock_run_legacy, command, params):
('address-levels', 'load_address_levels_from_file'),
('functions', 'create_functions'),
])
def test_refresh_command(monkeypatch, command, func):
def test_refresh_command(monkeypatch, temp_db, command, func):
func_mock = MockParamCapture()
monkeypatch.setattr(nominatim.tools.refresh, func, func_mock)
@@ -123,12 +123,26 @@ def test_refresh_command(monkeypatch, command, func):
assert func_mock.called == 1
def test_refresh_importance_computed_after_wiki_import(mock_run_legacy):
def test_refresh_importance_computed_after_wiki_import(mock_run_legacy, temp_db):
assert 0 == call_nominatim('refresh', '--importance', '--wiki-data')
assert mock_run_legacy.called == 2
assert mock_run_legacy.last_args == ('update.php', '--recompute-importance')
@pytest.mark.parametrize("params,func", [
(('--init', '--no-update-functions'), 'init_replication')
])
def test_replication_command(monkeypatch, temp_db, params, func):
func_mock = MockParamCapture()
monkeypatch.setattr(nominatim.tools.replication, func, func_mock)
assert 0 == call_nominatim('replication', *params)
assert func_mock.called == 1
@pytest.mark.parametrize("params", [
('search', '--query', 'new'),
('reverse', '--lat', '0', '--lon', '0'),

View File

@@ -0,0 +1,74 @@
"""
Tests for status table manipulation.
"""
import datetime as dt
import pytest
import nominatim.db.status
def test_compute_database_date_place_empty(status_table, place_table, temp_db_conn):
with pytest.raises(RuntimeError):
nominatim.db.status.compute_database_date(temp_db_conn)
OSM_NODE_DATA = """\
<osm version="0.6" generator="OpenStreetMap server" copyright="OpenStreetMap and contributors" attribution="http://www.openstreetmap.org/copyright" license="http://opendatacommons.org/licenses/odbl/1-0/">
<node id="45673" visible="true" version="1" changeset="2047" timestamp="2006-01-27T22:09:10Z" user="Foo" uid="111" lat="48.7586670" lon="8.1343060">
</node>
</osm>
"""
def test_compute_database_date_valid(monkeypatch, status_table, place_row, temp_db_conn):
place_row(osm_type='N', osm_id=45673)
requested_url = []
def mock_url(url):
requested_url.append(url)
return OSM_NODE_DATA
monkeypatch.setattr(nominatim.db.status, "get_url", mock_url)
date = nominatim.db.status.compute_database_date(temp_db_conn)
assert requested_url == ['https://www.openstreetmap.org/api/0.6/node/45673/1']
assert date == dt.datetime.fromisoformat('2006-01-27T22:09:10').replace(tzinfo=dt.timezone.utc)
def test_compute_database_broken_api(monkeypatch, status_table, place_row, temp_db_conn):
place_row(osm_type='N', osm_id=45673)
requested_url = []
def mock_url(url):
requested_url.append(url)
return '<osm version="0.6" generator="OpenStre'
monkeypatch.setattr(nominatim.db.status, "get_url", mock_url)
with pytest.raises(RuntimeError):
date = nominatim.db.status.compute_database_date(temp_db_conn)
def test_set_status_empty_table(status_table, temp_db_conn, temp_db_cursor):
date = dt.datetime.fromordinal(1000000).replace(tzinfo=dt.timezone.utc)
nominatim.db.status.set_status(temp_db_conn, date=date)
temp_db_cursor.execute("SELECT * FROM import_status")
assert temp_db_cursor.rowcount == 1
assert temp_db_cursor.fetchone() == [date, None, True]
def test_set_status_filled_table(status_table, temp_db_conn, temp_db_cursor):
date = dt.datetime.fromordinal(1000000).replace(tzinfo=dt.timezone.utc)
nominatim.db.status.set_status(temp_db_conn, date=date)
assert 1 == temp_db_cursor.scalar("SELECT count(*) FROM import_status")
date = dt.datetime.fromordinal(1000100).replace(tzinfo=dt.timezone.utc)
nominatim.db.status.set_status(temp_db_conn, date=date, seq=456, indexed=False)
temp_db_cursor.execute("SELECT * FROM import_status")
assert temp_db_cursor.rowcount == 1
assert temp_db_cursor.fetchone() == [date, 456, False]

View File

@@ -0,0 +1,41 @@
"""
Tests for replication functionality.
"""
import datetime as dt
import pytest
import nominatim.tools.replication
OSM_NODE_DATA = """\
<osm version="0.6" generator="OpenStreetMap server" copyright="OpenStreetMap and contributors" attribution="http://www.openstreetmap.org/copyright" license="http://opendatacommons.org/licenses/odbl/1-0/">
<node id="100" visible="true" version="1" changeset="2047" timestamp="2006-01-27T22:09:10Z" user="Foo" uid="111" lat="48.7586670" lon="8.1343060">
</node>
</osm>
"""
def test_init_replication_bad_base_url(monkeypatch, status_table, place_row, temp_db_conn, temp_db_cursor):
place_row(osm_type='N', osm_id=100)
monkeypatch.setattr(nominatim.db.status, "get_url", lambda u : OSM_NODE_DATA)
with pytest.raises(RuntimeError, match="Failed to reach replication service"):
nominatim.tools.replication.init_replication(temp_db_conn, 'https://test.io')
def test_init_replication_success(monkeypatch, status_table, place_row, temp_db_conn, temp_db_cursor):
place_row(osm_type='N', osm_id=100)
monkeypatch.setattr(nominatim.db.status, "get_url", lambda u : OSM_NODE_DATA)
monkeypatch.setattr(nominatim.tools.replication.ReplicationServer,
"timestamp_to_sequence",
lambda self, date: 234)
nominatim.tools.replication.init_replication(temp_db_conn, 'https://test.io')
temp_db_cursor.execute("SELECT * FROM import_status")
expected_date = dt.datetime.fromisoformat('2006-01-27T19:09:10').replace(tzinfo=dt.timezone.utc)
assert temp_db_cursor.rowcount == 1
assert temp_db_cursor.fetchone() == [expected_date, 234, True]