From e5206133628c0ab1cacd6c5a04a2a9a973bfc86c Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Tue, 23 Feb 2021 10:11:21 +0100 Subject: [PATCH 01/18] convert connect() into a context manager --- nominatim/clicmd/admin.py | 5 +-- nominatim/clicmd/freeze.py | 5 +-- nominatim/clicmd/index.py | 5 +-- nominatim/clicmd/refresh.py | 22 +++++------ nominatim/clicmd/replication.py | 39 ++++++++----------- nominatim/db/connection.py | 10 ++++- nominatim/tools/check_database.py | 2 +- test/python/conftest.py | 5 +-- test/python/test_db_connection.py | 5 +-- test/python/test_tools_admin.py | 5 +-- test/python/test_tools_check_database.py | 4 ++ .../test_tools_refresh_create_functions.py | 5 +-- 12 files changed, 53 insertions(+), 59 deletions(-) diff --git a/nominatim/clicmd/admin.py b/nominatim/clicmd/admin.py index e5863575..fd9382eb 100644 --- a/nominatim/clicmd/admin.py +++ b/nominatim/clicmd/admin.py @@ -54,9 +54,8 @@ class AdminFuncs: if args.analyse_indexing: LOG.warning('Analysing performance of indexing function') from ..tools import admin - conn = connect(args.config.get_libpq_dsn()) - admin.analyse_indexing(conn, osm_id=args.osm_id, place_id=args.place_id) - conn.close() + with connect(args.config.get_libpq_dsn()) as conn: + admin.analyse_indexing(conn, osm_id=args.osm_id, place_id=args.place_id) return 0 diff --git a/nominatim/clicmd/freeze.py b/nominatim/clicmd/freeze.py index 8bca04b9..1b311e97 100644 --- a/nominatim/clicmd/freeze.py +++ b/nominatim/clicmd/freeze.py @@ -29,9 +29,8 @@ class SetupFreeze: def run(args): from ..tools import freeze - conn = connect(args.config.get_libpq_dsn()) - freeze.drop_update_tables(conn) + with connect(args.config.get_libpq_dsn()) as conn: + freeze.drop_update_tables(conn) freeze.drop_flatnode_file(args.config.FLATNODE_FILE) - conn.close() return 0 diff --git a/nominatim/clicmd/index.py b/nominatim/clicmd/index.py index ca3f9dee..96a69396 100644 --- a/nominatim/clicmd/index.py +++ b/nominatim/clicmd/index.py @@ -51,8 +51,7 @@ class UpdateIndex: if not args.no_boundaries and not args.boundaries_only \ and args.minrank == 0 and args.maxrank == 30: - conn = connect(args.config.get_libpq_dsn()) - status.set_indexed(conn, True) - conn.close() + with connect(args.config.get_libpq_dsn()) as conn: + status.set_indexed(conn, True) return 0 diff --git a/nominatim/clicmd/refresh.py b/nominatim/clicmd/refresh.py index ffbe628b..f68e185a 100644 --- a/nominatim/clicmd/refresh.py +++ b/nominatim/clicmd/refresh.py @@ -50,29 +50,25 @@ class UpdateRefresh: if args.postcodes: LOG.warning("Update postcodes centroid") - conn = connect(args.config.get_libpq_dsn()) - refresh.update_postcodes(conn, args.sqllib_dir) - conn.close() + with connect(args.config.get_libpq_dsn()) as conn: + refresh.update_postcodes(conn, args.sqllib_dir) if args.word_counts: LOG.warning('Recompute frequency of full-word search terms') - conn = connect(args.config.get_libpq_dsn()) - refresh.recompute_word_counts(conn, args.sqllib_dir) - conn.close() + with connect(args.config.get_libpq_dsn()) as conn: + refresh.recompute_word_counts(conn, args.sqllib_dir) if args.address_levels: cfg = Path(args.config.ADDRESS_LEVEL_CONFIG) LOG.warning('Updating address levels from %s', cfg) - conn = connect(args.config.get_libpq_dsn()) - refresh.load_address_levels_from_file(conn, cfg) - conn.close() + with connect(args.config.get_libpq_dsn()) as conn: + refresh.load_address_levels_from_file(conn, cfg) if args.functions: LOG.warning('Create functions') - conn = connect(args.config.get_libpq_dsn()) - 
refresh.create_functions(conn, args.config, args.sqllib_dir, - args.diffs, args.enable_debug_statements) - conn.close() + with connect(args.config.get_libpq_dsn()) as conn: + refresh.create_functions(conn, args.config, args.sqllib_dir, + args.diffs, args.enable_debug_statements) if args.wiki_data: run_legacy_script('setup.php', '--import-wikipedia-articles', diff --git a/nominatim/clicmd/replication.py b/nominatim/clicmd/replication.py index 2a19e6cd..e766be2b 100644 --- a/nominatim/clicmd/replication.py +++ b/nominatim/clicmd/replication.py @@ -62,13 +62,12 @@ class UpdateReplication: from ..tools import replication, refresh LOG.warning("Initialising replication updates") - conn = connect(args.config.get_libpq_dsn()) - replication.init_replication(conn, base_url=args.config.REPLICATION_URL) - if args.update_functions: - LOG.warning("Create functions") - refresh.create_functions(conn, args.config, args.sqllib_dir, - True, False) - conn.close() + with connect(args.config.get_libpq_dsn()) as conn: + replication.init_replication(conn, base_url=args.config.REPLICATION_URL) + if args.update_functions: + LOG.warning("Create functions") + refresh.create_functions(conn, args.config, args.sqllib_dir, + True, False) return 0 @@ -76,10 +75,8 @@ class UpdateReplication: def _check_for_updates(args): from ..tools import replication - conn = connect(args.config.get_libpq_dsn()) - ret = replication.check_for_updates(conn, base_url=args.config.REPLICATION_URL) - conn.close() - return ret + with connect(args.config.get_libpq_dsn()) as conn: + return replication.check_for_updates(conn, base_url=args.config.REPLICATION_URL) @staticmethod def _report_update(batchdate, start_import, start_index): @@ -122,13 +119,12 @@ class UpdateReplication: recheck_interval = args.config.get_int('REPLICATION_RECHECK_INTERVAL') while True: - conn = connect(args.config.get_libpq_dsn()) - start = dt.datetime.now(dt.timezone.utc) - state = replication.update(conn, params) - if state is not replication.UpdateState.NO_CHANGES: - status.log_status(conn, start, 'import') - batchdate, _, _ = status.get_status(conn) - conn.close() + with connect(args.config.get_libpq_dsn()) as conn: + start = dt.datetime.now(dt.timezone.utc) + state = replication.update(conn, params) + if state is not replication.UpdateState.NO_CHANGES: + status.log_status(conn, start, 'import') + batchdate, _, _ = status.get_status(conn) if state is not replication.UpdateState.NO_CHANGES and args.do_index: index_start = dt.datetime.now(dt.timezone.utc) @@ -137,10 +133,9 @@ class UpdateReplication: indexer.index_boundaries(0, 30) indexer.index_by_rank(0, 30) - conn = connect(args.config.get_libpq_dsn()) - status.set_indexed(conn, True) - status.log_status(conn, index_start, 'index') - conn.close() + with connect(args.config.get_libpq_dsn()) as conn: + status.set_indexed(conn, True) + status.log_status(conn, index_start, 'index') else: index_start = None diff --git a/nominatim/db/connection.py b/nominatim/db/connection.py index b941f46f..6bd81a2f 100644 --- a/nominatim/db/connection.py +++ b/nominatim/db/connection.py @@ -1,6 +1,7 @@ """ Specialised connection and cursor functions. """ +import contextlib import logging import psycopg2 @@ -84,9 +85,14 @@ class _Connection(psycopg2.extensions.connection): def connect(dsn): """ Open a connection to the database using the specialised connection - factory. + factory. The returned object may be used in conjunction with 'with'. + When used outside a context manager, use the `connection` attribute + to get the connection. 
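For reference, the change to connect() builds on the standard library's contextlib.closing helper. Stripped of the psycopg2 specifics, the pattern looks like this (Resource and open_resource are illustrative stand-ins, not Nominatim code):

    import contextlib

    class Resource:
        """Illustrative stand-in for a psycopg2 connection."""
        def close(self):
            print('closed')

    def open_resource():
        res = Resource()
        ctx = contextlib.closing(res)   # adds __enter__/__exit__ that call close()
        ctx.connection = res            # raw object stays reachable without 'with'
        return ctx

    with open_resource() as res:        # 'as' binds the wrapped Resource itself
        pass                            # ... use res ...
    # leaving the block prints 'closed'

Existing callers that expect a plain connection keep working through the connection attribute, while code converted to 'with' gets the close() call for free.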
""" try: - return psycopg2.connect(dsn, connection_factory=_Connection) + conn = psycopg2.connect(dsn, connection_factory=_Connection) + ctxmgr = contextlib.closing(conn) + ctxmgr.connection = conn + return ctxmgr except psycopg2.OperationalError as err: raise UsageError("Cannot connect to database: {}".format(err)) from err diff --git a/nominatim/tools/check_database.py b/nominatim/tools/check_database.py index 7b8da200..d8ab08cc 100644 --- a/nominatim/tools/check_database.py +++ b/nominatim/tools/check_database.py @@ -60,7 +60,7 @@ def check_database(config): """ Run a number of checks on the database and return the status. """ try: - conn = connect(config.get_libpq_dsn()) + conn = connect(config.get_libpq_dsn()).connection except UsageError as err: conn = _BadConnection(str(err)) diff --git a/test/python/conftest.py b/test/python/conftest.py index 72a56dcf..0e0e808c 100644 --- a/test/python/conftest.py +++ b/test/python/conftest.py @@ -85,9 +85,8 @@ def temp_db_with_extensions(temp_db): def temp_db_conn(temp_db): """ Connection to the test database. """ - conn = connection.connect('dbname=' + temp_db) - yield conn - conn.close() + with connection.connect('dbname=' + temp_db) as conn: + yield conn @pytest.fixture diff --git a/test/python/test_db_connection.py b/test/python/test_db_connection.py index 11ad691a..846ef864 100644 --- a/test/python/test_db_connection.py +++ b/test/python/test_db_connection.py @@ -7,9 +7,8 @@ from nominatim.db.connection import connect @pytest.fixture def db(temp_db): - conn = connect('dbname=' + temp_db) - yield conn - conn.close() + with connect('dbname=' + temp_db) as conn: + yield conn def test_connection_table_exists(db, temp_db_cursor): diff --git a/test/python/test_tools_admin.py b/test/python/test_tools_admin.py index a40a17db..36c7d6ff 100644 --- a/test/python/test_tools_admin.py +++ b/test/python/test_tools_admin.py @@ -9,9 +9,8 @@ from nominatim.tools import admin @pytest.fixture def db(temp_db, placex_table): - conn = connect('dbname=' + temp_db) - yield conn - conn.close() + with connect('dbname=' + temp_db) as conn: + yield conn def test_analyse_indexing_no_objects(db): with pytest.raises(UsageError): diff --git a/test/python/test_tools_check_database.py b/test/python/test_tools_check_database.py index 0b5c23a6..3787c3be 100644 --- a/test/python/test_tools_check_database.py +++ b/test/python/test_tools_check_database.py @@ -10,6 +10,10 @@ def test_check_database_unknown_db(def_config, monkeypatch): assert 1 == chkdb.check_database(def_config) +def test_check_database_fatal_test(def_config, temp_db): + assert 1 == chkdb.check_database(def_config) + + def test_check_conection_good(temp_db_conn, def_config): assert chkdb.check_connection(temp_db_conn, def_config) == chkdb.CheckState.OK diff --git a/test/python/test_tools_refresh_create_functions.py b/test/python/test_tools_refresh_create_functions.py index d219d748..ac2f2211 100644 --- a/test/python/test_tools_refresh_create_functions.py +++ b/test/python/test_tools_refresh_create_functions.py @@ -11,9 +11,8 @@ SQL_DIR = (Path(__file__) / '..' / '..' / '..' 
/ 'lib-sql').resolve() @pytest.fixture def db(temp_db): - conn = connect('dbname=' + temp_db) - yield conn - conn.close() + with connect('dbname=' + temp_db) as conn: + yield conn @pytest.fixture def db_with_tables(db): From af7226393a45a0ea5b87967c3231392b0e12da64 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Tue, 23 Feb 2021 14:11:11 +0100 Subject: [PATCH 02/18] add function to set up libpq environment Instead of parsing the DSN for each external libpq program we are going to execute, provide a function that feeds them all necessary parameters through the environment. osm2pgsql is the first user. --- nominatim/db/connection.py | 55 +++++++++++++++++++++++++++++-- nominatim/tools/exec_utils.py | 14 ++------ test/python/test_db_connection.py | 23 ++++++++++++- 3 files changed, 77 insertions(+), 15 deletions(-) diff --git a/nominatim/db/connection.py b/nominatim/db/connection.py index 6bd81a2f..68e988f6 100644 --- a/nominatim/db/connection.py +++ b/nominatim/db/connection.py @@ -3,6 +3,7 @@ Specialised connection and cursor functions. """ import contextlib import logging +import os import psycopg2 import psycopg2.extensions @@ -10,6 +11,8 @@ import psycopg2.extras from ..errors import UsageError +LOG = logging.getLogger() + class _Cursor(psycopg2.extras.DictCursor): """ A cursor returning dict-like objects and providing specialised execution functions. @@ -18,8 +21,7 @@ class _Cursor(psycopg2.extras.DictCursor): def execute(self, query, args=None): # pylint: disable=W0221 """ Query execution that logs the SQL query when debugging is enabled. """ - logger = logging.getLogger() - logger.debug(self.mogrify(query, args).decode('utf-8')) + LOG.debug(self.mogrify(query, args).decode('utf-8')) super().execute(query, args) @@ -96,3 +98,52 @@ def connect(dsn): return ctxmgr except psycopg2.OperationalError as err: raise UsageError("Cannot connect to database: {}".format(err)) from err + + +# Translation from PG connection string parameters to PG environment variables. +# Derived from https://www.postgresql.org/docs/current/libpq-envars.html. +_PG_CONNECTION_STRINGS = { + 'host': 'PGHOST', + 'hostaddr': 'PGHOSTADDR', + 'port': 'PGPORT', + 'dbname': 'PGDATABASE', + 'user': 'PGUSER', + 'password': 'PGPASSWORD', + 'passfile': 'PGPASSFILE', + 'channel_binding': 'PGCHANNELBINDING', + 'service': 'PGSERVICE', + 'options': 'PGOPTIONS', + 'application_name': 'PGAPPNAME', + 'sslmode': 'PGSSLMODE', + 'requiressl': 'PGREQUIRESSL', + 'sslcompression': 'PGSSLCOMPRESSION', + 'sslcert': 'PGSSLCERT', + 'sslkey': 'PGSSLKEY', + 'sslrootcert': 'PGSSLROOTCERT', + 'sslcrl': 'PGSSLCRL', + 'requirepeer': 'PGREQUIREPEER', + 'ssl_min_protocol_version': 'PGSSLMINPROTOCOLVERSION', + 'ssl_min_protocol_version': 'PGSSLMAXPROTOCOLVERSION', + 'gssencmode': 'PGGSSENCMODE', + 'krbsrvname': 'PGKRBSRVNAME', + 'gsslib': 'PGGSSLIB', + 'connect_timeout': 'PGCONNECT_TIMEOUT', + 'target_session_attrs': 'PGTARGETSESSIONATTRS', +} + + +def get_pg_env(dsn, base_env=None): + """ Return a copy of `base_env` with the environment variables for + PostgresSQL set up from the given database connection string. + If `base_env` is None, then the OS environment is used as a base + environment. 
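In practice the helper exists to hand external libpq programs their connection settings through the environment instead of a parsed DSN. A usage sketch, assuming psql is on PATH and the target database exists:

    import subprocess
    from nominatim.db.connection import get_pg_env

    # All connection parameters travel through PG* environment variables,
    # so the called program needs no DSN handling of its own.
    env = get_pg_env('dbname=nominatim user=osm')
    subprocess.run(['psql', '-c', 'SELECT version()'], env=env, check=True)

run_osm2pgsql() in tools/exec_utils.py is switched to exactly this pattern further down in this patch.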
+ """ + env = base_env if base_env is not None else os.environ + + for param, value in psycopg2.extensions.parse_dsn(dsn).items(): + if param in _PG_CONNECTION_STRINGS: + env[_PG_CONNECTION_STRINGS[param]] = value + else: + LOG.error("Unknown connection parameter '%s' ignored.", param) + + return env diff --git a/nominatim/tools/exec_utils.py b/nominatim/tools/exec_utils.py index f373f347..004a821c 100644 --- a/nominatim/tools/exec_utils.py +++ b/nominatim/tools/exec_utils.py @@ -10,6 +10,7 @@ from urllib.parse import urlencode from psycopg2.extensions import parse_dsn from ..version import NOMINATIM_VERSION +from ..db.connection import get_pg_env LOG = logging.getLogger() @@ -100,7 +101,7 @@ def run_php_server(server_address, base_dir): def run_osm2pgsql(options): """ Run osm2pgsql with the given options. """ - env = os.environ + env = get_pg_env(options['dsn']) cmd = [options['osm2pgsql'], '--hstore', '--latlon', '--slim', '--with-forward-dependencies', 'false', @@ -116,17 +117,6 @@ def run_osm2pgsql(options): if options['flatnode_file']: cmd.extend(('--flat-nodes', options['flatnode_file'])) - dsn = parse_dsn(options['dsn']) - if 'password' in dsn: - env['PGPASSWORD'] = dsn['password'] - if 'dbname' in dsn: - cmd.extend(('-d', dsn['dbname'])) - if 'user' in dsn: - cmd.extend(('--username', dsn['user'])) - for param in ('host', 'port'): - if param in dsn: - cmd.extend(('--' + param, dsn[param])) - if options.get('disable_jit', False): env['PGOPTIONS'] = '-c jit=off -c max_parallel_workers_per_gather=0' diff --git a/test/python/test_db_connection.py b/test/python/test_db_connection.py index 846ef864..fd5da754 100644 --- a/test/python/test_db_connection.py +++ b/test/python/test_db_connection.py @@ -3,7 +3,7 @@ Tests for specialised conenction and cursor classes. """ import pytest -from nominatim.db.connection import connect +from nominatim.db.connection import connect, get_pg_env @pytest.fixture def db(temp_db): @@ -48,3 +48,24 @@ def test_cursor_scalar_many_rows(db): with db.cursor() as cur: with pytest.raises(RuntimeError): cur.scalar('SELECT * FROM pg_tables') + + +def test_get_pg_env_add_variable(monkeypatch): + monkeypatch.delenv('PGPASSWORD', raising=False) + env = get_pg_env('user=fooF') + + assert env['PGUSER'] == 'fooF' + assert 'PGPASSWORD' not in env + + +def test_get_pg_env_overwrite_variable(monkeypatch): + monkeypatch.setenv('PGUSER', 'some default') + env = get_pg_env('user=overwriter') + + assert env['PGUSER'] == 'overwriter' + + +def test_get_pg_env_ignore_unknown(): + env = get_pg_env('tty=stuff', base_env={}) + + assert env == {} From b93ec2522e25102e8a36ca78423a53b4579c68c2 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Tue, 23 Feb 2021 19:05:51 +0100 Subject: [PATCH 03/18] use psql for executing sql files This allows to run larger files without needing to keep them in memory. --- nominatim/db/utils.py | 37 +++++++++++++++++++++++++++++------- nominatim/tools/refresh.py | 8 ++++---- test/python/test_db_utils.py | 33 +++++++++++++++++++++----------- 3 files changed, 56 insertions(+), 22 deletions(-) diff --git a/nominatim/db/utils.py b/nominatim/db/utils.py index abd72519..1a104e51 100644 --- a/nominatim/db/utils.py +++ b/nominatim/db/utils.py @@ -1,12 +1,35 @@ """ Helper functions for handling DB accesses. """ +import subprocess +import logging -def execute_file(conn, fname): - """ Read an SQL file and run its contents against the given connection. 
+from .connection import get_pg_env +from ..errors import UsageError + +LOG = logging.getLogger() + +def execute_file(dsn, fname, ignore_errors=False): + """ Read an SQL file and run its contents against the given database + using psql. """ - with fname.open('r') as fdesc: - sql = fdesc.read() - with conn.cursor() as cur: - cur.execute(sql) - conn.commit() + cmd = ['psql'] + if not ignore_errors: + cmd.extend(('-v', 'ON_ERROR_STOP=1')) + proc = subprocess.Popen(cmd, env=get_pg_env(dsn), stdin=subprocess.PIPE) + + if not LOG.isEnabledFor(logging.INFO): + proc.stdin.write('set client_min_messages to WARNING;'.encode('utf-8')) + + with fname.open('rb') as fdesc: + chunk = fdesc.read(2048) + while chunk and proc.poll() is None: + proc.stdin.write(chunk) + chunk = fdesc.read(2048) + + proc.stdin.close() + + ret = proc.wait() + print(ret, chunk) + if ret != 0 or chunk: + raise UsageError("Failed to execute SQL file.") diff --git a/nominatim/tools/refresh.py b/nominatim/tools/refresh.py index f09c0ced..33efe8f8 100644 --- a/nominatim/tools/refresh.py +++ b/nominatim/tools/refresh.py @@ -12,17 +12,17 @@ from ..db.utils import execute_file LOG = logging.getLogger() -def update_postcodes(conn, sql_dir): +def update_postcodes(dsn, sql_dir): """ Recalculate postcode centroids and add, remove and update entries in the location_postcode table. `conn` is an opne connection to the database. """ - execute_file(conn, sql_dir / 'update-postcodes.sql') + execute_file(dsn, sql_dir / 'update-postcodes.sql') -def recompute_word_counts(conn, sql_dir): +def recompute_word_counts(dsn, sql_dir): """ Compute the frequency of full-word search terms. """ - execute_file(conn, sql_dir / 'words_from_search_name.sql') + execute_file(dsn, sql_dir / 'words_from_search_name.sql') def _add_address_level_rows_from_entry(rows, entry): diff --git a/test/python/test_db_utils.py b/test/python/test_db_utils.py index e756f2c4..b2586ed0 100644 --- a/test/python/test_db_utils.py +++ b/test/python/test_db_utils.py @@ -5,26 +5,37 @@ import psycopg2 import pytest import nominatim.db.utils as db_utils +from nominatim.errors import UsageError -def test_execute_file_success(temp_db_conn, tmp_path): +@pytest.fixture +def dsn(temp_db): + return 'dbname=' + temp_db + +def test_execute_file_success(dsn, temp_db_cursor, tmp_path): tmpfile = tmp_path / 'test.sql' tmpfile.write_text('CREATE TABLE test (id INT);\nINSERT INTO test VALUES(56);') - db_utils.execute_file(temp_db_conn, tmpfile) + db_utils.execute_file(dsn, tmpfile) - with temp_db_conn.cursor() as cur: - cur.execute('SELECT * FROM test') + temp_db_cursor.execute('SELECT * FROM test') - assert cur.rowcount == 1 - assert cur.fetchone()[0] == 56 + assert temp_db_cursor.rowcount == 1 + assert temp_db_cursor.fetchone()[0] == 56 -def test_execute_file_bad_file(temp_db_conn, tmp_path): +def test_execute_file_bad_file(dsn, tmp_path): with pytest.raises(FileNotFoundError): - db_utils.execute_file(temp_db_conn, tmp_path / 'test2.sql') + db_utils.execute_file(dsn, tmp_path / 'test2.sql') -def test_execute_file_bad_sql(temp_db_conn, tmp_path): +def test_execute_file_bad_sql(dsn, tmp_path): tmpfile = tmp_path / 'test.sql' tmpfile.write_text('CREATE STABLE test (id INT)') - with pytest.raises(psycopg2.ProgrammingError): - db_utils.execute_file(temp_db_conn, tmpfile) + with pytest.raises(UsageError): + db_utils.execute_file(dsn, tmpfile) + + +def test_execute_file_bad_sql_ignore_errors(dsn, tmp_path): + tmpfile = tmp_path / 'test.sql' + tmpfile.write_text('CREATE STABLE test (id INT)') + + 
db_utils.execute_file(dsn, tmpfile, ignore_errors=True) From f6e894a53af83a69f553555cb4a6248d57a58391 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Tue, 23 Feb 2021 22:50:23 +0100 Subject: [PATCH 04/18] port database setup function to python Hide the former PHP functions in a transition command until they are removed. --- lib-php/Shell.php | 7 +- lib-php/admin/setup.php | 13 ++- lib-php/setup/SetupClass.php | 90 ---------------- nominatim/cli.py | 2 + nominatim/clicmd/__init__.py | 1 + nominatim/clicmd/refresh.py | 6 +- nominatim/clicmd/transition.py | 53 ++++++++++ nominatim/db/connection.py | 20 +++- nominatim/db/utils.py | 26 +++-- nominatim/tools/database_import.py | 121 ++++++++++++++++++++++ nominatim/tools/exec_utils.py | 3 - nominatim/version.py | 3 + test/Makefile | 3 + test/python/conftest.py | 3 + test/python/test_cli.py | 14 ++- test/python/test_db_connection.py | 11 ++ test/python/test_db_utils.py | 1 + test/python/test_tools_database_import.py | 96 +++++++++++++++++ 18 files changed, 357 insertions(+), 116 deletions(-) create mode 100644 nominatim/clicmd/transition.py create mode 100644 nominatim/tools/database_import.py create mode 100644 test/python/test_tools_database_import.py diff --git a/lib-php/Shell.php b/lib-php/Shell.php index 72f90735..52a7d8fb 100644 --- a/lib-php/Shell.php +++ b/lib-php/Shell.php @@ -48,7 +48,7 @@ class Shell return join(' ', $aEscaped); } - public function run() + public function run($bExitOnFail = False) { $sCmd = $this->escapedCmd(); // $aEnv does not need escaping, proc_open seems to handle it fine @@ -67,6 +67,11 @@ class Shell fclose($aPipes[0]); // no stdin $iStat = proc_close($hProc); + + if ($iStat != 0 && $bExitOnFail) { + exit($iStat); + } + return $iStat; } diff --git a/lib-php/admin/setup.php b/lib-php/admin/setup.php index 241b873c..902c24e0 100644 --- a/lib-php/admin/setup.php +++ b/lib-php/admin/setup.php @@ -56,6 +56,15 @@ setupHTTPProxy(); $bDidSomething = false; +$oNominatimCmd = new \Nominatim\Shell(getSetting('NOMINATIM_TOOL')); +if (isset($aCMDResult['quiet']) && $aCMDResult['quiet']) { + $oNominatimCmd->addParams('--quiet'); +} +if ($aCMDResult['verbose']) { + $oNominatimCmd->addParams('--verbose'); +} + + //******************************************************* // Making some sanity check: // Check if osm-file is set and points to a valid file @@ -72,12 +81,12 @@ $oSetup = new SetupFunctions($aCMDResult); // go through complete process if 'all' is selected or start selected functions if ($aCMDResult['create-db'] || $aCMDResult['all']) { $bDidSomething = true; - $oSetup->createDB(); + (clone($oNominatimCmd))->addParams('transition', '--create-db')->run(true); } if ($aCMDResult['setup-db'] || $aCMDResult['all']) { $bDidSomething = true; - $oSetup->setupDB(); + (clone($oNominatimCmd))->addParams('transition', '--setup-db')->run(true); } if ($aCMDResult['import-data'] || $aCMDResult['all']) { diff --git a/lib-php/setup/SetupClass.php b/lib-php/setup/SetupClass.php index a423e12c..1e7dccb4 100755 --- a/lib-php/setup/SetupClass.php +++ b/lib-php/setup/SetupClass.php @@ -84,96 +84,6 @@ class SetupFunctions } } - public function createDB() - { - info('Create DB'); - $oDB = new \Nominatim\DB; - - if ($oDB->checkConnection()) { - fail('database already exists ('.getSetting('DATABASE_DSN').')'); - } - - $oCmd = (new \Nominatim\Shell('createdb')) - ->addParams('-E', 'UTF-8') - ->addParams('-p', $this->aDSNInfo['port']); - - if (isset($this->aDSNInfo['username'])) { - $oCmd->addParams('-U', $this->aDSNInfo['username']); - } - if 
(isset($this->aDSNInfo['password'])) { - $oCmd->addEnvPair('PGPASSWORD', $this->aDSNInfo['password']); - } - if (isset($this->aDSNInfo['hostspec'])) { - $oCmd->addParams('-h', $this->aDSNInfo['hostspec']); - } - $oCmd->addParams($this->aDSNInfo['database']); - - $result = $oCmd->run(); - if ($result != 0) fail('Error executing external command: '.$oCmd->escapedCmd()); - } - - public function setupDB() - { - info('Setup DB'); - - $fPostgresVersion = $this->db()->getPostgresVersion(); - echo 'Postgres version found: '.$fPostgresVersion."\n"; - - if ($fPostgresVersion < 9.03) { - fail('Minimum supported version of Postgresql is 9.3.'); - } - - $this->pgsqlRunScript('CREATE EXTENSION IF NOT EXISTS hstore'); - $this->pgsqlRunScript('CREATE EXTENSION IF NOT EXISTS postgis'); - - $fPostgisVersion = $this->db()->getPostgisVersion(); - echo 'Postgis version found: '.$fPostgisVersion."\n"; - - if ($fPostgisVersion < 2.2) { - echo "Minimum required Postgis version 2.2\n"; - exit(1); - } - - $sPgUser = getSetting('DATABASE_WEBUSER'); - $i = $this->db()->getOne("select count(*) from pg_user where usename = '$sPgUser'"); - if ($i == 0) { - echo "\nERROR: Web user '".$sPgUser."' does not exist. Create it with:\n"; - echo "\n createuser ".$sPgUser."\n\n"; - exit(1); - } - - if (!getSetting('DATABASE_MODULE_PATH')) { - // If no custom module path is set then copy the module into the - // project directory, but only if it is not the same file already - // (aka we are running from the build dir). - $sDest = CONST_InstallDir.'/module'; - if ($sDest != CONST_Default_ModulePath) { - if (!file_exists($sDest)) { - mkdir($sDest); - } - if (!copy(CONST_Default_ModulePath.'/nominatim.so', $sDest.'/nominatim.so')) { - echo "Failed to copy database module to $sDest."; - exit(1); - } - chmod($sDest.'/nominatim.so', 0755); - info("Database module installed at $sDest."); - } else { - info('Running from build directory. Leaving database module as is.'); - } - } else { - info('Using database module from DATABASE_MODULE_PATH ('.getSetting('DATABASE_MODULE_PATH').').'); - } - // Try accessing the C module, so we know early if something is wrong - $this->checkModulePresence(); // raises exception on failure - - $this->pgsqlRunScriptFile(CONST_DataDir.'/country_name.sql'); - $this->pgsqlRunScriptFile(CONST_DataDir.'/country_osm_grid.sql.gz'); - - if ($this->bNoPartitions) { - $this->pgsqlRunScript('update country_name set partition = 0'); - } - } - public function importData($sOSMFile) { info('Import data'); diff --git a/nominatim/cli.py b/nominatim/cli.py index 83ecf67b..8668c51c 100644 --- a/nominatim/cli.py +++ b/nominatim/cli.py @@ -354,4 +354,6 @@ def nominatim(**kwargs): else: parser.parser.epilog = 'php-cgi not found. Query commands not available.' 
+ parser.add_subcommand('transition', clicmd.AdminTransition) + return parser.run(**kwargs) diff --git a/nominatim/clicmd/__init__.py b/nominatim/clicmd/__init__.py index ae970c82..78570b1b 100644 --- a/nominatim/clicmd/__init__.py +++ b/nominatim/clicmd/__init__.py @@ -8,3 +8,4 @@ from .index import UpdateIndex from .refresh import UpdateRefresh from .admin import AdminFuncs from .freeze import SetupFreeze +from .transition import AdminTransition diff --git a/nominatim/clicmd/refresh.py b/nominatim/clicmd/refresh.py index f68e185a..5dca41de 100644 --- a/nominatim/clicmd/refresh.py +++ b/nominatim/clicmd/refresh.py @@ -50,13 +50,11 @@ class UpdateRefresh: if args.postcodes: LOG.warning("Update postcodes centroid") - with connect(args.config.get_libpq_dsn()) as conn: - refresh.update_postcodes(conn, args.sqllib_dir) + refresh.update_postcodes(args.config.get_libpq_dsn(), args.sqllib_dir) if args.word_counts: LOG.warning('Recompute frequency of full-word search terms') - with connect(args.config.get_libpq_dsn()) as conn: - refresh.recompute_word_counts(conn, args.sqllib_dir) + refresh.recompute_word_counts(args.config.get_libpq_dsn(), args.sqllib_dir) if args.address_levels: cfg = Path(args.config.ADDRESS_LEVEL_CONFIG) diff --git a/nominatim/clicmd/transition.py b/nominatim/clicmd/transition.py new file mode 100644 index 00000000..2f351be2 --- /dev/null +++ b/nominatim/clicmd/transition.py @@ -0,0 +1,53 @@ +""" +Implementation of the 'transition' subcommand. + +This subcommand provides standins for functions that were available +through the PHP scripts but are now no longer directly accessible. +This module will be removed as soon as the transition phase is over. +""" +import logging + +from ..db.connection import connect + +# Do not repeat documentation of subcommand classes. +# pylint: disable=C0111 +# Using non-top-level imports to avoid eventually unused imports. +# pylint: disable=E0012,C0415 + +LOG = logging.getLogger() + +class AdminTransition: + """\ + Internal functions for code transition. Do not use. 
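Each class registered through add_subcommand() follows the same small protocol: a static add_args() that declares the options and a static run() that receives the parsed arguments and returns an exit code. A minimal sketch (ExampleCmd is made up for illustration):

    class ExampleCmd:
        """Example of the interface expected from a subcommand class."""

        @staticmethod
        def add_args(parser):
            parser.add_argument('--flag', action='store_true',
                                help='illustrative option')

        @staticmethod
        def run(args):
            return 0 if args.flag else 1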
+ """ + + @staticmethod + def add_args(parser): + group = parser.add_argument_group('Sub-functions') + group.add_argument('--create-db', action='store_true', + help='Create nominatim db') + group.add_argument('--setup-db', action='store_true', + help='Build a blank nominatim db') + group = parser.add_argument_group('Options') + group.add_argument('--no-partitions', action='store_true', + help='Do not partition search indices') + + @staticmethod + def run(args): + from ..tools import database_import + + if args.create_db: + LOG.warning('Create DB') + database_import.create_db(args.config.get_libpq_dsn()) + + if args.setup_db: + LOG.warning('Setup DB') + mpath = database_import.install_module(args.module_dir, args.project_dir, + args.config.DATABASE_MODULE_PATH) + + with connect(args.config.get_libpq_dsn()) as conn: + database_import.setup_extensions(conn) + database_import.check_module_dir_path(conn, mpath) + + database_import.import_base_data(args.config.get_libpq_dsn(), + args.data_dir, args.no_partitions) diff --git a/nominatim/db/connection.py b/nominatim/db/connection.py index 68e988f6..12f5f4e1 100644 --- a/nominatim/db/connection.py +++ b/nominatim/db/connection.py @@ -81,9 +81,21 @@ class _Connection(psycopg2.extensions.connection): """ version = self.server_version if version < 100000: - return (version / 10000, (version % 10000) / 100) + return (int(version / 10000), (version % 10000) / 100) + + return (int(version / 10000), version % 10000) + + + def postgis_version_tuple(self): + """ Return the postgis version installed in the database as a + tuple of (major, minor). Assumes that the PostGIS extension + has been installed already. + """ + with self.cursor() as cur: + version = cur.scalar('SELECT postgis_lib_version()') + + return tuple((int(x) for x in version.split('.')[:2])) - return (version / 10000, version % 10000) def connect(dsn): """ Open a connection to the database using the specialised connection @@ -123,7 +135,7 @@ _PG_CONNECTION_STRINGS = { 'sslcrl': 'PGSSLCRL', 'requirepeer': 'PGREQUIREPEER', 'ssl_min_protocol_version': 'PGSSLMINPROTOCOLVERSION', - 'ssl_min_protocol_version': 'PGSSLMAXPROTOCOLVERSION', + 'ssl_max_protocol_version': 'PGSSLMAXPROTOCOLVERSION', 'gssencmode': 'PGGSSENCMODE', 'krbsrvname': 'PGKRBSRVNAME', 'gsslib': 'PGGSSLIB', @@ -138,7 +150,7 @@ def get_pg_env(dsn, base_env=None): If `base_env` is None, then the OS environment is used as a base environment. """ - env = base_env if base_env is not None else os.environ + env = dict(base_env if base_env is not None else os.environ) for param, value in psycopg2.extensions.parse_dsn(dsn).items(): if param in _PG_CONNECTION_STRINGS: diff --git a/nominatim/db/utils.py b/nominatim/db/utils.py index 1a104e51..575f3010 100644 --- a/nominatim/db/utils.py +++ b/nominatim/db/utils.py @@ -3,12 +3,24 @@ Helper functions for handling DB accesses. """ import subprocess import logging +import gzip from .connection import get_pg_env from ..errors import UsageError LOG = logging.getLogger() +def _pipe_to_proc(proc, fdesc): + chunk = fdesc.read(2048) + while chunk and proc.poll() is None: + try: + proc.stdin.write(chunk) + except BrokenPipeError as exc: + raise UsageError("Failed to execute SQL file.") from exc + chunk = fdesc.read(2048) + + return len(chunk) + def execute_file(dsn, fname, ignore_errors=False): """ Read an SQL file and run its contents against the given database using psql. 
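For context, the approach above boils down to streaming the file into psql's stdin in small chunks, so even very large (optionally gzipped) SQL dumps never have to be held in memory. A reduced standalone sketch, with error handling omitted and psql assumed to be on PATH:

    import gzip
    import subprocess
    from pathlib import Path

    def stream_sql(fname: Path, env=None):
        proc = subprocess.Popen(['psql', '-v', 'ON_ERROR_STOP=1'],
                                env=env, stdin=subprocess.PIPE)
        opener = gzip.open if fname.suffix == '.gz' else open
        with opener(str(fname), 'rb') as fdesc:
            chunk = fdesc.read(2048)
            while chunk and proc.poll() is None:   # stop early if psql exits
                proc.stdin.write(chunk)
                chunk = fdesc.read(2048)
        proc.stdin.close()
        return proc.wait()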
@@ -21,15 +33,15 @@ def execute_file(dsn, fname, ignore_errors=False): if not LOG.isEnabledFor(logging.INFO): proc.stdin.write('set client_min_messages to WARNING;'.encode('utf-8')) - with fname.open('rb') as fdesc: - chunk = fdesc.read(2048) - while chunk and proc.poll() is None: - proc.stdin.write(chunk) - chunk = fdesc.read(2048) + if fname.suffix == '.gz': + with gzip.open(str(fname), 'rb') as fdesc: + remain = _pipe_to_proc(proc, fdesc) + else: + with fname.open('rb') as fdesc: + remain = _pipe_to_proc(proc, fdesc) proc.stdin.close() ret = proc.wait() - print(ret, chunk) - if ret != 0 or chunk: + if ret != 0 or remain > 0: raise UsageError("Failed to execute SQL file.") diff --git a/nominatim/tools/database_import.py b/nominatim/tools/database_import.py new file mode 100644 index 00000000..89e01f66 --- /dev/null +++ b/nominatim/tools/database_import.py @@ -0,0 +1,121 @@ +""" +Functions for setting up and importing a new Nominatim database. +""" +import logging +import subprocess +import shutil + +from ..db.connection import connect, get_pg_env +from ..db import utils as db_utils +from ..errors import UsageError +from ..version import POSTGRESQL_REQUIRED_VERSION, POSTGIS_REQUIRED_VERSION + +LOG = logging.getLogger() + +def create_db(dsn, rouser=None): + """ Create a new database for the given DSN. Fails when the database + already exists or the PostgreSQL version is too old. + Uses `createdb` to create the database. + + If 'rouser' is given, then the function also checks that the user + with that given name exists. + + Requires superuser rights by the caller. + """ + proc = subprocess.run(['createdb'], env=get_pg_env(dsn), check=False) + + if proc.returncode != 0: + raise UsageError('Creating new database failed.') + + with connect(dsn) as conn: + postgres_version = conn.server_version_tuple() # pylint: disable=E1101 + if postgres_version < POSTGRESQL_REQUIRED_VERSION: + LOG.fatal('Minimum supported version of Postgresql is %d.%d. ' + 'Found version %d.%d.', + POSTGRESQL_REQUIRED_VERSION[0], POSTGRESQL_REQUIRED_VERSION[1], + postgres_version[0], postgres_version[1]) + raise UsageError('PostgreSQL server is too old.') + + if rouser is not None: + with conn.cursor() as cur: # pylint: disable=E1101 + cnt = cur.scalar('SELECT count(*) FROM pg_user where usename = %s', + (rouser, )) + if cnt == 0: + LOG.fatal("Web user '%s' does not exists. Create it with:\n" + "\n createuser %s", rouser, rouser) + raise UsageError('Missing read-only user.') + + + +def setup_extensions(conn): + """ Set up all extensions needed for Nominatim. Also checks that the + versions of the extensions are sufficient. + """ + with conn.cursor() as cur: + cur.execute('CREATE EXTENSION IF NOT EXISTS hstore') + cur.execute('CREATE EXTENSION IF NOT EXISTS postgis') + conn.commit() + + postgis_version = conn.postgis_version_tuple() + if postgis_version < POSTGIS_REQUIRED_VERSION: + LOG.fatal('Minimum supported version of PostGIS is %d.%d. ' + 'Found version %d.%d.', + POSTGIS_REQUIRED_VERSION[0], POSTGIS_REQUIRED_VERSION[1], + postgis_version[0], postgis_version[1]) + raise UsageError('PostGIS version is too old.') + + +def install_module(src_dir, project_dir, module_dir): + """ Copy the normalization module from src_dir into the project + directory under the '/module' directory. If 'module_dir' is set, then + use the module from there instead and check that it is accessible + for Postgresql. + + The function detects when the installation is run from the + build directory. It doesn't touch the module in that case. 
+ """ + if not module_dir: + module_dir = project_dir / 'module' + + if not module_dir.exists() or not src_dir.samefile(module_dir): + + if not module_dir.exists(): + module_dir.mkdir() + + destfile = module_dir / 'nominatim.so' + shutil.copy(str(src_dir / 'nominatim.so'), str(destfile)) + destfile.chmod(0o755) + + LOG.info('Database module installed at %s', str(destfile)) + else: + LOG.info('Running from build directory. Leaving database module as is.') + else: + LOG.info("Using custom path for database module at '%s'", module_dir) + + return module_dir + + +def check_module_dir_path(conn, path): + """ Check that the normalisation module can be found and executed + from the given path. + """ + with conn.cursor() as cur: + cur.execute("""CREATE FUNCTION nominatim_test_import_func(text) + RETURNS text AS '{}/nominatim.so', 'transliteration' + LANGUAGE c IMMUTABLE STRICT; + DROP FUNCTION nominatim_test_import_func(text) + """.format(path)) + + +def import_base_data(dsn, sql_dir, ignore_partitions=False): + """ Create and populate the tables with basic static data that provides + the background for geocoding. + """ + db_utils.execute_file(dsn, sql_dir / 'country_name.sql') + db_utils.execute_file(dsn, sql_dir / 'country_osm_grid.sql.gz') + + if ignore_partitions: + with connect(dsn) as conn: + with conn.cursor() as cur: # pylint: disable=E1101 + cur.execute('UPDATE country_name SET partition = 0') + conn.commit() # pylint: disable=E1101 diff --git a/nominatim/tools/exec_utils.py b/nominatim/tools/exec_utils.py index 004a821c..b2ccc4a2 100644 --- a/nominatim/tools/exec_utils.py +++ b/nominatim/tools/exec_utils.py @@ -2,13 +2,10 @@ Helper functions for executing external programs. """ import logging -import os import subprocess import urllib.request as urlrequest from urllib.parse import urlencode -from psycopg2.extensions import parse_dsn - from ..version import NOMINATIM_VERSION from ..db.connection import get_pg_env diff --git a/nominatim/version.py b/nominatim/version.py index a2ddc9fa..8d1c6849 100644 --- a/nominatim/version.py +++ b/nominatim/version.py @@ -3,3 +3,6 @@ Version information for Nominatim. """ NOMINATIM_VERSION = "3.6.0" + +POSTGRESQL_REQUIRED_VERSION = (9, 3) +POSTGIS_REQUIRED_VERSION = (2, 2) diff --git a/test/Makefile b/test/Makefile index b65e465d..613b974d 100644 --- a/test/Makefile +++ b/test/Makefile @@ -10,5 +10,8 @@ bdd-no-test-db: php: cd php && phpunit ./ +python: + pytest python + .PHONY: bdd php no-test-db diff --git a/test/python/conftest.py b/test/python/conftest.py index 0e0e808c..6d4d3ac9 100644 --- a/test/python/conftest.py +++ b/test/python/conftest.py @@ -105,6 +105,9 @@ def temp_db_cursor(temp_db): def def_config(): return Configuration(None, SRC_DIR.resolve() / 'settings') +@pytest.fixture +def src_dir(): + return SRC_DIR.resolve() @pytest.fixture def status_table(temp_db_conn): diff --git a/test/python/test_cli.py b/test/python/test_cli.py index aa6a5c7f..03325b8d 100644 --- a/test/python/test_cli.py +++ b/test/python/test_cli.py @@ -6,9 +6,11 @@ correct functionionality. They use a lot of monkeypatching to avoid executing the actual functions. """ import datetime as dt +import time +from pathlib import Path + import psycopg2 import pytest -import time import nominatim.cli import nominatim.clicmd.api @@ -23,14 +25,16 @@ import nominatim.tools.replication from nominatim.errors import UsageError from nominatim.db import status +SRC_DIR = (Path(__file__) / '..' / '..' 
/ '..').resolve() + def call_nominatim(*args): return nominatim.cli.nominatim(module_dir='build/module', osm2pgsql_path='build/osm2pgsql/osm2pgsql', - phplib_dir='lib-php', - data_dir='.', + phplib_dir=str(SRC_DIR / 'lib-php'), + data_dir=str(SRC_DIR / 'data'), phpcgi_path='/usr/bin/php-cgi', - sqllib_dir='lib-sql', - config_dir='settings', + sqllib_dir=str(SRC_DIR / 'lib-sql'), + config_dir=str(SRC_DIR / 'settings'), cli_args=args) class MockParamCapture: diff --git a/test/python/test_db_connection.py b/test/python/test_db_connection.py index fd5da754..dcbfb8bf 100644 --- a/test/python/test_db_connection.py +++ b/test/python/test_db_connection.py @@ -37,6 +37,17 @@ def test_connection_server_version_tuple(db): assert len(ver) == 2 assert ver[0] > 8 + +def test_connection_postgis_version_tuple(db, temp_db_cursor): + temp_db_cursor.execute('CREATE EXTENSION postgis') + + ver = db.postgis_version_tuple() + + assert isinstance(ver, tuple) + assert len(ver) == 2 + assert ver[0] >= 2 + + def test_cursor_scalar(db, temp_db_cursor): temp_db_cursor.execute('CREATE TABLE dummy (id INT)') diff --git a/test/python/test_db_utils.py b/test/python/test_db_utils.py index b2586ed0..1c3b834a 100644 --- a/test/python/test_db_utils.py +++ b/test/python/test_db_utils.py @@ -26,6 +26,7 @@ def test_execute_file_bad_file(dsn, tmp_path): with pytest.raises(FileNotFoundError): db_utils.execute_file(dsn, tmp_path / 'test2.sql') + def test_execute_file_bad_sql(dsn, tmp_path): tmpfile = tmp_path / 'test.sql' tmpfile.write_text('CREATE STABLE test (id INT)') diff --git a/test/python/test_tools_database_import.py b/test/python/test_tools_database_import.py new file mode 100644 index 00000000..a4ab16f9 --- /dev/null +++ b/test/python/test_tools_database_import.py @@ -0,0 +1,96 @@ +""" +Tests for functions to import a new database. 
+""" +import pytest +import psycopg2 +import sys + +from nominatim.tools import database_import +from nominatim.errors import UsageError + +@pytest.fixture +def nonexistant_db(): + dbname = 'test_nominatim_python_unittest' + + conn = psycopg2.connect(database='postgres') + + conn.set_isolation_level(0) + with conn.cursor() as cur: + cur.execute('DROP DATABASE IF EXISTS {}'.format(dbname)) + + yield dbname + + with conn.cursor() as cur: + cur.execute('DROP DATABASE IF EXISTS {}'.format(dbname)) + + +def test_create_db_success(nonexistant_db): + database_import.create_db('dbname=' + nonexistant_db, rouser='www-data') + + conn = psycopg2.connect(database=nonexistant_db) + conn.close() + + +def test_create_db_already_exists(temp_db): + with pytest.raises(UsageError): + database_import.create_db('dbname=' + temp_db) + + +def test_create_db_unsupported_version(nonexistant_db, monkeypatch): + monkeypatch.setattr(database_import, 'POSTGRESQL_REQUIRED_VERSION', (100, 4)) + + with pytest.raises(UsageError, match='PostgreSQL server is too old.'): + database_import.create_db('dbname=' + nonexistant_db) + + +def test_create_db_missing_ro_user(nonexistant_db): + with pytest.raises(UsageError, match='Missing read-only user.'): + database_import.create_db('dbname=' + nonexistant_db, rouser='sdfwkjkjgdugu2;jgsafkljas;') + + +def test_setup_extensions(temp_db_conn, temp_db_cursor): + database_import.setup_extensions(temp_db_conn) + + temp_db_cursor.execute('CREATE TABLE t (h HSTORE, geom GEOMETRY(Geometry, 4326))') + + +def test_setup_extensions_old_postgis(temp_db_conn, monkeypatch): + monkeypatch.setattr(database_import, 'POSTGIS_REQUIRED_VERSION', (50, 50)) + + with pytest.raises(UsageError, match='PostGIS version is too old.'): + database_import.setup_extensions(temp_db_conn) + + +def test_install_module(tmp_path): + src_dir = tmp_path / 'source' + src_dir.mkdir() + (src_dir / 'nominatim.so').write_text('TEST nomiantim.so') + + project_dir = tmp_path / 'project' + project_dir.mkdir() + + database_import.install_module(src_dir, project_dir, '') + + outfile = project_dir / 'module' / 'nominatim.so' + + assert outfile.exists() + assert outfile.read_text() == 'TEST nomiantim.so' + assert outfile.stat().st_mode == 33261 + + +def test_import_base_data(src_dir, temp_db, temp_db_cursor): + temp_db_cursor.execute('CREATE EXTENSION hstore') + temp_db_cursor.execute('CREATE EXTENSION postgis') + database_import.import_base_data('dbname=' + temp_db, src_dir / 'data') + + assert temp_db_cursor.scalar('SELECT count(*) FROM country_name') > 0 + + +def test_import_base_data_ignore_partitions(src_dir, temp_db, temp_db_cursor): + temp_db_cursor.execute('CREATE EXTENSION hstore') + temp_db_cursor.execute('CREATE EXTENSION postgis') + database_import.import_base_data('dbname=' + temp_db, src_dir / 'data', + ignore_partitions=True) + + assert temp_db_cursor.scalar('SELECT count(*) FROM country_name') > 0 + assert temp_db_cursor.scalar('SELECT count(*) FROM country_name WHERE partition != 0') == 0 From 7222235579bc0d287c86fea6d7e22959132c7377 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Wed, 24 Feb 2021 10:38:19 +0100 Subject: [PATCH 05/18] introduce custom object for cmdline arguments Allows to define special functions over the arguments. Also splits CLI tests in two files as they have become too many. 
--- nominatim/cli.py | 4 +- nominatim/clicmd/args.py | 22 +++++ nominatim/clicmd/replication.py | 13 +-- test/python/mocks.py | 18 ++++ test/python/test_cli.py | 94 +------------------- test/python/test_cli_replication.py | 127 ++++++++++++++++++++++++++++ 6 files changed, 173 insertions(+), 105 deletions(-) create mode 100644 nominatim/clicmd/args.py create mode 100644 test/python/mocks.py create mode 100644 test/python/test_cli_replication.py diff --git a/nominatim/cli.py b/nominatim/cli.py index 8668c51c..e1824cc6 100644 --- a/nominatim/cli.py +++ b/nominatim/cli.py @@ -12,6 +12,7 @@ from .config import Configuration from .tools.exec_utils import run_legacy_script, run_php_server from .errors import UsageError from . import clicmd +from .clicmd.args import NominatimArgs LOG = logging.getLogger() @@ -62,7 +63,8 @@ class CommandlineParser: """ Parse the command line arguments of the program and execute the appropriate subcommand. """ - args = self.parser.parse_args(args=kwargs.get('cli_args')) + args = NominatimArgs() + self.parser.parse_args(args=kwargs.get('cli_args'), namespace=args) if args.subcommand is None: self.parser.print_help() diff --git a/nominatim/clicmd/args.py b/nominatim/clicmd/args.py new file mode 100644 index 00000000..f9b94ef2 --- /dev/null +++ b/nominatim/clicmd/args.py @@ -0,0 +1,22 @@ +""" +Provides custom functions over command-line arguments. +""" + + +class NominatimArgs: + """ Customized namespace class for the nominatim command line tool + to receive the command-line arguments. + """ + + def osm2pgsql_options(self, default_cache, default_threads): + """ Return the standard osm2pgsql options that can be derived + from the command line arguments. The resulting dict can be + further customized and then used in `run_osm2pgsql()`. + """ + return dict(osm2pgsql=self.config.OSM2PGSQL_BINARY or self.osm2pgsql_path, + osm2pgsql_cache=self.osm2pgsql_cache or default_cache, + osm2pgsql_style=self.config.get_import_style_file(), + threads=self.threads or default_threads, + dsn=self.config.get_libpq_dsn(), + flatnode_file=self.config.FLATNODE_FILE) + diff --git a/nominatim/clicmd/replication.py b/nominatim/clicmd/replication.py index e766be2b..fc18945e 100644 --- a/nominatim/clicmd/replication.py +++ b/nominatim/clicmd/replication.py @@ -17,17 +17,6 @@ LOG = logging.getLogger() # Using non-top-level imports to make pyosmium optional for replication only. # pylint: disable=E0012,C0415 -def _osm2pgsql_options_from_args(args, default_cache, default_threads): - """ Set up the standard osm2pgsql from the command line arguments. - """ - return dict(osm2pgsql=args.osm2pgsql_path, - osm2pgsql_cache=args.osm2pgsql_cache or default_cache, - osm2pgsql_style=args.config.get_import_style_file(), - threads=args.threads or default_threads, - dsn=args.config.get_libpq_dsn(), - flatnode_file=args.config.FLATNODE_FILE) - - class UpdateReplication: """\ Update the database using an online replication service. 
@@ -96,7 +85,7 @@ class UpdateReplication: from ..tools import replication from ..indexer.indexer import Indexer - params = _osm2pgsql_options_from_args(args, 2000, 1) + params = args.osm2pgsql_options(default_cache=2000, default_threads=1) params.update(base_url=args.config.REPLICATION_URL, update_interval=args.config.get_int('REPLICATION_UPDATE_INTERVAL'), import_file=args.project_dir / 'osmosischange.osc', diff --git a/test/python/mocks.py b/test/python/mocks.py new file mode 100644 index 00000000..415e18b3 --- /dev/null +++ b/test/python/mocks.py @@ -0,0 +1,18 @@ +""" +Custom mocks for testing. +""" + + +class MockParamCapture: + """ Mock that records the parameters with which a function was called + as well as the number of calls. + """ + def __init__(self, retval=0): + self.called = 0 + self.return_value = retval + + def __call__(self, *args, **kwargs): + self.called += 1 + self.last_args = args + self.last_kwargs = kwargs + return self.return_value diff --git a/test/python/test_cli.py b/test/python/test_cli.py index 03325b8d..2b4240b4 100644 --- a/test/python/test_cli.py +++ b/test/python/test_cli.py @@ -5,11 +5,8 @@ These tests just check that the various command line parameters route to the correct functionionality. They use a lot of monkeypatching to avoid executing the actual functions. """ -import datetime as dt -import time from pathlib import Path -import psycopg2 import pytest import nominatim.cli @@ -21,9 +18,8 @@ import nominatim.tools.admin import nominatim.tools.check_database import nominatim.tools.freeze import nominatim.tools.refresh -import nominatim.tools.replication -from nominatim.errors import UsageError -from nominatim.db import status + +from mocks import MockParamCapture SRC_DIR = (Path(__file__) / '..' / '..' / '..').resolve() @@ -37,19 +33,6 @@ def call_nominatim(*args): config_dir=str(SRC_DIR / 'settings'), cli_args=args) -class MockParamCapture: - """ Mock that records the parameters with which a function was called - as well as the number of calls. 
- """ - def __init__(self, retval=0): - self.called = 0 - self.return_value = retval - - def __call__(self, *args, **kwargs): - self.called += 1 - self.last_args = args - self.last_kwargs = kwargs - return self.return_value @pytest.fixture def mock_run_legacy(monkeypatch): @@ -186,79 +169,6 @@ def test_refresh_importance_computed_after_wiki_import(mock_func_factory, temp_d assert mock_run_legacy.last_args == ('update.php', '--recompute-importance') -@pytest.mark.parametrize("params,func", [ - (('--init', '--no-update-functions'), 'init_replication'), - (('--check-for-updates',), 'check_for_updates') - ]) -def test_replication_command(mock_func_factory, temp_db, params, func): - func_mock = mock_func_factory(nominatim.tools.replication, func) - - assert 0 == call_nominatim('replication', *params) - assert func_mock.called == 1 - - -def test_replication_update_bad_interval(monkeypatch, temp_db): - monkeypatch.setenv('NOMINATIM_REPLICATION_UPDATE_INTERVAL', 'xx') - - assert call_nominatim('replication') == 1 - - -def test_replication_update_bad_interval_for_geofabrik(monkeypatch, temp_db): - monkeypatch.setenv('NOMINATIM_REPLICATION_URL', - 'https://download.geofabrik.de/europe/ireland-and-northern-ireland-updates') - - assert call_nominatim('replication') == 1 - - -@pytest.mark.parametrize("state", [nominatim.tools.replication.UpdateState.UP_TO_DATE, - nominatim.tools.replication.UpdateState.NO_CHANGES]) -def test_replication_update_once_no_index(mock_func_factory, temp_db, temp_db_conn, - status_table, state): - status.set_status(temp_db_conn, date=dt.datetime.now(dt.timezone.utc), seq=1) - func_mock = mock_func_factory(nominatim.tools.replication, 'update') - - assert 0 == call_nominatim('replication', '--once', '--no-index') - - -def test_replication_update_continuous(monkeypatch, temp_db_conn, status_table): - status.set_status(temp_db_conn, date=dt.datetime.now(dt.timezone.utc), seq=1) - states = [nominatim.tools.replication.UpdateState.UP_TO_DATE, - nominatim.tools.replication.UpdateState.UP_TO_DATE] - monkeypatch.setattr(nominatim.tools.replication, 'update', - lambda *args, **kwargs: states.pop()) - - index_mock = MockParamCapture() - monkeypatch.setattr(nominatim.indexer.indexer.Indexer, 'index_boundaries', index_mock) - monkeypatch.setattr(nominatim.indexer.indexer.Indexer, 'index_by_rank', index_mock) - - with pytest.raises(IndexError): - call_nominatim('replication') - - assert index_mock.called == 4 - - -def test_replication_update_continuous_no_change(monkeypatch, temp_db_conn, status_table): - status.set_status(temp_db_conn, date=dt.datetime.now(dt.timezone.utc), seq=1) - states = [nominatim.tools.replication.UpdateState.NO_CHANGES, - nominatim.tools.replication.UpdateState.UP_TO_DATE] - monkeypatch.setattr(nominatim.tools.replication, 'update', - lambda *args, **kwargs: states.pop()) - - index_mock = MockParamCapture() - monkeypatch.setattr(nominatim.indexer.indexer.Indexer, 'index_boundaries', index_mock) - monkeypatch.setattr(nominatim.indexer.indexer.Indexer, 'index_by_rank', index_mock) - - sleep_mock = MockParamCapture() - monkeypatch.setattr(time, 'sleep', sleep_mock) - - with pytest.raises(IndexError): - call_nominatim('replication') - - assert index_mock.called == 2 - assert sleep_mock.called == 1 - assert sleep_mock.last_args[0] == 60 - - def test_serve_command(mock_func_factory): func = mock_func_factory(nominatim.cli, 'run_php_server') diff --git a/test/python/test_cli_replication.py b/test/python/test_cli_replication.py new file mode 100644 index 
00000000..a62ad1a4 --- /dev/null +++ b/test/python/test_cli_replication.py @@ -0,0 +1,127 @@ +""" +Tests for replication command of command-line interface wrapper. +""" +import datetime as dt +import time +from pathlib import Path + +import pytest + +import nominatim.cli +import nominatim.indexer.indexer +import nominatim.tools.replication +from nominatim.db import status + +from mocks import MockParamCapture + +SRC_DIR = (Path(__file__) / '..' / '..' / '..').resolve() + +def call_nominatim(*args): + return nominatim.cli.nominatim(module_dir='build/module', + osm2pgsql_path='build/osm2pgsql/osm2pgsql', + phplib_dir=str(SRC_DIR / 'lib-php'), + data_dir=str(SRC_DIR / 'data'), + phpcgi_path='/usr/bin/php-cgi', + sqllib_dir=str(SRC_DIR / 'lib-sql'), + config_dir=str(SRC_DIR / 'settings'), + cli_args=['replication'] + list(args)) + +@pytest.fixture +def index_mock(monkeypatch): + mock = MockParamCapture() + monkeypatch.setattr(nominatim.indexer.indexer.Indexer, 'index_boundaries', mock) + monkeypatch.setattr(nominatim.indexer.indexer.Indexer, 'index_by_rank', mock) + + return mock + + +@pytest.fixture +def mock_func_factory(monkeypatch): + def get_mock(module, func): + mock = MockParamCapture() + monkeypatch.setattr(module, func, mock) + return mock + + return get_mock + + +@pytest.fixture +def init_status(temp_db_conn, status_table): + status.set_status(temp_db_conn, date=dt.datetime.now(dt.timezone.utc), seq=1) + return 1 + + +@pytest.fixture +def update_mock(mock_func_factory, init_status): + return mock_func_factory(nominatim.tools.replication, 'update') + +@pytest.mark.parametrize("params,func", [ + (('--init', '--no-update-functions'), 'init_replication'), + (('--check-for-updates',), 'check_for_updates') + ]) +def test_replication_command(mock_func_factory, temp_db, params, func): + func_mock = mock_func_factory(nominatim.tools.replication, func) + + assert 0 == call_nominatim(*params) + assert func_mock.called == 1 + + +def test_replication_update_bad_interval(monkeypatch, temp_db): + monkeypatch.setenv('NOMINATIM_REPLICATION_UPDATE_INTERVAL', 'xx') + + assert call_nominatim() == 1 + + +def test_replication_update_bad_interval_for_geofabrik(monkeypatch, temp_db): + monkeypatch.setenv('NOMINATIM_REPLICATION_URL', + 'https://download.geofabrik.de/europe/ireland-and-northern-ireland-updates') + + assert call_nominatim() == 1 + + +def test_replication_update_once_no_index(update_mock): + assert 0 == call_nominatim('--once', '--no-index') + + assert str(update_mock.last_args[1]['osm2pgsql']) == 'build/osm2pgsql/osm2pgsql' + + +def test_replication_update_custom_osm2pgsql(monkeypatch, update_mock): + monkeypatch.setenv('NOMINATIM_OSM2PGSQL_BINARY', '/secret/osm2pgsql') + assert 0 == call_nominatim('--once', '--no-index') + + assert str(update_mock.last_args[1]['osm2pgsql']) == '/secret/osm2pgsql' + + +def test_replication_update_custom_threads(update_mock): + assert 0 == call_nominatim('--once', '--no-index', '--threads', '4') + + assert update_mock.last_args[1]['threads'] == 4 + + +def test_replication_update_continuous(monkeypatch, init_status, index_mock): + states = [nominatim.tools.replication.UpdateState.UP_TO_DATE, + nominatim.tools.replication.UpdateState.UP_TO_DATE] + monkeypatch.setattr(nominatim.tools.replication, 'update', + lambda *args, **kwargs: states.pop()) + + with pytest.raises(IndexError): + call_nominatim() + + assert index_mock.called == 4 + + +def test_replication_update_continuous_no_change(monkeypatch, init_status, index_mock): + states = 
[nominatim.tools.replication.UpdateState.NO_CHANGES, + nominatim.tools.replication.UpdateState.UP_TO_DATE] + monkeypatch.setattr(nominatim.tools.replication, 'update', + lambda *args, **kwargs: states.pop()) + + sleep_mock = MockParamCapture() + monkeypatch.setattr(time, 'sleep', sleep_mock) + + with pytest.raises(IndexError): + call_nominatim() + + assert index_mock.called == 2 + assert sleep_mock.called == 1 + assert sleep_mock.last_args[0] == 60 From 32683f73c787464e16f2a146d4c08c4041087dd5 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Wed, 24 Feb 2021 17:21:45 +0100 Subject: [PATCH 06/18] move import-data option to native python This adds a new dependecy to the Python psutil package. --- .pylintrc | 11 +++++ CMakeLists.txt | 2 +- docs/admin/Installation.md | 3 +- lib-php/Shell.php | 2 +- lib-php/admin/setup.php | 17 ++++++- lib-php/setup/SetupClass.php | 60 ----------------------- nominatim/clicmd/args.py | 11 +++-- nominatim/clicmd/index.py | 12 +---- nominatim/clicmd/transition.py | 18 +++++++ nominatim/db/connection.py | 11 +++++ nominatim/tools/database_import.py | 47 ++++++++++++++++-- nominatim/tools/exec_utils.py | 9 ++++ test/python/conftest.py | 11 ++++- test/python/test_cli.py | 2 + test/python/test_db_connection.py | 17 +++++++ test/python/test_tools_database_import.py | 40 +++++++++++++++ test/python/test_tools_exec_utils.py | 17 +++++-- vagrant/Install-on-Centos-7.sh | 2 +- vagrant/Install-on-Centos-8.sh | 2 +- vagrant/Install-on-Ubuntu-18.sh | 2 +- vagrant/Install-on-Ubuntu-20.sh | 2 +- 21 files changed, 205 insertions(+), 93 deletions(-) create mode 100644 .pylintrc diff --git a/.pylintrc b/.pylintrc new file mode 100644 index 00000000..da6dbe03 --- /dev/null +++ b/.pylintrc @@ -0,0 +1,11 @@ +[MASTER] + +extension-pkg-whitelist=osmium + +[MESSAGES CONTROL] + +[TYPECHECK] + +# closing added here because it sometimes triggers a false positive with +# 'with' statements. +ignored-classes=NominatimArgs,closing diff --git a/CMakeLists.txt b/CMakeLists.txt index 5f8e4611..2b4c2976 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -177,7 +177,7 @@ if (BUILD_TESTS) if (PYLINT) message(STATUS "Using pylint binary ${PYLINT}") add_test(NAME pylint - COMMAND ${PYLINT} --extension-pkg-whitelist=osmium nominatim + COMMAND ${PYLINT} nominatim WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}) else() message(WARNING "pylint not found. 
Python linting tests disabled.") diff --git a/docs/admin/Installation.md b/docs/admin/Installation.md index 05e57f93..4360c9a7 100644 --- a/docs/admin/Installation.md +++ b/docs/admin/Installation.md @@ -41,10 +41,11 @@ For running Nominatim: * [Python 3](https://www.python.org/) (3.5+) * [Psycopg2](https://www.psycopg.org) (2.7+) * [Python Dotenv](https://github.com/theskumar/python-dotenv) + * [psutil] (https://github.com/giampaolo/psutil) * [PHP](https://php.net) (7.0 or later) * PHP-pgsql * PHP-intl (bundled with PHP) - ( PHP-cgi (for running queries from the command line) + * PHP-cgi (for running queries from the command line) For running continuous updates: diff --git a/lib-php/Shell.php b/lib-php/Shell.php index 52a7d8fb..b43db135 100644 --- a/lib-php/Shell.php +++ b/lib-php/Shell.php @@ -48,7 +48,7 @@ class Shell return join(' ', $aEscaped); } - public function run($bExitOnFail = False) + public function run($bExitOnFail = false) { $sCmd = $this->escapedCmd(); // $aEnv does not need escaping, proc_open seems to handle it fine diff --git a/lib-php/admin/setup.php b/lib-php/admin/setup.php index 902c24e0..d3831322 100644 --- a/lib-php/admin/setup.php +++ b/lib-php/admin/setup.php @@ -86,12 +86,25 @@ if ($aCMDResult['create-db'] || $aCMDResult['all']) { if ($aCMDResult['setup-db'] || $aCMDResult['all']) { $bDidSomething = true; - (clone($oNominatimCmd))->addParams('transition', '--setup-db')->run(true); + $oCmd = (clone($oNominatimCmd))->addParams('transition', '--setup-db'); + + if ($aCMDResult['no-partitions'] ?? false) { + $oCmd->addParams('--no-partitions'); + } + + $oCmd->run(true); } if ($aCMDResult['import-data'] || $aCMDResult['all']) { $bDidSomething = true; - $oSetup->importData($aCMDResult['osm-file']); + $oCmd = (clone($oNominatimCmd)) + ->addParams('transition', '--import-data') + ->addParams('--osm-file', $aCMDResult['osm-file']); + if ($aCMDResult['drop'] ?? 
false) { + $oCmd->addParams('--drop'); + } + + $oCmd->run(true); } if ($aCMDResult['create-functions'] || $aCMDResult['all']) { diff --git a/lib-php/setup/SetupClass.php b/lib-php/setup/SetupClass.php index 1e7dccb4..e11ecd13 100755 --- a/lib-php/setup/SetupClass.php +++ b/lib-php/setup/SetupClass.php @@ -84,66 +84,6 @@ class SetupFunctions } } - public function importData($sOSMFile) - { - info('Import data'); - - if (!file_exists(getOsm2pgsqlBinary())) { - echo "Check NOMINATIM_OSM2PGSQL_BINARY in your local .env file.\n"; - echo "Normally you should not need to set this manually.\n"; - fail("osm2pgsql not found in '".getOsm2pgsqlBinary()."'"); - } - - $oCmd = new \Nominatim\Shell(getOsm2pgsqlBinary()); - $oCmd->addParams('--style', getImportStyle()); - - if (getSetting('FLATNODE_FILE')) { - $oCmd->addParams('--flat-nodes', getSetting('FLATNODE_FILE')); - } - if (getSetting('TABLESPACE_OSM_DATA')) { - $oCmd->addParams('--tablespace-slim-data', getSetting('TABLESPACE_OSM_DATA')); - } - if (getSetting('TABLESPACE_OSM_INDEX')) { - $oCmd->addParams('--tablespace-slim-index', getSetting('TABLESPACE_OSM_INDEX')); - } - if (getSetting('TABLESPACE_PLACE_DATA')) { - $oCmd->addParams('--tablespace-main-data', getSetting('TABLESPACE_PLACE_DATA')); - } - if (getSetting('TABLESPACE_PLACE_INDEX')) { - $oCmd->addParams('--tablespace-main-index', getSetting('TABLESPACE_PLACE_INDEX')); - } - $oCmd->addParams('--latlong', '--slim', '--create'); - $oCmd->addParams('--output', 'gazetteer'); - $oCmd->addParams('--hstore'); - $oCmd->addParams('--number-processes', 1); - $oCmd->addParams('--with-forward-dependencies', 'false'); - $oCmd->addParams('--log-progress', 'true'); - $oCmd->addParams('--cache', $this->iCacheMemory); - $oCmd->addParams('--port', $this->aDSNInfo['port']); - - if (isset($this->aDSNInfo['username'])) { - $oCmd->addParams('--username', $this->aDSNInfo['username']); - } - if (isset($this->aDSNInfo['password'])) { - $oCmd->addEnvPair('PGPASSWORD', $this->aDSNInfo['password']); - } - if (isset($this->aDSNInfo['hostspec'])) { - $oCmd->addParams('--host', $this->aDSNInfo['hostspec']); - } - $oCmd->addParams('--database', $this->aDSNInfo['database']); - $oCmd->addParams($sOSMFile); - $oCmd->run(); - - if (!$this->sIgnoreErrors && !$this->db()->getRow('select * from place limit 1')) { - fail('No Data'); - } - - if ($this->bDrop) { - $this->dropTable('planet_osm_nodes'); - $this->removeFlatnodeFile(); - } - } - public function createFunctions() { info('Create Functions'); diff --git a/nominatim/clicmd/args.py b/nominatim/clicmd/args.py index f9b94ef2..47007579 100644 --- a/nominatim/clicmd/args.py +++ b/nominatim/clicmd/args.py @@ -3,7 +3,7 @@ Provides custom functions over command-line arguments. """ -class NominatimArgs: +class NominatimArgs: # pylint: disable=too-few-public-methods """ Customized namespace class for the nominatim command line tool to receive the command-line arguments. 
""" @@ -18,5 +18,10 @@ class NominatimArgs: osm2pgsql_style=self.config.get_import_style_file(), threads=self.threads or default_threads, dsn=self.config.get_libpq_dsn(), - flatnode_file=self.config.FLATNODE_FILE) - + flatnode_file=self.config.FLATNODE_FILE, + tablespaces=dict(slim_data=self.config.TABLESPACE_OSM_DATA, + slim_index=self.config.TABLESPACE_OSM_INDEX, + main_data=self.config.TABLESPACE_PLACE_DATA, + main_index=self.config.TABLESPACE_PLACE_INDEX + ) + ) diff --git a/nominatim/clicmd/index.py b/nominatim/clicmd/index.py index 96a69396..0225c5ed 100644 --- a/nominatim/clicmd/index.py +++ b/nominatim/clicmd/index.py @@ -1,7 +1,7 @@ """ Implementation of the 'index' subcommand. """ -import os +import psutil from ..db import status from ..db.connection import connect @@ -11,14 +11,6 @@ from ..db.connection import connect # Using non-top-level imports to avoid eventually unused imports. # pylint: disable=E0012,C0415 -def _num_system_cpus(): - try: - cpus = len(os.sched_getaffinity(0)) - except NotImplementedError: - cpus = None - - return cpus or os.cpu_count() - class UpdateIndex: """\ @@ -42,7 +34,7 @@ class UpdateIndex: from ..indexer.indexer import Indexer indexer = Indexer(args.config.get_libpq_dsn(), - args.threads or _num_system_cpus() or 1) + args.threads or psutil.cpu_count() or 1) if not args.no_boundaries: indexer.index_boundaries(args.minrank, args.maxrank) diff --git a/nominatim/clicmd/transition.py b/nominatim/clicmd/transition.py index 2f351be2..eb4e2d2f 100644 --- a/nominatim/clicmd/transition.py +++ b/nominatim/clicmd/transition.py @@ -6,8 +6,10 @@ through the PHP scripts but are now no longer directly accessible. This module will be removed as soon as the transition phase is over. """ import logging +from pathlib import Path from ..db.connection import connect +from ..errors import UsageError # Do not repeat documentation of subcommand classes. # pylint: disable=C0111 @@ -28,9 +30,17 @@ class AdminTransition: help='Create nominatim db') group.add_argument('--setup-db', action='store_true', help='Build a blank nominatim db') + group.add_argument('--import-data', action='store_true', + help='Import a osm file') group = parser.add_argument_group('Options') group.add_argument('--no-partitions', action='store_true', help='Do not partition search indices') + group.add_argument('--osm-file', metavar='FILE', + help='File to import') + group.add_argument('--drop', action='store_true', + help='Drop tables needed for updates, making the database readonly') + group.add_argument('--osm2pgsql-cache', metavar='SIZE', type=int, + help='Size of cache to be used by osm2pgsql (in MB)') @staticmethod def run(args): @@ -51,3 +61,11 @@ class AdminTransition: database_import.import_base_data(args.config.get_libpq_dsn(), args.data_dir, args.no_partitions) + + if args.import_data: + LOG.warning('Import data') + if not args.osm_file: + raise UsageError('Missing required --osm-file argument') + database_import.import_osm_data(Path(args.osm_file), + args.osm2pgsql_options(0, 1), + drop=args.drop) diff --git a/nominatim/db/connection.py b/nominatim/db/connection.py index 12f5f4e1..5aa05ced 100644 --- a/nominatim/db/connection.py +++ b/nominatim/db/connection.py @@ -75,6 +75,17 @@ class _Connection(psycopg2.extensions.connection): return True + def drop_table(self, name, if_exists=True): + """ Drop the table with the given name. + Set `if_exists` to False if a non-existant table should raise + an exception instead of just being ignored. 
+ """ + with self.cursor() as cur: + cur.execute("""DROP TABLE {} "{}" + """.format('IF EXISTS' if if_exists else '', name)) + self.commit() + + def server_version_tuple(self): """ Return the server version as a tuple of (major, minor). Converts correctly for pre-10 and post-10 PostgreSQL versions. diff --git a/nominatim/tools/database_import.py b/nominatim/tools/database_import.py index 89e01f66..00ec95c0 100644 --- a/nominatim/tools/database_import.py +++ b/nominatim/tools/database_import.py @@ -2,11 +2,16 @@ Functions for setting up and importing a new Nominatim database. """ import logging +import os import subprocess import shutil +from pathlib import Path + +import psutil from ..db.connection import connect, get_pg_env from ..db import utils as db_utils +from .exec_utils import run_osm2pgsql from ..errors import UsageError from ..version import POSTGRESQL_REQUIRED_VERSION, POSTGIS_REQUIRED_VERSION @@ -28,7 +33,7 @@ def create_db(dsn, rouser=None): raise UsageError('Creating new database failed.') with connect(dsn) as conn: - postgres_version = conn.server_version_tuple() # pylint: disable=E1101 + postgres_version = conn.server_version_tuple() if postgres_version < POSTGRESQL_REQUIRED_VERSION: LOG.fatal('Minimum supported version of Postgresql is %d.%d. ' 'Found version %d.%d.', @@ -37,7 +42,7 @@ def create_db(dsn, rouser=None): raise UsageError('PostgreSQL server is too old.') if rouser is not None: - with conn.cursor() as cur: # pylint: disable=E1101 + with conn.cursor() as cur: cnt = cur.scalar('SELECT count(*) FROM pg_user where usename = %s', (rouser, )) if cnt == 0: @@ -109,13 +114,45 @@ def check_module_dir_path(conn, path): def import_base_data(dsn, sql_dir, ignore_partitions=False): """ Create and populate the tables with basic static data that provides - the background for geocoding. + the background for geocoding. Data is assumed to not yet exist. """ db_utils.execute_file(dsn, sql_dir / 'country_name.sql') db_utils.execute_file(dsn, sql_dir / 'country_osm_grid.sql.gz') if ignore_partitions: with connect(dsn) as conn: - with conn.cursor() as cur: # pylint: disable=E1101 + with conn.cursor() as cur: cur.execute('UPDATE country_name SET partition = 0') - conn.commit() # pylint: disable=E1101 + conn.commit() + + +def import_osm_data(osm_file, options, drop=False): + """ Import the given OSM file. 'options' contains the list of + default settings for osm2pgsql. + """ + options['import_file'] = osm_file + options['append'] = False + options['threads'] = 1 + + if not options['flatnode_file'] and options['osm2pgsql_cache'] == 0: + # Make some educated guesses about cache size based on the size + # of the import file and the available memory. 
+ mem = psutil.virtual_memory() + fsize = os.stat(str(osm_file)).st_size + options['osm2pgsql_cache'] = int(min((mem.available + mem.cached) * 0.75, + fsize * 2) / 1024 / 1024) + 1 + + run_osm2pgsql(options) + + with connect(options['dsn']) as conn: + with conn.cursor() as cur: + cur.execute('SELECT * FROM place LIMIT 1') + if cur.rowcount == 0: + raise UsageError('No data imported by osm2pgsql.') + + if drop: + conn.drop_table('planet_osm_nodes') + + if drop: + if options['flatnode_file']: + Path(options['flatnode_file']).unlink() diff --git a/nominatim/tools/exec_utils.py b/nominatim/tools/exec_utils.py index b2ccc4a2..b476c123 100644 --- a/nominatim/tools/exec_utils.py +++ b/nominatim/tools/exec_utils.py @@ -110,10 +110,19 @@ def run_osm2pgsql(options): ] if options['append']: cmd.append('--append') + else: + cmd.append('--create') if options['flatnode_file']: cmd.extend(('--flat-nodes', options['flatnode_file'])) + for key, param in (('slim_data', '--tablespace-slim-data'), + ('slim_index', '--tablespace-slim-index'), + ('main_data', '--tablespace-main-data'), + ('main_index', '--tablespace-main-index')): + if options['tablespaces'][key]: + cmd.extend((param, options['tablespaces'][key])) + if options.get('disable_jit', False): env['PGOPTIONS'] = '-c jit=off -c max_parallel_workers_per_gather=0' diff --git a/test/python/conftest.py b/test/python/conftest.py index 6d4d3ac9..f0569ab8 100644 --- a/test/python/conftest.py +++ b/test/python/conftest.py @@ -198,4 +198,13 @@ def placex_table(temp_db_with_extensions, temp_db_conn): temp_db_conn.commit() - +@pytest.fixture +def osm2pgsql_options(temp_db): + return dict(osm2pgsql='echo', + osm2pgsql_cache=10, + osm2pgsql_style='style.file', + threads=1, + dsn='dbname=' + temp_db, + flatnode_file='', + tablespaces=dict(slim_data='', slim_index='', + main_data='', main_index='')) diff --git a/test/python/test_cli.py b/test/python/test_cli.py index 2b4240b4..9170406d 100644 --- a/test/python/test_cli.py +++ b/test/python/test_cli.py @@ -40,6 +40,7 @@ def mock_run_legacy(monkeypatch): monkeypatch.setattr(nominatim.cli, 'run_legacy_script', mock) return mock + @pytest.fixture def mock_func_factory(monkeypatch): def get_mock(module, func): @@ -49,6 +50,7 @@ def mock_func_factory(monkeypatch): return get_mock + def test_cli_help(capsys): """ Running nominatim tool without arguments prints help. """ diff --git a/test/python/test_db_connection.py b/test/python/test_db_connection.py index dcbfb8bf..635465f5 100644 --- a/test/python/test_db_connection.py +++ b/test/python/test_db_connection.py @@ -2,6 +2,7 @@ Tests for specialised conenction and cursor classes. 
""" import pytest +import psycopg2 from nominatim.db.connection import connect, get_pg_env @@ -30,6 +31,22 @@ def test_connection_index_exists(db, temp_db_cursor): assert db.index_exists('some_index', table='bar') == False +def test_drop_table_existing(db, temp_db_cursor): + temp_db_cursor.execute('CREATE TABLE dummy (id INT)') + + assert db.table_exists('dummy') + db.drop_table('dummy') + assert not db.table_exists('dummy') + + +def test_drop_table_non_existsing(db): + db.drop_table('dfkjgjriogjigjgjrdghehtre') + + +def test_drop_table_non_existing_force(db): + with pytest.raises(psycopg2.ProgrammingError, match='.*does not exist.*'): + db.drop_table('dfkjgjriogjigjgjrdghehtre', if_exists=False) + def test_connection_server_version_tuple(db): ver = db.server_version_tuple() diff --git a/test/python/test_tools_database_import.py b/test/python/test_tools_database_import.py index a4ab16f9..597fdfc1 100644 --- a/test/python/test_tools_database_import.py +++ b/test/python/test_tools_database_import.py @@ -4,6 +4,7 @@ Tests for functions to import a new database. import pytest import psycopg2 import sys +from pathlib import Path from nominatim.tools import database_import from nominatim.errors import UsageError @@ -94,3 +95,42 @@ def test_import_base_data_ignore_partitions(src_dir, temp_db, temp_db_cursor): assert temp_db_cursor.scalar('SELECT count(*) FROM country_name') > 0 assert temp_db_cursor.scalar('SELECT count(*) FROM country_name WHERE partition != 0') == 0 + + +def test_import_osm_data_simple(temp_db_cursor,osm2pgsql_options): + temp_db_cursor.execute('CREATE TABLE place (id INT)') + temp_db_cursor.execute('INSERT INTO place values (1)') + + database_import.import_osm_data('file.pdf', osm2pgsql_options) + + +def test_import_osm_data_simple_no_data(temp_db_cursor,osm2pgsql_options): + temp_db_cursor.execute('CREATE TABLE place (id INT)') + + with pytest.raises(UsageError, match='No data.*'): + database_import.import_osm_data('file.pdf', osm2pgsql_options) + + +def test_import_osm_data_drop(temp_db_conn, temp_db_cursor, tmp_path, osm2pgsql_options): + temp_db_cursor.execute('CREATE TABLE place (id INT)') + temp_db_cursor.execute('CREATE TABLE planet_osm_nodes (id INT)') + temp_db_cursor.execute('INSERT INTO place values (1)') + + flatfile = tmp_path / 'flatfile' + flatfile.write_text('touch') + + osm2pgsql_options['flatnode_file'] = str(flatfile.resolve()) + + database_import.import_osm_data('file.pdf', osm2pgsql_options, drop=True) + + assert not flatfile.exists() + assert not temp_db_conn.table_exists('planet_osm_nodes') + + +def test_import_osm_data_default_cache(temp_db_cursor,osm2pgsql_options): + temp_db_cursor.execute('CREATE TABLE place (id INT)') + temp_db_cursor.execute('INSERT INTO place values (1)') + + osm2pgsql_options['osm2pgsql_cache'] = 0 + + database_import.import_osm_data(Path(__file__), osm2pgsql_options) diff --git a/test/python/test_tools_exec_utils.py b/test/python/test_tools_exec_utils.py index 283f486a..8f60ac74 100644 --- a/test/python/test_tools_exec_utils.py +++ b/test/python/test_tools_exec_utils.py @@ -105,8 +105,15 @@ def test_run_api_with_extra_env(tmp_project_dir): ### run_osm2pgsql -def test_run_osm2pgsql(): - exec_utils.run_osm2pgsql(dict(osm2pgsql='echo', append=False, flatnode_file=None, - dsn='dbname=foobar', threads=1, osm2pgsql_cache=500, - osm2pgsql_style='./my.style', - import_file='foo.bar')) +def test_run_osm2pgsql(osm2pgsql_options): + osm2pgsql_options['append'] = False + osm2pgsql_options['import_file'] = 'foo.bar' + 
osm2pgsql_options['tablespaces']['osm_data'] = 'extra' + exec_utils.run_osm2pgsql(osm2pgsql_options) + + +def test_run_osm2pgsql_disable_jit(osm2pgsql_options): + osm2pgsql_options['append'] = True + osm2pgsql_options['import_file'] = 'foo.bar' + osm2pgsql_options['disable_jit'] = True + exec_utils.run_osm2pgsql(osm2pgsql_options) diff --git a/vagrant/Install-on-Centos-7.sh b/vagrant/Install-on-Centos-7.sh index eb16f873..e0ab7470 100755 --- a/vagrant/Install-on-Centos-7.sh +++ b/vagrant/Install-on-Centos-7.sh @@ -42,7 +42,7 @@ python3-pip python3-setuptools python3-devel \ expat-devel zlib-devel - pip3 install --user psycopg2 python-dotenv + pip3 install --user psycopg2 python-dotenv psutil # diff --git a/vagrant/Install-on-Centos-8.sh b/vagrant/Install-on-Centos-8.sh index 1cf93a1f..0018a452 100755 --- a/vagrant/Install-on-Centos-8.sh +++ b/vagrant/Install-on-Centos-8.sh @@ -35,7 +35,7 @@ python3-pip python3-setuptools python3-devel \ expat-devel zlib-devel - pip3 install --user psycopg2 python-dotenv + pip3 install --user psycopg2 python-dotenv psutil # diff --git a/vagrant/Install-on-Ubuntu-18.sh b/vagrant/Install-on-Ubuntu-18.sh index 527ded09..d8231908 100755 --- a/vagrant/Install-on-Ubuntu-18.sh +++ b/vagrant/Install-on-Ubuntu-18.sh @@ -30,7 +30,7 @@ export DEBIAN_FRONTEND=noninteractive #DOCS: postgresql-server-dev-10 postgresql-10-postgis-2.4 \ postgresql-contrib-10 postgresql-10-postgis-scripts \ php php-pgsql php-intl python3-pip \ - python3-psycopg2 git + python3-psycopg2 python3-psutil git # The python-dotenv package that comes with Ubuntu 18.04 is too old, so # install the latest version from pip: diff --git a/vagrant/Install-on-Ubuntu-20.sh b/vagrant/Install-on-Ubuntu-20.sh index bf8120c4..6f03bc72 100755 --- a/vagrant/Install-on-Ubuntu-20.sh +++ b/vagrant/Install-on-Ubuntu-20.sh @@ -33,7 +33,7 @@ export DEBIAN_FRONTEND=noninteractive #DOCS: postgresql-server-dev-12 postgresql-12-postgis-3 \ postgresql-contrib-12 postgresql-12-postgis-3-scripts \ php php-pgsql php-intl python3-dotenv \ - python3-psycopg2 git + python3-psycopg2 python3-psutil git # # System Configuration From c7fd0a7af4ce261e61feb0e63d0a6db736280081 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Wed, 24 Feb 2021 22:02:13 +0100 Subject: [PATCH 07/18] port wikipedia importance functions to python --- lib-php/admin/setup.php | 6 +-- lib-php/admin/update.php | 15 +----- lib-php/setup/SetupClass.php | 52 +------------------ nominatim/clicmd/refresh.py | 17 ++++-- nominatim/db/utils.py | 13 ++++- nominatim/tools/refresh.py | 47 +++++++++++++++++ test/python/conftest.py | 14 +++++ test/python/test_cli.py | 26 ++++------ test/python/test_db_connection.py | 14 ++--- test/python/test_db_utils.py | 28 ++++++++-- .../test_tools_refresh_address_levels.py | 3 +- 11 files changed, 132 insertions(+), 103 deletions(-) diff --git a/lib-php/admin/setup.php b/lib-php/admin/setup.php index d3831322..91b72c15 100644 --- a/lib-php/admin/setup.php +++ b/lib-php/admin/setup.php @@ -131,7 +131,7 @@ if ($aCMDResult['create-partition-functions'] || $aCMDResult['all']) { if ($aCMDResult['import-wikipedia-articles'] || $aCMDResult['all']) { $bDidSomething = true; - $oSetup->importWikipediaArticles(); + (clone($oNominatimCmd))->addParams('refresh', '--wiki-data')->run(); } if ($aCMDResult['load-data'] || $aCMDResult['all']) { @@ -157,7 +157,7 @@ if ($aCMDResult['index'] || $aCMDResult['all']) { if ($aCMDResult['drop']) { $bDidSomething = true; - $oSetup->drop($aCMDResult); + (clone($oNominatimCmd))->addParams('freeze')->run(true); } 
if ($aCMDResult['create-search-indices'] || $aCMDResult['all']) { @@ -172,7 +172,7 @@ if ($aCMDResult['create-country-names'] || $aCMDResult['all']) { if ($aCMDResult['setup-website'] || $aCMDResult['all']) { $bDidSomething = true; - $oSetup->setupWebsite(); + (clone($oNominatimCmd))->addParams('refresh', '--website')->run(true); } // ****************************************************** diff --git a/lib-php/admin/update.php b/lib-php/admin/update.php index fba5300b..4f639c8d 100644 --- a/lib-php/admin/update.php +++ b/lib-php/admin/update.php @@ -211,20 +211,7 @@ if ($aResult['update-address-levels']) { } if ($aResult['recompute-importance']) { - echo "Updating importance values for database.\n"; - $oDB = new Nominatim\DB(); - $oDB->connect(); - - $sSQL = 'ALTER TABLE placex DISABLE TRIGGER ALL;'; - $sSQL .= 'UPDATE placex SET (wikipedia, importance) ='; - $sSQL .= ' (SELECT wikipedia, importance'; - $sSQL .= ' FROM compute_importance(extratags, country_code, osm_type, osm_id));'; - $sSQL .= 'UPDATE placex s SET wikipedia = d.wikipedia, importance = d.importance'; - $sSQL .= ' FROM placex d'; - $sSQL .= ' WHERE s.place_id = d.linked_place_id and d.wikipedia is not null'; - $sSQL .= ' and (s.wikipedia is null or s.importance < d.importance);'; - $sSQL .= 'ALTER TABLE placex ENABLE TRIGGER ALL;'; - $oDB->exec($sSQL); + (clone($oNominatimCmd))->addParams('refresh', '--importance')->run(true); } if ($aResult['import-osmosis'] || $aResult['import-osmosis-all']) { diff --git a/lib-php/setup/SetupClass.php b/lib-php/setup/SetupClass.php index e11ecd13..66093322 100755 --- a/lib-php/setup/SetupClass.php +++ b/lib-php/setup/SetupClass.php @@ -6,7 +6,6 @@ require_once(CONST_LibDir.'/Shell.php'); class SetupFunctions { - protected $iCacheMemory; protected $iInstances; protected $aDSNInfo; protected $bQuiet; @@ -31,16 +30,6 @@ class SetupFunctions warn('resetting threads to '.$this->iInstances); } - if (isset($aCMDResult['osm2pgsql-cache'])) { - $this->iCacheMemory = $aCMDResult['osm2pgsql-cache']; - } elseif (getSetting('FLATNODE_FILE')) { - // When flatnode files are enabled then disable cache per default. - $this->iCacheMemory = 0; - } else { - // Otherwise: Assume we can steal all the cache memory in the box. 
- $this->iCacheMemory = getCacheMemoryMB(); - } - // parse database string $this->aDSNInfo = \Nominatim\DB::parseDSN(getSetting('DATABASE_DSN')); if (!isset($this->aDSNInfo['port'])) { @@ -82,6 +71,7 @@ class SetupFunctions if ($this->bVerbose) { $this->oNominatimCmd->addParams('--verbose'); } + $this->oNominatimCmd->addParams('--threads', $this->iInstances); } public function createFunctions() @@ -136,20 +126,6 @@ class SetupFunctions $this->createSqlFunctions(); // also create partition functions } - public function importWikipediaArticles() - { - $sWikiArticlePath = getSetting('WIKIPEDIA_DATA_PATH', CONST_InstallDir); - $sWikiArticlesFile = $sWikiArticlePath.'/wikimedia-importance.sql.gz'; - if (file_exists($sWikiArticlesFile)) { - info('Importing wikipedia articles and redirects'); - $this->dropTable('wikipedia_article'); - $this->dropTable('wikipedia_redirect'); - $this->pgsqlRunScriptFile($sWikiArticlesFile); - } else { - warn('wikipedia importance dump file not found - places will have default importance'); - } - } - public function loadData($bDisableTokenPrecalc) { info('Drop old Data'); @@ -505,21 +481,6 @@ class SetupFunctions $this->pgsqlRunScript($sSQL); } - public function drop() - { - (clone($this->oNominatimCmd))->addParams('freeze')->run(); - } - - /** - * Setup the directory for the API scripts. - * - * @return null - */ - public function setupWebsite() - { - (clone($this->oNominatimCmd))->addParams('refresh', '--website')->run(); - } - /** * Return the connection to the database. * @@ -538,15 +499,6 @@ class SetupFunctions return $this->oDB; } - private function removeFlatnodeFile() - { - $sFName = getSetting('FLATNODE_FILE'); - if ($sFName && file_exists($sFName)) { - if ($this->bVerbose) echo 'Deleting '.$sFName."\n"; - unlink($sFName); - } - } - private function pgsqlRunScript($sScript, $bfatal = true) { runSQLScript( @@ -570,7 +522,7 @@ class SetupFunctions $oCmd->addParams('--enable-debug-statements'); } - $oCmd->run(); + $oCmd->run(!$this->sIgnoreErrors); } private function pgsqlRunPartitionScript($sTemplate) diff --git a/nominatim/clicmd/refresh.py b/nominatim/clicmd/refresh.py index 5dca41de..9dca4e42 100644 --- a/nominatim/clicmd/refresh.py +++ b/nominatim/clicmd/refresh.py @@ -5,7 +5,6 @@ import logging from pathlib import Path from ..db.connection import connect -from ..tools.exec_utils import run_legacy_script # Do not repeat documentation of subcommand classes. # pylint: disable=C0111 @@ -69,12 +68,20 @@ class UpdateRefresh: args.diffs, args.enable_debug_statements) if args.wiki_data: - run_legacy_script('setup.php', '--import-wikipedia-articles', - nominatim_env=args, throw_on_fail=True) + data_path = Path(args.config.WIKIPEDIA_DATA_PATH + or args.project_dir) + LOG.warning('Import wikipdia article importance from %s', data_path) + if refresh.import_wikipedia_articles(args.config.get_libpq_dsn(), + data_path) > 0: + LOG.fatal('FATAL: Wikipedia importance dump file not found') + return 1 + # Attention: importance MUST come after wiki data import. 
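+        # (recompute_importance() below recalculates importance via the
+        #  compute_importance() DB function, which is meant to pick up the
+        #  wikipedia tables that --wiki-data has just replaced.)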
if args.importance: - run_legacy_script('update.php', '--recompute-importance', - nominatim_env=args, throw_on_fail=True) + LOG.warning('Update importance values for database') + with connect(args.config.get_libpq_dsn()) as conn: + refresh.recompute_importance(conn) + if args.website: webdir = args.project_dir / 'website' LOG.warning('Setting up website directory at %s', webdir) diff --git a/nominatim/db/utils.py b/nominatim/db/utils.py index 575f3010..0490bbc8 100644 --- a/nominatim/db/utils.py +++ b/nominatim/db/utils.py @@ -21,9 +21,12 @@ def _pipe_to_proc(proc, fdesc): return len(chunk) -def execute_file(dsn, fname, ignore_errors=False): +def execute_file(dsn, fname, ignore_errors=False, pre_code=None, post_code=None): """ Read an SQL file and run its contents against the given database - using psql. + using psql. Use `pre_code` and `post_code` to run extra commands + before or after executing the file. The commands are run within the + same session, so they may be used to wrap the file execution in a + transaction. """ cmd = ['psql'] if not ignore_errors: @@ -33,6 +36,9 @@ def execute_file(dsn, fname, ignore_errors=False): if not LOG.isEnabledFor(logging.INFO): proc.stdin.write('set client_min_messages to WARNING;'.encode('utf-8')) + if pre_code: + proc.stdin.write((pre_code + ';').encode('utf-8')) + if fname.suffix == '.gz': with gzip.open(str(fname), 'rb') as fdesc: remain = _pipe_to_proc(proc, fdesc) @@ -40,6 +46,9 @@ def execute_file(dsn, fname, ignore_errors=False): with fname.open('rb') as fdesc: remain = _pipe_to_proc(proc, fdesc) + if remain == 0 and post_code: + proc.stdin.write((';' + post_code).encode('utf-8')) + proc.stdin.close() ret = proc.wait() diff --git a/nominatim/tools/refresh.py b/nominatim/tools/refresh.py index 33efe8f8..b59c37bf 100644 --- a/nominatim/tools/refresh.py +++ b/nominatim/tools/refresh.py @@ -200,6 +200,53 @@ PHP_CONST_DEFS = ( ) +def import_wikipedia_articles(dsn, data_path, ignore_errors=False): + """ Replaces the wikipedia importance tables with new data. + The import is run in a single transaction so that the new data + is replace seemlessly. + + Returns 0 if all was well and 1 if the importance file could not + be found. Throws an exception if there was an error reading the file. + """ + datafile = data_path / 'wikimedia-importance.sql.gz' + + if not datafile.exists(): + return 1 + + pre_code = """BEGIN; + DROP TABLE IF EXISTS "wikipedia_article"; + DROP TABLE IF EXISTS "wikipedia_redirect" + """ + post_code = "COMMIT" + execute_file(dsn, datafile, ignore_errors=ignore_errors, + pre_code=pre_code, post_code=post_code) + + return 0 + + +def recompute_importance(conn): + """ Recompute wikipedia links and importance for all entries in placex. + This is a long-running operations that must not be executed in + parallel with updates. + """ + with conn.cursor() as cur: + cur.execute('ALTER TABLE placex DISABLE TRIGGER ALL') + cur.execute(""" + UPDATE placex SET (wikipedia, importance) = + (SELECT wikipedia, importance + FROM compute_importance(extratags, country_code, osm_type, osm_id)) + """) + cur.execute(""" + UPDATE placex s SET wikipedia = d.wikipedia, importance = d.importance + FROM placex d + WHERE s.place_id = d.linked_place_id and d.wikipedia is not null + and (s.wikipedia is null or s.importance < d.importance); + """) + + cur.execute('ALTER TABLE placex ENABLE TRIGGER ALL') + conn.commit() + + def setup_website(basedir, phplib_dir, config): """ Create the website script stubs. 
""" diff --git a/test/python/conftest.py b/test/python/conftest.py index f0569ab8..40b611c0 100644 --- a/test/python/conftest.py +++ b/test/python/conftest.py @@ -71,6 +71,12 @@ def temp_db(monkeypatch): conn.close() + +@pytest.fixture +def dsn(temp_db): + return 'dbname=' + temp_db + + @pytest.fixture def temp_db_with_extensions(temp_db): conn = psycopg2.connect(database=temp_db) @@ -101,6 +107,14 @@ def temp_db_cursor(temp_db): conn.close() +@pytest.fixture +def table_factory(temp_db_cursor): + def mk_table(name, definition='id INT'): + temp_db_cursor.execute('CREATE TABLE {} ({})'.format(name, definition)) + + return mk_table + + @pytest.fixture def def_config(): return Configuration(None, SRC_DIR.resolve() / 'settings') diff --git a/test/python/test_cli.py b/test/python/test_cli.py index 9170406d..e2a44e37 100644 --- a/test/python/test_cli.py +++ b/test/python/test_cli.py @@ -135,24 +135,13 @@ def test_index_command(mock_func_factory, temp_db_cursor, params, do_bnds, do_ra assert rank_mock.called == do_ranks -@pytest.mark.parametrize("command,params", [ - ('wiki-data', ('setup.php', '--import-wikipedia-articles')), - ('importance', ('update.php', '--recompute-importance')), - ]) -def test_refresh_legacy_command(mock_func_factory, temp_db, command, params): - mock_run_legacy = mock_func_factory(nominatim.clicmd.refresh, 'run_legacy_script') - - assert 0 == call_nominatim('refresh', '--' + command) - - assert mock_run_legacy.called == 1 - assert len(mock_run_legacy.last_args) >= len(params) - assert mock_run_legacy.last_args[:len(params)] == params - @pytest.mark.parametrize("command,func", [ ('postcodes', 'update_postcodes'), ('word-counts', 'recompute_word_counts'), ('address-levels', 'load_address_levels_from_file'), ('functions', 'create_functions'), + ('wiki-data', 'import_wikipedia_articles'), + ('importance', 'recompute_importance'), ('website', 'setup_website'), ]) def test_refresh_command(mock_func_factory, temp_db, command, func): @@ -162,13 +151,16 @@ def test_refresh_command(mock_func_factory, temp_db, command, func): assert func_mock.called == 1 -def test_refresh_importance_computed_after_wiki_import(mock_func_factory, temp_db): - mock_run_legacy = mock_func_factory(nominatim.clicmd.refresh, 'run_legacy_script') +def test_refresh_importance_computed_after_wiki_import(monkeypatch, temp_db): + calls = [] + monkeypatch.setattr(nominatim.tools.refresh, 'import_wikipedia_articles', + lambda *args, **kwargs: calls.append('import') or 0) + monkeypatch.setattr(nominatim.tools.refresh, 'recompute_importance', + lambda *args, **kwargs: calls.append('update')) assert 0 == call_nominatim('refresh', '--importance', '--wiki-data') - assert mock_run_legacy.called == 2 - assert mock_run_legacy.last_args == ('update.php', '--recompute-importance') + assert calls == ['import', 'update'] def test_serve_command(mock_func_factory): diff --git a/test/python/test_db_connection.py b/test/python/test_db_connection.py index 635465f5..ce81c4f3 100644 --- a/test/python/test_db_connection.py +++ b/test/python/test_db_connection.py @@ -12,10 +12,10 @@ def db(temp_db): yield conn -def test_connection_table_exists(db, temp_db_cursor): +def test_connection_table_exists(db, table_factory): assert db.table_exists('foobar') == False - temp_db_cursor.execute('CREATE TABLE foobar (id INT)') + table_factory('foobar') assert db.table_exists('foobar') == True @@ -31,10 +31,10 @@ def test_connection_index_exists(db, temp_db_cursor): assert db.index_exists('some_index', table='bar') == False -def 
test_drop_table_existing(db, temp_db_cursor): - temp_db_cursor.execute('CREATE TABLE dummy (id INT)') - +def test_drop_table_existing(db, table_factory): + table_factory('dummy') assert db.table_exists('dummy') + db.drop_table('dummy') assert not db.table_exists('dummy') @@ -65,8 +65,8 @@ def test_connection_postgis_version_tuple(db, temp_db_cursor): assert ver[0] >= 2 -def test_cursor_scalar(db, temp_db_cursor): - temp_db_cursor.execute('CREATE TABLE dummy (id INT)') +def test_cursor_scalar(db, table_factory): + table_factory('dummy') with db.cursor() as cur: assert cur.scalar('SELECT count(*) FROM dummy') == 0 diff --git a/test/python/test_db_utils.py b/test/python/test_db_utils.py index 1c3b834a..b8a49ccf 100644 --- a/test/python/test_db_utils.py +++ b/test/python/test_db_utils.py @@ -7,10 +7,6 @@ import pytest import nominatim.db.utils as db_utils from nominatim.errors import UsageError -@pytest.fixture -def dsn(temp_db): - return 'dbname=' + temp_db - def test_execute_file_success(dsn, temp_db_cursor, tmp_path): tmpfile = tmp_path / 'test.sql' tmpfile.write_text('CREATE TABLE test (id INT);\nINSERT INTO test VALUES(56);') @@ -40,3 +36,27 @@ def test_execute_file_bad_sql_ignore_errors(dsn, tmp_path): tmpfile.write_text('CREATE STABLE test (id INT)') db_utils.execute_file(dsn, tmpfile, ignore_errors=True) + + +def test_execute_file_with_pre_code(dsn, tmp_path, temp_db_cursor): + tmpfile = tmp_path / 'test.sql' + tmpfile.write_text('INSERT INTO test VALUES(4)') + + db_utils.execute_file(dsn, tmpfile, pre_code='CREATE TABLE test (id INT)') + + temp_db_cursor.execute('SELECT * FROM test') + + assert temp_db_cursor.rowcount == 1 + assert temp_db_cursor.fetchone()[0] == 4 + + +def test_execute_file_with_post_code(dsn, tmp_path, temp_db_cursor): + tmpfile = tmp_path / 'test.sql' + tmpfile.write_text('CREATE TABLE test (id INT)') + + db_utils.execute_file(dsn, tmpfile, post_code='INSERT INTO test VALUES(23)') + + temp_db_cursor.execute('SELECT * FROM test') + + assert temp_db_cursor.rowcount == 1 + assert temp_db_cursor.fetchone()[0] == 23 diff --git a/test/python/test_tools_refresh_address_levels.py b/test/python/test_tools_refresh_address_levels.py index 87e34c61..2bd91720 100644 --- a/test/python/test_tools_refresh_address_levels.py +++ b/test/python/test_tools_refresh_address_levels.py @@ -2,9 +2,10 @@ Tests for function for importing address ranks. 
""" import json -import pytest from pathlib import Path +import pytest + from nominatim.tools.refresh import load_address_levels, load_address_levels_from_file def test_load_ranks_def_config(temp_db_conn, temp_db_cursor, def_config): From db5e78c8792a62c5e2c4e6b18f12b0d55151b1ed Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Thu, 25 Feb 2021 10:22:52 +0100 Subject: [PATCH 08/18] remove unused partitionfunction function --- lib-php/admin/setup.php | 2 +- lib-php/setup/SetupClass.php | 6 ------ 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/lib-php/admin/setup.php b/lib-php/admin/setup.php index 91b72c15..6fca7c3c 100644 --- a/lib-php/admin/setup.php +++ b/lib-php/admin/setup.php @@ -126,7 +126,7 @@ if ($aCMDResult['create-partition-tables'] || $aCMDResult['all']) { if ($aCMDResult['create-partition-functions'] || $aCMDResult['all']) { $bDidSomething = true; - $oSetup->createPartitionFunctions(); + $oSetup->createFunctions(); // also create partition functions } if ($aCMDResult['import-wikipedia-articles'] || $aCMDResult['all']) { diff --git a/lib-php/setup/SetupClass.php b/lib-php/setup/SetupClass.php index 66093322..34c97319 100755 --- a/lib-php/setup/SetupClass.php +++ b/lib-php/setup/SetupClass.php @@ -120,12 +120,6 @@ class SetupFunctions $this->pgsqlRunPartitionScript($sTemplate); } - public function createPartitionFunctions() - { - info('Create Partition Functions'); - $this->createSqlFunctions(); // also create partition functions - } - public function loadData($bDisableTokenPrecalc) { info('Drop old Data'); From 3c186f80304b5b66795e9ef3cb9edb8f343c9416 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Thu, 25 Feb 2021 11:25:01 +0100 Subject: [PATCH 09/18] add a function for the intial indexing run Also moves postcodes to fully parallel indexing. --- lib-php/admin/setup.php | 28 +++++++++--- lib-php/setup/SetupClass.php | 44 ------------------ nominatim/cli.py | 2 + nominatim/clicmd/transition.py | 10 ++++ nominatim/indexer/indexer.py | 76 +++++++++++++++++++++++++++++-- test/python/test_indexing.py | 58 +++++++++++++++++++++-- test/python/test_tools_refresh.py | 26 +++++++++++ 7 files changed, 184 insertions(+), 60 deletions(-) create mode 100644 test/python/test_tools_refresh.py diff --git a/lib-php/admin/setup.php b/lib-php/admin/setup.php index 6fca7c3c..cb7eeee1 100644 --- a/lib-php/admin/setup.php +++ b/lib-php/admin/setup.php @@ -64,6 +64,16 @@ if ($aCMDResult['verbose']) { $oNominatimCmd->addParams('--verbose'); } +// by default, use all but one processor, but never more than 15. +var_dump($aCMDResult); +$iInstances = max(1, $aCMDResult['threads'] ?? 
(min(16, getProcessorCount()) - 1)); + +function run($oCmd) { + global $iInstances; + $oCmd->addParams('--threads', $iInstances); + $oCmd->run(true); +} + //******************************************************* // Making some sanity check: @@ -81,7 +91,7 @@ $oSetup = new SetupFunctions($aCMDResult); // go through complete process if 'all' is selected or start selected functions if ($aCMDResult['create-db'] || $aCMDResult['all']) { $bDidSomething = true; - (clone($oNominatimCmd))->addParams('transition', '--create-db')->run(true); + run((clone($oNominatimCmd))->addParams('transition', '--create-db')); } if ($aCMDResult['setup-db'] || $aCMDResult['all']) { @@ -92,7 +102,7 @@ if ($aCMDResult['setup-db'] || $aCMDResult['all']) { $oCmd->addParams('--no-partitions'); } - $oCmd->run(true); + run($oCmd); } if ($aCMDResult['import-data'] || $aCMDResult['all']) { @@ -104,7 +114,7 @@ if ($aCMDResult['import-data'] || $aCMDResult['all']) { $oCmd->addParams('--drop'); } - $oCmd->run(true); + run($oCmd); } if ($aCMDResult['create-functions'] || $aCMDResult['all']) { @@ -131,6 +141,7 @@ if ($aCMDResult['create-partition-functions'] || $aCMDResult['all']) { if ($aCMDResult['import-wikipedia-articles'] || $aCMDResult['all']) { $bDidSomething = true; + // ignore errors! (clone($oNominatimCmd))->addParams('refresh', '--wiki-data')->run(); } @@ -152,12 +163,17 @@ if ($aCMDResult['calculate-postcodes'] || $aCMDResult['all']) { if ($aCMDResult['index'] || $aCMDResult['all']) { $bDidSomething = true; - $oSetup->index($aCMDResult['index-noanalyse']); + $oCmd = (clone($oNominatimCmd))->addParams('transition', '--index'); + if ($aCMDResult['index-noanalyse'] ?? false) { + $oCmd->addParams('--no-analyse'); + } + + run($oCmd); } if ($aCMDResult['drop']) { $bDidSomething = true; - (clone($oNominatimCmd))->addParams('freeze')->run(true); + run((clone($oNominatimCmd))->addParams('freeze')); } if ($aCMDResult['create-search-indices'] || $aCMDResult['all']) { @@ -172,7 +188,7 @@ if ($aCMDResult['create-country-names'] || $aCMDResult['all']) { if ($aCMDResult['setup-website'] || $aCMDResult['all']) { $bDidSomething = true; - (clone($oNominatimCmd))->addParams('refresh', '--website')->run(true); + run((clone($oNominatimCmd))->addParams('refresh', '--website')); } // ****************************************************** diff --git a/lib-php/setup/SetupClass.php b/lib-php/setup/SetupClass.php index 34c97319..e8c145ba 100755 --- a/lib-php/setup/SetupClass.php +++ b/lib-php/setup/SetupClass.php @@ -71,7 +71,6 @@ class SetupFunctions if ($this->bVerbose) { $this->oNominatimCmd->addParams('--verbose'); } - $this->oNominatimCmd->addParams('--threads', $this->iInstances); } public function createFunctions() @@ -380,49 +379,6 @@ class SetupFunctions $this->db()->exec($sSQL); } - public function index($bIndexNoanalyse) - { - $this->checkModulePresence(); // raises exception on failure - - $oBaseCmd = (clone $this->oNominatimCmd)->addParams('index'); - - info('Index ranks 0 - 4'); - $oCmd = (clone $oBaseCmd)->addParams('--maxrank', 4); - - $iStatus = $oCmd->run(); - if ($iStatus != 0) { - fail('error status ' . $iStatus . ' running nominatim!'); - } - if (!$bIndexNoanalyse) $this->pgsqlRunScript('ANALYSE'); - - info('Index administrative boundaries'); - $oCmd = (clone $oBaseCmd)->addParams('--boundaries-only'); - $iStatus = $oCmd->run(); - if ($iStatus != 0) { - fail('error status ' . $iStatus . 
' running nominatim!'); - } - - info('Index ranks 5 - 25'); - $oCmd = (clone $oBaseCmd)->addParams('--no-boundaries', '--minrank', 5, '--maxrank', 25); - $iStatus = $oCmd->run(); - if ($iStatus != 0) { - fail('error status ' . $iStatus . ' running nominatim!'); - } - - if (!$bIndexNoanalyse) $this->pgsqlRunScript('ANALYSE'); - - info('Index ranks 26 - 30'); - $oCmd = (clone $oBaseCmd)->addParams('--no-boundaries', '--minrank', 26); - $iStatus = $oCmd->run(); - if ($iStatus != 0) { - fail('error status ' . $iStatus . ' running nominatim!'); - } - - info('Index postcodes'); - $sSQL = 'UPDATE location_postcode SET indexed_status = 0'; - $this->db()->exec($sSQL); - } - public function createSearchIndices() { info('Create Search indices'); diff --git a/nominatim/cli.py b/nominatim/cli.py index e1824cc6..eb652d64 100644 --- a/nominatim/cli.py +++ b/nominatim/cli.py @@ -171,6 +171,8 @@ class SetupAll: params.append('--ignore-errors') if args.index_noanalyse: params.append('--index-noanalyse') + if args.threads: + params.extend(('--threads', args.threads)) return run_legacy_script(*params, nominatim_env=args) diff --git a/nominatim/clicmd/transition.py b/nominatim/clicmd/transition.py index eb4e2d2f..4a5b44f5 100644 --- a/nominatim/clicmd/transition.py +++ b/nominatim/clicmd/transition.py @@ -32,6 +32,8 @@ class AdminTransition: help='Build a blank nominatim db') group.add_argument('--import-data', action='store_true', help='Import a osm file') + group.add_argument('--index', action='store_true', + help='Index the data') group = parser.add_argument_group('Options') group.add_argument('--no-partitions', action='store_true', help='Do not partition search indices') @@ -41,6 +43,8 @@ class AdminTransition: help='Drop tables needed for updates, making the database readonly') group.add_argument('--osm2pgsql-cache', metavar='SIZE', type=int, help='Size of cache to be used by osm2pgsql (in MB)') + group.add_argument('--no-analyse', action='store_true', + help='Do not perform analyse operations during index') @staticmethod def run(args): @@ -69,3 +73,9 @@ class AdminTransition: database_import.import_osm_data(Path(args.osm_file), args.osm2pgsql_options(0, 1), drop=args.drop) + + if args.index: + LOG.warning('Indexing') + from ..indexer.indexer import Indexer + indexer = Indexer(args.config.get_libpq_dsn(), args.threads or 1) + indexer.index_full() diff --git a/nominatim/indexer/indexer.py b/nominatim/indexer/indexer.py index 6e0ed60f..61971497 100644 --- a/nominatim/indexer/indexer.py +++ b/nominatim/indexer/indexer.py @@ -61,8 +61,8 @@ class InterpolationRunner: @staticmethod def sql_index_place(ids): return """UPDATE location_property_osmline - SET indexed_status = 0 WHERE place_id IN ({})"""\ - .format(','.join((str(i) for i in ids))) + SET indexed_status = 0 WHERE place_id IN ({}) + """.format(','.join((str(i) for i in ids))) class BoundaryRunner: """ Returns SQL commands for indexing the administrative boundaries @@ -79,19 +79,46 @@ class BoundaryRunner: return """SELECT count(*) FROM placex WHERE indexed_status > 0 AND rank_search = {} - AND class = 'boundary' and type = 'administrative'""".format(self.rank) + AND class = 'boundary' and type = 'administrative' + """.format(self.rank) def sql_get_objects(self): return """SELECT place_id FROM placex WHERE indexed_status > 0 and rank_search = {} and class = 'boundary' and type = 'administrative' - ORDER BY partition, admin_level""".format(self.rank) + ORDER BY partition, admin_level + """.format(self.rank) @staticmethod def sql_index_place(ids): return 
"UPDATE placex SET indexed_status = 0 WHERE place_id IN ({})"\ .format(','.join((str(i) for i in ids))) + +class PostcodeRunner: + """ Provides the SQL commands for indexing the location_postcode table. + """ + + @staticmethod + def name(): + return "postcodes (location_postcode)" + + @staticmethod + def sql_count_objects(): + return 'SELECT count(*) FROM location_postcode WHERE indexed_status > 0' + + @staticmethod + def sql_get_objects(): + return """SELECT place_id FROM location_postcode + WHERE indexed_status > 0 + ORDER BY country_code, postcode""" + + @staticmethod + def sql_index_place(ids): + return """UPDATE location_postcode SET indexed_status = 0 + WHERE place_id IN ({}) + """.format(','.join((str(i) for i in ids))) + class Indexer: """ Main indexing routine. """ @@ -100,7 +127,36 @@ class Indexer: self.conn = psycopg2.connect(dsn) self.threads = [DBConnection(dsn) for _ in range(num_threads)] + + def index_full(self, analyse=True): + """ Index the complete database. This will first index boudnaries + followed by all other objects. When `analyse` is True, then the + database will be analysed at the appropriate places to + ensure that database statistics are updated. + """ + self.index_by_rank(0, 4) + self._analyse_db_if(analyse) + + self.index_boundaries(0, 30) + self._analyse_db_if(analyse) + + self.index_by_rank(5, 25) + self._analyse_db_if(analyse) + + self.index_by_rank(26, 30) + self._analyse_db_if(analyse) + + self.index_postcodes() + self._analyse_db_if(analyse) + + def _analyse_db_if(self, condition): + if condition: + with self.conn.cursor() as cur: + cur.execute('ANALYSE') + def index_boundaries(self, minrank, maxrank): + """ Index only administrative boundaries within the given rank range. + """ LOG.warning("Starting indexing boundaries using %s threads", len(self.threads)) @@ -108,7 +164,11 @@ class Indexer: self.index(BoundaryRunner(rank)) def index_by_rank(self, minrank, maxrank): - """ Run classic indexing by rank. + """ Index all entries of placex in the given rank range (inclusive) + in order of their address rank. + + When rank 30 is requested then also interpolations and + places with address rank 0 will be indexed. """ maxrank = min(maxrank, 30) LOG.warning("Starting indexing rank (%i to %i) using %i threads", @@ -124,6 +184,12 @@ class Indexer: else: self.index(RankRunner(maxrank)) + + def index_postcodes(self): + """Index the entries ofthe location_postcode table. + """ + self.index(PostcodeRunner(), 20) + def update_status_table(self): """ Update the status in the status table to 'indexed'. 
""" diff --git a/test/python/test_indexing.py b/test/python/test_indexing.py index 6b52a65e..ee9c6c7e 100644 --- a/test/python/test_indexing.py +++ b/test/python/test_indexing.py @@ -12,6 +12,7 @@ class IndexerTestDB: def __init__(self, conn): self.placex_id = itertools.count(100000) self.osmline_id = itertools.count(500000) + self.postcode_id = itertools.count(700000) self.conn = conn self.conn.set_isolation_level(0) @@ -31,6 +32,12 @@ class IndexerTestDB: indexed_status SMALLINT, indexed_date TIMESTAMP, geometry_sector INTEGER)""") + cur.execute("""CREATE TABLE location_postcode ( + place_id BIGINT, + indexed_status SMALLINT, + indexed_date TIMESTAMP, + country_code varchar(2), + postcode TEXT)""") cur.execute("""CREATE OR REPLACE FUNCTION date_update() RETURNS TRIGGER AS $$ BEGIN @@ -39,10 +46,10 @@ class IndexerTestDB: END IF; RETURN NEW; END; $$ LANGUAGE plpgsql;""") - cur.execute("""CREATE TRIGGER placex_update BEFORE UPDATE ON placex - FOR EACH ROW EXECUTE PROCEDURE date_update()""") - cur.execute("""CREATE TRIGGER osmline_update BEFORE UPDATE ON location_property_osmline - FOR EACH ROW EXECUTE PROCEDURE date_update()""") + for table in ('placex', 'location_property_osmline', 'location_postcode'): + cur.execute("""CREATE TRIGGER {0}_update BEFORE UPDATE ON {0} + FOR EACH ROW EXECUTE PROCEDURE date_update() + """.format(table)) def scalar(self, query): with self.conn.cursor() as cur: @@ -74,6 +81,15 @@ class IndexerTestDB: (next_id, sector)) return next_id + def add_postcode(self, country, postcode): + next_id = next(self.postcode_id) + with self.conn.cursor() as cur: + cur.execute("""INSERT INTO location_postcode + (place_id, indexed_status, country_code, postcode) + VALUES (%s, 1, %s, %s)""", + (next_id, country, postcode)) + return next_id + def placex_unindexed(self): return self.scalar('SELECT count(*) from placex where indexed_status > 0') @@ -87,7 +103,7 @@ def test_db(temp_db_conn): @pytest.mark.parametrize("threads", [1, 15]) -def test_index_full(test_db, threads): +def test_index_all_by_rank(test_db, threads): for rank in range(31): test_db.add_place(rank_address=rank, rank_search=rank) test_db.add_osmline() @@ -184,3 +200,35 @@ def test_index_boundaries(test_db, threads): assert 0 == test_db.scalar(""" SELECT count(*) FROM placex WHERE indexed_status = 0 AND class != 'boundary'""") + + +@pytest.mark.parametrize("threads", [1, 15]) +def test_index_postcodes(test_db, threads): + for postcode in range(1000): + test_db.add_postcode('de', postcode) + for postcode in range(32000, 33000): + test_db.add_postcode('us', postcode) + + idx = Indexer('dbname=test_nominatim_python_unittest', threads) + idx.index_postcodes() + + assert 0 == test_db.scalar("""SELECT count(*) FROM location_postcode + WHERE indexed_status != 0""") + + +def test_index_full(test_db): + for rank in range(4, 10): + test_db.add_admin(rank_address=rank, rank_search=rank) + for rank in range(31): + test_db.add_place(rank_address=rank, rank_search=rank) + test_db.add_osmline() + for postcode in range(1000): + test_db.add_postcode('de', postcode) + + idx = Indexer('dbname=test_nominatim_python_unittest', 4) + idx.index_full() + + assert 0 == test_db.placex_unindexed() + assert 0 == test_db.osmline_unindexed() + assert 0 == test_db.scalar("""SELECT count(*) FROM location_postcode + WHERE indexed_status != 0""") diff --git a/test/python/test_tools_refresh.py b/test/python/test_tools_refresh.py new file mode 100644 index 00000000..d6c46ad7 --- /dev/null +++ b/test/python/test_tools_refresh.py @@ -0,0 +1,26 @@ +""" 
+Test for various refresh functions. +""" +from pathlib import Path + +import pytest + +from nominatim.tools import refresh + +TEST_DIR = (Path(__file__) / '..' / '..').resolve() + +def test_refresh_import_wikipedia_not_existing(dsn): + assert 1 == refresh.import_wikipedia_articles(dsn, Path('.')) + + +@pytest.mark.parametrize("replace", (True, False)) +def test_refresh_import_wikipedia(dsn, table_factory, temp_db_cursor, replace): + if replace: + table_factory('wikipedia_article') + table_factory('wikipedia_redirect') + + # use the small wikipedia file for the API testdb + assert 0 == refresh.import_wikipedia_articles(dsn, TEST_DIR / 'testdb') + + assert temp_db_cursor.scalar('SELECT count(*) FROM wikipedia_article') > 0 + assert temp_db_cursor.scalar('SELECT count(*) FROM wikipedia_redirect') > 0 From 57db5819efb1eac7497aca2dbdb2493a67bd33f9 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Thu, 25 Feb 2021 21:32:40 +0100 Subject: [PATCH 10/18] prot load-data function to python --- lib-php/admin/setup.php | 3 +- lib-php/setup/SetupClass.php | 127 ----------------------------- nominatim/clicmd/transition.py | 17 ++++ nominatim/tools/database_import.py | 83 +++++++++++++++++++ 4 files changed, 101 insertions(+), 129 deletions(-) diff --git a/lib-php/admin/setup.php b/lib-php/admin/setup.php index cb7eeee1..6493460d 100644 --- a/lib-php/admin/setup.php +++ b/lib-php/admin/setup.php @@ -65,7 +65,6 @@ if ($aCMDResult['verbose']) { } // by default, use all but one processor, but never more than 15. -var_dump($aCMDResult); $iInstances = max(1, $aCMDResult['threads'] ?? (min(16, getProcessorCount()) - 1)); function run($oCmd) { @@ -147,7 +146,7 @@ if ($aCMDResult['import-wikipedia-articles'] || $aCMDResult['all']) { if ($aCMDResult['load-data'] || $aCMDResult['all']) { $bDidSomething = true; - $oSetup->loadData($aCMDResult['disable-token-precalc']); + run((clone($oNominatimCmd))->addParams('transition', '--load-data')); } if ($aCMDResult['import-tiger-data']) { diff --git a/lib-php/setup/SetupClass.php b/lib-php/setup/SetupClass.php index e8c145ba..b0081fd8 100755 --- a/lib-php/setup/SetupClass.php +++ b/lib-php/setup/SetupClass.php @@ -119,133 +119,6 @@ class SetupFunctions $this->pgsqlRunPartitionScript($sTemplate); } - public function loadData($bDisableTokenPrecalc) - { - info('Drop old Data'); - - $oDB = $this->db(); - - $oDB->exec('TRUNCATE word'); - echo '.'; - $oDB->exec('TRUNCATE placex'); - echo '.'; - $oDB->exec('TRUNCATE location_property_osmline'); - echo '.'; - $oDB->exec('TRUNCATE place_addressline'); - echo '.'; - $oDB->exec('TRUNCATE location_area'); - echo '.'; - if (!$this->dbReverseOnly()) { - $oDB->exec('TRUNCATE search_name'); - echo '.'; - } - $oDB->exec('TRUNCATE search_name_blank'); - echo '.'; - $oDB->exec('DROP SEQUENCE seq_place'); - echo '.'; - $oDB->exec('CREATE SEQUENCE seq_place start 100000'); - echo '.'; - - $sSQL = 'select distinct partition from country_name'; - $aPartitions = $oDB->getCol($sSQL); - - if (!$this->bNoPartitions) $aPartitions[] = 0; - foreach ($aPartitions as $sPartition) { - $oDB->exec('TRUNCATE location_road_'.$sPartition); - echo '.'; - } - - // used by getorcreate_word_id to ignore frequent partial words - $sSQL = 'CREATE OR REPLACE FUNCTION get_maxwordfreq() RETURNS integer AS '; - $sSQL .= '$$ SELECT '.getSetting('MAX_WORD_FREQUENCY').' 
as maxwordfreq; $$ LANGUAGE SQL IMMUTABLE'; - $oDB->exec($sSQL); - echo ".\n"; - - // pre-create the word list - if (!$bDisableTokenPrecalc) { - info('Loading word list'); - $this->pgsqlRunScriptFile(CONST_DataDir.'/words.sql'); - } - - info('Load Data'); - $sColumns = 'osm_type, osm_id, class, type, name, admin_level, address, extratags, geometry'; - - $aDBInstances = array(); - $iLoadThreads = max(1, $this->iInstances - 1); - for ($i = 0; $i < $iLoadThreads; $i++) { - // https://secure.php.net/manual/en/function.pg-connect.php - $DSN = getSetting('DATABASE_DSN'); - $DSN = preg_replace('/^pgsql:/', '', $DSN); - $DSN = preg_replace('/;/', ' ', $DSN); - $aDBInstances[$i] = pg_connect($DSN, PGSQL_CONNECT_FORCE_NEW); - pg_ping($aDBInstances[$i]); - } - - for ($i = 0; $i < $iLoadThreads; $i++) { - $sSQL = "INSERT INTO placex ($sColumns) SELECT $sColumns FROM place WHERE osm_id % $iLoadThreads = $i"; - $sSQL .= " and not (class='place' and type='houses' and osm_type='W'"; - $sSQL .= " and ST_GeometryType(geometry) = 'ST_LineString')"; - $sSQL .= ' and ST_IsValid(geometry)'; - if ($this->bVerbose) echo "$sSQL\n"; - if (!pg_send_query($aDBInstances[$i], $sSQL)) { - fail(pg_last_error($aDBInstances[$i])); - } - } - - // last thread for interpolation lines - // https://secure.php.net/manual/en/function.pg-connect.php - $DSN = getSetting('DATABASE_DSN'); - $DSN = preg_replace('/^pgsql:/', '', $DSN); - $DSN = preg_replace('/;/', ' ', $DSN); - $aDBInstances[$iLoadThreads] = pg_connect($DSN, PGSQL_CONNECT_FORCE_NEW); - pg_ping($aDBInstances[$iLoadThreads]); - $sSQL = 'insert into location_property_osmline'; - $sSQL .= ' (osm_id, address, linegeo)'; - $sSQL .= ' SELECT osm_id, address, geometry from place where '; - $sSQL .= "class='place' and type='houses' and osm_type='W' and ST_GeometryType(geometry) = 'ST_LineString'"; - if ($this->bVerbose) echo "$sSQL\n"; - if (!pg_send_query($aDBInstances[$iLoadThreads], $sSQL)) { - fail(pg_last_error($aDBInstances[$iLoadThreads])); - } - - $bFailed = false; - for ($i = 0; $i <= $iLoadThreads; $i++) { - while (($hPGresult = pg_get_result($aDBInstances[$i])) !== false) { - $resultStatus = pg_result_status($hPGresult); - // PGSQL_EMPTY_QUERY, PGSQL_COMMAND_OK, PGSQL_TUPLES_OK, - // PGSQL_COPY_OUT, PGSQL_COPY_IN, PGSQL_BAD_RESPONSE, - // PGSQL_NONFATAL_ERROR and PGSQL_FATAL_ERROR - // echo 'Query result ' . $i . ' is: ' . $resultStatus . "\n"; - if ($resultStatus != PGSQL_COMMAND_OK && $resultStatus != PGSQL_TUPLES_OK) { - $resultError = pg_result_error($hPGresult); - echo '-- error text ' . $i . ': ' . $resultError . 
"\n"; - $bFailed = true; - } - } - } - if ($bFailed) { - fail('SQL errors loading placex and/or location_property_osmline tables'); - } - - for ($i = 0; $i < $this->iInstances; $i++) { - pg_close($aDBInstances[$i]); - } - - echo "\n"; - info('Reanalysing database'); - $this->pgsqlRunScript('ANALYSE'); - - $sDatabaseDate = getDatabaseDate($oDB); - $oDB->exec('TRUNCATE import_status'); - if (!$sDatabaseDate) { - warn('could not determine database date.'); - } else { - $sSQL = "INSERT INTO import_status (lastimportdate) VALUES('".$sDatabaseDate."')"; - $oDB->exec($sSQL); - echo "Latest data imported from $sDatabaseDate.\n"; - } - } - public function importTigerData($sTigerPath) { info('Import Tiger data'); diff --git a/nominatim/clicmd/transition.py b/nominatim/clicmd/transition.py index 4a5b44f5..de4e16ca 100644 --- a/nominatim/clicmd/transition.py +++ b/nominatim/clicmd/transition.py @@ -9,6 +9,7 @@ import logging from pathlib import Path from ..db.connection import connect +from ..db import status from ..errors import UsageError # Do not repeat documentation of subcommand classes. @@ -32,6 +33,8 @@ class AdminTransition: help='Build a blank nominatim db') group.add_argument('--import-data', action='store_true', help='Import a osm file') + group.add_argument('--load-data', action='store_true', + help='Copy data to live tables from import table') group.add_argument('--index', action='store_true', help='Index the data') group = parser.add_argument_group('Options') @@ -74,6 +77,20 @@ class AdminTransition: args.osm2pgsql_options(0, 1), drop=args.drop) + if args.load_data: + LOG.warning('Load data') + with connect(args.config.get_libpq_dsn()) as conn: + database_import.truncate_data_tables(conn, args.config.MAX_WORD_FREQUENCY) + database_import.load_data(args.config.get_libpq_dsn(), + args.data_dir, + args.threads or 1) + + with connect(args.config.get_libpq_dsn()) as conn: + try: + status.set_status(conn, status.compute_database_date(conn)) + except Exception as exc: # pylint: disable=bare-except + LOG.error('Cannot determine date of database: %s', exc) + if args.index: LOG.warning('Indexing') from ..indexer.indexer import Indexer diff --git a/nominatim/tools/database_import.py b/nominatim/tools/database_import.py index 00ec95c0..a6df2755 100644 --- a/nominatim/tools/database_import.py +++ b/nominatim/tools/database_import.py @@ -3,6 +3,7 @@ Functions for setting up and importing a new Nominatim database. """ import logging import os +import selectors import subprocess import shutil from pathlib import Path @@ -11,6 +12,7 @@ import psutil from ..db.connection import connect, get_pg_env from ..db import utils as db_utils +from ..db.async_connection import DBConnection from .exec_utils import run_osm2pgsql from ..errors import UsageError from ..version import POSTGRESQL_REQUIRED_VERSION, POSTGIS_REQUIRED_VERSION @@ -156,3 +158,84 @@ def import_osm_data(osm_file, options, drop=False): if drop: if options['flatnode_file']: Path(options['flatnode_file']).unlink() + + +def truncate_data_tables(conn, max_word_frequency=None): + """ Truncate all data tables to prepare for a fresh load. 
+ """ + with conn.cursor() as cur: + cur.execute('TRUNCATE word') + cur.execute('TRUNCATE placex') + cur.execute('TRUNCATE place_addressline') + cur.execute('TRUNCATE location_area') + cur.execute('TRUNCATE location_area_country') + cur.execute('TRUNCATE location_property') + cur.execute('TRUNCATE location_property_tiger') + cur.execute('TRUNCATE location_property_osmline') + cur.execute('TRUNCATE location_postcode') + cur.execute('TRUNCATE search_name') + cur.execute('DROP SEQUENCE seq_place') + cur.execute('CREATE SEQUENCE seq_place start 100000') + + cur.execute("""SELECT tablename FROM pg_tables + WHERE tablename LIKE 'location_road_%'""") + + for table in [r[0] for r in list(cur)]: + cur.execute('TRUNCATE ' + table) + + if max_word_frequency is not None: + # Used by getorcreate_word_id to ignore frequent partial words. + cur.execute("""CREATE OR REPLACE FUNCTION get_maxwordfreq() + RETURNS integer AS $$ + SELECT {} as maxwordfreq; + $$ LANGUAGE SQL IMMUTABLE + """.format(max_word_frequency)) + conn.commit() + +_COPY_COLUMNS = 'osm_type, osm_id, class, type, name, admin_level, address, extratags, geometry' + +def load_data(dsn, data_dir, threads): + """ Copy data into the word and placex table. + """ + # Pre-calculate the most important terms in the word list. + db_utils.execute_file(dsn, data_dir / 'words.sql') + + sel = selectors.DefaultSelector() + # Then copy data from place to placex in chunks. + place_threads = max(1, threads - 1) + for imod in range(place_threads): + conn = DBConnection(dsn) + conn.connect() + conn.perform("""INSERT INTO placex ({0}) + SELECT {0} FROM place + WHERE osm_id % {1} = {2} + AND NOT (class='place' and type='houses') + AND ST_IsValid(geometry) + """.format(_COPY_COLUMNS, place_threads, imod)) + sel.register(conn, selectors.EVENT_READ, conn) + + # Address interpolations go into another table. + conn = DBConnection(dsn) + conn.connect() + conn.perform("""INSERT INTO location_property_osmline (osm_id, address, linegeo) + SELECT osm_id, address, geometry FROM place + WHERE class='place' and type='houses' and osm_type='W' + and ST_GeometryType(geometry) = 'ST_LineString' + """) + sel.register(conn, selectors.EVENT_READ, conn) + + # Now wait for all of them to finish. 
+ todo = place_threads + 1 + while todo > 0: + for key, _ in sel.select(1): + conn = key.data + sel.unregister(conn) + conn.wait() + conn.close() + todo -= 1 + print('.', end='', flush=True) + print('\n') + + with connect(dsn) as conn: + with conn.cursor() as cur: + cur.execute('ANALYSE') From 3ee8d9fa75011a5e259c3b58fce671b4ff0c35fc Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Fri, 26 Feb 2021 12:10:54 +0100 Subject: [PATCH 11/18] properly close connections of indexer after use --- nominatim/indexer/indexer.py | 105 +++++++++++++++++++++++++---------- 1 file changed, 75 insertions(+), 30 deletions(-) diff --git a/nominatim/indexer/indexer.py b/nominatim/indexer/indexer.py index 61971497..d997e522 100644 --- a/nominatim/indexer/indexer.py +++ b/nominatim/indexer/indexer.py @@ -124,8 +124,25 @@ class Indexer: """ def __init__(self, dsn, num_threads): - self.conn = psycopg2.connect(dsn) - self.threads = [DBConnection(dsn) for _ in range(num_threads)] + self.dsn = dsn + self.num_threads = num_threads + self.conn = None + self.threads = [] + + + def _setup_connections(self): + self.conn = psycopg2.connect(self.dsn) + self.threads = [DBConnection(self.dsn) for _ in range(self.num_threads)] + + + def _close_connections(self): + if self.conn: + self.conn.close() + self.conn = None + + for thread in self.threads: + thread.close() + threads = [] def index_full(self, analyse=True): @@ -134,34 +151,44 @@ class Indexer: database will be analysed at the appropriate places to ensure that database statistics are updated. """ - self.index_by_rank(0, 4) - self._analyse_db_if(analyse) + conn = psycopg2.connect(self.dsn) - self.index_boundaries(0, 30) - self._analyse_db_if(analyse) + try: + self.index_by_rank(0, 4) + self._analyse_db_if(conn, analyse) - self.index_by_rank(5, 25) - self._analyse_db_if(analyse) + self.index_boundaries(0, 30) + self._analyse_db_if(conn, analyse) - self.index_by_rank(26, 30) - self._analyse_db_if(analyse) + self.index_by_rank(5, 25) + self._analyse_db_if(conn, analyse) - self.index_postcodes() - self._analyse_db_if(analyse) + self.index_by_rank(26, 30) + self._analyse_db_if(conn, analyse) - def _analyse_db_if(self, condition): + self.index_postcodes() + self._analyse_db_if(conn, analyse) + finally: + conn.close() + + def _analyse_db_if(self, conn, condition): if condition: - with self.conn.cursor() as cur: + with conn.cursor() as cur: cur.execute('ANALYSE') def index_boundaries(self, minrank, maxrank): """ Index only administrative boundaries within the given rank range. 
""" LOG.warning("Starting indexing boundaries using %s threads", - len(self.threads)) + self.num_threads) - for rank in range(max(minrank, 4), min(maxrank, 26)): - self.index(BoundaryRunner(rank)) + self._setup_connections() + + try: + for rank in range(max(minrank, 4), min(maxrank, 26)): + self.index(BoundaryRunner(rank)) + finally: + self._close_connections() def index_by_rank(self, minrank, maxrank): """ Index all entries of placex in the given rank range (inclusive) @@ -172,30 +199,48 @@ class Indexer: """ maxrank = min(maxrank, 30) LOG.warning("Starting indexing rank (%i to %i) using %i threads", - minrank, maxrank, len(self.threads)) + minrank, maxrank, self.num_threads) - for rank in range(max(1, minrank), maxrank): - self.index(RankRunner(rank)) + self._setup_connections() - if maxrank == 30: - self.index(RankRunner(0)) - self.index(InterpolationRunner(), 20) - self.index(RankRunner(30), 20) - else: - self.index(RankRunner(maxrank)) + try: + for rank in range(max(1, minrank), maxrank): + self.index(RankRunner(rank)) + + if maxrank == 30: + self.index(RankRunner(0)) + self.index(InterpolationRunner(), 20) + self.index(RankRunner(30), 20) + else: + self.index(RankRunner(maxrank)) + finally: + self._close_connections() def index_postcodes(self): """Index the entries ofthe location_postcode table. """ - self.index(PostcodeRunner(), 20) + LOG.warning("Starting indexing postcodes using %s threads", self.num_threads) + + self._setup_connections() + + try: + self.index(PostcodeRunner(), 20) + finally: + self._close_connections() def update_status_table(self): """ Update the status in the status table to 'indexed'. """ - with self.conn.cursor() as cur: - cur.execute('UPDATE import_status SET indexed = true') - self.conn.commit() + conn = psycopg2.connect(self.dsn) + + try: + with conn.cursor() as cur: + cur.execute('UPDATE import_status SET indexed = true') + + conn.commit() + finally: + conn.close() def index(self, obj, batch=1): """ Index a single rank or table. `obj` describes the SQL to use From 15b590679063601512f7544d90b9fcef54cc08fd Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Fri, 26 Feb 2021 15:02:39 +0100 Subject: [PATCH 12/18] move setup function to python There are still back-calls to PHP for some of the sub-steps. These needs some larger refactoring to be moved to Python. --- nominatim/cli.py | 68 +---------- nominatim/clicmd/__init__.py | 1 + nominatim/clicmd/setup.py | 140 ++++++++++++++++++++++ nominatim/clicmd/transition.py | 8 +- nominatim/indexer/indexer.py | 23 ++-- nominatim/tools/database_import.py | 47 +++++--- test/python/conftest.py | 54 ++++++++- test/python/test_cli.py | 33 ++++- test/python/test_tools_check_database.py | 4 + test/python/test_tools_database_import.py | 66 ++++++++++ 10 files changed, 342 insertions(+), 102 deletions(-) create mode 100644 nominatim/clicmd/setup.py diff --git a/nominatim/cli.py b/nominatim/cli.py index eb652d64..35c6c1f0 100644 --- a/nominatim/cli.py +++ b/nominatim/cli.py @@ -111,72 +111,6 @@ class CommandlineParser: # pylint: disable=E0012,C0415 -class SetupAll: - """\ - Create a new Nominatim database from an OSM file. 
- """ - - @staticmethod - def add_args(parser): - group_name = parser.add_argument_group('Required arguments') - group = group_name.add_mutually_exclusive_group(required=True) - group.add_argument('--osm-file', - help='OSM file to be imported.') - group.add_argument('--continue', dest='continue_at', - choices=['load-data', 'indexing', 'db-postprocess'], - help='Continue an import that was interrupted') - group = parser.add_argument_group('Optional arguments') - group.add_argument('--osm2pgsql-cache', metavar='SIZE', type=int, - help='Size of cache to be used by osm2pgsql (in MB)') - group.add_argument('--reverse-only', action='store_true', - help='Do not create tables and indexes for searching') - group.add_argument('--enable-debug-statements', action='store_true', - help='Include debug warning statements in SQL code') - group.add_argument('--no-partitions', action='store_true', - help="""Do not partition search indices - (speeds up import of single country extracts)""") - group.add_argument('--no-updates', action='store_true', - help="""Do not keep tables that are only needed for - updating the database later""") - group = parser.add_argument_group('Expert options') - group.add_argument('--ignore-errors', action='store_true', - help='Continue import even when errors in SQL are present') - group.add_argument('--index-noanalyse', action='store_true', - help='Do not perform analyse operations during index') - - - @staticmethod - def run(args): - params = ['setup.php'] - if args.osm_file: - params.extend(('--all', '--osm-file', args.osm_file)) - else: - if args.continue_at == 'load-data': - params.append('--load-data') - if args.continue_at in ('load-data', 'indexing'): - params.append('--index') - params.extend(('--create-search-indices', '--create-country-names', - '--setup-website')) - if args.osm2pgsql_cache: - params.extend(('--osm2pgsql-cache', args.osm2pgsql_cache)) - if args.reverse_only: - params.append('--reverse-only') - if args.enable_debug_statements: - params.append('--enable-debug-statements') - if args.no_partitions: - params.append('--no-partitions') - if args.no_updates: - params.append('--drop') - if args.ignore_errors: - params.append('--ignore-errors') - if args.index_noanalyse: - params.append('--index-noanalyse') - if args.threads: - params.extend(('--threads', args.threads)) - - return run_legacy_script(*params, nominatim_env=args) - - class SetupSpecialPhrases: """\ Maintain special phrases. @@ -334,7 +268,7 @@ def nominatim(**kwargs): """ parser = CommandlineParser('nominatim', nominatim.__doc__) - parser.add_subcommand('import', SetupAll) + parser.add_subcommand('import', clicmd.SetupAll) parser.add_subcommand('freeze', clicmd.SetupFreeze) parser.add_subcommand('replication', clicmd.UpdateReplication) diff --git a/nominatim/clicmd/__init__.py b/nominatim/clicmd/__init__.py index 78570b1b..9101e0c0 100644 --- a/nominatim/clicmd/__init__.py +++ b/nominatim/clicmd/__init__.py @@ -2,6 +2,7 @@ Subcommand definitions for the command-line tool. """ +from .setup import SetupAll from .replication import UpdateReplication from .api import APISearch, APIReverse, APILookup, APIDetails, APIStatus from .index import UpdateIndex diff --git a/nominatim/clicmd/setup.py b/nominatim/clicmd/setup.py new file mode 100644 index 00000000..8f717cbb --- /dev/null +++ b/nominatim/clicmd/setup.py @@ -0,0 +1,140 @@ +""" +Implementation of the 'import' subcommand. 
+""" +import logging +from pathlib import Path + +import psutil + +from ..tools.exec_utils import run_legacy_script +from ..db.connection import connect +from ..db import status +from ..errors import UsageError + +# Do not repeat documentation of subcommand classes. +# pylint: disable=C0111 +# Using non-top-level imports to avoid eventually unused imports. +# pylint: disable=E0012,C0415 + +LOG = logging.getLogger() + +class SetupAll: + """\ + Create a new Nominatim database from an OSM file. + """ + + @staticmethod + def add_args(parser): + group_name = parser.add_argument_group('Required arguments') + group = group_name.add_mutually_exclusive_group(required=True) + group.add_argument('--osm-file', metavar='FILE', + help='OSM file to be imported.') + group.add_argument('--continue', dest='continue_at', + choices=['load-data', 'indexing', 'db-postprocess'], + help='Continue an import that was interrupted') + group = parser.add_argument_group('Optional arguments') + group.add_argument('--osm2pgsql-cache', metavar='SIZE', type=int, + help='Size of cache to be used by osm2pgsql (in MB)') + group.add_argument('--reverse-only', action='store_true', + help='Do not create tables and indexes for searching') + group.add_argument('--no-partitions', action='store_true', + help="""Do not partition search indices + (speeds up import of single country extracts)""") + group.add_argument('--no-updates', action='store_true', + help="""Do not keep tables that are only needed for + updating the database later""") + group = parser.add_argument_group('Expert options') + group.add_argument('--ignore-errors', action='store_true', + help='Continue import even when errors in SQL are present') + group.add_argument('--index-noanalyse', action='store_true', + help='Do not perform analyse operations during index') + + + @staticmethod + def run(args): # pylint: disable=too-many-statements + from ..tools import database_import + from ..tools import refresh + from ..indexer.indexer import Indexer + + if args.osm_file and not Path(args.osm_file).is_file(): + LOG.fatal("OSM file '%s' does not exist.", args.osm_file) + raise UsageError('Cannot access file.') + + if args.continue_at is None: + database_import.setup_database_skeleton(args.config.get_libpq_dsn(), + args.data_dir, + args.no_partitions, + rouser=args.config.DATABASE_WEBUSER) + + LOG.warning('Installing database module') + with connect(args.config.get_libpq_dsn()) as conn: + database_import.install_module(args.module_dir, args.project_dir, + args.config.DATABASE_MODULE_PATH, + conn=conn) + + LOG.warning('Importing OSM data file') + database_import.import_osm_data(Path(args.osm_file), + args.osm2pgsql_options(0, 1), + drop=args.no_updates) + + LOG.warning('Create functions (1st pass)') + with connect(args.config.get_libpq_dsn()) as conn: + refresh.create_functions(conn, args.config, args.sqllib_dir, + False, False) + + LOG.warning('Create tables') + params = ['setup.php', '--create-tables', '--create-partition-tables'] + if args.reverse_only: + params.append('--reverse-only') + run_legacy_script(*params, nominatim_env=args) + + LOG.warning('Create functions (2nd pass)') + with connect(args.config.get_libpq_dsn()) as conn: + refresh.create_functions(conn, args.config, args.sqllib_dir, + False, False) + + LOG.warning('Importing wikipedia importance data') + data_path = Path(args.config.WIKIPEDIA_DATA_PATH or args.project_dir) + if refresh.import_wikipedia_articles(args.config.get_libpq_dsn(), + data_path) > 0: + LOG.error('Wikipedia importance dump file not found. 
' + 'Will be using default importances.') + + LOG.warning('Initialise tables') + with connect(args.config.get_libpq_dsn()) as conn: + database_import.truncate_data_tables(conn, args.config.MAX_WORD_FREQUENCY) + + if args.continue_at is None or args.continue_at == 'load-data': + LOG.warning('Load data into placex table') + database_import.load_data(args.config.get_libpq_dsn(), + args.data_dir, + args.threads or psutil.cpu_count() or 1) + + LOG.warning('Calculate postcodes') + run_legacy_script('setup.php', '--calculate-postcodes', nominatim_env=args) + + if args.continue_at is None or args.continue_at in ('load-data', 'indexing'): + LOG.warning('Indexing places') + indexer = Indexer(args.config.get_libpq_dsn(), + args.threads or psutil.cpu_count() or 1) + indexer.index_full(analyse=not args.index_noanalyse) + + LOG.warning('Post-process tables') + params = ['setup.php', '--create-search-indices', '--create-country-names'] + if args.no_updates: + params.append('--drop') + run_legacy_script(*params, nominatim_env=args) + + webdir = args.project_dir / 'website' + LOG.warning('Setup website at %s', webdir) + refresh.setup_website(webdir, args.phplib_dir, args.config) + + with connect(args.config.get_libpq_dsn()) as conn: + try: + dbdate = status.compute_database_date(conn) + status.set_status(conn, dbdate) + LOG.info('Database is at %s.', dbdate) + except Exception as exc: # pylint: disable=broad-except + LOG.error('Cannot determine date of database: %s', exc) + + return 0 diff --git a/nominatim/clicmd/transition.py b/nominatim/clicmd/transition.py index de4e16ca..1d78062e 100644 --- a/nominatim/clicmd/transition.py +++ b/nominatim/clicmd/transition.py @@ -59,12 +59,12 @@ class AdminTransition: if args.setup_db: LOG.warning('Setup DB') - mpath = database_import.install_module(args.module_dir, args.project_dir, - args.config.DATABASE_MODULE_PATH) with connect(args.config.get_libpq_dsn()) as conn: database_import.setup_extensions(conn) - database_import.check_module_dir_path(conn, mpath) + database_import.install_module(args.module_dir, args.project_dir, + args.config.DATABASE_MODULE_PATH, + conn=conn) database_import.import_base_data(args.config.get_libpq_dsn(), args.data_dir, args.no_partitions) @@ -88,7 +88,7 @@ class AdminTransition: with connect(args.config.get_libpq_dsn()) as conn: try: status.set_status(conn, status.compute_database_date(conn)) - except Exception as exc: # pylint: disable=bare-except + except Exception as exc: # pylint: disable=broad-except LOG.error('Cannot determine date of database: %s', exc) if args.index: diff --git a/nominatim/indexer/indexer.py b/nominatim/indexer/indexer.py index d997e522..93723844 100644 --- a/nominatim/indexer/indexer.py +++ b/nominatim/indexer/indexer.py @@ -119,6 +119,13 @@ class PostcodeRunner: WHERE place_id IN ({}) """.format(','.join((str(i) for i in ids))) + +def _analyse_db_if(conn, condition): + if condition: + with conn.cursor() as cur: + cur.execute('ANALYSE') + + class Indexer: """ Main indexing routine. 
""" @@ -142,7 +149,7 @@ class Indexer: for thread in self.threads: thread.close() - threads = [] + self.threads = [] def index_full(self, analyse=True): @@ -155,26 +162,22 @@ class Indexer: try: self.index_by_rank(0, 4) - self._analyse_db_if(conn, analyse) + _analyse_db_if(conn, analyse) self.index_boundaries(0, 30) - self._analyse_db_if(conn, analyse) + _analyse_db_if(conn, analyse) self.index_by_rank(5, 25) - self._analyse_db_if(conn, analyse) + _analyse_db_if(conn, analyse) self.index_by_rank(26, 30) - self._analyse_db_if(conn, analyse) + _analyse_db_if(conn, analyse) self.index_postcodes() - self._analyse_db_if(conn, analyse) + _analyse_db_if(conn, analyse) finally: conn.close() - def _analyse_db_if(self, conn, condition): - if condition: - with conn.cursor() as cur: - cur.execute('ANALYSE') def index_boundaries(self, minrank, maxrank): """ Index only administrative boundaries within the given rank range. diff --git a/nominatim/tools/database_import.py b/nominatim/tools/database_import.py index a6df2755..6e65e73a 100644 --- a/nominatim/tools/database_import.py +++ b/nominatim/tools/database_import.py @@ -9,6 +9,7 @@ import shutil from pathlib import Path import psutil +import psycopg2 from ..db.connection import connect, get_pg_env from ..db import utils as db_utils @@ -19,6 +20,21 @@ from ..version import POSTGRESQL_REQUIRED_VERSION, POSTGIS_REQUIRED_VERSION LOG = logging.getLogger() +def setup_database_skeleton(dsn, data_dir, no_partitions, rouser=None): + """ Create a new database for Nominatim and populate it with the + essential extensions and data. + """ + LOG.warning('Creating database') + create_db(dsn, rouser) + + LOG.warning('Setting up database') + with connect(dsn) as conn: + setup_extensions(conn) + + LOG.warning('Loading basic data') + import_base_data(dsn, data_dir, no_partitions) + + def create_db(dsn, rouser=None): """ Create a new database for the given DSN. Fails when the database already exists or the PostgreSQL version is too old. @@ -72,7 +88,7 @@ def setup_extensions(conn): raise UsageError('PostGIS version is too old.') -def install_module(src_dir, project_dir, module_dir): +def install_module(src_dir, project_dir, module_dir, conn=None): """ Copy the normalization module from src_dir into the project directory under the '/module' directory. If 'module_dir' is set, then use the module from there instead and check that it is accessible @@ -80,6 +96,9 @@ def install_module(src_dir, project_dir, module_dir): The function detects when the installation is run from the build directory. It doesn't touch the module in that case. + + If 'conn' is given, then the function also tests if the module + can be access via the given database. """ if not module_dir: module_dir = project_dir / 'module' @@ -99,19 +118,17 @@ def install_module(src_dir, project_dir, module_dir): else: LOG.info("Using custom path for database module at '%s'", module_dir) - return module_dir - - -def check_module_dir_path(conn, path): - """ Check that the normalisation module can be found and executed - from the given path. 
- """ - with conn.cursor() as cur: - cur.execute("""CREATE FUNCTION nominatim_test_import_func(text) - RETURNS text AS '{}/nominatim.so', 'transliteration' - LANGUAGE c IMMUTABLE STRICT; - DROP FUNCTION nominatim_test_import_func(text) - """.format(path)) + if conn is not None: + with conn.cursor() as cur: + try: + cur.execute("""CREATE FUNCTION nominatim_test_import_func(text) + RETURNS text AS '{}/nominatim.so', 'transliteration' + LANGUAGE c IMMUTABLE STRICT; + DROP FUNCTION nominatim_test_import_func(text) + """.format(module_dir)) + except psycopg2.DatabaseError as err: + LOG.fatal("Error accessing database module: %s", err) + raise UsageError("Database module cannot be accessed.") from err def import_base_data(dsn, sql_dir, ignore_partitions=False): @@ -174,7 +191,7 @@ def truncate_data_tables(conn, max_word_frequency=None): cur.execute('TRUNCATE location_property_osmline') cur.execute('TRUNCATE location_postcode') cur.execute('TRUNCATE search_name') - cur.execute('DROP SEQUENCE seq_place') + cur.execute('DROP SEQUENCE IF EXISTS seq_place') cur.execute('CREATE SEQUENCE seq_place start 100000') cur.execute("""SELECT tablename FROM pg_tables diff --git a/test/python/conftest.py b/test/python/conftest.py index 40b611c0..d16dceff 100644 --- a/test/python/conftest.py +++ b/test/python/conftest.py @@ -43,6 +43,11 @@ class _TestingCursor(psycopg2.extras.DictCursor): WHERE tablename = %s""", (table, )) return num == 1 + def table_rows(self, table): + """ Return the number of rows in the given table. + """ + return self.scalar('SELECT count(*) FROM ' + table) + @pytest.fixture def temp_db(monkeypatch): @@ -109,8 +114,12 @@ def temp_db_cursor(temp_db): @pytest.fixture def table_factory(temp_db_cursor): - def mk_table(name, definition='id INT'): + def mk_table(name, definition='id INT', content=None): temp_db_cursor.execute('CREATE TABLE {} ({})'.format(name, definition)) + if content is not None: + if not isinstance(content, str): + content = '),('.join([str(x) for x in content]) + temp_db_cursor.execute("INSERT INTO {} VALUES ({})".format(name, content)) return mk_table @@ -174,7 +183,7 @@ def place_row(place_table, temp_db_cursor): temp_db_cursor.execute("INSERT INTO place VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)", (osm_id or next(idseq), osm_type, cls, typ, names, admin_level, address, extratags, - geom or 'SRID=4326;POINT(0 0 )')) + geom or 'SRID=4326;POINT(0 0)')) return _insert @@ -184,7 +193,7 @@ def placex_table(temp_db_with_extensions, temp_db_conn): """ with temp_db_conn.cursor() as cur: cur.execute("""CREATE TABLE placex ( - place_id BIGINT NOT NULL, + place_id BIGINT, parent_place_id BIGINT, linked_place_id BIGINT, importance FLOAT, @@ -207,8 +216,43 @@ def placex_table(temp_db_with_extensions, temp_db_conn): country_code varchar(2), housenumber TEXT, postcode TEXT, - centroid GEOMETRY(Geometry, 4326)) - """) + centroid GEOMETRY(Geometry, 4326))""") + temp_db_conn.commit() + + +@pytest.fixture +def osmline_table(temp_db_with_extensions, temp_db_conn): + with temp_db_conn.cursor() as cur: + cur.execute("""CREATE TABLE location_property_osmline ( + place_id BIGINT, + osm_id BIGINT, + parent_place_id BIGINT, + geometry_sector INTEGER, + indexed_date TIMESTAMP, + startnumber INTEGER, + endnumber INTEGER, + partition SMALLINT, + indexed_status SMALLINT, + linegeo GEOMETRY, + interpolationtype TEXT, + address HSTORE, + postcode TEXT, + country_code VARCHAR(2))""") + temp_db_conn.commit() + + +@pytest.fixture +def word_table(temp_db, temp_db_conn): + with temp_db_conn.cursor() as cur: 
+ cur.execute("""CREATE TABLE word ( + word_id INTEGER, + word_token text, + word text, + class text, + type text, + country_code varchar(2), + search_name_count INTEGER, + operator TEXT)""") temp_db_conn.commit() diff --git a/test/python/test_cli.py b/test/python/test_cli.py index e2a44e37..70c10605 100644 --- a/test/python/test_cli.py +++ b/test/python/test_cli.py @@ -13,9 +13,11 @@ import nominatim.cli import nominatim.clicmd.api import nominatim.clicmd.refresh import nominatim.clicmd.admin +import nominatim.clicmd.setup import nominatim.indexer.indexer import nominatim.tools.admin import nominatim.tools.check_database +import nominatim.tools.database_import import nominatim.tools.freeze import nominatim.tools.refresh @@ -61,7 +63,6 @@ def test_cli_help(capsys): @pytest.mark.parametrize("command,script", [ - (('import', '--continue', 'load-data'), 'setup'), (('special-phrases',), 'specialphrases'), (('add-data', '--tiger-data', 'tiger'), 'setup'), (('add-data', '--file', 'foo.osm'), 'update'), @@ -74,6 +75,36 @@ def test_legacy_commands_simple(mock_run_legacy, command, script): assert mock_run_legacy.last_args[0] == script + '.php' +def test_import_missing_file(temp_db): + assert 1 == call_nominatim('import', '--osm-file', 'sfsafegweweggdgw.reh.erh') + + +def test_import_bad_file(temp_db): + assert 1 == call_nominatim('import', '--osm-file', '.') + + +def test_import_full(temp_db, mock_func_factory): + mocks = [ + mock_func_factory(nominatim.tools.database_import, 'setup_database_skeleton'), + mock_func_factory(nominatim.tools.database_import, 'install_module'), + mock_func_factory(nominatim.tools.database_import, 'import_osm_data'), + mock_func_factory(nominatim.tools.refresh, 'import_wikipedia_articles'), + mock_func_factory(nominatim.tools.database_import, 'truncate_data_tables'), + mock_func_factory(nominatim.tools.database_import, 'load_data'), + mock_func_factory(nominatim.indexer.indexer.Indexer, 'index_full'), + mock_func_factory(nominatim.tools.refresh, 'setup_website'), + ] + + cf_mock = mock_func_factory(nominatim.tools.refresh, 'create_functions') + mock_func_factory(nominatim.clicmd.setup, 'run_legacy_script') + + assert 0 == call_nominatim('import', '--osm-file', __file__) + + assert cf_mock.called > 1 + + for mock in mocks: + assert mock.called == 1 + def test_freeze_command(mock_func_factory, temp_db): mock_drop = mock_func_factory(nominatim.tools.freeze, 'drop_update_tables') mock_flatnode = mock_func_factory(nominatim.tools.freeze, 'drop_flatnode_file') diff --git a/test/python/test_tools_check_database.py b/test/python/test_tools_check_database.py index 3787c3be..68b376a7 100644 --- a/test/python/test_tools_check_database.py +++ b/test/python/test_tools_check_database.py @@ -63,6 +63,10 @@ def test_check_database_indexes_bad(temp_db_conn, def_config): assert chkdb.check_database_indexes(temp_db_conn, def_config) == chkdb.CheckState.FAIL +def test_check_database_indexes_valid(temp_db_conn, def_config): + assert chkdb.check_database_index_valid(temp_db_conn, def_config) == chkdb.CheckState.OK + + def test_check_tiger_table_disabled(temp_db_conn, def_config, monkeypatch): monkeypatch.setenv('NOMINATIM_USE_US_TIGER_DATA' , 'no') assert chkdb.check_tiger_table(temp_db_conn, def_config) == chkdb.CheckState.NOT_APPLICABLE diff --git a/test/python/test_tools_database_import.py b/test/python/test_tools_database_import.py index 597fdfc1..f9760fc0 100644 --- a/test/python/test_tools_database_import.py +++ b/test/python/test_tools_database_import.py @@ -24,6 +24,24 @@ def 
nonexistant_db(): with conn.cursor() as cur: cur.execute('DROP DATABASE IF EXISTS {}'.format(dbname)) +@pytest.mark.parametrize("no_partitions", (True, False)) +def test_setup_skeleton(src_dir, nonexistant_db, no_partitions): + database_import.setup_database_skeleton('dbname=' + nonexistant_db, + src_dir / 'data', no_partitions) + + conn = psycopg2.connect(database=nonexistant_db) + + try: + with conn.cursor() as cur: + cur.execute("SELECT distinct partition FROM country_name") + partitions = set([r[0] for r in list(cur)]) + if no_partitions: + assert partitions == set([0]) + else: + assert len(partitions) > 10 + finally: + conn.close() + def test_create_db_success(nonexistant_db): database_import.create_db('dbname=' + nonexistant_db, rouser='www-data') @@ -79,6 +97,22 @@ def test_install_module(tmp_path): assert outfile.stat().st_mode == 33261 +def test_install_module_custom(tmp_path): + (tmp_path / 'nominatim.so').write_text('TEST nomiantim.so') + + database_import.install_module(tmp_path, tmp_path, str(tmp_path.resolve())) + + assert not (tmp_path / 'module').exists() + + +def test_install_module_fail_access(temp_db_conn, tmp_path): + (tmp_path / 'nominatim.so').write_text('TEST nomiantim.so') + + with pytest.raises(UsageError, match='.*module cannot be accessed.*'): + database_import.install_module(tmp_path, tmp_path, '', + conn=temp_db_conn) + + def test_import_base_data(src_dir, temp_db, temp_db_cursor): temp_db_cursor.execute('CREATE EXTENSION hstore') temp_db_cursor.execute('CREATE EXTENSION postgis') @@ -134,3 +168,35 @@ def test_import_osm_data_default_cache(temp_db_cursor,osm2pgsql_options): osm2pgsql_options['osm2pgsql_cache'] = 0 database_import.import_osm_data(Path(__file__), osm2pgsql_options) + + +def test_truncate_database_tables(temp_db_conn, temp_db_cursor, table_factory): + tables = ('word', 'placex', 'place_addressline', 'location_area', + 'location_area_country', 'location_property', + 'location_property_tiger', 'location_property_osmline', + 'location_postcode', 'search_name', 'location_road_23') + for table in tables: + table_factory(table, content=(1, 2, 3)) + + database_import.truncate_data_tables(temp_db_conn, max_word_frequency=23) + + for table in tables: + assert temp_db_cursor.table_rows(table) == 0 + + +@pytest.mark.parametrize("threads", (1, 5)) +def test_load_data(dsn, src_dir, place_row, placex_table, osmline_table, word_table, + temp_db_cursor, threads): + for func in ('make_keywords', 'getorcreate_housenumber_id', 'make_standard_name'): + temp_db_cursor.execute("""CREATE FUNCTION {} (src TEXT) + RETURNS TEXT AS $$ SELECT 'a' $$ LANGUAGE SQL + """.format(func)) + for oid in range(100, 130): + place_row(osm_id=oid) + place_row(osm_type='W', osm_id=342, cls='place', typ='houses', + geom='SRID=4326;LINESTRING(0 0, 10 10)') + + database_import.load_data(dsn, src_dir / 'data', threads) + + assert temp_db_cursor.table_rows('placex') == 30 + assert temp_db_cursor.table_rows('location_property_osmline') == 1 From dd03aeb966c2a101759d447a0fa00340812c89da Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Fri, 26 Feb 2021 16:14:29 +0100 Subject: [PATCH 13/18] bdd: use python library where possible Replace calls to PHP scripts with direct calls into the nominatim Python library where possible. This speed up tests quite a bit. 
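
For illustration, the new run_nominatim() helper added below drives a
subcommand through the library entry point roughly like this (the paths
and the environment are placeholders here, not real test configuration):

    import os
    from nominatim import cli

    # Equivalent of running "nominatim refresh --functions" as a process,
    # but executed in-process via the Python library.
    cli.nominatim(module_dir='', phpcgi_path='',
                  osm2pgsql_path='build/osm2pgsql/osm2pgsql',
                  phplib_dir='lib-php', sqllib_dir='lib-sql',
                  data_dir='data', config_dir='settings',
                  environ=os.environ,
                  cli_args=['refresh', '--functions'])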
--- lib-php/admin/setup.php | 4 +++ nominatim/cli.py | 12 +++++---- nominatim/clicmd/setup.py | 3 ++- nominatim/clicmd/transition.py | 5 +++- nominatim/tools/database_import.py | 11 ++++---- test/bdd/steps/nominatim_environment.py | 36 +++++++++++++++++-------- test/bdd/steps/steps_db_ops.py | 16 ++++++----- test/bdd/steps/steps_osm_data.py | 5 ++-- 8 files changed, 60 insertions(+), 32 deletions(-) diff --git a/lib-php/admin/setup.php b/lib-php/admin/setup.php index 6493460d..3aa6b828 100644 --- a/lib-php/admin/setup.php +++ b/lib-php/admin/setup.php @@ -69,7 +69,11 @@ $iInstances = max(1, $aCMDResult['threads'] ?? (min(16, getProcessorCount()) - 1 function run($oCmd) { global $iInstances; + global $aCMDResult; $oCmd->addParams('--threads', $iInstances); + if ($aCMDResult['ignore-errors'] ?? false) { + $oCmd->addParams('--ignore-errors'); + } $oCmd->run(true); } diff --git a/nominatim/cli.py b/nominatim/cli.py index 35c6c1f0..7459711f 100644 --- a/nominatim/cli.py +++ b/nominatim/cli.py @@ -75,12 +75,14 @@ class CommandlineParser: setattr(args, arg, Path(kwargs[arg])) args.project_dir = Path(args.project_dir).resolve() - logging.basicConfig(stream=sys.stderr, - format='%(asctime)s: %(message)s', - datefmt='%Y-%m-%d %H:%M:%S', - level=max(4 - args.verbose, 1) * 10) + if 'cli_args' not in kwargs: + logging.basicConfig(stream=sys.stderr, + format='%(asctime)s: %(message)s', + datefmt='%Y-%m-%d %H:%M:%S', + level=max(4 - args.verbose, 1) * 10) - args.config = Configuration(args.project_dir, args.config_dir) + args.config = Configuration(args.project_dir, args.config_dir, + environ=kwargs.get('environ', os.environ)) log = logging.getLogger() log.warning('Using project directory: %s', str(args.project_dir)) diff --git a/nominatim/clicmd/setup.py b/nominatim/clicmd/setup.py index 8f717cbb..f8229b39 100644 --- a/nominatim/clicmd/setup.py +++ b/nominatim/clicmd/setup.py @@ -75,7 +75,8 @@ class SetupAll: LOG.warning('Importing OSM data file') database_import.import_osm_data(Path(args.osm_file), args.osm2pgsql_options(0, 1), - drop=args.no_updates) + drop=args.no_updates, + ignore_errors=args.ignore_errors) LOG.warning('Create functions (1st pass)') with connect(args.config.get_libpq_dsn()) as conn: diff --git a/nominatim/clicmd/transition.py b/nominatim/clicmd/transition.py index 1d78062e..210eec70 100644 --- a/nominatim/clicmd/transition.py +++ b/nominatim/clicmd/transition.py @@ -48,6 +48,8 @@ class AdminTransition: help='Size of cache to be used by osm2pgsql (in MB)') group.add_argument('--no-analyse', action='store_true', help='Do not perform analyse operations during index') + group.add_argument('--ignore-errors', action='store_true', + help="Ignore certain erros on import.") @staticmethod def run(args): @@ -75,7 +77,8 @@ class AdminTransition: raise UsageError('Missing required --osm-file argument') database_import.import_osm_data(Path(args.osm_file), args.osm2pgsql_options(0, 1), - drop=args.drop) + drop=args.drop, + ignore_errors=args.ignore_errors) if args.load_data: LOG.warning('Load data') diff --git a/nominatim/tools/database_import.py b/nominatim/tools/database_import.py index 6e65e73a..7d71386f 100644 --- a/nominatim/tools/database_import.py +++ b/nominatim/tools/database_import.py @@ -145,7 +145,7 @@ def import_base_data(dsn, sql_dir, ignore_partitions=False): conn.commit() -def import_osm_data(osm_file, options, drop=False): +def import_osm_data(osm_file, options, drop=False, ignore_errors=False): """ Import the given OSM file. 
'options' contains the list of default settings for osm2pgsql. """ @@ -164,10 +164,11 @@ def import_osm_data(osm_file, options, drop=False): run_osm2pgsql(options) with connect(options['dsn']) as conn: - with conn.cursor() as cur: - cur.execute('SELECT * FROM place LIMIT 1') - if cur.rowcount == 0: - raise UsageError('No data imported by osm2pgsql.') + if not ignore_errors: + with conn.cursor() as cur: + cur.execute('SELECT * FROM place LIMIT 1') + if cur.rowcount == 0: + raise UsageError('No data imported by osm2pgsql.') if drop: conn.drop_table('planet_osm_nodes') diff --git a/test/bdd/steps/nominatim_environment.py b/test/bdd/steps/nominatim_environment.py index 811faf5c..168334b1 100644 --- a/test/bdd/steps/nominatim_environment.py +++ b/test/bdd/steps/nominatim_environment.py @@ -7,6 +7,7 @@ import psycopg2.extras sys.path.insert(1, str((Path(__file__) / '..' / '..' / '..' / '..').resolve())) +from nominatim import cli from nominatim.config import Configuration from nominatim.tools import refresh from steps.utils import run_script @@ -88,18 +89,18 @@ class NominatimEnvironment: self.test_env['NOMINATIM_FLATNODE_FILE'] = '' self.test_env['NOMINATIM_IMPORT_STYLE'] = 'full' self.test_env['NOMINATIM_USE_US_TIGER_DATA'] = 'yes' - self.test_env['NOMINATIM_DATADIR'] = self.src_dir / 'data' - self.test_env['NOMINATIM_SQLDIR'] = self.src_dir / 'lib-sql' - self.test_env['NOMINATIM_CONFIGDIR'] = self.src_dir / 'settings' - self.test_env['NOMINATIM_DATABASE_MODULE_SRC_PATH'] = self.build_dir / 'module' - self.test_env['NOMINATIM_OSM2PGSQL_BINARY'] = self.build_dir / 'osm2pgsql' / 'osm2pgsql' - self.test_env['NOMINATIM_NOMINATIM_TOOL'] = self.build_dir / 'nominatim' + self.test_env['NOMINATIM_DATADIR'] = str((self.src_dir / 'data').resolve()) + self.test_env['NOMINATIM_SQLDIR'] = str((self.src_dir / 'lib-sql').resolve()) + self.test_env['NOMINATIM_CONFIGDIR'] = str((self.src_dir / 'settings').resolve()) + self.test_env['NOMINATIM_DATABASE_MODULE_SRC_PATH'] = str((self.build_dir / 'module').resolve()) + self.test_env['NOMINATIM_OSM2PGSQL_BINARY'] = str((self.build_dir / 'osm2pgsql' / 'osm2pgsql').resolve()) + self.test_env['NOMINATIM_NOMINATIM_TOOL'] = str((self.build_dir / 'nominatim').resolve()) if self.server_module_path: self.test_env['NOMINATIM_DATABASE_MODULE_PATH'] = self.server_module_path else: # avoid module being copied into the temporary environment - self.test_env['NOMINATIM_DATABASE_MODULE_PATH'] = self.build_dir / 'module' + self.test_env['NOMINATIM_DATABASE_MODULE_PATH'] = str((self.build_dir / 'module').resolve()) if self.website_dir is not None: self.website_dir.cleanup() @@ -182,9 +183,9 @@ class NominatimEnvironment: self.test_env['NOMINATIM_WIKIPEDIA_DATA_PATH'] = str(testdata.resolve()) try: - self.run_setup_script('all', osm_file=self.api_test_file) + self.run_nominatim('import', '--osm-file', str(self.api_test_file)) self.run_setup_script('import-tiger-data') - self.run_setup_script('drop') + self.run_nominatim('freeze') phrase_file = str((testdata / 'specialphrases_testdb.sql').resolve()) run_script(['psql', '-d', self.api_test_db, '-f', phrase_file]) @@ -249,12 +250,25 @@ class NominatimEnvironment: """ with db.cursor() as cur: while True: - self.run_update_script('index') + self.run_nominatim('index') cur.execute("SELECT 'a' FROM placex WHERE indexed_status != 0 LIMIT 1") if cur.rowcount == 0: return + def run_nominatim(self, *cmdline): + """ Run the nominatim command-line tool via the library. 
+ """ + cli.nominatim(module_dir='', + osm2pgsql_path=str(self.build_dir / 'osm2pgsql' / 'osm2pgsql'), + phplib_dir=str(self.src_dir / 'lib-php'), + sqllib_dir=str(self.src_dir / 'lib-sql'), + data_dir=str(self.src_dir / 'data'), + config_dir=str(self.src_dir / 'settings'), + cli_args=cmdline, + phpcgi_path='', + environ=self.test_env) + def run_setup_script(self, *args, **kwargs): """ Run the Nominatim setup script with the given arguments. """ @@ -285,7 +299,7 @@ class NominatimEnvironment: """ Copy data from place to the placex and location_property_osmline tables invoking the appropriate triggers. """ - self.run_setup_script('create-functions', 'create-partition-functions') + self.run_nominatim('refresh', '--functions', '--no-diff-updates') with db.cursor() as cur: cur.execute("""INSERT INTO placex (osm_type, osm_id, class, type, diff --git a/test/bdd/steps/steps_db_ops.py b/test/bdd/steps/steps_db_ops.py index c549f3eb..9d443b43 100644 --- a/test/bdd/steps/steps_db_ops.py +++ b/test/bdd/steps/steps_db_ops.py @@ -5,6 +5,7 @@ import psycopg2.extras from place_inserter import PlaceColumn from table_compare import NominatimID, DBRow +from nominatim.indexer.indexer import Indexer def check_database_integrity(context): """ Check some generic constraints on the tables. @@ -85,7 +86,12 @@ def import_and_index_data_from_place_table(context): """ Import data previously set up in the place table. """ context.nominatim.copy_from_place(context.db) - context.nominatim.run_setup_script('calculate-postcodes', 'index', 'index-noanalyse') + context.nominatim.run_setup_script('calculate-postcodes') + + # Call directly as the refresh function does not include postcodes. + indexer = Indexer(context.nominatim.test_env['NOMINATIM_DATABASE_DSN'][6:], 1) + indexer.index_full(analyse=False) + check_database_integrity(context) @when("updating places") @@ -93,8 +99,7 @@ def update_place_table(context): """ Update the place table with the given data. Also runs all triggers related to updates and reindexes the new data. """ - context.nominatim.run_setup_script( - 'create-functions', 'create-partition-functions', 'enable-diff-updates') + context.nominatim.run_nominatim('refresh', '--functions') with context.db.cursor() as cur: for row in context.table: PlaceColumn(context).add_row(row, False).db_insert(cur) @@ -106,7 +111,7 @@ def update_place_table(context): def update_postcodes(context): """ Rerun the calculation of postcodes. """ - context.nominatim.run_update_script('calculate-postcodes') + context.nominatim.run_nominatim('refresh', '--postcodes') @when("marking for delete (?P.*)") def delete_places(context, oids): @@ -114,8 +119,7 @@ def delete_places(context, oids): separated by commas. Also runs all triggers related to updates and reindexes the new data. """ - context.nominatim.run_setup_script( - 'create-functions', 'create-partition-functions', 'enable-diff-updates') + context.nominatim.run_nominatim('refresh', '--functions') with context.db.cursor() as cur: for oid in oids.split(','): NominatimID(oid).query_osm_id(cur, 'DELETE FROM place WHERE {}') diff --git a/test/bdd/steps/steps_osm_data.py b/test/bdd/steps/steps_osm_data.py index 3858198b..844fb274 100644 --- a/test/bdd/steps/steps_osm_data.py +++ b/test/bdd/steps/steps_osm_data.py @@ -75,9 +75,8 @@ def update_from_osm_file(context): The data is expected as attached text in OPL format. 
""" context.nominatim.copy_from_place(context.db) - context.nominatim.run_setup_script('index', 'index-noanalyse') - context.nominatim.run_setup_script('create-functions', 'create-partition-functions', - 'enable-diff-updates') + context.nominatim.run_nominatim('index') + context.nominatim.run_nominatim('refresh', '--functions') # create an OSM file and import it fname = write_opl_file(context.text, context.osm) From c7f40e3cee2853ade0e16a90ab7a3b29b02265f9 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Fri, 26 Feb 2021 16:33:30 +0100 Subject: [PATCH 14/18] fix verbose flag for PHP wrapper scripts The flag must come after the command. --- lib-php/admin/setup.php | 15 ++++++++------- lib-php/admin/update.php | 33 +++++++++++++++++++-------------- nominatim/db/utils.py | 2 ++ 3 files changed, 29 insertions(+), 21 deletions(-) diff --git a/lib-php/admin/setup.php b/lib-php/admin/setup.php index 3aa6b828..f8e360bb 100644 --- a/lib-php/admin/setup.php +++ b/lib-php/admin/setup.php @@ -57,23 +57,24 @@ setupHTTPProxy(); $bDidSomething = false; $oNominatimCmd = new \Nominatim\Shell(getSetting('NOMINATIM_TOOL')); -if (isset($aCMDResult['quiet']) && $aCMDResult['quiet']) { - $oNominatimCmd->addParams('--quiet'); -} -if ($aCMDResult['verbose']) { - $oNominatimCmd->addParams('--verbose'); -} // by default, use all but one processor, but never more than 15. $iInstances = max(1, $aCMDResult['threads'] ?? (min(16, getProcessorCount()) - 1)); -function run($oCmd) { +function run($oCmd) +{ global $iInstances; global $aCMDResult; $oCmd->addParams('--threads', $iInstances); if ($aCMDResult['ignore-errors'] ?? false) { $oCmd->addParams('--ignore-errors'); } + if ($aCMDResult['quiet'] ?? false) { + $oCmd->addParams('--quiet'); + } + if ($aCMDResult['verbose'] ?? false) { + $oCmd->addParams('--verbose'); + } $oCmd->run(true); } diff --git a/lib-php/admin/update.php b/lib-php/admin/update.php index 4f639c8d..81e0b832 100644 --- a/lib-php/admin/update.php +++ b/lib-php/admin/update.php @@ -104,11 +104,17 @@ if ($fPostgresVersion >= 11.0) { } $oNominatimCmd = new \Nominatim\Shell(getSetting('NOMINATIM_TOOL')); -if ($aResult['quiet']) { - $oNominatimCmd->addParams('--quiet'); -} -if ($aResult['verbose']) { - $oNominatimCmd->addParams('--verbose'); + +function run($oCmd) +{ + global $aCMDResult; + if ($aCMDResult['quiet'] ?? false) { + $oCmd->addParams('--quiet'); + } + if ($aCMDResult['verbose'] ?? 
false) { + $oCmd->addParams('--verbose'); + } + $oCmd->run(true); } @@ -119,7 +125,7 @@ if ($aResult['init-updates']) { $oCmd->addParams('--no-update-functions'); } - $oCmd->run(); + run($oCmd); } if ($aResult['check-for-updates']) { @@ -147,7 +153,7 @@ if (isset($aResult['import-diff']) || isset($aResult['import-file'])) { } if ($aResult['calculate-postcodes']) { - (clone($oNominatimCmd))->addParams('refresh', '--postcodes')->run(); + run((clone($oNominatimCmd))->addParams('refresh', '--postcodes')); } $sTemporaryFile = CONST_InstallDir.'/osmosischange.osc'; @@ -196,22 +202,21 @@ if ($bHaveDiff) { } if ($aResult['recompute-word-counts']) { - (clone($oNominatimCmd))->addParams('refresh', '--word-counts')->run(); + run((clone($oNominatimCmd))->addParams('refresh', '--word-counts')); } if ($aResult['index']) { - (clone $oNominatimCmd) + run((clone $oNominatimCmd) ->addParams('index', '--minrank', $aResult['index-rank']) - ->addParams('--threads', $aResult['index-instances']) - ->run(); + ->addParams('--threads', $aResult['index-instances'])); } if ($aResult['update-address-levels']) { - (clone($oNominatimCmd))->addParams('refresh', '--address-levels')->run(); + run((clone($oNominatimCmd))->addParams('refresh', '--address-levels')); } if ($aResult['recompute-importance']) { - (clone($oNominatimCmd))->addParams('refresh', '--importance')->run(true); + run((clone($oNominatimCmd))->addParams('refresh', '--importance')); } if ($aResult['import-osmosis'] || $aResult['import-osmosis-all']) { @@ -227,5 +232,5 @@ if ($aResult['import-osmosis'] || $aResult['import-osmosis-all']) { $oCmd->addParams('--no-index'); } - exit($oCmd->run()); + run($oCmd); } diff --git a/nominatim/db/utils.py b/nominatim/db/utils.py index 0490bbc8..6d2eb297 100644 --- a/nominatim/db/utils.py +++ b/nominatim/db/utils.py @@ -31,6 +31,8 @@ def execute_file(dsn, fname, ignore_errors=False, pre_code=None, post_code=None) cmd = ['psql'] if not ignore_errors: cmd.extend(('-v', 'ON_ERROR_STOP=1')) + if not LOG.isEnabledFor(logging.INFO): + cmd.append('--quiet') proc = subprocess.Popen(cmd, env=get_pg_env(dsn), stdin=subprocess.PIPE) if not LOG.isEnabledFor(logging.INFO): From 9feb84e426265c5543a6adccb5e400881c3f3cad Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Fri, 26 Feb 2021 16:50:09 +0100 Subject: [PATCH 15/18] actions: add psutil dependency --- .github/actions/build-nominatim/action.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/actions/build-nominatim/action.yml b/.github/actions/build-nominatim/action.yml index d62ecf86..f22be81a 100644 --- a/.github/actions/build-nominatim/action.yml +++ b/.github/actions/build-nominatim/action.yml @@ -6,7 +6,7 @@ runs: steps: - name: Install prerequisits run: | - sudo apt-get install -y -qq libboost-system-dev libboost-filesystem-dev libexpat1-dev zlib1g-dev libbz2-dev libpq-dev libproj-dev python3-psycopg2 python3-pyosmium python3-dotenv + sudo apt-get install -y -qq libboost-system-dev libboost-filesystem-dev libexpat1-dev zlib1g-dev libbz2-dev libpq-dev libproj-dev python3-psycopg2 python3-pyosmium python3-dotenv python3-psutil shell: bash - name: Download dependencies From d14a3df10fe0a2834fd38770179bf0318f5eecfa Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Fri, 26 Feb 2021 18:20:53 +0100 Subject: [PATCH 16/18] do not truncate search_name in reverse-only mode --- nominatim/tools/database_import.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/nominatim/tools/database_import.py b/nominatim/tools/database_import.py index 
7d71386f..ab8ccdd2 100644 --- a/nominatim/tools/database_import.py +++ b/nominatim/tools/database_import.py @@ -191,7 +191,8 @@ def truncate_data_tables(conn, max_word_frequency=None): cur.execute('TRUNCATE location_property_tiger') cur.execute('TRUNCATE location_property_osmline') cur.execute('TRUNCATE location_postcode') - cur.execute('TRUNCATE search_name') + if conn.table_exists('search_name'): + cur.execute('TRUNCATE search_name') cur.execute('DROP SEQUENCE IF EXISTS seq_place') cur.execute('CREATE SEQUENCE seq_place start 100000') From afabbeb546b94bfcdc1b5fbba37c04b5e07b54ed Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Fri, 26 Feb 2021 18:23:24 +0100 Subject: [PATCH 17/18] older versions of Postgresql need explicit return type --- test/python/test_tools_database_import.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/python/test_tools_database_import.py b/test/python/test_tools_database_import.py index f9760fc0..45324834 100644 --- a/test/python/test_tools_database_import.py +++ b/test/python/test_tools_database_import.py @@ -189,7 +189,7 @@ def test_load_data(dsn, src_dir, place_row, placex_table, osmline_table, word_ta temp_db_cursor, threads): for func in ('make_keywords', 'getorcreate_housenumber_id', 'make_standard_name'): temp_db_cursor.execute("""CREATE FUNCTION {} (src TEXT) - RETURNS TEXT AS $$ SELECT 'a' $$ LANGUAGE SQL + RETURNS TEXT AS $$ SELECT 'a'::TEXT $$ LANGUAGE SQL """.format(func)) for oid in range(100, 130): place_row(osm_id=oid) From b46adbad224eabd915fa5363b719afd0c8571476 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Sat, 27 Feb 2021 10:24:40 +0100 Subject: [PATCH 18/18] make sure psql always finishes If an execption is raised by other means, we still have to close the stdin pipe to psql to make sure that it exits and releases its connection to the database. --- nominatim/db/utils.py | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/nominatim/db/utils.py b/nominatim/db/utils.py index 6d2eb297..0a2e2c06 100644 --- a/nominatim/db/utils.py +++ b/nominatim/db/utils.py @@ -35,24 +35,25 @@ def execute_file(dsn, fname, ignore_errors=False, pre_code=None, post_code=None) cmd.append('--quiet') proc = subprocess.Popen(cmd, env=get_pg_env(dsn), stdin=subprocess.PIPE) - if not LOG.isEnabledFor(logging.INFO): - proc.stdin.write('set client_min_messages to WARNING;'.encode('utf-8')) + try: + if not LOG.isEnabledFor(logging.INFO): + proc.stdin.write('set client_min_messages to WARNING;'.encode('utf-8')) - if pre_code: - proc.stdin.write((pre_code + ';').encode('utf-8')) + if pre_code: + proc.stdin.write((pre_code + ';').encode('utf-8')) - if fname.suffix == '.gz': - with gzip.open(str(fname), 'rb') as fdesc: - remain = _pipe_to_proc(proc, fdesc) - else: - with fname.open('rb') as fdesc: - remain = _pipe_to_proc(proc, fdesc) + if fname.suffix == '.gz': + with gzip.open(str(fname), 'rb') as fdesc: + remain = _pipe_to_proc(proc, fdesc) + else: + with fname.open('rb') as fdesc: + remain = _pipe_to_proc(proc, fdesc) - if remain == 0 and post_code: - proc.stdin.write((';' + post_code).encode('utf-8')) + if remain == 0 and post_code: + proc.stdin.write((';' + post_code).encode('utf-8')) + finally: + proc.stdin.close() + ret = proc.wait() - proc.stdin.close() - - ret = proc.wait() if ret != 0 or remain > 0: raise UsageError("Failed to execute SQL file.")
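
A minimal, self-contained sketch of the try/finally pattern applied above —
closing the child's stdin even when writing raises, so that psql (or any
piped subprocess) sees EOF, terminates and releases its connection. This is
illustrative code only, not part of the patches:

    import subprocess

    def feed_chunks(cmd, chunks):
        """ Pipe byte chunks into 'cmd' and wait for the process to finish.
        """
        proc = subprocess.Popen(cmd, stdin=subprocess.PIPE)
        try:
            for chunk in chunks:
                proc.stdin.write(chunk)
        finally:
            # Always close the pipe, even when an exception was raised while
            # writing, otherwise the child keeps waiting for more input.
            proc.stdin.close()
            ret = proc.wait()

        if ret != 0:
            raise RuntimeError('{} failed with status {}'.format(cmd[0], ret))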