From e6d9485c4ae566896f4e9e474d7d3588bc7b4b25 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Thu, 21 Jan 2021 10:19:38 +0100 Subject: [PATCH 01/13] cli: import python modules for commands on demand Given that only one command will be executed in the end, it is not necessary to import what amounts to the whole library. This becomes in particular important for update functions that have a dependency on pyosmium. The dependency can remain optional for people not using updates. --- nominatim/cli.py | 4 ++-- test/python/test_cli.py | 5 +++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/nominatim/cli.py b/nominatim/cli.py index 6c110ce7..c558eb84 100644 --- a/nominatim/cli.py +++ b/nominatim/cli.py @@ -11,8 +11,6 @@ from pathlib import Path from .config import Configuration from .tools.exec_utils import run_legacy_script, run_api_script -from .indexer.indexer import Indexer - def _num_system_cpus(): try: cpus = len(os.sched_getaffinity(0)) @@ -320,6 +318,8 @@ class UpdateIndex: @staticmethod def run(args): + from .indexer.indexer import Indexer + indexer = Indexer(args.config.get_libpq_dsn(), args.threads or _num_system_cpus() or 1) diff --git a/test/python/test_cli.py b/test/python/test_cli.py index 92b42372..9ac62973 100644 --- a/test/python/test_cli.py +++ b/test/python/test_cli.py @@ -5,6 +5,7 @@ import psycopg2 import pytest import nominatim.cli +import nominatim.indexer.indexer def call_nominatim(*args): return nominatim.cli.nominatim(module_dir='build/module', @@ -87,9 +88,9 @@ def test_index_command(monkeypatch, temp_db, params, do_bnds, do_ranks): with conn.cursor() as cur: cur.execute("CREATE TABLE import_status (indexed bool)") bnd_mock = MockParamCapture() - monkeypatch.setattr(nominatim.cli.Indexer, 'index_boundaries', bnd_mock) + monkeypatch.setattr(nominatim.indexer.indexer.Indexer, 'index_boundaries', bnd_mock) rank_mock = MockParamCapture() - monkeypatch.setattr(nominatim.cli.Indexer, 'index_by_rank', rank_mock) + monkeypatch.setattr(nominatim.indexer.indexer.Indexer, 'index_by_rank', rank_mock) assert 0 == call_nominatim('index', *params) From e6c2842b66c607400a0a95b7b8e8de8cd5b12d51 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Fri, 22 Jan 2021 23:25:37 +0100 Subject: [PATCH 02/13] move update code for postcode and word count to Python Adds also tests for the new function to execute a SQL script. --- lib/admin/update.php | 23 +++++--------- nominatim/cli.py | 59 ++++++++++++++++++++---------------- nominatim/db/utils.py | 11 +++++++ nominatim/tools/refresh.py | 16 ++++++++++ test/python/test_cli.py | 16 ++++++++-- test/python/test_db_utils.py | 33 ++++++++++++++++++++ 6 files changed, 114 insertions(+), 44 deletions(-) create mode 100644 nominatim/db/utils.py create mode 100644 nominatim/tools/refresh.py create mode 100644 test/python/test_db_utils.py diff --git a/lib/admin/update.php b/lib/admin/update.php index fe9658b5..48609c3e 100644 --- a/lib/admin/update.php +++ b/lib/admin/update.php @@ -104,14 +104,12 @@ if ($fPostgresVersion >= 11.0) { ); } - -$oIndexCmd = (new \Nominatim\Shell(getSetting('NOMINATIM_TOOL'))) - ->addParams('index'); +$oNominatimCmd = new \Nominatim\Shell(getSetting('NOMINATIM_TOOL')); if ($aResult['quiet']) { - $oIndexCmd->addParams('--quiet'); + $oNominatimCmd->addParams('--quiet'); } if ($aResult['verbose']) { - $oIndexCmd->addParams('--verbose'); + $oNominatimCmd->addParams('--verbose'); } $sPyosmiumBin = getSetting('PYOSMIUM_BINARY'); @@ -220,9 +218,7 @@ if (isset($aResult['import-diff']) || isset($aResult['import-file'])) { } if ($aResult['calculate-postcodes']) { - info('Update postcodes centroids'); - $sTemplate = file_get_contents(CONST_DataDir.'/sql/update-postcodes.sql'); - runSQLScript($sTemplate, true, true); + (clone($oNominatimCmd))->addParams('refresh', '--postcodes')->run(); } $sTemporaryFile = CONST_InstallDir.'/osmosischange.osc'; @@ -271,15 +267,11 @@ if ($bHaveDiff) { } if ($aResult['recompute-word-counts']) { - info('Recompute frequency of full-word search terms'); - $sTemplate = file_get_contents(CONST_DataDir.'/sql/words_from_search_name.sql'); - runSQLScript($sTemplate, true, true); + (clone($oNominatimCmd))->addParams('refresh', '--word-counts')->run(); } if ($aResult['index']) { - $oCmd = (clone $oIndexCmd) - ->addParams('--minrank', $aResult['index-rank']); - $oCmd->run(); + (clone $oNominatimCmd)->addParams('index', '--minrank', $aResult['index-rank'])->run(); } if ($aResult['update-address-levels']) { @@ -421,7 +413,8 @@ if ($aResult['import-osmosis'] || $aResult['import-osmosis-all']) { if (!$aResult['no-index']) { $fCMDStartTime = time(); - $oThisIndexCmd = clone($oIndexCmd); + $oThisIndexCmd = clone($oNominatimCmd); + $oThisIndexCmd->addParams('index'); echo $oThisIndexCmd->escapedCmd()."\n"; $iErrorLevel = $oThisIndexCmd->run(); if ($iErrorLevel) { diff --git a/nominatim/cli.py b/nominatim/cli.py index c558eb84..4388902d 100644 --- a/nominatim/cli.py +++ b/nominatim/cli.py @@ -8,9 +8,13 @@ import argparse import logging from pathlib import Path +import psycopg2 + from .config import Configuration from .tools.exec_utils import run_legacy_script, run_api_script +LOG = logging.getLogger() + def _num_system_cpus(): try: cpus = len(os.sched_getaffinity(0)) @@ -366,32 +370,35 @@ class UpdateRefresh: @staticmethod def run(args): - if args.postcodes: - run_legacy_script('update.php', '--calculate-postcodes', - nominatim_env=args, throw_on_fail=True) - if args.word_counts: - run_legacy_script('update.php', '--recompute-word-counts', - nominatim_env=args, throw_on_fail=True) - if args.address_levels: - run_legacy_script('update.php', '--update-address-levels', - nominatim_env=args, throw_on_fail=True) - if args.functions: - params = ['setup.php', '--create-functions', '--create-partition-functions'] - if args.diffs: - params.append('--enable-diff-updates') - if args.enable_debug_statements: - params.append('--enable-debug-statements') - run_legacy_script(*params, nominatim_env=args, throw_on_fail=True) - if args.wiki_data: - run_legacy_script('setup.php', '--import-wikipedia-articles', - nominatim_env=args, throw_on_fail=True) - # Attention: importance MUST come after wiki data import. - if args.importance: - run_legacy_script('update.php', '--recompute-importance', - nominatim_env=args, throw_on_fail=True) - if args.website: - run_legacy_script('setup.php', '--setup-website', - nominatim_env=args, throw_on_fail=True) + import nominatim.tools.refresh + + with psycopg2.connect(args.config.get_libpq_dsn()) as conn: + if args.postcodes: + LOG.warning("Update postcodes centroid") + nominatim.tools.refresh.update_postcodes(conn, args.data_dir) + if args.word_counts: + LOG.warning('Recompute frequency of full-word search terms') + nominatim.tools.refresh.recompute_word_counts(conn, args.data_dir) + if args.address_levels: + run_legacy_script('update.php', '--update-address-levels', + nominatim_env=args, throw_on_fail=True) + if args.functions: + params = ['setup.php', '--create-functions', '--create-partition-functions'] + if args.diffs: + params.append('--enable-diff-updates') + if args.enable_debug_statements: + params.append('--enable-debug-statements') + run_legacy_script(*params, nominatim_env=args, throw_on_fail=True) + if args.wiki_data: + run_legacy_script('setup.php', '--import-wikipedia-articles', + nominatim_env=args, throw_on_fail=True) + # Attention: importance MUST come after wiki data import. + if args.importance: + run_legacy_script('update.php', '--recompute-importance', + nominatim_env=args, throw_on_fail=True) + if args.website: + run_legacy_script('setup.php', '--setup-website', + nominatim_env=args, throw_on_fail=True) return 0 diff --git a/nominatim/db/utils.py b/nominatim/db/utils.py new file mode 100644 index 00000000..1a39746e --- /dev/null +++ b/nominatim/db/utils.py @@ -0,0 +1,11 @@ +""" +Helper functions for handling DB accesses. +""" + +def execute_file(conn, fname): + """ Read an SQL file and run its contents against the given connection. + """ + with fname.open('r') as fdesc: + sql = fdesc.read() + with conn.cursor() as cur: + cur.execute(sql) diff --git a/nominatim/tools/refresh.py b/nominatim/tools/refresh.py new file mode 100644 index 00000000..859b5646 --- /dev/null +++ b/nominatim/tools/refresh.py @@ -0,0 +1,16 @@ +""" +Functions for bringing auxiliary data in the database up-to-date. +""" +from ..db.utils import execute_file + +def update_postcodes(conn, datadir): + """ Recalculate postcode centroids and add, remove and update entries in the + location_postcode table. `conn` is an opne connection to the database. + """ + execute_file(conn, datadir / 'sql' / 'update-postcodes.sql') + + +def recompute_word_counts(conn, datadir): + """ Compute the frequency of full-word search terms. + """ + execute_file(conn, datadir / 'sql' / 'words_from_search_name.sql') diff --git a/test/python/test_cli.py b/test/python/test_cli.py index 9ac62973..33c65ade 100644 --- a/test/python/test_cli.py +++ b/test/python/test_cli.py @@ -6,6 +6,7 @@ import pytest import nominatim.cli import nominatim.indexer.indexer +import nominatim.tools.refresh def call_nominatim(*args): return nominatim.cli.nominatim(module_dir='build/module', @@ -99,21 +100,30 @@ def test_index_command(monkeypatch, temp_db, params, do_bnds, do_ranks): @pytest.mark.parametrize("command,params", [ - ('postcodes', ('update.php', '--calculate-postcodes')), - ('word-counts', ('update.php', '--recompute-word-counts')), ('address-levels', ('update.php', '--update-address-levels')), ('functions', ('setup.php',)), ('wiki-data', ('setup.php', '--import-wikipedia-articles')), ('importance', ('update.php', '--recompute-importance')), ('website', ('setup.php', '--setup-website')), ]) -def test_refresh_command(mock_run_legacy, command, params): +def test_refresh_legacy_command(mock_run_legacy, command, params): assert 0 == call_nominatim('refresh', '--' + command) assert mock_run_legacy.called == 1 assert len(mock_run_legacy.last_args) >= len(params) assert mock_run_legacy.last_args[:len(params)] == params +@pytest.mark.parametrize("command,func", [ + ('postcodes', 'update_postcodes'), + ('word-counts', 'recompute_word_counts'), + ]) +def test_refresh_command(monkeypatch, command, func): + func_mock = MockParamCapture() + monkeypatch.setattr(nominatim.tools.refresh, func, func_mock) + + assert 0 == call_nominatim('refresh', '--' + command) + + assert func_mock.called == 1 def test_refresh_importance_computed_after_wiki_import(mock_run_legacy): assert 0 == call_nominatim('refresh', '--importance', '--wiki-data') diff --git a/test/python/test_db_utils.py b/test/python/test_db_utils.py new file mode 100644 index 00000000..3210721e --- /dev/null +++ b/test/python/test_db_utils.py @@ -0,0 +1,33 @@ +""" +Tests for DB utility functions in db.utils +""" +import psycopg2 +import pytest + +import nominatim.db.utils as db_utils + +def test_execute_file_success(temp_db, tmp_path): + tmpfile = tmp_path / 'test.sql' + tmpfile.write_text('CREATE TABLE test (id INT);\nINSERT INTO test VALUES(56);') + + with psycopg2.connect('dbname=' + temp_db) as conn: + db_utils.execute_file(conn, tmpfile) + + with conn.cursor() as cur: + cur.execute('SELECT * FROM test') + + assert cur.rowcount == 1 + assert cur.fetchone()[0] == 56 + +def test_execute_file_bad_file(temp_db, tmp_path): + with psycopg2.connect('dbname=' + temp_db) as conn: + with pytest.raises(FileNotFoundError): + db_utils.execute_file(conn, tmp_path / 'test2.sql') + +def test_execute_file_bad_sql(temp_db, tmp_path): + tmpfile = tmp_path / 'test.sql' + tmpfile.write_text('CREATE STABLE test (id INT)') + + with psycopg2.connect('dbname=' + temp_db) as conn: + with pytest.raises(psycopg2.ProgrammingError): + db_utils.execute_file(conn, tmpfile) From 94fa7162be678dc74c1a5516e22916122ba66bbb Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Sat, 23 Jan 2021 17:25:14 +0100 Subject: [PATCH 03/13] port address level computation to Python Also adds simple tests for correct table creation. --- lib/admin/update.php | 6 +- lib/setup/AddressLevelParser.php | 98 ------------------- lib/setup/SetupClass.php | 24 +++-- nominatim/cli.py | 59 ++++++----- nominatim/config.py | 7 ++ nominatim/db/utils.py | 1 + nominatim/tools/refresh.py | 55 +++++++++++ test/python/conftest.py | 58 ++++++++++- test/python/test_cli.py | 8 +- test/python/test_db_utils.py | 27 +++-- test/python/test_indexing.py | 6 +- test/python/test_tools_exec_utils.py | 5 +- .../test_tools_refresh_address_levels.py | 85 ++++++++++++++++ 13 files changed, 268 insertions(+), 171 deletions(-) delete mode 100644 lib/setup/AddressLevelParser.php create mode 100644 test/python/test_tools_refresh_address_levels.py diff --git a/lib/admin/update.php b/lib/admin/update.php index 48609c3e..972a6fe5 100644 --- a/lib/admin/update.php +++ b/lib/admin/update.php @@ -4,7 +4,6 @@ require_once(CONST_LibDir.'/init-cmd.php'); require_once(CONST_LibDir.'/setup_functions.php'); require_once(CONST_LibDir.'/setup/SetupClass.php'); -require_once(CONST_LibDir.'/setup/AddressLevelParser.php'); ini_set('memory_limit', '800M'); @@ -275,10 +274,7 @@ if ($aResult['index']) { } if ($aResult['update-address-levels']) { - $sAddressLevelConfig = getSettingConfig('ADDRESS_LEVEL_CONFIG', 'address-levels.json'); - echo 'Updating address levels from '.$sAddressLevelConfig.".\n"; - $oAlParser = new \Nominatim\Setup\AddressLevelParser($sAddressLevelConfig); - $oAlParser->createTable($oDB, 'address_levels'); + (clone($oNominatimCmd))->addParams('refresh', '--address-levels')->run(); } if ($aResult['recompute-importance']) { diff --git a/lib/setup/AddressLevelParser.php b/lib/setup/AddressLevelParser.php deleted file mode 100644 index a399c955..00000000 --- a/lib/setup/AddressLevelParser.php +++ /dev/null @@ -1,98 +0,0 @@ -aLevels = json_decode($sJson, true); - if (!$this->aLevels) { - switch (json_last_error()) { - case JSON_ERROR_NONE: - break; - case JSON_ERROR_DEPTH: - fail('JSON error - Maximum stack depth exceeded'); - break; - case JSON_ERROR_STATE_MISMATCH: - fail('JSON error - Underflow or the modes mismatch'); - break; - case JSON_ERROR_CTRL_CHAR: - fail('JSON error - Unexpected control character found'); - break; - case JSON_ERROR_SYNTAX: - fail('JSON error - Syntax error, malformed JSON'); - break; - case JSON_ERROR_UTF8: - fail('JSON error - Malformed UTF-8 characters, possibly incorrectly encoded'); - break; - default: - fail('JSON error - Unknown error'); - break; - } - } - } - - /** - * Dump the description into a database table. - * - * @param object $oDB Database conneciton to use. - * @param string $sTable Name of table to create. - * - * @return null - * - * A new table is created. Any previously existing table is dropped. - * The table has the following columns: - * country, class, type, rank_search, rank_address. - */ - public function createTable($oDB, $sTable) - { - $oDB->exec('DROP TABLE IF EXISTS '.$sTable); - $sSql = 'CREATE TABLE '.$sTable; - $sSql .= '(country_code varchar(2), class TEXT, type TEXT,'; - $sSql .= ' rank_search SMALLINT, rank_address SMALLINT)'; - $oDB->exec($sSql); - - $sSql = 'CREATE UNIQUE INDEX ON '.$sTable.' (country_code, class, type)'; - $oDB->exec($sSql); - - $sSql = 'INSERT INTO '.$sTable.' VALUES '; - foreach ($this->aLevels as $aLevel) { - $aCountries = array(); - if (isset($aLevel['countries'])) { - foreach ($aLevel['countries'] as $sCountry) { - $aCountries[$sCountry] = $oDB->getDBQuoted($sCountry); - } - } else { - $aCountries['NULL'] = 'NULL'; - } - foreach ($aLevel['tags'] as $sKey => $aValues) { - foreach ($aValues as $sValue => $mRanks) { - $aFields = array( - $oDB->getDBQuoted($sKey), - $sValue ? $oDB->getDBQuoted($sValue) : 'NULL' - ); - if (is_array($mRanks)) { - $aFields[] = (string) $mRanks[0]; - $aFields[] = (string) $mRanks[1]; - } else { - $aFields[] = (string) $mRanks; - $aFields[] = (string) $mRanks; - } - $sLine = ','.join(',', $aFields).'),'; - - foreach ($aCountries as $sCountries) { - $sSql .= '('.$sCountries.$sLine; - } - } - } - } - $oDB->exec(rtrim($sSql, ',')); - } -} diff --git a/lib/setup/SetupClass.php b/lib/setup/SetupClass.php index d17fdca7..635949b9 100755 --- a/lib/setup/SetupClass.php +++ b/lib/setup/SetupClass.php @@ -2,7 +2,6 @@ namespace Nominatim\Setup; -require_once(CONST_LibDir.'/setup/AddressLevelParser.php'); require_once(CONST_LibDir.'/Shell.php'); class SetupFunctions @@ -19,6 +18,7 @@ class SetupFunctions protected $bNoPartitions; protected $bDrop; protected $oDB = null; + protected $oNominatimCmd; public function __construct(array $aCMDResult) { @@ -81,6 +81,14 @@ class SetupFunctions } $this->bDrop = isset($aCMDResult['drop']) && $aCMDResult['drop']; + + $this->oNominatimCmd = new \Nominatim\Shell(getSetting('NOMINATIM_TOOL')); + if ($this->bQuiet) { + $this->oNominatimCmd->addParams('--quiet'); + } + if ($this->bVerbose) { + $this->oNominatimCmd->addParams('--verbose'); + } } public function createDB() @@ -256,8 +264,7 @@ class SetupFunctions $this->dropTable('search_name'); } - $oAlParser = new AddressLevelParser(getSettingConfig('ADDRESS_LEVEL_CONFIG', 'address-levels.json')); - $oAlParser->createTable($this->db(), 'address_levels'); + (clone($this->oNominatimCmd))->addParams('refresh', '--address-levels')->run(); } public function createTableTriggers() @@ -549,19 +556,10 @@ class SetupFunctions { $this->checkModulePresence(); // raises exception on failure - $oBaseCmd = (new \Nominatim\Shell(getSetting('NOMINATIM_TOOL'))) - ->addParams('index'); - - if ($this->bQuiet) { - $oBaseCmd->addParams('-q'); - } - if ($this->bVerbose) { - $oBaseCmd->addParams('-v'); - } + $oBaseCmd = (clone $this->oNominatimCmd)->addParams('index'); info('Index ranks 0 - 4'); $oCmd = (clone $oBaseCmd)->addParams('--maxrank', 4); - echo $oCmd->escapedCmd(); $iStatus = $oCmd->run(); if ($iStatus != 0) { diff --git a/nominatim/cli.py b/nominatim/cli.py index 4388902d..00eb905e 100644 --- a/nominatim/cli.py +++ b/nominatim/cli.py @@ -372,33 +372,38 @@ class UpdateRefresh: def run(args): import nominatim.tools.refresh - with psycopg2.connect(args.config.get_libpq_dsn()) as conn: - if args.postcodes: - LOG.warning("Update postcodes centroid") - nominatim.tools.refresh.update_postcodes(conn, args.data_dir) - if args.word_counts: - LOG.warning('Recompute frequency of full-word search terms') - nominatim.tools.refresh.recompute_word_counts(conn, args.data_dir) - if args.address_levels: - run_legacy_script('update.php', '--update-address-levels', - nominatim_env=args, throw_on_fail=True) - if args.functions: - params = ['setup.php', '--create-functions', '--create-partition-functions'] - if args.diffs: - params.append('--enable-diff-updates') - if args.enable_debug_statements: - params.append('--enable-debug-statements') - run_legacy_script(*params, nominatim_env=args, throw_on_fail=True) - if args.wiki_data: - run_legacy_script('setup.php', '--import-wikipedia-articles', - nominatim_env=args, throw_on_fail=True) - # Attention: importance MUST come after wiki data import. - if args.importance: - run_legacy_script('update.php', '--recompute-importance', - nominatim_env=args, throw_on_fail=True) - if args.website: - run_legacy_script('setup.php', '--setup-website', - nominatim_env=args, throw_on_fail=True) + conn = psycopg2.connect(args.config.get_libpq_dsn()) + + if args.postcodes: + LOG.warning("Update postcodes centroid") + nominatim.tools.refresh.update_postcodes(conn, args.data_dir) + if args.word_counts: + LOG.warning('Recompute frequency of full-word search terms') + nominatim.tools.refresh.recompute_word_counts(conn, args.data_dir) + if args.address_levels: + cfg = Path(args.config.ADDRESS_LEVEL_CONFIG) + LOG.warning('Updating address levels from %s', cfg) + nominatim.tools.refresh.load_address_levels_from_file(conn, cfg) + if args.functions: + params = ['setup.php', '--create-functions', '--create-partition-functions'] + if args.diffs: + params.append('--enable-diff-updates') + if args.enable_debug_statements: + params.append('--enable-debug-statements') + run_legacy_script(*params, nominatim_env=args, throw_on_fail=True) + if args.wiki_data: + run_legacy_script('setup.php', '--import-wikipedia-articles', + nominatim_env=args, throw_on_fail=True) + # Attention: importance MUST come after wiki data import. + if args.importance: + run_legacy_script('update.php', '--recompute-importance', + nominatim_env=args, throw_on_fail=True) + if args.website: + run_legacy_script('setup.php', '--setup-website', + nominatim_env=args, throw_on_fail=True) + + conn.close() + return 0 diff --git a/nominatim/config.py b/nominatim/config.py index 458c828f..d4ba0d7a 100644 --- a/nominatim/config.py +++ b/nominatim/config.py @@ -24,6 +24,13 @@ class Configuration: if project_dir is not None: self._config.update(dotenv_values(str((project_dir / '.env').resolve()))) + # Add defaults for variables that are left empty to set the default. + # They may still be overwritten by environment variables. + if not self._config['NOMINATIM_ADDRESS_LEVEL_CONFIG']: + self._config['NOMINATIM_ADDRESS_LEVEL_CONFIG'] = \ + str(config_dir / 'address-levels.json') + + def __getattr__(self, name): name = 'NOMINATIM_' + name diff --git a/nominatim/db/utils.py b/nominatim/db/utils.py index 1a39746e..abd72519 100644 --- a/nominatim/db/utils.py +++ b/nominatim/db/utils.py @@ -9,3 +9,4 @@ def execute_file(conn, fname): sql = fdesc.read() with conn.cursor() as cur: cur.execute(sql) + conn.commit() diff --git a/nominatim/tools/refresh.py b/nominatim/tools/refresh.py index 859b5646..885caca5 100644 --- a/nominatim/tools/refresh.py +++ b/nominatim/tools/refresh.py @@ -1,6 +1,10 @@ """ Functions for bringing auxiliary data in the database up-to-date. """ +import json + +from psycopg2.extras import execute_values + from ..db.utils import execute_file def update_postcodes(conn, datadir): @@ -14,3 +18,54 @@ def recompute_word_counts(conn, datadir): """ Compute the frequency of full-word search terms. """ execute_file(conn, datadir / 'sql' / 'words_from_search_name.sql') + + +def _add_address_level_rows_from_entry(rows, entry): + """ Converts a single entry from the JSON format for address rank + descriptions into a flat format suitable for inserting into a + PostgreSQL table and adds these lines to `rows`. + """ + countries = entry.get('countries') or (None, ) + for key, values in entry['tags'].items(): + for value, ranks in values.items(): + if isinstance(ranks, list): + rank_search, rank_address = ranks + else: + rank_search = rank_address = ranks + if not value: + value = None + for country in countries: + rows.append((country, key, value, rank_search, rank_address)) + +def load_address_levels(conn, table, levels): + """ Replace the `address_levels` table with the contents of `levels'. + + A new table is created any previously existing table is dropped. + The table has the following columns: + country, class, type, rank_search, rank_address + """ + rows = [] + for entry in levels: + _add_address_level_rows_from_entry(rows, entry) + + with conn.cursor() as cur: + cur.execute('DROP TABLE IF EXISTS {}'.format(table)) + + cur.execute("""CREATE TABLE {} (country_code varchar(2), + class TEXT, + type TEXT, + rank_search SMALLINT, + rank_address SMALLINT)""".format(table)) + + execute_values(cur, "INSERT INTO {} VALUES %s".format(table), rows) + + cur.execute('CREATE UNIQUE INDEX ON {} (country_code, class, type)'.format(table)) + + conn.commit() + +def load_address_levels_from_file(conn, config_file): + """ Replace the `address_levels` table with the contents of the config + file. + """ + with config_file.open('r') as fdesc: + load_address_levels(conn, 'address_levels', json.load(fdesc)) diff --git a/test/python/conftest.py b/test/python/conftest.py index 1cc9ef9c..d92df5c5 100644 --- a/test/python/conftest.py +++ b/test/python/conftest.py @@ -2,13 +2,43 @@ import sys from pathlib import Path import psycopg2 +import psycopg2.extras import pytest +SRC_DIR = Path(__file__) / '..' / '..' / '..' + # always test against the source -sys.path.insert(0, str((Path(__file__) / '..' / '..' / '..').resolve())) +sys.path.insert(0, str(SRC_DIR.resolve())) + +from nominatim.config import Configuration + +class _TestingCursor(psycopg2.extras.DictCursor): + """ Extension to the DictCursor class that provides execution + short-cuts that simplify writing assertions. + """ + + def scalar(self, sql, params=None): + """ Execute a query with a single return value and return this value. + Raises an assertion when not exactly one row is returned. + """ + self.execute(sql, params) + assert self.rowcount == 1 + return self.fetchone()[0] + + def row_set(self, sql, params=None): + """ Execute a query and return the result as a set of tuples. + """ + self.execute(sql, params) + if self.rowcount == 1: + return set(tuple(self.fetchone())) + + return set((tuple(row) for row in self)) @pytest.fixture def temp_db(monkeypatch): + """ Create an empty database for the test. The database name is also + exported into NOMINATIM_DATABASE_DSN. + """ name = 'test_nominatim_python_unittest' with psycopg2.connect(database='postgres') as conn: conn.set_isolation_level(0) @@ -24,3 +54,29 @@ def temp_db(monkeypatch): conn.set_isolation_level(0) with conn.cursor() as cur: cur.execute('DROP DATABASE IF EXISTS {}'.format(name)) + + +@pytest.fixture +def temp_db_conn(temp_db): + """ Connection to the test database. + """ + conn = psycopg2.connect(database=temp_db) + yield conn + conn.close() + + +@pytest.fixture +def temp_db_cursor(temp_db): + """ Connection and cursor towards the test database. The connection will + be in auto-commit mode. + """ + conn = psycopg2.connect('dbname=' + temp_db) + conn.set_isolation_level(0) + with conn.cursor(cursor_factory=_TestingCursor) as cur: + yield cur + conn.close() + + +@pytest.fixture +def def_config(): + return Configuration(None, SRC_DIR.resolve() / 'settings') diff --git a/test/python/test_cli.py b/test/python/test_cli.py index 33c65ade..bd192260 100644 --- a/test/python/test_cli.py +++ b/test/python/test_cli.py @@ -84,10 +84,8 @@ def test_add_data_command(mock_run_legacy, name, oid): (['--boundaries-only'], 1, 0), (['--no-boundaries'], 0, 1), (['--boundaries-only', '--no-boundaries'], 0, 0)]) -def test_index_command(monkeypatch, temp_db, params, do_bnds, do_ranks): - with psycopg2.connect(database=temp_db) as conn: - with conn.cursor() as cur: - cur.execute("CREATE TABLE import_status (indexed bool)") +def test_index_command(monkeypatch, temp_db_cursor, params, do_bnds, do_ranks): + temp_db_cursor.execute("CREATE TABLE import_status (indexed bool)") bnd_mock = MockParamCapture() monkeypatch.setattr(nominatim.indexer.indexer.Indexer, 'index_boundaries', bnd_mock) rank_mock = MockParamCapture() @@ -100,7 +98,6 @@ def test_index_command(monkeypatch, temp_db, params, do_bnds, do_ranks): @pytest.mark.parametrize("command,params", [ - ('address-levels', ('update.php', '--update-address-levels')), ('functions', ('setup.php',)), ('wiki-data', ('setup.php', '--import-wikipedia-articles')), ('importance', ('update.php', '--recompute-importance')), @@ -116,6 +113,7 @@ def test_refresh_legacy_command(mock_run_legacy, command, params): @pytest.mark.parametrize("command,func", [ ('postcodes', 'update_postcodes'), ('word-counts', 'recompute_word_counts'), + ('address-levels', 'load_address_levels_from_file'), ]) def test_refresh_command(monkeypatch, command, func): func_mock = MockParamCapture() diff --git a/test/python/test_db_utils.py b/test/python/test_db_utils.py index 3210721e..e756f2c4 100644 --- a/test/python/test_db_utils.py +++ b/test/python/test_db_utils.py @@ -6,28 +6,25 @@ import pytest import nominatim.db.utils as db_utils -def test_execute_file_success(temp_db, tmp_path): +def test_execute_file_success(temp_db_conn, tmp_path): tmpfile = tmp_path / 'test.sql' tmpfile.write_text('CREATE TABLE test (id INT);\nINSERT INTO test VALUES(56);') - with psycopg2.connect('dbname=' + temp_db) as conn: - db_utils.execute_file(conn, tmpfile) + db_utils.execute_file(temp_db_conn, tmpfile) - with conn.cursor() as cur: - cur.execute('SELECT * FROM test') + with temp_db_conn.cursor() as cur: + cur.execute('SELECT * FROM test') - assert cur.rowcount == 1 - assert cur.fetchone()[0] == 56 + assert cur.rowcount == 1 + assert cur.fetchone()[0] == 56 -def test_execute_file_bad_file(temp_db, tmp_path): - with psycopg2.connect('dbname=' + temp_db) as conn: - with pytest.raises(FileNotFoundError): - db_utils.execute_file(conn, tmp_path / 'test2.sql') +def test_execute_file_bad_file(temp_db_conn, tmp_path): + with pytest.raises(FileNotFoundError): + db_utils.execute_file(temp_db_conn, tmp_path / 'test2.sql') -def test_execute_file_bad_sql(temp_db, tmp_path): +def test_execute_file_bad_sql(temp_db_conn, tmp_path): tmpfile = tmp_path / 'test.sql' tmpfile.write_text('CREATE STABLE test (id INT)') - with psycopg2.connect('dbname=' + temp_db) as conn: - with pytest.raises(psycopg2.ProgrammingError): - db_utils.execute_file(conn, tmpfile) + with pytest.raises(psycopg2.ProgrammingError): + db_utils.execute_file(temp_db_conn, tmpfile) diff --git a/test/python/test_indexing.py b/test/python/test_indexing.py index 2fe21e80..6b52a65e 100644 --- a/test/python/test_indexing.py +++ b/test/python/test_indexing.py @@ -82,10 +82,8 @@ class IndexerTestDB: @pytest.fixture -def test_db(temp_db): - conn = psycopg2.connect(database=temp_db) - yield IndexerTestDB(conn) - conn.close() +def test_db(temp_db_conn): + yield IndexerTestDB(temp_db_conn) @pytest.mark.parametrize("threads", [1, 15]) diff --git a/test/python/test_tools_exec_utils.py b/test/python/test_tools_exec_utils.py index d9f80740..a4eef61f 100644 --- a/test/python/test_tools_exec_utils.py +++ b/test/python/test_tools_exec_utils.py @@ -7,7 +7,6 @@ import tempfile import pytest -from nominatim.config import Configuration import nominatim.tools.exec_utils as exec_utils @pytest.fixture @@ -18,9 +17,9 @@ def tmp_phplib_dir(): yield Path(phpdir) @pytest.fixture -def nominatim_env(tmp_phplib_dir): +def nominatim_env(tmp_phplib_dir, def_config): class _NominatimEnv: - config = Configuration(None, Path(__file__) / '..' / '..' / '..' / 'settings') + config = def_config phplib_dir = tmp_phplib_dir data_dir = Path('data') project_dir = Path('.') diff --git a/test/python/test_tools_refresh_address_levels.py b/test/python/test_tools_refresh_address_levels.py new file mode 100644 index 00000000..87e34c61 --- /dev/null +++ b/test/python/test_tools_refresh_address_levels.py @@ -0,0 +1,85 @@ +""" +Tests for function for importing address ranks. +""" +import json +import pytest +from pathlib import Path + +from nominatim.tools.refresh import load_address_levels, load_address_levels_from_file + +def test_load_ranks_def_config(temp_db_conn, temp_db_cursor, def_config): + load_address_levels_from_file(temp_db_conn, Path(def_config.ADDRESS_LEVEL_CONFIG)) + + assert temp_db_cursor.scalar('SELECT count(*) FROM address_levels') > 0 + +def test_load_ranks_from_file(temp_db_conn, temp_db_cursor, tmp_path): + test_file = tmp_path / 'test_levels.json' + test_file.write_text('[{"tags":{"place":{"sea":2}}}]') + + load_address_levels_from_file(temp_db_conn, test_file) + + assert temp_db_cursor.scalar('SELECT count(*) FROM address_levels') > 0 + + +def test_load_ranks_from_broken_file(temp_db_conn, tmp_path): + test_file = tmp_path / 'test_levels.json' + test_file.write_text('[{"tags":"place":{"sea":2}}}]') + + with pytest.raises(json.decoder.JSONDecodeError): + load_address_levels_from_file(temp_db_conn, test_file) + + +def test_load_ranks_country(temp_db_conn, temp_db_cursor): + load_address_levels(temp_db_conn, 'levels', + [{"tags": {"place": {"village": 14}}}, + {"countries": ['de'], + "tags": {"place": {"village": 15}}}, + {"countries": ['uk', 'us' ], + "tags": {"place": {"village": 16}}} + ]) + + assert temp_db_cursor.row_set('SELECT * FROM levels') == \ + set([(None, 'place', 'village', 14, 14), + ('de', 'place', 'village', 15, 15), + ('uk', 'place', 'village', 16, 16), + ('us', 'place', 'village', 16, 16), + ]) + + +def test_load_ranks_default_value(temp_db_conn, temp_db_cursor): + load_address_levels(temp_db_conn, 'levels', + [{"tags": {"boundary": {"": 28}}}, + {"countries": ['hu'], + "tags": {"boundary": {"": 29}}} + ]) + + assert temp_db_cursor.row_set('SELECT * FROM levels') == \ + set([(None, 'boundary', None, 28, 28), + ('hu', 'boundary', None, 29, 29), + ]) + + +def test_load_ranks_multiple_keys(temp_db_conn, temp_db_cursor): + load_address_levels(temp_db_conn, 'levels', + [{"tags": + {"place": {"city": 14}, + "boundary": {"administrative2" : 4}} + }]) + + assert temp_db_cursor.row_set('SELECT * FROM levels') == \ + set([(None, 'place', 'city', 14, 14), + (None, 'boundary', 'administrative2', 4, 4), + ]) + + +def test_load_ranks_address(temp_db_conn, temp_db_cursor): + load_address_levels(temp_db_conn, 'levels', + [{"tags": + {"place": {"city": 14, + "town" : [14, 13]}} + }]) + + assert temp_db_cursor.row_set('SELECT * FROM levels') == \ + set([(None, 'place', 'city', 14, 14), + (None, 'place', 'town', 14, 13), + ]) From 5b46fcad8ec166d1b8e35cfbc58405b71e27caa3 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Sun, 24 Jan 2021 14:35:35 +0100 Subject: [PATCH 04/13] convert functon creation to python The new functions always creates normal and partitioned functions. Also adds specialised connection and cursor classes for adding frequently used helper functions. --- lib/admin/update.php | 7 +- lib/setup/SetupClass.php | 45 ++------- nominatim/cli.py | 26 ++--- nominatim/config.py | 8 ++ nominatim/tools/refresh.py | 98 ++++++++++++++++++ test/python/test_cli.py | 3 +- test/python/test_config.py | 24 +++++ test/python/test_db_connection.py | 32 ++++++ .../test_tools_refresh_create_functions.py | 99 +++++++++++++++++++ 9 files changed, 285 insertions(+), 57 deletions(-) create mode 100644 test/python/test_db_connection.py create mode 100644 test/python/test_tools_refresh_create_functions.py diff --git a/lib/admin/update.php b/lib/admin/update.php index 972a6fe5..a0fbbc46 100644 --- a/lib/admin/update.php +++ b/lib/admin/update.php @@ -143,12 +143,7 @@ if ($aResult['init-updates']) { } if (!$aResult['no-update-functions']) { - // instantiate setupClass to use the function therein - $cSetup = new SetupFunctions(array( - 'enable-diff-updates' => true, - 'verbose' => $aResult['verbose'] - )); - $cSetup->createFunctions(); + (clone($oNominatimCmd))->addParams('refresh', '--functions')->run(); } $sDatabaseDate = getDatabaseDate($oDB); diff --git a/lib/setup/SetupClass.php b/lib/setup/SetupClass.php index 635949b9..a865b8f0 100755 --- a/lib/setup/SetupClass.php +++ b/lib/setup/SetupClass.php @@ -290,9 +290,7 @@ class SetupFunctions public function createPartitionFunctions() { info('Create Partition Functions'); - - $sTemplate = file_get_contents(CONST_DataDir.'/sql/partition-functions.src.sql'); - $this->pgsqlRunPartitionScript($sTemplate); + $this->createSqlFunctions(); // also create partition functions } public function importWikipediaArticles() @@ -788,43 +786,18 @@ class SetupFunctions private function createSqlFunctions() { - $sBasePath = CONST_DataDir.'/sql/functions/'; - $sTemplate = file_get_contents($sBasePath.'utils.sql'); - $sTemplate .= file_get_contents($sBasePath.'normalization.sql'); - $sTemplate .= file_get_contents($sBasePath.'ranking.sql'); - $sTemplate .= file_get_contents($sBasePath.'importance.sql'); - $sTemplate .= file_get_contents($sBasePath.'address_lookup.sql'); - $sTemplate .= file_get_contents($sBasePath.'interpolation.sql'); - if ($this->db()->tableExists('place')) { - $sTemplate .= file_get_contents($sBasePath.'place_triggers.sql'); - } - if ($this->db()->tableExists('placex')) { - $sTemplate .= file_get_contents($sBasePath.'placex_triggers.sql'); - } - if ($this->db()->tableExists('location_postcode')) { - $sTemplate .= file_get_contents($sBasePath.'postcode_triggers.sql'); - } - $sTemplate = str_replace('{modulepath}', $this->sModulePath, $sTemplate); - if ($this->bEnableDiffUpdates) { - $sTemplate = str_replace('RETURN NEW; -- %DIFFUPDATES%', '--', $sTemplate); + $oCmd = (clone($this->oNominatimCmd)) + ->addParams('refresh', '--functions'); + + if (!$this->bEnableDiffUpdates) { + $oCmd->addParams('--no-diff-updates'); } + if ($this->bEnableDebugStatements) { - $sTemplate = str_replace('--DEBUG:', '', $sTemplate); - } - if (getSettingBool('LIMIT_REINDEXING')) { - $sTemplate = str_replace('--LIMIT INDEXING:', '', $sTemplate); - } - if (!getSettingBool('USE_US_TIGER_DATA')) { - $sTemplate = str_replace('-- %NOTIGERDATA% ', '', $sTemplate); - } - if (!getSettingBool('USE_AUX_LOCATION_DATA')) { - $sTemplate = str_replace('-- %NOAUXDATA% ', '', $sTemplate); + $oCmd->addParams('--enable-debug-statements'); } - $sReverseOnly = $this->dbReverseOnly() ? 'true' : 'false'; - $sTemplate = str_replace('%REVERSE-ONLY%', $sReverseOnly, $sTemplate); - - $this->pgsqlRunScript($sTemplate); + $oCmd->run(); } private function pgsqlRunPartitionScript($sTemplate) diff --git a/nominatim/cli.py b/nominatim/cli.py index 00eb905e..4b38040c 100644 --- a/nominatim/cli.py +++ b/nominatim/cli.py @@ -8,10 +8,9 @@ import argparse import logging from pathlib import Path -import psycopg2 - from .config import Configuration from .tools.exec_utils import run_legacy_script, run_api_script +from .db.connection import connect LOG = logging.getLogger() @@ -370,27 +369,28 @@ class UpdateRefresh: @staticmethod def run(args): - import nominatim.tools.refresh + from .tools import refresh - conn = psycopg2.connect(args.config.get_libpq_dsn()) + conn = connect(args.config.get_libpq_dsn()) if args.postcodes: LOG.warning("Update postcodes centroid") - nominatim.tools.refresh.update_postcodes(conn, args.data_dir) + refresh.update_postcodes(conn, args.data_dir) + if args.word_counts: LOG.warning('Recompute frequency of full-word search terms') - nominatim.tools.refresh.recompute_word_counts(conn, args.data_dir) + refresh.recompute_word_counts(conn, args.data_dir) + if args.address_levels: cfg = Path(args.config.ADDRESS_LEVEL_CONFIG) LOG.warning('Updating address levels from %s', cfg) - nominatim.tools.refresh.load_address_levels_from_file(conn, cfg) + refresh.load_address_levels_from_file(conn, cfg) + if args.functions: - params = ['setup.php', '--create-functions', '--create-partition-functions'] - if args.diffs: - params.append('--enable-diff-updates') - if args.enable_debug_statements: - params.append('--enable-debug-statements') - run_legacy_script(*params, nominatim_env=args, throw_on_fail=True) + LOG.warning('Create functions') + refresh.create_functions(conn, args.config, args.data_dir, + args.diffs, args.enable_debug_statements) + if args.wiki_data: run_legacy_script('setup.php', '--import-wikipedia-articles', nominatim_env=args, throw_on_fail=True) diff --git a/nominatim/config.py b/nominatim/config.py index d4ba0d7a..3f75ce33 100644 --- a/nominatim/config.py +++ b/nominatim/config.py @@ -20,6 +20,7 @@ class Configuration: """ def __init__(self, project_dir, config_dir): + self.project_dir = project_dir self._config = dotenv_values(str((config_dir / 'env.defaults').resolve())) if project_dir is not None: self._config.update(dotenv_values(str((project_dir / '.env').resolve()))) @@ -36,6 +37,13 @@ class Configuration: return os.environ.get(name) or self._config[name] + def get_bool(self, name): + """ Return the given configuration parameters as a boolean. + Values of '1', 'yes' and 'true' are accepted as truthy values, + everything else is interpreted as false. + """ + return self.__getattr__(name).lower() in ('1', 'yes', 'true') + def get_libpq_dsn(self): """ Get configured database DSN converted into the key/value format understood by libpq and psycopg. diff --git a/nominatim/tools/refresh.py b/nominatim/tools/refresh.py index 885caca5..5fbb07f8 100644 --- a/nominatim/tools/refresh.py +++ b/nominatim/tools/refresh.py @@ -2,6 +2,7 @@ Functions for bringing auxiliary data in the database up-to-date. """ import json +import re from psycopg2.extras import execute_values @@ -69,3 +70,100 @@ def load_address_levels_from_file(conn, config_file): """ with config_file.open('r') as fdesc: load_address_levels(conn, 'address_levels', json.load(fdesc)) + +PLPGSQL_BASE_MODULES = ( + 'utils.sql', + 'normalization.sql', + 'ranking.sql', + 'importance.sql', + 'address_lookup.sql', + 'interpolation.sql' +) + +PLPGSQL_TABLE_MODULES = ( + ('place', 'place_triggers.sql'), + ('placex', 'placex_triggers.sql'), + ('location_postcode', 'postcode_triggers.sql') +) + +def _get_standard_function_sql(conn, config, sql_dir, enable_diff_updates, enable_debug): + """ Read all applicable SQLs containing PL/pgSQL functions, replace + placefolders and execute them. + """ + sql_func_dir = sql_dir / 'functions' + sql = '' + + # Get the basic set of functions that is always imported. + for sql_file in PLPGSQL_BASE_MODULES: + with (sql_func_dir / sql_file).open('r') as fdesc: + sql += fdesc.read() + + # Some files require the presence of a certain table + for table, fname in PLPGSQL_TABLE_MODULES: + if conn.table_exists(table): + with (sql_func_dir / fname).open('r') as fdesc: + sql += fdesc.read() + + # Replace placeholders. + sql = sql.replace('{modulepath}', + config.DATABASE_MODULE_PATH or str((config.project_dir / 'module').resolve())) + + if enable_diff_updates: + sql = sql.replace('RETURN NEW; -- %DIFFUPDATES%', '--') + + if enable_debug: + sql = sql.replace('--DEBUG:', '') + + if config.get_bool('LIMIT_REINDEXING'): + sql = sql.replace('--LIMIT INDEXING:', '') + + if not config.get_bool('USE_US_TIGER_DATA'): + sql = sql.replace('-- %NOTIGERDATA% ', '') + + if not config.get_bool('USE_AUX_LOCATION_DATA'): + sql = sql.replace('-- %NOAUXDATA% ', '') + + reverse_only = 'false' if conn.table_exists('search_name') else 'true' + + return sql.replace('%REVERSE-ONLY%', reverse_only) + + +def replace_partition_string(sql, partitions): + """ Replace a partition template with the actual partition code. + """ + for match in re.findall('^-- start(.*?)^-- end', sql, re.M | re.S): + repl = '' + for part in partitions: + repl += match.replace('-partition-', str(part)) + sql = sql.replace(match, repl) + + return sql + +def _get_partition_function_sql(conn, sql_dir): + """ Create functions that work on partition tables. + """ + with conn.cursor() as cur: + cur.execute('SELECT distinct partition FROM country_name') + partitions = set([0]) + for row in cur: + partitions.add(row[0]) + + with (sql_dir / 'partition-functions.src.sql').open('r') as fdesc: + sql = fdesc.read() + + return replace_partition_string(sql, sorted(partitions)) + +def create_functions(conn, config, data_dir, + enable_diff_updates=True, enable_debug=False): + """ (Re)create the PL/pgSQL functions. + """ + sql_dir = data_dir / 'sql' + + sql = _get_standard_function_sql(conn, config, sql_dir, + enable_diff_updates, enable_debug) + sql += _get_partition_function_sql(conn, sql_dir) + + with conn.cursor() as cur: + cur.execute(sql) + + conn.commit() diff --git a/test/python/test_cli.py b/test/python/test_cli.py index bd192260..942a9409 100644 --- a/test/python/test_cli.py +++ b/test/python/test_cli.py @@ -98,7 +98,6 @@ def test_index_command(monkeypatch, temp_db_cursor, params, do_bnds, do_ranks): @pytest.mark.parametrize("command,params", [ - ('functions', ('setup.php',)), ('wiki-data', ('setup.php', '--import-wikipedia-articles')), ('importance', ('update.php', '--recompute-importance')), ('website', ('setup.php', '--setup-website')), @@ -114,6 +113,7 @@ def test_refresh_legacy_command(mock_run_legacy, command, params): ('postcodes', 'update_postcodes'), ('word-counts', 'recompute_word_counts'), ('address-levels', 'load_address_levels_from_file'), + ('functions', 'create_functions'), ]) def test_refresh_command(monkeypatch, command, func): func_mock = MockParamCapture() @@ -129,7 +129,6 @@ def test_refresh_importance_computed_after_wiki_import(mock_run_legacy): assert mock_run_legacy.called == 2 assert mock_run_legacy.last_args == ('update.php', '--recompute-importance') - @pytest.mark.parametrize("params", [ ('search', '--query', 'new'), ('reverse', '--lat', '0', '--lon', '0'), diff --git a/test/python/test_config.py b/test/python/test_config.py index bada9d86..064f71ba 100644 --- a/test/python/test_config.py +++ b/test/python/test_config.py @@ -15,6 +15,7 @@ def test_no_project_dir(): assert config.DATABASE_WEBUSER == 'www-data' + def test_prefer_project_setting_over_default(): with tempfile.TemporaryDirectory() as project_dir: with open(project_dir + '/.env', 'w') as envfile: @@ -24,6 +25,7 @@ def test_prefer_project_setting_over_default(): assert config.DATABASE_WEBUSER == 'apache' + def test_prefer_os_environ_over_project_setting(monkeypatch): with tempfile.TemporaryDirectory() as project_dir: with open(project_dir + '/.env', 'w') as envfile: @@ -35,6 +37,7 @@ def test_prefer_os_environ_over_project_setting(monkeypatch): assert config.DATABASE_WEBUSER == 'nobody' + def test_get_os_env_add_defaults(monkeypatch): config = Configuration(None, DEFCFG_DIR) @@ -42,6 +45,7 @@ def test_get_os_env_add_defaults(monkeypatch): assert config.get_os_env()['NOMINATIM_DATABASE_WEBUSER'] == 'www-data' + def test_get_os_env_prefer_os_environ(monkeypatch): config = Configuration(None, DEFCFG_DIR) @@ -49,11 +53,13 @@ def test_get_os_env_prefer_os_environ(monkeypatch): assert config.get_os_env()['NOMINATIM_DATABASE_WEBUSER'] == 'nobody' + def test_get_libpq_dsn_convert_default(): config = Configuration(None, DEFCFG_DIR) assert config.get_libpq_dsn() == 'dbname=nominatim' + def test_get_libpq_dsn_convert_php(monkeypatch): config = Configuration(None, DEFCFG_DIR) @@ -62,6 +68,7 @@ def test_get_libpq_dsn_convert_php(monkeypatch): assert config.get_libpq_dsn() == 'dbname=gis password=foo host=localhost' + def test_get_libpq_dsn_convert_libpq(monkeypatch): config = Configuration(None, DEFCFG_DIR) @@ -69,3 +76,20 @@ def test_get_libpq_dsn_convert_libpq(monkeypatch): 'host=localhost dbname=gis password=foo') assert config.get_libpq_dsn() == 'host=localhost dbname=gis password=foo' + + +@pytest.mark.parametrize("value,result", + [(x, True) for x in ('1', 'true', 'True', 'yes', 'YES')] + + [(x, False) for x in ('0', 'false', 'no', 'NO', 'x')]) +def test_get_bool(monkeypatch, value, result): + config = Configuration(None, DEFCFG_DIR) + + monkeypatch.setenv('NOMINATIM_FOOBAR', value) + + assert config.get_bool('FOOBAR') == result + +def test_get_bool_empty(): + config = Configuration(None, DEFCFG_DIR) + + assert config.DATABASE_MODULE_PATH == '' + assert config.get_bool('DATABASE_MODULE_PATH') == False diff --git a/test/python/test_db_connection.py b/test/python/test_db_connection.py new file mode 100644 index 00000000..5c484558 --- /dev/null +++ b/test/python/test_db_connection.py @@ -0,0 +1,32 @@ +""" +Tests for specialised conenction and cursor classes. +""" +import pytest + +from nominatim.db.connection import connect + +@pytest.fixture +def db(temp_db): + conn = connect('dbname=' + temp_db) + yield conn + conn.close() + + +def test_connection_table_exists(db, temp_db_cursor): + assert db.table_exists('foobar') == False + + temp_db_cursor.execute('CREATE TABLE foobar (id INT)') + + assert db.table_exists('foobar') == True + + +def test_cursor_scalar(db, temp_db_cursor): + temp_db_cursor.execute('CREATE TABLE dummy (id INT)') + + with db.cursor() as cur: + assert cur.scalar('SELECT count(*) FROM dummy') == 0 + +def test_cursor_scalar_many_rows(db): + with db.cursor() as cur: + with pytest.raises(ValueError): + cur.scalar('SELECT * FROM pg_tables') diff --git a/test/python/test_tools_refresh_create_functions.py b/test/python/test_tools_refresh_create_functions.py new file mode 100644 index 00000000..4807e64f --- /dev/null +++ b/test/python/test_tools_refresh_create_functions.py @@ -0,0 +1,99 @@ +""" +Tests for creating PL/pgSQL functions for Nominatim. +""" +from pathlib import Path +import pytest + +from nominatim.db.connection import connect +from nominatim.tools.refresh import _get_standard_function_sql, _get_partition_function_sql + +SQL_DIR = (Path(__file__) / '..' / '..' / '..' / 'sql').resolve() + +@pytest.fixture +def db(temp_db): + conn = connect('dbname=' + temp_db) + yield conn + conn.close() + +@pytest.fixture +def db_with_tables(db): + with db.cursor() as cur: + for table in ('place', 'placex', 'location_postcode'): + cur.execute('CREATE TABLE {} (place_id BIGINT)'.format(table)) + + return db + + +def test_standard_functions_replace_module_default(db, def_config): + def_config.project_dir = Path('.') + sql = _get_standard_function_sql(db, def_config, SQL_DIR, False, False) + + assert sql + assert sql.find('{modulepath}') < 0 + assert sql.find("'{}'".format(Path('module/nominatim.so').resolve())) >= 0 + + +def test_standard_functions_replace_module_custom(monkeypatch, db, def_config): + monkeypatch.setenv('NOMINATIM_DATABASE_MODULE_PATH', 'custom') + sql = _get_standard_function_sql(db, def_config, SQL_DIR, False, False) + + assert sql + assert sql.find('{modulepath}') < 0 + assert sql.find("'custom/nominatim.so'") >= 0 + + +@pytest.mark.parametrize("enabled", (True, False)) +def test_standard_functions_enable_diff(db_with_tables, def_config, enabled): + def_config.project_dir = Path('.') + sql = _get_standard_function_sql(db_with_tables, def_config, SQL_DIR, enabled, False) + + assert sql + assert (sql.find('%DIFFUPDATES%') < 0) == enabled + + +@pytest.mark.parametrize("enabled", (True, False)) +def test_standard_functions_enable_debug(db_with_tables, def_config, enabled): + def_config.project_dir = Path('.') + sql = _get_standard_function_sql(db_with_tables, def_config, SQL_DIR, False, enabled) + + assert sql + assert (sql.find('--DEBUG') < 0) == enabled + + +@pytest.mark.parametrize("enabled", (True, False)) +def test_standard_functions_enable_limit_reindexing(monkeypatch, db_with_tables, def_config, enabled): + def_config.project_dir = Path('.') + monkeypatch.setenv('NOMINATIM_LIMIT_REINDEXING', 'yes' if enabled else 'no') + sql = _get_standard_function_sql(db_with_tables, def_config, SQL_DIR, False, False) + + assert sql + assert (sql.find('--LIMIT INDEXING') < 0) == enabled + + +@pytest.mark.parametrize("enabled", (True, False)) +def test_standard_functions_enable_tiger(monkeypatch, db_with_tables, def_config, enabled): + def_config.project_dir = Path('.') + monkeypatch.setenv('NOMINATIM_USE_US_TIGER_DATA', 'yes' if enabled else 'no') + sql = _get_standard_function_sql(db_with_tables, def_config, SQL_DIR, False, False) + + assert sql + assert (sql.find('%NOTIGERDATA%') >= 0) == enabled + + +@pytest.mark.parametrize("enabled", (True, False)) +def test_standard_functions_enable_aux(monkeypatch, db_with_tables, def_config, enabled): + def_config.project_dir = Path('.') + monkeypatch.setenv('NOMINATIM_USE_AUX_LOCATION_DATA', 'yes' if enabled else 'no') + sql = _get_standard_function_sql(db_with_tables, def_config, SQL_DIR, False, False) + + assert sql + assert (sql.find('%NOAUXDATA%') >= 0) == enabled + + +def test_partition_function(temp_db_cursor, db, def_config): + temp_db_cursor.execute("CREATE TABLE country_name (partition SMALLINT)") + + sql = _get_partition_function_sql(db, SQL_DIR) + + assert sql + assert sql.find('-partition-') < 0 From d78f0ba80470a33a7a76edfe3ace5108684873cd Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Tue, 26 Jan 2021 22:45:24 +0100 Subject: [PATCH 05/13] port replication initialisation to Python --- lib/admin/update.php | 61 ++----------------- nominatim/cli.py | 25 ++++++-- nominatim/db/connection.py | 58 ++++++++++++++++++ nominatim/db/status.py | 50 ++++++++++++++++ nominatim/tools/exec_utils.py | 18 ++++++ nominatim/tools/replication.py | 34 +++++++++++ nominatim/version.py | 5 ++ settings/env.defaults | 3 + sql/tables.sql | 2 +- test/python/conftest.py | 84 +++++++++++++++++++++++---- test/python/test_cli.py | 22 +++++-- test/python/test_db_status.py | 74 +++++++++++++++++++++++ test/python/test_tools_replication.py | 41 +++++++++++++ 13 files changed, 402 insertions(+), 75 deletions(-) create mode 100644 nominatim/db/connection.py create mode 100644 nominatim/db/status.py create mode 100644 nominatim/tools/replication.py create mode 100644 nominatim/version.py create mode 100644 test/python/test_db_status.py create mode 100644 test/python/test_tools_replication.py diff --git a/lib/admin/update.php b/lib/admin/update.php index a0fbbc46..04eb7019 100644 --- a/lib/admin/update.php +++ b/lib/admin/update.php @@ -116,64 +116,13 @@ $sBaseURL = getSetting('REPLICATION_URL'); if ($aResult['init-updates']) { - // sanity check that the replication URL is correct - $sBaseState = file_get_contents($sBaseURL.'/state.txt'); - if ($sBaseState === false) { - echo "\nCannot find state.txt file at the configured replication URL.\n"; - echo "Does the URL point to a directory containing OSM update data?\n\n"; - fail('replication URL not reachable.'); - } - // sanity check for pyosmium-get-changes - if (!$sPyosmiumBin) { - echo "\nNOMINATIM_PYOSMIUM_BINARY not configured.\n"; - echo "You need to install pyosmium and set up the path to pyosmium-get-changes\n"; - echo "in your local .env file.\n\n"; - fail('NOMINATIM_PYOSMIUM_BINARY not configured'); + $oCmd = (clone($oNominatimCmd))->addParams('replication', '--init'); + + if ($aResult['no-update-functions']) { + $oCmd->addParams('--no-update-functions'); } - $aOutput = 0; - $oCMD = new \Nominatim\Shell($sPyosmiumBin, '--help'); - exec($oCMD->escapedCmd(), $aOutput, $iRet); - - if ($iRet != 0) { - echo "Cannot execute pyosmium-get-changes.\n"; - echo "Make sure you have pyosmium installed correctly\n"; - echo "and have set up NOMINATIM_PYOSMIUM_BINARY to point to pyosmium-get-changes.\n"; - fail('pyosmium-get-changes not found or not usable'); - } - - if (!$aResult['no-update-functions']) { - (clone($oNominatimCmd))->addParams('refresh', '--functions')->run(); - } - - $sDatabaseDate = getDatabaseDate($oDB); - if (!$sDatabaseDate) { - fail('Cannot determine date of database.'); - } - $sWindBack = strftime('%Y-%m-%dT%H:%M:%SZ', strtotime($sDatabaseDate) - (3*60*60)); - - // get the appropriate state id - $aOutput = 0; - $oCMD = (new \Nominatim\Shell($sPyosmiumBin)) - ->addParams('--start-date', $sWindBack) - ->addParams('--server', $sBaseURL); - - exec($oCMD->escapedCmd(), $aOutput, $iRet); - if ($iRet != 0 || $aOutput[0] == 'None') { - fail('Error running pyosmium tools'); - } - - $oDB->exec('TRUNCATE import_status'); - $sSQL = "INSERT INTO import_status (lastimportdate, sequence_id, indexed) VALUES('"; - $sSQL .= $sDatabaseDate."',".$aOutput[0].', true)'; - - try { - $oDB->exec($sSQL); - } catch (\Nominatim\DatabaseError $e) { - fail('Could not enter sequence into database.'); - } - - echo "Done. Database updates will start at sequence $aOutput[0] ($sWindBack)\n"; + $oCmd->run(); } if ($aResult['check-for-updates']) { diff --git a/nominatim/cli.py b/nominatim/cli.py index 4b38040c..a9ffef33 100644 --- a/nominatim/cli.py +++ b/nominatim/cli.py @@ -234,12 +234,29 @@ class UpdateReplication: @staticmethod def run(args): + try: + import osmium # pylint: disable=W0611 + except ModuleNotFoundError: + LOG.fatal("pyosmium not installed. Replication functions not available.\n" + "To install pyosmium via pip: pip3 install osmium") + return 1 + + from .tools import replication, refresh + + conn = connect(args.config.get_libpq_dsn()) + params = ['update.php'] if args.init: - params.append('--init-updates') - if not args.update_functions: - params.append('--no-update-functions') - elif args.check_for_updates: + LOG.warning("Initialising replication updates") + replication.init_replication(conn, args.config.REPLICATION_URL) + if args.update_functions: + LOG.warning("Create functions") + refresh.create_functions(conn, args.config, args.data_dir, + True, False) + conn.close() + return 0 + + if args.check_for_updates: params.append('--check-for-updates') else: if args.once: diff --git a/nominatim/db/connection.py b/nominatim/db/connection.py new file mode 100644 index 00000000..8e75d7a2 --- /dev/null +++ b/nominatim/db/connection.py @@ -0,0 +1,58 @@ +""" +Specialised connection and cursor functions. +""" +import logging + +import psycopg2 +import psycopg2.extensions +import psycopg2.extras + +class _Cursor(psycopg2.extras.DictCursor): + """ A cursor returning dict-like objects and providing specialised + execution functions. + """ + + def execute(self, query, args=None): # pylint: disable=W0221 + """ Query execution that logs the SQL query when debugging is enabled. + """ + logger = logging.getLogger() + logger.debug(self.mogrify(query, args).decode('utf-8')) + + super().execute(query, args) + + def scalar(self, sql, args=None): + """ Execute query that returns a single value. The value is returned. + If the query yields more than one row, a ValueError is raised. + """ + self.execute(sql, args) + + if self.rowcount != 1: + raise ValueError("Query did not return a single row.") + + return self.fetchone()[0] + + +class _Connection(psycopg2.extensions.connection): + """ A connection that provides the specialised cursor by default and + adds convenience functions for administrating the database. + """ + + def cursor(self, cursor_factory=_Cursor, **kwargs): + """ Return a new cursor. By default the specialised cursor is returned. + """ + return super().cursor(cursor_factory=cursor_factory, **kwargs) + + def table_exists(self, table): + """ Check that a table with the given name exists in the database. + """ + with self.cursor() as cur: + num = cur.scalar("""SELECT count(*) FROM pg_tables + WHERE tablename = %s""", (table, )) + return num == 1 + + +def connect(dsn): + """ Open a connection to the database using the specialised connection + factory. + """ + return psycopg2.connect(dsn, connection_factory=_Connection) diff --git a/nominatim/db/status.py b/nominatim/db/status.py new file mode 100644 index 00000000..af4b85c3 --- /dev/null +++ b/nominatim/db/status.py @@ -0,0 +1,50 @@ +""" +Access and helper functions for the status table. +""" +import datetime as dt +import logging +import re + +from ..tools.exec_utils import get_url + +LOG = logging.getLogger() + +def compute_database_date(conn): + """ Determine the date of the database from the newest object in the + data base. + """ + # First, find the node with the highest ID in the database + with conn.cursor() as cur: + osmid = cur.scalar("SELECT max(osm_id) FROM place WHERE osm_type='N'") + + if osmid is None: + LOG.fatal("No data found in the database.") + raise RuntimeError("No data found in the database.") + + LOG.info("Using node id %d for timestamp lookup", osmid) + # Get the node from the API to find the timestamp when it was created. + node_url = 'https://www.openstreetmap.org/api/0.6/node/{}/1'.format(osmid) + data = get_url(node_url) + + match = re.search(r'timestamp="((\d{4})-(\d{2})-(\d{2})T(\d{2}):(\d{2}):(\d{2}))Z"', data) + + if match is None: + LOG.fatal("The node data downloaded from the API does not contain valid data.\n" + "URL used: %s", node_url) + raise RuntimeError("Bad API data.") + + LOG.debug("Found timestamp %s", match[1]) + + return dt.datetime.fromisoformat(match[1]).replace(tzinfo=dt.timezone.utc) + + +def set_status(conn, date, seq=None, indexed=True): + """ Replace the current status with the given status. + """ + assert date.tzinfo == dt.timezone.utc + with conn.cursor() as cur: + cur.execute("TRUNCATE TABLE import_status") + cur.execute("""INSERT INTO import_status (lastimportdate, sequence_id, indexed) + VALUES (%s, %s, %s)""", (date, seq, indexed)) + + conn.commit() diff --git a/nominatim/tools/exec_utils.py b/nominatim/tools/exec_utils.py index 9e16e293..0d3db204 100644 --- a/nominatim/tools/exec_utils.py +++ b/nominatim/tools/exec_utils.py @@ -3,8 +3,13 @@ Helper functions for executing external programs. """ import logging import subprocess +import urllib.request as urlrequest from urllib.parse import urlencode +from ..version import NOMINATIM_VERSION + +LOG = logging.getLogger() + def run_legacy_script(script, *args, nominatim_env=None, throw_on_fail=False): """ Run a Nominatim PHP script with the given arguments. @@ -80,3 +85,16 @@ def run_api_script(endpoint, project_dir, extra_env=None, phpcgi_bin=None, print(result[content_start + 4:].replace('\\n', '\n')) return 0 + + +def get_url(url): + """ Get the contents from the given URL and return it as a UTF-8 string. + """ + headers = {"User-Agent" : "Nominatim/" + NOMINATIM_VERSION} + + try: + with urlrequest.urlopen(urlrequest.Request(url, headers=headers)) as response: + return response.read().decode('utf-8') + except: + LOG.fatal('Failed to load URL: %s', url) + raise diff --git a/nominatim/tools/replication.py b/nominatim/tools/replication.py new file mode 100644 index 00000000..86405168 --- /dev/null +++ b/nominatim/tools/replication.py @@ -0,0 +1,34 @@ +""" +Functions for updating a database from a replication source. +""" +import datetime +import logging + +from osmium.replication.server import ReplicationServer + +from ..db import status + +LOG = logging.getLogger() + +def init_replication(conn, base_url): + """ Set up replication for the server at the given base URL. + """ + LOG.info("Using replication source: %s", base_url) + date = status.compute_database_date(conn) + + # margin of error to make sure we get all data + date -= datetime.timedelta(hours=3) + + repl = ReplicationServer(base_url) + + seq = repl.timestamp_to_sequence(date) + + if seq is None: + LOG.fatal("Cannot reach the configured replication service '%s'.\n" + "Does the URL point to a directory containing OSM update data?", + base_url) + raise RuntimeError("Failed to reach replication service") + + status.set_status(conn, date=date, seq=seq) + + LOG.warning("Updates intialised at sequence %s (%s)", seq, date) diff --git a/nominatim/version.py b/nominatim/version.py new file mode 100644 index 00000000..a2ddc9fa --- /dev/null +++ b/nominatim/version.py @@ -0,0 +1,5 @@ +""" +Version information for Nominatim. +""" + +NOMINATIM_VERSION = "3.6.0" diff --git a/settings/env.defaults b/settings/env.defaults index fbad3e33..9fefc622 100644 --- a/settings/env.defaults +++ b/settings/env.defaults @@ -57,6 +57,9 @@ NOMINATIM_HTTP_PROXY_HOST=proxy.mydomain.com NOMINATIM_HTTP_PROXY_PORT=3128 NOMINATIM_HTTP_PROXY_LOGIN= NOMINATIM_HTTP_PROXY_PASSWORD= +# Also set these standard environment variables. +# HTTP_PROXY="http://user:pass@10.10.1.10:1080" +# HTTPS_PROXY="http://user:pass@10.10.1.10:1080" # Location of the osm2pgsql binary. # When empty, osm2pgsql is expected to reside in the osm2pgsql directory in diff --git a/sql/tables.sql b/sql/tables.sql index 5686bcd2..8647e304 100644 --- a/sql/tables.sql +++ b/sql/tables.sql @@ -1,6 +1,6 @@ drop table if exists import_status; CREATE TABLE import_status ( - lastimportdate timestamp NOT NULL, + lastimportdate timestamp with time zone NOT NULL, sequence_id integer, indexed boolean ); diff --git a/test/python/conftest.py b/test/python/conftest.py index d92df5c5..2e81e919 100644 --- a/test/python/conftest.py +++ b/test/python/conftest.py @@ -1,3 +1,4 @@ +import itertools import sys from pathlib import Path @@ -11,6 +12,7 @@ SRC_DIR = Path(__file__) / '..' / '..' / '..' sys.path.insert(0, str(SRC_DIR.resolve())) from nominatim.config import Configuration +from nominatim.db import connection class _TestingCursor(psycopg2.extras.DictCursor): """ Extension to the DictCursor class that provides execution @@ -40,27 +42,42 @@ def temp_db(monkeypatch): exported into NOMINATIM_DATABASE_DSN. """ name = 'test_nominatim_python_unittest' - with psycopg2.connect(database='postgres') as conn: - conn.set_isolation_level(0) - with conn.cursor() as cur: - cur.execute('DROP DATABASE IF EXISTS {}'.format(name)) - cur.execute('CREATE DATABASE {}'.format(name)) + conn = psycopg2.connect(database='postgres') + + conn.set_isolation_level(0) + with conn.cursor() as cur: + cur.execute('DROP DATABASE IF EXISTS {}'.format(name)) + cur.execute('CREATE DATABASE {}'.format(name)) + + conn.close() monkeypatch.setenv('NOMINATIM_DATABASE_DSN' , 'dbname=' + name) yield name - with psycopg2.connect(database='postgres') as conn: - conn.set_isolation_level(0) - with conn.cursor() as cur: - cur.execute('DROP DATABASE IF EXISTS {}'.format(name)) + conn = psycopg2.connect(database='postgres') + conn.set_isolation_level(0) + with conn.cursor() as cur: + cur.execute('DROP DATABASE IF EXISTS {}'.format(name)) + + conn.close() + +@pytest.fixture +def temp_db_with_extensions(temp_db): + conn = psycopg2.connect(database=temp_db) + with conn.cursor() as cur: + cur.execute('CREATE EXTENSION hstore; CREATE EXTENSION postgis;') + conn.commit() + conn.close() + + return temp_db @pytest.fixture def temp_db_conn(temp_db): """ Connection to the test database. """ - conn = psycopg2.connect(database=temp_db) + conn = connection.connect('dbname=' + temp_db) yield conn conn.close() @@ -80,3 +97,50 @@ def temp_db_cursor(temp_db): @pytest.fixture def def_config(): return Configuration(None, SRC_DIR.resolve() / 'settings') + + +@pytest.fixture +def status_table(temp_db_conn): + """ Create an empty version of the status table. + """ + with temp_db_conn.cursor() as cur: + cur.execute("""CREATE TABLE import_status ( + lastimportdate timestamp with time zone NOT NULL, + sequence_id integer, + indexed boolean + )""") + temp_db_conn.commit() + + +@pytest.fixture +def place_table(temp_db_with_extensions, temp_db_conn): + """ Create an empty version of the place table. + """ + with temp_db_conn.cursor() as cur: + cur.execute("""CREATE TABLE place ( + osm_id int8 NOT NULL, + osm_type char(1) NOT NULL, + class text NOT NULL, + type text NOT NULL, + name hstore, + admin_level smallint, + address hstore, + extratags hstore, + geometry Geometry(Geometry,4326) NOT NULL)""") + temp_db_conn.commit() + + +@pytest.fixture +def place_row(place_table, temp_db_cursor): + """ A factory for rows in the place table. The table is created as a + prerequisite to the fixture. + """ + idseq = itertools.count(1001) + def _insert(osm_type='N', osm_id=None, cls='amenity', typ='cafe', names=None, + admin_level=None, address=None, extratags=None, geom=None): + temp_db_cursor.execute("INSERT INTO place VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)", + (osm_id or next(idseq), osm_type, cls, typ, names, + admin_level, address, extratags, + geom or 'SRID=4326;POINT(0 0 )')) + + return _insert diff --git a/test/python/test_cli.py b/test/python/test_cli.py index 942a9409..ed46eba3 100644 --- a/test/python/test_cli.py +++ b/test/python/test_cli.py @@ -7,6 +7,7 @@ import pytest import nominatim.cli import nominatim.indexer.indexer import nominatim.tools.refresh +import nominatim.tools.replication def call_nominatim(*args): return nominatim.cli.nominatim(module_dir='build/module', @@ -56,7 +57,6 @@ def test_cli_help(capsys): (('import', '--continue', 'load-data'), 'setup'), (('freeze',), 'setup'), (('special-phrases',), 'specialphrases'), - (('replication',), 'update'), (('add-data', '--tiger-data', 'tiger'), 'setup'), (('add-data', '--file', 'foo.osm'), 'update'), (('check-database',), 'check_import_finished'), @@ -102,7 +102,7 @@ def test_index_command(monkeypatch, temp_db_cursor, params, do_bnds, do_ranks): ('importance', ('update.php', '--recompute-importance')), ('website', ('setup.php', '--setup-website')), ]) -def test_refresh_legacy_command(mock_run_legacy, command, params): +def test_refresh_legacy_command(mock_run_legacy, temp_db, command, params): assert 0 == call_nominatim('refresh', '--' + command) assert mock_run_legacy.called == 1 @@ -115,7 +115,7 @@ def test_refresh_legacy_command(mock_run_legacy, command, params): ('address-levels', 'load_address_levels_from_file'), ('functions', 'create_functions'), ]) -def test_refresh_command(monkeypatch, command, func): +def test_refresh_command(monkeypatch, temp_db, command, func): func_mock = MockParamCapture() monkeypatch.setattr(nominatim.tools.refresh, func, func_mock) @@ -123,12 +123,26 @@ def test_refresh_command(monkeypatch, command, func): assert func_mock.called == 1 -def test_refresh_importance_computed_after_wiki_import(mock_run_legacy): + +def test_refresh_importance_computed_after_wiki_import(mock_run_legacy, temp_db): assert 0 == call_nominatim('refresh', '--importance', '--wiki-data') assert mock_run_legacy.called == 2 assert mock_run_legacy.last_args == ('update.php', '--recompute-importance') + +@pytest.mark.parametrize("params,func", [ + (('--init', '--no-update-functions'), 'init_replication') + ]) +def test_replication_command(monkeypatch, temp_db, params, func): + func_mock = MockParamCapture() + monkeypatch.setattr(nominatim.tools.replication, func, func_mock) + + assert 0 == call_nominatim('replication', *params) + + assert func_mock.called == 1 + + @pytest.mark.parametrize("params", [ ('search', '--query', 'new'), ('reverse', '--lat', '0', '--lon', '0'), diff --git a/test/python/test_db_status.py b/test/python/test_db_status.py new file mode 100644 index 00000000..d73b099e --- /dev/null +++ b/test/python/test_db_status.py @@ -0,0 +1,74 @@ +""" +Tests for status table manipulation. +""" +import datetime as dt + +import pytest + +import nominatim.db.status + +def test_compute_database_date_place_empty(status_table, place_table, temp_db_conn): + with pytest.raises(RuntimeError): + nominatim.db.status.compute_database_date(temp_db_conn) + +OSM_NODE_DATA = """\ + + + + +""" + +def test_compute_database_date_valid(monkeypatch, status_table, place_row, temp_db_conn): + place_row(osm_type='N', osm_id=45673) + + requested_url = [] + def mock_url(url): + requested_url.append(url) + return OSM_NODE_DATA + + monkeypatch.setattr(nominatim.db.status, "get_url", mock_url) + + date = nominatim.db.status.compute_database_date(temp_db_conn) + + assert requested_url == ['https://www.openstreetmap.org/api/0.6/node/45673/1'] + assert date == dt.datetime.fromisoformat('2006-01-27T22:09:10').replace(tzinfo=dt.timezone.utc) + + +def test_compute_database_broken_api(monkeypatch, status_table, place_row, temp_db_conn): + place_row(osm_type='N', osm_id=45673) + + requested_url = [] + def mock_url(url): + requested_url.append(url) + return ' + + + +""" + + +def test_init_replication_bad_base_url(monkeypatch, status_table, place_row, temp_db_conn, temp_db_cursor): + place_row(osm_type='N', osm_id=100) + + monkeypatch.setattr(nominatim.db.status, "get_url", lambda u : OSM_NODE_DATA) + + with pytest.raises(RuntimeError, match="Failed to reach replication service"): + nominatim.tools.replication.init_replication(temp_db_conn, 'https://test.io') + + +def test_init_replication_success(monkeypatch, status_table, place_row, temp_db_conn, temp_db_cursor): + place_row(osm_type='N', osm_id=100) + + monkeypatch.setattr(nominatim.db.status, "get_url", lambda u : OSM_NODE_DATA) + monkeypatch.setattr(nominatim.tools.replication.ReplicationServer, + "timestamp_to_sequence", + lambda self, date: 234) + + nominatim.tools.replication.init_replication(temp_db_conn, 'https://test.io') + + temp_db_cursor.execute("SELECT * FROM import_status") + + expected_date = dt.datetime.fromisoformat('2006-01-27T19:09:10').replace(tzinfo=dt.timezone.utc) + assert temp_db_cursor.rowcount == 1 + assert temp_db_cursor.fetchone() == [expected_date, 234, True] From 8f0885f6cb24f545a2f5021d53d8aec64a72bf9b Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Thu, 28 Jan 2021 14:34:17 +0100 Subject: [PATCH 06/13] port check-for-update function to python --- lib/admin/update.php | 13 +--------- nominatim/cli.py | 16 ++++++------ nominatim/db/status.py | 13 ++++++++++ nominatim/tools/replication.py | 25 +++++++++++++++++++ test/python/test_cli.py | 5 ++-- test/python/test_db_status.py | 12 +++++++++ test/python/test_tools_replication.py | 36 +++++++++++++++++++++++++++ utils/check_server_for_updates.py | 24 ------------------ 8 files changed, 98 insertions(+), 46 deletions(-) delete mode 100755 utils/check_server_for_updates.py diff --git a/lib/admin/update.php b/lib/admin/update.php index 04eb7019..a2ff6158 100644 --- a/lib/admin/update.php +++ b/lib/admin/update.php @@ -126,18 +126,7 @@ if ($aResult['init-updates']) { } if ($aResult['check-for-updates']) { - $aLastState = $oDB->getRow('SELECT sequence_id FROM import_status'); - - if (!$aLastState['sequence_id']) { - fail('Updates not set up. Please run ./utils/update.php --init-updates.'); - } - - $oCmd = (new \Nominatim\Shell(CONST_BinDir.'/check_server_for_updates.py')) - ->addParams($sBaseURL) - ->addParams($aLastState['sequence_id']); - $iRet = $oCmd->run(); - - exit($iRet); + exit((clone($oNominatimCmd))->addParams('replication', '--check-for-updates')->run()); } if (isset($aResult['import-diff']) || isset($aResult['import-file'])) { diff --git a/nominatim/cli.py b/nominatim/cli.py index a9ffef33..edfddcbc 100644 --- a/nominatim/cli.py +++ b/nominatim/cli.py @@ -257,14 +257,16 @@ class UpdateReplication: return 0 if args.check_for_updates: - params.append('--check-for-updates') + ret = replication.check_for_updates(conn, args.config.REPLICATION_URL) + conn.close() + return ret + + if args.once: + params.append('--import-osmosis') else: - if args.once: - params.append('--import-osmosis') - else: - params.append('--import-osmosis-all') - if not args.do_index: - params.append('--no-index') + params.append('--import-osmosis-all') + if not args.do_index: + params.append('--no-index') return run_legacy_script(*params, nominatim_env=args) diff --git a/nominatim/db/status.py b/nominatim/db/status.py index af4b85c3..a0454771 100644 --- a/nominatim/db/status.py +++ b/nominatim/db/status.py @@ -48,3 +48,16 @@ def set_status(conn, date, seq=None, indexed=True): VALUES (%s, %s, %s)""", (date, seq, indexed)) conn.commit() + + +def get_status(conn): + """ Return the current status as a triple of (date, sequence, indexed). + If status has not been set up yet, a triple of None is returned. + """ + with conn.cursor() as cur: + cur.execute("SELECT * FROM import_status LIMIT 1") + if cur.rowcount < 1: + return None, None, None + + row = cur.fetchone() + return row['lastimportdate'], row['sequence_id'], row['indexed'] diff --git a/nominatim/tools/replication.py b/nominatim/tools/replication.py index 86405168..f278556a 100644 --- a/nominatim/tools/replication.py +++ b/nominatim/tools/replication.py @@ -32,3 +32,28 @@ def init_replication(conn, base_url): status.set_status(conn, date=date, seq=seq) LOG.warning("Updates intialised at sequence %s (%s)", seq, date) + + +def check_for_updates(conn, base_url): + """ Check if new data is available from the replication service at the + given base URL. + """ + _, seq, _ = status.get_status(conn) + + if seq is None: + LOG.error("Replication not set up. " + "Please run 'nominatim replication --init' first.") + return 254 + + state = ReplicationServer(base_url).get_state_info() + + if state is None: + LOG.error("Cannot get state for URL %s.", base_url) + return 253 + + if state.sequence <= seq: + LOG.warning("Database is up to date.") + return 1 + + LOG.warning("New data available (%i => %i).", seq, state.sequence) + return 0 diff --git a/test/python/test_cli.py b/test/python/test_cli.py index ed46eba3..71e3ff65 100644 --- a/test/python/test_cli.py +++ b/test/python/test_cli.py @@ -120,7 +120,6 @@ def test_refresh_command(monkeypatch, temp_db, command, func): monkeypatch.setattr(nominatim.tools.refresh, func, func_mock) assert 0 == call_nominatim('refresh', '--' + command) - assert func_mock.called == 1 @@ -132,14 +131,14 @@ def test_refresh_importance_computed_after_wiki_import(mock_run_legacy, temp_db) @pytest.mark.parametrize("params,func", [ - (('--init', '--no-update-functions'), 'init_replication') + (('--init', '--no-update-functions'), 'init_replication'), + (('--check-for-updates',), 'check_for_updates') ]) def test_replication_command(monkeypatch, temp_db, params, func): func_mock = MockParamCapture() monkeypatch.setattr(nominatim.tools.replication, func, func_mock) assert 0 == call_nominatim('replication', *params) - assert func_mock.called == 1 diff --git a/test/python/test_db_status.py b/test/python/test_db_status.py index d73b099e..9631170a 100644 --- a/test/python/test_db_status.py +++ b/test/python/test_db_status.py @@ -72,3 +72,15 @@ def test_set_status_filled_table(status_table, temp_db_conn, temp_db_cursor): assert temp_db_cursor.rowcount == 1 assert temp_db_cursor.fetchone() == [date, 456, False] + + +def test_get_status_empty_table(status_table, temp_db_conn): + assert nominatim.db.status.get_status(temp_db_conn) == (None, None, None) + + +def test_get_status_success(status_table, temp_db_conn): + date = dt.datetime.fromordinal(1000000).replace(tzinfo=dt.timezone.utc) + nominatim.db.status.set_status(temp_db_conn, date=date, seq=667, indexed=False) + + assert nominatim.db.status.get_status(temp_db_conn) == \ + (date, 667, False) diff --git a/test/python/test_tools_replication.py b/test/python/test_tools_replication.py index f8a2e92b..e06eda59 100644 --- a/test/python/test_tools_replication.py +++ b/test/python/test_tools_replication.py @@ -4,8 +4,10 @@ Tests for replication functionality. import datetime as dt import pytest +from osmium.replication.server import OsmosisState import nominatim.tools.replication +import nominatim.db.status as status OSM_NODE_DATA = """\ @@ -39,3 +41,37 @@ def test_init_replication_success(monkeypatch, status_table, place_row, temp_db_ expected_date = dt.datetime.fromisoformat('2006-01-27T19:09:10').replace(tzinfo=dt.timezone.utc) assert temp_db_cursor.rowcount == 1 assert temp_db_cursor.fetchone() == [expected_date, 234, True] + + +def test_check_for_updates_empty_status_table(status_table, temp_db_conn): + assert nominatim.tools.replication.check_for_updates(temp_db_conn, 'https://test.io') == 254 + + +def test_check_for_updates_seq_not_set(status_table, temp_db_conn): + status.set_status(temp_db_conn, dt.datetime.now().replace(tzinfo=dt.timezone.utc)) + + assert nominatim.tools.replication.check_for_updates(temp_db_conn, 'https://test.io') == 254 + + +def test_check_for_updates_no_state(monkeypatch, status_table, temp_db_conn): + status.set_status(temp_db_conn, + dt.datetime.now().replace(tzinfo=dt.timezone.utc), + seq=345) + + monkeypatch.setattr(nominatim.tools.replication.ReplicationServer, + "get_state_info", lambda self: None) + + assert nominatim.tools.replication.check_for_updates(temp_db_conn, 'https://test.io') == 253 + + +@pytest.mark.parametrize("server_sequence,result", [(344, 1), (345, 1), (346, 0)]) +def test_check_for_updates_no_new_data(monkeypatch, status_table, temp_db_conn, + server_sequence, result): + date = dt.datetime.now().replace(tzinfo=dt.timezone.utc) + status.set_status(temp_db_conn, date, seq=345) + + monkeypatch.setattr(nominatim.tools.replication.ReplicationServer, + "get_state_info", + lambda self: OsmosisState(server_sequence, date)) + + assert nominatim.tools.replication.check_for_updates(temp_db_conn, 'https://test.io') == result diff --git a/utils/check_server_for_updates.py b/utils/check_server_for_updates.py deleted file mode 100755 index bcc9d0ba..00000000 --- a/utils/check_server_for_updates.py +++ /dev/null @@ -1,24 +0,0 @@ -#!/usr/bin/env python3 - -import sys -from osmium.replication import server - -if __name__ == '__main__': - if len(sys.argv) != 3: - print("Usage: python check_server_for_updates.py ") - sys.exit(254) - - seqid = int(sys.argv[2]) - - state = server.ReplicationServer(sys.argv[1]).get_state_info() - - if state is None: - print("ERROR: Cannot get state from URL %s." % (sys.argv[1], )) - sys.exit(253) - - if state.sequence <= seqid: - print("Database up to date.") - sys.exit(1) - - print("New data available (%i => %i)." % (seqid, state.sequence)) - sys.exit(0) From 4cb6dc01f382e9fb748efbe4517442af2274f210 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Sat, 30 Jan 2021 15:50:34 +0100 Subject: [PATCH 07/13] port replication update function to python --- lib/admin/update.php | 157 +++----------------------- nominatim/cli.py | 145 ++++++++++++++++++------ nominatim/config.py | 41 ++++++- nominatim/db/status.py | 20 +++- nominatim/tools/exec_utils.py | 38 +++++++ nominatim/tools/replication.py | 65 ++++++++++- test/python/conftest.py | 11 +- test/python/test_cli.py | 72 +++++++++++- test/python/test_config.py | 60 ++++++++++ test/python/test_db_status.py | 27 +++++ test/python/test_tools_exec_utils.py | 9 ++ test/python/test_tools_replication.py | 72 +++++++++++- utils/osm_file_date.py | 34 ------ 13 files changed, 527 insertions(+), 224 deletions(-) delete mode 100755 utils/osm_file_date.py diff --git a/lib/admin/update.php b/lib/admin/update.php index a2ff6158..fba5300b 100644 --- a/lib/admin/update.php +++ b/lib/admin/update.php @@ -111,9 +111,6 @@ if ($aResult['verbose']) { $oNominatimCmd->addParams('--verbose'); } -$sPyosmiumBin = getSetting('PYOSMIUM_BINARY'); -$sBaseURL = getSetting('REPLICATION_URL'); - if ($aResult['init-updates']) { $oCmd = (clone($oNominatimCmd))->addParams('replication', '--init'); @@ -203,7 +200,10 @@ if ($aResult['recompute-word-counts']) { } if ($aResult['index']) { - (clone $oNominatimCmd)->addParams('index', '--minrank', $aResult['index-rank'])->run(); + (clone $oNominatimCmd) + ->addParams('index', '--minrank', $aResult['index-rank']) + ->addParams('--threads', $aResult['index-instances']) + ->run(); } if ($aResult['update-address-levels']) { @@ -228,146 +228,17 @@ if ($aResult['recompute-importance']) { } if ($aResult['import-osmosis'] || $aResult['import-osmosis-all']) { - // - if (strpos($sBaseURL, 'download.geofabrik.de') !== false && getSetting('REPLICATION_UPDATE_INTERVAL') < 86400) { - fail('Error: Update interval too low for download.geofabrik.de. ' . - "Please check install documentation (https://nominatim.org/release-docs/latest/admin/Import-and-Update#setting-up-the-update-process)\n"); + $oCmd = (clone($oNominatimCmd)) + ->addParams('replication') + ->addParams('--threads', $aResult['index-instances']); + + if (!$aResult['import-osmosis-all']) { + $oCmd->addParams('--once'); } - $sImportFile = CONST_InstallDir.'/osmosischange.osc'; - - $oCMDDownload = (new \Nominatim\Shell($sPyosmiumBin)) - ->addParams('--server', $sBaseURL) - ->addParams('--outfile', $sImportFile) - ->addParams('--size', getSetting('REPLICATION_MAX_DIFF')); - - $oCMDImport = (clone $oOsm2pgsqlCmd)->addParams($sImportFile); - - while (true) { - $fStartTime = time(); - $aLastState = $oDB->getRow('SELECT *, EXTRACT (EPOCH FROM lastimportdate) as unix_ts FROM import_status'); - - if (!$aLastState['sequence_id']) { - echo "Updates not set up. Please run ./utils/update.php --init-updates.\n"; - exit(1); - } - - echo 'Currently at sequence '.$aLastState['sequence_id'].' ('.$aLastState['lastimportdate'].') - '.$aLastState['indexed']." indexed\n"; - - $sBatchEnd = $aLastState['lastimportdate']; - $iEndSequence = $aLastState['sequence_id']; - - if ($aLastState['indexed']) { - // Sleep if the update interval has not yet been reached. - $fNextUpdate = $aLastState['unix_ts'] + getSetting('REPLICATION_UPDATE_INTERVAL'); - if ($fNextUpdate > $fStartTime) { - $iSleepTime = $fNextUpdate - $fStartTime; - echo "Waiting for next update for $iSleepTime sec."; - sleep($iSleepTime); - } - - // Download the next batch of changes. - do { - $fCMDStartTime = time(); - $iNextSeq = (int) $aLastState['sequence_id']; - unset($aOutput); - - $oCMD = (clone $oCMDDownload)->addParams('--start-id', $iNextSeq); - echo $oCMD->escapedCmd()."\n"; - if (file_exists($sImportFile)) { - unlink($sImportFile); - } - exec($oCMD->escapedCmd(), $aOutput, $iResult); - - if ($iResult == 3) { - $sSleep = getSetting('REPLICATION_RECHECK_INTERVAL'); - echo 'No new updates. Sleeping for '.$sSleep." sec.\n"; - sleep($sSleep); - } elseif ($iResult != 0) { - echo 'ERROR: updates failed.'; - exit($iResult); - } else { - $iEndSequence = (int)$aOutput[0]; - } - } while ($iResult); - - // get the newest object from the diff file - $sBatchEnd = 0; - $iRet = 0; - $oCMD = new \Nominatim\Shell(CONST_BinDir.'/osm_file_date.py', $sImportFile); - exec($oCMD->escapedCmd(), $sBatchEnd, $iRet); - if ($iRet == 5) { - echo "Diff file is empty. skipping import.\n"; - if (!$aResult['import-osmosis-all']) { - exit(0); - } else { - continue; - } - } - if ($iRet != 0) { - fail('Error getting date from diff file.'); - } - $sBatchEnd = $sBatchEnd[0]; - - // Import the file - $fCMDStartTime = time(); - - - echo $oCMDImport->escapedCmd()."\n"; - unset($sJunk); - $iErrorLevel = $oCMDImport->run(); - if ($iErrorLevel) { - echo "Error executing osm2pgsql: $iErrorLevel\n"; - exit($iErrorLevel); - } - - // write the update logs - $iFileSize = filesize($sImportFile); - $sSQL = 'INSERT INTO import_osmosis_log'; - $sSQL .= '(batchend, batchseq, batchsize, starttime, endtime, event)'; - $sSQL .= " values ('$sBatchEnd',$iEndSequence,$iFileSize,'"; - $sSQL .= date('Y-m-d H:i:s', $fCMDStartTime)."','"; - $sSQL .= date('Y-m-d H:i:s')."','import')"; - var_Dump($sSQL); - $oDB->exec($sSQL); - - // update the status - $sSQL = "UPDATE import_status SET lastimportdate = '$sBatchEnd', indexed=false, sequence_id = $iEndSequence"; - var_Dump($sSQL); - $oDB->exec($sSQL); - echo date('Y-m-d H:i:s')." Completed download step for $sBatchEnd in ".round((time()-$fCMDStartTime)/60, 2)." minutes\n"; - } - - // Index file - if (!$aResult['no-index']) { - $fCMDStartTime = time(); - - $oThisIndexCmd = clone($oNominatimCmd); - $oThisIndexCmd->addParams('index'); - echo $oThisIndexCmd->escapedCmd()."\n"; - $iErrorLevel = $oThisIndexCmd->run(); - if ($iErrorLevel) { - echo "Error: $iErrorLevel\n"; - exit($iErrorLevel); - } - - $sSQL = 'INSERT INTO import_osmosis_log'; - $sSQL .= '(batchend, batchseq, batchsize, starttime, endtime, event)'; - $sSQL .= " values ('$sBatchEnd',$iEndSequence,NULL,'"; - $sSQL .= date('Y-m-d H:i:s', $fCMDStartTime)."','"; - $sSQL .= date('Y-m-d H:i:s')."','index')"; - var_Dump($sSQL); - $oDB->exec($sSQL); - echo date('Y-m-d H:i:s')." Completed index step for $sBatchEnd in ".round((time()-$fCMDStartTime)/60, 2)." minutes\n"; - } else { - if ($aResult['import-osmosis-all']) { - echo "Error: --no-index cannot be used with continuous imports (--import-osmosis-all).\n"; - exit(1); - } - } - - $fDuration = time() - $fStartTime; - echo date('Y-m-d H:i:s')." Completed all for $sBatchEnd in ".round($fDuration/60, 2)." minutes\n"; - if (!$aResult['import-osmosis-all']) exit(0); + if ($aResult['no-index']) { + $oCmd->addParams('--no-index'); } + + exit($oCmd->run()); } diff --git a/nominatim/cli.py b/nominatim/cli.py index edfddcbc..f02277ae 100644 --- a/nominatim/cli.py +++ b/nominatim/cli.py @@ -2,8 +2,10 @@ Command-line interface to the Nominatim functions for import, update, database administration and querying. """ -import sys +import datetime as dt import os +import sys +import time import argparse import logging from pathlib import Path @@ -11,6 +13,7 @@ from pathlib import Path from .config import Configuration from .tools.exec_utils import run_legacy_script, run_api_script from .db.connection import connect +from .db import status LOG = logging.getLogger() @@ -88,6 +91,17 @@ class CommandlineParser: return args.command.run(args) + +def _osm2pgsql_options_from_args(args, default_cache, default_threads): + """ Set up the stanadrd osm2pgsql from the command line arguments. + """ + return dict(osm2pgsql=args.osm2pgsql_path, + osm2pgsql_cache=args.osm2pgsql_cache or default_cache, + osm2pgsql_style=args.config.get_import_style_file(), + threads=args.threads or default_threads, + dsn=args.config.get_libpq_dsn(), + flatnode_file=args.config.FLATNODE_FILE) + ##### Subcommand classes # # Each class needs to implement two functions: add_args() adds the CLI parameters @@ -231,6 +245,88 @@ class UpdateReplication: group.add_argument('--no-index', action='store_false', dest='do_index', help="""Do not index the new data. Only applicable together with --once""") + group.add_argument('--osm2pgsql-cache', metavar='SIZE', type=int, + help='Size of cache to be used by osm2pgsql (in MB)') + + @staticmethod + def _init_replication(args): + from .tools import replication, refresh + + LOG.warning("Initialising replication updates") + conn = connect(args.config.get_libpq_dsn()) + replication.init_replication(conn, base_url=args.config.REPLICATION_URL) + if args.update_functions: + LOG.warning("Create functions") + refresh.create_functions(conn, args.config, args.data_dir, + True, False) + conn.close() + return 0 + + + @staticmethod + def _check_for_updates(args): + from .tools import replication + + conn = connect(args.config.get_libpq_dsn()) + ret = replication.check_for_updates(conn, base_url=args.config.REPLICATION_URL) + conn.close() + return ret + + + @staticmethod + def _update(args): + from .tools import replication + from .indexer.indexer import Indexer + + params = _osm2pgsql_options_from_args(args, 2000, 1) + params.update(base_url=args.config.REPLICATION_URL, + update_interval=args.config.get_int('REPLICATION_UPDATE_INTERVAL'), + import_file=args.project_dir / 'osmosischange.osc', + max_diff_size=args.config.get_int('REPLICATION_MAX_DIFF'), + indexed_only=not args.once) + + # Sanity check to not overwhelm the Geofabrik servers. + if 'download.geofabrik.de'in params['base_url']\ + and params['update_interval'] < 86400: + LOG.fatal("Update interval too low for download.geofabrik.de.\n" + "Please check install documentation " + "(https://nominatim.org/release-docs/latest/admin/Import-and-Update#" + "setting-up-the-update-process).") + raise RuntimeError("Invalid replication update interval setting.") + + if not args.once: + if not args.do_index: + LOG.fatal("Indexing cannot be disabled when running updates continuously.") + raise RuntimeError("Bad arguments.") + recheck_interval = args.config.get_int('REPLICATION_RECHECK_INTERVAL') + + while True: + conn = connect(args.config.get_libpq_dsn()) + start = dt.datetime.now(dt.timezone.utc) + state = replication.update(conn, params) + status.log_status(conn, start, 'import') + conn.close() + + if state is not replication.UpdateState.NO_CHANGES and args.do_index: + start = dt.datetime.now(dt.timezone.utc) + indexer = Indexer(args.config.get_libpq_dsn(), + args.threads or 1) + indexer.index_boundaries(0, 30) + indexer.index_by_rank(0, 30) + + conn = connect(args.config.get_libpq_dsn()) + status.set_indexed(conn, True) + status.log_status(conn, start, 'index') + conn.close() + + if args.once: + break + + if state is replication.UpdateState.NO_CHANGES: + LOG.warning("No new changes. Sleeping for %d sec.", recheck_interval) + time.sleep(recheck_interval) + + return state.value @staticmethod def run(args): @@ -241,35 +337,13 @@ class UpdateReplication: "To install pyosmium via pip: pip3 install osmium") return 1 - from .tools import replication, refresh - - conn = connect(args.config.get_libpq_dsn()) - - params = ['update.php'] if args.init: - LOG.warning("Initialising replication updates") - replication.init_replication(conn, args.config.REPLICATION_URL) - if args.update_functions: - LOG.warning("Create functions") - refresh.create_functions(conn, args.config, args.data_dir, - True, False) - conn.close() - return 0 + return UpdateReplication._init_replication(args) if args.check_for_updates: - ret = replication.check_for_updates(conn, args.config.REPLICATION_URL) - conn.close() - return ret - - if args.once: - params.append('--import-osmosis') - else: - params.append('--import-osmosis-all') - if not args.do_index: - params.append('--no-index') - - return run_legacy_script(*params, nominatim_env=args) + return UpdateReplication._check_for_updates(args) + return UpdateReplication._update(args) class UpdateAddData: """\ @@ -350,8 +424,11 @@ class UpdateIndex: if not args.boundaries_only: indexer.index_by_rank(args.minrank, args.maxrank) - if not args.no_boundaries and not args.boundaries_only: - indexer.update_status_table() + if not args.no_boundaries and not args.boundaries_only \ + and args.minrank == 0 and args.maxrank == 30: + conn = connect(args.config.get_libpq_dsn()) + status.set_indexed(conn, True) + conn.close() return 0 @@ -390,25 +467,31 @@ class UpdateRefresh: def run(args): from .tools import refresh - conn = connect(args.config.get_libpq_dsn()) - if args.postcodes: LOG.warning("Update postcodes centroid") + conn = connect(args.config.get_libpq_dsn()) refresh.update_postcodes(conn, args.data_dir) + conn.close() if args.word_counts: LOG.warning('Recompute frequency of full-word search terms') + conn = connect(args.config.get_libpq_dsn()) refresh.recompute_word_counts(conn, args.data_dir) + conn.close() if args.address_levels: cfg = Path(args.config.ADDRESS_LEVEL_CONFIG) LOG.warning('Updating address levels from %s', cfg) + conn = connect(args.config.get_libpq_dsn()) refresh.load_address_levels_from_file(conn, cfg) + conn.close() if args.functions: LOG.warning('Create functions') + conn = connect(args.config.get_libpq_dsn()) refresh.create_functions(conn, args.config, args.data_dir, args.diffs, args.enable_debug_statements) + conn.close() if args.wiki_data: run_legacy_script('setup.php', '--import-wikipedia-articles', @@ -421,8 +504,6 @@ class UpdateRefresh: run_legacy_script('setup.php', '--setup-website', nominatim_env=args, throw_on_fail=True) - conn.close() - return 0 diff --git a/nominatim/config.py b/nominatim/config.py index 3f75ce33..271d2d4d 100644 --- a/nominatim/config.py +++ b/nominatim/config.py @@ -1,10 +1,14 @@ """ Nominatim configuration accessor. """ +import logging import os +from pathlib import Path from dotenv import dotenv_values +LOG = logging.getLogger() + class Configuration: """ Load and manage the project configuration. @@ -21,6 +25,7 @@ class Configuration: def __init__(self, project_dir, config_dir): self.project_dir = project_dir + self.config_dir = config_dir self._config = dotenv_values(str((config_dir / 'env.defaults').resolve())) if project_dir is not None: self._config.update(dotenv_values(str((project_dir / '.env').resolve()))) @@ -38,24 +43,56 @@ class Configuration: return os.environ.get(name) or self._config[name] def get_bool(self, name): - """ Return the given configuration parameters as a boolean. + """ Return the given configuration parameter as a boolean. Values of '1', 'yes' and 'true' are accepted as truthy values, everything else is interpreted as false. """ return self.__getattr__(name).lower() in ('1', 'yes', 'true') + + def get_int(self, name): + """ Return the given configuration parameter as an int. + """ + try: + return int(self.__getattr__(name)) + except ValueError: + LOG.fatal("Invalid setting NOMINATIM_%s. Needs to be a number.", name) + raise + + def get_libpq_dsn(self): """ Get configured database DSN converted into the key/value format understood by libpq and psycopg. """ dsn = self.DATABASE_DSN + def quote_param(param): + key, val = param.split('=') + val = val.replace('\\', '\\\\').replace("'", "\\'") + if ' ' in val: + val = "'" + val + "'" + return key + '=' + val + if dsn.startswith('pgsql:'): # Old PHP DSN format. Convert before returning. - return dsn[6:].replace(';', ' ') + return ' '.join([quote_param(p) for p in dsn[6:].split(';')]) return dsn + + def get_import_style_file(self): + """ Return the import style file as a path object. Translates the + name of the standard styles automatically into a file in the + config style. + """ + style = self.__getattr__('IMPORT_STYLE') + + if style in ('admin', 'street', 'address', 'full', 'extratags'): + return self.config_dir / 'import-{}.style'.format(style) + + return Path(style) + + def get_os_env(self): """ Return a copy of the OS environment with the Nominatim configuration merged in. diff --git a/nominatim/db/status.py b/nominatim/db/status.py index a0454771..9d7344c4 100644 --- a/nominatim/db/status.py +++ b/nominatim/db/status.py @@ -1,5 +1,5 @@ """ -Access and helper functions for the status table. +Access and helper functions for the status and status log table. """ import datetime as dt import logging @@ -61,3 +61,21 @@ def get_status(conn): row = cur.fetchone() return row['lastimportdate'], row['sequence_id'], row['indexed'] + + +def set_indexed(conn, state): + """ Set the indexed flag in the status table to the given state. + """ + with conn.cursor() as cur: + cur.execute("UPDATE import_status SET indexed = %s", (state, )) + conn.commit() + + +def log_status(conn, start, event, batchsize=None): + """ Write a new status line to the `import_osmosis_log` table. + """ + with conn.cursor() as cur: + cur.execute("""INSERT INTO import_osmosis_log + (batchend, batchseq, batchsize, starttime, endtime, event) + SELECT lastimportdate, sequence_id, %s, %s, now(), %s FROM import_status""", + (batchsize, start, event)) diff --git a/nominatim/tools/exec_utils.py b/nominatim/tools/exec_utils.py index 0d3db204..03bed986 100644 --- a/nominatim/tools/exec_utils.py +++ b/nominatim/tools/exec_utils.py @@ -2,10 +2,13 @@ Helper functions for executing external programs. """ import logging +import os import subprocess import urllib.request as urlrequest from urllib.parse import urlencode +from psycopg2.extensions import parse_dsn + from ..version import NOMINATIM_VERSION LOG = logging.getLogger() @@ -87,6 +90,41 @@ def run_api_script(endpoint, project_dir, extra_env=None, phpcgi_bin=None, return 0 +def run_osm2pgsql(options): + """ Run osm2pgsql with the given options. + """ + env = os.environ + cmd = [options['osm2pgsql'], + '--hstore', '--latlon', '--slim', + '--with-forward-dependencies', 'false', + '--log-progress', 'true', + '--number-processes', str(options['threads']), + '--cache', str(options['osm2pgsql_cache']), + '--output', 'gazetteer', + '--style', str(options['osm2pgsql_style']) + ] + if options['append']: + cmd.append('--append') + + if options['flatnode_file']: + cmd.extend(('--flat-nodes', options['flatnode_file'])) + + dsn = parse_dsn(options['dsn']) + if 'password' in dsn: + env['PGPASSWORD'] = dsn['password'] + if 'dbname' in dsn: + cmd.extend(('-d', dsn['dbname'])) + if 'user' in dsn: + cmd.extend(('--username', dsn['user'])) + for param in ('host', 'port'): + if param in dsn: + cmd.extend(('--' + param, dsn[param])) + + cmd.append(str(options['import_file'])) + + subprocess.run(cmd, cwd=options.get('cwd', '.'), env=env, check=True) + + def get_url(url): """ Get the contents from the given URL and return it as a UTF-8 string. """ diff --git a/nominatim/tools/replication.py b/nominatim/tools/replication.py index f278556a..04f1c45b 100644 --- a/nominatim/tools/replication.py +++ b/nominatim/tools/replication.py @@ -1,12 +1,16 @@ """ Functions for updating a database from a replication source. """ -import datetime +import datetime as dt +from enum import Enum import logging +import time from osmium.replication.server import ReplicationServer +from osmium import WriteHandler from ..db import status +from .exec_utils import run_osm2pgsql LOG = logging.getLogger() @@ -17,7 +21,7 @@ def init_replication(conn, base_url): date = status.compute_database_date(conn) # margin of error to make sure we get all data - date -= datetime.timedelta(hours=3) + date -= dt.timedelta(hours=3) repl = ReplicationServer(base_url) @@ -53,7 +57,62 @@ def check_for_updates(conn, base_url): if state.sequence <= seq: LOG.warning("Database is up to date.") - return 1 + return 2 LOG.warning("New data available (%i => %i).", seq, state.sequence) return 0 + +class UpdateState(Enum): + """ Possible states after an update has run. + """ + + UP_TO_DATE = 0 + MORE_PENDING = 2 + NO_CHANGES = 3 + + +def update(conn, options): + """ Update database from the next batch of data. Returns the state of + updates according to `UpdateState`. + """ + startdate, startseq, indexed = status.get_status(conn) + + if startseq is None: + LOG.error("Replication not set up. " + "Please run 'nominatim replication --init' first.") + raise RuntimeError("Replication not set up.") + + if not indexed and options['indexed_only']: + LOG.info("Skipping update. There is data that needs indexing.") + return UpdateState.MORE_PENDING + + last_since_update = dt.datetime.now(dt.timezone.utc) - startdate + update_interval = dt.timedelta(seconds=options['update_interval']) + if last_since_update < update_interval: + duration = (update_interval - last_since_update).seconds + LOG.warning("Sleeping for %s sec before next update.", duration) + time.sleep(duration) + + if options['import_file'].exists(): + options['import_file'].unlink() + + # Read updates into file. + repl = ReplicationServer(options['base_url']) + + outhandler = WriteHandler(str(options['import_file'])) + endseq = repl.apply_diffs(outhandler, startseq, + max_size=options['max_diff_size'] * 1024) + outhandler.close() + + if endseq is None: + return UpdateState.NO_CHANGES + + # Consume updates with osm2pgsql. + options['append'] = True + run_osm2pgsql(options) + + # Write the current status to the file + endstate = repl.get_state_info(endseq) + status.set_status(conn, endstate.timestamp, seq=endseq, indexed=False) + + return UpdateState.UP_TO_DATE diff --git a/test/python/conftest.py b/test/python/conftest.py index 2e81e919..8b0ba145 100644 --- a/test/python/conftest.py +++ b/test/python/conftest.py @@ -101,7 +101,8 @@ def def_config(): @pytest.fixture def status_table(temp_db_conn): - """ Create an empty version of the status table. + """ Create an empty version of the status table and + the status logging table. """ with temp_db_conn.cursor() as cur: cur.execute("""CREATE TABLE import_status ( @@ -109,6 +110,14 @@ def status_table(temp_db_conn): sequence_id integer, indexed boolean )""") + cur.execute("""CREATE TABLE import_osmosis_log ( + batchend timestamp, + batchseq integer, + batchsize bigint, + starttime timestamp, + endtime timestamp, + event text + )""") temp_db_conn.commit() diff --git a/test/python/test_cli.py b/test/python/test_cli.py index 71e3ff65..fc2454cd 100644 --- a/test/python/test_cli.py +++ b/test/python/test_cli.py @@ -1,8 +1,13 @@ """ Tests for command line interface wrapper. + +These tests just check that the various command line parameters route to the +correct functionionality. They use a lot of monkeypatching to avoid executing +the actual functions. """ import psycopg2 import pytest +import time import nominatim.cli import nominatim.indexer.indexer @@ -21,9 +26,9 @@ class MockParamCapture: """ Mock that records the parameters with which a function was called as well as the number of calls. """ - def __init__(self): + def __init__(self, retval=0): self.called = 0 - self.return_value = 0 + self.return_value = retval def __call__(self, *args, **kwargs): self.called += 1 @@ -142,6 +147,69 @@ def test_replication_command(monkeypatch, temp_db, params, func): assert func_mock.called == 1 +def test_replication_update_bad_interval(monkeypatch, temp_db): + monkeypatch.setenv('NOMINATIM_REPLICATION_UPDATE_INTERVAL', 'xx') + + with pytest.raises(ValueError): + call_nominatim('replication') + + +def test_replication_update_bad_interval_for_geofabrik(monkeypatch, temp_db): + monkeypatch.setenv('NOMINATIM_REPLICATION_URL', + 'https://download.geofabrik.de/europe/ireland-and-northern-ireland-updates') + + with pytest.raises(RuntimeError, match='Invalid replication.*'): + call_nominatim('replication') + + +@pytest.mark.parametrize("state, retval", [ + (nominatim.tools.replication.UpdateState.UP_TO_DATE, 0), + (nominatim.tools.replication.UpdateState.NO_CHANGES, 3) + ]) +def test_replication_update_once_no_index(monkeypatch, temp_db, status_table, state, retval): + func_mock = MockParamCapture(retval=state) + monkeypatch.setattr(nominatim.tools.replication, 'update', func_mock) + + assert retval == call_nominatim('replication', '--once', '--no-index') + + +def test_replication_update_continuous(monkeypatch, status_table): + states = [nominatim.tools.replication.UpdateState.UP_TO_DATE, + nominatim.tools.replication.UpdateState.UP_TO_DATE] + monkeypatch.setattr(nominatim.tools.replication, 'update', + lambda *args, **kwargs: states.pop()) + + index_mock = MockParamCapture() + monkeypatch.setattr(nominatim.indexer.indexer.Indexer, 'index_boundaries', index_mock) + monkeypatch.setattr(nominatim.indexer.indexer.Indexer, 'index_by_rank', index_mock) + + with pytest.raises(IndexError): + call_nominatim('replication') + + assert index_mock.called == 4 + + +def test_replication_update_continuous_no_change(monkeypatch, status_table): + states = [nominatim.tools.replication.UpdateState.NO_CHANGES, + nominatim.tools.replication.UpdateState.UP_TO_DATE] + monkeypatch.setattr(nominatim.tools.replication, 'update', + lambda *args, **kwargs: states.pop()) + + index_mock = MockParamCapture() + monkeypatch.setattr(nominatim.indexer.indexer.Indexer, 'index_boundaries', index_mock) + monkeypatch.setattr(nominatim.indexer.indexer.Indexer, 'index_by_rank', index_mock) + + sleep_mock = MockParamCapture() + monkeypatch.setattr(time, 'sleep', sleep_mock) + + with pytest.raises(IndexError): + call_nominatim('replication') + + assert index_mock.called == 2 + assert sleep_mock.called == 1 + assert sleep_mock.last_args[0] == 60 + + @pytest.mark.parametrize("params", [ ('search', '--query', 'new'), ('reverse', '--lat', '0', '--lon', '0'), diff --git a/test/python/test_config.py b/test/python/test_config.py index 064f71ba..f9fefeb2 100644 --- a/test/python/test_config.py +++ b/test/python/test_config.py @@ -69,6 +69,18 @@ def test_get_libpq_dsn_convert_php(monkeypatch): assert config.get_libpq_dsn() == 'dbname=gis password=foo host=localhost' +@pytest.mark.parametrize("val,expect", [('foo bar', "'foo bar'"), + ("xy'z", "xy\\'z"), + ]) +def test_get_libpq_dsn_convert_php_special_chars(monkeypatch, val, expect): + config = Configuration(None, DEFCFG_DIR) + + monkeypatch.setenv('NOMINATIM_DATABASE_DSN', + 'pgsql:dbname=gis;password={}'.format(val)) + + assert config.get_libpq_dsn() == "dbname=gis password={}".format(expect) + + def test_get_libpq_dsn_convert_libpq(monkeypatch): config = Configuration(None, DEFCFG_DIR) @@ -93,3 +105,51 @@ def test_get_bool_empty(): assert config.DATABASE_MODULE_PATH == '' assert config.get_bool('DATABASE_MODULE_PATH') == False + + +@pytest.mark.parametrize("value,result", [('0', 0), ('1', 1), + ('85762513444', 85762513444)]) +def test_get_int_success(monkeypatch, value, result): + config = Configuration(None, DEFCFG_DIR) + + monkeypatch.setenv('NOMINATIM_FOOBAR', value) + + assert config.get_int('FOOBAR') == result + + +@pytest.mark.parametrize("value", ['1b', 'fg', '0x23']) +def test_get_int_bad_values(monkeypatch, value): + config = Configuration(None, DEFCFG_DIR) + + monkeypatch.setenv('NOMINATIM_FOOBAR', value) + + with pytest.raises(ValueError): + config.get_int('FOOBAR') + + +def test_get_int_empty(): + config = Configuration(None, DEFCFG_DIR) + + assert config.DATABASE_MODULE_PATH == '' + + with pytest.raises(ValueError): + config.get_int('DATABASE_MODULE_PATH') + + +def test_get_import_style_intern(monkeypatch): + config = Configuration(None, DEFCFG_DIR) + + monkeypatch.setenv('NOMINATIM_IMPORT_STYLE', 'street') + + expected = DEFCFG_DIR / 'import-street.style' + + assert config.get_import_style_file() == expected + + +@pytest.mark.parametrize("value", ['custom', '/foo/bar.stye']) +def test_get_import_style_intern(monkeypatch, value): + config = Configuration(None, DEFCFG_DIR) + + monkeypatch.setenv('NOMINATIM_IMPORT_STYLE', value) + + assert str(config.get_import_style_file()) == value diff --git a/test/python/test_db_status.py b/test/python/test_db_status.py index 9631170a..1a538aec 100644 --- a/test/python/test_db_status.py +++ b/test/python/test_db_status.py @@ -84,3 +84,30 @@ def test_get_status_success(status_table, temp_db_conn): assert nominatim.db.status.get_status(temp_db_conn) == \ (date, 667, False) + + +@pytest.mark.parametrize("old_state", [True, False]) +@pytest.mark.parametrize("new_state", [True, False]) +def test_set_indexed(status_table, temp_db_conn, temp_db_cursor, old_state, new_state): + date = dt.datetime.fromordinal(1000000).replace(tzinfo=dt.timezone.utc) + nominatim.db.status.set_status(temp_db_conn, date=date, indexed=old_state) + nominatim.db.status.set_indexed(temp_db_conn, new_state) + + assert temp_db_cursor.scalar("SELECT indexed FROM import_status") == new_state + + +def test_set_indexed_empty_status(status_table, temp_db_conn, temp_db_cursor): + nominatim.db.status.set_indexed(temp_db_conn, True) + + assert temp_db_cursor.scalar("SELECT count(*) FROM import_status") == 0 + + +def text_log_status(status_table, temp_db_conn): + date = dt.datetime.fromordinal(1000000).replace(tzinfo=dt.timezone.utc) + start = dt.datetime.now() - dt.timedelta(hours=1) + nominatim.db.status.set_status(temp_db_conn, date=date, seq=56) + nominatim.db.status.log_status(temp_db_conn, start, 'index') + + assert temp_db_cursor.scalar("SELECT count(*) FROM import_osmosis_log") == 1 + assert temp_db_cursor.scalar("SELECT seq FROM import_osmosis_log") == 56 + assert temp_db_cursor.scalar("SELECT date FROM import_osmosis_log") == date diff --git a/test/python/test_tools_exec_utils.py b/test/python/test_tools_exec_utils.py index a4eef61f..26a714f3 100644 --- a/test/python/test_tools_exec_utils.py +++ b/test/python/test_tools_exec_utils.py @@ -99,3 +99,12 @@ def test_run_api_with_extra_env(tmp_project_dir): extra_env = dict(SCRIPT_FILENAME=str(tmp_project_dir / 'website' / 'test.php')) assert 0 == exec_utils.run_api_script('badname', tmp_project_dir, extra_env=extra_env) + + +### run_osm2pgsql + +def test_run_osm2pgsql(): + exec_utils.run_osm2pgsql(dict(osm2pgsql='echo', append=False, flatnode_file=None, + dsn='dbname=foobar', threads=1, osm2pgsql_cache=500, + osm2pgsql_style='./my.style', + import_file='foo.bar')) diff --git a/test/python/test_tools_replication.py b/test/python/test_tools_replication.py index e06eda59..b94563ff 100644 --- a/test/python/test_tools_replication.py +++ b/test/python/test_tools_replication.py @@ -2,6 +2,7 @@ Tests for replication functionality. """ import datetime as dt +import time import pytest from osmium.replication.server import OsmosisState @@ -16,6 +17,7 @@ OSM_NODE_DATA = """\ """ +### init replication def test_init_replication_bad_base_url(monkeypatch, status_table, place_row, temp_db_conn, temp_db_cursor): place_row(osm_type='N', osm_id=100) @@ -43,20 +45,20 @@ def test_init_replication_success(monkeypatch, status_table, place_row, temp_db_ assert temp_db_cursor.fetchone() == [expected_date, 234, True] +### checking for updates + def test_check_for_updates_empty_status_table(status_table, temp_db_conn): assert nominatim.tools.replication.check_for_updates(temp_db_conn, 'https://test.io') == 254 def test_check_for_updates_seq_not_set(status_table, temp_db_conn): - status.set_status(temp_db_conn, dt.datetime.now().replace(tzinfo=dt.timezone.utc)) + status.set_status(temp_db_conn, dt.datetime.now(dt.timezone.utc)) assert nominatim.tools.replication.check_for_updates(temp_db_conn, 'https://test.io') == 254 def test_check_for_updates_no_state(monkeypatch, status_table, temp_db_conn): - status.set_status(temp_db_conn, - dt.datetime.now().replace(tzinfo=dt.timezone.utc), - seq=345) + status.set_status(temp_db_conn, dt.datetime.now(dt.timezone.utc), seq=345) monkeypatch.setattr(nominatim.tools.replication.ReplicationServer, "get_state_info", lambda self: None) @@ -64,10 +66,10 @@ def test_check_for_updates_no_state(monkeypatch, status_table, temp_db_conn): assert nominatim.tools.replication.check_for_updates(temp_db_conn, 'https://test.io') == 253 -@pytest.mark.parametrize("server_sequence,result", [(344, 1), (345, 1), (346, 0)]) +@pytest.mark.parametrize("server_sequence,result", [(344, 2), (345, 2), (346, 0)]) def test_check_for_updates_no_new_data(monkeypatch, status_table, temp_db_conn, server_sequence, result): - date = dt.datetime.now().replace(tzinfo=dt.timezone.utc) + date = dt.datetime.now(dt.timezone.utc) status.set_status(temp_db_conn, date, seq=345) monkeypatch.setattr(nominatim.tools.replication.ReplicationServer, @@ -75,3 +77,61 @@ def test_check_for_updates_no_new_data(monkeypatch, status_table, temp_db_conn, lambda self: OsmosisState(server_sequence, date)) assert nominatim.tools.replication.check_for_updates(temp_db_conn, 'https://test.io') == result + + +### updating + +@pytest.fixture +def update_options(tmpdir): + return dict(base_url='https://test.io', + indexed_only=False, + update_interval=3600, + import_file=tmpdir / 'foo.osm', + max_diff_size=1) + +def test_update_empty_status_table(status_table, temp_db_conn): + with pytest.raises(RuntimeError): + nominatim.tools.replication.update(temp_db_conn, {}) + + +def test_update_already_indexed(status_table, temp_db_conn): + status.set_status(temp_db_conn, dt.datetime.now(dt.timezone.utc), seq=34, indexed=False) + + assert nominatim.tools.replication.update(temp_db_conn, dict(indexed_only=True)) \ + == nominatim.tools.replication.UpdateState.MORE_PENDING + + +def test_update_no_data_no_sleep(monkeypatch, status_table, temp_db_conn, update_options): + date = dt.datetime.now(dt.timezone.utc) - dt.timedelta(days=1) + status.set_status(temp_db_conn, date, seq=34) + + monkeypatch.setattr(nominatim.tools.replication.ReplicationServer, + "apply_diffs", + lambda *args, **kwargs: None) + + sleeptime = [] + monkeypatch.setattr(time, 'sleep', lambda s: sleeptime.append(s)) + + assert nominatim.tools.replication.update(temp_db_conn, update_options) \ + == nominatim.tools.replication.UpdateState.NO_CHANGES + + assert not sleeptime + + +def test_update_no_data_sleep(monkeypatch, status_table, temp_db_conn, update_options): + date = dt.datetime.now(dt.timezone.utc) - dt.timedelta(minutes=30) + status.set_status(temp_db_conn, date, seq=34) + + monkeypatch.setattr(nominatim.tools.replication.ReplicationServer, + "apply_diffs", + lambda *args, **kwargs: None) + + sleeptime = [] + monkeypatch.setattr(time, 'sleep', lambda s: sleeptime.append(s)) + + assert nominatim.tools.replication.update(temp_db_conn, update_options) \ + == nominatim.tools.replication.UpdateState.NO_CHANGES + + assert len(sleeptime) == 1 + assert sleeptime[0] < 3600 + assert sleeptime[0] > 0 diff --git a/utils/osm_file_date.py b/utils/osm_file_date.py deleted file mode 100755 index 0443e6ac..00000000 --- a/utils/osm_file_date.py +++ /dev/null @@ -1,34 +0,0 @@ -#!/usr/bin/env python3 - -import osmium -import sys -import datetime - - -class Datecounter(osmium.SimpleHandler): - - filedate = None - - def date(self, o): - ts = o.timestamp - if self.filedate is None or ts > self.filedate: - self.filedate = ts - - node = date - way = date - relation = date - - -if __name__ == '__main__': - if len(sys.argv) != 2: - print("Usage: python osm_file_date.py ") - sys.exit(-1) - - h = Datecounter() - - h.apply_file(sys.argv[1]) - - if h.filedate is None: - exit(5) - - print(h.filedate) From 01e0fd7e134bedf1b791b18725531b7a5e21e432 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Sat, 30 Jan 2021 15:52:49 +0100 Subject: [PATCH 08/13] whitelist pyosmium for pylint --- .github/workflows/ci-tests.yml | 2 +- CMakeLists.txt | 2 +- CONTRIBUTING.md | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci-tests.yml b/.github/workflows/ci-tests.yml index 8b20b55d..323d36d7 100644 --- a/.github/workflows/ci-tests.yml +++ b/.github/workflows/ci-tests.yml @@ -52,7 +52,7 @@ jobs: run: phpcs --report-width=120 . - name: Python linting - run: pylint nominatim + run: pylint --extension-pkg-whitelist=osmium nominatim - name: PHP unit tests run: phpunit ./ diff --git a/CMakeLists.txt b/CMakeLists.txt index c6f63a8b..d579bf1a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -191,7 +191,7 @@ if (BUILD_TESTS) if (PYLINT) message(STATUS "Using pylint binary ${PYLINT}") add_test(NAME pylint - COMMAND ${PYLINT} nominatim + COMMAND ${PYLINT} --extension-pkg-whitelist=osmium nominatim WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}) else() message(WARNING "pylint not found. Python linting tests disabled.") diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 552d1da1..6798c39d 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -53,7 +53,7 @@ The coding style is enforced with PHPCS and pylint. It can be tested with: ``` phpcs --report-width=120 --colors . -pylint3 nominatim +pylint3 --extension-pkg-whitelist=osmium nominatim ``` ## Testing From 45ea73913f994df4c00cacfcf0d398db84329798 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Sat, 30 Jan 2021 15:55:04 +0100 Subject: [PATCH 09/13] remove setting for PYOSMIUM_BINARY pyosmium is now called as a library from the python code, so that pyosmium-get-changes is no longer needed. --- docs/admin/Update.md | 9 --------- settings/env.defaults | 4 ---- 2 files changed, 13 deletions(-) diff --git a/docs/admin/Update.md b/docs/admin/Update.md index 4b1a2be7..01980bd8 100644 --- a/docs/admin/Update.md +++ b/docs/admin/Update.md @@ -19,15 +19,6 @@ Run (as the same user who will later run the updates): pip3 install --user osmium ``` -Nominatim needs a tool called `pyosmium-get-changes` which comes with -Pyosmium. You need to tell Nominatim where to find it. Add the -following line to your `.env`: - - NOMINATIM_PYOSMIUM_BINARY=/home/user/.local/bin/pyosmium-get-changes - -The path above is fine if you used the `--user` parameter with pip. -Replace `user` with your user name. - #### Setting up the update process Next the update needs to be initialised. By default Nominatim is configured diff --git a/settings/env.defaults b/settings/env.defaults index 9fefc622..e2eda340 100644 --- a/settings/env.defaults +++ b/settings/env.defaults @@ -67,10 +67,6 @@ NOMINATIM_HTTP_PROXY_PASSWORD= # EXPERT ONLY. You should usually use the supplied osm2pgsql. NOMINATIM_OSM2PGSQL_BINARY= -# Location of pyosmium-get-changes. -# Only needed when running updates. -NOMINATIM_PYOSMIUM_BINARY= - # Directory where to find US Tiger data files to import. # Used with setup.php --import-tiger-data. When unset, the data is expected # to be located under 'data/tiger' in the source tree. From e629a175ed0a1c54398622251f56d56baeef768f Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Sat, 30 Jan 2021 16:20:10 +0100 Subject: [PATCH 10/13] introduce custom UsageError This is a exception to be thrown when the error occures because of bad user data. We don't want to print a full stack trace in these cases but just tell the user what went wrong. --- nominatim/cli.py | 16 +++++++++++++--- nominatim/config.py | 4 +++- nominatim/db/connection.py | 2 +- nominatim/db/status.py | 5 +++-- nominatim/errors.py | 9 +++++++++ nominatim/tools/replication.py | 5 +++-- test/python/test_cli.py | 7 +++---- test/python/test_config.py | 5 +++-- test/python/test_db_connection.py | 2 +- test/python/test_db_status.py | 5 +++-- test/python/test_tools_replication.py | 5 +++-- 11 files changed, 45 insertions(+), 20 deletions(-) create mode 100644 nominatim/errors.py diff --git a/nominatim/cli.py b/nominatim/cli.py index f02277ae..cc591ea5 100644 --- a/nominatim/cli.py +++ b/nominatim/cli.py @@ -14,6 +14,7 @@ from .config import Configuration from .tools.exec_utils import run_legacy_script, run_api_script from .db.connection import connect from .db import status +from .errors import UsageError LOG = logging.getLogger() @@ -89,7 +90,16 @@ class CommandlineParser: args.config = Configuration(args.project_dir, args.data_dir / 'settings') - return args.command.run(args) + try: + return args.command.run(args) + except UsageError as e: + log = logging.getLogger() + if log.isEnabledFor(logging.DEBUG): + raise # use Python's exception printing + log.fatal('FATAL: ' + str(e)) + + # If we get here, then execution has failed in some way. + return 1 def _osm2pgsql_options_from_args(args, default_cache, default_threads): @@ -292,12 +302,12 @@ class UpdateReplication: "Please check install documentation " "(https://nominatim.org/release-docs/latest/admin/Import-and-Update#" "setting-up-the-update-process).") - raise RuntimeError("Invalid replication update interval setting.") + raise UsageError("Invalid replication update interval setting.") if not args.once: if not args.do_index: LOG.fatal("Indexing cannot be disabled when running updates continuously.") - raise RuntimeError("Bad arguments.") + raise UsageError("Bad argument '--no-index'.") recheck_interval = args.config.get_int('REPLICATION_RECHECK_INTERVAL') while True: diff --git a/nominatim/config.py b/nominatim/config.py index 271d2d4d..4de2052e 100644 --- a/nominatim/config.py +++ b/nominatim/config.py @@ -7,6 +7,8 @@ from pathlib import Path from dotenv import dotenv_values +from .errors import UsageError + LOG = logging.getLogger() class Configuration: @@ -57,7 +59,7 @@ class Configuration: return int(self.__getattr__(name)) except ValueError: LOG.fatal("Invalid setting NOMINATIM_%s. Needs to be a number.", name) - raise + raise UsageError("Configuration error.") def get_libpq_dsn(self): diff --git a/nominatim/db/connection.py b/nominatim/db/connection.py index 8e75d7a2..4d30151d 100644 --- a/nominatim/db/connection.py +++ b/nominatim/db/connection.py @@ -27,7 +27,7 @@ class _Cursor(psycopg2.extras.DictCursor): self.execute(sql, args) if self.rowcount != 1: - raise ValueError("Query did not return a single row.") + raise RuntimeError("Query did not return a single row.") return self.fetchone()[0] diff --git a/nominatim/db/status.py b/nominatim/db/status.py index 9d7344c4..75da3c16 100644 --- a/nominatim/db/status.py +++ b/nominatim/db/status.py @@ -6,6 +6,7 @@ import logging import re from ..tools.exec_utils import get_url +from ..errors import UsageError LOG = logging.getLogger() @@ -19,7 +20,7 @@ def compute_database_date(conn): if osmid is None: LOG.fatal("No data found in the database.") - raise RuntimeError("No data found in the database.") + raise UsageError("No data found in the database.") LOG.info("Using node id %d for timestamp lookup", osmid) # Get the node from the API to find the timestamp when it was created. @@ -31,7 +32,7 @@ def compute_database_date(conn): if match is None: LOG.fatal("The node data downloaded from the API does not contain valid data.\n" "URL used: %s", node_url) - raise RuntimeError("Bad API data.") + raise UsageError("Bad API data.") LOG.debug("Found timestamp %s", match[1]) diff --git a/nominatim/errors.py b/nominatim/errors.py new file mode 100644 index 00000000..80181aba --- /dev/null +++ b/nominatim/errors.py @@ -0,0 +1,9 @@ +""" +Custom exception and error classes for Nominatim. +""" + +class UsageError(Exception): + """ An error raised because of bad user input. This error will usually + not cause a stack trace to be printed unless debugging is enabled. + """ + pass diff --git a/nominatim/tools/replication.py b/nominatim/tools/replication.py index 04f1c45b..c7d0d3e5 100644 --- a/nominatim/tools/replication.py +++ b/nominatim/tools/replication.py @@ -11,6 +11,7 @@ from osmium import WriteHandler from ..db import status from .exec_utils import run_osm2pgsql +from ..errors import UsageError LOG = logging.getLogger() @@ -31,7 +32,7 @@ def init_replication(conn, base_url): LOG.fatal("Cannot reach the configured replication service '%s'.\n" "Does the URL point to a directory containing OSM update data?", base_url) - raise RuntimeError("Failed to reach replication service") + raise UsageError("Failed to reach replication service") status.set_status(conn, date=date, seq=seq) @@ -80,7 +81,7 @@ def update(conn, options): if startseq is None: LOG.error("Replication not set up. " "Please run 'nominatim replication --init' first.") - raise RuntimeError("Replication not set up.") + raise UsageError("Replication not set up.") if not indexed and options['indexed_only']: LOG.info("Skipping update. There is data that needs indexing.") diff --git a/test/python/test_cli.py b/test/python/test_cli.py index fc2454cd..c4f3ef36 100644 --- a/test/python/test_cli.py +++ b/test/python/test_cli.py @@ -13,6 +13,7 @@ import nominatim.cli import nominatim.indexer.indexer import nominatim.tools.refresh import nominatim.tools.replication +from nominatim.errors import UsageError def call_nominatim(*args): return nominatim.cli.nominatim(module_dir='build/module', @@ -150,16 +151,14 @@ def test_replication_command(monkeypatch, temp_db, params, func): def test_replication_update_bad_interval(monkeypatch, temp_db): monkeypatch.setenv('NOMINATIM_REPLICATION_UPDATE_INTERVAL', 'xx') - with pytest.raises(ValueError): - call_nominatim('replication') + assert call_nominatim('replication') == 1 def test_replication_update_bad_interval_for_geofabrik(monkeypatch, temp_db): monkeypatch.setenv('NOMINATIM_REPLICATION_URL', 'https://download.geofabrik.de/europe/ireland-and-northern-ireland-updates') - with pytest.raises(RuntimeError, match='Invalid replication.*'): - call_nominatim('replication') + assert call_nominatim('replication') == 1 @pytest.mark.parametrize("state, retval", [ diff --git a/test/python/test_config.py b/test/python/test_config.py index f9fefeb2..4578be13 100644 --- a/test/python/test_config.py +++ b/test/python/test_config.py @@ -7,6 +7,7 @@ import tempfile import pytest from nominatim.config import Configuration +from nominatim.errors import UsageError DEFCFG_DIR = Path(__file__) / '..' / '..' / '..' / 'settings' @@ -123,7 +124,7 @@ def test_get_int_bad_values(monkeypatch, value): monkeypatch.setenv('NOMINATIM_FOOBAR', value) - with pytest.raises(ValueError): + with pytest.raises(UsageError): config.get_int('FOOBAR') @@ -132,7 +133,7 @@ def test_get_int_empty(): assert config.DATABASE_MODULE_PATH == '' - with pytest.raises(ValueError): + with pytest.raises(UsageError): config.get_int('DATABASE_MODULE_PATH') diff --git a/test/python/test_db_connection.py b/test/python/test_db_connection.py index 5c484558..ef1ae741 100644 --- a/test/python/test_db_connection.py +++ b/test/python/test_db_connection.py @@ -28,5 +28,5 @@ def test_cursor_scalar(db, temp_db_cursor): def test_cursor_scalar_many_rows(db): with db.cursor() as cur: - with pytest.raises(ValueError): + with pytest.raises(RuntimeError): cur.scalar('SELECT * FROM pg_tables') diff --git a/test/python/test_db_status.py b/test/python/test_db_status.py index 1a538aec..399a0036 100644 --- a/test/python/test_db_status.py +++ b/test/python/test_db_status.py @@ -6,9 +6,10 @@ import datetime as dt import pytest import nominatim.db.status +from nominatim.errors import UsageError def test_compute_database_date_place_empty(status_table, place_table, temp_db_conn): - with pytest.raises(RuntimeError): + with pytest.raises(UsageError): nominatim.db.status.compute_database_date(temp_db_conn) OSM_NODE_DATA = """\ @@ -44,7 +45,7 @@ def test_compute_database_broken_api(monkeypatch, status_table, place_row, temp_ monkeypatch.setattr(nominatim.db.status, "get_url", mock_url) - with pytest.raises(RuntimeError): + with pytest.raises(UsageError): date = nominatim.db.status.compute_database_date(temp_db_conn) diff --git a/test/python/test_tools_replication.py b/test/python/test_tools_replication.py index b94563ff..156385ad 100644 --- a/test/python/test_tools_replication.py +++ b/test/python/test_tools_replication.py @@ -9,6 +9,7 @@ from osmium.replication.server import OsmosisState import nominatim.tools.replication import nominatim.db.status as status +from nominatim.errors import UsageError OSM_NODE_DATA = """\ @@ -24,7 +25,7 @@ def test_init_replication_bad_base_url(monkeypatch, status_table, place_row, tem monkeypatch.setattr(nominatim.db.status, "get_url", lambda u : OSM_NODE_DATA) - with pytest.raises(RuntimeError, match="Failed to reach replication service"): + with pytest.raises(UsageError, match="Failed to reach replication service"): nominatim.tools.replication.init_replication(temp_db_conn, 'https://test.io') @@ -90,7 +91,7 @@ def update_options(tmpdir): max_diff_size=1) def test_update_empty_status_table(status_table, temp_db_conn): - with pytest.raises(RuntimeError): + with pytest.raises(UsageError): nominatim.tools.replication.update(temp_db_conn, {}) From 7158433cd38096f38b2d97c1ea096b14bd91adba Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Sat, 30 Jan 2021 16:27:02 +0100 Subject: [PATCH 11/13] disable warning about non-toplevel import They are needed here so nominatim can be run when osmium is not installed. Everything except replication will work fine. --- nominatim/cli.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/nominatim/cli.py b/nominatim/cli.py index cc591ea5..487206c1 100644 --- a/nominatim/cli.py +++ b/nominatim/cli.py @@ -123,6 +123,8 @@ def _osm2pgsql_options_from_args(args, default_cache, default_threads): # # No need to document the functions each time. # pylint: disable=C0111 +# Using non-top-level imports to make pyosmium optional for replication only. +# pylint: disable=C0415 class SetupAll: From 90aaab77fce2927a47b1f6ceca012d8ed3d399db Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Sat, 30 Jan 2021 16:42:25 +0100 Subject: [PATCH 12/13] fix linting issues --- nominatim/cli.py | 6 +++--- nominatim/errors.py | 1 - 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/nominatim/cli.py b/nominatim/cli.py index 487206c1..68d3cca8 100644 --- a/nominatim/cli.py +++ b/nominatim/cli.py @@ -92,11 +92,11 @@ class CommandlineParser: try: return args.command.run(args) - except UsageError as e: + except UsageError as exception: log = logging.getLogger() if log.isEnabledFor(logging.DEBUG): raise # use Python's exception printing - log.fatal('FATAL: ' + str(e)) + log.fatal('FATAL: %s', exception) # If we get here, then execution has failed in some way. return 1 @@ -124,7 +124,7 @@ def _osm2pgsql_options_from_args(args, default_cache, default_threads): # No need to document the functions each time. # pylint: disable=C0111 # Using non-top-level imports to make pyosmium optional for replication only. -# pylint: disable=C0415 +# pylint: disable=E0012,C0415 class SetupAll: diff --git a/nominatim/errors.py b/nominatim/errors.py index 80181aba..e77f956a 100644 --- a/nominatim/errors.py +++ b/nominatim/errors.py @@ -6,4 +6,3 @@ class UsageError(Exception): """ An error raised because of bad user input. This error will usually not cause a stack trace to be printed unless debugging is enabled. """ - pass From 5f63d4ca1f54ac9863dde5736b9e3e11b39ecda9 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Sat, 30 Jan 2021 20:32:46 +0100 Subject: [PATCH 13/13] print nice summary after updates --- nominatim/cli.py | 22 ++++++++++++++++++++-- test/python/test_cli.py | 12 +++++++++--- 2 files changed, 29 insertions(+), 5 deletions(-) diff --git a/nominatim/cli.py b/nominatim/cli.py index 68d3cca8..4873308d 100644 --- a/nominatim/cli.py +++ b/nominatim/cli.py @@ -284,6 +284,18 @@ class UpdateReplication: conn.close() return ret + @staticmethod + def _report_update(batchdate, start_import, start_index): + def round_time(delta): + return dt.timedelta(seconds=int(delta.total_seconds())) + + end = dt.datetime.now(dt.timezone.utc) + LOG.warning("Update completed. Import: %s. %sTotal: %s. Remaining backlog: %s.", + round_time((start_index or end) - start_import), + "Indexing: {} ".format(round_time(end - start_index)) + if start_index else '', + round_time(end - start_import), + round_time(end - batchdate)) @staticmethod def _update(args): @@ -317,10 +329,11 @@ class UpdateReplication: start = dt.datetime.now(dt.timezone.utc) state = replication.update(conn, params) status.log_status(conn, start, 'import') + batchdate, _, _ = status.get_status(conn) conn.close() if state is not replication.UpdateState.NO_CHANGES and args.do_index: - start = dt.datetime.now(dt.timezone.utc) + index_start = dt.datetime.now(dt.timezone.utc) indexer = Indexer(args.config.get_libpq_dsn(), args.threads or 1) indexer.index_boundaries(0, 30) @@ -328,8 +341,13 @@ class UpdateReplication: conn = connect(args.config.get_libpq_dsn()) status.set_indexed(conn, True) - status.log_status(conn, start, 'index') + status.log_status(conn, index_start, 'index') conn.close() + else: + index_start = None + + if LOG.isEnabledFor(logging.WARNING): + UpdateReplication._report_update(batchdate, start, index_start) if args.once: break diff --git a/test/python/test_cli.py b/test/python/test_cli.py index c4f3ef36..cde84759 100644 --- a/test/python/test_cli.py +++ b/test/python/test_cli.py @@ -5,6 +5,7 @@ These tests just check that the various command line parameters route to the correct functionionality. They use a lot of monkeypatching to avoid executing the actual functions. """ +import datetime as dt import psycopg2 import pytest import time @@ -14,6 +15,7 @@ import nominatim.indexer.indexer import nominatim.tools.refresh import nominatim.tools.replication from nominatim.errors import UsageError +from nominatim.db import status def call_nominatim(*args): return nominatim.cli.nominatim(module_dir='build/module', @@ -165,14 +167,17 @@ def test_replication_update_bad_interval_for_geofabrik(monkeypatch, temp_db): (nominatim.tools.replication.UpdateState.UP_TO_DATE, 0), (nominatim.tools.replication.UpdateState.NO_CHANGES, 3) ]) -def test_replication_update_once_no_index(monkeypatch, temp_db, status_table, state, retval): +def test_replication_update_once_no_index(monkeypatch, temp_db, temp_db_conn, + status_table, state, retval): + status.set_status(temp_db_conn, date=dt.datetime.now(dt.timezone.utc), seq=1) func_mock = MockParamCapture(retval=state) monkeypatch.setattr(nominatim.tools.replication, 'update', func_mock) assert retval == call_nominatim('replication', '--once', '--no-index') -def test_replication_update_continuous(monkeypatch, status_table): +def test_replication_update_continuous(monkeypatch, temp_db_conn, status_table): + status.set_status(temp_db_conn, date=dt.datetime.now(dt.timezone.utc), seq=1) states = [nominatim.tools.replication.UpdateState.UP_TO_DATE, nominatim.tools.replication.UpdateState.UP_TO_DATE] monkeypatch.setattr(nominatim.tools.replication, 'update', @@ -188,7 +193,8 @@ def test_replication_update_continuous(monkeypatch, status_table): assert index_mock.called == 4 -def test_replication_update_continuous_no_change(monkeypatch, status_table): +def test_replication_update_continuous_no_change(monkeypatch, temp_db_conn, status_table): + status.set_status(temp_db_conn, date=dt.datetime.now(dt.timezone.utc), seq=1) states = [nominatim.tools.replication.UpdateState.NO_CHANGES, nominatim.tools.replication.UpdateState.UP_TO_DATE] monkeypatch.setattr(nominatim.tools.replication, 'update',