port wikipedia importance functions to python

This commit is contained in:
Sarah Hoffmann
2021-02-24 22:02:13 +01:00
parent 32683f73c7
commit c7fd0a7af4
11 changed files with 132 additions and 103 deletions

View File

@@ -131,7 +131,7 @@ if ($aCMDResult['create-partition-functions'] || $aCMDResult['all']) {
if ($aCMDResult['import-wikipedia-articles'] || $aCMDResult['all']) { if ($aCMDResult['import-wikipedia-articles'] || $aCMDResult['all']) {
$bDidSomething = true; $bDidSomething = true;
$oSetup->importWikipediaArticles(); (clone($oNominatimCmd))->addParams('refresh', '--wiki-data')->run();
} }
if ($aCMDResult['load-data'] || $aCMDResult['all']) { if ($aCMDResult['load-data'] || $aCMDResult['all']) {
@@ -157,7 +157,7 @@ if ($aCMDResult['index'] || $aCMDResult['all']) {
if ($aCMDResult['drop']) { if ($aCMDResult['drop']) {
$bDidSomething = true; $bDidSomething = true;
$oSetup->drop($aCMDResult); (clone($oNominatimCmd))->addParams('freeze')->run(true);
} }
if ($aCMDResult['create-search-indices'] || $aCMDResult['all']) { if ($aCMDResult['create-search-indices'] || $aCMDResult['all']) {
@@ -172,7 +172,7 @@ if ($aCMDResult['create-country-names'] || $aCMDResult['all']) {
if ($aCMDResult['setup-website'] || $aCMDResult['all']) { if ($aCMDResult['setup-website'] || $aCMDResult['all']) {
$bDidSomething = true; $bDidSomething = true;
$oSetup->setupWebsite(); (clone($oNominatimCmd))->addParams('refresh', '--website')->run(true);
} }
// ****************************************************** // ******************************************************

View File

@@ -211,20 +211,7 @@ if ($aResult['update-address-levels']) {
} }
if ($aResult['recompute-importance']) { if ($aResult['recompute-importance']) {
echo "Updating importance values for database.\n"; (clone($oNominatimCmd))->addParams('refresh', '--importance')->run(true);
$oDB = new Nominatim\DB();
$oDB->connect();
$sSQL = 'ALTER TABLE placex DISABLE TRIGGER ALL;';
$sSQL .= 'UPDATE placex SET (wikipedia, importance) =';
$sSQL .= ' (SELECT wikipedia, importance';
$sSQL .= ' FROM compute_importance(extratags, country_code, osm_type, osm_id));';
$sSQL .= 'UPDATE placex s SET wikipedia = d.wikipedia, importance = d.importance';
$sSQL .= ' FROM placex d';
$sSQL .= ' WHERE s.place_id = d.linked_place_id and d.wikipedia is not null';
$sSQL .= ' and (s.wikipedia is null or s.importance < d.importance);';
$sSQL .= 'ALTER TABLE placex ENABLE TRIGGER ALL;';
$oDB->exec($sSQL);
} }
if ($aResult['import-osmosis'] || $aResult['import-osmosis-all']) { if ($aResult['import-osmosis'] || $aResult['import-osmosis-all']) {

View File

@@ -6,7 +6,6 @@ require_once(CONST_LibDir.'/Shell.php');
class SetupFunctions class SetupFunctions
{ {
protected $iCacheMemory;
protected $iInstances; protected $iInstances;
protected $aDSNInfo; protected $aDSNInfo;
protected $bQuiet; protected $bQuiet;
@@ -31,16 +30,6 @@ class SetupFunctions
warn('resetting threads to '.$this->iInstances); warn('resetting threads to '.$this->iInstances);
} }
if (isset($aCMDResult['osm2pgsql-cache'])) {
$this->iCacheMemory = $aCMDResult['osm2pgsql-cache'];
} elseif (getSetting('FLATNODE_FILE')) {
// When flatnode files are enabled then disable cache per default.
$this->iCacheMemory = 0;
} else {
// Otherwise: Assume we can steal all the cache memory in the box.
$this->iCacheMemory = getCacheMemoryMB();
}
// parse database string // parse database string
$this->aDSNInfo = \Nominatim\DB::parseDSN(getSetting('DATABASE_DSN')); $this->aDSNInfo = \Nominatim\DB::parseDSN(getSetting('DATABASE_DSN'));
if (!isset($this->aDSNInfo['port'])) { if (!isset($this->aDSNInfo['port'])) {
@@ -82,6 +71,7 @@ class SetupFunctions
if ($this->bVerbose) { if ($this->bVerbose) {
$this->oNominatimCmd->addParams('--verbose'); $this->oNominatimCmd->addParams('--verbose');
} }
$this->oNominatimCmd->addParams('--threads', $this->iInstances);
} }
public function createFunctions() public function createFunctions()
@@ -136,20 +126,6 @@ class SetupFunctions
$this->createSqlFunctions(); // also create partition functions $this->createSqlFunctions(); // also create partition functions
} }
public function importWikipediaArticles()
{
$sWikiArticlePath = getSetting('WIKIPEDIA_DATA_PATH', CONST_InstallDir);
$sWikiArticlesFile = $sWikiArticlePath.'/wikimedia-importance.sql.gz';
if (file_exists($sWikiArticlesFile)) {
info('Importing wikipedia articles and redirects');
$this->dropTable('wikipedia_article');
$this->dropTable('wikipedia_redirect');
$this->pgsqlRunScriptFile($sWikiArticlesFile);
} else {
warn('wikipedia importance dump file not found - places will have default importance');
}
}
public function loadData($bDisableTokenPrecalc) public function loadData($bDisableTokenPrecalc)
{ {
info('Drop old Data'); info('Drop old Data');
@@ -505,21 +481,6 @@ class SetupFunctions
$this->pgsqlRunScript($sSQL); $this->pgsqlRunScript($sSQL);
} }
public function drop()
{
(clone($this->oNominatimCmd))->addParams('freeze')->run();
}
/**
* Setup the directory for the API scripts.
*
* @return null
*/
public function setupWebsite()
{
(clone($this->oNominatimCmd))->addParams('refresh', '--website')->run();
}
/** /**
* Return the connection to the database. * Return the connection to the database.
* *
@@ -538,15 +499,6 @@ class SetupFunctions
return $this->oDB; return $this->oDB;
} }
private function removeFlatnodeFile()
{
$sFName = getSetting('FLATNODE_FILE');
if ($sFName && file_exists($sFName)) {
if ($this->bVerbose) echo 'Deleting '.$sFName."\n";
unlink($sFName);
}
}
private function pgsqlRunScript($sScript, $bfatal = true) private function pgsqlRunScript($sScript, $bfatal = true)
{ {
runSQLScript( runSQLScript(
@@ -570,7 +522,7 @@ class SetupFunctions
$oCmd->addParams('--enable-debug-statements'); $oCmd->addParams('--enable-debug-statements');
} }
$oCmd->run(); $oCmd->run(!$this->sIgnoreErrors);
} }
private function pgsqlRunPartitionScript($sTemplate) private function pgsqlRunPartitionScript($sTemplate)

View File

@@ -5,7 +5,6 @@ import logging
from pathlib import Path from pathlib import Path
from ..db.connection import connect from ..db.connection import connect
from ..tools.exec_utils import run_legacy_script
# Do not repeat documentation of subcommand classes. # Do not repeat documentation of subcommand classes.
# pylint: disable=C0111 # pylint: disable=C0111
@@ -69,12 +68,20 @@ class UpdateRefresh:
args.diffs, args.enable_debug_statements) args.diffs, args.enable_debug_statements)
if args.wiki_data: if args.wiki_data:
run_legacy_script('setup.php', '--import-wikipedia-articles', data_path = Path(args.config.WIKIPEDIA_DATA_PATH
nominatim_env=args, throw_on_fail=True) or args.project_dir)
LOG.warning('Import wikipdia article importance from %s', data_path)
if refresh.import_wikipedia_articles(args.config.get_libpq_dsn(),
data_path) > 0:
LOG.fatal('FATAL: Wikipedia importance dump file not found')
return 1
# Attention: importance MUST come after wiki data import. # Attention: importance MUST come after wiki data import.
if args.importance: if args.importance:
run_legacy_script('update.php', '--recompute-importance', LOG.warning('Update importance values for database')
nominatim_env=args, throw_on_fail=True) with connect(args.config.get_libpq_dsn()) as conn:
refresh.recompute_importance(conn)
if args.website: if args.website:
webdir = args.project_dir / 'website' webdir = args.project_dir / 'website'
LOG.warning('Setting up website directory at %s', webdir) LOG.warning('Setting up website directory at %s', webdir)

View File

@@ -21,9 +21,12 @@ def _pipe_to_proc(proc, fdesc):
return len(chunk) return len(chunk)
def execute_file(dsn, fname, ignore_errors=False): def execute_file(dsn, fname, ignore_errors=False, pre_code=None, post_code=None):
""" Read an SQL file and run its contents against the given database """ Read an SQL file and run its contents against the given database
using psql. using psql. Use `pre_code` and `post_code` to run extra commands
before or after executing the file. The commands are run within the
same session, so they may be used to wrap the file execution in a
transaction.
""" """
cmd = ['psql'] cmd = ['psql']
if not ignore_errors: if not ignore_errors:
@@ -33,6 +36,9 @@ def execute_file(dsn, fname, ignore_errors=False):
if not LOG.isEnabledFor(logging.INFO): if not LOG.isEnabledFor(logging.INFO):
proc.stdin.write('set client_min_messages to WARNING;'.encode('utf-8')) proc.stdin.write('set client_min_messages to WARNING;'.encode('utf-8'))
if pre_code:
proc.stdin.write((pre_code + ';').encode('utf-8'))
if fname.suffix == '.gz': if fname.suffix == '.gz':
with gzip.open(str(fname), 'rb') as fdesc: with gzip.open(str(fname), 'rb') as fdesc:
remain = _pipe_to_proc(proc, fdesc) remain = _pipe_to_proc(proc, fdesc)
@@ -40,6 +46,9 @@ def execute_file(dsn, fname, ignore_errors=False):
with fname.open('rb') as fdesc: with fname.open('rb') as fdesc:
remain = _pipe_to_proc(proc, fdesc) remain = _pipe_to_proc(proc, fdesc)
if remain == 0 and post_code:
proc.stdin.write((';' + post_code).encode('utf-8'))
proc.stdin.close() proc.stdin.close()
ret = proc.wait() ret = proc.wait()

View File

@@ -200,6 +200,53 @@ PHP_CONST_DEFS = (
) )
def import_wikipedia_articles(dsn, data_path, ignore_errors=False):
    """ Replaces the wikipedia importance tables with new data.
        The import is run in a single transaction so that the new data
        is replaced seamlessly.

        Returns 0 if all was well and 1 if the importance file could not
        be found. Throws an exception if there was an error reading the file.
    """
    datafile = data_path / 'wikimedia-importance.sql.gz'

    # A missing dump file is not fatal: places simply keep their
    # default importance. The caller decides how loudly to complain.
    if not datafile.exists():
        return 1

    # Drop and re-create the tables inside a single transaction so that
    # concurrent readers never observe a state without the tables.
    # execute_file() appends ';' after pre_code and before post_code.
    pre_code = """BEGIN;
                  DROP TABLE IF EXISTS "wikipedia_article";
                  DROP TABLE IF EXISTS "wikipedia_redirect"
               """
    post_code = "COMMIT"
    execute_file(dsn, datafile, ignore_errors=ignore_errors,
                 pre_code=pre_code, post_code=post_code)

    return 0
def recompute_importance(conn):
    """ Recompute wikipedia links and importance for all entries in placex.

        This is a long-running operation that must not be executed in
        parallel with updates.
    """
    with conn.cursor() as cur:
        # Triggers would fire for every updated row; disable them while
        # rewriting the whole table and re-enable afterwards.
        cur.execute('ALTER TABLE placex DISABLE TRIGGER ALL')
        cur.execute("""
            UPDATE placex SET (wikipedia, importance) =
               (SELECT wikipedia, importance
                FROM compute_importance(extratags, country_code, osm_type, osm_id))
            """)
        # Propagate importance from linked places to their parents where the
        # parent has no wikipedia link or a lower importance.
        cur.execute("""
            UPDATE placex s SET wikipedia = d.wikipedia, importance = d.importance
             FROM placex d
             WHERE s.place_id = d.linked_place_id and d.wikipedia is not null
                   and (s.wikipedia is null or s.importance < d.importance);
            """)
        cur.execute('ALTER TABLE placex ENABLE TRIGGER ALL')
    conn.commit()
def setup_website(basedir, phplib_dir, config): def setup_website(basedir, phplib_dir, config):
""" Create the website script stubs. """ Create the website script stubs.
""" """

View File

@@ -71,6 +71,12 @@ def temp_db(monkeypatch):
conn.close() conn.close()
@pytest.fixture
def dsn(temp_db):
    """ Return a libpq connection string for the test database. """
    return 'dbname={}'.format(temp_db)
@pytest.fixture @pytest.fixture
def temp_db_with_extensions(temp_db): def temp_db_with_extensions(temp_db):
conn = psycopg2.connect(database=temp_db) conn = psycopg2.connect(database=temp_db)
@@ -101,6 +107,14 @@ def temp_db_cursor(temp_db):
conn.close() conn.close()
@pytest.fixture
def table_factory(temp_db_cursor):
    """ Return a function that creates a table with the given name and
        column definition in the test database.
    """
    def _create(name, definition='id INT'):
        temp_db_cursor.execute('CREATE TABLE {} ({})'.format(name, definition))

    return _create
@pytest.fixture @pytest.fixture
def def_config(): def def_config():
return Configuration(None, SRC_DIR.resolve() / 'settings') return Configuration(None, SRC_DIR.resolve() / 'settings')

View File

@@ -135,24 +135,13 @@ def test_index_command(mock_func_factory, temp_db_cursor, params, do_bnds, do_ra
assert rank_mock.called == do_ranks assert rank_mock.called == do_ranks
@pytest.mark.parametrize("command,params", [
('wiki-data', ('setup.php', '--import-wikipedia-articles')),
('importance', ('update.php', '--recompute-importance')),
])
def test_refresh_legacy_command(mock_func_factory, temp_db, command, params):
mock_run_legacy = mock_func_factory(nominatim.clicmd.refresh, 'run_legacy_script')
assert 0 == call_nominatim('refresh', '--' + command)
assert mock_run_legacy.called == 1
assert len(mock_run_legacy.last_args) >= len(params)
assert mock_run_legacy.last_args[:len(params)] == params
@pytest.mark.parametrize("command,func", [ @pytest.mark.parametrize("command,func", [
('postcodes', 'update_postcodes'), ('postcodes', 'update_postcodes'),
('word-counts', 'recompute_word_counts'), ('word-counts', 'recompute_word_counts'),
('address-levels', 'load_address_levels_from_file'), ('address-levels', 'load_address_levels_from_file'),
('functions', 'create_functions'), ('functions', 'create_functions'),
('wiki-data', 'import_wikipedia_articles'),
('importance', 'recompute_importance'),
('website', 'setup_website'), ('website', 'setup_website'),
]) ])
def test_refresh_command(mock_func_factory, temp_db, command, func): def test_refresh_command(mock_func_factory, temp_db, command, func):
@@ -162,13 +151,16 @@ def test_refresh_command(mock_func_factory, temp_db, command, func):
assert func_mock.called == 1 assert func_mock.called == 1
def test_refresh_importance_computed_after_wiki_import(mock_func_factory, temp_db): def test_refresh_importance_computed_after_wiki_import(monkeypatch, temp_db):
mock_run_legacy = mock_func_factory(nominatim.clicmd.refresh, 'run_legacy_script') calls = []
monkeypatch.setattr(nominatim.tools.refresh, 'import_wikipedia_articles',
lambda *args, **kwargs: calls.append('import') or 0)
monkeypatch.setattr(nominatim.tools.refresh, 'recompute_importance',
lambda *args, **kwargs: calls.append('update'))
assert 0 == call_nominatim('refresh', '--importance', '--wiki-data') assert 0 == call_nominatim('refresh', '--importance', '--wiki-data')
assert mock_run_legacy.called == 2 assert calls == ['import', 'update']
assert mock_run_legacy.last_args == ('update.php', '--recompute-importance')
def test_serve_command(mock_func_factory): def test_serve_command(mock_func_factory):

View File

@@ -12,10 +12,10 @@ def db(temp_db):
yield conn yield conn
def test_connection_table_exists(db, temp_db_cursor): def test_connection_table_exists(db, table_factory):
assert db.table_exists('foobar') == False assert db.table_exists('foobar') == False
temp_db_cursor.execute('CREATE TABLE foobar (id INT)') table_factory('foobar')
assert db.table_exists('foobar') == True assert db.table_exists('foobar') == True
@@ -31,10 +31,10 @@ def test_connection_index_exists(db, temp_db_cursor):
assert db.index_exists('some_index', table='bar') == False assert db.index_exists('some_index', table='bar') == False
def test_drop_table_existing(db, temp_db_cursor): def test_drop_table_existing(db, table_factory):
temp_db_cursor.execute('CREATE TABLE dummy (id INT)') table_factory('dummy')
assert db.table_exists('dummy') assert db.table_exists('dummy')
db.drop_table('dummy') db.drop_table('dummy')
assert not db.table_exists('dummy') assert not db.table_exists('dummy')
@@ -65,8 +65,8 @@ def test_connection_postgis_version_tuple(db, temp_db_cursor):
assert ver[0] >= 2 assert ver[0] >= 2
def test_cursor_scalar(db, temp_db_cursor): def test_cursor_scalar(db, table_factory):
temp_db_cursor.execute('CREATE TABLE dummy (id INT)') table_factory('dummy')
with db.cursor() as cur: with db.cursor() as cur:
assert cur.scalar('SELECT count(*) FROM dummy') == 0 assert cur.scalar('SELECT count(*) FROM dummy') == 0

View File

@@ -7,10 +7,6 @@ import pytest
import nominatim.db.utils as db_utils import nominatim.db.utils as db_utils
from nominatim.errors import UsageError from nominatim.errors import UsageError
@pytest.fixture
def dsn(temp_db):
return 'dbname=' + temp_db
def test_execute_file_success(dsn, temp_db_cursor, tmp_path): def test_execute_file_success(dsn, temp_db_cursor, tmp_path):
tmpfile = tmp_path / 'test.sql' tmpfile = tmp_path / 'test.sql'
tmpfile.write_text('CREATE TABLE test (id INT);\nINSERT INTO test VALUES(56);') tmpfile.write_text('CREATE TABLE test (id INT);\nINSERT INTO test VALUES(56);')
@@ -40,3 +36,27 @@ def test_execute_file_bad_sql_ignore_errors(dsn, tmp_path):
tmpfile.write_text('CREATE STABLE test (id INT)') tmpfile.write_text('CREATE STABLE test (id INT)')
db_utils.execute_file(dsn, tmpfile, ignore_errors=True) db_utils.execute_file(dsn, tmpfile, ignore_errors=True)
def test_execute_file_with_pre_code(dsn, tmp_path, temp_db_cursor):
    # pre_code runs in the same session before the file contents, so the
    # INSERT in the file may rely on the table created by pre_code.
    sqlfile = tmp_path / 'test.sql'
    sqlfile.write_text('INSERT INTO test VALUES(4)')

    db_utils.execute_file(dsn, sqlfile, pre_code='CREATE TABLE test (id INT)')

    temp_db_cursor.execute('SELECT * FROM test')

    assert temp_db_cursor.rowcount == 1
    assert temp_db_cursor.fetchone()[0] == 4
def test_execute_file_with_post_code(dsn, tmp_path, temp_db_cursor):
    # post_code runs in the same session after the file contents, so the
    # INSERT may rely on the table created by the file.
    sqlfile = tmp_path / 'test.sql'
    sqlfile.write_text('CREATE TABLE test (id INT)')

    db_utils.execute_file(dsn, sqlfile, post_code='INSERT INTO test VALUES(23)')

    temp_db_cursor.execute('SELECT * FROM test')

    assert temp_db_cursor.rowcount == 1
    assert temp_db_cursor.fetchone()[0] == 23

View File

@@ -2,9 +2,10 @@
Tests for function for importing address ranks. Tests for function for importing address ranks.
""" """
import json import json
import pytest
from pathlib import Path from pathlib import Path
import pytest
from nominatim.tools.refresh import load_address_levels, load_address_levels_from_file from nominatim.tools.refresh import load_address_levels, load_address_levels_from_file
def test_load_ranks_def_config(temp_db_conn, temp_db_cursor, def_config): def test_load_ranks_def_config(temp_db_conn, temp_db_cursor, def_config):