port wikipedia importance functions to python

This commit is contained in:
Sarah Hoffmann
2021-02-24 22:02:13 +01:00
parent 32683f73c7
commit c7fd0a7af4
11 changed files with 132 additions and 103 deletions

View File

@@ -131,7 +131,7 @@ if ($aCMDResult['create-partition-functions'] || $aCMDResult['all']) {
if ($aCMDResult['import-wikipedia-articles'] || $aCMDResult['all']) {
$bDidSomething = true;
$oSetup->importWikipediaArticles();
(clone($oNominatimCmd))->addParams('refresh', '--wiki-data')->run();
}
if ($aCMDResult['load-data'] || $aCMDResult['all']) {
@@ -157,7 +157,7 @@ if ($aCMDResult['index'] || $aCMDResult['all']) {
if ($aCMDResult['drop']) {
$bDidSomething = true;
$oSetup->drop($aCMDResult);
(clone($oNominatimCmd))->addParams('freeze')->run(true);
}
if ($aCMDResult['create-search-indices'] || $aCMDResult['all']) {
@@ -172,7 +172,7 @@ if ($aCMDResult['create-country-names'] || $aCMDResult['all']) {
if ($aCMDResult['setup-website'] || $aCMDResult['all']) {
$bDidSomething = true;
$oSetup->setupWebsite();
(clone($oNominatimCmd))->addParams('refresh', '--website')->run(true);
}
// ******************************************************

View File

@@ -211,20 +211,7 @@ if ($aResult['update-address-levels']) {
}
if ($aResult['recompute-importance']) {
echo "Updating importance values for database.\n";
$oDB = new Nominatim\DB();
$oDB->connect();
$sSQL = 'ALTER TABLE placex DISABLE TRIGGER ALL;';
$sSQL .= 'UPDATE placex SET (wikipedia, importance) =';
$sSQL .= ' (SELECT wikipedia, importance';
$sSQL .= ' FROM compute_importance(extratags, country_code, osm_type, osm_id));';
$sSQL .= 'UPDATE placex s SET wikipedia = d.wikipedia, importance = d.importance';
$sSQL .= ' FROM placex d';
$sSQL .= ' WHERE s.place_id = d.linked_place_id and d.wikipedia is not null';
$sSQL .= ' and (s.wikipedia is null or s.importance < d.importance);';
$sSQL .= 'ALTER TABLE placex ENABLE TRIGGER ALL;';
$oDB->exec($sSQL);
(clone($oNominatimCmd))->addParams('refresh', '--importance')->run(true);
}
if ($aResult['import-osmosis'] || $aResult['import-osmosis-all']) {

View File

@@ -6,7 +6,6 @@ require_once(CONST_LibDir.'/Shell.php');
class SetupFunctions
{
protected $iCacheMemory;
protected $iInstances;
protected $aDSNInfo;
protected $bQuiet;
@@ -31,16 +30,6 @@ class SetupFunctions
warn('resetting threads to '.$this->iInstances);
}
if (isset($aCMDResult['osm2pgsql-cache'])) {
$this->iCacheMemory = $aCMDResult['osm2pgsql-cache'];
} elseif (getSetting('FLATNODE_FILE')) {
// When flatnode files are enabled then disable cache per default.
$this->iCacheMemory = 0;
} else {
// Otherwise: Assume we can steal all the cache memory in the box.
$this->iCacheMemory = getCacheMemoryMB();
}
// parse database string
$this->aDSNInfo = \Nominatim\DB::parseDSN(getSetting('DATABASE_DSN'));
if (!isset($this->aDSNInfo['port'])) {
@@ -82,6 +71,7 @@ class SetupFunctions
if ($this->bVerbose) {
$this->oNominatimCmd->addParams('--verbose');
}
$this->oNominatimCmd->addParams('--threads', $this->iInstances);
}
public function createFunctions()
@@ -136,20 +126,6 @@ class SetupFunctions
$this->createSqlFunctions(); // also create partition functions
}
public function importWikipediaArticles()
{
$sWikiArticlePath = getSetting('WIKIPEDIA_DATA_PATH', CONST_InstallDir);
$sWikiArticlesFile = $sWikiArticlePath.'/wikimedia-importance.sql.gz';
if (file_exists($sWikiArticlesFile)) {
info('Importing wikipedia articles and redirects');
$this->dropTable('wikipedia_article');
$this->dropTable('wikipedia_redirect');
$this->pgsqlRunScriptFile($sWikiArticlesFile);
} else {
warn('wikipedia importance dump file not found - places will have default importance');
}
}
public function loadData($bDisableTokenPrecalc)
{
info('Drop old Data');
@@ -505,21 +481,6 @@ class SetupFunctions
$this->pgsqlRunScript($sSQL);
}
public function drop()
{
(clone($this->oNominatimCmd))->addParams('freeze')->run();
}
/**
* Setup the directory for the API scripts.
*
* @return null
*/
public function setupWebsite()
{
(clone($this->oNominatimCmd))->addParams('refresh', '--website')->run();
}
/**
* Return the connection to the database.
*
@@ -538,15 +499,6 @@ class SetupFunctions
return $this->oDB;
}
private function removeFlatnodeFile()
{
$sFName = getSetting('FLATNODE_FILE');
if ($sFName && file_exists($sFName)) {
if ($this->bVerbose) echo 'Deleting '.$sFName."\n";
unlink($sFName);
}
}
private function pgsqlRunScript($sScript, $bfatal = true)
{
runSQLScript(
@@ -570,7 +522,7 @@ class SetupFunctions
$oCmd->addParams('--enable-debug-statements');
}
$oCmd->run();
$oCmd->run(!$this->sIgnoreErrors);
}
private function pgsqlRunPartitionScript($sTemplate)

View File

@@ -5,7 +5,6 @@ import logging
from pathlib import Path
from ..db.connection import connect
from ..tools.exec_utils import run_legacy_script
# Do not repeat documentation of subcommand classes.
# pylint: disable=C0111
@@ -69,12 +68,20 @@ class UpdateRefresh:
args.diffs, args.enable_debug_statements)
if args.wiki_data:
run_legacy_script('setup.php', '--import-wikipedia-articles',
nominatim_env=args, throw_on_fail=True)
data_path = Path(args.config.WIKIPEDIA_DATA_PATH
or args.project_dir)
LOG.warning('Import wikipdia article importance from %s', data_path)
if refresh.import_wikipedia_articles(args.config.get_libpq_dsn(),
data_path) > 0:
LOG.fatal('FATAL: Wikipedia importance dump file not found')
return 1
# Attention: importance MUST come after wiki data import.
if args.importance:
run_legacy_script('update.php', '--recompute-importance',
nominatim_env=args, throw_on_fail=True)
LOG.warning('Update importance values for database')
with connect(args.config.get_libpq_dsn()) as conn:
refresh.recompute_importance(conn)
if args.website:
webdir = args.project_dir / 'website'
LOG.warning('Setting up website directory at %s', webdir)

View File

@@ -21,9 +21,12 @@ def _pipe_to_proc(proc, fdesc):
return len(chunk)
def execute_file(dsn, fname, ignore_errors=False):
def execute_file(dsn, fname, ignore_errors=False, pre_code=None, post_code=None):
""" Read an SQL file and run its contents against the given database
using psql.
using psql. Use `pre_code` and `post_code` to run extra commands
before or after executing the file. The commands are run within the
same session, so they may be used to wrap the file execution in a
transaction.
"""
cmd = ['psql']
if not ignore_errors:
@@ -33,6 +36,9 @@ def execute_file(dsn, fname, ignore_errors=False):
if not LOG.isEnabledFor(logging.INFO):
proc.stdin.write('set client_min_messages to WARNING;'.encode('utf-8'))
if pre_code:
proc.stdin.write((pre_code + ';').encode('utf-8'))
if fname.suffix == '.gz':
with gzip.open(str(fname), 'rb') as fdesc:
remain = _pipe_to_proc(proc, fdesc)
@@ -40,6 +46,9 @@ def execute_file(dsn, fname, ignore_errors=False):
with fname.open('rb') as fdesc:
remain = _pipe_to_proc(proc, fdesc)
if remain == 0 and post_code:
proc.stdin.write((';' + post_code).encode('utf-8'))
proc.stdin.close()
ret = proc.wait()

View File

@@ -200,6 +200,53 @@ PHP_CONST_DEFS = (
)
def import_wikipedia_articles(dsn, data_path, ignore_errors=False):
""" Replaces the wikipedia importance tables with new data.
The import is run in a single transaction so that the new data
is replace seemlessly.
Returns 0 if all was well and 1 if the importance file could not
be found. Throws an exception if there was an error reading the file.
"""
datafile = data_path / 'wikimedia-importance.sql.gz'
if not datafile.exists():
return 1
pre_code = """BEGIN;
DROP TABLE IF EXISTS "wikipedia_article";
DROP TABLE IF EXISTS "wikipedia_redirect"
"""
post_code = "COMMIT"
execute_file(dsn, datafile, ignore_errors=ignore_errors,
pre_code=pre_code, post_code=post_code)
return 0
def recompute_importance(conn):
""" Recompute wikipedia links and importance for all entries in placex.
This is a long-running operations that must not be executed in
parallel with updates.
"""
with conn.cursor() as cur:
cur.execute('ALTER TABLE placex DISABLE TRIGGER ALL')
cur.execute("""
UPDATE placex SET (wikipedia, importance) =
(SELECT wikipedia, importance
FROM compute_importance(extratags, country_code, osm_type, osm_id))
""")
cur.execute("""
UPDATE placex s SET wikipedia = d.wikipedia, importance = d.importance
FROM placex d
WHERE s.place_id = d.linked_place_id and d.wikipedia is not null
and (s.wikipedia is null or s.importance < d.importance);
""")
cur.execute('ALTER TABLE placex ENABLE TRIGGER ALL')
conn.commit()
def setup_website(basedir, phplib_dir, config):
""" Create the website script stubs.
"""

View File

@@ -71,6 +71,12 @@ def temp_db(monkeypatch):
conn.close()
@pytest.fixture
def dsn(temp_db):
return 'dbname=' + temp_db
@pytest.fixture
def temp_db_with_extensions(temp_db):
conn = psycopg2.connect(database=temp_db)
@@ -101,6 +107,14 @@ def temp_db_cursor(temp_db):
conn.close()
@pytest.fixture
def table_factory(temp_db_cursor):
def mk_table(name, definition='id INT'):
temp_db_cursor.execute('CREATE TABLE {} ({})'.format(name, definition))
return mk_table
@pytest.fixture
def def_config():
return Configuration(None, SRC_DIR.resolve() / 'settings')

View File

@@ -135,24 +135,13 @@ def test_index_command(mock_func_factory, temp_db_cursor, params, do_bnds, do_ra
assert rank_mock.called == do_ranks
@pytest.mark.parametrize("command,params", [
('wiki-data', ('setup.php', '--import-wikipedia-articles')),
('importance', ('update.php', '--recompute-importance')),
])
def test_refresh_legacy_command(mock_func_factory, temp_db, command, params):
mock_run_legacy = mock_func_factory(nominatim.clicmd.refresh, 'run_legacy_script')
assert 0 == call_nominatim('refresh', '--' + command)
assert mock_run_legacy.called == 1
assert len(mock_run_legacy.last_args) >= len(params)
assert mock_run_legacy.last_args[:len(params)] == params
@pytest.mark.parametrize("command,func", [
('postcodes', 'update_postcodes'),
('word-counts', 'recompute_word_counts'),
('address-levels', 'load_address_levels_from_file'),
('functions', 'create_functions'),
('wiki-data', 'import_wikipedia_articles'),
('importance', 'recompute_importance'),
('website', 'setup_website'),
])
def test_refresh_command(mock_func_factory, temp_db, command, func):
@@ -162,13 +151,16 @@ def test_refresh_command(mock_func_factory, temp_db, command, func):
assert func_mock.called == 1
def test_refresh_importance_computed_after_wiki_import(mock_func_factory, temp_db):
mock_run_legacy = mock_func_factory(nominatim.clicmd.refresh, 'run_legacy_script')
def test_refresh_importance_computed_after_wiki_import(monkeypatch, temp_db):
calls = []
monkeypatch.setattr(nominatim.tools.refresh, 'import_wikipedia_articles',
lambda *args, **kwargs: calls.append('import') or 0)
monkeypatch.setattr(nominatim.tools.refresh, 'recompute_importance',
lambda *args, **kwargs: calls.append('update'))
assert 0 == call_nominatim('refresh', '--importance', '--wiki-data')
assert mock_run_legacy.called == 2
assert mock_run_legacy.last_args == ('update.php', '--recompute-importance')
assert calls == ['import', 'update']
def test_serve_command(mock_func_factory):

View File

@@ -12,10 +12,10 @@ def db(temp_db):
yield conn
def test_connection_table_exists(db, temp_db_cursor):
def test_connection_table_exists(db, table_factory):
assert db.table_exists('foobar') == False
temp_db_cursor.execute('CREATE TABLE foobar (id INT)')
table_factory('foobar')
assert db.table_exists('foobar') == True
@@ -31,10 +31,10 @@ def test_connection_index_exists(db, temp_db_cursor):
assert db.index_exists('some_index', table='bar') == False
def test_drop_table_existing(db, temp_db_cursor):
temp_db_cursor.execute('CREATE TABLE dummy (id INT)')
def test_drop_table_existing(db, table_factory):
table_factory('dummy')
assert db.table_exists('dummy')
db.drop_table('dummy')
assert not db.table_exists('dummy')
@@ -65,8 +65,8 @@ def test_connection_postgis_version_tuple(db, temp_db_cursor):
assert ver[0] >= 2
def test_cursor_scalar(db, temp_db_cursor):
temp_db_cursor.execute('CREATE TABLE dummy (id INT)')
def test_cursor_scalar(db, table_factory):
table_factory('dummy')
with db.cursor() as cur:
assert cur.scalar('SELECT count(*) FROM dummy') == 0

View File

@@ -7,10 +7,6 @@ import pytest
import nominatim.db.utils as db_utils
from nominatim.errors import UsageError
@pytest.fixture
def dsn(temp_db):
return 'dbname=' + temp_db
def test_execute_file_success(dsn, temp_db_cursor, tmp_path):
tmpfile = tmp_path / 'test.sql'
tmpfile.write_text('CREATE TABLE test (id INT);\nINSERT INTO test VALUES(56);')
@@ -40,3 +36,27 @@ def test_execute_file_bad_sql_ignore_errors(dsn, tmp_path):
tmpfile.write_text('CREATE STABLE test (id INT)')
db_utils.execute_file(dsn, tmpfile, ignore_errors=True)
def test_execute_file_with_pre_code(dsn, tmp_path, temp_db_cursor):
tmpfile = tmp_path / 'test.sql'
tmpfile.write_text('INSERT INTO test VALUES(4)')
db_utils.execute_file(dsn, tmpfile, pre_code='CREATE TABLE test (id INT)')
temp_db_cursor.execute('SELECT * FROM test')
assert temp_db_cursor.rowcount == 1
assert temp_db_cursor.fetchone()[0] == 4
def test_execute_file_with_post_code(dsn, tmp_path, temp_db_cursor):
tmpfile = tmp_path / 'test.sql'
tmpfile.write_text('CREATE TABLE test (id INT)')
db_utils.execute_file(dsn, tmpfile, post_code='INSERT INTO test VALUES(23)')
temp_db_cursor.execute('SELECT * FROM test')
assert temp_db_cursor.rowcount == 1
assert temp_db_cursor.fetchone()[0] == 23

View File

@@ -2,9 +2,10 @@
Tests for function for importing address ranks.
"""
import json
import pytest
from pathlib import Path
import pytest
from nominatim.tools.refresh import load_address_levels, load_address_levels_from_file
def test_load_ranks_def_config(temp_db_conn, temp_db_cursor, def_config):