forked from hans/Nominatim
port wikipedia importance functions to python
This commit is contained in:
@@ -131,7 +131,7 @@ if ($aCMDResult['create-partition-functions'] || $aCMDResult['all']) {
|
||||
|
||||
if ($aCMDResult['import-wikipedia-articles'] || $aCMDResult['all']) {
|
||||
$bDidSomething = true;
|
||||
$oSetup->importWikipediaArticles();
|
||||
(clone($oNominatimCmd))->addParams('refresh', '--wiki-data')->run();
|
||||
}
|
||||
|
||||
if ($aCMDResult['load-data'] || $aCMDResult['all']) {
|
||||
@@ -157,7 +157,7 @@ if ($aCMDResult['index'] || $aCMDResult['all']) {
|
||||
|
||||
if ($aCMDResult['drop']) {
|
||||
$bDidSomething = true;
|
||||
$oSetup->drop($aCMDResult);
|
||||
(clone($oNominatimCmd))->addParams('freeze')->run(true);
|
||||
}
|
||||
|
||||
if ($aCMDResult['create-search-indices'] || $aCMDResult['all']) {
|
||||
@@ -172,7 +172,7 @@ if ($aCMDResult['create-country-names'] || $aCMDResult['all']) {
|
||||
|
||||
if ($aCMDResult['setup-website'] || $aCMDResult['all']) {
|
||||
$bDidSomething = true;
|
||||
$oSetup->setupWebsite();
|
||||
(clone($oNominatimCmd))->addParams('refresh', '--website')->run(true);
|
||||
}
|
||||
|
||||
// ******************************************************
|
||||
|
||||
@@ -211,20 +211,7 @@ if ($aResult['update-address-levels']) {
|
||||
}
|
||||
|
||||
if ($aResult['recompute-importance']) {
|
||||
echo "Updating importance values for database.\n";
|
||||
$oDB = new Nominatim\DB();
|
||||
$oDB->connect();
|
||||
|
||||
$sSQL = 'ALTER TABLE placex DISABLE TRIGGER ALL;';
|
||||
$sSQL .= 'UPDATE placex SET (wikipedia, importance) =';
|
||||
$sSQL .= ' (SELECT wikipedia, importance';
|
||||
$sSQL .= ' FROM compute_importance(extratags, country_code, osm_type, osm_id));';
|
||||
$sSQL .= 'UPDATE placex s SET wikipedia = d.wikipedia, importance = d.importance';
|
||||
$sSQL .= ' FROM placex d';
|
||||
$sSQL .= ' WHERE s.place_id = d.linked_place_id and d.wikipedia is not null';
|
||||
$sSQL .= ' and (s.wikipedia is null or s.importance < d.importance);';
|
||||
$sSQL .= 'ALTER TABLE placex ENABLE TRIGGER ALL;';
|
||||
$oDB->exec($sSQL);
|
||||
(clone($oNominatimCmd))->addParams('refresh', '--importance')->run(true);
|
||||
}
|
||||
|
||||
if ($aResult['import-osmosis'] || $aResult['import-osmosis-all']) {
|
||||
|
||||
@@ -6,7 +6,6 @@ require_once(CONST_LibDir.'/Shell.php');
|
||||
|
||||
class SetupFunctions
|
||||
{
|
||||
protected $iCacheMemory;
|
||||
protected $iInstances;
|
||||
protected $aDSNInfo;
|
||||
protected $bQuiet;
|
||||
@@ -31,16 +30,6 @@ class SetupFunctions
|
||||
warn('resetting threads to '.$this->iInstances);
|
||||
}
|
||||
|
||||
if (isset($aCMDResult['osm2pgsql-cache'])) {
|
||||
$this->iCacheMemory = $aCMDResult['osm2pgsql-cache'];
|
||||
} elseif (getSetting('FLATNODE_FILE')) {
|
||||
// When flatnode files are enabled then disable cache per default.
|
||||
$this->iCacheMemory = 0;
|
||||
} else {
|
||||
// Otherwise: Assume we can steal all the cache memory in the box.
|
||||
$this->iCacheMemory = getCacheMemoryMB();
|
||||
}
|
||||
|
||||
// parse database string
|
||||
$this->aDSNInfo = \Nominatim\DB::parseDSN(getSetting('DATABASE_DSN'));
|
||||
if (!isset($this->aDSNInfo['port'])) {
|
||||
@@ -82,6 +71,7 @@ class SetupFunctions
|
||||
if ($this->bVerbose) {
|
||||
$this->oNominatimCmd->addParams('--verbose');
|
||||
}
|
||||
$this->oNominatimCmd->addParams('--threads', $this->iInstances);
|
||||
}
|
||||
|
||||
public function createFunctions()
|
||||
@@ -136,20 +126,6 @@ class SetupFunctions
|
||||
$this->createSqlFunctions(); // also create partition functions
|
||||
}
|
||||
|
||||
/**
 * Load the wikipedia importance dump into the database.
 *
 * The dump is expected as 'wikimedia-importance.sql.gz' inside the
 * directory returned by getSetting('WIKIPEDIA_DATA_PATH', CONST_InstallDir).
 * If the file does not exist, a warning is printed and nothing is imported.
 *
 * @return null
 */
public function importWikipediaArticles()
{
    $sDumpFile = getSetting('WIKIPEDIA_DATA_PATH', CONST_InstallDir)
                 .'/wikimedia-importance.sql.gz';

    if (!file_exists($sDumpFile)) {
        warn('wikipedia importance dump file not found - places will have default importance');
        return;
    }

    info('Importing wikipedia articles and redirects');
    // Remove the tables from a previous import before running the dump.
    foreach (array('wikipedia_article', 'wikipedia_redirect') as $sTable) {
        $this->dropTable($sTable);
    }
    $this->pgsqlRunScriptFile($sDumpFile);
}
|
||||
|
||||
public function loadData($bDisableTokenPrecalc)
|
||||
{
|
||||
info('Drop old Data');
|
||||
@@ -505,21 +481,6 @@ class SetupFunctions
|
||||
$this->pgsqlRunScript($sSQL);
|
||||
}
|
||||
|
||||
/**
 * Run the 'freeze' subcommand on a copy of the prepared nominatim command.
 *
 * @return null
 */
public function drop()
{
    // Work on a clone so that the shared command object keeps its parameters.
    $oFreezeCmd = clone $this->oNominatimCmd;
    $oFreezeCmd->addParams('freeze')->run();
}
|
||||
|
||||
/**
 * Setup the directory for the API scripts.
 *
 * The work is delegated to 'refresh --website', run on a copy of the
 * prepared nominatim command.
 *
 * @return null
 */
public function setupWebsite()
{
    // Work on a clone so that the shared command object keeps its parameters.
    $oRefreshCmd = clone $this->oNominatimCmd;
    $oRefreshCmd->addParams('refresh', '--website')->run();
}
|
||||
|
||||
/**
|
||||
* Return the connection to the database.
|
||||
*
|
||||
@@ -538,15 +499,6 @@ class SetupFunctions
|
||||
return $this->oDB;
|
||||
}
|
||||
|
||||
/**
 * Delete the file named by the FLATNODE_FILE setting, if it exists.
 *
 * In verbose mode the name of the file is printed before deletion.
 *
 * @return null
 */
private function removeFlatnodeFile()
{
    $sFlatnodePath = getSetting('FLATNODE_FILE');

    // Nothing to do when the setting is empty or the file is absent.
    if (!$sFlatnodePath || !file_exists($sFlatnodePath)) {
        return;
    }

    if ($this->bVerbose) {
        echo 'Deleting '.$sFlatnodePath."\n";
    }
    unlink($sFlatnodePath);
}
|
||||
|
||||
private function pgsqlRunScript($sScript, $bfatal = true)
|
||||
{
|
||||
runSQLScript(
|
||||
@@ -570,7 +522,7 @@ class SetupFunctions
|
||||
$oCmd->addParams('--enable-debug-statements');
|
||||
}
|
||||
|
||||
$oCmd->run();
|
||||
$oCmd->run(!$this->sIgnoreErrors);
|
||||
}
|
||||
|
||||
private function pgsqlRunPartitionScript($sTemplate)
|
||||
|
||||
@@ -5,7 +5,6 @@ import logging
|
||||
from pathlib import Path
|
||||
|
||||
from ..db.connection import connect
|
||||
from ..tools.exec_utils import run_legacy_script
|
||||
|
||||
# Do not repeat documentation of subcommand classes.
|
||||
# pylint: disable=C0111
|
||||
@@ -69,12 +68,20 @@ class UpdateRefresh:
|
||||
args.diffs, args.enable_debug_statements)
|
||||
|
||||
if args.wiki_data:
|
||||
run_legacy_script('setup.php', '--import-wikipedia-articles',
|
||||
nominatim_env=args, throw_on_fail=True)
|
||||
data_path = Path(args.config.WIKIPEDIA_DATA_PATH
|
||||
or args.project_dir)
|
||||
LOG.warning('Import wikipdia article importance from %s', data_path)
|
||||
if refresh.import_wikipedia_articles(args.config.get_libpq_dsn(),
|
||||
data_path) > 0:
|
||||
LOG.fatal('FATAL: Wikipedia importance dump file not found')
|
||||
return 1
|
||||
|
||||
# Attention: importance MUST come after wiki data import.
|
||||
if args.importance:
|
||||
run_legacy_script('update.php', '--recompute-importance',
|
||||
nominatim_env=args, throw_on_fail=True)
|
||||
LOG.warning('Update importance values for database')
|
||||
with connect(args.config.get_libpq_dsn()) as conn:
|
||||
refresh.recompute_importance(conn)
|
||||
|
||||
if args.website:
|
||||
webdir = args.project_dir / 'website'
|
||||
LOG.warning('Setting up website directory at %s', webdir)
|
||||
|
||||
@@ -21,9 +21,12 @@ def _pipe_to_proc(proc, fdesc):
|
||||
|
||||
return len(chunk)
|
||||
|
||||
def execute_file(dsn, fname, ignore_errors=False):
|
||||
def execute_file(dsn, fname, ignore_errors=False, pre_code=None, post_code=None):
|
||||
""" Read an SQL file and run its contents against the given database
|
||||
using psql.
|
||||
using psql. Use `pre_code` and `post_code` to run extra commands
|
||||
before or after executing the file. The commands are run within the
|
||||
same session, so they may be used to wrap the file execution in a
|
||||
transaction.
|
||||
"""
|
||||
cmd = ['psql']
|
||||
if not ignore_errors:
|
||||
@@ -33,6 +36,9 @@ def execute_file(dsn, fname, ignore_errors=False):
|
||||
if not LOG.isEnabledFor(logging.INFO):
|
||||
proc.stdin.write('set client_min_messages to WARNING;'.encode('utf-8'))
|
||||
|
||||
if pre_code:
|
||||
proc.stdin.write((pre_code + ';').encode('utf-8'))
|
||||
|
||||
if fname.suffix == '.gz':
|
||||
with gzip.open(str(fname), 'rb') as fdesc:
|
||||
remain = _pipe_to_proc(proc, fdesc)
|
||||
@@ -40,6 +46,9 @@ def execute_file(dsn, fname, ignore_errors=False):
|
||||
with fname.open('rb') as fdesc:
|
||||
remain = _pipe_to_proc(proc, fdesc)
|
||||
|
||||
if remain == 0 and post_code:
|
||||
proc.stdin.write((';' + post_code).encode('utf-8'))
|
||||
|
||||
proc.stdin.close()
|
||||
|
||||
ret = proc.wait()
|
||||
|
||||
@@ -200,6 +200,53 @@ PHP_CONST_DEFS = (
|
||||
)
|
||||
|
||||
|
||||
def import_wikipedia_articles(dsn, data_path, ignore_errors=False):
    """ Replace the wikipedia importance tables with the content of the
        dump file `wikimedia-importance.sql.gz` located in `data_path`.

        Dropping the old tables and executing the dump are wrapped in a
        single transaction, so that the new data is replaced seamlessly.

        Returns 0 if all was well and 1 if the importance file could not
        be found. Exceptions raised by `execute_file` are not caught here.
    """
    datafile = data_path / 'wikimedia-importance.sql.gz'

    if datafile.exists():
        # execute_file() appends the final ';' to pre_code itself.
        drop_old_tables = """BEGIN;
                  DROP TABLE IF EXISTS "wikipedia_article";
                  DROP TABLE IF EXISTS "wikipedia_redirect"
               """
        execute_file(dsn, datafile, ignore_errors=ignore_errors,
                     pre_code=drop_old_tables, post_code="COMMIT")
        return 0

    return 1
|
||||
|
||||
|
||||
def recompute_importance(conn):
    """ Recompute wikipedia links and importance for all entries in placex.

        The statements are executed in order on a single cursor of `conn`:
        triggers on placex are disabled, every row gets the result of
        compute_importance(), rows referenced through linked_place_id take
        over wikipedia/importance from the linking row when that row has a
        wikipedia entry and the target has none or a lower importance, and
        finally triggers are enabled again. The connection is committed at
        the end.

        This is a long-running operation that must not be executed in
        parallel with updates.
    """
    statements = (
        'ALTER TABLE placex DISABLE TRIGGER ALL',
        """
            UPDATE placex SET (wikipedia, importance) =
               (SELECT wikipedia, importance
                FROM compute_importance(extratags, country_code, osm_type, osm_id))
            """,
        """
            UPDATE placex s SET wikipedia = d.wikipedia, importance = d.importance
             FROM placex d
             WHERE s.place_id = d.linked_place_id and d.wikipedia is not null
                   and (s.wikipedia is null or s.importance < d.importance);
            """,
        'ALTER TABLE placex ENABLE TRIGGER ALL',
    )

    with conn.cursor() as cur:
        for sql in statements:
            cur.execute(sql)
    conn.commit()
|
||||
|
||||
|
||||
def setup_website(basedir, phplib_dir, config):
|
||||
""" Create the website script stubs.
|
||||
"""
|
||||
|
||||
@@ -71,6 +71,12 @@ def temp_db(monkeypatch):
|
||||
|
||||
conn.close()
|
||||
|
||||
|
||||
@pytest.fixture
def dsn(temp_db):
    """ Connection string that addresses the database named by the
        `temp_db` fixture.
    """
    connection_string = 'dbname=' + temp_db
    return connection_string
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def temp_db_with_extensions(temp_db):
|
||||
conn = psycopg2.connect(database=temp_db)
|
||||
@@ -101,6 +107,14 @@ def temp_db_cursor(temp_db):
|
||||
conn.close()
|
||||
|
||||
|
||||
@pytest.fixture
def table_factory(temp_db_cursor):
    """ Provide a function that creates a table through `temp_db_cursor`.

        The returned function takes the table name and, optionally, the
        column definition (default: a single column 'id INT').
    """
    def mk_table(name, definition='id INT'):
        create_sql = 'CREATE TABLE {name} ({columns})'.format(name=name,
                                                               columns=definition)
        temp_db_cursor.execute(create_sql)

    return mk_table
|
||||
|
||||
|
||||
@pytest.fixture
def def_config():
    """ Configuration object created with None as first argument and the
        `settings` directory below the resolved SRC_DIR as second.
    """
    settings_dir = SRC_DIR.resolve() / 'settings'
    return Configuration(None, settings_dir)
|
||||
|
||||
@@ -135,24 +135,13 @@ def test_index_command(mock_func_factory, temp_db_cursor, params, do_bnds, do_ra
|
||||
assert rank_mock.called == do_ranks
|
||||
|
||||
|
||||
@pytest.mark.parametrize("command,params", [
                         ('wiki-data', ('setup.php', '--import-wikipedia-articles')),
                         ('importance', ('update.php', '--recompute-importance')),
                         ])
def test_refresh_legacy_command(mock_func_factory, temp_db, command, params):
    """ `refresh --<command>` must call run_legacy_script exactly once and
        the call's positional arguments must start with `params`.
    """
    legacy_mock = mock_func_factory(nominatim.clicmd.refresh, 'run_legacy_script')

    exit_code = call_nominatim('refresh', '--' + command)
    assert 0 == exit_code

    assert legacy_mock.called == 1
    expected_count = len(params)
    assert len(legacy_mock.last_args) >= expected_count
    assert legacy_mock.last_args[:expected_count] == params
|
||||
|
||||
@pytest.mark.parametrize("command,func", [
|
||||
('postcodes', 'update_postcodes'),
|
||||
('word-counts', 'recompute_word_counts'),
|
||||
('address-levels', 'load_address_levels_from_file'),
|
||||
('functions', 'create_functions'),
|
||||
('wiki-data', 'import_wikipedia_articles'),
|
||||
('importance', 'recompute_importance'),
|
||||
('website', 'setup_website'),
|
||||
])
|
||||
def test_refresh_command(mock_func_factory, temp_db, command, func):
|
||||
@@ -162,13 +151,16 @@ def test_refresh_command(mock_func_factory, temp_db, command, func):
|
||||
assert func_mock.called == 1
|
||||
|
||||
|
||||
def test_refresh_importance_computed_after_wiki_import(mock_func_factory, temp_db):
|
||||
mock_run_legacy = mock_func_factory(nominatim.clicmd.refresh, 'run_legacy_script')
|
||||
def test_refresh_importance_computed_after_wiki_import(monkeypatch, temp_db):
|
||||
calls = []
|
||||
monkeypatch.setattr(nominatim.tools.refresh, 'import_wikipedia_articles',
|
||||
lambda *args, **kwargs: calls.append('import') or 0)
|
||||
monkeypatch.setattr(nominatim.tools.refresh, 'recompute_importance',
|
||||
lambda *args, **kwargs: calls.append('update'))
|
||||
|
||||
assert 0 == call_nominatim('refresh', '--importance', '--wiki-data')
|
||||
|
||||
assert mock_run_legacy.called == 2
|
||||
assert mock_run_legacy.last_args == ('update.php', '--recompute-importance')
|
||||
assert calls == ['import', 'update']
|
||||
|
||||
|
||||
def test_serve_command(mock_func_factory):
|
||||
|
||||
@@ -12,10 +12,10 @@ def db(temp_db):
|
||||
yield conn
|
||||
|
||||
|
||||
def test_connection_table_exists(db, temp_db_cursor):
|
||||
def test_connection_table_exists(db, table_factory):
|
||||
assert db.table_exists('foobar') == False
|
||||
|
||||
temp_db_cursor.execute('CREATE TABLE foobar (id INT)')
|
||||
table_factory('foobar')
|
||||
|
||||
assert db.table_exists('foobar') == True
|
||||
|
||||
@@ -31,10 +31,10 @@ def test_connection_index_exists(db, temp_db_cursor):
|
||||
assert db.index_exists('some_index', table='bar') == False
|
||||
|
||||
|
||||
def test_drop_table_existing(db, temp_db_cursor):
|
||||
temp_db_cursor.execute('CREATE TABLE dummy (id INT)')
|
||||
|
||||
def test_drop_table_existing(db, table_factory):
|
||||
table_factory('dummy')
|
||||
assert db.table_exists('dummy')
|
||||
|
||||
db.drop_table('dummy')
|
||||
assert not db.table_exists('dummy')
|
||||
|
||||
@@ -65,8 +65,8 @@ def test_connection_postgis_version_tuple(db, temp_db_cursor):
|
||||
assert ver[0] >= 2
|
||||
|
||||
|
||||
def test_cursor_scalar(db, temp_db_cursor):
|
||||
temp_db_cursor.execute('CREATE TABLE dummy (id INT)')
|
||||
def test_cursor_scalar(db, table_factory):
|
||||
table_factory('dummy')
|
||||
|
||||
with db.cursor() as cur:
|
||||
assert cur.scalar('SELECT count(*) FROM dummy') == 0
|
||||
|
||||
@@ -7,10 +7,6 @@ import pytest
|
||||
import nominatim.db.utils as db_utils
|
||||
from nominatim.errors import UsageError
|
||||
|
||||
@pytest.fixture
def dsn(temp_db):
    """ Connection string that addresses the database named by the
        `temp_db` fixture.
    """
    connection_string = 'dbname=' + temp_db
    return connection_string
|
||||
|
||||
def test_execute_file_success(dsn, temp_db_cursor, tmp_path):
|
||||
tmpfile = tmp_path / 'test.sql'
|
||||
tmpfile.write_text('CREATE TABLE test (id INT);\nINSERT INTO test VALUES(56);')
|
||||
@@ -40,3 +36,27 @@ def test_execute_file_bad_sql_ignore_errors(dsn, tmp_path):
|
||||
tmpfile.write_text('CREATE STABLE test (id INT)')
|
||||
|
||||
db_utils.execute_file(dsn, tmpfile, ignore_errors=True)
|
||||
|
||||
|
||||
def test_execute_file_with_pre_code(dsn, tmp_path, temp_db_cursor):
    """ `pre_code` must run before the file: the file only inserts into
        a table that the pre code creates.
    """
    sql_file = tmp_path / 'test.sql'
    sql_file.write_text('INSERT INTO test VALUES(4)')

    db_utils.execute_file(dsn, sql_file, pre_code='CREATE TABLE test (id INT)')

    temp_db_cursor.execute('SELECT * FROM test')

    assert temp_db_cursor.rowcount == 1
    first_row = temp_db_cursor.fetchone()
    assert first_row[0] == 4
|
||||
|
||||
|
||||
def test_execute_file_with_post_code(dsn, tmp_path, temp_db_cursor):
    """ `post_code` must run after the file: the file creates the table
        that the post code inserts into.
    """
    sql_file = tmp_path / 'test.sql'
    sql_file.write_text('CREATE TABLE test (id INT)')

    db_utils.execute_file(dsn, sql_file, post_code='INSERT INTO test VALUES(23)')

    temp_db_cursor.execute('SELECT * FROM test')

    assert temp_db_cursor.rowcount == 1
    first_row = temp_db_cursor.fetchone()
    assert first_row[0] == 23
|
||||
|
||||
@@ -2,9 +2,10 @@
|
||||
Tests for function for importing address ranks.
|
||||
"""
|
||||
import json
|
||||
import pytest
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from nominatim.tools.refresh import load_address_levels, load_address_levels_from_file
|
||||
|
||||
def test_load_ranks_def_config(temp_db_conn, temp_db_cursor, def_config):
|
||||
|
||||
Reference in New Issue
Block a user