port wikipedia importance functions to python

This commit is contained in:
Sarah Hoffmann
2021-02-24 22:02:13 +01:00
parent 32683f73c7
commit c7fd0a7af4
11 changed files with 132 additions and 103 deletions

View File

@@ -5,7 +5,6 @@ import logging
from pathlib import Path
from ..db.connection import connect
from ..tools.exec_utils import run_legacy_script
# Do not repeat documentation of subcommand classes.
# pylint: disable=C0111
@@ -69,12 +68,20 @@ class UpdateRefresh:
args.diffs, args.enable_debug_statements)
if args.wiki_data:
run_legacy_script('setup.php', '--import-wikipedia-articles',
nominatim_env=args, throw_on_fail=True)
data_path = Path(args.config.WIKIPEDIA_DATA_PATH
or args.project_dir)
LOG.warning('Import wikipdia article importance from %s', data_path)
if refresh.import_wikipedia_articles(args.config.get_libpq_dsn(),
data_path) > 0:
LOG.fatal('FATAL: Wikipedia importance dump file not found')
return 1
# Attention: importance MUST come after wiki data import.
if args.importance:
run_legacy_script('update.php', '--recompute-importance',
nominatim_env=args, throw_on_fail=True)
LOG.warning('Update importance values for database')
with connect(args.config.get_libpq_dsn()) as conn:
refresh.recompute_importance(conn)
if args.website:
webdir = args.project_dir / 'website'
LOG.warning('Setting up website directory at %s', webdir)

View File

@@ -21,9 +21,12 @@ def _pipe_to_proc(proc, fdesc):
return len(chunk)
def execute_file(dsn, fname, ignore_errors=False):
def execute_file(dsn, fname, ignore_errors=False, pre_code=None, post_code=None):
""" Read an SQL file and run its contents against the given database
using psql.
using psql. Use `pre_code` and `post_code` to run extra commands
before or after executing the file. The commands are run within the
same session, so they may be used to wrap the file execution in a
transaction.
"""
cmd = ['psql']
if not ignore_errors:
@@ -33,6 +36,9 @@ def execute_file(dsn, fname, ignore_errors=False):
if not LOG.isEnabledFor(logging.INFO):
proc.stdin.write('set client_min_messages to WARNING;'.encode('utf-8'))
if pre_code:
proc.stdin.write((pre_code + ';').encode('utf-8'))
if fname.suffix == '.gz':
with gzip.open(str(fname), 'rb') as fdesc:
remain = _pipe_to_proc(proc, fdesc)
@@ -40,6 +46,9 @@ def execute_file(dsn, fname, ignore_errors=False):
with fname.open('rb') as fdesc:
remain = _pipe_to_proc(proc, fdesc)
if remain == 0 and post_code:
proc.stdin.write((';' + post_code).encode('utf-8'))
proc.stdin.close()
ret = proc.wait()

View File

@@ -200,6 +200,53 @@ PHP_CONST_DEFS = (
)
def import_wikipedia_articles(dsn, data_path, ignore_errors=False):
    """ Replaces the wikipedia importance tables with new data.

        The import is wrapped in a single transaction so that the new data
        is swapped in seamlessly.

        Returns 0 if all was well and 1 if the importance file could not
        be found. Throws an exception if there was an error reading the file.
    """
    datafile = data_path / 'wikimedia-importance.sql.gz'

    # Nothing to do when the dump file is missing; let the caller decide
    # whether that is fatal.
    if not datafile.exists():
        return 1

    # Drop the old tables inside the same transaction that loads the new
    # ones, so readers only ever see either the old or the new data.
    pre_code = """BEGIN;
                  DROP TABLE IF EXISTS "wikipedia_article";
                  DROP TABLE IF EXISTS "wikipedia_redirect"
               """
    post_code = "COMMIT"

    execute_file(dsn, datafile, ignore_errors=ignore_errors,
                 pre_code=pre_code, post_code=post_code)

    return 0
def recompute_importance(conn):
    """ Recompute wikipedia links and importance for all entries in placex.

        This is a long-running operation that must not be executed in
        parallel with updates.
    """
    # Triggers are disabled up front and re-enabled afterwards; the two
    # UPDATEs in between would otherwise fire them for every row.
    statements = (
        'ALTER TABLE placex DISABLE TRIGGER ALL',
        """UPDATE placex SET (wikipedia, importance) =
               (SELECT wikipedia, importance
                FROM compute_importance(extratags, country_code, osm_type, osm_id))
            """,
        # Propagate importance from linked places to the places that link
        # to them, where the linked place has better data.
        """UPDATE placex s SET wikipedia = d.wikipedia, importance = d.importance
             FROM placex d
             WHERE s.place_id = d.linked_place_id and d.wikipedia is not null
               and (s.wikipedia is null or s.importance < d.importance);
            """,
        'ALTER TABLE placex ENABLE TRIGGER ALL',
    )

    with conn.cursor() as cursor:
        for sql in statements:
            cursor.execute(sql)
    conn.commit()
def setup_website(basedir, phplib_dir, config):
""" Create the website script stubs.
"""