Merge pull request #2401 from lonvia/port-add-data-to-python

Port add-data functions from PHP to Python
This commit is contained in:
Sarah Hoffmann
2021-07-26 12:38:56 +02:00
committed by GitHub
7 changed files with 143 additions and 215 deletions

View File

@@ -1,150 +0,0 @@
<?php
@define('CONST_LibDir', dirname(dirname(__FILE__)));
require_once(CONST_LibDir.'/init-cmd.php');
require_once(CONST_LibDir.'/setup_functions.php');
ini_set('memory_limit', '800M');
// (long-opt, short-opt, min-occurs, max-occurs, num-arguments, num-arguments, type, help)
$aCMDOptions
= array(
'Import / update / index osm data',
array('help', 'h', 0, 1, 0, 0, false, 'Show Help'),
array('quiet', 'q', 0, 1, 0, 0, 'bool', 'Quiet output'),
array('verbose', 'v', 0, 1, 0, 0, 'bool', 'Verbose output'),
array('calculate-postcodes', '', 0, 1, 0, 0, 'bool', 'Update postcode centroid table'),
array('import-file', '', 0, 1, 1, 1, 'realpath', 'Re-import data from an OSM file'),
array('import-diff', '', 0, 1, 1, 1, 'realpath', 'Import a diff (osc) file from local file system'),
array('osm2pgsql-cache', '', 0, 1, 1, 1, 'int', 'Cache size used by osm2pgsql'),
array('import-node', '', 0, 1, 1, 1, 'int', 'Re-import node'),
array('import-way', '', 0, 1, 1, 1, 'int', 'Re-import way'),
array('import-relation', '', 0, 1, 1, 1, 'int', 'Re-import relation'),
array('import-from-main-api', '', 0, 1, 0, 0, 'bool', 'Use OSM API instead of Overpass to download objects'),
array('project-dir', '', 0, 1, 1, 1, 'realpath', 'Base directory of the Nominatim installation (default: .)'),
);
getCmdOpt($_SERVER['argv'], $aCMDOptions, $aResult, true, true);
loadSettings($aCMDResult['project-dir'] ?? getcwd());
setupHTTPProxy();
date_default_timezone_set('Etc/UTC');
$oDB = new Nominatim\DB();
$oDB->connect();
$fPostgresVersion = $oDB->getPostgresVersion();
$aDSNInfo = Nominatim\DB::parseDSN(getSetting('DATABASE_DSN'));
if (!isset($aDSNInfo['port']) || !$aDSNInfo['port']) {
$aDSNInfo['port'] = 5432;
}
// cache memory to be used by osm2pgsql, should not be more than the available memory
$iCacheMemory = (isset($aResult['osm2pgsql-cache'])?$aResult['osm2pgsql-cache']:2000);
if ($iCacheMemory + 500 > getTotalMemoryMB()) {
$iCacheMemory = getCacheMemoryMB();
echo "WARNING: resetting cache memory to $iCacheMemory\n";
}
$oOsm2pgsqlCmd = (new \Nominatim\Shell(getOsm2pgsqlBinary()))
->addParams('--hstore')
->addParams('--latlong')
->addParams('--append')
->addParams('--slim')
->addParams('--with-forward-dependencies', 'false')
->addParams('--log-progress', 'true')
->addParams('--number-processes', 1)
->addParams('--cache', $iCacheMemory)
->addParams('--output', 'gazetteer')
->addParams('--style', getImportStyle())
->addParams('--database', $aDSNInfo['database'])
->addParams('--port', $aDSNInfo['port']);
if (isset($aDSNInfo['hostspec']) && $aDSNInfo['hostspec']) {
$oOsm2pgsqlCmd->addParams('--host', $aDSNInfo['hostspec']);
}
if (isset($aDSNInfo['username']) && $aDSNInfo['username']) {
$oOsm2pgsqlCmd->addParams('--user', $aDSNInfo['username']);
}
if (isset($aDSNInfo['password']) && $aDSNInfo['password']) {
$oOsm2pgsqlCmd->addEnvPair('PGPASSWORD', $aDSNInfo['password']);
}
if (getSetting('FLATNODE_FILE')) {
$oOsm2pgsqlCmd->addParams('--flat-nodes', getSetting('FLATNODE_FILE'));
}
if ($fPostgresVersion >= 11.0) {
$oOsm2pgsqlCmd->addEnvPair(
'PGOPTIONS',
'-c jit=off -c max_parallel_workers_per_gather=0'
);
}
if (isset($aResult['import-diff']) || isset($aResult['import-file'])) {
// import diffs and files directly (e.g. from osmosis --rri)
$sNextFile = isset($aResult['import-diff']) ? $aResult['import-diff'] : $aResult['import-file'];
if (!file_exists($sNextFile)) {
fail("Cannot open $sNextFile\n");
}
// Import the file
$oCMD = (clone $oOsm2pgsqlCmd)->addParams($sNextFile);
echo $oCMD->escapedCmd()."\n";
$iRet = $oCMD->run();
if ($iRet) {
fail("Error from osm2pgsql, $iRet\n");
}
// Don't update the import status - we don't know what this file contains
}
$sTemporaryFile = CONST_InstallDir.'/osmosischange.osc';
$bHaveDiff = false;
$bUseOSMApi = isset($aResult['import-from-main-api']) && $aResult['import-from-main-api'];
$sContentURL = '';
if (isset($aResult['import-node']) && $aResult['import-node']) {
if ($bUseOSMApi) {
$sContentURL = 'https://www.openstreetmap.org/api/0.6/node/'.$aResult['import-node'];
} else {
$sContentURL = 'https://overpass-api.de/api/interpreter?data=node('.$aResult['import-node'].');out%20meta;';
}
}
if (isset($aResult['import-way']) && $aResult['import-way']) {
if ($bUseOSMApi) {
$sContentURL = 'https://www.openstreetmap.org/api/0.6/way/'.$aResult['import-way'].'/full';
} else {
$sContentURL = 'https://overpass-api.de/api/interpreter?data=(way('.$aResult['import-way'].');%3E;);out%20meta;';
}
}
if (isset($aResult['import-relation']) && $aResult['import-relation']) {
if ($bUseOSMApi) {
$sContentURL = 'https://www.openstreetmap.org/api/0.6/relation/'.$aResult['import-relation'].'/full';
} else {
$sContentURL = 'https://overpass-api.de/api/interpreter?data=(rel(id:'.$aResult['import-relation'].');%3E;);out%20meta;';
}
}
if ($sContentURL) {
file_put_contents($sTemporaryFile, file_get_contents($sContentURL));
$bHaveDiff = true;
}
if ($bHaveDiff) {
// import generated change file
$oCMD = (clone $oOsm2pgsqlCmd)->addParams($sTemporaryFile);
echo $oCMD->escapedCmd()."\n";
$iRet = $oCMD->run();
if ($iRet) {
fail("osm2pgsql exited with error level $iRet\n");
}
}

View File

@@ -114,63 +114,6 @@ class CommandlineParser:
#
# No need to document the functions each time.
# pylint: disable=C0111
# Using non-top-level imports to make pyosmium optional for replication only.
# pylint: disable=E0012,C0415
class UpdateAddData:
"""\
Add additional data from a file or an online source.
Data is only imported, not indexed. You need to call `nominatim index`
to complete the process.
"""
@staticmethod
def add_args(parser):
group_name = parser.add_argument_group('Source')
group = group_name.add_mutually_exclusive_group(required=True)
group.add_argument('--file', metavar='FILE',
help='Import data from an OSM file')
group.add_argument('--diff', metavar='FILE',
help='Import data from an OSM diff file')
group.add_argument('--node', metavar='ID', type=int,
help='Import a single node from the API')
group.add_argument('--way', metavar='ID', type=int,
help='Import a single way from the API')
group.add_argument('--relation', metavar='ID', type=int,
help='Import a single relation from the API')
group.add_argument('--tiger-data', metavar='DIR',
help='Add housenumbers from the US TIGER census database.')
group = parser.add_argument_group('Extra arguments')
group.add_argument('--use-main-api', action='store_true',
help='Use OSM API instead of Overpass to download objects')
@staticmethod
def run(args):
from nominatim.tokenizer import factory as tokenizer_factory
from nominatim.tools import tiger_data
if args.tiger_data:
tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)
return tiger_data.add_tiger_data(args.tiger_data,
args.config, args.threads or 1,
tokenizer)
params = ['update.php']
if args.file:
params.extend(('--import-file', args.file))
elif args.diff:
params.extend(('--import-diff', args.diff))
elif args.node:
params.extend(('--import-node', args.node))
elif args.way:
params.extend(('--import-way', args.way))
elif args.relation:
params.extend(('--import-relation', args.relation))
if args.use_main_api:
params.append('--use-main-api')
return run_legacy_script(*params, nominatim_env=args)
class QueryExport:
"""\
Export addresses as CSV file from the database.
@@ -261,7 +204,7 @@ def get_set_parser(**kwargs):
parser.add_subcommand('special-phrases', clicmd.ImportSpecialPhrases)
parser.add_subcommand('add-data', UpdateAddData)
parser.add_subcommand('add-data', clicmd.UpdateAddData)
parser.add_subcommand('index', clicmd.UpdateIndex)
parser.add_subcommand('refresh', clicmd.UpdateRefresh())

View File

@@ -7,6 +7,7 @@ from nominatim.clicmd.replication import UpdateReplication
from nominatim.clicmd.api import APISearch, APIReverse, APILookup, APIDetails, APIStatus
from nominatim.clicmd.index import UpdateIndex
from nominatim.clicmd.refresh import UpdateRefresh
from nominatim.clicmd.add_data import UpdateAddData
from nominatim.clicmd.admin import AdminFuncs
from nominatim.clicmd.freeze import SetupFreeze
from nominatim.clicmd.special_phrases import ImportSpecialPhrases

View File

@@ -0,0 +1,76 @@
"""
Implementation of the 'add-data' subcommand.
"""
import logging
# Do not repeat documentation of subcommand classes.
# pylint: disable=C0111
# Using non-top-level imports to avoid eventually unused imports.
# pylint: disable=E0012,C0415
LOG = logging.getLogger()
class UpdateAddData:
"""\
Add additional data from a file or an online source.
Data is only imported, not indexed. You need to call `nominatim index`
to complete the process.
"""
@staticmethod
def add_args(parser):
group_name = parser.add_argument_group('Source')
group = group_name.add_mutually_exclusive_group(required=True)
group.add_argument('--file', metavar='FILE',
help='Import data from an OSM file or diff file')
group.add_argument('--diff', metavar='FILE',
help='Import data from an OSM diff file (deprecated: use --file)')
group.add_argument('--node', metavar='ID', type=int,
help='Import a single node from the API')
group.add_argument('--way', metavar='ID', type=int,
help='Import a single way from the API')
group.add_argument('--relation', metavar='ID', type=int,
help='Import a single relation from the API')
group.add_argument('--tiger-data', metavar='DIR',
help='Add housenumbers from the US TIGER census database.')
group = parser.add_argument_group('Extra arguments')
group.add_argument('--use-main-api', action='store_true',
help='Use OSM API instead of Overpass to download objects')
group.add_argument('--osm2pgsql-cache', metavar='SIZE', type=int,
help='Size of cache to be used by osm2pgsql (in MB)')
group.add_argument('--socket-timeout', dest='socket_timeout', type=int, default=60,
help='Set timeout for file downloads.')
@staticmethod
def run(args):
from nominatim.tokenizer import factory as tokenizer_factory
from nominatim.tools import tiger_data, add_osm_data
if args.tiger_data:
tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)
return tiger_data.add_tiger_data(args.tiger_data,
args.config, args.threads or 1,
tokenizer)
osm2pgsql_params = args.osm2pgsql_options(default_cache=1000, default_threads=1)
if args.file or args.diff:
return add_osm_data.add_data_from_file(args.file or args.diff,
osm2pgsql_params)
if args.node:
return add_osm_data.add_osm_object('node', args.node,
args.use_main_api,
osm2pgsql_params)
if args.way:
return add_osm_data.add_osm_object('way', args.way,
args.use_main_api,
osm2pgsql_params)
if args.relation:
return add_osm_data.add_osm_object('relation', args.relation,
args.use_main_api,
osm2pgsql_params)
return 0

View File

@@ -0,0 +1,46 @@
"""
Function to add additional OSM data from a file or the API into the database.
"""
from pathlib import Path
import logging
import urllib
from nominatim.tools.exec_utils import run_osm2pgsql, get_url
LOG = logging.getLogger()
def add_data_from_file(fname, options):
""" Adds data from a OSM file to the database. The file may be a normal
OSM file or a diff file in all formats supported by libosmium.
"""
options['import_file'] = Path(fname)
options['append'] = True
run_osm2pgsql(options)
# No status update. We don't know where the file came from.
return 0
def add_osm_object(osm_type, osm_id, use_main_api, options):
""" Add or update a single OSM object from the latest version of the
API.
"""
if use_main_api:
base_url = f'https://www.openstreetmap.org/api/0.6/{osm_type}/{osm_id}'
if osm_type in ('way', 'relation'):
base_url += '/full'
else:
# use Overpass API
if osm_type == 'node':
data = f'node({osm_id});out meta;'
elif osm_type == 'way':
data = f'(way({osm_id});>;);out meta;'
else:
data = f'(rel(id:{osm_id});>;);out meta;'
base_url = 'https://overpass-api.de/api/interpreter?' \
+ urllib.parse.urlencode({'data': data})
options['append'] = True
options['import_data'] = get_url(base_url).encode('utf-8')
run_osm2pgsql(options)

View File

@@ -128,9 +128,14 @@ def run_osm2pgsql(options):
if options.get('disable_jit', False):
env['PGOPTIONS'] = '-c jit=off -c max_parallel_workers_per_gather=0'
cmd.append(str(options['import_file']))
if 'import_data' in options:
cmd.extend(('-r', 'xml', '-'))
else:
cmd.append(str(options['import_file']))
subprocess.run(cmd, cwd=options.get('cwd', '.'), env=env, check=True)
subprocess.run(cmd, cwd=options.get('cwd', '.'),
input=options.get('import_data'),
env=env, check=True)
def get_url(url):

View File

@@ -15,6 +15,7 @@ import nominatim.clicmd.admin
import nominatim.clicmd.setup
import nominatim.indexer.indexer
import nominatim.tools.admin
import nominatim.tools.add_osm_data
import nominatim.tools.check_database
import nominatim.tools.database_import
import nominatim.tools.freeze
@@ -60,7 +61,6 @@ class TestCli:
@pytest.mark.parametrize("command,script", [
(('add-data', '--file', 'foo.osm'), 'update'),
(('export',), 'export')
])
def test_legacy_commands_simple(self, mock_run_legacy, command, script):
@@ -88,13 +88,20 @@ class TestCli:
assert mock.called == 1
@pytest.mark.parametrize("name,oid", [('file', 'foo.osm'), ('diff', 'foo.osc'),
('node', 12), ('way', 8), ('relation', 32)])
def test_add_data_command(self, mock_run_legacy, name, oid):
@pytest.mark.parametrize("name,oid", [('file', 'foo.osm'), ('diff', 'foo.osc')])
def test_add_data_file_command(self, mock_func_factory, name, oid):
mock_run_legacy = mock_func_factory(nominatim.tools.add_osm_data, 'add_data_from_file')
assert self.call_nominatim('add-data', '--' + name, str(oid)) == 0
assert mock_run_legacy.called == 1
@pytest.mark.parametrize("name,oid", [('node', 12), ('way', 8), ('relation', 32)])
def test_add_data_object_command(self, mock_func_factory, name, oid):
mock_run_legacy = mock_func_factory(nominatim.tools.add_osm_data, 'add_osm_object')
assert self.call_nominatim('add-data', '--' + name, str(oid)) == 0
assert mock_run_legacy.called == 1
assert mock_run_legacy.last_args == ('update.php', '--import-' + name, oid)
def test_serve_command(self, mock_func_factory):