port export function to Python

Some of the parameters have been renoved as they don't make sense
anymore.
This commit is contained in:
Sarah Hoffmann
2023-07-17 22:37:07 +02:00
parent faeee7528f
commit 0804cc0cff
7 changed files with 200 additions and 281 deletions

View File

@@ -1,190 +0,0 @@
<?php
/**
* SPDX-License-Identifier: GPL-2.0-only
*
* This file is part of Nominatim. (https://nominatim.org)
*
* Copyright (C) 2022 by the Nominatim developer community.
* For a full list of authors see the git log.
*/
@define('CONST_LibDir', dirname(dirname(__FILE__)));
// Script to extract structured city and street data
// from a running nominatim instance as CSV data
require_once(CONST_LibDir.'/init-cmd.php');
require_once(CONST_LibDir.'/ParameterParser.php');
ini_set('memory_limit', '800M');
$aCMDOptions = array(
'Export addresses as CSV file from a Nominatim database',
array('help', 'h', 0, 1, 0, 0, false, 'Show Help'),
array('quiet', 'q', 0, 1, 0, 0, 'bool', 'Quiet output'),
array('verbose', 'v', 0, 1, 0, 0, 'bool', 'Verbose output'),
array('output-type', '', 0, 1, 1, 1, 'str', 'Type of places to output (see below)'),
array('output-format', '', 0, 1, 1, 1, 'str', 'Column mapping (see below)'),
array('output-all-postcodes', '', 0, 1, 0, 0, 'bool', 'List all postcodes for address instead of just the most likely one'),
array('language', '', 0, 1, 1, 1, 'str', 'Preferred language for output (local name, if omitted)'),
array('restrict-to-country', '', 0, 1, 1, 1, 'str', 'Export only objects within country (country code)'),
array('restrict-to-osm-node', '', 0, 1, 1, 1, 'int', 'Export only objects that are children of this OSM node'),
array('restrict-to-osm-way', '', 0, 1, 1, 1, 'int', 'Export only objects that are children of this OSM way'),
array('restrict-to-osm-relation', '', 0, 1, 1, 1, 'int', 'Export only objects that are children of this OSM relation'),
array('project-dir', '', 0, 1, 1, 1, 'realpath', 'Base directory of the Nominatim installation (default: .)'),
"\nAddress ranks: continent, country, state, county, city, suburb, street, path",
'Additional output types: postcode, placeid (placeid for each object)',
"\noutput-format must be a semicolon-separated list of address ranks. Multiple ranks",
'can be merged into one column by simply using a comma-separated list.',
"\nDefault output-type: street",
'Default output format: street;suburb;city;county;state;country'
);
getCmdOpt($_SERVER['argv'], $aCMDOptions, $aCMDResult, true, true);
loadSettings($aCMDResult['project-dir'] ?? getcwd());
$aRankmap = array(
'continent' => 1,
'country' => 4,
'state' => 8,
'county' => 12,
'city' => 16,
'suburb' => 20,
'street' => 26,
'path' => 27
);
$oDB = new Nominatim\DB();
$oDB->connect();
if (isset($aCMDResult['output-type'])) {
if (!isset($aRankmap[$aCMDResult['output-type']])) {
fail('unknown output-type: '.$aCMDResult['output-type']);
}
$iOutputRank = $aRankmap[$aCMDResult['output-type']];
} else {
$iOutputRank = $aRankmap['street'];
}
// Preferred language
$oParams = new Nominatim\ParameterParser();
if (!isset($aCMDResult['language'])) {
$aCMDResult['language'] = 'xx';
}
$aLangPrefOrder = $oParams->getPreferredLanguages($aCMDResult['language']);
$sLanguagePrefArraySQL = $oDB->getArraySQL($oDB->getDBQuotedList($aLangPrefOrder));
// output formatting: build up a lookup table that maps address ranks to columns
$aColumnMapping = array();
$iNumCol = 0;
if (!isset($aCMDResult['output-format'])) {
$aCMDResult['output-format'] = 'street;suburb;city;county;state;country';
}
foreach (preg_split('/\s*;\s*/', $aCMDResult['output-format']) as $sColumn) {
$bHasData = false;
foreach (preg_split('/\s*,\s*/', $sColumn) as $sRank) {
if ($sRank == 'postcode' || $sRank == 'placeid') {
$aColumnMapping[$sRank] = $iNumCol;
$bHasData = true;
} elseif (isset($aRankmap[$sRank])) {
$iRank = $aRankmap[$sRank];
if ($iRank <= $iOutputRank) {
$aColumnMapping[(string)$iRank] = $iNumCol;
$bHasData = true;
}
}
}
if ($bHasData) {
$iNumCol++;
}
}
// build the query for objects
$sPlacexSQL = 'select min(place_id) as place_id, ';
$sPlacexSQL .= 'array_agg(place_id) as place_ids, ';
$sPlacexSQL .= 'country_code as cc, ';
$sPlacexSQL .= 'postcode, ';
// get the address places excluding postcodes
$sPlacexSQL .= 'array(select address_place_id from place_addressline a';
$sPlacexSQL .= ' where a.place_id = placex.place_id and isaddress';
$sPlacexSQL .= ' and address_place_id != placex.place_id';
$sPlacexSQL .= ' and not cached_rank_address in (5,11)';
$sPlacexSQL .= ' and cached_rank_address > 2 order by cached_rank_address)';
$sPlacexSQL .= ' as address';
$sPlacexSQL .= ' from placex where name is not null and linked_place_id is null';
$sPlacexSQL .= ' and rank_address = '.$iOutputRank;
if (isset($aCMDResult['restrict-to-country'])) {
$sPlacexSQL .= ' and country_code = '.$oDB->getDBQuoted($aCMDResult['restrict-to-country']);
}
// restriction to parent place id
$sParentId = false;
$sOsmType = false;
if (isset($aCMDResult['restrict-to-osm-node'])) {
$sOsmType = 'N';
$sOsmId = $aCMDResult['restrict-to-osm-node'];
}
if (isset($aCMDResult['restrict-to-osm-way'])) {
$sOsmType = 'W';
$sOsmId = $aCMDResult['restrict-to-osm-way'];
}
if (isset($aCMDResult['restrict-to-osm-relation'])) {
$sOsmType = 'R';
$sOsmId = $aCMDResult['restrict-to-osm-relation'];
}
if ($sOsmType) {
$sSQL = 'select place_id from placex where osm_type = :osm_type and osm_id = :osm_id';
$sParentId = $oDB->getOne($sSQL, array('osm_type' => $sOsmType, 'osm_id' => $sOsmId));
if (!$sParentId) {
fail('Could not find place '.$sOsmType.' '.$sOsmId);
}
}
if ($sParentId) {
$sPlacexSQL .= ' and place_id in (select place_id from place_addressline where address_place_id = '.$sParentId.' and isaddress)';
}
$sPlacexSQL .= " group by name->'name', address, postcode, country_code, placex.place_id";
// Iterate over placeids
// to get further hierarchical information
//var_dump($sPlacexSQL);
$oResults = $oDB->getQueryStatement($sPlacexSQL);
$fOutstream = fopen('php://output', 'w');
while ($aRow = $oResults->fetch()) {
$iPlaceID = $aRow['place_id'];
$sSQL = "select rank_address,get_name_by_language(name,$sLanguagePrefArraySQL) as localname from get_addressdata(:place_id, -1)";
$sSQL .= ' WHERE isaddress';
$sSQL .= ' order by rank_address desc,isaddress desc';
$aAddressLines = $oDB->getAll($sSQL, array('place_id' => $iPlaceID));
$aOutput = array_fill(0, $iNumCol, '');
// output address parts
foreach ($aAddressLines as $aAddress) {
if (isset($aColumnMapping[$aAddress['rank_address']])) {
$aOutput[$aColumnMapping[$aAddress['rank_address']]] = $aAddress['localname'];
}
}
// output postcode
if (isset($aColumnMapping['postcode'])) {
if ($aCMDResult['output-all-postcodes']) {
$sSQL = 'select array_agg(px.postcode) from placex px join place_addressline pa ';
$sSQL .= 'on px.place_id = pa.address_place_id ';
$sSQL .= 'where pa.cached_rank_address in (5,11) ';
$sSQL .= 'and pa.place_id in (select place_id from place_addressline where address_place_id in (:first_place_id)) ';
$sSQL .= 'group by postcode order by count(*) desc limit 1';
$sRes = $oDB->getOne($sSQL, array('first_place_id' => substr($aRow['place_ids'], 1, -1)));
$aOutput[$aColumnMapping['postcode']] = substr($sRes, 1, -1);
} else {
$aOutput[$aColumnMapping['postcode']] = $aRow['postcode'];
}
}
if (isset($aColumnMapping['placeid'])) {
$aOutput[$aColumnMapping['placeid']] = substr($aRow['place_ids'], 1, -1);
}
fputcsv($fOutstream, $aOutput);
}
fclose($fOutstream);

View File

@@ -16,6 +16,7 @@ import from this file, not from the source files directly.
from .core import (NominatimAPI as NominatimAPI, from .core import (NominatimAPI as NominatimAPI,
NominatimAPIAsync as NominatimAPIAsync) NominatimAPIAsync as NominatimAPIAsync)
from .connection import (SearchConnection as SearchConnection)
from .status import (StatusResult as StatusResult) from .status import (StatusResult as StatusResult)
from .types import (PlaceID as PlaceID, from .types import (PlaceID as PlaceID,
OsmID as OsmID, OsmID as OsmID,

View File

@@ -2,13 +2,13 @@
# #
# This file is part of Nominatim. (https://nominatim.org) # This file is part of Nominatim. (https://nominatim.org)
# #
# Copyright (C) 2022 by the Nominatim developer community. # Copyright (C) 2023 by the Nominatim developer community.
# For a full list of authors see the git log. # For a full list of authors see the git log.
""" """
Command-line interface to the Nominatim functions for import, update, Command-line interface to the Nominatim functions for import, update,
database administration and querying. database administration and querying.
""" """
from typing import Optional, Any, List, Union from typing import Optional, Any
import importlib import importlib
import logging import logging
import os import os
@@ -17,7 +17,7 @@ import argparse
from pathlib import Path from pathlib import Path
from nominatim.config import Configuration from nominatim.config import Configuration
from nominatim.tools.exec_utils import run_legacy_script, run_php_server from nominatim.tools.exec_utils import run_php_server
from nominatim.errors import UsageError from nominatim.errors import UsageError
from nominatim import clicmd from nominatim import clicmd
from nominatim import version from nominatim import version
@@ -140,60 +140,6 @@ class CommandlineParser:
# #
# No need to document the functions each time. # No need to document the functions each time.
# pylint: disable=C0111 # pylint: disable=C0111
class QueryExport:
"""\
Export addresses as CSV file from the database.
"""
def add_args(self, parser: argparse.ArgumentParser) -> None:
group = parser.add_argument_group('Output arguments')
group.add_argument('--output-type', default='street',
choices=('continent', 'country', 'state', 'county',
'city', 'suburb', 'street', 'path'),
help='Type of places to output (default: street)')
group.add_argument('--output-format',
default='street;suburb;city;county;state;country',
help=("Semicolon-separated list of address types "
"(see --output-type). Multiple ranks can be "
"merged into one column by simply using a "
"comma-separated list."))
group.add_argument('--output-all-postcodes', action='store_true',
help=("List all postcodes for address instead of "
"just the most likely one"))
group.add_argument('--language',
help=("Preferred language for output "
"(use local name, if omitted)"))
group = parser.add_argument_group('Filter arguments')
group.add_argument('--restrict-to-country', metavar='COUNTRY_CODE',
help='Export only objects within country')
group.add_argument('--restrict-to-osm-node', metavar='ID', type=int,
help='Export only children of this OSM node')
group.add_argument('--restrict-to-osm-way', metavar='ID', type=int,
help='Export only children of this OSM way')
group.add_argument('--restrict-to-osm-relation', metavar='ID', type=int,
help='Export only children of this OSM relation')
def run(self, args: NominatimArgs) -> int:
params: List[Union[int, str]] = [
'--output-type', args.output_type,
'--output-format', args.output_format]
if args.output_all_postcodes:
params.append('--output-all-postcodes')
if args.language:
params.extend(('--language', args.language))
if args.restrict_to_country:
params.extend(('--restrict-to-country', args.restrict_to_country))
if args.restrict_to_osm_node:
params.extend(('--restrict-to-osm-node', args.restrict_to_osm_node))
if args.restrict_to_osm_way:
params.extend(('--restrict-to-osm-way', args.restrict_to_osm_way))
if args.restrict_to_osm_relation:
params.extend(('--restrict-to-osm-relation', args.restrict_to_osm_relation))
return run_legacy_script('export.php', *params, config=args.config)
class AdminServe: class AdminServe:
"""\ """\
Start a simple web server for serving the API. Start a simple web server for serving the API.
@@ -260,7 +206,7 @@ def get_set_parser() -> CommandlineParser:
parser.add_subcommand('admin', clicmd.AdminFuncs()) parser.add_subcommand('admin', clicmd.AdminFuncs())
parser.add_subcommand('export', QueryExport()) parser.add_subcommand('export', clicmd.QueryExport())
parser.add_subcommand('serve', AdminServe()) parser.add_subcommand('serve', AdminServe())
parser.add_subcommand('search', clicmd.APISearch()) parser.add_subcommand('search', clicmd.APISearch())

View File

@@ -2,7 +2,7 @@
# #
# This file is part of Nominatim. (https://nominatim.org) # This file is part of Nominatim. (https://nominatim.org)
# #
# Copyright (C) 2022 by the Nominatim developer community. # Copyright (C) 2023 by the Nominatim developer community.
# For a full list of authors see the git log. # For a full list of authors see the git log.
""" """
Subcommand definitions for the command-line tool. Subcommand definitions for the command-line tool.
@@ -24,3 +24,4 @@ from nominatim.clicmd.add_data import UpdateAddData as UpdateAddData
from nominatim.clicmd.admin import AdminFuncs as AdminFuncs from nominatim.clicmd.admin import AdminFuncs as AdminFuncs
from nominatim.clicmd.freeze import SetupFreeze as SetupFreeze from nominatim.clicmd.freeze import SetupFreeze as SetupFreeze
from nominatim.clicmd.special_phrases import ImportSpecialPhrases as ImportSpecialPhrases from nominatim.clicmd.special_phrases import ImportSpecialPhrases as ImportSpecialPhrases
from nominatim.clicmd.export import QueryExport as QueryExport

View File

@@ -100,9 +100,6 @@ class NominatimArgs:
output_all_postcodes: bool output_all_postcodes: bool
language: Optional[str] language: Optional[str]
restrict_to_country: Optional[str] restrict_to_country: Optional[str]
restrict_to_osm_node: Optional[int]
restrict_to_osm_way: Optional[int]
restrict_to_osm_relation: Optional[int]
# Arguments to 'refresh' # Arguments to 'refresh'
postcodes: bool postcodes: bool

193
nominatim/clicmd/export.py Normal file
View File

@@ -0,0 +1,193 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2023 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Implementation of the 'export' subcommand.
"""
from typing import Optional, List, cast
import logging
import argparse
import asyncio
import csv
import sys
import sqlalchemy as sa
from nominatim.clicmd.args import NominatimArgs
import nominatim.api as napi
from nominatim.api.results import create_from_placex_row, ReverseResult, add_result_details
from nominatim.api.types import LookupDetails
from nominatim.errors import UsageError
# Do not repeat documentation of subcommand classes.
# pylint: disable=C0111
# Using non-top-level imports to avoid eventually unused imports.
# pylint: disable=E0012,C0415
# Needed for SQLAlchemy
# pylint: disable=singleton-comparison
LOG = logging.getLogger()
RANK_RANGE_MAP = {
'country': (4, 4),
'state': (5, 9),
'county': (10, 12),
'city': (13, 16),
'suburb': (17, 21),
'street': (26, 26),
'path': (27, 27)
}
RANK_TO_OUTPUT_MAP = {
4: 'country',
5: 'state', 6: 'state', 7: 'state', 8: 'state', 9: 'state',
10: 'county', 11: 'county', 12: 'county',
13: 'city', 14: 'city', 15: 'city', 16: 'city',
17: 'suburb', 18: 'suburb', 19: 'suburb', 20: 'suburb', 21: 'suburb',
26: 'street', 27: 'path'}
class QueryExport:
"""\
Export places as CSV file from the database.
"""
def add_args(self, parser: argparse.ArgumentParser) -> None:
group = parser.add_argument_group('Output arguments')
group.add_argument('--output-type', default='street',
choices=('country', 'state', 'county',
'city', 'suburb', 'street', 'path'),
help='Type of places to output (default: street)')
group.add_argument('--output-format',
default='street;suburb;city;county;state;country',
help=("Semicolon-separated list of address types "
"(see --output-type)."))
group.add_argument('--language',
help=("Preferred language for output "
"(use local name, if omitted)"))
group = parser.add_argument_group('Filter arguments')
group.add_argument('--restrict-to-country', metavar='COUNTRY_CODE',
help='Export only objects within country')
group.add_argument('--restrict-to-osm-node', metavar='ID', type=int,
dest='node',
help='Export only children of this OSM node')
group.add_argument('--restrict-to-osm-way', metavar='ID', type=int,
dest='way',
help='Export only children of this OSM way')
group.add_argument('--restrict-to-osm-relation', metavar='ID', type=int,
dest='relation',
help='Export only children of this OSM relation')
def run(self, args: NominatimArgs) -> int:
return asyncio.run(export(args))
async def export(args: NominatimArgs) -> int:
""" The actual export as a asynchronous function.
"""
api = napi.NominatimAPIAsync(args.project_dir)
output_range = RANK_RANGE_MAP[args.output_type]
writer = init_csv_writer(args.output_format)
async with api.begin() as conn, api.begin() as detail_conn:
t = conn.t.placex
sql = sa.select(t.c.place_id, t.c.osm_type, t.c.osm_id, t.c.name,
t.c.class_, t.c.type, t.c.admin_level,
t.c.address, t.c.extratags,
t.c.housenumber, t.c.postcode, t.c.country_code,
t.c.importance, t.c.wikipedia, t.c.indexed_date,
t.c.rank_address, t.c.rank_search,
t.c.centroid)\
.where(t.c.linked_place_id == None)\
.where(t.c.rank_address.between(*output_range))
parent_place_id = await get_parent_id(conn, args.node, args.way, args.relation)
if parent_place_id:
taddr = conn.t.addressline
sql = sql.join(taddr, taddr.c.place_id == t.c.place_id)\
.where(taddr.c.address_place_id == parent_place_id)\
.where(taddr.c.isaddress)
if args.restrict_to_country:
sql = sql.where(t.c.country_code == args.restrict_to_country.lower())
results = []
for row in await conn.execute(sql):
result = create_from_placex_row(row, ReverseResult)
if result is not None:
results.append(result)
if len(results) == 1000:
await dump_results(detail_conn, results, writer, args.language)
results = []
if results:
await dump_results(detail_conn, results, writer, args.language)
return 0
def init_csv_writer(output_format: str) -> 'csv.DictWriter[str]':
fields = output_format.split(';')
writer = csv.DictWriter(sys.stdout, fieldnames=fields, extrasaction='ignore')
writer.writeheader()
return writer
async def dump_results(conn: napi.SearchConnection,
results: List[ReverseResult],
writer: 'csv.DictWriter[str]',
lang: Optional[str]) -> None:
await add_result_details(conn, results,
LookupDetails(address_details=True))
locale = napi.Locales([lang] if lang else None)
for result in results:
data = {'placeid': result.place_id,
'postcode': result.postcode}
result.localize(locale)
for line in (result.address_rows or []):
if line.isaddress and line.local_name\
and line.rank_address in RANK_TO_OUTPUT_MAP:
data[RANK_TO_OUTPUT_MAP[line.rank_address]] = line.local_name
writer.writerow(data)
async def get_parent_id(conn: napi.SearchConnection, node_id: Optional[int],
way_id: Optional[int],
relation_id: Optional[int]) -> Optional[int]:
""" Get the place ID for the given OSM object.
"""
if node_id is not None:
osm_type, osm_id = 'N', node_id
elif way_id is not None:
osm_type, osm_id = 'W', way_id
elif relation_id is not None:
osm_type, osm_id = 'R', relation_id
else:
return None
t = conn.t.placex
sql = sa.select(t.c.place_id).limit(1)\
.where(t.c.osm_type == osm_type)\
.where(t.c.osm_id == osm_id)\
.where(t.c.rank_address > 0)\
.order_by(t.c.rank_address)
for result in await conn.execute(sql):
return cast(int, result[0])
raise UsageError(f'Cannot find a place {osm_type}{osm_id}.')

View File

@@ -100,35 +100,6 @@ def test_cli_serve_uvicorn_based(cli_call, engine, mock_func_factory):
assert func.last_kwargs['host'] == '127.0.0.1' assert func.last_kwargs['host'] == '127.0.0.1'
assert func.last_kwargs['port'] == 8088 assert func.last_kwargs['port'] == 8088
def test_cli_export_command(cli_call, mock_run_legacy):
assert cli_call('export', '--output-all-postcodes') == 0
assert mock_run_legacy.called == 1
assert mock_run_legacy.last_args[0] == 'export.php'
@pytest.mark.parametrize("param,value", [('output-type', 'country'),
('output-format', 'street;city'),
('language', 'xf'),
('restrict-to-country', 'us'),
('restrict-to-osm-node', '536'),
('restrict-to-osm-way', '727'),
('restrict-to-osm-relation', '197532')
])
def test_export_parameters(src_dir, tmp_path, param, value, monkeypatch):
(tmp_path / 'admin').mkdir()
(tmp_path / 'admin' / 'export.php').write_text(f"""<?php
exit(strpos(implode(' ', $_SERVER['argv']), '--{param} {value}') >= 0 ? 0 : 10);
""")
monkeypatch.setattr(nominatim.paths, 'PHPLIB_DIR', tmp_path)
assert nominatim.cli.nominatim(module_dir='MODULE NOT AVAILABLE',
osm2pgsql_path='OSM2PGSQL NOT AVAILABLE',
phpcgi_path='/usr/bin/php-cgi',
cli_args=['export', '--' + param, value]) == 0
class TestCliWithDb: class TestCliWithDb: