Add CSV format for importance import.

This commit is contained in:
Sarah Hoffmann
2024-05-05 09:39:52 +02:00
parent 5c7073901e
commit 60b03d506f
6 changed files with 167 additions and 39 deletions

View File

@@ -20,6 +20,54 @@ CREATE TYPE place_importance as (
wikipedia TEXT wikipedia TEXT
); );
{% if 'wikimedia_importance' in db.tables %}
-- Look up the Wikipedia article that matches the given OSM tags and return
-- the corresponding row (language, title, importance) from the
-- wikimedia_importance table, or NULL when no article matches.
--
-- extratags    : HSTORE of the object's extra tags; 'wikipedia' and
--                'wikipedia:<lang>' keys are inspected.
-- country_code : unused in this implementation (kept for interface
--                compatibility with the non-CSV variant).
CREATE OR REPLACE FUNCTION get_wikipedia_match(extratags HSTORE, country_code varchar(2))
RETURNS wikipedia_article_match
AS $$
DECLARE
-- NOTE(review): 'i' and 'entry' are declared but never used below.
i INT;
wiki_article_title TEXT;
wiki_article_language TEXT;
result wikipedia_article_match;
entry RECORD;
BEGIN
-- First preference: a plain 'wikipedia' tag of the form '<lang>:<title>'.
-- strpos() is 1-based, so a ':' at position 3 or 4 means a two- or
-- three-letter language prefix.
IF extratags ? 'wikipedia' and strpos(extratags->'wikipedia', ':') IN (3,4) THEN
wiki_article_language := lower(trim(split_part(extratags->'wikipedia', ':', 1)));
wiki_article_title := trim(substr(extratags->'wikipedia',
strpos(extratags->'wikipedia', ':') + 1));
-- Titles are stored with underscores instead of spaces; return the
-- first (and only) matching row.
FOR result IN
SELECT language, title, importance FROM wikimedia_importance
WHERE language = wiki_article_language
and title = replace(wiki_article_title, ' ', '_')
LOOP
RETURN result;
END LOOP;
END IF;
-- Fallback: look for language-specific 'wikipedia:<lang>' tags, trying
-- the languages in the fixed precedence order below.
FOREACH wiki_article_language IN ARRAY ARRAY['ar','bg','ca','cs','da','de','en','es','eo','eu','fa','fr','ko','hi','hr','id','it','he','lt','hu','ms','nl','ja','no','pl','pt','kk','ro','ru','sk','sl','sr','fi','sv','tr','uk','vi','vo','war','zh']
LOOP
IF extratags ? ('wikipedia:' || wiki_article_language) THEN
wiki_article_title := extratags->('wikipedia:' || wiki_article_language);
FOR result IN
SELECT language, title, importance FROM wikimedia_importance
WHERE language = wiki_article_language
and title = replace(wiki_article_title, ' ', '_')
LOOP
RETURN result;
END LOOP;
END IF;
END LOOP;
-- No tag produced a match.
RETURN NULL;
END;
$$
LANGUAGE plpgsql IMMUTABLE STRICT;
{% else %}
-- See: http://stackoverflow.com/questions/6410088/how-can-i-mimic-the-php-urldecode-function-in-postgresql -- See: http://stackoverflow.com/questions/6410088/how-can-i-mimic-the-php-urldecode-function-in-postgresql
CREATE OR REPLACE FUNCTION decode_url_part(p varchar) CREATE OR REPLACE FUNCTION decode_url_part(p varchar)
@@ -93,6 +141,7 @@ END;
$$ $$
LANGUAGE plpgsql STABLE; LANGUAGE plpgsql STABLE;
{% endif %}
CREATE OR REPLACE FUNCTION compute_importance(extratags HSTORE, CREATE OR REPLACE FUNCTION compute_importance(extratags HSTORE,
country_code varchar(2), country_code varchar(2),
@@ -118,9 +167,16 @@ BEGIN
-- Nothing? Then try with the wikidata tag. -- Nothing? Then try with the wikidata tag.
IF result.importance is null AND extratags ? 'wikidata' THEN IF result.importance is null AND extratags ? 'wikidata' THEN
FOR match IN SELECT * FROM wikipedia_article FOR match IN
WHERE wd_page_title = extratags->'wikidata' {% if 'wikimedia_importance' in db.tables %}
ORDER BY language = 'en' DESC, langcount DESC LIMIT 1 SELECT * FROM wikimedia_importance
WHERE wikidata = extratags->'wikidata'
LIMIT 1
{% else %}
SELECT * FROM wikipedia_article
WHERE wd_page_title = extratags->'wikidata'
ORDER BY language = 'en' DESC, langcount DESC LIMIT 1
{% endif %}
LOOP LOOP
result.importance := match.importance; result.importance := match.importance;
result.wikipedia := match.language || ':' || match.title; result.wikipedia := match.language || ':' || match.title;

View File

@@ -273,28 +273,15 @@ GRANT SELECT ON import_polygon_delete TO "{{config.DATABASE_WEBUSER}}";
DROP SEQUENCE IF EXISTS file; DROP SEQUENCE IF EXISTS file;
CREATE SEQUENCE file start 1; CREATE SEQUENCE file start 1;
-- null table so it won't error {% if 'wikimedia_importance' not in db.tables and 'wikipedia_article' not in db.tables %}
-- deliberately no drop - importing the table is expensive and static, if it is already there better to avoid removing it -- create dummy tables here, if nothing was imported
CREATE TABLE IF NOT EXISTS wikipedia_article ( CREATE TABLE wikimedia_importance (
language text NOT NULL, language TEXT NOT NULL,
title text NOT NULL, title TEXT NOT NULL,
langcount integer, importance double precision NOT NULL,
othercount integer, wikidata TEXT
totalcount integer, ) {{db.tablespace.address_data}};
lat double precision, {% endif %}
lon double precision,
importance double precision,
osm_type character(1),
osm_id bigint,
wd_page_title text,
instance_of text
);
CREATE TABLE IF NOT EXISTS wikipedia_redirect (
language text,
from_title text,
to_title text
);
-- osm2pgsql does not create indexes on the middle tables for Nominatim -- osm2pgsql does not create indexes on the middle tables for Nominatim
-- Add one for lookup of associated street relations. -- Add one for lookup of associated street relations.

View File

@@ -89,6 +89,7 @@ class UpdateRefresh:
from ..tools import refresh, postcodes from ..tools import refresh, postcodes
from ..indexer.indexer import Indexer from ..indexer.indexer import Indexer
need_function_refresh = args.functions
if args.postcodes: if args.postcodes:
if postcodes.can_compute(args.config.get_libpq_dsn()): if postcodes.can_compute(args.config.get_libpq_dsn()):
@@ -131,13 +132,7 @@ class UpdateRefresh:
args.project_dir) > 0: args.project_dir) > 0:
LOG.fatal('FATAL: Cannot update secondary importance raster data') LOG.fatal('FATAL: Cannot update secondary importance raster data')
return 1 return 1
need_function_refresh = True
if args.functions:
LOG.warning('Create functions')
with connect(args.config.get_libpq_dsn()) as conn:
refresh.create_functions(conn, args.config,
args.diffs, args.enable_debug_statements)
self._get_tokenizer(args.config).update_sql_functions(args.config)
if args.wiki_data: if args.wiki_data:
data_path = Path(args.config.WIKIPEDIA_DATA_PATH data_path = Path(args.config.WIKIPEDIA_DATA_PATH
@@ -147,8 +142,16 @@ class UpdateRefresh:
data_path) > 0: data_path) > 0:
LOG.fatal('FATAL: Wikipedia importance file not found in %s', data_path) LOG.fatal('FATAL: Wikipedia importance file not found in %s', data_path)
return 1 return 1
need_function_refresh = True
# Attention: importance MUST come after wiki data import. if need_function_refresh:
LOG.warning('Create functions')
with connect(args.config.get_libpq_dsn()) as conn:
refresh.create_functions(conn, args.config,
args.diffs, args.enable_debug_statements)
self._get_tokenizer(args.config).update_sql_functions(args.config)
# Attention: importance MUST come after wiki data import and after functions.
if args.importance: if args.importance:
LOG.warning('Update importance values for database') LOG.warning('Update importance values for database')
with connect(args.config.get_libpq_dsn()) as conn: with connect(args.config.get_libpq_dsn()) as conn:

View File

@@ -92,6 +92,11 @@ class CopyBuffer:
return self return self
def size(self) -> int:
""" Return the number of bytes the buffer currently contains.
"""
return self.buffer.tell()
def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None: def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None:
if self.buffer is not None: if self.buffer is not None:
self.buffer.close() self.buffer.close()
@@ -115,7 +120,10 @@ class CopyBuffer:
def copy_out(self, cur: Cursor, table: str, columns: Optional[Iterable[str]] = None) -> None:
    """ Copy all collected data into the given table.

        The buffer is empty and reusable after this operation.

        cur     : open database cursor used for the COPY.
        table   : name of the target table.
        columns : optional explicit column list for the COPY.
    """
    # Nothing to do for an empty buffer; avoids an empty COPY round trip.
    if self.buffer.tell() > 0:
        self.buffer.seek(0)
        cur.copy_from(self.buffer, table, columns=columns)
        # Start a fresh buffer so the object can collect more rows
        # without re-sending data that was already copied.
        self.buffer = io.StringIO()

View File

@@ -8,6 +8,8 @@
Functions for bringing auxiliary data in the database up-to-date. Functions for bringing auxiliary data in the database up-to-date.
""" """
from typing import MutableSequence, Tuple, Any, Type, Mapping, Sequence, List, cast from typing import MutableSequence, Tuple, Any, Type, Mapping, Sequence, List, cast
import csv
import gzip
import logging import logging
from textwrap import dedent from textwrap import dedent
from pathlib import Path from pathlib import Path
@@ -16,7 +18,7 @@ from psycopg2 import sql as pysql
from nominatim.config import Configuration from nominatim.config import Configuration
from nominatim.db.connection import Connection, connect from nominatim.db.connection import Connection, connect
from nominatim.db.utils import execute_file from nominatim.db.utils import execute_file, CopyBuffer
from nominatim.db.sql_preprocessor import SQLPreprocessor from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.version import NOMINATIM_VERSION from nominatim.version import NOMINATIM_VERSION
@@ -132,21 +134,89 @@ def import_wikipedia_articles(dsn: str, data_path: Path, ignore_errors: bool = F
Returns 0 if all was well and 1 if the importance file could not Returns 0 if all was well and 1 if the importance file could not
be found. Throws an exception if there was an error reading the file. be found. Throws an exception if there was an error reading the file.
""" """
datafile = data_path / 'wikimedia-importance.sql.gz' if import_importance_csv(dsn, data_path / 'wikimedia-importance.csv.gz') == 0 \
or import_importance_sql(dsn, data_path / 'wikimedia-importance.sql.gz',
ignore_errors) == 0:
return 0
if not datafile.exists(): return 1
def import_importance_csv(dsn: str, data_file: Path) -> int:
    """ Replace wikipedia importance table with data from a
        single CSV file.

        The file must be a gzipped CSV and have the following columns:
        language, title, importance, wikidata_id

        Other columns may be present but will be ignored.

        Returns 0 on success and 1 when the data file does not exist.
    """
    if not data_file.exists():
        return 1

    # Remember which wikidata IDs were already written out. Only the first
    # occurrence of an ID is imported; this keeps indexes and table small.
    seen_wikidata = set()

    with connect(dsn) as conn:
        with conn.cursor() as cur:
            # Drop importance tables from any previous import style
            # before creating the new-style table from scratch.
            cur.drop_table('wikipedia_article')
            cur.drop_table('wikipedia_redirect')
            cur.drop_table('wikimedia_importance')
            cur.execute("""CREATE TABLE wikimedia_importance (
                             language TEXT NOT NULL,
                             title TEXT NOT NULL,
                             importance double precision NOT NULL,
                             wikidata TEXT
                           ) """)

        copy_columns = ['language', 'title', 'importance', 'wikidata']

        with gzip.open(str(data_file), 'rt') as fd, CopyBuffer() as buf:
            for row in csv.DictReader(fd, delimiter='\t', quotechar='|'):
                # IDs look like 'Q1234'; strip the leading letter for the
                # numeric dedup key.
                numeric_id = int(row['wikidata_id'][1:])
                buf.add(row['language'], row['title'], row['importance'],
                        row['wikidata_id'] if numeric_id not in seen_wikidata else None)
                seen_wikidata.add(numeric_id)
                # Flush in chunks (~10MB) so the in-memory buffer
                # stays bounded on large files.
                if buf.size() > 10000000:
                    with conn.cursor() as cur:
                        buf.copy_out(cur, 'wikimedia_importance',
                                     columns=copy_columns)

            with conn.cursor() as cur:
                buf.copy_out(cur, 'wikimedia_importance',
                             columns=copy_columns)

        with conn.cursor() as cur:
            cur.execute("""CREATE INDEX IF NOT EXISTS idx_wikimedia_importance_title
                           ON wikimedia_importance (title)""")
            cur.execute("""CREATE INDEX IF NOT EXISTS idx_wikimedia_importance_wikidata
                           ON wikimedia_importance (wikidata)
                           WHERE wikidata is not null""")

        conn.commit()

    return 0
def import_importance_sql(dsn: str, data_file: Path, ignore_errors: bool) -> int:
    """ Replace wikipedia importance table with data from an SQL file.

        Returns 0 on success and 1 when the data file does not exist.
        (Reconstructed cleanly from a garbled side-by-side diff
        rendering; behavior unchanged.)
    """
    if not data_file.exists():
        return 1

    # Run the whole import in one transaction and clear out tables from
    # any previous importance import first.
    # NOTE(review): this drops "wikipedia_importance"; if the CSV import
    # path creates "wikimedia_importance", verify the table name here.
    pre_code = """BEGIN;
                  DROP TABLE IF EXISTS "wikipedia_article";
                  DROP TABLE IF EXISTS "wikipedia_redirect";
                  DROP TABLE IF EXISTS "wikipedia_importance";
               """
    post_code = "COMMIT"
    execute_file(dsn, data_file, ignore_errors=ignore_errors,
                 pre_code=pre_code, post_code=post_code)

    return 0
def import_secondary_importance(dsn: str, data_path: Path, ignore_errors: bool = False) -> int: def import_secondary_importance(dsn: str, data_path: Path, ignore_errors: bool = False) -> int:
""" Replaces the secondary importance raster data table with new data. """ Replaces the secondary importance raster data table with new data.

View File

@@ -28,6 +28,7 @@ class TestRefresh:
('website', 'setup_website'), ('website', 'setup_website'),
]) ])
def test_refresh_command(self, mock_func_factory, command, func): def test_refresh_command(self, mock_func_factory, command, func):
mock_func_factory(nominatim.tools.refresh, 'create_functions')
func_mock = mock_func_factory(nominatim.tools.refresh, func) func_mock = mock_func_factory(nominatim.tools.refresh, func)
assert self.call_nominatim('refresh', '--' + command) == 0 assert self.call_nominatim('refresh', '--' + command) == 0
@@ -71,6 +72,7 @@ class TestRefresh:
assert self.call_nominatim('refresh', '--wiki-data') == 1 assert self.call_nominatim('refresh', '--wiki-data') == 1
def test_refresh_secondary_importance_file_not_found(self): def test_refresh_secondary_importance_file_not_found(self):
assert self.call_nominatim('refresh', '--secondary-importance') == 1 assert self.call_nominatim('refresh', '--secondary-importance') == 1
@@ -84,16 +86,18 @@ class TestRefresh:
assert mocks[1].called == 1 assert mocks[1].called == 1
def test_refresh_importance_computed_after_wiki_import(self, monkeypatch): def test_refresh_importance_computed_after_wiki_import(self, monkeypatch, mock_func_factory):
calls = [] calls = []
monkeypatch.setattr(nominatim.tools.refresh, 'import_wikipedia_articles', monkeypatch.setattr(nominatim.tools.refresh, 'import_wikipedia_articles',
lambda *args, **kwargs: calls.append('import') or 0) lambda *args, **kwargs: calls.append('import') or 0)
monkeypatch.setattr(nominatim.tools.refresh, 'recompute_importance', monkeypatch.setattr(nominatim.tools.refresh, 'recompute_importance',
lambda *args, **kwargs: calls.append('update')) lambda *args, **kwargs: calls.append('update'))
func_mock = mock_func_factory(nominatim.tools.refresh, 'create_functions')
assert self.call_nominatim('refresh', '--importance', '--wiki-data') == 0 assert self.call_nominatim('refresh', '--importance', '--wiki-data') == 0
assert calls == ['import', 'update'] assert calls == ['import', 'update']
assert func_mock.called == 1
@pytest.mark.parametrize('params', [('--data-object', 'w234'), @pytest.mark.parametrize('params', [('--data-object', 'w234'),
('--data-object', 'N23', '--data-object', 'N24'), ('--data-object', 'N23', '--data-object', 'N24'),