Compare commits

...

17 Commits

Author SHA1 Message Date
Sarah Hoffmann
489653b6ed prepare 3.5.2 release 2020-09-24 11:54:17 +02:00
Sarah Hoffmann
bb0c42e638 update osm2pgsql to same version as master 2020-09-24 11:53:23 +02:00
Sarah Hoffmann
2d226be156 remove ST_Covers check when also testing for ST_Intersects
Using both is slightly problematic because they have different
ways to use the index. Newer versions of Postgis exhibit a
query planner issue when both functions appear together.
As ST_Intersects includes ST_Covers, simply remove the latter.
2020-09-24 11:53:23 +02:00
Sarah Hoffmann
61fe274c6e make sure that all postcodes have an entry in word
It may happen that two different postcodes normalize to exactly
the same token. In that case we still need two different entries
in the word table. Token lookup will then make sure that the correct
one is chosen.

Fixes #1953.
2020-09-24 11:53:23 +02:00
marc tobias
0ac99bc2a9 starting PHP 5.4 get_magic_quotes_gpc() returns false, no need to check 2020-09-24 11:53:23 +02:00
Sarah Hoffmann
76ddace267 tests: use larger grid to avoid rounding errors 2020-09-24 11:53:13 +02:00
Sarah Hoffmann
777c70926a increase splitting for large geometries
When computing the address parts for a geometry, we need to do
a ST_Relates lookup in the location_area_large_* tables. This is
potentially very expensive for geometries with many vertices.
There is already a function for splitting large areas to reduce the
impact. This commit reduces the minimum area of a split, effectively
increasing the number of splits.

The effect on database size is minimal (around 3% increase), while
the indexing speed for streets increases by a good 60%.
2020-09-24 11:52:17 +02:00
Sarah Hoffmann
b2886426b7 indexer: allow batch processing of places
Request and process multiple place_ids at once so that
Postgres can make better use of caching and there are fewer
transactions running.
2020-09-24 10:17:31 +02:00
Sarah Hoffmann
a836ca8991 indexer: move progress tracker into separate class 2020-09-24 10:17:21 +02:00
Sarah Hoffmann
30016b98b7 indexer: get rid of special handling of few places
Given that we do not distribute geometry sectors to threads anymore,
there is no point in this kind of special handling.
2020-09-24 10:17:12 +02:00
Sarah Hoffmann
0f5fc10e31 make house number reappear in display name on named POIs
After 6cc6cf950c names and house numbers
of POIs got mingled into a single item when creating the display name.
Add the house number as extra information without place_id to avoid
later mangling.
2020-09-24 10:16:03 +02:00
Sarah Hoffmann
72335fb631 make indexing during updates less quiet
Adjust verbosity behaviour to that of indexing during setup.
2020-09-24 10:14:49 +02:00
Sarah Hoffmann
a863392938 add wiki tags to all styles
wikipedia and wikidata tags are needed to compute the importance
so we need to put them into extra tags for all styles.

Fixes #1885.
2020-09-24 10:13:17 +02:00
Sarah Hoffmann
168c2e222e prepare 3.5.1 release 2020-06-29 20:54:50 +02:00
Sarah Hoffmann
770f8e31a8 update libosmium to 2.15.6
Fixes an issue where osm2pgsql hangs on a particularly
complicated multipolygon.
2020-06-29 20:54:50 +02:00
Sarah Hoffmann
dd55a76d6d make phpcs happy 2020-06-28 23:19:49 +02:00
Sarah Hoffmann
670cff0d09 disable JIT and parallel processing for osm2pgsql in updates
This is known to cause issues because of bad indexing
statistics.
2020-06-28 23:19:41 +02:00
18 changed files with 172 additions and 106 deletions

View File

@@ -20,7 +20,7 @@ project(nominatim)
set(NOMINATIM_VERSION_MAJOR 3)
set(NOMINATIM_VERSION_MINOR 5)
set(NOMINATIM_VERSION_PATCH 0)
set(NOMINATIM_VERSION_PATCH 2)
set(NOMINATIM_VERSION "${NOMINATIM_VERSION_MAJOR}.${NOMINATIM_VERSION_MINOR}.${NOMINATIM_VERSION_PATCH}")

View File

@@ -1,3 +1,22 @@
3.5.2
* ensure that wikipedia tags are imported for all styles
* reinstate verbosity for indexing during updates
* make house number reappear in display name on named POIs
* introduce batch processing in indexer to avoid transaction ID overrun
* increase splitting for large geometries to improve indexing speed
* remove deprecated get_magic_quotes_gpc() function
* make sure that all postcodes have an entry in word and are thus searchable
* remove use of ST_Covers in conjunction with ST_Intersects,
causes bad query planning and slow updates in Postgis3
* update osm2pgsql
3.5.1
* disable jit and parallel processing in PostgreSQL for osm2pgsql
* update libosmium to 2.15.6 (fixes an issue with processing hanging
on large multipolygons)
3.5.0
* structured select on HTML search page

View File

@@ -2,8 +2,3 @@
require_once(CONST_BasePath.'/lib/lib.php');
require_once(CONST_BasePath.'/lib/DB.php');
if (get_magic_quotes_gpc()) {
echo "Please disable magic quotes in your php.ini configuration\n";
exit;
}

View File

View File

@@ -0,0 +1,52 @@
# SPDX-License-Identifier: GPL-2.0-only
#
# This file is part of Nominatim.
# Copyright (C) 2020 Sarah Hoffmann
import logging
from datetime import datetime
log = logging.getLogger()


class ProgressLogger(object):
    """ Tracks and prints progress for the indexing process.

        `name` is the name of the indexing step being tracked.
        `total` sets up the total number of items that need processing.
        `log_interval` denotes the interval in seconds at which progress
        should be reported.
    """

    # Lower bound for measured durations. Keeps rate computations finite
    # when start and end fall on the same clock tick.
    _MIN_ELAPSED_SECONDS = 1e-6

    def __init__(self, name, total, log_interval=1):
        self.name = name
        self.total_places = total
        self.done_places = 0
        self.rank_start_time = datetime.now()
        self.log_interval = log_interval
        # First report after 100 places. When INFO is disabled the
        # threshold is set beyond `total` so that add() stays cheap.
        self.next_info = 100 if log.isEnabledFor(logging.INFO) else total + 1

    def _elapsed_seconds(self):
        """ Return the seconds since tracking started, clamped to a small
            positive value so that callers can safely divide by it.
        """
        elapsed = (datetime.now() - self.rank_start_time).total_seconds()
        return max(elapsed, self._MIN_ELAPSED_SECONDS)

    def add(self, num=1):
        """ Mark `num` places as processed. Print a log message if the
            logging is at least info and the log interval has passed.
        """
        self.done_places += num

        if self.done_places < self.next_info:
            return

        done_time = self._elapsed_seconds()
        places_per_sec = self.done_places / done_time
        remaining = self.total_places - self.done_places
        # A zero rate only happens when nothing has been processed yet;
        # there is no meaningful ETA in that case.
        eta = remaining / places_per_sec if places_per_sec > 0 else 0.0

        log.info("Done {} in {} @ {:.3f} per second - {} ETA (seconds): {:.2f}"
                 .format(self.done_places, int(done_time),
                         places_per_sec, self.name, eta))

        # Schedule the next report after roughly `log_interval` seconds
        # worth of places. Always advance by at least one place, otherwise
        # a rate below 1/s would trigger a message on every call.
        self.next_info += max(1, int(places_per_sec * self.log_interval))

    def done(self):
        """ Print final statistics about the progress.
        """
        diff_seconds = self._elapsed_seconds()

        log.warning("Done {}/{} in {} @ {:.3f} per second - FINISHED {}\n".format(
            self.done_places, self.total_places, int(diff_seconds),
            self.done_places / diff_seconds, self.name))

View File

@@ -32,6 +32,8 @@ import psycopg2
from psycopg2.extras import wait_select
import select
from indexer.progress import ProgressLogger
log = logging.getLogger()
def make_connection(options, asynchronous=False):
@@ -55,24 +57,19 @@ class RankRunner(object):
def name(self):
return "rank {}".format(self.rank)
def sql_index_sectors(self):
return """SELECT geometry_sector, count(*) FROM placex
def sql_count_objects(self):
return """SELECT count(*) FROM placex
WHERE rank_search = {} and indexed_status > 0
GROUP BY geometry_sector
ORDER BY geometry_sector""".format(self.rank)
""".format(self.rank)
def sql_nosector_places(self):
def sql_get_objects(self):
return """SELECT place_id FROM placex
WHERE indexed_status > 0 and rank_search = {}
ORDER BY geometry_sector""".format(self.rank)
def sql_sector_places(self):
return """SELECT place_id FROM placex
WHERE indexed_status > 0 and rank_search = {}
and geometry_sector = %s""".format(self.rank)
def sql_index_place(self):
return "UPDATE placex SET indexed_status = 0 WHERE place_id = %s"
def sql_index_place(self, ids):
return "UPDATE placex SET indexed_status = 0 WHERE place_id IN ({})"\
.format(','.join((str(i) for i in ids)))
class InterpolationRunner(object):
@@ -83,25 +80,19 @@ class InterpolationRunner(object):
def name(self):
return "interpolation lines (location_property_osmline)"
def sql_index_sectors(self):
return """SELECT geometry_sector, count(*) FROM location_property_osmline
WHERE indexed_status > 0
GROUP BY geometry_sector
ORDER BY geometry_sector"""
def sql_count_objects(self):
return """SELECT count(*) FROM location_property_osmline
WHERE indexed_status > 0"""
def sql_nosector_places(self):
def sql_get_objects(self):
return """SELECT place_id FROM location_property_osmline
WHERE indexed_status > 0
ORDER BY geometry_sector"""
def sql_sector_places(self):
return """SELECT place_id FROM location_property_osmline
WHERE indexed_status > 0 and geometry_sector = %s
ORDER BY geometry_sector"""
def sql_index_place(self):
def sql_index_place(self, ids):
return """UPDATE location_property_osmline
SET indexed_status = 0 WHERE place_id = %s"""
SET indexed_status = 0 WHERE place_id IN ({})"""\
.format(','.join((str(i) for i in ids)))
class DBConnection(object):
@@ -210,83 +201,48 @@ class Indexer(object):
self.index(RankRunner(rank))
if self.maxrank == 30:
self.index(InterpolationRunner())
self.index(InterpolationRunner(), 20)
self.index(RankRunner(self.maxrank))
self.index(RankRunner(self.maxrank), 20)
def index(self, obj):
def index(self, obj, batch=1):
""" Index a single rank or table. `obj` describes the SQL to use
for indexing.
for indexing. `batch` describes the number of objects that
should be processed with a single SQL statement
"""
log.warning("Starting {}".format(obj.name()))
cur = self.conn.cursor(name='main')
cur.execute(obj.sql_index_sectors())
cur = self.conn.cursor()
cur.execute(obj.sql_count_objects())
total_tuples = 0
for r in cur:
total_tuples += r[1]
log.debug("Total number of rows; {}".format(total_tuples))
total_tuples = cur.fetchone()[0]
log.debug("Total number of rows: {}".format(total_tuples))
cur.scroll(0, mode='absolute')
cur.close()
next_thread = self.find_free_thread()
done_tuples = 0
rank_start_time = datetime.now()
progress = ProgressLogger(obj.name(), total_tuples)
sector_sql = obj.sql_sector_places()
index_sql = obj.sql_index_place()
min_grouped_tuples = total_tuples - len(self.threads) * 1000
cur = self.conn.cursor(name='places')
cur.execute(obj.sql_get_objects())
next_info = 100 if log.isEnabledFor(logging.INFO) else total_tuples + 1
while True:
places = [p[0] for p in cur.fetchmany(batch)]
if len(places) == 0:
break
for r in cur:
sector = r[0]
# Should we do the remaining ones together?
do_all = done_tuples > min_grouped_tuples
pcur = self.conn.cursor(name='places')
if do_all:
pcur.execute(obj.sql_nosector_places())
else:
pcur.execute(sector_sql, (sector, ))
for place in pcur:
place_id = place[0]
log.debug("Processing place {}".format(place_id))
log.debug("Processing places: {}".format(places))
thread = next(next_thread)
thread.perform(index_sql, (place_id,))
done_tuples += 1
if done_tuples >= next_info:
now = datetime.now()
done_time = (now - rank_start_time).total_seconds()
tuples_per_sec = done_tuples / done_time
log.info("Done {} in {} @ {:.3f} per second - {} ETA (seconds): {:.2f}"
.format(done_tuples, int(done_time),
tuples_per_sec, obj.name(),
(total_tuples - done_tuples)/tuples_per_sec))
next_info += int(tuples_per_sec)
pcur.close()
if do_all:
break
thread.perform(obj.sql_index_place(places))
progress.add(len(places))
cur.close()
for t in self.threads:
t.wait()
rank_end_time = datetime.now()
diff_seconds = (rank_end_time-rank_start_time).total_seconds()
log.warning("Done {}/{} in {} @ {:.3f} per second - FINISHED {}\n".format(
done_tuples, total_tuples, int(diff_seconds),
done_tuples/diff_seconds, obj.name()))
progress.done()
def find_free_thread(self):
""" Generator that returns the next connection that is free for

View File

@@ -5,6 +5,11 @@
"no" : "skip"
}
},
{ "keys" : ["wikipedia", "wikipedia:*", "wikidata"],
"values" : {
"" : "extra"
}
},
{
"keys" : ["name:prefix", "name:suffix", "name:botanical", "*wikidata"],
"values" : {

View File

@@ -1,4 +1,9 @@
[
{ "keys" : ["wikipedia", "wikipedia:*", "wikidata"],
"values" : {
"" : "extra"
}
},
{
"keys" : ["name:prefix", "name:suffix", "name:botanical", "*wikidata"],
"values" : {

View File

@@ -1,4 +1,9 @@
[
{ "keys" : ["wikipedia", "wikipedia:*", "wikidata"],
"values" : {
"" : "extra"
}
},
{
"keys" : ["name:prefix", "name:suffix", "name:botanical", "*wikidata"],
"values" : {

View File

@@ -272,7 +272,7 @@ BEGIN
END IF;
IF searchhousenumber IS NOT NULL THEN
location := ROW(in_place_id, null, null, hstore('ref', searchhousenumber),
location := ROW(null, null, null, hstore('ref', searchhousenumber),
'place', 'house_number', null, null, true, true, 28, 0)::addressline;
RETURN NEXT location;
END IF;

View File

@@ -81,7 +81,8 @@ BEGIN
lookup_word := upper(trim(postcode));
lookup_token := ' ' || make_standard_name(lookup_word);
SELECT min(word_id) FROM word
WHERE word_token = lookup_token and class='place' and type='postcode'
WHERE word_token = lookup_token and word = lookup_word
and class='place' and type='postcode'
INTO return_word_id;
IF return_word_id IS NULL THEN
return_word_id := nextval('seq_word');

View File

@@ -162,14 +162,14 @@ BEGIN
IF st_area(NEW.geometry) < 0.000000001 AND st_area(existinggeometry) < 1 THEN
-- re-index points that have moved in / out of the polygon, could be done as a single query but postgres gets the index usage wrong
update placex set indexed_status = 2 where indexed_status = 0 and
(st_covers(NEW.geometry, placex.geometry) OR ST_Intersects(NEW.geometry, placex.geometry))
AND NOT (st_covers(existinggeometry, placex.geometry) OR ST_Intersects(existinggeometry, placex.geometry))
update placex set indexed_status = 2 where indexed_status = 0
AND ST_Intersects(NEW.geometry, placex.geometry)
AND NOT ST_Intersects(existinggeometry, placex.geometry)
AND rank_search > existingplacex.rank_search AND (rank_search < 28 or name is not null);
update placex set indexed_status = 2 where indexed_status = 0 and
(st_covers(existinggeometry, placex.geometry) OR ST_Intersects(existinggeometry, placex.geometry))
AND NOT (st_covers(NEW.geometry, placex.geometry) OR ST_Intersects(NEW.geometry, placex.geometry))
update placex set indexed_status = 2 where indexed_status = 0
AND ST_Intersects(existinggeometry, placex.geometry)
AND NOT ST_Intersects(NEW.geometry, placex.geometry)
AND rank_search > existingplacex.rank_search AND (rank_search < 28 or name is not null);
END IF;

View File

@@ -455,9 +455,9 @@ BEGIN
-- RAISE WARNING 'placex poly insert: % % % %',NEW.osm_type,NEW.osm_id,NEW.class,NEW.type;
-- work around bug in postgis, this may have been fixed in 2.0.0 (see http://trac.osgeo.org/postgis/ticket/547)
update placex set indexed_status = 2 where (st_covers(NEW.geometry, placex.geometry) OR ST_Intersects(NEW.geometry, placex.geometry))
update placex set indexed_status = 2 where ST_Intersects(NEW.geometry, placex.geometry)
AND rank_search > NEW.rank_search and indexed_status = 0 and ST_geometrytype(placex.geometry) = 'ST_Point' and (rank_search < 28 or name is not null or (NEW.rank_search >= 16 and address ? 'place'));
update placex set indexed_status = 2 where (st_covers(NEW.geometry, placex.geometry) OR ST_Intersects(NEW.geometry, placex.geometry))
update placex set indexed_status = 2 where ST_Intersects(NEW.geometry, placex.geometry)
AND rank_search > NEW.rank_search and indexed_status = 0 and ST_geometrytype(placex.geometry) != 'ST_Point' and (rank_search < 28 or name is not null or (NEW.rank_search >= 16 and address ? 'place'));
END IF;
ELSE

View File

@@ -431,7 +431,7 @@ DECLARE
geo RECORD;
BEGIN
-- 10000000000 is ~~ 1x1 degree
FOR geo IN select quad_split_geometry(geometry, 0.25, 20) as geom LOOP
FOR geo IN select quad_split_geometry(geometry, 0.01, 20) as geom LOOP
RETURN NEXT geo.geom;
END LOOP;
RETURN;
@@ -476,9 +476,9 @@ BEGIN
IF placegeom IS NOT NULL AND ST_IsValid(placegeom) THEN
IF ST_GeometryType(placegeom) in ('ST_Polygon','ST_MultiPolygon') THEN
FOR geom IN select split_geometry(placegeom) FROM placex WHERE place_id = placeid LOOP
update placex set indexed_status = 2 where (st_covers(geom, placex.geometry) OR ST_Intersects(geom, placex.geometry))
update placex set indexed_status = 2 where ST_Intersects(geom, placex.geometry)
AND rank_search > rank and indexed_status = 0 and ST_geometrytype(placex.geometry) = 'ST_Point' and (rank_search < 28 or name is not null or (rank >= 16 and address ? 'place'));
update placex set indexed_status = 2 where (st_covers(geom, placex.geometry) OR ST_Intersects(geom, placex.geometry))
update placex set indexed_status = 2 where ST_Intersects(geom, placex.geometry)
AND rank_search > rank and indexed_status = 0 and ST_geometrytype(placex.geometry) != 'ST_Point' and (rank_search < 28 or name is not null or (rank >= 16 and address ? 'place'));
END LOOP;
ELSE

View File

@@ -137,3 +137,22 @@ Feature: Import of postcodes
And word contains
| word | class | type |
| 01982 | place | postcode |
Scenario: Different postcodes with the same normalization can both be found
Given the places
| osm | class | type | addr+postcode | addr+housenumber | geometry |
| N34 | place | house | EH4 7EA | 111 | country:gb |
| N35 | place | house | E4 7EA | 111 | country:gb |
When importing
Then location_postcode contains exactly
| country | postcode | geometry |
| gb | EH4 7EA | country:gb |
| gb | E4 7EA | country:gb |
When searching for "EH4 7EA"
Then results contain
| type | placename |
| postcode | EH4 7EA |
When searching for "E4 7EA"
Then results contain
| type | placename |
| postcode | E4 7EA |

View File

@@ -4,13 +4,13 @@ import random
import os
from nose.tools import * # for assert functions
@given(u'the (\d+ )?grid')
@given(u'the ([0-9.]+ )?grid')
def define_node_grid(context, grid_step):
"""
Define a grid of node positions.
"""
if grid_step is not None:
grid_step = int(grd_step.strip())
grid_step = float(grid_step.strip())
else:
grid_step = 0.00001

View File

@@ -55,6 +55,7 @@ date_default_timezone_set('Etc/UTC');
$oDB = new Nominatim\DB();
$oDB->connect();
$fPostgresVersion = $oDB->getPostgresVersion();
$aDSNInfo = Nominatim\DB::parseDSN(CONST_Database_DSN);
if (!isset($aDSNInfo['port']) || !$aDSNInfo['port']) $aDSNInfo['port'] = 5432;
@@ -90,13 +91,21 @@ if (isset($aDSNInfo['password']) && $aDSNInfo['password']) {
if (!is_null(CONST_Osm2pgsql_Flatnode_File) && CONST_Osm2pgsql_Flatnode_File) {
$oOsm2pgsqlCmd->addParams('--flat-nodes', CONST_Osm2pgsql_Flatnode_File);
}
if ($fPostgresVersion >= 11.0) {
$oOsm2pgsqlCmd->addEnvPair(
'PGOPTIONS',
'-c jit=off -c max_parallel_workers_per_gather=0'
);
}
$oIndexCmd = (new \Nominatim\Shell(CONST_BasePath.'/nominatim/nominatim.py'))
->addParams('--database', $aDSNInfo['database'])
->addParams('--port', $aDSNInfo['port'])
->addParams('--threads', $aResult['index-instances']);
if (!$aResult['quiet']) {
$oIndexCmd->addParams('--verbose');
}
if ($aResult['verbose']) {
$oIndexCmd->addParams('--verbose');
}