Merge pull request #2829 from lonvia/optimize-indexes

Further optimize indexes
This commit is contained in:
Sarah Hoffmann
2022-09-26 10:02:51 +02:00
committed by GitHub
9 changed files with 122 additions and 65 deletions

View File

@@ -52,7 +52,9 @@ BEGIN
IF parent_place_id is null THEN
FOR location IN SELECT place_id FROM placex
WHERE ST_DWithin(geom, placex.geometry, 0.001) and placex.rank_search = 26
WHERE ST_DWithin(geom, placex.geometry, 0.001)
and placex.rank_search = 26
and placex.osm_type = 'W' -- needed for index selection
ORDER BY CASE WHEN ST_GeometryType(geom) = 'ST_Line' THEN
(ST_distance(placex.geometry, ST_LineInterpolatePoint(geom,0))+
ST_distance(placex.geometry, ST_LineInterpolatePoint(geom,0.5))+

View File

@@ -197,6 +197,7 @@ BEGIN
SELECT place_id FROM placex
WHERE bbox && geometry AND _ST_Covers(geometry, ST_Centroid(bbox))
AND rank_address between 5 and 25
AND ST_GeometryType(geometry) in ('ST_Polygon','ST_MultiPolygon')
ORDER BY rank_address desc
LOOP
RETURN location.place_id;
@@ -212,6 +213,7 @@ BEGIN
SELECT place_id FROM placex
WHERE bbox && geometry AND _ST_Covers(geometry, ST_Centroid(bbox))
AND rank_address between 5 and 25
AND ST_GeometryType(geometry) in ('ST_Polygon','ST_MultiPolygon')
ORDER BY rank_address desc
LOOP
RETURN location.place_id;
@@ -275,7 +277,9 @@ BEGIN
-- If extratags has a place tag, look for linked nodes by their place type.
-- Area and node still have to have the same name.
IF bnd.extratags ? 'place' and bnd_name is not null THEN
IF bnd.extratags ? 'place' and bnd.extratags->'place' != 'postcode'
and bnd_name is not null
THEN
FOR linked_placex IN
SELECT * FROM placex
WHERE (position(lower(name->'name') in bnd_name) > 0
@@ -284,7 +288,6 @@ BEGIN
AND placex.osm_type = 'N'
AND (placex.linked_place_id is null or placex.linked_place_id = bnd.place_id)
AND placex.rank_search < 26 -- needed to select the right index
AND placex.type != 'postcode'
AND ST_Covers(bnd.geometry, placex.geometry)
LOOP
{% if debug %}RAISE WARNING 'Found type-matching place node %', linked_placex.osm_id;{% endif %}
@@ -846,7 +849,8 @@ BEGIN
FROM placex
WHERE osm_type = 'R' and class = 'boundary' and type = 'administrative'
and admin_level < NEW.admin_level and admin_level > 3
and rank_address > 0
and rank_address between 1 and 25 -- for index selection
and ST_GeometryType(geometry) in ('ST_Polygon','ST_MultiPolygon') -- for index selection
and geometry && NEW.centroid and _ST_Covers(geometry, NEW.centroid)
ORDER BY admin_level desc LIMIT 1
LOOP

View File

@@ -10,62 +10,73 @@
CREATE INDEX IF NOT EXISTS idx_place_addressline_address_place_id
ON place_addressline USING BTREE (address_place_id) {{db.tablespace.search_index}};
---
CREATE INDEX IF NOT EXISTS idx_placex_rank_search
ON placex USING BTREE (rank_search) {{db.tablespace.search_index}};
---
CREATE INDEX IF NOT EXISTS idx_placex_rank_address
ON placex USING BTREE (rank_address) {{db.tablespace.search_index}};
---
CREATE INDEX IF NOT EXISTS idx_placex_parent_place_id
ON placex USING BTREE (parent_place_id) {{db.tablespace.search_index}}
WHERE parent_place_id IS NOT NULL;
---
CREATE INDEX IF NOT EXISTS idx_placex_geometry ON placex
USING GIST (geometry) {{db.tablespace.search_index}};
---
CREATE INDEX IF NOT EXISTS idx_placex_geometry_reverse_lookupPolygon
ON placex USING gist (geometry) {{db.tablespace.search_index}}
WHERE St_GeometryType(geometry) in ('ST_Polygon', 'ST_MultiPolygon')
AND rank_address between 4 and 25 AND type != 'postcode'
AND name is not null AND indexed_status = 0 AND linked_place_id is null;
---
CREATE INDEX IF NOT EXISTS idx_osmline_parent_place_id
ON location_property_osmline USING BTREE (parent_place_id) {{db.tablespace.search_index}}
WHERE parent_place_id is not null;
---
CREATE INDEX IF NOT EXISTS idx_osmline_parent_osm_id
ON location_property_osmline USING BTREE (osm_id) {{db.tablespace.search_index}};
---
CREATE INDEX IF NOT EXISTS idx_postcode_postcode
ON location_postcode USING BTREE (postcode) {{db.tablespace.search_index}};
{% if drop %}
---
DROP INDEX IF EXISTS idx_placex_geometry_address_area_candidates;
DROP INDEX IF EXISTS idx_placex_geometry_buildings;
DROP INDEX IF EXISTS idx_placex_geometry_lower_rank_ways;
DROP INDEX IF EXISTS idx_placex_wikidata;
DROP INDEX IF EXISTS idx_placex_rank_address_sector;
DROP INDEX IF EXISTS idx_placex_rank_boundaries_sector;
{% else %}
-- Indices only needed for updating.
{% if not drop %}
CREATE INDEX IF NOT EXISTS idx_placex_pendingsector
ON placex USING BTREE (rank_address,geometry_sector) {{db.tablespace.address_index}}
WHERE indexed_status > 0;
---
CREATE INDEX IF NOT EXISTS idx_location_area_country_place_id
ON location_area_country USING BTREE (place_id) {{db.tablespace.address_index}};
---
CREATE UNIQUE INDEX IF NOT EXISTS idx_place_osm_unique
ON place USING btree(osm_id, osm_type, class, type) {{db.tablespace.address_index}};
{% endif %}
-- Indices only needed for search.
{% if 'search_name' in db.tables %}
---
CREATE INDEX IF NOT EXISTS idx_search_name_nameaddress_vector
ON search_name USING GIN (nameaddress_vector) WITH (fastupdate = off) {{db.tablespace.search_index}};
---
CREATE INDEX IF NOT EXISTS idx_search_name_name_vector
ON search_name USING GIN (name_vector) WITH (fastupdate = off) {{db.tablespace.search_index}};
---
CREATE INDEX IF NOT EXISTS idx_search_name_centroid
ON search_name USING GIST (centroid) {{db.tablespace.search_index}};
{% if postgres.has_index_non_key_column %}
---
CREATE INDEX IF NOT EXISTS idx_placex_housenumber
ON placex USING btree (parent_place_id)
INCLUDE (housenumber) {{db.tablespace.search_index}}
WHERE housenumber is not null;
---
CREATE INDEX IF NOT EXISTS idx_osmline_parent_osm_id_with_hnr
ON location_property_osmline USING btree(parent_place_id)
INCLUDE (startnumber, endnumber) {{db.tablespace.search_index}}

View File

@@ -137,7 +137,9 @@ CREATE TABLE place_addressline (
) {{db.tablespace.search_data}};
CREATE INDEX idx_place_addressline_place_id on place_addressline USING BTREE (place_id) {{db.tablespace.search_index}};
drop table if exists placex;
--------- PLACEX - storage for all indexed places -----------------
DROP TABLE IF EXISTS placex;
CREATE TABLE placex (
place_id BIGINT NOT NULL,
parent_place_id BIGINT,
@@ -157,24 +159,66 @@ CREATE TABLE placex (
postcode TEXT,
centroid GEOMETRY(Geometry, 4326)
) {{db.tablespace.search_data}};
CREATE UNIQUE INDEX idx_place_id ON placex USING BTREE (place_id) {{db.tablespace.search_index}};
CREATE INDEX idx_placex_osmid ON placex USING BTREE (osm_type, osm_id) {{db.tablespace.search_index}};
CREATE INDEX idx_placex_linked_place_id ON placex USING BTREE (linked_place_id) {{db.tablespace.address_index}} WHERE linked_place_id IS NOT NULL;
CREATE INDEX idx_placex_rank_search ON placex USING BTREE (rank_search, geometry_sector) {{db.tablespace.address_index}};
CREATE INDEX idx_placex_geometry ON placex USING GIST (geometry) {{db.tablespace.search_index}};
{% for osm_type in ('N', 'W', 'R') %}
CREATE INDEX idx_placex_osmid_{{osm_type | lower}} ON placex
USING BTREE (osm_id) {{db.tablespace.search_index}}
WHERE osm_type = '{{osm_type}}';
{% endfor %}
-- Usage: - removing linkage status on update
-- - lookup linked places for /details
CREATE INDEX idx_placex_linked_place_id ON placex
USING BTREE (linked_place_id) {{db.tablespace.address_index}}
WHERE linked_place_id IS NOT NULL;
-- Usage: - check that admin boundaries do not overtake each other rank-wise
-- - check that place node in a admin boundary with the same address level
-- - boundary is not completely contained in a place area
-- - parenting of large-area or unparentable features
CREATE INDEX idx_placex_geometry_address_area_candidates ON placex
USING gist (geometry) {{db.tablespace.address_index}}
WHERE rank_address between 1 and 25
and ST_GeometryType(geometry) in ('ST_Polygon','ST_MultiPolygon');
-- Usage: - POI is within building with housenumber
CREATE INDEX idx_placex_geometry_buildings ON placex
USING {{postgres.spgist_geom}} (geometry) {{db.tablespace.address_index}}
WHERE address is not null and rank_search = 30
and ST_GeometryType(geometry) in ('ST_Polygon','ST_MultiPolygon');
-- Usage: - linking of similar named places to boundaries
-- - linking of place nodes with same type to boundaries
-- - lookupPolygon()
CREATE INDEX idx_placex_geometry_placenode ON placex
USING {{postgres.spgist_geom}} (geometry) {{db.tablespace.address_index}}
WHERE osm_type = 'N' and rank_search < 26
and class = 'place' and type != 'postcode' and linked_place_id is null;
CREATE INDEX idx_placex_wikidata on placex USING BTREE ((extratags -> 'wikidata')) {{db.tablespace.address_index}} WHERE extratags ? 'wikidata' and class = 'place' and osm_type = 'N' and rank_search < 26;
and class = 'place' and type != 'postcode';
-- Usage: - is node part of a way?
-- - find parent of interpolation spatially
CREATE INDEX idx_placex_geometry_lower_rank_ways ON placex
USING {{postgres.spgist_geom}} (geometry) {{db.tablespace.address_index}}
WHERE osm_type = 'W' and rank_search >= 26;
-- Usage: - linking place nodes by wikidata tag to boundaries
CREATE INDEX idx_placex_wikidata on placex
USING BTREE ((extratags -> 'wikidata')) {{db.tablespace.address_index}}
WHERE extratags ? 'wikidata' and class = 'place'
and osm_type = 'N' and rank_search < 26;
-- The following two indexes function as a todo list for indexing.
CREATE INDEX idx_placex_rank_address_sector ON placex
USING BTREE (rank_address, geometry_sector) {{db.tablespace.address_index}}
WHERE indexed_status > 0;
CREATE INDEX idx_placex_rank_boundaries_sector ON placex
USING BTREE (rank_search, geometry_sector) {{db.tablespace.address_index}}
WHERE class = 'boundary' and type = 'administrative'
and indexed_status > 0;
DROP SEQUENCE IF EXISTS seq_place;
CREATE SEQUENCE seq_place start 1;

View File

@@ -15,7 +15,7 @@ from pathlib import Path
import psutil
from nominatim.config import Configuration
from nominatim.db.connection import connect, Connection
from nominatim.db.connection import connect
from nominatim.db import status, properties
from nominatim.tokenizer.base import AbstractTokenizer
from nominatim.version import version_str
@@ -72,6 +72,8 @@ class SetupAll:
from ..tools import database_import, refresh, postcodes, freeze
from ..indexer.indexer import Indexer
num_threads = args.threads or psutil.cpu_count() or 1
country_info.setup_country_config(args.config)
if args.continue_at is None:
@@ -109,8 +111,7 @@ class SetupAll:
database_import.truncate_data_tables(conn)
LOG.warning('Load data into placex table')
database_import.load_data(args.config.get_libpq_dsn(),
args.threads or psutil.cpu_count() or 1)
database_import.load_data(args.config.get_libpq_dsn(), num_threads)
LOG.warning("Setting up tokenizer")
tokenizer = self._get_tokenizer(args.continue_at, args.config)
@@ -121,18 +122,15 @@ class SetupAll:
args.project_dir, tokenizer)
if args.continue_at is None or args.continue_at in ('load-data', 'indexing'):
if args.continue_at is not None and args.continue_at != 'load-data':
with connect(args.config.get_libpq_dsn()) as conn:
self._create_pending_index(conn, args.config.TABLESPACE_ADDRESS_INDEX)
LOG.warning('Indexing places')
indexer = Indexer(args.config.get_libpq_dsn(), tokenizer,
args.threads or psutil.cpu_count() or 1)
indexer = Indexer(args.config.get_libpq_dsn(), tokenizer, num_threads)
indexer.index_full(analyse=not args.index_noanalyse)
LOG.warning('Post-process tables')
with connect(args.config.get_libpq_dsn()) as conn:
database_import.create_search_indices(conn, args.config,
drop=args.no_updates)
drop=args.no_updates,
threads=num_threads)
LOG.warning('Create search index for default country names.')
country_info.create_country_names(conn, tokenizer,
args.config.get_str_list('LANGUAGES'))
@@ -188,27 +186,6 @@ class SetupAll:
return tokenizer_factory.get_tokenizer_for_db(config)
def _create_pending_index(self, conn: Connection, tablespace: str) -> None:
""" Add a supporting index for finding places still to be indexed.
This index is normally created at the end of the import process
for later updates. When indexing was partially done, then this
index can greatly improve speed going through already indexed data.
"""
if conn.index_exists('idx_placex_pendingsector'):
return
with conn.cursor() as cur:
LOG.warning('Creating support index')
if tablespace:
tablespace = 'TABLESPACE ' + tablespace
cur.execute(f"""CREATE INDEX idx_placex_pendingsector
ON placex USING BTREE (rank_address,geometry_sector)
{tablespace} WHERE indexed_status > 0
""")
conn.commit()
def _finalize_database(self, dsn: str, offline: bool) -> None:
""" Determine the database date and set the status accordingly.
"""

View File

@@ -11,6 +11,7 @@ from typing import Set, Dict, Any
import jinja2
from nominatim.db.connection import Connection
from nominatim.db.async_connection import WorkerPool
from nominatim.config import Configuration
def _get_partitions(conn: Connection) -> Set[int]:
@@ -96,3 +97,21 @@ class SQLPreprocessor:
with conn.cursor() as cur:
cur.execute(sql)
conn.commit()
def run_parallel_sql_file(self, dsn: str, name: str, num_threads: int = 1,
**kwargs: Any) -> None:
""" Execure the given SQL files using parallel asynchronous connections.
The keyword arguments may supply additional parameters for
preprocessing.
After preprocessing the SQL code is cut at lines containing only
'---'. Each chunk is sent to one of the `num_threads` workers.
"""
sql = self.env.get_template(name).render(**kwargs)
parts = sql.split('\n---\n')
with WorkerPool(dsn, num_threads) as pool:
for part in parts:
pool.next_free_worker().perform(part)

View File

@@ -114,9 +114,10 @@ def _get_indexes(conn: Connection) -> List[str]:
indexes.extend(('idx_placex_housenumber',
'idx_osmline_parent_osm_id_with_hnr'))
if conn.table_exists('place'):
indexes.extend(('idx_placex_pendingsector',
'idx_location_area_country_place_id',
'idx_place_osm_unique'))
indexes.extend(('idx_location_area_country_place_id',
'idx_place_osm_unique',
'idx_placex_rank_address_sector',
'idx_placex_rank_boundaries_sector'))
return indexes
@@ -199,7 +200,7 @@ def check_tokenizer(_: Connection, config: Configuration) -> CheckResult:
def check_existance_wikipedia(conn: Connection, _: Configuration) -> CheckResult:
""" Checking for wikipedia/wikidata data
"""
if not conn.table_exists('search_name'):
if not conn.table_exists('search_name') or not conn.table_exists('place'):
return CheckState.NOT_APPLICABLE
with conn.cursor() as cur:

View File

@@ -225,7 +225,8 @@ def load_data(dsn: str, threads: int) -> None:
cur.execute('ANALYSE')
def create_search_indices(conn: Connection, config: Configuration, drop: bool = False) -> None:
def create_search_indices(conn: Connection, config: Configuration,
drop: bool = False, threads: int = 1) -> None:
""" Create tables that have explicit partitioning.
"""
@@ -243,4 +244,5 @@ def create_search_indices(conn: Connection, config: Configuration, drop: bool =
sql = SQLPreprocessor(conn, config)
sql.run_sql_file(conn, 'indices.sql', drop=drop)
sql.run_parallel_sql_file(config.get_libpq_dsn(),
'indices.sql', min(8, threads), drop=drop)

View File

@@ -105,11 +105,8 @@ class TestCliImportWithDb:
for mock in mocks:
assert mock.called == 1, "Mock '{}' not called".format(mock.func_name)
assert temp_db_conn.index_exists('idx_placex_pendingsector')
# Calling it again still works for the index
assert self.call_nominatim('import', '--continue', 'indexing') == 0
assert temp_db_conn.index_exists('idx_placex_pendingsector')
def test_import_continue_postprocess(self, mock_func_factory):