Merge pull request #3906 from AyushDharDubey/fix/issue_2463-Use-search_name-table-for-TIGER-data-imports-on-'dropped'-databases

Use `search_name` as fallback for TIGER imports when update tables are dropped
This commit is contained in:
Sarah Hoffmann
2026-01-23 20:52:40 +01:00
committed by GitHub
4 changed files with 167 additions and 32 deletions

View File

@@ -15,6 +15,99 @@ CREATE TABLE location_property_tiger_import (
step SMALLINT, step SMALLINT,
postcode TEXT); postcode TEXT);
-- Lookup functions for tiger import when update
-- information is dropped (see gh-issue #2463)
CREATE OR REPLACE FUNCTION getNearestNamedRoadPlaceIdSlow(in_centroid GEOMETRY,
                                                          in_token_info JSONB)
  RETURNS BIGINT
  AS $$
DECLARE
  nearest_id BIGINT;
BEGIN
  -- Fallback lookup via search_name when the partitioned road tables
  -- have been dropped: pick the closest street-level row (rank 26-27)
  -- whose name vector matches the given search tokens.
  SELECT place_id
    INTO nearest_id
    FROM search_name
    WHERE token_matches_street(in_token_info, name_vector)
          -- restrict the candidates to a small bounding box for speed
          AND centroid && ST_Expand(in_centroid, 0.015)
          AND address_rank BETWEEN 26 AND 27
    ORDER BY ST_Distance(centroid, in_centroid)
    LIMIT 1;

  RETURN nearest_id;
END
$$
LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION getNearestParallelRoadFeatureSlow(line GEOMETRY)
  RETURNS BIGINT
  AS $$
DECLARE
  candidate RECORD;
  radius FLOAT;
  start_pt GEOMETRY;
  mid_pt GEOMETRY;
  end_pt GEOMETRY;
BEGIN
  -- Only plain line strings can be matched against a parallel road.
  IF ST_GeometryType(line) <> 'ST_LineString' THEN
    RETURN NULL;
  END IF;

  -- Sample three reference points along the line; the road whose summed
  -- distance to all three is smallest runs most nearly parallel to it.
  start_pt := ST_LineInterpolatePoint(line, 0);
  mid_pt := ST_LineInterpolatePoint(line, 0.5);
  end_pt := ST_LineInterpolatePoint(line, 1);

  -- Double the search radius until a road is found or the cap is hit.
  radius := 0.0005;
  WHILE radius < 0.01 LOOP
    FOR candidate IN
      SELECT place_id
        FROM placex
        WHERE ST_DWithin(line, geometry, radius)
              AND rank_address BETWEEN 26 AND 27
        ORDER BY ST_Distance(geometry, start_pt)
                 + ST_Distance(geometry, mid_pt)
                 + ST_Distance(geometry, end_pt)
        LIMIT 1
    LOOP
      RETURN candidate.place_id;
    END LOOP;
    radius := radius * 2;
  END LOOP;

  RETURN NULL;
END
$$
LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION getNearestRoadPlaceIdSlow(point GEOMETRY)
  RETURNS BIGINT
  AS $$
DECLARE
  candidate RECORD;
  radius FLOAT;
BEGIN
  -- Find the street-level place (rank 26-27) closest to the given point,
  -- doubling the search radius until a match is found or the cap is hit.
  radius := 0.00005;
  WHILE radius < 0.1 LOOP
    FOR candidate IN
      SELECT place_id
        FROM placex
        WHERE ST_DWithin(geometry, point, radius)
              AND rank_address BETWEEN 26 AND 27
        ORDER BY ST_Distance(geometry, point)
        LIMIT 1
    LOOP
      RETURN candidate.place_id;
    END LOOP;
    radius := radius * 2;
  END LOOP;

  -- Nothing within the maximum search radius.
  RETURN NULL;
END
$$
LANGUAGE plpgsql;
-- Tiger import function
CREATE OR REPLACE FUNCTION tiger_line_import(linegeo GEOMETRY, in_startnumber INTEGER, CREATE OR REPLACE FUNCTION tiger_line_import(linegeo GEOMETRY, in_startnumber INTEGER,
in_endnumber INTEGER, interpolationtype TEXT, in_endnumber INTEGER, interpolationtype TEXT,
token_info JSONB, in_postcode TEXT) RETURNS INTEGER token_info JSONB, in_postcode TEXT) RETURNS INTEGER
@@ -71,28 +164,51 @@ BEGIN
place_centroid := ST_Centroid(linegeo); place_centroid := ST_Centroid(linegeo);
out_partition := get_partition('us'); out_partition := get_partition('us');
out_parent_place_id := getNearestNamedRoadPlaceId(out_partition, place_centroid, -- HYBRID LOOKUP LOGIC (see gh-issue #2463)
-- if partition tables exist, use them for fast spatial lookups
{% if 'location_road_0' in db.tables %}
out_parent_place_id := getNearestNamedRoadPlaceId(out_partition, place_centroid,
token_info); token_info);
IF out_parent_place_id IS NULL THEN IF out_parent_place_id IS NULL THEN
SELECT getNearestParallelRoadFeature(out_partition, linegeo) SELECT getNearestParallelRoadFeature(out_partition, linegeo)
INTO out_parent_place_id; INTO out_parent_place_id;
END IF;
IF out_parent_place_id IS NULL THEN
SELECT getNearestRoadPlaceId(out_partition, place_centroid)
INTO out_parent_place_id;
END IF;
-- When updatable information has been dropped:
-- Partition tables no longer exist, but search_name still persists.
{% elif 'search_name' in db.tables %}
-- Fallback: Look up in 'search_name' table
-- though spatial lookups here can be slower.
out_parent_place_id := getNearestNamedRoadPlaceIdSlow(place_centroid, token_info);
IF out_parent_place_id IS NULL THEN
out_parent_place_id := getNearestParallelRoadFeatureSlow(linegeo);
END IF;
IF out_parent_place_id IS NULL THEN
out_parent_place_id := getNearestRoadPlaceIdSlow(place_centroid);
END IF;
{% endif %}
-- If parent was found, insert street(line) into import table
IF out_parent_place_id IS NOT NULL THEN
INSERT INTO location_property_tiger_import (linegeo, place_id, partition,
parent_place_id, startnumber, endnumber,
step, postcode)
VALUES (linegeo, nextval('seq_place'), out_partition,
out_parent_place_id, startnumber, endnumber,
stepsize, in_postcode);
RETURN 1;
END IF; END IF;
RETURN 0;
IF out_parent_place_id IS NULL THEN
SELECT getNearestRoadPlaceId(out_partition, place_centroid)
INTO out_parent_place_id;
END IF;
--insert street(line) into import table
insert into location_property_tiger_import (linegeo, place_id, partition,
parent_place_id, startnumber, endnumber,
step, postcode)
values (linegeo, nextval('seq_place'), out_partition,
out_parent_place_id, startnumber, endnumber,
stepsize, in_postcode);
RETURN 1;
END; END;
$$ $$
LANGUAGE plpgsql; LANGUAGE plpgsql;

View File

@@ -65,14 +65,14 @@ class UpdateAddData:
def run(self, args: NominatimArgs) -> int: def run(self, args: NominatimArgs) -> int:
from ..tools import add_osm_data from ..tools import add_osm_data
if args.tiger_data:
return asyncio.run(self._add_tiger_data(args))
with connect(args.config.get_libpq_dsn()) as conn: with connect(args.config.get_libpq_dsn()) as conn:
if is_frozen(conn): if is_frozen(conn):
print('Database is marked frozen. New data can\'t be added.') print('Database is marked frozen. New data can\'t be added.')
return 1 return 1
if args.tiger_data:
return asyncio.run(self._add_tiger_data(args))
osm2pgsql_params = args.osm2pgsql_options(default_cache=1000, default_threads=1) osm2pgsql_params = args.osm2pgsql_options(default_cache=1000, default_threads=1)
if args.file or args.diff: if args.file or args.diff:
return add_osm_data.add_data_from_file(args.config.get_libpq_dsn(), return add_osm_data.add_data_from_file(args.config.get_libpq_dsn(),

View File

@@ -17,13 +17,12 @@ import tarfile
from psycopg.types.json import Json from psycopg.types.json import Json
from ..config import Configuration from ..config import Configuration
from ..db.connection import connect from ..db.connection import connect, table_exists
from ..db.sql_preprocessor import SQLPreprocessor from ..db.sql_preprocessor import SQLPreprocessor
from ..errors import UsageError from ..errors import UsageError
from ..db.query_pool import QueryPool from ..db.query_pool import QueryPool
from ..data.place_info import PlaceInfo from ..data.place_info import PlaceInfo
from ..tokenizer.base import AbstractTokenizer from ..tokenizer.base import AbstractTokenizer
from . import freeze
LOG = logging.getLogger() LOG = logging.getLogger()
@@ -90,16 +89,19 @@ async def add_tiger_data(data_dir: str, config: Configuration, threads: int,
""" """
dsn = config.get_libpq_dsn() dsn = config.get_libpq_dsn()
with connect(dsn) as conn:
if freeze.is_frozen(conn):
raise UsageError("Tiger cannot be imported when database frozen (Github issue #3048)")
with TigerInput(data_dir) as tar: with TigerInput(data_dir) as tar:
if not tar: if not tar:
return 1 return 1
with connect(dsn) as conn: with connect(dsn) as conn:
sql = SQLPreprocessor(conn, config) sql = SQLPreprocessor(conn, config)
if not table_exists(conn, 'search_name'):
raise UsageError(
"Cannot perform tiger import: required tables are missing. "
"See https://github.com/osm-search/Nominatim/issues/2463 for details."
)
sql.run_sql_file(conn, 'tiger_import_start.sql') sql.run_sql_file(conn, 'tiger_import_start.sql')
# Reading files and then for each file line handling # Reading files and then for each file line handling

View File

@@ -32,6 +32,8 @@ class MockTigerTable:
# We need this table to determine if the database is frozen or not # We need this table to determine if the database is frozen or not
cur.execute("CREATE TABLE place (number INTEGER)") cur.execute("CREATE TABLE place (number INTEGER)")
# We need this table to determine if the database is in reverse-only mode
cur.execute("CREATE TABLE search_name (place_id BIGINT)")
def count(self): def count(self):
return execute_scalar(self.conn, "SELECT count(*) FROM tiger") return execute_scalar(self.conn, "SELECT count(*) FROM tiger")
@@ -87,15 +89,30 @@ async def test_add_tiger_data(def_config, src_dir, tiger_table, tokenizer_mock,
assert tiger_table.count() == 6213 assert tiger_table.count() == 6213
@pytest.mark.parametrize("threads", (1, 5))
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_add_tiger_data_database_frozen(def_config, temp_db_conn, tiger_table, tokenizer_mock, async def test_add_tiger_data_database_frozen(def_config, src_dir, temp_db_conn, tiger_table,
tmp_path): tokenizer_mock, threads):
freeze.drop_update_tables(temp_db_conn) freeze.drop_update_tables(temp_db_conn)
with pytest.raises(UsageError) as excinfo: await tiger_data.add_tiger_data(str(src_dir / 'test' / 'testdb' / 'tiger'),
await tiger_data.add_tiger_data(str(tmp_path), def_config, 1, tokenizer_mock()) def_config, threads, tokenizer_mock())
assert "database frozen" in str(excinfo.value) assert tiger_table.count() == 6213
@pytest.mark.asyncio
async def test_add_tiger_data_reverse_only(def_config, src_dir, temp_db_conn, tiger_table,
tokenizer_mock):
with temp_db_conn.cursor() as cur:
cur.execute("DROP TABLE search_name")
temp_db_conn.commit()
with pytest.raises(UsageError,
match="Cannot perform tiger import: required tables are missing. "
"See https://github.com/osm-search/Nominatim/issues/2463 for details."):
await tiger_data.add_tiger_data(str(src_dir / 'test' / 'testdb' / 'tiger'),
def_config, 1, tokenizer_mock())
assert tiger_table.count() == 0 assert tiger_table.count() == 0