rework postcode computation

Now adds areas to location_postcodes, ignores postcode points
inside areas and supports customizable extents.
This commit is contained in:
Sarah Hoffmann
2025-12-23 15:46:36 +01:00
parent a97b5d97cb
commit d1cb578535
5 changed files with 291 additions and 176 deletions

View File

@@ -2,10 +2,10 @@
--
-- This file is part of Nominatim. (https://nominatim.org)
--
-- Copyright (C) 2022 by the Nominatim developer community.
-- Copyright (C) 2025 by the Nominatim developer community.
-- For a full list of authors see the git log.
-- Trigger functions for location_postcode table.
-- Trigger functions for location_postcodes table.
-- Trigger for updates of location_postcode
@@ -13,7 +13,7 @@
-- Computes the parent object the postcode most likely refers to.
-- This will be the place that determines the address displayed when
-- searching for this postcode.
CREATE OR REPLACE FUNCTION postcode_update()
CREATE OR REPLACE FUNCTION postcodes_update()
RETURNS TRIGGER
AS $$
DECLARE
@@ -28,13 +28,10 @@ BEGIN
partition := get_partition(NEW.country_code);
SELECT * FROM get_postcode_rank(NEW.country_code, NEW.postcode)
INTO NEW.rank_search, NEW.rank_address;
NEW.parent_place_id = 0;
FOR location IN
SELECT place_id
FROM getNearFeatures(partition, NEW.geometry, NEW.geometry, NEW.rank_search)
FROM getNearFeatures(partition, NEW.centroid, NEW.centroid, NEW.rank_search)
WHERE NOT isguess ORDER BY rank_address DESC, distance asc LIMIT 1
LOOP
NEW.parent_place_id = location.place_id;
@@ -45,3 +42,89 @@ END;
$$
LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION postcodes_delete()
RETURNS TRIGGER
AS $$
BEGIN
{% if not disable_diff_updates %}
UPDATE placex p SET indexed_status = 2
WHERE p.postcode = OLD.postcode AND ST_Intersects(OLD.geometry, p.geometry)
AND indexed_status = 0;
{% endif %}
RETURN OLD;
END;
$$
LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION postcodes_insert()
RETURNS TRIGGER
AS $$
DECLARE
existing RECORD;
BEGIN
IF NEW.osm_id is not NULL THEN
-- postcode area, remove existing from same OSM object
SELECT * INTO existing FROM location_postcodes p
WHERE p.osm_id = NEW.osm_id;
IF existing.place_id is not NULL THEN
IF existing.postcode != NEW.postcode or existing.country_code != NEW.country_code THEN
DELETE FROM location_postcodes p WHERE p.osm_id = NEW.osm_id;
existing := NULL;
END IF;
END IF;
END IF;
IF existing is NULL THEN
SELECT * INTO existing FROM location_postcodes p
WHERE p.country_code = NEW.country_code AND p.postcode = NEW.postcode;
IF existing.postcode is NULL THEN
{% if not disable_diff_updates %}
UPDATE placex p SET indexed_status = 2
WHERE ST_Intersects(NEW.geometry, p.geometry)
AND indexed_status = 0
AND p.rank_address >= 22 AND not p.address ? 'postcode';
{% endif %}
-- new entry, just insert
NEW.indexed_status := 1;
NEW.place_id := nextval('seq_place');
RETURN NEW;
END IF;
END IF;
-- update: only when there are changes
IF coalesce(NEW.osm_id, -1) != coalesce(existing.osm_id, -1)
OR (NEW.osm_id is not null AND NEW.geometry::text != existing.geometry::text)
OR (NEW.osm_id is null
AND (abs(ST_X(existing.centroid) - ST_X(NEW.centroid)) > 0.0000001
OR abs(ST_Y(existing.centroid) - ST_Y(NEW.centroid)) > 0.0000001))
THEN
{% if not disable_diff_updates %}
UPDATE placex p SET indexed_status = 2
WHERE ST_Intersects(ST_Difference(NEW.geometry, existing.geometry), p.geometry)
AND indexed_status = 0
AND p.rank_address >= 22 AND not p.address ? 'postcode';
UPDATE placex p SET indexed_status = 2
WHERE ST_Intersects(ST_Difference(existing.geometry, NEW.geometry), p.geometry)
AND indexed_status = 0
AND p.postcode = OLD.postcode;
{% endif %}
UPDATE location_postcodes p
SET osm_id = NEW.osm_id,
indexed_status = 2,
centroid = NEW.centroid,
geometry = NEW.geometry
WHERE p.country_code = NEW.country_code AND p.postcode = NEW.postcode;
END IF;
RETURN NULL;
END;
$$
LANGUAGE plpgsql;

View File

@@ -2,7 +2,7 @@
--
-- This file is part of Nominatim. (https://nominatim.org)
--
-- Copyright (C) 2022 by the Nominatim developer community.
-- Copyright (C) 2025 by the Nominatim developer community.
-- For a full list of authors see the git log.
-- Functions related to search and address ranks
@@ -114,66 +114,6 @@ $$
LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE;
-- Guess a ranking for postcodes from country and postcode format.
CREATE OR REPLACE FUNCTION get_postcode_rank(country_code VARCHAR(2), postcode TEXT,
OUT rank_search SMALLINT,
OUT rank_address SMALLINT)
AS $$
DECLARE
part TEXT;
BEGIN
rank_search := 30;
rank_address := 30;
postcode := upper(postcode);
IF country_code = 'gb' THEN
IF postcode ~ '^([A-Z][A-Z]?[0-9][0-9A-Z]? [0-9][A-Z][A-Z])$' THEN
rank_search := 25;
rank_address := 5;
ELSEIF postcode ~ '^([A-Z][A-Z]?[0-9][0-9A-Z]? [0-9])$' THEN
rank_search := 23;
rank_address := 5;
ELSEIF postcode ~ '^([A-Z][A-Z]?[0-9][0-9A-Z])$' THEN
rank_search := 21;
rank_address := 5;
END IF;
ELSEIF country_code = 'sg' THEN
IF postcode ~ '^([0-9]{6})$' THEN
rank_search := 25;
rank_address := 11;
END IF;
ELSEIF country_code = 'de' THEN
IF postcode ~ '^([0-9]{5})$' THEN
rank_search := 21;
rank_address := 11;
END IF;
ELSE
-- Guess at the postcode format and coverage (!)
IF postcode ~ '^[A-Z0-9]{1,5}$' THEN -- Probably too short to be very local
rank_search := 21;
rank_address := 11;
ELSE
-- Does it look splitable into and area and local code?
part := substring(postcode from '^([- :A-Z0-9]+)([- :][A-Z0-9]+)$');
IF part IS NOT NULL THEN
rank_search := 25;
rank_address := 11;
ELSEIF postcode ~ '^[- :A-Z0-9]{6,}$' THEN
rank_search := 21;
rank_address := 11;
END IF;
END IF;
END IF;
END;
$$
LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE;
-- Get standard search and address rank for an object.
--
-- \param country Two-letter country code where the object is in.
@@ -198,12 +138,7 @@ AS $$
DECLARE
classtype TEXT;
BEGIN
IF place_class in ('place','boundary')
and place_type in ('postcode','postal_code')
THEN
SELECT * INTO search_rank, address_rank
FROM get_postcode_rank(country, postcode);
ELSEIF extended_type = 'N' AND place_class = 'highway' THEN
IF extended_type = 'N' AND place_class = 'highway' THEN
search_rank = 30;
address_rank = 30;
ELSEIF place_class = 'landuse' AND extended_type != 'A' THEN

View File

@@ -323,6 +323,17 @@ END;
$$
LANGUAGE plpgsql;
-- Return the bounding box of the geometry buffered by the given number
-- of meters.
CREATE OR REPLACE FUNCTION expand_by_meters(geom GEOMETRY, meters FLOAT)
RETURNS GEOMETRY
AS $$
SELECT ST_Envelope(ST_Buffer(geom::geography, meters, 1)::geometry)
$$
LANGUAGE sql IMMUTABLE PARALLEL SAFE;
-- Create a bounding box with an extent computed from the radius (in meters)
-- which in turn is derived from the given search rank.
CREATE OR REPLACE FUNCTION place_node_fuzzy_area(geom GEOMETRY, rank_search INTEGER)
@@ -341,9 +352,7 @@ BEGIN
radius := 1000;
END IF;
RETURN ST_Envelope(ST_Collect(
ST_Project(geom::geography, radius, 0.785398)::geometry,
ST_Project(geom::geography, radius, 3.9269908)::geometry));
RETURN expand_by_meters(geom, radius);
END;
$$
LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE;

View File

@@ -26,4 +26,8 @@ CREATE TRIGGER place_before_insert BEFORE INSERT ON place
FOR EACH ROW EXECUTE PROCEDURE place_insert();
CREATE TRIGGER location_postcode_before_update BEFORE UPDATE ON location_postcodes
FOR EACH ROW EXECUTE PROCEDURE postcode_update();
FOR EACH ROW EXECUTE PROCEDURE postcodes_update();
CREATE TRIGGER location_postcodes_before_delete BEFORE DELETE ON location_postcodes
FOR EACH ROW EXECUTE PROCEDURE postcodes_delete();
CREATE TRIGGER location_postcodes_before_insert BEFORE INSERT ON location_postcodes
FOR EACH ROW EXECUTE PROCEDURE postcodes_insert();

View File

@@ -8,7 +8,7 @@
Functions for importing, updating and otherwise maintaining the table
of artificial postcode centroids.
"""
from typing import Optional, Tuple, Dict, List, TextIO
from typing import Optional, Tuple, Dict, TextIO
from collections import defaultdict
from pathlib import Path
import csv
@@ -38,13 +38,26 @@ def _to_float(numstr: str, max_value: float) -> float:
return num
def _extent_to_rank(extent: int) -> int:
""" Guess a suitable search rank from the extent of a postcode.
"""
if extent <= 100:
return 25
if extent <= 3000:
return 23
return 21
class _PostcodeCollector:
""" Collector for postcodes of a single country.
"""
def __init__(self, country: str, matcher: Optional[CountryPostcodeMatcher]):
def __init__(self, country: str, matcher: Optional[CountryPostcodeMatcher],
default_extent: int, exclude: set[str] = set()):
self.country = country
self.matcher = matcher
self.extent = default_extent
self.exclude = exclude
self.collected: Dict[str, PointsCentroid] = defaultdict(PointsCentroid)
self.normalization_cache: Optional[Tuple[str, Optional[str]]] = None
@@ -61,7 +74,7 @@ class _PostcodeCollector:
normalized = self.matcher.normalize(match) if match else None
self.normalization_cache = (postcode, normalized)
if normalized:
if normalized and normalized not in self.exclude:
self.collected[normalized] += (x, y)
def commit(self, conn: Connection, analyzer: AbstractAnalyzer,
@@ -73,61 +86,37 @@ class _PostcodeCollector:
"""
if project_dir is not None:
self._update_from_external(analyzer, project_dir)
to_add, to_delete, to_update = self._compute_changes(conn)
LOG.info("Processing country '%s' (%s added, %s deleted, %s updated).",
self.country, len(to_add), len(to_delete), len(to_update))
with conn.cursor() as cur:
cur.execute("""SELECT postcode FROM location_postcodes
WHERE country_code = %s AND osm_id is null""",
(self.country, ))
to_delete = [row[0] for row in cur if row[0] not in self.collected]
to_add = [dict(zip(('pc', 'x', 'y'), (k, *v.centroid())))
for k, v in self.collected.items()]
self.collected = defaultdict(PointsCentroid)
LOG.info("Processing country '%s' (%s added, %s deleted).",
self.country, len(to_add), len(to_delete))
with conn.cursor() as cur:
if to_add:
cur.executemany(pysql.SQL(
"""INSERT INTO location_postcode
(place_id, indexed_status, country_code,
postcode, geometry)
VALUES (nextval('seq_place'), 1, {}, %s,
ST_SetSRID(ST_MakePoint(%s, %s), 4326))
""").format(pysql.Literal(self.country)),
"""INSERT INTO location_postcodes
(country_code, rank_search, postcode, centroid, geometry)
VALUES ({}, {}, %(pc)s,
ST_SetSRID(ST_MakePoint(%(x)s, %(y)s), 4326),
expand_by_meters(ST_SetSRID(ST_MakePoint(%(x)s, %(y)s), 4326), {}))
""").format(pysql.Literal(self.country),
pysql.Literal(_extent_to_rank(self.extent)),
pysql.Literal(self.extent)),
to_add)
if to_delete:
cur.execute("""DELETE FROM location_postcode
cur.execute("""DELETE FROM location_postcodes
WHERE country_code = %s and postcode = any(%s)
AND osm_id is null
""", (self.country, to_delete))
if to_update:
cur.executemany(
pysql.SQL("""UPDATE location_postcode
SET indexed_status = 2,
geometry = ST_SetSRID(ST_Point(%s, %s), 4326)
WHERE country_code = {} and postcode = %s
""").format(pysql.Literal(self.country)),
to_update)
def _compute_changes(
self, conn: Connection
) -> Tuple[List[Tuple[str, float, float]], List[str], List[Tuple[float, float, str]]]:
""" Compute which postcodes from the collected postcodes have to be
added or modified and which from the location_postcode table
have to be deleted.
"""
to_update = []
to_delete = []
with conn.cursor() as cur:
cur.execute("""SELECT postcode, ST_X(geometry), ST_Y(geometry)
FROM location_postcode
WHERE country_code = %s""",
(self.country, ))
for postcode, x, y in cur:
pcobj = self.collected.pop(postcode, None)
if pcobj:
newx, newy = pcobj.centroid()
if abs(x - newx) > 0.0000001 or abs(y - newy) > 0.0000001:
to_update.append((newx, newy, postcode))
else:
to_delete.append(postcode)
to_add = [(k, *v.centroid()) for k, v in self.collected.items()]
self.collected = defaultdict(PointsCentroid)
return to_add, to_delete, to_update
def _update_from_external(self, analyzer: AbstractAnalyzer, project_dir: Path) -> None:
""" Look for an external postcode file for the active country in
@@ -152,7 +141,7 @@ class _PostcodeCollector:
_to_float(row['lat'], 90))
self.collected[postcode] += centroid
except ValueError:
LOG.warning("Bad coordinates %s, %s in %s country postcode file.",
LOG.warning("Bad coordinates %s, %s in '%s' country postcode file.",
row['lat'], row['lon'], self.country)
finally:
@@ -175,35 +164,130 @@ class _PostcodeCollector:
def update_postcodes(dsn: str, project_dir: Optional[Path], tokenizer: AbstractTokenizer) -> None:
""" Update the table of artificial postcodes.
Computes artificial postcode centroids from the placex table,
potentially enhances it with external data and then updates the
postcodes in the table 'location_postcode'.
""" Update the table of postcodes from the input tables
placex and place_postcode.
"""
matcher = PostcodeFormatter()
with tokenizer.name_analyzer() as analyzer:
with connect(dsn) as conn:
# Backfill country_code column where required
conn.execute("""UPDATE place_postcode
SET country_code = get_country_code(centroid)
WHERE country_code is null
""")
# Now update first postcode areas
_update_postcode_areas(conn, analyzer, matcher)
# Then fill with estimated postcode centroids from other info
_update_guessed_postcode(conn, analyzer, matcher, project_dir)
conn.commit()
analyzer.update_postcodes_from_db()
def _insert_postcode_areas(conn: Connection, country_code: str,
extent: int, pcs: list[dict[str, str]]) -> None:
if pcs:
with conn.cursor() as cur:
cur.executemany(
pysql.SQL(
""" INSERT INTO location_postcodes
(osm_id, country_code, rank_search, postcode, centroid, geometry)
SELECT osm_id, country_code, {}, %(out)s, centroid, geometry
FROM place_postcode
WHERE osm_type = 'R'
and country_code = {} and postcode = %(in)s
and geometry is not null
""").format(pysql.Literal(_extent_to_rank(extent)),
pysql.Literal(country_code)),
pcs)
def _update_postcode_areas(conn: Connection, analyzer: AbstractAnalyzer,
matcher: PostcodeFormatter) -> None:
""" Update the postcode areas made from postcode boundaries.
"""
# first delete all areas that have gone
conn.execute(""" DELETE FROM location_postcodes pc
WHERE pc.osm_id is not null
AND NOT EXISTS(
SELECT * FROM place_postcode pp
WHERE pp.osm_type = 'R' and pp.osm_id = pc.osm_id
and geometry is not null)
""")
# now insert all in country batches, triggers will ensure proper updates
with conn.cursor() as cur:
cur.execute(""" SELECT country_code, postcode FROM place_postcode
WHERE geometry is not null and osm_type = 'R'
ORDER BY country_code
""")
country_code = None
fmt = None
pcs = []
for cc, postcode in cur:
if country_code is None:
country_code = cc
fmt = matcher.get_matcher(country_code)
elif country_code != cc:
_insert_postcode_areas(conn, country_code,
matcher.get_postcode_extent(country_code), pcs)
country_code = cc
fmt = matcher.get_matcher(country_code)
pcs = []
assert fmt is not None
if (m := fmt.match(postcode)):
pcs.append({'out': fmt.normalize(m), 'in': postcode})
if country_code is not None and pcs:
_insert_postcode_areas(conn, country_code,
matcher.get_postcode_extent(country_code), pcs)
def _update_guessed_postcode(conn: Connection, analyzer: AbstractAnalyzer,
matcher: PostcodeFormatter, project_dir: Optional[Path]) -> None:
""" Computes artificial postcode centroids from the placex table,
potentially enhances it with external data and then updates the
postcodes in the table 'location_postcodes'.
"""
# First get the list of countries that currently have postcodes.
# (Doing this before starting to insert, so it is fast on import.)
with conn.cursor() as cur:
cur.execute("SELECT DISTINCT country_code FROM location_postcode")
todo_countries = set((row[0] for row in cur))
cur.execute("""SELECT DISTINCT country_code FROM location_postcodes
WHERE osm_id is null""")
todo_countries = {row[0] for row in cur}
# Next, get the list of postcodes that are already covered by areas.
area_pcs = defaultdict(set)
with conn.cursor() as cur:
cur.execute("""SELECT country_code, postcode
FROM location_postcodes WHERE osm_id is not null
ORDER BY country_code""")
for cc, pc in cur:
area_pcs[cc].add(pc)
# Create a temporary table which contains coverage of the postcode areas.
with conn.cursor() as cur:
cur.execute("DROP TABLE IF EXISTS _global_postcode_area")
cur.execute("""CREATE TABLE _global_postcode_area AS
(SELECT ST_SubDivide(ST_Union(geometry)) as geometry
FROM place_postcode WHERE geometry is not null)
""")
cur.execute("CREATE INDEX ON _global_postcode_area USING gist(geometry)")
# Recompute the list of valid postcodes from placex.
with conn.cursor(name="placex_postcodes") as cur:
cur.execute("""
SELECT cc, pc, ST_X(centroid), ST_Y(centroid)
FROM (SELECT
COALESCE(plx.country_code,
get_country_code(ST_Centroid(pl.geometry))) as cc,
pl.address->'postcode' as pc,
COALESCE(plx.centroid, ST_Centroid(pl.geometry)) as centroid
FROM place AS pl LEFT OUTER JOIN placex AS plx
ON pl.osm_id = plx.osm_id AND pl.osm_type = plx.osm_type
WHERE pl.address ? 'postcode' AND pl.geometry IS NOT null) xx
WHERE pc IS NOT null AND cc IS NOT null
ORDER BY cc, pc""")
SELECT country_code, postcode, ST_X(centroid), ST_Y(centroid)
FROM (
(SELECT country_code, address->'postcode' as postcode, centroid
FROM placex WHERE address ? 'postcode')
UNION
(SELECT country_code, postcode, centroid
FROM place_postcode WHERE geometry is null)
) x
WHERE not postcode like '%,%' and not postcode like '%;%'
AND NOT EXISTS(SELECT * FROM _global_postcode_area g
WHERE ST_Intersects(x.centroid, g.geometry))
ORDER BY country_code""")
collector = None
@@ -211,7 +295,9 @@ def update_postcodes(dsn: str, project_dir: Optional[Path], tokenizer: AbstractT
if collector is None or country != collector.country:
if collector is not None:
collector.commit(conn, analyzer, project_dir)
collector = _PostcodeCollector(country, matcher.get_matcher(country))
collector = _PostcodeCollector(country, matcher.get_matcher(country),
matcher.get_postcode_extent(country),
exclude=area_pcs[country])
todo_countries.discard(country)
collector.add(postcode, x, y)
@@ -221,17 +307,15 @@ def update_postcodes(dsn: str, project_dir: Optional[Path], tokenizer: AbstractT
# Now handle any countries that are only in the postcode table.
for country in todo_countries:
fmt = matcher.get_matcher(country)
_PostcodeCollector(country, fmt).commit(conn, analyzer, project_dir)
ext = matcher.get_postcode_extent(country)
_PostcodeCollector(country, fmt, ext,
exclude=area_pcs[country]).commit(conn, analyzer, project_dir)
conn.commit()
analyzer.update_postcodes_from_db()
conn.execute("DROP TABLE IF EXISTS _global_postcode_area")
def can_compute(dsn: str) -> bool:
"""
Check that the place table exists so that
postcodes can be computed.
""" Check that the necessary tables exist so that postcodes can be computed.
"""
with connect(dsn) as conn:
return table_exists(conn, 'place')
return table_exists(conn, 'place_postcode')