From d1cb5785350347592f15be8498dc5115d7dce47a Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Tue, 23 Dec 2025 15:46:36 +0100 Subject: [PATCH] rework postcode computation Now adds areas to location_postcodes, ignores postcode points inside areas and supports customizable extents. --- lib-sql/functions/postcode_triggers.sql | 97 +++++++- lib-sql/functions/ranking.sql | 69 +----- lib-sql/functions/utils.sql | 15 +- lib-sql/table-triggers.sql | 6 +- src/nominatim_db/tools/postcodes.py | 280 +++++++++++++++--------- 5 files changed, 291 insertions(+), 176 deletions(-) diff --git a/lib-sql/functions/postcode_triggers.sql b/lib-sql/functions/postcode_triggers.sql index 3465e775..9518b39c 100644 --- a/lib-sql/functions/postcode_triggers.sql +++ b/lib-sql/functions/postcode_triggers.sql @@ -2,10 +2,10 @@ -- -- This file is part of Nominatim. (https://nominatim.org) -- --- Copyright (C) 2022 by the Nominatim developer community. +-- Copyright (C) 2025 by the Nominatim developer community. -- For a full list of authors see the git log. --- Trigger functions for location_postcode table. +-- Trigger functions for location_postcodes table. -- Trigger for updates of location_postcode @@ -13,7 +13,7 @@ -- Computes the parent object the postcode most likely refers to. -- This will be the place that determines the address displayed when -- searching for this postcode. -CREATE OR REPLACE FUNCTION postcode_update() +CREATE OR REPLACE FUNCTION postcodes_update() RETURNS TRIGGER AS $$ DECLARE @@ -28,13 +28,10 @@ BEGIN partition := get_partition(NEW.country_code); - SELECT * FROM get_postcode_rank(NEW.country_code, NEW.postcode) - INTO NEW.rank_search, NEW.rank_address; - NEW.parent_place_id = 0; FOR location IN SELECT place_id - FROM getNearFeatures(partition, NEW.geometry, NEW.geometry, NEW.rank_search) + FROM getNearFeatures(partition, NEW.centroid, NEW.centroid, NEW.rank_search) WHERE NOT isguess ORDER BY rank_address DESC, distance asc LIMIT 1 LOOP NEW.parent_place_id = location.place_id; @@ -45,3 +42,89 @@ END; $$ LANGUAGE plpgsql; + +CREATE OR REPLACE FUNCTION postcodes_delete() + RETURNS TRIGGER + AS $$ +BEGIN +{% if not disable_diff_updates %} + UPDATE placex p SET indexed_status = 2 + WHERE p.postcode = OLD.postcode AND ST_Intersects(OLD.geometry, p.geometry) + AND indexed_status = 0; +{% endif %} + + RETURN OLD; +END; +$$ +LANGUAGE plpgsql; + + +CREATE OR REPLACE FUNCTION postcodes_insert() + RETURNS TRIGGER + AS $$ +DECLARE + existing RECORD; +BEGIN + IF NEW.osm_id is not NULL THEN + -- postcode area, remove existing from same OSM object + SELECT * INTO existing FROM location_postcodes p + WHERE p.osm_id = NEW.osm_id; + + IF existing.place_id is not NULL THEN + IF existing.postcode != NEW.postcode or existing.country_code != NEW.country_code THEN + DELETE FROM location_postcodes p WHERE p.osm_id = NEW.osm_id; + existing := NULL; + END IF; + END IF; + END IF; + + IF existing is NULL THEN + SELECT * INTO existing FROM location_postcodes p + WHERE p.country_code = NEW.country_code AND p.postcode = NEW.postcode; + + IF existing.postcode is NULL THEN +{% if not disable_diff_updates %} + UPDATE placex p SET indexed_status = 2 + WHERE ST_Intersects(NEW.geometry, p.geometry) + AND indexed_status = 0 + AND p.rank_address >= 22 AND not p.address ? 'postcode'; +{% endif %} + + -- new entry, just insert + NEW.indexed_status := 1; + NEW.place_id := nextval('seq_place'); + RETURN NEW; + END IF; + END IF; + + -- update: only when there are changes + IF coalesce(NEW.osm_id, -1) != coalesce(existing.osm_id, -1) + OR (NEW.osm_id is not null AND NEW.geometry::text != existing.geometry::text) + OR (NEW.osm_id is null + AND (abs(ST_X(existing.centroid) - ST_X(NEW.centroid)) > 0.0000001 + OR abs(ST_Y(existing.centroid) - ST_Y(NEW.centroid)) > 0.0000001)) + THEN +{% if not disable_diff_updates %} + UPDATE placex p SET indexed_status = 2 + WHERE ST_Intersects(ST_Difference(NEW.geometry, existing.geometry), p.geometry) + AND indexed_status = 0 + AND p.rank_address >= 22 AND not p.address ? 'postcode'; + + UPDATE placex p SET indexed_status = 2 + WHERE ST_Intersects(ST_Difference(existing.geometry, NEW.geometry), p.geometry) + AND indexed_status = 0 + AND p.postcode = OLD.postcode; +{% endif %} + + UPDATE location_postcodes p + SET osm_id = NEW.osm_id, + indexed_status = 2, + centroid = NEW.centroid, + geometry = NEW.geometry + WHERE p.country_code = NEW.country_code AND p.postcode = NEW.postcode; + END IF; + + RETURN NULL; +END; +$$ +LANGUAGE plpgsql; diff --git a/lib-sql/functions/ranking.sql b/lib-sql/functions/ranking.sql index cee72fcb..6a7c5278 100644 --- a/lib-sql/functions/ranking.sql +++ b/lib-sql/functions/ranking.sql @@ -2,7 +2,7 @@ -- -- This file is part of Nominatim. (https://nominatim.org) -- --- Copyright (C) 2022 by the Nominatim developer community. +-- Copyright (C) 2025 by the Nominatim developer community. -- For a full list of authors see the git log. -- Functions related to search and address ranks @@ -114,66 +114,6 @@ $$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE; --- Guess a ranking for postcodes from country and postcode format. -CREATE OR REPLACE FUNCTION get_postcode_rank(country_code VARCHAR(2), postcode TEXT, - OUT rank_search SMALLINT, - OUT rank_address SMALLINT) -AS $$ -DECLARE - part TEXT; -BEGIN - rank_search := 30; - rank_address := 30; - postcode := upper(postcode); - - IF country_code = 'gb' THEN - IF postcode ~ '^([A-Z][A-Z]?[0-9][0-9A-Z]? [0-9][A-Z][A-Z])$' THEN - rank_search := 25; - rank_address := 5; - ELSEIF postcode ~ '^([A-Z][A-Z]?[0-9][0-9A-Z]? [0-9])$' THEN - rank_search := 23; - rank_address := 5; - ELSEIF postcode ~ '^([A-Z][A-Z]?[0-9][0-9A-Z])$' THEN - rank_search := 21; - rank_address := 5; - END IF; - - ELSEIF country_code = 'sg' THEN - IF postcode ~ '^([0-9]{6})$' THEN - rank_search := 25; - rank_address := 11; - END IF; - - ELSEIF country_code = 'de' THEN - IF postcode ~ '^([0-9]{5})$' THEN - rank_search := 21; - rank_address := 11; - END IF; - - ELSE - -- Guess at the postcode format and coverage (!) - IF postcode ~ '^[A-Z0-9]{1,5}$' THEN -- Probably too short to be very local - rank_search := 21; - rank_address := 11; - ELSE - -- Does it look splitable into and area and local code? - part := substring(postcode from '^([- :A-Z0-9]+)([- :][A-Z0-9]+)$'); - - IF part IS NOT NULL THEN - rank_search := 25; - rank_address := 11; - ELSEIF postcode ~ '^[- :A-Z0-9]{6,}$' THEN - rank_search := 21; - rank_address := 11; - END IF; - END IF; - END IF; - -END; -$$ -LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE; - - -- Get standard search and address rank for an object. -- -- \param country Two-letter country code where the object is in. @@ -198,12 +138,7 @@ AS $$ DECLARE classtype TEXT; BEGIN - IF place_class in ('place','boundary') - and place_type in ('postcode','postal_code') - THEN - SELECT * INTO search_rank, address_rank - FROM get_postcode_rank(country, postcode); - ELSEIF extended_type = 'N' AND place_class = 'highway' THEN + IF extended_type = 'N' AND place_class = 'highway' THEN search_rank = 30; address_rank = 30; ELSEIF place_class = 'landuse' AND extended_type != 'A' THEN diff --git a/lib-sql/functions/utils.sql b/lib-sql/functions/utils.sql index 9a6228e4..3baf8240 100644 --- a/lib-sql/functions/utils.sql +++ b/lib-sql/functions/utils.sql @@ -323,6 +323,17 @@ END; $$ LANGUAGE plpgsql; + +-- Return the bounding box of the geometry buffered by the given number +-- of meters. +CREATE OR REPLACE FUNCTION expand_by_meters(geom GEOMETRY, meters FLOAT) + RETURNS GEOMETRY + AS $$ + SELECT ST_Envelope(ST_Buffer(geom::geography, meters, 1)::geometry) +$$ +LANGUAGE sql IMMUTABLE PARALLEL SAFE; + + -- Create a bounding box with an extent computed from the radius (in meters) -- which in turn is derived from the given search rank. CREATE OR REPLACE FUNCTION place_node_fuzzy_area(geom GEOMETRY, rank_search INTEGER) @@ -341,9 +352,7 @@ BEGIN radius := 1000; END IF; - RETURN ST_Envelope(ST_Collect( - ST_Project(geom::geography, radius, 0.785398)::geometry, - ST_Project(geom::geography, radius, 3.9269908)::geometry)); + RETURN expand_by_meters(geom, radius); END; $$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE; diff --git a/lib-sql/table-triggers.sql b/lib-sql/table-triggers.sql index e178e621..f9c3f890 100644 --- a/lib-sql/table-triggers.sql +++ b/lib-sql/table-triggers.sql @@ -26,4 +26,8 @@ CREATE TRIGGER place_before_insert BEFORE INSERT ON place FOR EACH ROW EXECUTE PROCEDURE place_insert(); CREATE TRIGGER location_postcode_before_update BEFORE UPDATE ON location_postcodes - FOR EACH ROW EXECUTE PROCEDURE postcode_update(); + FOR EACH ROW EXECUTE PROCEDURE postcodes_update(); +CREATE TRIGGER location_postcodes_before_delete BEFORE DELETE ON location_postcodes + FOR EACH ROW EXECUTE PROCEDURE postcodes_delete(); +CREATE TRIGGER location_postcodes_before_insert BEFORE INSERT ON location_postcodes + FOR EACH ROW EXECUTE PROCEDURE postcodes_insert(); diff --git a/src/nominatim_db/tools/postcodes.py b/src/nominatim_db/tools/postcodes.py index 555a5810..d64e7b57 100644 --- a/src/nominatim_db/tools/postcodes.py +++ b/src/nominatim_db/tools/postcodes.py @@ -8,7 +8,7 @@ Functions for importing, updating and otherwise maintaining the table of artificial postcode centroids. """ -from typing import Optional, Tuple, Dict, List, TextIO +from typing import Optional, Tuple, Dict, TextIO from collections import defaultdict from pathlib import Path import csv @@ -38,13 +38,26 @@ def _to_float(numstr: str, max_value: float) -> float: return num +def _extent_to_rank(extent: int) -> int: + """ Guess a suitable search rank from the extent of a postcode. + """ + if extent <= 100: + return 25 + if extent <= 3000: + return 23 + return 21 + + class _PostcodeCollector: """ Collector for postcodes of a single country. """ - def __init__(self, country: str, matcher: Optional[CountryPostcodeMatcher]): + def __init__(self, country: str, matcher: Optional[CountryPostcodeMatcher], + default_extent: int, exclude: set[str] = set()): self.country = country self.matcher = matcher + self.extent = default_extent + self.exclude = exclude self.collected: Dict[str, PointsCentroid] = defaultdict(PointsCentroid) self.normalization_cache: Optional[Tuple[str, Optional[str]]] = None @@ -61,7 +74,7 @@ class _PostcodeCollector: normalized = self.matcher.normalize(match) if match else None self.normalization_cache = (postcode, normalized) - if normalized: + if normalized and normalized not in self.exclude: self.collected[normalized] += (x, y) def commit(self, conn: Connection, analyzer: AbstractAnalyzer, @@ -73,61 +86,37 @@ class _PostcodeCollector: """ if project_dir is not None: self._update_from_external(analyzer, project_dir) - to_add, to_delete, to_update = self._compute_changes(conn) - LOG.info("Processing country '%s' (%s added, %s deleted, %s updated).", - self.country, len(to_add), len(to_delete), len(to_update)) + with conn.cursor() as cur: + cur.execute("""SELECT postcode FROM location_postcodes + WHERE country_code = %s AND osm_id is null""", + (self.country, )) + to_delete = [row[0] for row in cur if row[0] not in self.collected] + + to_add = [dict(zip(('pc', 'x', 'y'), (k, *v.centroid()))) + for k, v in self.collected.items()] + self.collected = defaultdict(PointsCentroid) + + LOG.info("Processing country '%s' (%s added, %s deleted).", + self.country, len(to_add), len(to_delete)) with conn.cursor() as cur: if to_add: cur.executemany(pysql.SQL( - """INSERT INTO location_postcode - (place_id, indexed_status, country_code, - postcode, geometry) - VALUES (nextval('seq_place'), 1, {}, %s, - ST_SetSRID(ST_MakePoint(%s, %s), 4326)) - """).format(pysql.Literal(self.country)), + """INSERT INTO location_postcodes + (country_code, rank_search, postcode, centroid, geometry) + VALUES ({}, {}, %(pc)s, + ST_SetSRID(ST_MakePoint(%(x)s, %(y)s), 4326), + expand_by_meters(ST_SetSRID(ST_MakePoint(%(x)s, %(y)s), 4326), {})) + """).format(pysql.Literal(self.country), + pysql.Literal(_extent_to_rank(self.extent)), + pysql.Literal(self.extent)), to_add) if to_delete: - cur.execute("""DELETE FROM location_postcode + cur.execute("""DELETE FROM location_postcodes WHERE country_code = %s and postcode = any(%s) + AND osm_id is null """, (self.country, to_delete)) - if to_update: - cur.executemany( - pysql.SQL("""UPDATE location_postcode - SET indexed_status = 2, - geometry = ST_SetSRID(ST_Point(%s, %s), 4326) - WHERE country_code = {} and postcode = %s - """).format(pysql.Literal(self.country)), - to_update) - - def _compute_changes( - self, conn: Connection - ) -> Tuple[List[Tuple[str, float, float]], List[str], List[Tuple[float, float, str]]]: - """ Compute which postcodes from the collected postcodes have to be - added or modified and which from the location_postcode table - have to be deleted. - """ - to_update = [] - to_delete = [] - with conn.cursor() as cur: - cur.execute("""SELECT postcode, ST_X(geometry), ST_Y(geometry) - FROM location_postcode - WHERE country_code = %s""", - (self.country, )) - for postcode, x, y in cur: - pcobj = self.collected.pop(postcode, None) - if pcobj: - newx, newy = pcobj.centroid() - if abs(x - newx) > 0.0000001 or abs(y - newy) > 0.0000001: - to_update.append((newx, newy, postcode)) - else: - to_delete.append(postcode) - - to_add = [(k, *v.centroid()) for k, v in self.collected.items()] - self.collected = defaultdict(PointsCentroid) - - return to_add, to_delete, to_update def _update_from_external(self, analyzer: AbstractAnalyzer, project_dir: Path) -> None: """ Look for an external postcode file for the active country in @@ -152,7 +141,7 @@ class _PostcodeCollector: _to_float(row['lat'], 90)) self.collected[postcode] += centroid except ValueError: - LOG.warning("Bad coordinates %s, %s in %s country postcode file.", + LOG.warning("Bad coordinates %s, %s in '%s' country postcode file.", row['lat'], row['lon'], self.country) finally: @@ -175,63 +164,158 @@ class _PostcodeCollector: def update_postcodes(dsn: str, project_dir: Optional[Path], tokenizer: AbstractTokenizer) -> None: - """ Update the table of artificial postcodes. - - Computes artificial postcode centroids from the placex table, - potentially enhances it with external data and then updates the - postcodes in the table 'location_postcode'. + """ Update the table of postcodes from the input tables + placex and place_postcode. """ matcher = PostcodeFormatter() with tokenizer.name_analyzer() as analyzer: with connect(dsn) as conn: - # First get the list of countries that currently have postcodes. - # (Doing this before starting to insert, so it is fast on import.) - with conn.cursor() as cur: - cur.execute("SELECT DISTINCT country_code FROM location_postcode") - todo_countries = set((row[0] for row in cur)) - - # Recompute the list of valid postcodes from placex. - with conn.cursor(name="placex_postcodes") as cur: - cur.execute(""" - SELECT cc, pc, ST_X(centroid), ST_Y(centroid) - FROM (SELECT - COALESCE(plx.country_code, - get_country_code(ST_Centroid(pl.geometry))) as cc, - pl.address->'postcode' as pc, - COALESCE(plx.centroid, ST_Centroid(pl.geometry)) as centroid - FROM place AS pl LEFT OUTER JOIN placex AS plx - ON pl.osm_id = plx.osm_id AND pl.osm_type = plx.osm_type - WHERE pl.address ? 'postcode' AND pl.geometry IS NOT null) xx - WHERE pc IS NOT null AND cc IS NOT null - ORDER BY cc, pc""") - - collector = None - - for country, postcode, x, y in cur: - if collector is None or country != collector.country: - if collector is not None: - collector.commit(conn, analyzer, project_dir) - collector = _PostcodeCollector(country, matcher.get_matcher(country)) - todo_countries.discard(country) - collector.add(postcode, x, y) - - if collector is not None: - collector.commit(conn, analyzer, project_dir) - - # Now handle any countries that are only in the postcode table. - for country in todo_countries: - fmt = matcher.get_matcher(country) - _PostcodeCollector(country, fmt).commit(conn, analyzer, project_dir) - + # Backfill country_code column where required + conn.execute("""UPDATE place_postcode + SET country_code = get_country_code(centroid) + WHERE country_code is null + """) + # Now update first postcode areas + _update_postcode_areas(conn, analyzer, matcher) + # Then fill with estimated postcode centroids from other info + _update_guessed_postcode(conn, analyzer, matcher, project_dir) conn.commit() analyzer.update_postcodes_from_db() -def can_compute(dsn: str) -> bool: +def _insert_postcode_areas(conn: Connection, country_code: str, + extent: int, pcs: list[dict[str, str]]) -> None: + if pcs: + with conn.cursor() as cur: + cur.executemany( + pysql.SQL( + """ INSERT INTO location_postcodes + (osm_id, country_code, rank_search, postcode, centroid, geometry) + SELECT osm_id, country_code, {}, %(out)s, centroid, geometry + FROM place_postcode + WHERE osm_type = 'R' + and country_code = {} and postcode = %(in)s + and geometry is not null + """).format(pysql.Literal(_extent_to_rank(extent)), + pysql.Literal(country_code)), + pcs) + + +def _update_postcode_areas(conn: Connection, analyzer: AbstractAnalyzer, + matcher: PostcodeFormatter) -> None: + """ Update the postcode areas made from postcode boundaries. """ - Check that the place table exists so that - postcodes can be computed. + # first delete all areas that have gone + conn.execute(""" DELETE FROM location_postcodes pc + WHERE pc.osm_id is not null + AND NOT EXISTS( + SELECT * FROM place_postcode pp + WHERE pp.osm_type = 'R' and pp.osm_id = pc.osm_id + and geometry is not null) + """) + # now insert all in country batches, triggers will ensure proper updates + with conn.cursor() as cur: + cur.execute(""" SELECT country_code, postcode FROM place_postcode + WHERE geometry is not null and osm_type = 'R' + ORDER BY country_code + """) + country_code = None + fmt = None + pcs = [] + for cc, postcode in cur: + if country_code is None: + country_code = cc + fmt = matcher.get_matcher(country_code) + elif country_code != cc: + _insert_postcode_areas(conn, country_code, + matcher.get_postcode_extent(country_code), pcs) + country_code = cc + fmt = matcher.get_matcher(country_code) + pcs = [] + assert fmt is not None + + if (m := fmt.match(postcode)): + pcs.append({'out': fmt.normalize(m), 'in': postcode}) + + if country_code is not None and pcs: + _insert_postcode_areas(conn, country_code, + matcher.get_postcode_extent(country_code), pcs) + + +def _update_guessed_postcode(conn: Connection, analyzer: AbstractAnalyzer, + matcher: PostcodeFormatter, project_dir: Optional[Path]) -> None: + """ Computes artificial postcode centroids from the placex table, + potentially enhances it with external data and then updates the + postcodes in the table 'location_postcodes'. + """ + # First get the list of countries that currently have postcodes. + # (Doing this before starting to insert, so it is fast on import.) + with conn.cursor() as cur: + cur.execute("""SELECT DISTINCT country_code FROM location_postcodes + WHERE osm_id is null""") + todo_countries = {row[0] for row in cur} + + # Next, get the list of postcodes that are already covered by areas. + area_pcs = defaultdict(set) + with conn.cursor() as cur: + cur.execute("""SELECT country_code, postcode + FROM location_postcodes WHERE osm_id is not null + ORDER BY country_code""") + for cc, pc in cur: + area_pcs[cc].add(pc) + + # Create a temporary table which contains coverage of the postcode areas. + with conn.cursor() as cur: + cur.execute("DROP TABLE IF EXISTS _global_postcode_area") + cur.execute("""CREATE TABLE _global_postcode_area AS + (SELECT ST_SubDivide(ST_Union(geometry)) as geometry + FROM place_postcode WHERE geometry is not null) + """) + cur.execute("CREATE INDEX ON _global_postcode_area USING gist(geometry)") + # Recompute the list of valid postcodes from placex. + with conn.cursor(name="placex_postcodes") as cur: + cur.execute(""" + SELECT country_code, postcode, ST_X(centroid), ST_Y(centroid) + FROM ( + (SELECT country_code, address->'postcode' as postcode, centroid + FROM placex WHERE address ? 'postcode') + UNION + (SELECT country_code, postcode, centroid + FROM place_postcode WHERE geometry is null) + ) x + WHERE not postcode like '%,%' and not postcode like '%;%' + AND NOT EXISTS(SELECT * FROM _global_postcode_area g + WHERE ST_Intersects(x.centroid, g.geometry)) + ORDER BY country_code""") + + collector = None + + for country, postcode, x, y in cur: + if collector is None or country != collector.country: + if collector is not None: + collector.commit(conn, analyzer, project_dir) + collector = _PostcodeCollector(country, matcher.get_matcher(country), + matcher.get_postcode_extent(country), + exclude=area_pcs[country]) + todo_countries.discard(country) + collector.add(postcode, x, y) + + if collector is not None: + collector.commit(conn, analyzer, project_dir) + + # Now handle any countries that are only in the postcode table. + for country in todo_countries: + fmt = matcher.get_matcher(country) + ext = matcher.get_postcode_extent(country) + _PostcodeCollector(country, fmt, ext, + exclude=area_pcs[country]).commit(conn, analyzer, project_dir) + + conn.execute("DROP TABLE IF EXISTS _global_postcode_area") + + +def can_compute(dsn: str) -> bool: + """ Check that the necessary tables exist so that postcodes can be computed. """ with connect(dsn) as conn: - return table_exists(conn, 'place') + return table_exists(conn, 'place_postcode')