move postcode centroid computation to Python

This commit is contained in:
Sarah Hoffmann
2022-06-06 10:46:48 +02:00
parent 4885fdf0f9
commit bf86b45178

View File

@@ -8,6 +8,7 @@
Functions for importing, updating and otherwise maintaining the table Functions for importing, updating and otherwise maintaining the table
of artificial postcode centroids. of artificial postcode centroids.
""" """
from collections import defaultdict
import csv import csv
import gzip import gzip
import logging import logging
@@ -16,6 +17,7 @@ from math import isfinite
from psycopg2 import sql as pysql from psycopg2 import sql as pysql
from nominatim.db.connection import connect from nominatim.db.connection import connect
from nominatim.utils.centroid import PointsCentroid
LOG = logging.getLogger() LOG = logging.getLogger()
@@ -36,14 +38,14 @@ class _CountryPostcodesCollector:
def __init__(self, country): def __init__(self, country):
self.country = country self.country = country
self.collected = {} self.collected = defaultdict(PointsCentroid)
def add(self, postcode, x, y): def add(self, postcode, x, y):
""" Add the given postcode to the collection cache. If the postcode """ Add the given postcode to the collection cache. If the postcode
already existed, it is overwritten with the new centroid. already existed, it is overwritten with the new centroid.
""" """
self.collected[postcode] = (x, y) self.collected[postcode] += (x, y)
def commit(self, conn, analyzer, project_dir): def commit(self, conn, analyzer, project_dir):
@@ -93,16 +95,16 @@ class _CountryPostcodesCollector:
WHERE country_code = %s""", WHERE country_code = %s""",
(self.country, )) (self.country, ))
for postcode, x, y in cur: for postcode, x, y in cur:
newx, newy = self.collected.pop(postcode, (None, None)) pcobj = self.collected.pop(postcode, None)
if newx is not None: if pcobj:
dist = (x - newx)**2 + (y - newy)**2 newx, newy = pcobj.centroid()
if dist > 0.0000001: if (x - newx) > 0.0000001 or (y - newy) > 0.0000001:
to_update.append((postcode, newx, newy)) to_update.append((postcode, newx, newy))
else: else:
to_delete.append(postcode) to_delete.append(postcode)
to_add = [(k, v[0], v[1]) for k, v in self.collected.items()] to_add = [(k, *v.centroid()) for k, v in self.collected.items()]
self.collected = [] self.collected = None
return to_add, to_delete, to_update return to_add, to_delete, to_update
@@ -125,8 +127,10 @@ class _CountryPostcodesCollector:
postcode = analyzer.normalize_postcode(row['postcode']) postcode = analyzer.normalize_postcode(row['postcode'])
if postcode not in self.collected: if postcode not in self.collected:
try: try:
self.collected[postcode] = (_to_float(row['lon'], 180), # Do float conversion separately, it might throw
_to_float(row['lat'], 90)) centroid = (_to_float(row['lon'], 180),
_to_float(row['lat'], 90))
self.collected[postcode] += centroid
except ValueError: except ValueError:
LOG.warning("Bad coordinates %s, %s in %s country postcode file.", LOG.warning("Bad coordinates %s, %s in %s country postcode file.",
row['lat'], row['lon'], self.country) row['lat'], row['lon'], self.country)
@@ -174,12 +178,10 @@ def update_postcodes(dsn, project_dir, tokenizer):
COALESCE(plx.country_code, COALESCE(plx.country_code,
get_country_code(ST_Centroid(pl.geometry))) as cc, get_country_code(ST_Centroid(pl.geometry))) as cc,
token_normalized_postcode(pl.address->'postcode') as pc, token_normalized_postcode(pl.address->'postcode') as pc,
ST_Centroid(ST_Collect(COALESCE(plx.centroid, COALESCE(plx.centroid, ST_Centroid(pl.geometry)) as centroid
ST_Centroid(pl.geometry)))) as centroid
FROM place AS pl LEFT OUTER JOIN placex AS plx FROM place AS pl LEFT OUTER JOIN placex AS plx
ON pl.osm_id = plx.osm_id AND pl.osm_type = plx.osm_type ON pl.osm_id = plx.osm_id AND pl.osm_type = plx.osm_type
WHERE pl.address ? 'postcode' AND pl.geometry IS NOT null WHERE pl.address ? 'postcode' AND pl.geometry IS NOT null) xx
GROUP BY cc, pc) xx
WHERE pc IS NOT null AND cc IS NOT null WHERE pc IS NOT null AND cc IS NOT null
ORDER BY country_code, pc""") ORDER BY country_code, pc""")