move postcode normalization into tokenizer

This commit is contained in:
Sarah Hoffmann
2021-04-25 18:26:36 +02:00
parent d8ed1bfc60
commit ffc2d82b0e
12 changed files with 181 additions and 74 deletions

View File

@@ -116,7 +116,8 @@ class SetupAll:
if args.continue_at is None or args.continue_at == 'load-data':
LOG.warning('Calculate postcodes')
postcodes.import_postcodes(args.config.get_libpq_dsn(), args.project_dir)
postcodes.import_postcodes(args.config.get_libpq_dsn(), args.project_dir,
tokenizer)
if args.continue_at is None or args.continue_at in ('load-data', 'indexing'):
if args.continue_at is not None and args.continue_at != 'load-data':

View File

@@ -147,7 +147,7 @@ class Indexer:
if maxrank == 30:
self._index(runners.RankRunner(0, analyzer))
self._index(runners.InterpolationRunner(), 20)
self._index(runners.InterpolationRunner(analyzer), 20)
self._index(runners.RankRunner(30, analyzer), 20)
else:
self._index(runners.RankRunner(maxrank, analyzer))

View File

@@ -25,7 +25,7 @@ class AbstractPlacexRunner:
SET indexed_status = 0, address = v.addr, token_info = v.ti
FROM (VALUES {}) as v(id, addr, ti)
WHERE place_id = v.id
""".format(','.join(["(%s, %s::hstore, %s::json)"] * num_places))
""".format(','.join(["(%s, %s::hstore, %s::jsonb)"] * num_places))
def index_places(self, worker, places):
@@ -82,6 +82,10 @@ class InterpolationRunner:
location_property_osmline.
"""
def __init__(self, analyzer):
    # Tokenizer name analyzer; used in index_places() to compute the
    # token_info written back for each interpolation line.
    self.analyzer = analyzer
@staticmethod
def name():
    """ Return a descriptive name for this runner (used for logging/progress).
    """
    return "interpolation lines (location_property_osmline)"
@@ -93,15 +97,30 @@ class InterpolationRunner:
@staticmethod
def sql_get_objects():
return """SELECT place_id FROM location_property_osmline
return """SELECT place_id, get_interpolation_address(address, osm_id) as address
FROM location_property_osmline
WHERE indexed_status > 0
ORDER BY geometry_sector"""
@staticmethod
def index_places(worker, ids):
worker.perform(""" UPDATE location_property_osmline
SET indexed_status = 0 WHERE place_id IN ({})
""".format(','.join((str(i[0]) for i in ids))))
@functools.lru_cache(maxsize=1)
def _index_sql(num_places):
return """ UPDATE location_property_osmline
SET indexed_status = 0, address = v.addr, token_info = v.ti
FROM (VALUES {}) as v(id, addr, ti)
WHERE place_id = v.id
""".format(','.join(["(%s, %s::hstore, %s::jsonb)"] * num_places))
def index_places(self, worker, places):
    """ Write back the given batch of interpolation lines, adding the
        token information computed by the name analyzer.
    """
    params = []
    for place in places:
        params.append(place['place_id'])
        params.append(place['address'])
        params.append(psycopg2.extras.Json(self.analyzer.process_place(place)))

    worker.perform(self._index_sql(len(places)), params)
class PostcodeRunner:

View File

@@ -1,6 +1,7 @@
"""
Tokenizer implementing normalisation as used before Nominatim 4.
"""
from collections import OrderedDict
import logging
import re
import shutil
@@ -213,6 +214,15 @@ class LegacyNameAnalyzer:
self.conn.close()
self.conn = None
def add_postcodes_from_db(self):
    """ Add postcodes from the location_postcode table to the word table.
    """
    with self.conn.cursor() as cur:
        # count() merely forces create_postcode_id() to be evaluated for
        # every distinct postcode; the count value itself is discarded.
        cur.execute("""SELECT count(create_postcode_id(pc))
                       FROM (SELECT distinct(postcode) as pc
                             FROM location_postcode) x""")
def process_place(self, place):
""" Determine tokenizer information about the given place.
@@ -226,11 +236,25 @@ class LegacyNameAnalyzer:
address = place.get('address')
if address:
self._add_postcode(address.get('postcode'))
token_info.add_housenumbers(self.conn, address)
return token_info.data
def _add_postcode(self, postcode):
""" Make sure the normalized postcode is present in the word table.
"""
if not postcode or re.search(r'[:,;]', postcode) is not None:
return
def _create_postcode_from_db(pcode):
with self.conn.cursor() as cur:
cur.execute('SELECT create_postcode_id(%s)', (pcode, ))
self._cache.postcodes.get(postcode.strip().upper(), _create_postcode_from_db)
class _TokenInfo:
""" Collect token information to be sent back to the database.
"""
@@ -285,6 +309,32 @@ class _TokenInfo:
self.data['hnr_tokens'], self.data['hnr'] = cur.fetchone()
class _LRU:
""" Least recently used cache that accepts a generator function to
produce the item when there is a cache miss.
"""
def __init__(self, maxsize=128):
self.data = OrderedDict()
self.maxsize = maxsize
def get(self, key, generator):
""" Get the item with the given key from the cache. If nothing
is found in the cache, generate the value through the
generator function and store it in the cache.
"""
value = self.data.get(key)
if value is not None:
self.data.move_to_end(key)
else:
value = generator(key)
if len(self.data) >= self.maxsize:
self.data.popitem(last=False)
self.data[key] = value
return value
class _TokenCache:
""" Cache for token information to avoid repeated database queries.
@@ -292,6 +342,9 @@ class _TokenCache:
analyzer.
"""
def __init__(self, conn):
# various LRU caches
self.postcodes = _LRU(maxsize=32)
# Lookup housenumbers up to 100 and cache them
with conn.cursor() as cur:
cur.execute("""SELECT i, ARRAY[getorcreate_housenumber_id(i::text)]::text

View File

@@ -6,7 +6,7 @@ of artificial postcode centroids.
from nominatim.db.utils import execute_file
from nominatim.db.connection import connect
def import_postcodes(dsn, project_dir):
def import_postcodes(dsn, project_dir, tokenizer):
""" Set up the initial list of postcodes.
"""
@@ -41,10 +41,11 @@ def import_postcodes(dsn, project_dir):
INSERT INTO location_postcode
(place_id, indexed_status, country_code, postcode, geometry)
SELECT nextval('seq_place'), 1, country_code,
upper(trim (both ' ' from address->'postcode')) as pc,
token_normalized_postcode(address->'postcode') as pc,
ST_Centroid(ST_Collect(ST_Centroid(geometry)))
FROM placex
WHERE address ? 'postcode' AND address->'postcode' NOT SIMILAR TO '%(,|;)%'
WHERE address ? 'postcode'
and token_normalized_postcode(address->'postcode') is not null
AND geometry IS NOT null
GROUP BY country_code, pc
""")
@@ -52,9 +53,10 @@ def import_postcodes(dsn, project_dir):
cur.execute("""
INSERT INTO location_postcode
(place_id, indexed_status, country_code, postcode, geometry)
SELECT nextval('seq_place'), 1, 'us', postcode,
SELECT nextval('seq_place'), 1, 'us',
token_normalized_postcode(postcode),
ST_SetSRID(ST_Point(x,y),4326)
FROM us_postcode WHERE postcode NOT IN
FROM us_postcode WHERE token_normalized_postcode(postcode) NOT IN
(SELECT postcode FROM location_postcode
WHERE country_code = 'us')
""")
@@ -62,8 +64,9 @@ def import_postcodes(dsn, project_dir):
cur.execute("""
INSERT INTO location_postcode
(place_id, indexed_status, country_code, postcode, geometry)
SELECT nextval('seq_place'), 1, 'gb', postcode, geometry
FROM gb_postcode WHERE postcode NOT IN
SELECT nextval('seq_place'), 1, 'gb',
token_normalized_postcode(postcode), geometry
FROM gb_postcode WHERE token_normalized_postcode(postcode) NOT IN
(SELECT postcode FROM location_postcode
WHERE country_code = 'gb')
""")
@@ -72,9 +75,7 @@ def import_postcodes(dsn, project_dir):
DELETE FROM word WHERE class='place' and type='postcode'
and word NOT IN (SELECT postcode FROM location_postcode)
""")
cur.execute("""
SELECT count(getorcreate_postcode_id(v)) FROM
(SELECT distinct(postcode) as v FROM location_postcode) p
""")
conn.commit()
with tokenizer.name_analyzer() as analyzer:
analyzer.add_postcodes_from_db()