mirror of
https://github.com/osm-search/Nominatim.git
synced 2026-02-26 11:08:13 +00:00
move postcode normalization into tokenizer
This commit is contained in:
@@ -116,7 +116,8 @@ class SetupAll:
|
||||
|
||||
if args.continue_at is None or args.continue_at == 'load-data':
|
||||
LOG.warning('Calculate postcodes')
|
||||
postcodes.import_postcodes(args.config.get_libpq_dsn(), args.project_dir)
|
||||
postcodes.import_postcodes(args.config.get_libpq_dsn(), args.project_dir,
|
||||
tokenizer)
|
||||
|
||||
if args.continue_at is None or args.continue_at in ('load-data', 'indexing'):
|
||||
if args.continue_at is not None and args.continue_at != 'load-data':
|
||||
|
||||
@@ -147,7 +147,7 @@ class Indexer:
|
||||
|
||||
if maxrank == 30:
|
||||
self._index(runners.RankRunner(0, analyzer))
|
||||
self._index(runners.InterpolationRunner(), 20)
|
||||
self._index(runners.InterpolationRunner(analyzer), 20)
|
||||
self._index(runners.RankRunner(30, analyzer), 20)
|
||||
else:
|
||||
self._index(runners.RankRunner(maxrank, analyzer))
|
||||
|
||||
@@ -25,7 +25,7 @@ class AbstractPlacexRunner:
|
||||
SET indexed_status = 0, address = v.addr, token_info = v.ti
|
||||
FROM (VALUES {}) as v(id, addr, ti)
|
||||
WHERE place_id = v.id
|
||||
""".format(','.join(["(%s, %s::hstore, %s::json)"] * num_places))
|
||||
""".format(','.join(["(%s, %s::hstore, %s::jsonb)"] * num_places))
|
||||
|
||||
|
||||
def index_places(self, worker, places):
|
||||
@@ -82,6 +82,10 @@ class InterpolationRunner:
|
||||
location_property_osmline.
|
||||
"""
|
||||
|
||||
def __init__(self, analyzer):
|
||||
self.analyzer = analyzer
|
||||
|
||||
|
||||
@staticmethod
|
||||
def name():
|
||||
return "interpolation lines (location_property_osmline)"
|
||||
@@ -93,15 +97,30 @@ class InterpolationRunner:
|
||||
|
||||
@staticmethod
|
||||
def sql_get_objects():
|
||||
return """SELECT place_id FROM location_property_osmline
|
||||
return """SELECT place_id, get_interpolation_address(address, osm_id) as address
|
||||
FROM location_property_osmline
|
||||
WHERE indexed_status > 0
|
||||
ORDER BY geometry_sector"""
|
||||
|
||||
|
||||
@staticmethod
|
||||
def index_places(worker, ids):
|
||||
worker.perform(""" UPDATE location_property_osmline
|
||||
SET indexed_status = 0 WHERE place_id IN ({})
|
||||
""".format(','.join((str(i[0]) for i in ids))))
|
||||
@functools.lru_cache(maxsize=1)
|
||||
def _index_sql(num_places):
|
||||
return """ UPDATE location_property_osmline
|
||||
SET indexed_status = 0, address = v.addr, token_info = v.ti
|
||||
FROM (VALUES {}) as v(id, addr, ti)
|
||||
WHERE place_id = v.id
|
||||
""".format(','.join(["(%s, %s::hstore, %s::jsonb)"] * num_places))
|
||||
|
||||
|
||||
def index_places(self, worker, places):
    """ Mark the given interpolation lines as indexed.

        For every row in `places` the statement receives three
        parameters: the row's 'place_id', its 'address' and the result of
        the analyzer's process_place() wrapped as JSON. All rows are sent
        to the worker in a single UPDATE.
    """
    params = []
    for row in places:
        # Parameter order must match the (id, addr, ti) tuples of the
        # VALUES list built by _index_sql().
        params.append(row['place_id'])
        params.append(row['address'])
        token_info = self.analyzer.process_place(row)
        params.append(psycopg2.extras.Json(token_info))

    statement = self._index_sql(len(places))
    worker.perform(statement, params)
|
||||
|
||||
|
||||
|
||||
class PostcodeRunner:
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
"""
|
||||
Tokenizer implementing normalisation as used before Nominatim 4.
|
||||
"""
|
||||
from collections import OrderedDict
|
||||
import logging
|
||||
import re
|
||||
import shutil
|
||||
@@ -213,6 +214,15 @@ class LegacyNameAnalyzer:
|
||||
self.conn.close()
|
||||
self.conn = None
|
||||
|
||||
|
||||
def add_postcodes_from_db(self):
|
||||
""" Add postcodes from the location_postcode table to the word table.
|
||||
"""
|
||||
with self.conn.cursor() as cur:
|
||||
cur.execute("""SELECT count(create_postcode_id(pc))
|
||||
FROM (SELECT distinct(postcode) as pc
|
||||
FROM location_postcode) x""")
|
||||
|
||||
def process_place(self, place):
|
||||
""" Determine tokenizer information about the given place.
|
||||
|
||||
@@ -226,11 +236,25 @@ class LegacyNameAnalyzer:
|
||||
address = place.get('address')
|
||||
|
||||
if address:
|
||||
self._add_postcode(address.get('postcode'))
|
||||
token_info.add_housenumbers(self.conn, address)
|
||||
|
||||
return token_info.data
|
||||
|
||||
|
||||
def _add_postcode(self, postcode):
    """ Make sure the normalized postcode is present in the word table.

        The postcode is normalized by stripping surrounding whitespace
        and converting it to upper case. Nothing is done when the
        postcode is missing, contains one of the list separators
        ':', ',' or ';', or is empty after normalization.
    """
    if not postcode or re.search(r'[:,;]', postcode) is not None:
        return

    normalized = postcode.strip().upper()
    if not normalized:
        # Whitespace-only input must not create an empty postcode term.
        return

    def _create_postcode_from_db(pcode):
        with self.conn.cursor() as cur:
            cur.execute('SELECT create_postcode_id(%s)', (pcode, ))
        # Return a non-None marker so that caches which use None to
        # signal a miss still record the postcode as handled.
        return pcode

    self._cache.postcodes.get(normalized, _create_postcode_from_db)
|
||||
|
||||
|
||||
class _TokenInfo:
|
||||
""" Collect token information to be sent back to the database.
|
||||
"""
|
||||
@@ -285,6 +309,32 @@ class _TokenInfo:
|
||||
self.data['hnr_tokens'], self.data['hnr'] = cur.fetchone()
|
||||
|
||||
|
||||
class _LRU:
    """ Least recently used cache that accepts a generator function to
        produce the item when there is a cache miss.

        Whether an item is cached is decided by its key alone. Values of
        None (for example from generators that are only called for their
        side effects) are therefore cached like any other value.
    """

    def __init__(self, maxsize=128):
        # Keys are kept in order of use, least recently used first.
        self.data = OrderedDict()
        self.maxsize = maxsize

    def get(self, key, generator):
        """ Get the item with the given key from the cache. If nothing
            is found in the cache, generate the value through the
            generator function and store it in the cache.

            When the cache is full, the least recently used item is
            dropped to make room. With a maxsize of 0 or less, nothing is
            stored and the generator is called on every lookup.
        """
        if key in self.data:
            self.data.move_to_end(key)
            return self.data[key]

        value = generator(key)

        if self.maxsize > 0:
            # Only evict on a real miss, never when the key is present.
            if len(self.data) >= self.maxsize:
                self.data.popitem(last=False)
            self.data[key] = value

        return value
|
||||
|
||||
|
||||
class _TokenCache:
|
||||
""" Cache for token information to avoid repeated database queries.
|
||||
|
||||
@@ -292,6 +342,9 @@ class _TokenCache:
|
||||
analyzer.
|
||||
"""
|
||||
def __init__(self, conn):
|
||||
# various LRU caches
|
||||
self.postcodes = _LRU(maxsize=32)
|
||||
|
||||
# Lookup housenumbers up to 100 and cache them
|
||||
with conn.cursor() as cur:
|
||||
cur.execute("""SELECT i, ARRAY[getorcreate_housenumber_id(i::text)]::text
|
||||
|
||||
@@ -6,7 +6,7 @@ of artificial postcode centroids.
|
||||
from nominatim.db.utils import execute_file
|
||||
from nominatim.db.connection import connect
|
||||
|
||||
def import_postcodes(dsn, project_dir):
|
||||
def import_postcodes(dsn, project_dir, tokenizer):
|
||||
""" Set up the initial list of postcodes.
|
||||
"""
|
||||
|
||||
@@ -41,10 +41,11 @@ def import_postcodes(dsn, project_dir):
|
||||
INSERT INTO location_postcode
|
||||
(place_id, indexed_status, country_code, postcode, geometry)
|
||||
SELECT nextval('seq_place'), 1, country_code,
|
||||
upper(trim (both ' ' from address->'postcode')) as pc,
|
||||
token_normalized_postcode(address->'postcode') as pc,
|
||||
ST_Centroid(ST_Collect(ST_Centroid(geometry)))
|
||||
FROM placex
|
||||
WHERE address ? 'postcode' AND address->'postcode' NOT SIMILAR TO '%(,|;)%'
|
||||
WHERE address ? 'postcode'
|
||||
and token_normalized_postcode(address->'postcode') is not null
|
||||
AND geometry IS NOT null
|
||||
GROUP BY country_code, pc
|
||||
""")
|
||||
@@ -52,9 +53,10 @@ def import_postcodes(dsn, project_dir):
|
||||
cur.execute("""
|
||||
INSERT INTO location_postcode
|
||||
(place_id, indexed_status, country_code, postcode, geometry)
|
||||
SELECT nextval('seq_place'), 1, 'us', postcode,
|
||||
SELECT nextval('seq_place'), 1, 'us',
|
||||
token_normalized_postcode(postcode),
|
||||
ST_SetSRID(ST_Point(x,y),4326)
|
||||
FROM us_postcode WHERE postcode NOT IN
|
||||
FROM us_postcode WHERE token_normalized_postcode(postcode) NOT IN
|
||||
(SELECT postcode FROM location_postcode
|
||||
WHERE country_code = 'us')
|
||||
""")
|
||||
@@ -62,8 +64,9 @@ def import_postcodes(dsn, project_dir):
|
||||
cur.execute("""
|
||||
INSERT INTO location_postcode
|
||||
(place_id, indexed_status, country_code, postcode, geometry)
|
||||
SELECT nextval('seq_place'), 1, 'gb', postcode, geometry
|
||||
FROM gb_postcode WHERE postcode NOT IN
|
||||
SELECT nextval('seq_place'), 1, 'gb',
|
||||
token_normalized_postcode(postcode), geometry
|
||||
FROM gb_postcode WHERE token_normalized_postcode(postcode) NOT IN
|
||||
(SELECT postcode FROM location_postcode
|
||||
WHERE country_code = 'gb')
|
||||
""")
|
||||
@@ -72,9 +75,7 @@ def import_postcodes(dsn, project_dir):
|
||||
DELETE FROM word WHERE class='place' and type='postcode'
|
||||
and word NOT IN (SELECT postcode FROM location_postcode)
|
||||
""")
|
||||
|
||||
cur.execute("""
|
||||
SELECT count(getorcreate_postcode_id(v)) FROM
|
||||
(SELECT distinct(postcode) as v FROM location_postcode) p
|
||||
""")
|
||||
conn.commit()
|
||||
|
||||
with tokenizer.name_analyzer() as analyzer:
|
||||
analyzer.add_postcodes_from_db()
|
||||
|
||||
Reference in New Issue
Block a user