mirror of
https://github.com/osm-search/Nominatim.git
synced 2026-02-26 11:08:13 +00:00
indexer: fetch extra place data asynchronously
The indexer now fetches any extra data besides the place_id asynchronously while processing the places from the last batch. This also means that more places are now fetched at once.
This commit is contained in:
@@ -14,6 +14,73 @@ from nominatim.db.connection import connect
|
|||||||
|
|
||||||
LOG = logging.getLogger()
|
LOG = logging.getLogger()
|
||||||
|
|
||||||
|
|
||||||
|
class PlaceFetcher:
    """ Asynchronous connection that fetches place details for processing.
    """

    def __init__(self, dsn, setup_conn):
        # Cumulative seconds spent blocking on the asynchronous connection.
        self.wait_time = 0
        # Batch state: None -> no batch pending; [] -> details are being
        # fetched asynchronously; non-empty list -> batch ready as-is.
        self.current_ids = None
        self.conn = DBConnection(dsn, cursor_factory=psycopg2.extras.DictCursor)

        with setup_conn.cursor() as cur:
            # need to fetch those manually because register_hstore cannot
            # fetch them on an asynchronous connection below.
            hstore_oid = cur.scalar("SELECT 'hstore'::regtype::oid")
            hstore_array_oid = cur.scalar("SELECT 'hstore[]'::regtype::oid")

        psycopg2.extras.register_hstore(self.conn.conn, oid=hstore_oid,
                                        array_oid=hstore_array_oid)


    def close(self):
        """ Close the underlying asynchronous connection.
        """
        if self.conn:
            self.conn.close()
            self.conn = None


    def fetch_next_batch(self, cur, runner):
        """ Send a request for the next batch of places.
            If details for the places are required, they will be fetched
            asynchronously.

            Returns true if there is still data available.
        """
        ids = cur.fetchmany(100)

        if not ids:
            self.current_ids = None
            return False

        if hasattr(runner, 'get_place_details'):
            # Details are requested on the async connection; the empty list
            # marks the batch as "pending" so get_batch() knows to wait.
            runner.get_place_details(self.conn, ids)
            self.current_ids = []
        else:
            self.current_ids = ids

        return True

    def get_batch(self):
        """ Get the next batch of data, previously requested with
            `fetch_next_batch`.
        """
        if self.current_ids is not None and not self.current_ids:
            tstart = time.time()
            self.conn.wait()
            self.wait_time += time.time() - tstart
            self.current_ids = self.conn.cursor.fetchall()

        return self.current_ids

    def __enter__(self):
        return self


    def __exit__(self, exc_type, exc_value, traceback):
        # Guard against use after an explicit close(): close() sets
        # self.conn to None, and waiting on None would raise AttributeError.
        if self.conn is not None:
            self.conn.wait()
        self.close()
|
||||||
|
|
||||||
class WorkerPool:
|
class WorkerPool:
|
||||||
""" A pool of asynchronous database connections.
|
""" A pool of asynchronous database connections.
|
||||||
|
|
||||||
@@ -24,6 +91,7 @@ class WorkerPool:
|
|||||||
def __init__(self, dsn, pool_size):
|
def __init__(self, dsn, pool_size):
|
||||||
self.threads = [DBConnection(dsn) for _ in range(pool_size)]
|
self.threads = [DBConnection(dsn) for _ in range(pool_size)]
|
||||||
self.free_workers = self._yield_free_worker()
|
self.free_workers = self._yield_free_worker()
|
||||||
|
self.wait_time = 0
|
||||||
|
|
||||||
|
|
||||||
def finish_all(self):
|
def finish_all(self):
|
||||||
@@ -67,7 +135,9 @@ class WorkerPool:
|
|||||||
ready = self.threads
|
ready = self.threads
|
||||||
command_stat = 0
|
command_stat = 0
|
||||||
else:
|
else:
|
||||||
|
tstart = time.time()
|
||||||
_, ready, _ = select.select([], self.threads, [])
|
_, ready, _ = select.select([], self.threads, [])
|
||||||
|
self.wait_time += time.time() - tstart
|
||||||
|
|
||||||
|
|
||||||
def __enter__(self):
|
def __enter__(self):
|
||||||
@@ -75,6 +145,7 @@ class WorkerPool:
|
|||||||
|
|
||||||
|
|
||||||
def __exit__(self, exc_type, exc_value, traceback):
|
def __exit__(self, exc_type, exc_value, traceback):
|
||||||
|
self.finish_all()
|
||||||
self.close()
|
self.close()
|
||||||
|
|
||||||
|
|
||||||
@@ -184,70 +255,33 @@ class Indexer:
|
|||||||
total_tuples = cur.scalar(runner.sql_count_objects())
|
total_tuples = cur.scalar(runner.sql_count_objects())
|
||||||
LOG.debug("Total number of rows: %i", total_tuples)
|
LOG.debug("Total number of rows: %i", total_tuples)
|
||||||
|
|
||||||
# need to fetch those manually because register_hstore cannot
|
|
||||||
# fetch them on an asynchronous connection below.
|
|
||||||
hstore_oid = cur.scalar("SELECT 'hstore'::regtype::oid")
|
|
||||||
hstore_array_oid = cur.scalar("SELECT 'hstore[]'::regtype::oid")
|
|
||||||
|
|
||||||
conn.commit()
|
conn.commit()
|
||||||
|
|
||||||
progress = ProgressLogger(runner.name(), total_tuples)
|
progress = ProgressLogger(runner.name(), total_tuples)
|
||||||
|
|
||||||
fetcher_wait = 0
|
|
||||||
pool_wait = 0
|
|
||||||
|
|
||||||
if total_tuples > 0:
|
if total_tuples > 0:
|
||||||
with conn.cursor(name='places') as cur:
|
with conn.cursor(name='places') as cur:
|
||||||
cur.execute(runner.sql_get_objects())
|
cur.execute(runner.sql_get_objects())
|
||||||
|
|
||||||
fetcher = DBConnection(self.dsn, cursor_factory=psycopg2.extras.DictCursor)
|
with PlaceFetcher(self.dsn, conn) as fetcher:
|
||||||
psycopg2.extras.register_hstore(fetcher.conn,
|
with WorkerPool(self.dsn, self.num_threads) as pool:
|
||||||
oid=hstore_oid,
|
has_more = fetcher.fetch_next_batch(cur, runner)
|
||||||
array_oid=hstore_array_oid)
|
while has_more:
|
||||||
|
places = fetcher.get_batch()
|
||||||
|
|
||||||
with WorkerPool(self.dsn, self.num_threads) as pool:
|
# asynchronously get the next batch
|
||||||
places = self._fetch_next_batch(cur, fetcher, runner)
|
has_more = fetcher.fetch_next_batch(cur, runner)
|
||||||
while places is not None:
|
|
||||||
if not places:
|
|
||||||
t0 = time.time()
|
|
||||||
fetcher.wait()
|
|
||||||
fetcher_wait += time.time() - t0
|
|
||||||
places = fetcher.cursor.fetchall()
|
|
||||||
|
|
||||||
# asynchronously get the next batch
|
# And insert the current batch
|
||||||
next_places = self._fetch_next_batch(cur, fetcher, runner)
|
for idx in range(0, len(places), batch):
|
||||||
|
part = places[idx:idx+batch]
|
||||||
|
LOG.debug("Processing places: %s", str(part))
|
||||||
|
runner.index_places(pool.next_free_worker(), part)
|
||||||
|
progress.add(len(part))
|
||||||
|
|
||||||
# And insert the current batch
|
LOG.info("Wait time: fetcher: %.2fs, pool: %.2fs",
|
||||||
for idx in range(0, len(places), batch):
|
fetcher.wait_time, pool.wait_time)
|
||||||
t0 = time.time()
|
|
||||||
worker = pool.next_free_worker()
|
|
||||||
pool_wait += time.time() - t0
|
|
||||||
part = places[idx:idx+batch]
|
|
||||||
LOG.debug("Processing places: %s", str(part))
|
|
||||||
runner.index_places(worker, part)
|
|
||||||
progress.add(len(part))
|
|
||||||
|
|
||||||
places = next_places
|
|
||||||
|
|
||||||
pool.finish_all()
|
|
||||||
|
|
||||||
fetcher.wait()
|
|
||||||
fetcher.close()
|
|
||||||
|
|
||||||
conn.commit()
|
conn.commit()
|
||||||
|
|
||||||
progress.done()
|
progress.done()
|
||||||
LOG.warning("Wait time: fetcher: {}s, pool: {}s".format(fetcher_wait, pool_wait))
|
|
||||||
|
|
||||||
|
|
||||||
def _fetch_next_batch(self, cur, fetcher, runner):
|
|
||||||
ids = cur.fetchmany(100)
|
|
||||||
|
|
||||||
if not ids:
|
|
||||||
return None
|
|
||||||
|
|
||||||
if not hasattr(runner, 'get_place_details'):
|
|
||||||
return ids
|
|
||||||
|
|
||||||
runner.get_place_details(fetcher, ids)
|
|
||||||
return []
|
|
||||||
|
|||||||
@@ -28,7 +28,8 @@ class AbstractPlacexRunner:
|
|||||||
""".format(','.join(["(%s, %s::hstore, %s::jsonb)"] * num_places))
|
""".format(','.join(["(%s, %s::hstore, %s::jsonb)"] * num_places))
|
||||||
|
|
||||||
|
|
||||||
def get_place_details(self, worker, ids):
|
@staticmethod
|
||||||
|
def get_place_details(worker, ids):
|
||||||
worker.perform("""SELECT place_id, (placex_prepare_update(placex)).*
|
worker.perform("""SELECT place_id, (placex_prepare_update(placex)).*
|
||||||
FROM placex WHERE place_id IN %s""",
|
FROM placex WHERE place_id IN %s""",
|
||||||
(tuple((p[0] for p in ids)), ))
|
(tuple((p[0] for p in ids)), ))
|
||||||
@@ -103,12 +104,19 @@ class InterpolationRunner:
|
|||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def sql_get_objects():
|
def sql_get_objects():
|
||||||
return """SELECT place_id, get_interpolation_address(address, osm_id) as address
|
return """SELECT place_id
|
||||||
FROM location_property_osmline
|
FROM location_property_osmline
|
||||||
WHERE indexed_status > 0
|
WHERE indexed_status > 0
|
||||||
ORDER BY geometry_sector"""
|
ORDER BY geometry_sector"""
|
||||||
|
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def get_place_details(worker, ids):
|
||||||
|
worker.perform("""SELECT place_id, get_interpolation_address(address, osm_id) as address
|
||||||
|
FROM location_property_osmline WHERE place_id IN %s""",
|
||||||
|
(tuple((p[0] for p in ids)), ))
|
||||||
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
@functools.lru_cache(maxsize=1)
|
@functools.lru_cache(maxsize=1)
|
||||||
def _index_sql(num_places):
|
def _index_sql(num_places):
|
||||||
|
|||||||
Reference in New Issue
Block a user