move indexing function into its own Python module

This makes it mow a standard function of our new Python library instead of a stand-alone program.
2026-03-13 22:34:07 +00:00 · 2021-01-17 17:19:17 +01:00
parent b79c79fa73
commit 27977411e9
4 changed files with 194 additions and 190 deletions
--- a/nominatim/indexer/db.py
+++ b/nominatim/indexer/db.py
@@ -1,116 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-#
-# This file is part of Nominatim.
-# Copyright (C) 2021 by the Nominatim developer community.
-# For a full list of authors see the git log.
-""" Database helper functions for the indexer.
-"""
-import logging
-import psycopg2
-from psycopg2.extras import wait_select
-
-LOG = logging.getLogger()
-
-def make_connection(options, asynchronous=False):
-    """ Create a psycopg2 connection from the given options.
-    """
-    params = {'dbname' : options.dbname,
-              'user' : options.user,
-              'password' : options.password,
-              'host' : options.host,
-              'port' : options.port,
-              'async' : asynchronous}
-
-    return psycopg2.connect(**params)
-
-class DBConnection:
-    """ A single non-blocking database connection.
-    """
-
-    def __init__(self, options):
-        self.current_query = None
-        self.current_params = None
-        self.options = options
-
-        self.conn = None
-        self.cursor = None
-        self.connect()
-
-    def connect(self):
-        """ (Re)connect to the database. Creates an asynchronous connection
-            with JIT and parallel processing disabled. If a connection was
-            already open, it is closed and a new connection established.
-            The caller must ensure that no query is pending before reconnecting.
-        """
-        if self.conn is not None:
-            self.cursor.close()
-            self.conn.close()
-
-        self.conn = make_connection(self.options, asynchronous=True)
-        self.wait()
-
-        self.cursor = self.conn.cursor()
-        # Disable JIT and parallel workers as they are known to cause problems.
-        # Update pg_settings instead of using SET because it does not yield
-        # errors on older versions of Postgres where the settings are not
-        # implemented.
-        self.perform(
-            """ UPDATE pg_settings SET setting = -1 WHERE name = 'jit_above_cost';
-                UPDATE pg_settings SET setting = 0
-                   WHERE name = 'max_parallel_workers_per_gather';""")
-        self.wait()
-
-    def wait(self):
-        """ Block until any pending operation is done.
-        """
-        while True:
-            try:
-                wait_select(self.conn)
-                self.current_query = None
-                return
-            except psycopg2.extensions.TransactionRollbackError as error:
-                if error.pgcode == '40P01':
-                    LOG.info("Deadlock detected (params = %s), retry.",
-                             str(self.current_params))
-                    self.cursor.execute(self.current_query, self.current_params)
-                else:
-                    raise
-            except psycopg2.errors.DeadlockDetected: # pylint: disable=E1101
-                self.cursor.execute(self.current_query, self.current_params)
-
-    def perform(self, sql, args=None):
-        """ Send SQL query to the server. Returns immediately without
-            blocking.
-        """
-        self.current_query = sql
-        self.current_params = args
-        self.cursor.execute(sql, args)
-
-    def fileno(self):
-        """ File descriptor to wait for. (Makes this class select()able.)
-        """
-        return self.conn.fileno()
-
-    def is_done(self):
-        """ Check if the connection is available for a new query.
-
-            Also checks if the previous query has run into a deadlock.
-            If so, then the previous query is repeated.
-        """
-        if self.current_query is None:
-            return True
-
-        try:
-            if self.conn.poll() == psycopg2.extensions.POLL_OK:
-                self.current_query = None
-                return True
-        except psycopg2.extensions.TransactionRollbackError as error:
-            if error.pgcode == '40P01':
-                LOG.info("Deadlock detected (params = %s), retry.", str(self.current_params))
-                self.cursor.execute(self.current_query, self.current_params)
-            else:
-                raise
-        except psycopg2.errors.DeadlockDetected: # pylint: disable=E1101
-            self.cursor.execute(self.current_query, self.current_params)
-
-        return False
--- a/nominatim/indexer/indexer.py
+++ b/nominatim/indexer/indexer.py
@@ -0,0 +1,191 @@
+"""
+Main work horse for indexing (computing addresses) the database.
+"""
+# pylint: disable=C0111
+import logging
+import select
+
+from .progress import ProgressLogger
+from db.async_connection import DBConnection, make_connection
+
+LOG = logging.getLogger()
+
+class RankRunner:
+    """ Returns SQL commands for indexing one rank within the placex table.
+    """
+
+    def __init__(self, rank):
+        self.rank = rank
+
+    def name(self):
+        return "rank {}".format(self.rank)
+
+    def sql_count_objects(self):
+        return """SELECT count(*) FROM placex
+                  WHERE rank_address = {} and indexed_status > 0
+               """.format(self.rank)
+
+    def sql_get_objects(self):
+        return """SELECT place_id FROM placex
+                  WHERE indexed_status > 0 and rank_address = {}
+                  ORDER BY geometry_sector""".format(self.rank)
+
+    @staticmethod
+    def sql_index_place(ids):
+        return "UPDATE placex SET indexed_status = 0 WHERE place_id IN ({})"\
+               .format(','.join((str(i) for i in ids)))
+
+
+class InterpolationRunner:
+    """ Returns SQL commands for indexing the address interpolation table
+        location_property_osmline.
+    """
+
+    @staticmethod
+    def name():
+        return "interpolation lines (location_property_osmline)"
+
+    @staticmethod
+    def sql_count_objects():
+        return """SELECT count(*) FROM location_property_osmline
+                  WHERE indexed_status > 0"""
+
+    @staticmethod
+    def sql_get_objects():
+        return """SELECT place_id FROM location_property_osmline
+                  WHERE indexed_status > 0
+                  ORDER BY geometry_sector"""
+
+    @staticmethod
+    def sql_index_place(ids):
+        return """UPDATE location_property_osmline
+                  SET indexed_status = 0 WHERE place_id IN ({})"""\
+               .format(','.join((str(i) for i in ids)))
+
+class BoundaryRunner:
+    """ Returns SQL commands for indexing the administrative boundaries
+        of a certain rank.
+    """
+
+    def __init__(self, rank):
+        self.rank = rank
+
+    def name(self):
+        return "boundaries rank {}".format(self.rank)
+
+    def sql_count_objects(self):
+        return """SELECT count(*) FROM placex
+                  WHERE indexed_status > 0
+                    AND rank_search = {}
+                    AND class = 'boundary' and type = 'administrative'""".format(self.rank)
+
+    def sql_get_objects(self):
+        return """SELECT place_id FROM placex
+                  WHERE indexed_status > 0 and rank_search = {}
+                        and class = 'boundary' and type = 'administrative'
+                  ORDER BY partition, admin_level""".format(self.rank)
+
+    @staticmethod
+    def sql_index_place(ids):
+        return "UPDATE placex SET indexed_status = 0 WHERE place_id IN ({})"\
+               .format(','.join((str(i) for i in ids)))
+
+class Indexer:
+    """ Main indexing routine.
+    """
+
+    def __init__(self, opts):
+        self.minrank = max(1, opts.minrank)
+        self.maxrank = min(30, opts.maxrank)
+        self.conn = make_connection(opts)
+        self.threads = [DBConnection(opts) for _ in range(opts.threads)]
+
+    def index_boundaries(self):
+        LOG.warning("Starting indexing boundaries using %s threads",
+                    len(self.threads))
+
+        for rank in range(max(self.minrank, 5), min(self.maxrank, 26)):
+            self.index(BoundaryRunner(rank))
+
+    def index_by_rank(self):
+        """ Run classic indexing by rank.
+        """
+        LOG.warning("Starting indexing rank (%i to %i) using %i threads",
+                    self.minrank, self.maxrank, len(self.threads))
+
+        for rank in range(max(1, self.minrank), self.maxrank):
+            self.index(RankRunner(rank))
+
+        if self.maxrank == 30:
+            self.index(RankRunner(0))
+            self.index(InterpolationRunner(), 20)
+            self.index(RankRunner(self.maxrank), 20)
+        else:
+            self.index(RankRunner(self.maxrank))
+
+    def index(self, obj, batch=1):
+        """ Index a single rank or table. `obj` describes the SQL to use
+            for indexing. `batch` describes the number of objects that
+            should be processed with a single SQL statement
+        """
+        LOG.warning("Starting %s (using batch size %s)", obj.name(), batch)
+
+        cur = self.conn.cursor()
+        cur.execute(obj.sql_count_objects())
+
+        total_tuples = cur.fetchone()[0]
+        LOG.debug("Total number of rows: %i", total_tuples)
+
+        cur.close()
+
+        progress = ProgressLogger(obj.name(), total_tuples)
+
+        if total_tuples > 0:
+            cur = self.conn.cursor(name='places')
+            cur.execute(obj.sql_get_objects())
+
+            next_thread = self.find_free_thread()
+            while True:
+                places = [p[0] for p in cur.fetchmany(batch)]
+                if not places:
+                    break
+
+                LOG.debug("Processing places: %s", str(places))
+                thread = next(next_thread)
+
+                thread.perform(obj.sql_index_place(places))
+                progress.add(len(places))
+
+            cur.close()
+
+            for thread in self.threads:
+                thread.wait()
+
+        progress.done()
+
+    def find_free_thread(self):
+        """ Generator that returns the next connection that is free for
+            sending a query.
+        """
+        ready = self.threads
+        command_stat = 0
+
+        while True:
+            for thread in ready:
+                if thread.is_done():
+                    command_stat += 1
+                    yield thread
+
+            # refresh the connections occasionaly to avoid potential
+            # memory leaks in Postgresql.
+            if command_stat > 100000:
+                for thread in self.threads:
+                    while not thread.is_done():
+                        thread.wait()
+                    thread.connect()
+                command_stat = 0
+                ready = self.threads
+            else:
+                ready, _, _ = select.select(self.threads, [], [])
+
+        assert False, "Unreachable code"