add migration for configurable tokenizer

Adds a migration that initialises a legacy tokenizer for
an existing database. The migration is not yet active, as it
will need to be completed when more functionality is added
to the legacy tokenizer.
This commit is contained in:
Sarah Hoffmann
2021-04-21 15:38:52 +02:00
parent 296a66558f
commit b5540dc35c
3 changed files with 63 additions and 31 deletions

View File

@@ -33,12 +33,15 @@ def _import_tokenizer(name):
raise UsageError('Tokenizer not found') from exp raise UsageError('Tokenizer not found') from exp
def create_tokenizer(config): def create_tokenizer(config, init_db=True, module_name=None):
""" Create a new tokenizer as defined by the given configuration. """ Create a new tokenizer as defined by the given configuration.
The tokenizer data and code is copied into the 'tokenizer' directory The tokenizer data and code is copied into the 'tokenizer' directory
of the project directory and the tokenizer loaded from its new location. of the project directory and the tokenizer loaded from its new location.
""" """
if module_name is None:
module_name = config.TOKENIZER
# Create the directory for the tokenizer data # Create the directory for the tokenizer data
basedir = config.project_dir / 'tokenizer' basedir = config.project_dir / 'tokenizer'
if not basedir.exists(): if not basedir.exists():
@@ -47,13 +50,15 @@ def create_tokenizer(config):
LOG.fatal("Tokenizer directory '%s' cannot be created.", basedir) LOG.fatal("Tokenizer directory '%s' cannot be created.", basedir)
raise UsageError("Tokenizer setup failed.") raise UsageError("Tokenizer setup failed.")
tokenizer_module = _import_tokenizer(config.TOKENIZER) # Import and initialize the tokenizer.
tokenizer_module = _import_tokenizer(module_name)
tokenizer = tokenizer_module.create(config.get_libpq_dsn(), basedir) tokenizer = tokenizer_module.create(config.get_libpq_dsn(), basedir)
tokenizer.init_new_db(config) if init_db:
tokenizer.init_new_db(config)
with connect(config.get_libpq_dsn()) as conn: with connect(config.get_libpq_dsn()) as conn:
properties.set_property(conn, 'tokenizer', config.TOKENIZER) properties.set_property(conn, 'tokenizer', module_name)
return tokenizer return tokenizer

View File

@@ -20,7 +20,7 @@ def create(dsn, data_dir):
return LegacyTokenizer(dsn, data_dir) return LegacyTokenizer(dsn, data_dir)
def _install_module(src_dir, module_dir): def _install_module(config_module_path, src_dir, module_dir):
""" Copies the PostgreSQL normalisation module into the project """ Copies the PostgreSQL normalisation module into the project
directory if necessary. For historical reasons the module is directory if necessary. For historical reasons the module is
saved in the '/module' subdirectory and not with the other tokenizer saved in the '/module' subdirectory and not with the other tokenizer
@@ -29,10 +29,17 @@ def _install_module(src_dir, module_dir):
The function detects when the installation is run from the The function detects when the installation is run from the
build directory. It doesn't touch the module in that case. build directory. It doesn't touch the module in that case.
""" """
# Custom module locations are simply used as is.
if config_module_path:
LOG.info("Using custom path for database module at '%s'", config_module_path)
return config_module_path
# Compatibility mode for builddir installations.
if module_dir.exists() and src_dir.samefile(module_dir): if module_dir.exists() and src_dir.samefile(module_dir):
LOG.info('Running from build directory. Leaving database module as is.') LOG.info('Running from build directory. Leaving database module as is.')
return return module_dir
# In any other case install the module in the project directory.
if not module_dir.exists(): if not module_dir.exists():
module_dir.mkdir() module_dir.mkdir()
@@ -42,6 +49,8 @@ def _install_module(src_dir, module_dir):
LOG.info('Database module installed at %s', str(destfile)) LOG.info('Database module installed at %s', str(destfile))
return module_dir
def _check_module(module_dir, conn): def _check_module(module_dir, conn):
with conn.cursor() as cur: with conn.cursor() as cur:
@@ -74,24 +83,15 @@ class LegacyTokenizer:
This copies all necessary data in the project directory to make This copies all necessary data in the project directory to make
sure the tokenizer remains stable even over updates. sure the tokenizer remains stable even over updates.
""" """
# Find and optionally install the PostgreSQL normalization module. module_dir = _install_module(config.DATABASE_MODULE_PATH,
if config.DATABASE_MODULE_PATH: config.lib_dir.module,
LOG.info("Using custom path for database module at '%s'", config.project_dir / 'module')
config.DATABASE_MODULE_PATH)
module_dir = config.DATABASE_MODULE_PATH
else:
_install_module(config.lib_dir.module, config.project_dir / 'module')
module_dir = config.project_dir / 'module'
self.normalization = config.TERM_NORMALIZATION self.normalization = config.TERM_NORMALIZATION
with connect(self.dsn) as conn: with connect(self.dsn) as conn:
_check_module(module_dir, conn) _check_module(module_dir, conn)
self._save_config(conn)
# Stable configuration is saved in the database.
properties.set_property(conn, DBCFG_NORMALIZATION, self.normalization)
conn.commit()
def init_from_project(self): def init_from_project(self):
@@ -99,3 +99,26 @@ class LegacyTokenizer:
""" """
with connect(self.dsn) as conn: with connect(self.dsn) as conn:
self.normalization = properties.get_property(conn, DBCFG_NORMALIZATION) self.normalization = properties.get_property(conn, DBCFG_NORMALIZATION)
def migrate_database(self, config):
""" Initialise the project directory of an existing database for
use with this tokenizer.
This is a special migration function for updating existing databases
to new software versions.
"""
module_dir = _install_module(config.DATABASE_MODULE_PATH,
config.lib_dir.module,
config.project_dir / 'module')
with connect(self.dsn) as conn:
_check_module(module_dir, conn)
self._save_config(conn)
def _save_config(self, conn):
""" Save the configuration that needs to remain stable for the given
database as database properties.
"""
properties.set_property(conn, DBCFG_NORMALIZATION, self.normalization)

View File

@@ -6,7 +6,8 @@ import logging
from nominatim.db import properties from nominatim.db import properties
from nominatim.db.connection import connect from nominatim.db.connection import connect
from nominatim.version import NOMINATIM_VERSION from nominatim.version import NOMINATIM_VERSION
from nominatim.tools import refresh, database_import from nominatim.tools import refresh
from nominatim.tokenizer import factory as tokenizer_factory
from nominatim.errors import UsageError from nominatim.errors import UsageError
LOG = logging.getLogger() LOG = logging.getLogger()
@@ -108,17 +109,6 @@ def import_status_timestamp_change(conn, **_):
TYPE timestamp with time zone;""") TYPE timestamp with time zone;""")
@_migration(3, 5, 0, 99)
def install_database_module_in_project_directory(conn, config, paths, **_):
""" Install database module in project directory.
The database module needs to be present in the project directory
since those were introduced.
"""
database_import.install_module(paths.module_dir, paths.project_dir,
config.DATABASE_MODULE_PATH, conn=conn)
@_migration(3, 5, 0, 99) @_migration(3, 5, 0, 99)
def add_nominatim_property_table(conn, config, **_): def add_nominatim_property_table(conn, config, **_):
""" Add nominatim_property table. """ Add nominatim_property table.
@@ -173,3 +163,17 @@ def switch_placenode_geometry_index(conn, **_):
and class = 'place' and type != 'postcode' and class = 'place' and type != 'postcode'
and linked_place_id is null""") and linked_place_id is null""")
cur.execute(""" DROP INDEX IF EXISTS idx_placex_adminname """) cur.execute(""" DROP INDEX IF EXISTS idx_placex_adminname """)
@_migration(3, 7, 0, 1)
def install_legacy_tokenizer(conn, config, **_):
""" Setup legacy tokenizer.
If no other tokenizer has been configured yet, then create the
configuration for the backwards-compatible legacy tokenizer.
"""
if properties.get_property(conn, 'tokenizer') is None:
tokenizer = tokenizer_factory.create_tokenizer(config, init_db=False,
module_name='legacy')
tokenizer.migrate_database(config)