mirror of
https://github.com/osm-search/Nominatim.git
synced 2026-02-25 18:48:15 +00:00
Adds a migration that initialises a legacy tokenizer for an existing database. The migration is not active yet as it will need completion when more functionality is added to the legacy tokenizer.
125 lines
4.2 KiB
Python
125 lines
4.2 KiB
Python
"""
|
|
Tokenizer implementing normalisation as used before Nominatim 4.
|
|
"""
|
|
import logging
|
|
import shutil
|
|
|
|
import psycopg2
|
|
|
|
from nominatim.db.connection import connect
|
|
from nominatim.db import properties
|
|
from nominatim.errors import UsageError
|
|
|
|
# Name of the database property under which the tokenizer's normalization
# setting is stored; used as the key for properties.get_property() and
# properties.set_property() in this module.
DBCFG_NORMALIZATION = "tokenizer_normalization"
|
|
|
|
# Logger used by the functions in this module. Calling getLogger() without
# a name yields the root logger.
LOG = logging.getLogger()
|
|
|
|
def create(dsn, data_dir):
    """ Factory entry point of this module: build the tokenizer object.

        Both arguments are handed on unchanged to the LegacyTokenizer
        constructor and the freshly created instance is returned.
    """
    tokenizer = LegacyTokenizer(dsn, data_dir)
    return tokenizer
|
|
|
|
|
|
def _install_module(config_module_path, src_dir, module_dir):
    """ Make the PostgreSQL normalisation module available and return the
        directory it is to be loaded from.

        Three situations are distinguished:

        * 'config_module_path' is set: the custom location is returned
          untouched and nothing is copied.
        * 'src_dir' and 'module_dir' refer to the same existing directory
          (installation run from the build directory): the module is left
          alone and 'module_dir' is returned.
        * otherwise: 'nominatim.so' is copied from 'src_dir' into
          'module_dir' (created when missing), made executable and
          'module_dir' is returned.

        For historical reasons the module lives in the '/module'
        subdirectory and not with the other tokenizer data.
    """
    lib_name = 'nominatim.so'

    if config_module_path:
        # User-supplied locations are taken over verbatim.
        LOG.info("Using custom path for database module at '%s'", config_module_path)
        location = config_module_path
    elif module_dir.exists() and src_dir.samefile(module_dir):
        # Compatibility mode for installations run from the build directory.
        LOG.info('Running from build directory. Leaving database module as is.')
        location = module_dir
    else:
        # Regular case: put a private copy into the project directory.
        if not module_dir.exists():
            module_dir.mkdir()

        installed_lib = module_dir / lib_name
        source_lib = src_dir / lib_name
        shutil.copy(str(source_lib), str(installed_lib))
        installed_lib.chmod(0o755)

        LOG.info('Database module installed at %s', str(installed_lib))
        location = module_dir

    return location
|
|
|
|
|
|
def _check_module(module_dir, conn):
    """ Try to use the PostgreSQL module to confirm that it is installed
        correctly and can be loaded by the database server.

        A throw-away C function backed by '<module_dir>/nominatim.so' is
        created and immediately dropped again using a cursor of 'conn'.

        Parameters:
          module_dir: directory containing 'nominatim.so' (string or path).
          conn: open database connection; assumed to follow the psycopg2
                cursor API with %s-style query parameters.

        Raises a UsageError (chained to the original error) when the
        database reports a psycopg2.DatabaseError.
    """
    module_file = '{}/nominatim.so'.format(module_dir)

    with conn.cursor() as cur:
        try:
            # The library path is handed over as a query parameter so that
            # the driver takes care of quoting. Splicing it into the SQL
            # text would break on paths that contain a quote character.
            cur.execute("""CREATE FUNCTION nominatim_test_import_func(text)
                           RETURNS text AS %s, 'transliteration'
                           LANGUAGE c IMMUTABLE STRICT;
                           DROP FUNCTION nominatim_test_import_func(text)
                        """, (module_file, ))
        except psycopg2.DatabaseError as err:
            LOG.fatal("Error accessing database module: %s", err)
            raise UsageError("Database module cannot be accessed.") from err
|
|
|
|
|
|
class LegacyTokenizer:
    """ The legacy tokenizer uses a special PostgreSQL module to normalize
        names and queries. The tokenizer thus implements normalization through
        calls to the database.

        Attributes:
          dsn: connection string handed to connect() whenever the
               database needs to be accessed.
          data_dir: directory given at construction time; only stored here.
          normalization: the normalization setting; None until it is set by
                         init_new_db(), init_from_project() or
                         migrate_database().
    """

    def __init__(self, dsn, data_dir):
        self.dsn = dsn
        self.data_dir = data_dir
        self.normalization = None


    def init_new_db(self, config):
        """ Set up a new tokenizer for the database.

            This copies all necessary data in the project directory to make
            sure the tokenizer remains stable even over updates.

            The database module is installed via _install_module(), the
            normalization setting is taken from config.TERM_NORMALIZATION,
            the module is checked for accessibility and the setting is
            saved as a database property.
        """
        module_dir = _install_module(config.DATABASE_MODULE_PATH,
                                     config.lib_dir.module,
                                     config.project_dir / 'module')

        self.normalization = config.TERM_NORMALIZATION

        with connect(self.dsn) as conn:
            _check_module(module_dir, conn)
            self._save_config(conn)


    def init_from_project(self):
        """ Initialise the tokenizer from the project directory.

            The normalization setting is read back from the database
            property it was saved under.
        """
        with connect(self.dsn) as conn:
            self.normalization = properties.get_property(conn, DBCFG_NORMALIZATION)


    def migrate_database(self, config):
        """ Initialise the project directory of an existing database for
            use with this tokenizer.

            This is a special migration function for updating existing databases
            to new software versions.

            Like init_new_db(), it installs and checks the database module
            and saves the normalization setting from the configuration as
            a database property.
        """
        module_dir = _install_module(config.DATABASE_MODULE_PATH,
                                     config.lib_dir.module,
                                     config.project_dir / 'module')

        # The setting must be read from the configuration before it is
        # saved. On a freshly created tokenizer the attribute is still None
        # and that is what would end up in the database otherwise.
        self.normalization = config.TERM_NORMALIZATION

        with connect(self.dsn) as conn:
            _check_module(module_dir, conn)
            self._save_config(conn)


    def _save_config(self, conn):
        """ Save the configuration that needs to remain stable for the given
            database as database properties.

            Currently this is the normalization setting only, written with
            properties.set_property() through the connection 'conn'.
        """
        properties.set_property(conn, DBCFG_NORMALIZATION, self.normalization)
|