move module installation to legacy tokenizer

This commit is contained in:
Sarah Hoffmann
2021-04-21 15:00:37 +02:00
parent af968d4903
commit 296a66558f
6 changed files with 124 additions and 98 deletions

View File

@@ -68,12 +68,6 @@ class SetupAll:
args.no_partitions,
rouser=args.config.DATABASE_WEBUSER)
LOG.warning('Installing database module')
with connect(args.config.get_libpq_dsn()) as conn:
database_import.install_module(args.module_dir, args.project_dir,
args.config.DATABASE_MODULE_PATH,
conn=conn)
LOG.warning('Importing OSM data file')
database_import.import_osm_data(Path(args.osm_file),
args.osm2pgsql_options(0, 1),

View File

@@ -1,16 +1,61 @@
"""
Tokenizer implementing normalisation as used before Nominatim 4.
"""
import logging
import shutil
import psycopg2
from nominatim.db.connection import connect
from nominatim.db import properties
from nominatim.errors import UsageError
DBCFG_NORMALIZATION = "tokenizer_normalization"
LOG = logging.getLogger()
def create(dsn, data_dir):
    """ Factory entry point of this tokenizer module.

        Both arguments are handed on unchanged to the LegacyTokenizer
        constructor and the freshly built instance is returned.
    """
    tokenizer = LegacyTokenizer(dsn, data_dir)
    return tokenizer
def _install_module(src_dir, module_dir):
    """ Place a copy of the PostgreSQL normalisation module 'nominatim.so'
        from `src_dir` into `module_dir`, creating `module_dir` when it
        does not exist yet. For historical reasons the module lives in the
        '/module' subdirectory of the project directory instead of next
        to the other tokenizer data.

        When `module_dir` already exists and is the very same directory as
        `src_dir`, the installation is being run from the build directory
        and the module is left untouched.
    """
    if module_dir.exists():
        if src_dir.samefile(module_dir):
            LOG.info('Running from build directory. Leaving database module as is.')
            return
    else:
        module_dir.mkdir()

    source = src_dir / 'nominatim.so'
    target = module_dir / 'nominatim.so'

    shutil.copy(str(source), str(target))
    # Owner may write, everybody may read and execute the library.
    target.chmod(0o755)

    LOG.info('Database module installed at %s', str(target))
def _check_module(module_dir, conn):
    """ Check that the normalisation module in `module_dir` can be loaded
        by the database behind `conn`.

        A throw-away C function bound to the 'transliteration' symbol of
        '<module_dir>/nominatim.so' is created and dropped again within
        one statement.

        Raises a UsageError (chained to the original database error) when
        the database reports an error while doing so.
    """
    # Hand the library path over as a query parameter instead of formatting
    # it into the SQL text, so that paths containing quote characters are
    # escaped by the database driver and cannot break the statement.
    library = '{}/nominatim.so'.format(module_dir)

    with conn.cursor() as cur:
        try:
            cur.execute("""CREATE FUNCTION nominatim_test_import_func(text)
                           RETURNS text AS %s, 'transliteration'
                           LANGUAGE c IMMUTABLE STRICT;
                           DROP FUNCTION nominatim_test_import_func(text)
                        """, (library, ))
        except psycopg2.DatabaseError as err:
            LOG.fatal("Error accessing database module: %s", err)
            raise UsageError("Database module cannot be accessed.") from err
class LegacyTokenizer:
""" The legacy tokenizer uses a special PostgreSQL module to normalize
names and queries. The tokenizer thus implements normalization through
@@ -29,12 +74,24 @@ class LegacyTokenizer:
This copies all necessary data in the project directory to make
sure the tokenizer remains stable even over updates.
"""
# Find and optionally install the PostgreSQL normalization module.
if config.DATABASE_MODULE_PATH:
LOG.info("Using custom path for database module at '%s'",
config.DATABASE_MODULE_PATH)
module_dir = config.DATABASE_MODULE_PATH
else:
_install_module(config.lib_dir.module, config.project_dir / 'module')
module_dir = config.project_dir / 'module'
self.normalization = config.TERM_NORMALIZATION
# Stable configuration is saved in the database.
with connect(self.dsn) as conn:
properties.set_property(conn, DBCFG_NORMALIZATION,
self.normalization)
_check_module(module_dir, conn)
# Stable configuration is saved in the database.
properties.set_property(conn, DBCFG_NORMALIZATION, self.normalization)
conn.commit()
def init_from_project(self):

View File

@@ -5,11 +5,9 @@ import logging
import os
import selectors
import subprocess
import shutil
from pathlib import Path
import psutil
import psycopg2
from nominatim.db.connection import connect, get_pg_env
from nominatim.db import utils as db_utils
@@ -89,49 +87,6 @@ def setup_extensions(conn):
raise UsageError('PostGIS version is too old.')
def install_module(src_dir, project_dir, module_dir, conn=None):
    """ Copy the normalization module from `src_dir` into the project
        directory under the '/module' directory. If `module_dir` is set,
        then the module from there is used instead and nothing is copied.

        The function detects when the installation is run from the
        build directory. It doesn't touch the module in that case.

        If `conn` is given, then the function also tests if the module
        can be accessed via the given database. A UsageError is raised
        when the database reports an error during that test.
    """
    if module_dir:
        LOG.info("Using custom path for database module at '%s'", module_dir)
    else:
        module_dir = project_dir / 'module'

        if module_dir.exists() and src_dir.samefile(module_dir):
            LOG.info('Running from build directory. Leaving database module as is.')
        else:
            if not module_dir.exists():
                module_dir.mkdir()

            destfile = module_dir / 'nominatim.so'
            shutil.copy(str(src_dir / 'nominatim.so'), str(destfile))
            destfile.chmod(0o755)

            LOG.info('Database module installed at %s', str(destfile))

    if conn is not None:
        # Pass the library path as a query parameter rather than formatting
        # it into the SQL text, so that quote characters in the path are
        # escaped by the database driver.
        library = '{}/nominatim.so'.format(module_dir)

        with conn.cursor() as cur:
            try:
                cur.execute("""CREATE FUNCTION nominatim_test_import_func(text)
                               RETURNS text AS %s, 'transliteration'
                               LANGUAGE c IMMUTABLE STRICT;
                               DROP FUNCTION nominatim_test_import_func(text)
                            """, (library, ))
            except psycopg2.DatabaseError as err:
                LOG.fatal("Error accessing database module: %s", err)
                raise UsageError("Database module cannot be accessed.") from err
def import_base_data(dsn, sql_dir, ignore_partitions=False):
""" Create and populate the tables with basic static data that provides
the background for geocoding. Data is assumed to not yet exist.

View File

@@ -88,7 +88,6 @@ def test_import_bad_file(temp_db):
def test_import_full(temp_db, mock_func_factory):
mocks = [
mock_func_factory(nominatim.tools.database_import, 'setup_database_skeleton'),
mock_func_factory(nominatim.tools.database_import, 'install_module'),
mock_func_factory(nominatim.tools.database_import, 'import_osm_data'),
mock_func_factory(nominatim.tools.refresh, 'import_wikipedia_articles'),
mock_func_factory(nominatim.tools.database_import, 'truncate_data_tables'),

View File

@@ -5,24 +5,78 @@ import pytest
from nominatim.tokenizer import legacy_tokenizer
from nominatim.db import properties
from nominatim.errors import UsageError
@pytest.fixture
def tokenizer(dsn, tmp_path, def_config, property_table):
tok = legacy_tokenizer.create(dsn, tmp_path)
tok.init_new_db(def_config)
def test_config(def_config, tmp_path):
def_config.project_dir = tmp_path / 'project'
def_config.project_dir.mkdir()
return tok
module_dir = tmp_path / 'module_src'
module_dir.mkdir()
(module_dir / 'nominatim.so').write_text('TEST nomiantim.so')
def test_init_new(dsn, tmp_path, def_config, property_table, monkeypatch, temp_db_conn):
def_config.lib_dir.module = module_dir
return def_config
@pytest.fixture
def tokenizer_factory(dsn, tmp_path, monkeypatch):
    """ Provide a callable without arguments that creates a new legacy
        tokenizer for the test database, with the 'tokenizer' directory
        below the test's temporary path as its data directory.
    """
    data_dir = tmp_path / 'tokenizer'

    def _make_tokenizer():
        return legacy_tokenizer.create(dsn, data_dir)

    return _make_tokenizer
@pytest.fixture
def tokenizer_setup(tokenizer_factory, test_config, property_table, monkeypatch):
    """ Run the new-database initialisation of a legacy tokenizer once,
        with the database module check replaced by a no-op.
    """
    def _skip_module_check(module_dir, conn):
        return None

    monkeypatch.setattr(legacy_tokenizer, '_check_module', _skip_module_check)

    tokenizer_factory().init_new_db(test_config)
def test_init_new(tokenizer_factory, test_config, property_table, monkeypatch, temp_db_conn):
monkeypatch.setenv('NOMINATIM_TERM_NORMALIZATION', 'xxvv')
monkeypatch.setattr(legacy_tokenizer, '_check_module' , lambda m, c: None)
tok = legacy_tokenizer.create(dsn, tmp_path)
tok.init_new_db(def_config)
tok = tokenizer_factory()
tok.init_new_db(test_config)
assert properties.get_property(temp_db_conn, legacy_tokenizer.DBCFG_NORMALIZATION) == 'xxvv'
outfile = test_config.project_dir / 'module' / 'nominatim.so'
def test_init_from_project(tokenizer):
tokenizer.init_from_project()
assert outfile.exists()
assert outfile.read_text() == 'TEST nomiantim.so'
assert outfile.stat().st_mode == 33261
assert tokenizer.normalization is not None
def test_init_module_load_failed(tokenizer_factory, test_config, property_table, monkeypatch, temp_db_conn):
    """ With the real module check in place, initialising a new database
        with the test configuration must end in a UsageError.
    """
    tokenizer = tokenizer_factory()

    with pytest.raises(UsageError):
        tokenizer.init_new_db(test_config)
def test_init_module_custom(tokenizer_factory, test_config, property_table,
                            monkeypatch, tmp_path):
    """ When a custom module path is configured through the environment,
        no 'module' directory must be created in the project directory.
    """
    custom_dir = (tmp_path / 'custom').resolve()
    custom_dir.mkdir()
    custom_lib = custom_dir / 'nominatim.so'
    custom_lib.write_text('CUSTOM nomiantim.so')

    monkeypatch.setenv('NOMINATIM_DATABASE_MODULE_PATH', str(custom_dir))
    # The fake library cannot be loaded by the database, so skip the check.
    monkeypatch.setattr(legacy_tokenizer, '_check_module', lambda m, c: None)

    tokenizer_factory().init_new_db(test_config)

    default_dir = test_config.project_dir / 'module'
    assert not default_dir.exists()
def test_init_from_project(tokenizer_setup, tokenizer_factory):
    """ A second tokenizer instance initialised from the project set up by
        the `tokenizer_setup` fixture must have a normalization value.
    """
    tokenizer = tokenizer_factory()
    tokenizer.init_from_project()

    assert tokenizer.normalization is not None

View File

@@ -80,39 +80,6 @@ def test_setup_extensions_old_postgis(temp_db_conn, monkeypatch):
database_import.setup_extensions(temp_db_conn)
def test_install_module(tmp_path):
    """ Without a custom module path the library is copied into the
        'module' directory of the project with mode 0755.
    """
    source = tmp_path / 'source'
    source.mkdir()
    (source / 'nominatim.so').write_text('TEST nomiantim.so')

    project = tmp_path / 'project'
    project.mkdir()

    database_import.install_module(source, project, '')

    installed = project / 'module' / 'nominatim.so'

    assert installed.exists()
    assert installed.read_text() == 'TEST nomiantim.so'
    # Regular file (0o100000) with permission bits 0o755.
    assert installed.stat().st_mode == 0o100755
def test_install_module_custom(tmp_path):
    """ With a custom module path given, no 'module' directory is created
        in the project directory.
    """
    library = tmp_path / 'nominatim.so'
    library.write_text('TEST nomiantim.so')

    custom_path = str(tmp_path.resolve())
    database_import.install_module(tmp_path, tmp_path, custom_path)

    assert not (tmp_path / 'module').exists()
def test_install_module_fail_access(temp_db_conn, tmp_path):
    """ When a connection is passed and the library is only a text file,
        the installation is expected to fail with a UsageError.
    """
    fake_library = tmp_path / 'nominatim.so'
    fake_library.write_text('TEST nomiantim.so')

    with pytest.raises(UsageError, match='.*module cannot be accessed.*'):
        database_import.install_module(tmp_path, tmp_path, '', conn=temp_db_conn)
def test_import_base_data(src_dir, temp_db, temp_db_cursor):
temp_db_cursor.execute('CREATE EXTENSION hstore')
temp_db_cursor.execute('CREATE EXTENSION postgis')