diff --git a/nominatim/clicmd/setup.py b/nominatim/clicmd/setup.py
index 2014ff9e..68727972 100644
--- a/nominatim/clicmd/setup.py
+++ b/nominatim/clicmd/setup.py
@@ -56,6 +56,7 @@ class SetupAll:
         from ..tools import refresh
         from ..indexer.indexer import Indexer
         from ..tools import postcodes
+        from ..tokenizer import factory as tokenizer_factory
 
         if args.osm_file and not Path(args.osm_file).is_file():
             LOG.fatal("OSM file '%s' does not exist.", args.osm_file)
@@ -112,6 +113,10 @@ class SetupAll:
                                       args.data_dir,
                                       args.threads or psutil.cpu_count() or 1)
 
+        LOG.warning("Setting up tokenizer")
+        tokenizer = tokenizer_factory.create_tokenizer(args.config)
+
+
         if args.continue_at is None or args.continue_at == 'load-data':
             LOG.warning('Calculate postcodes')
             postcodes.import_postcodes(args.config.get_libpq_dsn(), args.project_dir)
diff --git a/nominatim/tokenizer/__init__.py b/nominatim/tokenizer/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/nominatim/tokenizer/factory.py b/nominatim/tokenizer/factory.py
new file mode 100644
index 00000000..1079c935
--- /dev/null
+++ b/nominatim/tokenizer/factory.py
@@ -0,0 +1,84 @@
+"""
+Functions for creating a tokenizer or initialising the right one for an
+existing database.
+
+A tokenizer is bound to the lifetime of a database. It can be chosen and
+configured before the initial import but then needs to be used consistently
+when querying and updating the database.
+
+This module provides the functions to create and configure a new tokenizer
+as well as to instantiate the appropriate tokenizer for updating an existing
+database.
+
+A tokenizer usually also includes PHP code for querying. The appropriate PHP
+normalizer module is installed when the tokenizer is created.
+"""
+import logging
+import importlib
+
+from ..errors import UsageError
+from ..db import properties
+from ..db.connection import connect
+
+LOG = logging.getLogger()
+
+def _import_tokenizer(name):
+    """ Load the tokenizer module with the given name.
+    """
+    try:
+        return importlib.import_module('nominatim.tokenizer.' + name + '_tokenizer')
+    except ModuleNotFoundError as exp:
+        LOG.fatal("No tokenizer named '%s' available. "
+                  "Check the setting of NOMINATIM_TOKENIZER.", name)
+        raise UsageError('Tokenizer not found') from exp
+
+
+def create_tokenizer(config):
+    """ Create a new tokenizer as defined by the given configuration.
+
+        The tokenizer data and code are copied into the 'tokenizer' directory
+        of the project directory and the tokenizer loaded from its new location.
+    """
+    # Create the directory for the tokenizer data
+    basedir = config.project_dir / 'tokenizer'
+    if not basedir.exists():
+        basedir.mkdir()
+    elif not basedir.is_dir():
+        LOG.fatal("Tokenizer directory '%s' cannot be created.", basedir)
+        raise UsageError("Tokenizer setup failed.")
+
+    tokenizer_module = _import_tokenizer(config.TOKENIZER)
+
+    tokenizer = tokenizer_module.create(config.get_libpq_dsn(), basedir)
+    tokenizer.init_new_db(config)
+
+    with connect(config.get_libpq_dsn()) as conn:
+        properties.set_property(conn, 'tokenizer', config.TOKENIZER)
+
+    return tokenizer
+
+
+def get_tokenizer_for_db(config):
+    """ Instantiate a tokenizer for an existing database.
+
+        The function looks up the appropriate tokenizer in the database
+        and initialises it.
+    """
+    basedir = config.project_dir / 'tokenizer'
+    if not basedir.is_dir():
+        LOG.fatal("Cannot find tokenizer data in '%s'.", basedir)
+        raise UsageError('Cannot initialize tokenizer.')
+
+    with connect(config.get_libpq_dsn()) as conn:
+        name = properties.get_property(conn, 'tokenizer')
+
+    if name is None:
+        LOG.fatal("Tokenizer was not set up properly. Database property missing.")
+        raise UsageError('Cannot initialize tokenizer.')
+
+    tokenizer_module = _import_tokenizer(name)
+
+    tokenizer = tokenizer_module.create(config.get_libpq_dsn(), basedir)
+    tokenizer.init_from_project()
+
+    return tokenizer
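The factory above locates a tokenizer purely by name: _import_tokenizer() imports nominatim.tokenizer.<name>_tokenizer, and create_tokenizer()/get_tokenizer_for_db() then call the module-level create() function and the init_new_db()/init_from_project() methods on the returned object. As a reading aid only, not part of the patch, a minimal tokenizer module inferred from this interface could look like the sketch below; the class name ExampleTokenizer is made up for illustration, the method signatures come from this diff (the legacy and dummy tokenizers that follow use the same shape).

# nominatim/tokenizer/example_tokenizer.py -- hypothetical module, shown only
# to illustrate the interface the factory expects; not part of this patch.

def create(dsn, data_dir):
    """ Entry point the factory expects in every tokenizer module. """
    return ExampleTokenizer(dsn, data_dir)


class ExampleTokenizer:
    """ Hypothetical tokenizer mirroring the interface used by factory.py. """

    def __init__(self, dsn, data_dir):
        self.dsn = dsn            # libpq DSN of the Nominatim database
        self.data_dir = data_dir  # <project_dir>/tokenizer, created by the factory

    def init_new_db(self, config):
        # Called once by create_tokenizer() during the initial import; persist
        # any configuration needed later in data_dir or the database here.
        pass

    def init_from_project(self):
        # Called by get_tokenizer_for_db() when reconnecting to an existing
        # database; restore the configuration saved by init_new_db().
        pass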
diff --git a/nominatim/tokenizer/legacy_tokenizer.py b/nominatim/tokenizer/legacy_tokenizer.py
new file mode 100644
index 00000000..ab3e320e
--- /dev/null
+++ b/nominatim/tokenizer/legacy_tokenizer.py
@@ -0,0 +1,44 @@
+"""
+Tokenizer implementing normalisation as used before Nominatim 4.
+"""
+from nominatim.db.connection import connect
+from nominatim.db import properties
+
+DBCFG_NORMALIZATION = "tokenizer_normalization"
+
+def create(dsn, data_dir):
+    """ Create a new instance of the tokenizer provided by this module.
+    """
+    return LegacyTokenizer(dsn, data_dir)
+
+class LegacyTokenizer:
+    """ The legacy tokenizer uses a special PostgreSQL module to normalize
+        names and queries. The tokenizer thus implements normalization through
+        calls to the database.
+    """
+
+    def __init__(self, dsn, data_dir):
+        self.dsn = dsn
+        self.data_dir = data_dir
+        self.normalization = None
+
+
+    def init_new_db(self, config):
+        """ Set up a new tokenizer for the database.
+
+            This copies all necessary data into the project directory to make
+            sure the tokenizer remains stable even over updates.
+        """
+        self.normalization = config.TERM_NORMALIZATION
+
+        # Stable configuration is saved in the database.
+        with connect(self.dsn) as conn:
+            properties.set_property(conn, DBCFG_NORMALIZATION,
+                                    self.normalization)
+
+
+    def init_from_project(self):
+        """ Initialise the tokenizer from the project directory.
+        """
+        with connect(self.dsn) as conn:
+            self.normalization = properties.get_property(conn, DBCFG_NORMALIZATION)
diff --git a/settings/env.defaults b/settings/env.defaults
index 4069270e..09819c0a 100644
--- a/settings/env.defaults
+++ b/settings/env.defaults
@@ -18,6 +18,12 @@ NOMINATIM_DATABASE_WEBUSER="www-data"
 # Changing this value requires to run 'nominatim refresh --functions'.
 NOMINATIM_DATABASE_MODULE_PATH=
 
+# Tokenizer used for normalizing and parsing queries and names.
+# The tokenizer is set up during import and cannot be changed afterwards
+# without a reimport.
+# Currently available tokenizers: legacy
+NOMINATIM_TOKENIZER="legacy"
+
 # Number of occurances of a word before it is considered frequent.
 # Similar to the concept of stop words. Frequent partial words get ignored
 # or handled differently during search.
diff --git a/test/python/conftest.py b/test/python/conftest.py
index 4b9749c0..d0fdc569 100644
--- a/test/python/conftest.py
+++ b/test/python/conftest.py
@@ -144,6 +144,11 @@ def tmp_phplib_dir():
 
         yield Path(phpdir)
 
+
+@pytest.fixture
+def property_table(table_factory):
+    table_factory('nominatim_properties', 'property TEXT, value TEXT')
+
 @pytest.fixture
 def status_table(temp_db_conn):
     """ Create an empty version of the status table and
diff --git a/test/python/dummy_tokenizer.py b/test/python/dummy_tokenizer.py
new file mode 100644
index 00000000..47cc580c
--- /dev/null
+++ b/test/python/dummy_tokenizer.py
@@ -0,0 +1,25 @@
+"""
+Tokenizer for testing.
+"""
+
+def create(dsn, data_dir):
+    """ Create a new instance of the tokenizer provided by this module.
+    """
+    return DummyTokenizer(dsn, data_dir)
+
+class DummyTokenizer:
+
+    def __init__(self, dsn, data_dir):
+        self.dsn = dsn
+        self.data_dir = data_dir
+        self.init_state = None
+
+
+    def init_new_db(self, config):
+        assert self.init_state is None
+        self.init_state = "new"
+
+
+    def init_from_project(self):
+        assert self.init_state is None
+        self.init_state = "loaded"
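Both tokenizers persist their settings through nominatim.db.properties, which is not part of this diff; the new conftest.py fixture above only reveals that the backing table is nominatim_properties(property TEXT, value TEXT). The sketch below is an assumption about what set_property() and get_property() roughly do, given purely for context; the real SQL lives in the existing properties module and may differ.

# Assumed behaviour of nominatim.db.properties (not taken from this patch):
# plain key/value storage in the nominatim_properties table via psycopg2.

def set_property(conn, name, value):
    with conn.cursor() as cur:
        cur.execute("UPDATE nominatim_properties SET value = %s WHERE property = %s",
                    (value, name))
        if cur.rowcount == 0:
            cur.execute("INSERT INTO nominatim_properties (property, value) VALUES (%s, %s)",
                        (name, value))
    conn.commit()


def get_property(conn, name):
    with conn.cursor() as cur:
        cur.execute("SELECT value FROM nominatim_properties WHERE property = %s", (name,))
        row = cur.fetchone()
    return row[0] if row else None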
diff --git a/test/python/test_cli.py b/test/python/test_cli.py
index afa01e57..6b8bfc55 100644
--- a/test/python/test_cli.py
+++ b/test/python/test_cli.py
@@ -22,6 +22,7 @@ import nominatim.tools.database_import
 import nominatim.tools.freeze
 import nominatim.tools.refresh
 import nominatim.tools.postcodes
+import nominatim.tokenizer.factory
 
 from mocks import MockParamCapture
 
@@ -97,6 +98,7 @@ def test_import_full(temp_db, mock_func_factory):
         mock_func_factory(nominatim.tools.database_import, 'create_partition_tables'),
         mock_func_factory(nominatim.tools.database_import, 'create_search_indices'),
         mock_func_factory(nominatim.tools.database_import, 'create_country_names'),
+        mock_func_factory(nominatim.tokenizer.factory, 'create_tokenizer'),
         mock_func_factory(nominatim.tools.refresh, 'load_address_levels_from_file'),
         mock_func_factory(nominatim.tools.postcodes, 'import_postcodes'),
         mock_func_factory(nominatim.indexer.indexer.Indexer, 'index_full'),
@@ -120,6 +122,7 @@ def test_import_continue_load_data(temp_db, mock_func_factory):
         mock_func_factory(nominatim.tools.database_import, 'load_data'),
         mock_func_factory(nominatim.tools.database_import, 'create_search_indices'),
         mock_func_factory(nominatim.tools.database_import, 'create_country_names'),
+        mock_func_factory(nominatim.tokenizer.factory, 'create_tokenizer'),
         mock_func_factory(nominatim.tools.postcodes, 'import_postcodes'),
         mock_func_factory(nominatim.indexer.indexer.Indexer, 'index_full'),
         mock_func_factory(nominatim.tools.refresh, 'setup_website'),
@@ -137,6 +140,7 @@ def test_import_continue_indexing(temp_db, mock_func_factory, placex_table, temp
         mock_func_factory(nominatim.tools.database_import, 'create_search_indices'),
         mock_func_factory(nominatim.tools.database_import, 'create_country_names'),
         mock_func_factory(nominatim.indexer.indexer.Indexer, 'index_full'),
+        mock_func_factory(nominatim.tokenizer.factory, 'create_tokenizer'),
         mock_func_factory(nominatim.tools.refresh, 'setup_website'),
         mock_func_factory(nominatim.db.properties, 'set_property')
     ]
@@ -158,6 +162,7 @@ def test_import_continue_postprocess(temp_db, mock_func_factory):
         mock_func_factory(nominatim.tools.database_import, 'create_search_indices'),
         mock_func_factory(nominatim.tools.database_import, 'create_country_names'),
         mock_func_factory(nominatim.tools.refresh, 'setup_website'),
+        mock_func_factory(nominatim.tokenizer.factory, 'create_tokenizer'),
         mock_func_factory(nominatim.db.properties, 'set_property')
     ]
 
diff --git a/test/python/test_tokenizer_factory.py b/test/python/test_tokenizer_factory.py
new file mode 100644
index 00000000..63c6915b
--- /dev/null
+++ b/test/python/test_tokenizer_factory.py
@@ -0,0 +1,87 @@
+"""
+Tests for creating new tokenizers.
+"""
+import importlib
+import pytest
+
+from nominatim.db import properties
+from nominatim.tokenizer import factory
+from nominatim.errors import UsageError
+import dummy_tokenizer
+
+@pytest.fixture
+def test_config(def_config, tmp_path):
+    def_config.project_dir = tmp_path
+    return def_config
+
+
+@pytest.fixture
+def tokenizer_import(monkeypatch):
+    monkeypatch.setenv('NOMINATIM_TOKENIZER', 'dummy')
+
+    def _import_dummy(module, *args, **kwargs):
+        return dummy_tokenizer
+
+    monkeypatch.setattr(importlib, "import_module", _import_dummy)
+
+
+def test_setup_dummy_tokenizer(temp_db_conn, test_config,
+                               tokenizer_import, property_table):
+    tokenizer = factory.create_tokenizer(test_config)
+
+    assert isinstance(tokenizer, dummy_tokenizer.DummyTokenizer)
+    assert tokenizer.init_state == "new"
+    assert (test_config.project_dir / 'tokenizer').is_dir()
+
+    assert properties.get_property(temp_db_conn, 'tokenizer') == 'dummy'
+
+
+def test_setup_tokenizer_dir_exists(test_config, tokenizer_import, property_table):
+    (test_config.project_dir / 'tokenizer').mkdir()
+
+    tokenizer = factory.create_tokenizer(test_config)
+
+    assert isinstance(tokenizer, dummy_tokenizer.DummyTokenizer)
+    assert tokenizer.init_state == "new"
+
+
+def test_setup_tokenizer_dir_failure(test_config, tokenizer_import, property_table):
+    (test_config.project_dir / 'tokenizer').write_text("foo")
+
+    with pytest.raises(UsageError):
+        factory.create_tokenizer(test_config)
+
+
+def test_setup_bad_tokenizer_name(test_config, monkeypatch):
+    monkeypatch.setenv('NOMINATIM_TOKENIZER', 'dummy')
+
+    with pytest.raises(UsageError):
+        factory.create_tokenizer(test_config)
+
+def test_load_tokenizer(temp_db_conn, test_config,
+                        tokenizer_import, property_table):
+    factory.create_tokenizer(test_config)
+
+    tokenizer = factory.get_tokenizer_for_db(test_config)
+
+    assert isinstance(tokenizer, dummy_tokenizer.DummyTokenizer)
+    assert tokenizer.init_state == "loaded"
+
+
+def test_load_no_tokenizer_dir(test_config, tokenizer_import, property_table):
+    factory.create_tokenizer(test_config)
+
+    test_config.project_dir = test_config.project_dir / 'foo'
+
+    with pytest.raises(UsageError):
+        factory.get_tokenizer_for_db(test_config)
+
+
+def test_load_missing_property(temp_db_cursor, test_config, tokenizer_import, property_table):
+    factory.create_tokenizer(test_config)
+
+    temp_db_cursor.execute("TRUNCATE TABLE nominatim_properties")
+
+    with pytest.raises(UsageError):
+        factory.get_tokenizer_for_db(test_config)
+
diff --git a/test/python/test_tokenizer_legacy.py b/test/python/test_tokenizer_legacy.py
new file mode 100644
index 00000000..cb6fb00b
--- /dev/null
+++ b/test/python/test_tokenizer_legacy.py
@@ -0,0 +1,28 @@
+"""
+Test for legacy tokenizer.
+"""
+import pytest
+
+from nominatim.tokenizer import legacy_tokenizer
+from nominatim.db import properties
+
+@pytest.fixture
+def tokenizer(dsn, tmp_path, def_config, property_table):
+    tok = legacy_tokenizer.create(dsn, tmp_path)
+    tok.init_new_db(def_config)
+
+    return tok
+
+def test_init_new(dsn, tmp_path, def_config, property_table, monkeypatch, temp_db_conn):
+    monkeypatch.setenv('NOMINATIM_TERM_NORMALIZATION', 'xxvv')
+
+    tok = legacy_tokenizer.create(dsn, tmp_path)
+    tok.init_new_db(def_config)
+
+    assert properties.get_property(temp_db_conn, legacy_tokenizer.DBCFG_NORMALIZATION) == 'xxvv'
+
+
+def test_init_from_project(tokenizer):
+    tokenizer.init_from_project()
+
+    assert tokenizer.normalization is not None
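The new tests exercise the factory only with the dummy tokenizer and the legacy tokenizer only directly. A round trip through the factory with the default legacy tokenizer is not covered by this patch; reusing the fixtures defined above (def_config, property_table and temp_db_conn are assumed to come from conftest.py and to point at the test database), such a test would look roughly like this sketch:

def test_factory_legacy_roundtrip(def_config, tmp_path, property_table, temp_db_conn):
    # Hypothetical test, not part of this patch: NOMINATIM_TOKENIZER defaults
    # to 'legacy', so create_tokenizer() stores 'legacy' in nominatim_properties
    # and get_tokenizer_for_db() should restore the same tokenizer afterwards.
    from nominatim.tokenizer import factory

    def_config.project_dir = tmp_path
    factory.create_tokenizer(def_config)

    tok = factory.get_tokenizer_for_db(def_config)
    assert tok.normalization is not None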