forked from hans/Nominatim
introduce tokenizer modules
This adds the boilerplate for selecting configurable tokenizers. A tokenizer can be chosen at import time and will then install itself such that it is fixed for the given database import even when the software itself is updated. The legacy tokenizer implements Nominatim's traditional algorithms.
This commit is contained in:
@@ -144,6 +144,11 @@ def tmp_phplib_dir():
|
||||
|
||||
yield Path(phpdir)
|
||||
|
||||
|
||||
@pytest.fixture
def property_table(table_factory):
    """ Provide an empty nominatim_properties table. """
    table_factory('nominatim_properties', 'property TEXT, value TEXT')
|
||||
|
||||
@pytest.fixture
|
||||
def status_table(temp_db_conn):
|
||||
""" Create an empty version of the status table and
|
||||
|
||||
25
test/python/dummy_tokenizer.py
Normal file
25
test/python/dummy_tokenizer.py
Normal file
@@ -0,0 +1,25 @@
|
||||
"""
|
||||
Tokenizer for testing.
|
||||
"""
|
||||
|
||||
def create(dsn, data_dir):
|
||||
""" Create a new instance of the tokenizer provided by this module.
|
||||
"""
|
||||
return DummyTokenizer(dsn, data_dir)
|
||||
|
||||
class DummyTokenizer:
|
||||
|
||||
def __init__(self, dsn, data_dir):
|
||||
self.dsn = dsn
|
||||
self.data_dir = data_dir
|
||||
self.init_state = None
|
||||
|
||||
|
||||
def init_new_db(self, config):
|
||||
assert self.init_state == None
|
||||
self.init_state = "new"
|
||||
|
||||
|
||||
def init_from_project(self):
|
||||
assert self.init_state == None
|
||||
self.init_state = "loaded"
|
||||
@@ -22,6 +22,7 @@ import nominatim.tools.database_import
|
||||
import nominatim.tools.freeze
|
||||
import nominatim.tools.refresh
|
||||
import nominatim.tools.postcodes
|
||||
import nominatim.tokenizer.factory
|
||||
|
||||
from mocks import MockParamCapture
|
||||
|
||||
@@ -97,6 +98,7 @@ def test_import_full(temp_db, mock_func_factory):
|
||||
mock_func_factory(nominatim.tools.database_import, 'create_partition_tables'),
|
||||
mock_func_factory(nominatim.tools.database_import, 'create_search_indices'),
|
||||
mock_func_factory(nominatim.tools.database_import, 'create_country_names'),
|
||||
mock_func_factory(nominatim.tokenizer.factory, 'create_tokenizer'),
|
||||
mock_func_factory(nominatim.tools.refresh, 'load_address_levels_from_file'),
|
||||
mock_func_factory(nominatim.tools.postcodes, 'import_postcodes'),
|
||||
mock_func_factory(nominatim.indexer.indexer.Indexer, 'index_full'),
|
||||
@@ -120,6 +122,7 @@ def test_import_continue_load_data(temp_db, mock_func_factory):
|
||||
mock_func_factory(nominatim.tools.database_import, 'load_data'),
|
||||
mock_func_factory(nominatim.tools.database_import, 'create_search_indices'),
|
||||
mock_func_factory(nominatim.tools.database_import, 'create_country_names'),
|
||||
mock_func_factory(nominatim.tokenizer.factory, 'create_tokenizer'),
|
||||
mock_func_factory(nominatim.tools.postcodes, 'import_postcodes'),
|
||||
mock_func_factory(nominatim.indexer.indexer.Indexer, 'index_full'),
|
||||
mock_func_factory(nominatim.tools.refresh, 'setup_website'),
|
||||
@@ -137,6 +140,7 @@ def test_import_continue_indexing(temp_db, mock_func_factory, placex_table, temp
|
||||
mock_func_factory(nominatim.tools.database_import, 'create_search_indices'),
|
||||
mock_func_factory(nominatim.tools.database_import, 'create_country_names'),
|
||||
mock_func_factory(nominatim.indexer.indexer.Indexer, 'index_full'),
|
||||
mock_func_factory(nominatim.tokenizer.factory, 'create_tokenizer'),
|
||||
mock_func_factory(nominatim.tools.refresh, 'setup_website'),
|
||||
mock_func_factory(nominatim.db.properties, 'set_property')
|
||||
]
|
||||
@@ -158,6 +162,7 @@ def test_import_continue_postprocess(temp_db, mock_func_factory):
|
||||
mock_func_factory(nominatim.tools.database_import, 'create_search_indices'),
|
||||
mock_func_factory(nominatim.tools.database_import, 'create_country_names'),
|
||||
mock_func_factory(nominatim.tools.refresh, 'setup_website'),
|
||||
mock_func_factory(nominatim.tokenizer.factory, 'create_tokenizer'),
|
||||
mock_func_factory(nominatim.db.properties, 'set_property')
|
||||
]
|
||||
|
||||
|
||||
87
test/python/test_tokenizer_factory.py
Normal file
87
test/python/test_tokenizer_factory.py
Normal file
@@ -0,0 +1,87 @@
|
||||
"""
|
||||
Tests for creating new tokenizers.
|
||||
"""
|
||||
import importlib
|
||||
import pytest
|
||||
|
||||
from nominatim.db import properties
|
||||
from nominatim.tokenizer import factory
|
||||
from nominatim.errors import UsageError
|
||||
import dummy_tokenizer
|
||||
|
||||
@pytest.fixture
def test_config(def_config, tmp_path):
    """ Default configuration with the project directory redirected
        into a fresh temporary directory.
    """
    def_config.project_dir = tmp_path
    return def_config
|
||||
|
||||
|
||||
@pytest.fixture
def tokenizer_import(monkeypatch):
    """ Force the tokenizer factory onto the dummy tokenizer.

        Sets the tokenizer name to 'dummy' and redirects any dynamic
        module import to the dummy_tokenizer test module.
    """
    monkeypatch.setenv('NOMINATIM_TOKENIZER', 'dummy')

    def _fake_import(module, *args, **kwargs):
        # Whatever module name is requested, hand back the test double.
        return dummy_tokenizer

    monkeypatch.setattr(importlib, "import_module", _fake_import)
|
||||
|
||||
|
||||
def test_setup_dummy_tokenizer(temp_db_conn, test_config,
                               tokenizer_import, property_table):
    """ Creating a tokenizer initialises it for a new database, creates
        the project 'tokenizer' directory and records the tokenizer name
        in the property table.
    """
    tok = factory.create_tokenizer(test_config)

    assert isinstance(tok, dummy_tokenizer.DummyTokenizer)
    assert tok.init_state == "new"
    assert (test_config.project_dir / 'tokenizer').is_dir()
    assert properties.get_property(temp_db_conn, 'tokenizer') == 'dummy'
|
||||
|
||||
|
||||
def test_setup_tokenizer_dir_exists(test_config, tokenizer_import, property_table):
    """ Creation still succeeds when the tokenizer directory already exists. """
    (test_config.project_dir / 'tokenizer').mkdir()

    tok = factory.create_tokenizer(test_config)

    assert isinstance(tok, dummy_tokenizer.DummyTokenizer)
    assert tok.init_state == "new"
|
||||
|
||||
|
||||
def test_setup_tokenizer_dir_failure(test_config, tokenizer_import, property_table):
    """ Creation aborts with a usage error when the tokenizer directory
        cannot be created because a plain file of that name is in the way.
    """
    (test_config.project_dir / 'tokenizer').write_text("foo")

    with pytest.raises(UsageError):
        factory.create_tokenizer(test_config)
|
||||
|
||||
|
||||
def test_setup_bad_tokenizer_name(test_config, monkeypatch):
    """ An unresolvable tokenizer name aborts with a usage error.

        The 'dummy' module is not importable here because the
        tokenizer_import fixture (which patches importlib) is not used.
    """
    monkeypatch.setenv('NOMINATIM_TOKENIZER', 'dummy')

    with pytest.raises(UsageError):
        factory.create_tokenizer(test_config)
|
||||
|
||||
def test_load_tokenizer(temp_db_conn, test_config,
                        tokenizer_import, property_table):
    """ A tokenizer installed for a database can be loaded back from it. """
    factory.create_tokenizer(test_config)

    tok = factory.get_tokenizer_for_db(test_config)

    assert isinstance(tok, dummy_tokenizer.DummyTokenizer)
    assert tok.init_state == "loaded"
|
||||
|
||||
|
||||
def test_load_no_tokenizer_dir(test_config, tokenizer_import, property_table):
    """ Loading aborts when the project's tokenizer directory is missing. """
    factory.create_tokenizer(test_config)

    # Point the project dir somewhere that has no 'tokenizer' subdirectory.
    test_config.project_dir = test_config.project_dir / 'foo'

    with pytest.raises(UsageError):
        factory.get_tokenizer_for_db(test_config)
|
||||
|
||||
|
||||
def test_load_missing_propoerty(temp_db_cursor, test_config, tokenizer_import, property_table):
    """ Loading aborts when the tokenizer property was removed
        from the database.
    """
    # NOTE(review): "propoerty" in the function name is a typo for
    # "property" — rename in a follow-up (kept here to preserve the
    # public test identifier).
    factory.create_tokenizer(test_config)

    temp_db_cursor.execute("TRUNCATE TABLE nominatim_properties")

    with pytest.raises(UsageError):
        factory.get_tokenizer_for_db(test_config)
|
||||
|
||||
28
test/python/test_tokenizer_legacy.py
Normal file
28
test/python/test_tokenizer_legacy.py
Normal file
@@ -0,0 +1,28 @@
|
||||
"""
|
||||
Test for legacy tokenizer.
|
||||
"""
|
||||
import pytest
|
||||
|
||||
from nominatim.tokenizer import legacy_tokenizer
|
||||
from nominatim.db import properties
|
||||
|
||||
@pytest.fixture
def tokenizer(dsn, tmp_path, def_config, property_table):
    """ Return a legacy tokenizer freshly initialised for a new database. """
    instance = legacy_tokenizer.create(dsn, tmp_path)
    instance.init_new_db(def_config)
    return instance
|
||||
|
||||
def test_init_new(dsn, tmp_path, def_config, property_table, monkeypatch, temp_db_conn):
    """ Setting up a new database stores the configured term-normalization
        rules under the tokenizer's normalization property.
    """
    monkeypatch.setenv('NOMINATIM_TERM_NORMALIZATION', 'xxvv')

    tok = legacy_tokenizer.create(dsn, tmp_path)
    tok.init_new_db(def_config)

    assert properties.get_property(temp_db_conn, legacy_tokenizer.DBCFG_NORMALIZATION) == 'xxvv'
|
||||
|
||||
|
||||
def test_init_from_project(tokenizer):
    """ Re-initialising from an existing project restores the
        normalization rules on the tokenizer.
    """
    tokenizer.init_from_project()

    assert tokenizer.normalization is not None
|
||||
Reference in New Issue
Block a user