mirror of
https://github.com/osm-search/Nominatim.git
synced 2026-02-16 15:47:58 +00:00
This adds an installation step for the tokenizer's PHP code. The PHP code is split into two parts. The updatable code lives in lib-php. The tokenizer installs an additional script in the project directory, which includes the code from lib-php and defines all settings that are static to the database. The website code then always includes the PHP from the project directory.
160 lines
5.3 KiB
Python
160 lines
5.3 KiB
Python
"""
|
|
Tests for the legacy tokenizer.
|
|
"""
|
|
import shutil
|
|
|
|
import pytest
|
|
|
|
from nominatim.tokenizer import legacy_tokenizer
|
|
from nominatim.db import properties
|
|
from nominatim.errors import UsageError
|
|
|
|
@pytest.fixture
def test_config(def_config, tmp_path):
    """ Return the default configuration redirected to temporary directories.

        The project directory, the module source directory (holding a
        dummy `nominatim.so`) and the SQL/data directory are all created
        below `tmp_path`.
    """
    project_root = tmp_path / 'project'
    def_config.project_dir = project_root
    project_root.mkdir()

    # Fake module source containing a placeholder shared library.
    module_src = tmp_path / 'module_src'
    module_src.mkdir()
    fake_library = module_src / 'nominatim.so'
    fake_library.write_text('TEST nomiantim.so')

    def_config.lib_dir.module = module_src

    # Minimal SQL tree: stub scripts plus the table definitions copied
    # from the configured SQL directory.
    sql_root = tmp_path / 'sql'
    sql_root.mkdir()
    tokenizer_sql = sql_root / 'tokenizer'
    tokenizer_sql.mkdir()
    (tokenizer_sql / 'legacy_tokenizer.sql').write_text("SELECT 'a'")
    (sql_root / 'words.sql').write_text("SELECT 'a'")

    tables_file = 'legacy_tokenizer_tables.sql'
    shutil.copy(str(def_config.lib_dir.sql / 'tokenizer' / tables_file),
                str(tokenizer_sql / tables_file))

    # Only redirect the library directories after the copy above has
    # read from the original SQL location.
    def_config.lib_dir.sql = sql_root
    def_config.lib_dir.data = sql_root

    return def_config
|
|
|
|
|
|
@pytest.fixture
def tokenizer_factory(dsn, tmp_path, monkeypatch, property_table):
    """ Return a callable that creates a legacy tokenizer for `dsn`,
        using a `tokenizer` directory below `tmp_path`.
    """
    tokenizer_dir = tmp_path / 'tokenizer'
    tokenizer_dir.mkdir()

    return lambda: legacy_tokenizer.create(dsn, tokenizer_dir)
|
|
|
|
@pytest.fixture
def tokenizer_setup(tokenizer_factory, test_config, monkeypatch, sql_preprocessor):
    """ Initialise a new tokenizer database with the test configuration,
        with the `_check_module` function replaced by a no-op.
    """
    def _skip_module_check(m, c):
        return None

    monkeypatch.setattr(legacy_tokenizer, '_check_module', _skip_module_check)
    tokenizer_factory().init_new_db(test_config)
|
|
|
|
|
|
@pytest.fixture
def analyzer(tokenizer_factory, test_config, monkeypatch, sql_preprocessor,
             word_table, temp_db_with_extensions, tmp_path):
    """ Yield a name analyzer from a freshly initialised legacy tokenizer.

        The tokenizer SQL file is replaced by a stub that defines
        `getorcreate_housenumber_id` to always return 342.
    """
    stub_sql = tmp_path / 'sql' / 'tokenizer' / 'legacy_tokenizer.sql'
    stub_sql.write_text("""
        CREATE OR REPLACE FUNCTION getorcreate_housenumber_id(lookup_word TEXT)
          RETURNS INTEGER AS $$ SELECT 342; $$ LANGUAGE SQL;
        """)

    monkeypatch.setenv('NOMINATIM_TERM_NORMALIZATION', ':: lower();')
    monkeypatch.setattr(legacy_tokenizer, '_check_module', lambda m, c: None)
    tokenizer = tokenizer_factory()
    tokenizer.init_new_db(test_config)
    # Revert the patches so that they are only active during initialisation.
    monkeypatch.undo()

    with tokenizer.name_analyzer() as name_analyzer:
        yield name_analyzer
|
|
|
|
|
|
def test_init_new(tokenizer_factory, test_config, monkeypatch,
                  temp_db_conn, sql_preprocessor):
    """ Initialising a new database saves the normalization setting as
        a property and places `nominatim.so` in the project's module
        directory.
    """
    monkeypatch.setattr(legacy_tokenizer, '_check_module', lambda m, c: None)
    monkeypatch.setenv('NOMINATIM_TERM_NORMALIZATION', 'xxvv')

    tokenizer = tokenizer_factory()
    tokenizer.init_new_db(test_config)

    saved_normalization = properties.get_property(
        temp_db_conn, legacy_tokenizer.DBCFG_NORMALIZATION)
    assert saved_normalization == 'xxvv'

    installed_module = test_config.project_dir / 'module' / 'nominatim.so'

    assert installed_module.exists()
    assert installed_module.read_text() == 'TEST nomiantim.so'
    # 0o100755 == 33261: regular file with permissions rwxr-xr-x.
    assert installed_module.stat().st_mode == 0o100755
|
|
|
|
|
|
def test_init_module_load_failed(tokenizer_factory, test_config,
                                 monkeypatch, temp_db_conn):
    """ With `_check_module` left unpatched, `init_new_db` is expected
        to raise a `UsageError`.
    """
    tokenizer = tokenizer_factory()

    with pytest.raises(UsageError):
        tokenizer.init_new_db(test_config)
|
|
|
|
|
|
def test_init_module_custom(tokenizer_factory, test_config,
                            monkeypatch, tmp_path, sql_preprocessor):
    """ When NOMINATIM_DATABASE_MODULE_PATH points to a custom directory,
        no `module` directory is created in the project directory.
    """
    custom_dir = (tmp_path / 'custom').resolve()
    custom_dir.mkdir()
    custom_library = custom_dir / 'nominatim.so'
    custom_library.write_text('CUSTOM nomiantim.so')

    monkeypatch.setattr(legacy_tokenizer, '_check_module', lambda m, c: None)
    monkeypatch.setenv('NOMINATIM_DATABASE_MODULE_PATH', str(custom_dir))

    tokenizer = tokenizer_factory()
    tokenizer.init_new_db(test_config)

    project_module_dir = test_config.project_dir / 'module'
    assert not project_module_dir.exists()
|
|
|
|
|
|
def test_init_from_project(tokenizer_setup, tokenizer_factory):
    """ A tokenizer loaded via `init_from_project` after the database
        was set up has its `normalization` attribute set.
    """
    tokenizer = tokenizer_factory()
    tokenizer.init_from_project()

    assert tokenizer.normalization is not None
|
|
|
|
|
|
def test_update_sql_functions(sql_preprocessor, temp_db_conn,
                              tokenizer_factory, test_config, table_factory,
                              monkeypatch, temp_db_cursor):
    """ `update_sql_functions` runs the tokenizer SQL file with
        `max_word_freq` and `modulepath` substituted.

        The environment variable is reverted before the update, and the
        test still expects the value that was set during initialisation.
    """
    monkeypatch.setattr(legacy_tokenizer, '_check_module', lambda m, c: None)
    monkeypatch.setenv('NOMINATIM_MAX_WORD_FREQUENCY', '1133')
    tokenizer = tokenizer_factory()
    tokenizer.init_new_db(test_config)
    # Revert the patches so the steps below run without them.
    monkeypatch.undo()

    saved_frequency = properties.get_property(
        temp_db_conn, legacy_tokenizer.DBCFG_MAXWORDFREQ)
    assert saved_frequency == '1133'

    table_factory('test', 'txt TEXT')

    # Replace the function file with a template that records the
    # substituted values in the `test` table.
    sql_template = test_config.lib_dir.sql / 'tokenizer' / 'legacy_tokenizer.sql'
    sql_template.write_text("""INSERT INTO test VALUES ('{{max_word_freq}}'),
                                                   ('{{modulepath}}')""")

    tokenizer.update_sql_functions(test_config)

    expected_rows = {('1133', ), (str(test_config.project_dir / 'module'), )}
    assert temp_db_cursor.row_set('SELECT * FROM test') == expected_rows
|
|
|
|
|
|
def test_migrate_database(tokenizer_factory, test_config, temp_db_conn, monkeypatch):
    """ Migrating a database sets the max word frequency and the
        normalization properties and places `nominatim.so` in the
        project's module directory.
    """
    monkeypatch.setattr(legacy_tokenizer, '_check_module', lambda m, c: None)
    tokenizer = tokenizer_factory()
    tokenizer.migrate_database(test_config)

    for property_name in (legacy_tokenizer.DBCFG_MAXWORDFREQ,
                          legacy_tokenizer.DBCFG_NORMALIZATION):
        assert properties.get_property(temp_db_conn, property_name) is not None

    installed_module = test_config.project_dir / 'module' / 'nominatim.so'

    assert installed_module.exists()
    assert installed_module.read_text() == 'TEST nomiantim.so'
    # 0o100755 == 33261: regular file with permissions rwxr-xr-x.
    assert installed_module.stat().st_mode == 0o100755
|
|
|
|
|
|
def test_normalize(analyzer):
    """ The analyzer's `normalize` is expected to turn 'TEsT' into 'test'.
    """
    normalized = analyzer.normalize('TEsT')

    assert normalized == 'test'
|