mirror of
https://github.com/osm-search/Nominatim.git
synced 2026-02-15 02:47:59 +00:00
move word table and normalisation SQL into tokenizer
Creating and populating the word table is now the responsibility of the tokenizer. The get_maxwordfreq() function has been replaced with a simple template parameter that is passed to the SQL when the functions are installed. Its value is taken from the parameter list in the database to ensure that it does not change after installation.
This commit is contained in:
@@ -286,7 +286,6 @@ def osm2pgsql_options(temp_db):
|
||||
|
||||
@pytest.fixture
|
||||
def sql_preprocessor(temp_db_conn, tmp_path, monkeypatch, table_factory):
|
||||
monkeypatch.setenv('NOMINATIM_DATABASE_MODULE_PATH', '.')
|
||||
table_factory('country_name', 'partition INT', (0, 1, 2))
|
||||
cfg = Configuration(None, SRC_DIR.resolve() / 'settings')
|
||||
cfg.set_libdirs(module='.', osm2pgsql='.', php=SRC_DIR / 'lib-php',
|
||||
|
||||
@@ -139,7 +139,7 @@ def test_import_continue_indexing(temp_db, mock_func_factory, placex_table, temp
|
||||
mock_func_factory(nominatim.tools.database_import, 'create_search_indices'),
|
||||
mock_func_factory(nominatim.tools.database_import, 'create_country_names'),
|
||||
mock_func_factory(nominatim.indexer.indexer.Indexer, 'index_full'),
|
||||
mock_func_factory(nominatim.tokenizer.factory, 'create_tokenizer'),
|
||||
mock_func_factory(nominatim.tokenizer.factory, 'get_tokenizer_for_db'),
|
||||
mock_func_factory(nominatim.tools.refresh, 'setup_website'),
|
||||
mock_func_factory(nominatim.db.properties, 'set_property')
|
||||
]
|
||||
@@ -161,7 +161,7 @@ def test_import_continue_postprocess(temp_db, mock_func_factory):
|
||||
mock_func_factory(nominatim.tools.database_import, 'create_search_indices'),
|
||||
mock_func_factory(nominatim.tools.database_import, 'create_country_names'),
|
||||
mock_func_factory(nominatim.tools.refresh, 'setup_website'),
|
||||
mock_func_factory(nominatim.tokenizer.factory, 'create_tokenizer'),
|
||||
mock_func_factory(nominatim.tokenizer.factory, 'get_tokenizer_for_db'),
|
||||
mock_func_factory(nominatim.db.properties, 'set_property')
|
||||
]
|
||||
|
||||
@@ -242,7 +242,6 @@ def test_special_phrases_command(temp_db, mock_func_factory):
|
||||
('postcodes', 'update_postcodes'),
|
||||
('word-counts', 'recompute_word_counts'),
|
||||
('address-levels', 'load_address_levels_from_file'),
|
||||
('functions', 'create_functions'),
|
||||
('wiki-data', 'import_wikipedia_articles'),
|
||||
('importance', 'recompute_importance'),
|
||||
('website', 'setup_website'),
|
||||
@@ -254,6 +253,22 @@ def test_refresh_command(mock_func_factory, temp_db, command, func):
|
||||
assert func_mock.called == 1
|
||||
|
||||
|
||||
def test_refresh_create_functions(mock_func_factory, monkeypatch, temp_db):
|
||||
class DummyTokenizer:
|
||||
def update_sql_functions(self, *args):
|
||||
self.called = True
|
||||
|
||||
func_mock = mock_func_factory(nominatim.tools.refresh, 'create_functions')
|
||||
tok = DummyTokenizer()
|
||||
monkeypatch.setattr(nominatim.tokenizer.factory, 'get_tokenizer_for_db' ,
|
||||
lambda *args: tok)
|
||||
|
||||
|
||||
assert 0 == call_nominatim('refresh', '--functions')
|
||||
assert func_mock.called == 1
|
||||
assert hasattr(tok, 'called')
|
||||
|
||||
|
||||
def test_refresh_importance_computed_after_wiki_import(monkeypatch, temp_db):
|
||||
calls = []
|
||||
monkeypatch.setattr(nominatim.tools.refresh, 'import_wikipedia_articles',
|
||||
|
||||
@@ -24,7 +24,6 @@ def sql_factory(tmp_path):
|
||||
("'{{db.partitions|join}}'", '012'),
|
||||
("{% if 'country_name' in db.tables %}'yes'{% else %}'no'{% endif %}", "yes"),
|
||||
("{% if 'xxx' in db.tables %}'yes'{% else %}'no'{% endif %}", "no"),
|
||||
("'{{config.DATABASE_MODULE_PATH}}'", '.')
|
||||
])
|
||||
def test_load_file_simple(sql_preprocessor, sql_factory, temp_db_conn, temp_db_cursor, expr, ret):
|
||||
sqlfile = sql_factory("RETURN {};".format(expr))
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
"""
|
||||
Test for legacy tokenizer.
|
||||
"""
|
||||
import shutil
|
||||
|
||||
import pytest
|
||||
|
||||
from nominatim.tokenizer import legacy_tokenizer
|
||||
@@ -18,6 +20,18 @@ def test_config(def_config, tmp_path):
|
||||
|
||||
def_config.lib_dir.module = module_dir
|
||||
|
||||
sqldir = tmp_path / 'sql'
|
||||
sqldir.mkdir()
|
||||
(sqldir / 'tokenizer').mkdir()
|
||||
(sqldir / 'tokenizer' / 'legacy_tokenizer.sql').write_text("SELECT 'a'")
|
||||
(sqldir / 'words.sql').write_text("SELECT 'a'")
|
||||
shutil.copy(str(def_config.lib_dir.sql / 'tokenizer' / 'legacy_tokenizer_tables.sql'),
|
||||
str(sqldir / 'tokenizer' / 'legacy_tokenizer_tables.sql'))
|
||||
|
||||
def_config.lib_dir.sql = sqldir
|
||||
def_config.lib_dir.data = sqldir
|
||||
|
||||
|
||||
return def_config
|
||||
|
||||
|
||||
@@ -30,13 +44,15 @@ def tokenizer_factory(dsn, tmp_path, monkeypatch):
|
||||
return _maker
|
||||
|
||||
@pytest.fixture
|
||||
def tokenizer_setup(tokenizer_factory, test_config, property_table, monkeypatch):
|
||||
def tokenizer_setup(tokenizer_factory, test_config, property_table,
|
||||
monkeypatch, sql_preprocessor):
|
||||
monkeypatch.setattr(legacy_tokenizer, '_check_module' , lambda m, c: None)
|
||||
tok = tokenizer_factory()
|
||||
tok.init_new_db(test_config)
|
||||
|
||||
|
||||
def test_init_new(tokenizer_factory, test_config, property_table, monkeypatch, temp_db_conn):
|
||||
def test_init_new(tokenizer_factory, test_config, property_table, monkeypatch,
|
||||
temp_db_conn, sql_preprocessor):
|
||||
monkeypatch.setenv('NOMINATIM_TERM_NORMALIZATION', 'xxvv')
|
||||
monkeypatch.setattr(legacy_tokenizer, '_check_module' , lambda m, c: None)
|
||||
|
||||
@@ -52,7 +68,8 @@ def test_init_new(tokenizer_factory, test_config, property_table, monkeypatch, t
|
||||
assert outfile.stat().st_mode == 33261
|
||||
|
||||
|
||||
def test_init_module_load_failed(tokenizer_factory, test_config, property_table, monkeypatch, temp_db_conn):
|
||||
def test_init_module_load_failed(tokenizer_factory, test_config, property_table,
|
||||
monkeypatch, temp_db_conn):
|
||||
tok = tokenizer_factory()
|
||||
|
||||
with pytest.raises(UsageError):
|
||||
@@ -60,7 +77,7 @@ def test_init_module_load_failed(tokenizer_factory, test_config, property_table,
|
||||
|
||||
|
||||
def test_init_module_custom(tokenizer_factory, test_config, property_table,
|
||||
monkeypatch, tmp_path):
|
||||
monkeypatch, tmp_path, sql_preprocessor):
|
||||
module_dir = (tmp_path / 'custom').resolve()
|
||||
module_dir.mkdir()
|
||||
(module_dir/ 'nominatim.so').write_text('CUSTOM nomiantim.so')
|
||||
|
||||
@@ -138,14 +138,14 @@ def test_import_osm_data_default_cache(temp_db_cursor,osm2pgsql_options):
|
||||
|
||||
|
||||
def test_truncate_database_tables(temp_db_conn, temp_db_cursor, table_factory):
|
||||
tables = ('word', 'placex', 'place_addressline', 'location_area',
|
||||
tables = ('placex', 'place_addressline', 'location_area',
|
||||
'location_area_country',
|
||||
'location_property_tiger', 'location_property_osmline',
|
||||
'location_postcode', 'search_name', 'location_road_23')
|
||||
for table in tables:
|
||||
table_factory(table, content=(1, 2, 3))
|
||||
|
||||
database_import.truncate_data_tables(temp_db_conn, max_word_frequency=23)
|
||||
database_import.truncate_data_tables(temp_db_conn)
|
||||
|
||||
for table in tables:
|
||||
assert temp_db_cursor.table_rows(table) == 0
|
||||
@@ -163,7 +163,7 @@ def test_load_data(dsn, src_dir, place_row, placex_table, osmline_table, word_ta
|
||||
place_row(osm_type='W', osm_id=342, cls='place', typ='houses',
|
||||
geom='SRID=4326;LINESTRING(0 0, 10 10)')
|
||||
|
||||
database_import.load_data(dsn, src_dir / 'data', threads)
|
||||
database_import.load_data(dsn, threads)
|
||||
|
||||
assert temp_db_cursor.table_rows('placex') == 30
|
||||
assert temp_db_cursor.table_rows('location_property_osmline') == 1
|
||||
|
||||
Reference in New Issue
Block a user