move word table and normalisation SQL into tokenizer

Creating and populating the word table is now the responsibility
of the tokenizer.

The get_maxwordfreq() function has been replaced with a
simple template parameter that is substituted into the SQL during
function installation. The value is taken from the parameter list
in the database to ensure that it cannot change after installation.
This commit is contained in:
Sarah Hoffmann
2021-04-22 22:47:34 +02:00
parent b5540dc35c
commit fbbdd31399
15 changed files with 117 additions and 53 deletions

View File

@@ -286,7 +286,6 @@ def osm2pgsql_options(temp_db):
@pytest.fixture
def sql_preprocessor(temp_db_conn, tmp_path, monkeypatch, table_factory):
monkeypatch.setenv('NOMINATIM_DATABASE_MODULE_PATH', '.')
table_factory('country_name', 'partition INT', (0, 1, 2))
cfg = Configuration(None, SRC_DIR.resolve() / 'settings')
cfg.set_libdirs(module='.', osm2pgsql='.', php=SRC_DIR / 'lib-php',

View File

@@ -139,7 +139,7 @@ def test_import_continue_indexing(temp_db, mock_func_factory, placex_table, temp
mock_func_factory(nominatim.tools.database_import, 'create_search_indices'),
mock_func_factory(nominatim.tools.database_import, 'create_country_names'),
mock_func_factory(nominatim.indexer.indexer.Indexer, 'index_full'),
mock_func_factory(nominatim.tokenizer.factory, 'create_tokenizer'),
mock_func_factory(nominatim.tokenizer.factory, 'get_tokenizer_for_db'),
mock_func_factory(nominatim.tools.refresh, 'setup_website'),
mock_func_factory(nominatim.db.properties, 'set_property')
]
@@ -161,7 +161,7 @@ def test_import_continue_postprocess(temp_db, mock_func_factory):
mock_func_factory(nominatim.tools.database_import, 'create_search_indices'),
mock_func_factory(nominatim.tools.database_import, 'create_country_names'),
mock_func_factory(nominatim.tools.refresh, 'setup_website'),
mock_func_factory(nominatim.tokenizer.factory, 'create_tokenizer'),
mock_func_factory(nominatim.tokenizer.factory, 'get_tokenizer_for_db'),
mock_func_factory(nominatim.db.properties, 'set_property')
]
@@ -242,7 +242,6 @@ def test_special_phrases_command(temp_db, mock_func_factory):
('postcodes', 'update_postcodes'),
('word-counts', 'recompute_word_counts'),
('address-levels', 'load_address_levels_from_file'),
('functions', 'create_functions'),
('wiki-data', 'import_wikipedia_articles'),
('importance', 'recompute_importance'),
('website', 'setup_website'),
@@ -254,6 +253,22 @@ def test_refresh_command(mock_func_factory, temp_db, command, func):
assert func_mock.called == 1
def test_refresh_create_functions(mock_func_factory, monkeypatch, temp_db):
    """ Check that `refresh --functions` calls refresh.create_functions once
        and also invokes update_sql_functions() on the tokenizer returned
        by the tokenizer factory.
    """
    class DummyTokenizer:
        # Stand-in tokenizer that only records that it was asked to
        # update its SQL functions.
        def update_sql_functions(self, *args):
            self.called = True

    tokenizer = DummyTokenizer()

    def _get_tokenizer(*args):
        # Replacement for the factory lookup: always hand out the dummy.
        return tokenizer

    create_functions_mock = mock_func_factory(nominatim.tools.refresh,
                                              'create_functions')
    monkeypatch.setattr(nominatim.tokenizer.factory, 'get_tokenizer_for_db',
                        _get_tokenizer)

    exit_code = call_nominatim('refresh', '--functions')

    assert exit_code == 0
    assert create_functions_mock.called == 1
    # The attribute only exists once update_sql_functions() has run.
    assert hasattr(tokenizer, 'called')
def test_refresh_importance_computed_after_wiki_import(monkeypatch, temp_db):
calls = []
monkeypatch.setattr(nominatim.tools.refresh, 'import_wikipedia_articles',

View File

@@ -24,7 +24,6 @@ def sql_factory(tmp_path):
("'{{db.partitions|join}}'", '012'),
("{% if 'country_name' in db.tables %}'yes'{% else %}'no'{% endif %}", "yes"),
("{% if 'xxx' in db.tables %}'yes'{% else %}'no'{% endif %}", "no"),
("'{{config.DATABASE_MODULE_PATH}}'", '.')
])
def test_load_file_simple(sql_preprocessor, sql_factory, temp_db_conn, temp_db_cursor, expr, ret):
sqlfile = sql_factory("RETURN {};".format(expr))

View File

@@ -1,6 +1,8 @@
"""
Test for legacy tokenizer.
"""
import shutil
import pytest
from nominatim.tokenizer import legacy_tokenizer
@@ -18,6 +20,18 @@ def test_config(def_config, tmp_path):
def_config.lib_dir.module = module_dir
sqldir = tmp_path / 'sql'
sqldir.mkdir()
(sqldir / 'tokenizer').mkdir()
(sqldir / 'tokenizer' / 'legacy_tokenizer.sql').write_text("SELECT 'a'")
(sqldir / 'words.sql').write_text("SELECT 'a'")
shutil.copy(str(def_config.lib_dir.sql / 'tokenizer' / 'legacy_tokenizer_tables.sql'),
str(sqldir / 'tokenizer' / 'legacy_tokenizer_tables.sql'))
def_config.lib_dir.sql = sqldir
def_config.lib_dir.data = sqldir
return def_config
@@ -30,13 +44,15 @@ def tokenizer_factory(dsn, tmp_path, monkeypatch):
return _maker
@pytest.fixture
def tokenizer_setup(tokenizer_factory, test_config, property_table, monkeypatch):
def tokenizer_setup(tokenizer_factory, test_config, property_table,
monkeypatch, sql_preprocessor):
monkeypatch.setattr(legacy_tokenizer, '_check_module' , lambda m, c: None)
tok = tokenizer_factory()
tok.init_new_db(test_config)
def test_init_new(tokenizer_factory, test_config, property_table, monkeypatch, temp_db_conn):
def test_init_new(tokenizer_factory, test_config, property_table, monkeypatch,
temp_db_conn, sql_preprocessor):
monkeypatch.setenv('NOMINATIM_TERM_NORMALIZATION', 'xxvv')
monkeypatch.setattr(legacy_tokenizer, '_check_module' , lambda m, c: None)
@@ -52,7 +68,8 @@ def test_init_new(tokenizer_factory, test_config, property_table, monkeypatch, t
assert outfile.stat().st_mode == 33261
def test_init_module_load_failed(tokenizer_factory, test_config, property_table, monkeypatch, temp_db_conn):
def test_init_module_load_failed(tokenizer_factory, test_config, property_table,
monkeypatch, temp_db_conn):
tok = tokenizer_factory()
with pytest.raises(UsageError):
@@ -60,7 +77,7 @@ def test_init_module_load_failed(tokenizer_factory, test_config, property_table,
def test_init_module_custom(tokenizer_factory, test_config, property_table,
monkeypatch, tmp_path):
monkeypatch, tmp_path, sql_preprocessor):
module_dir = (tmp_path / 'custom').resolve()
module_dir.mkdir()
(module_dir/ 'nominatim.so').write_text('CUSTOM nomiantim.so')

View File

@@ -138,14 +138,14 @@ def test_import_osm_data_default_cache(temp_db_cursor,osm2pgsql_options):
def test_truncate_database_tables(temp_db_conn, temp_db_cursor, table_factory):
tables = ('word', 'placex', 'place_addressline', 'location_area',
tables = ('placex', 'place_addressline', 'location_area',
'location_area_country',
'location_property_tiger', 'location_property_osmline',
'location_postcode', 'search_name', 'location_road_23')
for table in tables:
table_factory(table, content=(1, 2, 3))
database_import.truncate_data_tables(temp_db_conn, max_word_frequency=23)
database_import.truncate_data_tables(temp_db_conn)
for table in tables:
assert temp_db_cursor.table_rows(table) == 0
@@ -163,7 +163,7 @@ def test_load_data(dsn, src_dir, place_row, placex_table, osmline_table, word_ta
place_row(osm_type='W', osm_id=342, cls='place', typ='houses',
geom='SRID=4326;LINESTRING(0 0, 10 10)')
database_import.load_data(dsn, src_dir / 'data', threads)
database_import.load_data(dsn, threads)
assert temp_db_cursor.table_rows('placex') == 30
assert temp_db_cursor.table_rows('location_property_osmline') == 1