adapt unit test for new word table

Requires a second wrapper class for the word table with the new
layout. This class is interface-compatible, so that later when
the ICU tokenizer becomes the default, all tests that depend on
behaviour of the default tokenizer can be switched to the other
wrapper.
This commit is contained in:
Sarah Hoffmann
2021-07-22 17:24:43 +02:00
parent eb6814d74e
commit e42878eeda
7 changed files with 225 additions and 125 deletions

View File

@@ -11,6 +11,12 @@ from nominatim.tokenizer.icu_name_processor import ICUNameProcessorRules
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
from nominatim.db import properties
from mock_icu_word_table import MockIcuWordTable
@pytest.fixture
def word_table(temp_db_conn):
return MockIcuWordTable(temp_db_conn)
@pytest.fixture
def test_config(def_config, tmp_path):
@@ -21,8 +27,8 @@ def test_config(def_config, tmp_path):
sqldir.mkdir()
(sqldir / 'tokenizer').mkdir()
(sqldir / 'tokenizer' / 'legacy_icu_tokenizer.sql').write_text("SELECT 'a'")
shutil.copy(str(def_config.lib_dir.sql / 'tokenizer' / 'legacy_tokenizer_tables.sql'),
str(sqldir / 'tokenizer' / 'legacy_tokenizer_tables.sql'))
shutil.copy(str(def_config.lib_dir.sql / 'tokenizer' / 'icu_tokenizer_tables.sql'),
str(sqldir / 'tokenizer' / 'icu_tokenizer_tables.sql'))
def_config.lib_dir.sql = sqldir
@@ -88,12 +94,14 @@ DECLARE
term_count INTEGER;
BEGIN
SELECT min(word_id) INTO full_token
FROM word WHERE word = norm_term and class is null and country_code is null;
FROM word WHERE info->>'word' = norm_term and type = 'W';
IF full_token IS NULL THEN
full_token := nextval('seq_word');
INSERT INTO word (word_id, word_token, word, search_name_count)
SELECT full_token, ' ' || lookup_term, norm_term, 0 FROM unnest(lookup_terms) as lookup_term;
INSERT INTO word (word_id, word_token, type, info)
SELECT full_token, lookup_term, 'W',
json_build_object('word', norm_term, 'count', 0)
FROM unnest(lookup_terms) as lookup_term;
END IF;
FOR term IN SELECT unnest(string_to_array(unnest(lookup_terms), ' ')) LOOP
@@ -105,18 +113,18 @@ BEGIN
partial_tokens := '{}'::INT[];
FOR term IN SELECT unnest(partial_terms) LOOP
SELECT min(word_id), max(search_name_count) INTO term_id, term_count
FROM word WHERE word_token = term and class is null and country_code is null;
SELECT min(word_id), max(info->>'count') INTO term_id, term_count
FROM word WHERE word_token = term and type = 'w';
IF term_id IS NULL THEN
term_id := nextval('seq_word');
term_count := 0;
INSERT INTO word (word_id, word_token, search_name_count)
VALUES (term_id, term, 0);
INSERT INTO word (word_id, word_token, type, info)
VALUES (term_id, term, 'w', json_build_object('count', term_count));
END IF;
IF NOT (ARRAY[term_id] <@ partial_tokens) THEN
partial_tokens := partial_tokens || term_id;
partial_tokens := partial_tokens || term_id;
END IF;
END LOOP;
END;
@@ -232,14 +240,14 @@ def test_update_special_phrase_empty_table(analyzer, word_table):
], True)
assert word_table.get_special() \
== {(' KÖNIG BEI', 'König bei', 'amenity', 'royal', 'near'),
(' KÖNIGE', 'Könige', 'amenity', 'royal', None),
(' STREET', 'street', 'highway', 'primary', 'in')}
== {('KÖNIG BEI', 'König bei', 'amenity', 'royal', 'near'),
('KÖNIGE', 'Könige', 'amenity', 'royal', None),
('STREET', 'street', 'highway', 'primary', 'in')}
def test_update_special_phrase_delete_all(analyzer, word_table):
word_table.add_special(' FOO', 'foo', 'amenity', 'prison', 'in')
word_table.add_special(' BAR', 'bar', 'highway', 'road', None)
word_table.add_special('FOO', 'foo', 'amenity', 'prison', 'in')
word_table.add_special('BAR', 'bar', 'highway', 'road', None)
assert word_table.count_special() == 2
@@ -250,8 +258,8 @@ def test_update_special_phrase_delete_all(analyzer, word_table):
def test_update_special_phrases_no_replace(analyzer, word_table):
word_table.add_special(' FOO', 'foo', 'amenity', 'prison', 'in')
word_table.add_special(' BAR', 'bar', 'highway', 'road', None)
word_table.add_special('FOO', 'foo', 'amenity', 'prison', 'in')
word_table.add_special('BAR', 'bar', 'highway', 'road', None)
assert word_table.count_special() == 2
@@ -262,8 +270,8 @@ def test_update_special_phrases_no_replace(analyzer, word_table):
def test_update_special_phrase_modify(analyzer, word_table):
word_table.add_special(' FOO', 'foo', 'amenity', 'prison', 'in')
word_table.add_special(' BAR', 'bar', 'highway', 'road', None)
word_table.add_special('FOO', 'foo', 'amenity', 'prison', 'in')
word_table.add_special('BAR', 'bar', 'highway', 'road', None)
assert word_table.count_special() == 2
@@ -275,25 +283,25 @@ def test_update_special_phrase_modify(analyzer, word_table):
], True)
assert word_table.get_special() \
== {(' PRISON', 'prison', 'amenity', 'prison', 'in'),
(' BAR', 'bar', 'highway', 'road', None),
(' GARDEN', 'garden', 'leisure', 'garden', 'near')}
== {('PRISON', 'prison', 'amenity', 'prison', 'in'),
('BAR', 'bar', 'highway', 'road', None),
('GARDEN', 'garden', 'leisure', 'garden', 'near')}
def test_add_country_names_new(analyzer, word_table):
with analyzer() as anl:
anl.add_country_names('es', {'name': 'Espagña', 'name:en': 'Spain'})
assert word_table.get_country() == {('es', ' ESPAGÑA'), ('es', ' SPAIN')}
assert word_table.get_country() == {('es', 'ESPAGÑA'), ('es', 'SPAIN')}
def test_add_country_names_extend(analyzer, word_table):
word_table.add_country('ch', ' SCHWEIZ')
word_table.add_country('ch', 'SCHWEIZ')
with analyzer() as anl:
anl.add_country_names('ch', {'name': 'Schweiz', 'name:fr': 'Suisse'})
assert word_table.get_country() == {('ch', ' SCHWEIZ'), ('ch', ' SUISSE')}
assert word_table.get_country() == {('ch', 'SCHWEIZ'), ('ch', 'SUISSE')}
class TestPlaceNames:
@@ -307,6 +315,7 @@ class TestPlaceNames:
def expect_name_terms(self, info, *expected_terms):
tokens = self.analyzer.get_word_token_info(expected_terms)
print (tokens)
for token in tokens:
assert token[2] is not None, "No token for {0}".format(token)
@@ -316,7 +325,7 @@ class TestPlaceNames:
def test_simple_names(self):
info = self.analyzer.process_place({'name': {'name': 'Soft bAr', 'ref': '34'}})
self.expect_name_terms(info, '#Soft bAr', '#34','Soft', 'bAr', '34')
self.expect_name_terms(info, '#Soft bAr', '#34', 'Soft', 'bAr', '34')
@pytest.mark.parametrize('sep', [',' , ';'])
@@ -339,7 +348,7 @@ class TestPlaceNames:
'country_feature': 'no'})
self.expect_name_terms(info, '#norge', 'norge')
assert word_table.get_country() == {('no', ' NORGE')}
assert word_table.get_country() == {('no', 'NORGE')}
class TestPlaceAddress: