adapt tests for ICU tokenizer

Sarah Hoffmann
2021-06-06 11:00:44 +02:00
parent 8413075249
commit 2e3c5d4c5b
8 changed files with 143 additions and 67 deletions

View File

@@ -76,7 +76,7 @@ class ICUNameProcessor:
         """ Normalize the given name, i.e. remove all elements not relevant
             for search.
         """
-        return self.normalizer.transliterate(name)
+        return self.normalizer.transliterate(name).strip()
 
     def get_variants_ascii(self, norm_name):
         """ Compute the spelling variants for the given normalized name
@@ -108,4 +108,4 @@ class ICUNameProcessor:
         """ Return the normalized version of the name (including transliteration)
            to be applied at search time.
         """
-        return self.search.transliterate(name)
+        return self.search.transliterate(' ' + name + ' ').strip()
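The search normalization now wraps the name in blanks before transliterating and strips them again afterwards, presumably so that rules keyed on space-delimited terms can also match the first and last word. A minimal pure-Python sketch of the idea (the rule table and helper below are illustrative only, not part of Nominatim):

    # Illustrative only: why padding with blanks matters for search normalization.
    # Rules such as ' straße ' => ' str ' only match the first and last word of a
    # name if the whole string is wrapped in spaces before the replacement runs.
    REPLACEMENTS = {' straße ': ' str '}          # assumed example rule

    def naive_search_normalize(name):
        padded = ' ' + name.lower() + ' '
        for src, dst in REPLACEMENTS.items():
            padded = padded.replace(src, dst)
        return padded.strip()

    assert naive_search_normalize('Bauweg Straße') == 'bauweg str'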

View File

@@ -28,7 +28,7 @@ class ICURuleLoader:
     def get_search_rules(self):
-        """ Returns the ICU rules to be used during search.
+        """ Return the ICU rules to be used during search.
             The rules combine normalization, compound decomposition (including
             abbreviated compounds) and transliteration.
         """
@@ -60,7 +60,7 @@ class ICURuleLoader:
         return self.transliteration_rules
 
     def get_replacement_pairs(self):
-        """ Returns the list of possible compound decompositions with
+        """ Return the list of possible compound decompositions with
             application of abbreviations included.
             The result is a list of pairs: the first item is the sequence to
             replace, the second is a list of replacements.
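For orientation, the returned pairs have roughly the following shape, assuming a configuration that declares 'strasse' and 'weg' as compound suffixes and abbreviates 'strasse' to 'str' and 'st' (the values mirror the expectation in the adapted rule-loader test further below):

    pairs = [
        (' strasse ', [' st ', ' str ', ' strasse ']),
        ('strasse ', [' st ', ' str ', ' strasse ']),   # suffix glued to the preceding word
        ('st ', [' st ']),
        ('str ', [' str ']),
        ('weg ', [' weg ']),
    ]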

View File

@@ -219,7 +219,7 @@ class LegacyICUNameAnalyzer:
         self.conn = None
 
-    def get_word_token_info(self, conn, words):
+    def get_word_token_info(self, words):
         """ Return token information for the given list of words.
             If a word starts with # it is assumed to be a full name
             otherwise is a partial name.
@@ -233,11 +233,11 @@ class LegacyICUNameAnalyzer:
         tokens = {}
         for word in words:
             if word.startswith('#'):
-                tokens[word] = ' ' + self.name_processor.get_normalized(word[1:])
+                tokens[word] = ' ' + self.name_processor.get_search_normalized(word[1:])
             else:
-                tokens[word] = self.name_processor.get_normalized(word)
+                tokens[word] = self.name_processor.get_search_normalized(word)
 
-        with conn.cursor() as cur:
+        with self.conn.cursor() as cur:
             cur.execute("""SELECT word_token, word_id
                            FROM word, (SELECT unnest(%s::TEXT[]) as term) t
                            WHERE word_token = t.term
@@ -245,7 +245,7 @@ class LegacyICUNameAnalyzer:
                         (list(tokens.values()), ))
             ids = {r[0]: r[1] for r in cur}
 
-        return [(k, v, ids[v]) for k, v in tokens.items()]
+        return [(k, v, ids.get(v, None)) for k, v in tokens.items()]
 
     @staticmethod
@@ -308,7 +308,7 @@ class LegacyICUNameAnalyzer:
     def update_special_phrases(self, phrases, should_replace):
         """ Replace the search index for special phrases with the new phrases.
         """
-        norm_phrases = set(((self.name_processor.get_search_normalized(p[0]), p[1], p[2], p[3])
+        norm_phrases = set(((self.name_processor.get_normalized(p[0]), p[1], p[2], p[3])
                             for p in phrases))
 
         with self.conn.cursor() as cur:
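The reworked get_word_token_info() is what the adapted tests below use to look up tokens. A usage sketch, assuming an analyzer obtained from the tokenizer's name_analyzer() (the tokenizer object itself is not created in this hunk):

    # '#'-prefixed entries are looked up as full names (their word_token carries
    # a leading blank), plain entries as partial terms. Each result is a
    # (query_word, token, word_id) triple; word_id is None when the token is not
    # in the word table.
    with tokenizer.name_analyzer() as analyzer:      # 'tokenizer' assumed to exist
        for word, token, word_id in analyzer.get_word_token_info(['#Soft bAr', 'soft']):
            print(word, token, word_id)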

View File

@@ -271,8 +271,7 @@ class LegacyNameAnalyzer:
         self.conn = None
 
-    @staticmethod
-    def get_word_token_info(conn, words):
+    def get_word_token_info(self, words):
         """ Return token information for the given list of words.
             If a word starts with # it is assumed to be a full name
             otherwise is a partial name.
@@ -283,7 +282,7 @@ class LegacyNameAnalyzer:
             The function is used for testing and debugging only
             and not necessarily efficient.
         """
-        with conn.cursor() as cur:
+        with self.conn.cursor() as cur:
             cur.execute("""SELECT t.term, word_token, word_id
                            FROM word, (SELECT unnest(%s::TEXT[]) as term) t
                            WHERE word_token = (CASE

View File

@@ -214,7 +214,7 @@ def check_search_name_contents(context, exclude):
             for name, value in zip(row.headings, row.cells):
                 if name in ('name_vector', 'nameaddress_vector'):
                     items = [x.strip() for x in value.split(',')]
-                    tokens = analyzer.get_word_token_info(context.db, items)
+                    tokens = analyzer.get_word_token_info(items)
 
                     if not exclude:
                         assert len(tokens) >= len(items), \

View File

@@ -34,6 +34,9 @@ def cfgfile(tmp_path, suffix='.yaml'):
     return _create_config
 
+def get_normalized_variants(proc, name):
+    return proc.get_variants_ascii(proc.get_normalized(name))
+
 def test_simple_variants(cfgfile):
     fpath = cfgfile(['strasse', 'straße', 'weg'],
                     ['strasse,straße => str',
@@ -42,11 +45,11 @@ def test_simple_variants(cfgfile):
     rules = ICUNameProcessorRules(loader=ICURuleLoader(fpath))
     proc = ICUNameProcessor(rules)
 
-    assert set(proc.get_normalized_variants("Bauwegstraße")) \
+    assert set(get_normalized_variants(proc, "Bauwegstraße")) \
            == {'bauweg straße', 'bauweg str'}
-    assert proc.get_normalized_variants("Bauwegstr") == ['bauweg str']
-    assert proc.get_normalized_variants("holzweg") == ['holz weg']
-    assert proc.get_normalized_variants("hallo") == ['hallo']
+    assert get_normalized_variants(proc, "Bauwegstr") == ['bauweg str']
+    assert get_normalized_variants(proc, "holzweg") == ['holz weg']
+    assert get_normalized_variants(proc, "hallo") == ['hallo']
 
 
 def test_multiple_replacements(cfgfile):
@@ -55,6 +58,17 @@ def test_multiple_replacements(cfgfile):
     rules = ICUNameProcessorRules(loader=ICURuleLoader(fpath))
     proc = ICUNameProcessor(rules)
 
-    assert set(proc.get_normalized_variants("Saint Johns Street")) == \
+    assert set(get_normalized_variants(proc, "Saint Johns Street")) == \
           {'saint johns street', 's johns street', 'st johns street',
            'saint johns st', 's johns st', 'st johns st'}
+
+
+def test_search_normalized(cfgfile):
+    fpath = cfgfile(['street'], ['street => s,st', 'master => mstr'])
+
+    rules = ICUNameProcessorRules(loader=ICURuleLoader(fpath))
+    proc = ICUNameProcessor(rules)
+
+    assert proc.get_search_normalized('Master Street') == 'master street'
+    assert proc.get_search_normalized('Earnes St') == 'earne s st'
+    assert proc.get_search_normalized('Nostreet') == 'no street'

View File

@@ -69,7 +69,10 @@ def test_get_synonym_pairs(cfgfile):
     repl = loader.get_replacement_pairs()
 
-    assert repl == [(' strasse ', {' strasse ', ' str ', ' st '}),
-                    ('strasse ', {' strasse ', ' str ', ' st '}),
-                    ('weg ', {' weg '})]
+    assert sorted(((a, sorted(b)) for a, b in repl)) == \
+        sorted([(' strasse ', [' st ', ' str ', ' strasse ']),
+                ('strasse ', [' st ', ' str ', ' strasse ']),
+                ('st ' , [' st ']),
+                ('str ' , [' str ']),
+                ('weg ', [' weg '])])

View File

@@ -2,10 +2,13 @@
 Tests for Legacy ICU tokenizer.
 """
 import shutil
+import yaml
 
 import pytest
 
 from nominatim.tokenizer import legacy_icu_tokenizer
+from nominatim.tokenizer.icu_name_processor import ICUNameProcessorRules
+from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
 from nominatim.db import properties
@@ -40,16 +43,10 @@ def tokenizer_factory(dsn, tmp_path, property_table,
 
 @pytest.fixture
 def db_prop(temp_db_conn):
     def _get_db_property(name):
-        return properties.get_property(temp_db_conn,
-                                       getattr(legacy_icu_tokenizer, name))
+        return properties.get_property(temp_db_conn, name)
 
     return _get_db_property
 
-@pytest.fixture
-def tokenizer_setup(tokenizer_factory, test_config):
-    tok = tokenizer_factory()
-    tok.init_new_db(test_config)
-
 
 @pytest.fixture
 def analyzer(tokenizer_factory, test_config, monkeypatch,
@@ -62,9 +59,16 @@ def analyzer(tokenizer_factory, test_config, monkeypatch,
     tok.init_new_db(test_config)
     monkeypatch.undo()
 
-    def _mk_analyser(trans=':: upper();', abbr=(('STREET', 'ST'), )):
-        tok.transliteration = trans
-        tok.abbreviations = abbr
+    def _mk_analyser(norm=("[[:Punctuation:][:Space:]]+ > ' '",), trans=(':: upper()',),
+                     suffixes=('gasse', ), abbr=('street => st', )):
+        cfgfile = tmp_path / 'analyser_test_config.yaml'
+        with cfgfile.open('w') as stream:
+            cfgstr = {'normalization' : list(norm),
+                      'transliteration' : list(trans),
+                      'compound_suffixes' : list(suffixes),
+                      'abbreviations' : list(abbr)}
+            yaml.dump(cfgstr, stream)
+        tok.naming_rules = ICUNameProcessorRules(loader=ICURuleLoader(cfgfile))
 
         return tok.name_analyzer()
@@ -72,10 +76,54 @@
 
 @pytest.fixture
-def getorcreate_term_id(temp_db_cursor):
-    temp_db_cursor.execute("""CREATE OR REPLACE FUNCTION getorcreate_term_id(lookup_term TEXT)
-                              RETURNS INTEGER AS $$
-                                SELECT nextval('seq_word')::INTEGER; $$ LANGUAGE SQL""")
+def getorcreate_full_word(temp_db_cursor):
+    temp_db_cursor.execute("""CREATE OR REPLACE FUNCTION getorcreate_full_word(
+                                                 norm_term TEXT, lookup_terms TEXT[],
+                                                 OUT full_token INT,
+                                                 OUT partial_tokens INT[])
+  AS $$
+DECLARE
+  partial_terms TEXT[] = '{}'::TEXT[];
+  term TEXT;
+  term_id INTEGER;
+  term_count INTEGER;
+BEGIN
+  SELECT min(word_id) INTO full_token
+    FROM word WHERE word = norm_term and class is null and country_code is null;
+
+  IF full_token IS NULL THEN
+    full_token := nextval('seq_word');
+    INSERT INTO word (word_id, word_token, word, search_name_count)
+      SELECT full_token, ' ' || lookup_term, norm_term, 0 FROM unnest(lookup_terms) as lookup_term;
+  END IF;
+
+  FOR term IN SELECT unnest(string_to_array(unnest(lookup_terms), ' ')) LOOP
+    term := trim(term);
+    IF NOT (ARRAY[term] <@ partial_terms) THEN
+      partial_terms := partial_terms || term;
+    END IF;
+  END LOOP;
+
+  partial_tokens := '{}'::INT[];
+  FOR term IN SELECT unnest(partial_terms) LOOP
+    SELECT min(word_id), max(search_name_count) INTO term_id, term_count
+      FROM word WHERE word_token = term and class is null and country_code is null;
+
+    IF term_id IS NULL THEN
+      term_id := nextval('seq_word');
+      term_count := 0;
+      INSERT INTO word (word_id, word_token, search_name_count)
+        VALUES (term_id, term, 0);
+    END IF;
+
+    IF NOT (ARRAY[term_id] <@ partial_tokens) THEN
+      partial_tokens := partial_tokens || term_id;
+    END IF;
+  END LOOP;
+END;
+$$
+LANGUAGE plpgsql;
+                              """)
 
 
 @pytest.fixture
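A sketch of how a test could call the mocked SQL function directly, assuming the temp_db_cursor fixture plus a word table and seq_word sequence as provided by the other test fixtures:

    # Returns one token id for the full name plus one id per distinct
    # space-separated partial term of the lookup spellings.
    temp_db_cursor.execute("SELECT * FROM getorcreate_full_word(%s, %s)",
                           ('soft bar', ['soft bar']))
    full_token, partial_tokens = temp_db_cursor.fetchone()
    assert len(partial_tokens) == 2      # ids for 'soft' and 'bar'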
@@ -91,19 +139,23 @@ def test_init_new(tokenizer_factory, test_config, monkeypatch, db_prop):
     tok = tokenizer_factory()
     tok.init_new_db(test_config)
 
-    assert db_prop('DBCFG_NORMALIZATION') == ':: lower();'
-    assert db_prop('DBCFG_TRANSLITERATION') is not None
-    assert db_prop('DBCFG_ABBREVIATIONS') is not None
+    assert db_prop(legacy_icu_tokenizer.DBCFG_TERM_NORMALIZATION) == ':: lower();'
+    assert db_prop(legacy_icu_tokenizer.DBCFG_MAXWORDFREQ) is not None
 
 
-def test_init_from_project(tokenizer_setup, tokenizer_factory):
+def test_init_from_project(monkeypatch, test_config, tokenizer_factory):
+    monkeypatch.setenv('NOMINATIM_TERM_NORMALIZATION', ':: lower();')
+    monkeypatch.setenv('NOMINATIM_MAX_WORD_FREQUENCY', '90300')
+
+    tok = tokenizer_factory()
+    tok.init_new_db(test_config)
+    monkeypatch.undo()
+
     tok = tokenizer_factory()
     tok.init_from_project()
 
-    assert tok.normalization is not None
-    assert tok.transliteration is not None
-    assert tok.abbreviations is not None
+    assert tok.naming_rules is not None
+    assert tok.term_normalization == ':: lower();'
+    assert tok.max_word_frequency == '90300'
 
 
 def test_update_sql_functions(db_prop, temp_db_cursor,
@@ -114,7 +166,7 @@ def test_update_sql_functions(db_prop, temp_db_cursor,
     tok.init_new_db(test_config)
     monkeypatch.undo()
 
-    assert db_prop('DBCFG_MAXWORDFREQ') == '1133'
+    assert db_prop(legacy_icu_tokenizer.DBCFG_MAXWORDFREQ) == '1133'
 
     table_factory('test', 'txt TEXT')
@@ -127,16 +179,8 @@
     assert test_content == set((('1133', ), ))
 
 
-def test_make_standard_word(analyzer):
-    with analyzer(abbr=(('STREET', 'ST'), ('tiny', 't'))) as anl:
-        assert anl.make_standard_word('tiny street') == 'TINY ST'
-
-    with analyzer(abbr=(('STRASSE', 'STR'), ('STR', 'ST'))) as anl:
-        assert anl.make_standard_word('Hauptstrasse') == 'HAUPTST'
-
-
 def test_make_standard_hnr(analyzer):
-    with analyzer(abbr=(('IV', '4'),)) as anl:
+    with analyzer(abbr=('IV => 4',)) as anl:
         assert anl._make_standard_hnr('345') == '345'
         assert anl._make_standard_hnr('iv') == 'IV'
@@ -176,7 +220,7 @@ def test_update_special_phrase_empty_table(analyzer, word_table):
     assert word_table.get_special() \
            == {(' KÖNIG BEI', 'könig bei', 'amenity', 'royal', 'near'),
                (' KÖNIGE', 'könige', 'amenity', 'royal', None),
-               (' ST', 'street', 'highway', 'primary', 'in')}
+               (' STREET', 'street', 'highway', 'primary', 'in')}
 
 
 def test_update_special_phrase_delete_all(analyzer, word_table):
@@ -222,26 +266,42 @@ def test_update_special_phrase_modify(analyzer, word_table):
         (' GARDEN', 'garden', 'leisure', 'garden', 'near')}
 
 
-def test_process_place_names(analyzer, getorcreate_term_id):
-    with analyzer() as anl:
-        info = anl.process_place({'name' : {'name' : 'Soft bAr', 'ref': '34'}})
+class TestPlaceNames:
 
-    assert info['names'] == '{1,2,3,4,5}'
+    @pytest.fixture(autouse=True)
+    def setup(self, analyzer, getorcreate_full_word):
+        with analyzer() as anl:
+            self.analyzer = anl
+            yield anl
 
 
-@pytest.mark.parametrize('sep', [',' , ';'])
-def test_full_names_with_separator(analyzer, getorcreate_term_id, sep):
-    with analyzer() as anl:
-        names = anl._compute_full_names({'name' : sep.join(('New York', 'Big Apple'))})
+    def expect_name_terms(self, info, *expected_terms):
+        tokens = self.analyzer.get_word_token_info(expected_terms)
+        for token in tokens:
+            assert token[2] is not None, "No token for {0}".format(token)
 
-    assert names == set(('NEW YORK', 'BIG APPLE'))
+        assert eval(info['names']) == set((t[2] for t in tokens))
 
 
-def test_full_names_with_bracket(analyzer, getorcreate_term_id):
-    with analyzer() as anl:
-        names = anl._compute_full_names({'name' : 'Houseboat (left)'})
+    def test_simple_names(self):
+        info = self.analyzer.process_place({'name' : {'name' : 'Soft bAr', 'ref': '34'}})
 
-    assert names == set(('HOUSEBOAT (LEFT)', 'HOUSEBOAT'))
+        self.expect_name_terms(info, '#Soft bAr', '#34','Soft', 'bAr', '34')
 
+    @pytest.mark.parametrize('sep', [',' , ';'])
+    def test_names_with_separator(self, sep):
+        info = self.analyzer.process_place({'name' : {'name' : sep.join(('New York', 'Big Apple'))}})
+
+        self.expect_name_terms(info, '#New York', '#Big Apple',
+                               'new', 'york', 'big', 'apple')
 
+    def test_full_names_with_bracket(self):
+        info = self.analyzer.process_place({'name' : {'name' : 'Houseboat (left)'}})
+
+        self.expect_name_terms(info, '#Houseboat (left)', '#Houseboat',
+                               'houseboat', 'left')
 
 
 @pytest.mark.parametrize('pcode', ['12345', 'AB 123', '34-345'])
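The move from free-standing test functions to a test class with an autouse fixture is a standard pytest pattern; a self-contained sketch with purely illustrative names:

    import pytest

    class FakeAnalyzer:                      # stand-in for the real name analyzer
        def process(self, name):
            return name.lower()

    class TestPattern:

        @pytest.fixture(autouse=True)
        def setup(self):
            # runs before every test method; the object under test lives on self
            self.analyzer = FakeAnalyzer()
            yield

        def test_process(self):
            assert self.analyzer.process('Soft bAr') == 'soft bar'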