adapt tests for ICU tokenizer

This commit is contained in:
Sarah Hoffmann
2021-06-06 11:00:44 +02:00
parent 8413075249
commit 2e3c5d4c5b
8 changed files with 143 additions and 67 deletions

View File

@@ -76,7 +76,7 @@ class ICUNameProcessor:
         """ Normalize the given name, i.e. remove all elements not relevant
             for search.
         """
-        return self.normalizer.transliterate(name)
+        return self.normalizer.transliterate(name).strip()

     def get_variants_ascii(self, norm_name):
         """ Compute the spelling variants for the given normalized name
@@ -108,4 +108,4 @@ class ICUNameProcessor:
         """ Return the normalized version of the name (including transliteration)
             to be applied at search time.
         """
-        return self.search.transliterate(name)
+        return self.search.transliterate(' ' + name + ' ').strip()

View File

@@ -28,7 +28,7 @@ class ICURuleLoader:
     def get_search_rules(self):
-        """ Returns the ICU rules to be used during search.
+        """ Return the ICU rules to be used during search.
            The rules combine normalization, compound decomposition (including
            abbreviated compounds) and transliteration.
        """
@@ -60,7 +60,7 @@ class ICURuleLoader:
         return self.transliteration_rules

     def get_replacement_pairs(self):
-        """ Returns the list of possible compound decompositions with
+        """ Return the list of possible compound decompositions with
            application of abbreviations included.
            The result is a list of pairs: the first item is the sequence to
            replace, the second is a list of replacements.

View File

@@ -219,7 +219,7 @@ class LegacyICUNameAnalyzer:
         self.conn = None

-    def get_word_token_info(self, conn, words):
+    def get_word_token_info(self, words):
         """ Return token information for the given list of words.
             If a word starts with # it is assumed to be a full name
             otherwise is a partial name.
@@ -233,11 +233,11 @@ class LegacyICUNameAnalyzer:
         tokens = {}
         for word in words:
             if word.startswith('#'):
-                tokens[word] = ' ' + self.name_processor.get_normalized(word[1:])
+                tokens[word] = ' ' + self.name_processor.get_search_normalized(word[1:])
             else:
-                tokens[word] = self.name_processor.get_normalized(word)
+                tokens[word] = self.name_processor.get_search_normalized(word)

-        with conn.cursor() as cur:
+        with self.conn.cursor() as cur:
             cur.execute("""SELECT word_token, word_id
                            FROM word, (SELECT unnest(%s::TEXT[]) as term) t
                            WHERE word_token = t.term
@@ -245,7 +245,7 @@ class LegacyICUNameAnalyzer:
                        (list(tokens.values()), ))
             ids = {r[0]: r[1] for r in cur}

-        return [(k, v, ids[v]) for k, v in tokens.items()]
+        return [(k, v, ids.get(v, None)) for k, v in tokens.items()]

     @staticmethod
@@ -308,7 +308,7 @@ class LegacyICUNameAnalyzer:
     def update_special_phrases(self, phrases, should_replace):
         """ Replace the search index for special phrases with the new phrases.
         """
-        norm_phrases = set(((self.name_processor.get_search_normalized(p[0]), p[1], p[2], p[3])
+        norm_phrases = set(((self.name_processor.get_normalized(p[0]), p[1], p[2], p[3])
                             for p in phrases))

         with self.conn.cursor() as cur:

View File

@@ -271,8 +271,7 @@ class LegacyNameAnalyzer:
         self.conn = None

-    @staticmethod
-    def get_word_token_info(conn, words):
+    def get_word_token_info(self, words):
         """ Return token information for the given list of words.
             If a word starts with # it is assumed to be a full name
             otherwise is a partial name.
@@ -283,7 +282,7 @@ class LegacyNameAnalyzer:
             The function is used for testing and debugging only
             and not necessarily efficient.
         """
-        with conn.cursor() as cur:
+        with self.conn.cursor() as cur:
             cur.execute("""SELECT t.term, word_token, word_id
                            FROM word, (SELECT unnest(%s::TEXT[]) as term) t
                            WHERE word_token = (CASE

View File

@@ -214,7 +214,7 @@ def check_search_name_contents(context, exclude):
     for name, value in zip(row.headings, row.cells):
         if name in ('name_vector', 'nameaddress_vector'):
             items = [x.strip() for x in value.split(',')]
-            tokens = analyzer.get_word_token_info(context.db, items)
+            tokens = analyzer.get_word_token_info(items)

             if not exclude:
                 assert len(tokens) >= len(items), \

View File

@@ -34,6 +34,9 @@ def cfgfile(tmp_path, suffix='.yaml'):
     return _create_config

+def get_normalized_variants(proc, name):
+    return proc.get_variants_ascii(proc.get_normalized(name))
+
 def test_simple_variants(cfgfile):
     fpath = cfgfile(['strasse', 'straße', 'weg'],
                     ['strasse,straße => str',
@@ -42,11 +45,11 @@ def test_simple_variants(cfgfile):
     rules = ICUNameProcessorRules(loader=ICURuleLoader(fpath))
     proc = ICUNameProcessor(rules)

-    assert set(proc.get_normalized_variants("Bauwegstraße")) \
+    assert set(get_normalized_variants(proc, "Bauwegstraße")) \
            == {'bauweg straße', 'bauweg str'}
-    assert proc.get_normalized_variants("Bauwegstr") == ['bauweg str']
-    assert proc.get_normalized_variants("holzweg") == ['holz weg']
-    assert proc.get_normalized_variants("hallo") == ['hallo']
+    assert get_normalized_variants(proc, "Bauwegstr") == ['bauweg str']
+    assert get_normalized_variants(proc, "holzweg") == ['holz weg']
+    assert get_normalized_variants(proc, "hallo") == ['hallo']

 def test_multiple_replacements(cfgfile):
@@ -55,6 +58,17 @@ def test_multiple_replacements(cfgfile):
     rules = ICUNameProcessorRules(loader=ICURuleLoader(fpath))
     proc = ICUNameProcessor(rules)

-    assert set(proc.get_normalized_variants("Saint Johns Street")) == \
+    assert set(get_normalized_variants(proc, "Saint Johns Street")) == \
            {'saint johns street', 's johns street', 'st johns street',
             'saint johns st', 's johns st', 'st johns st'}

+def test_search_normalized(cfgfile):
+    fpath = cfgfile(['street'], ['street => s,st', 'master => mstr'])
+
+    rules = ICUNameProcessorRules(loader=ICURuleLoader(fpath))
+    proc = ICUNameProcessor(rules)
+
+    assert proc.get_search_normalized('Master Street') == 'master street'
+    assert proc.get_search_normalized('Earnes St') == 'earne s st'
+    assert proc.get_search_normalized('Nostreet') == 'no street'

View File

@@ -69,7 +69,10 @@ def test_get_synonym_pairs(cfgfile):
     repl = loader.get_replacement_pairs()

-    assert repl == [(' strasse ', {' strasse ', ' str ', ' st '}),
-                    ('strasse ', {' strasse ', ' str ', ' st '}),
-                    ('weg ', {' weg '})]
+    assert sorted(((a, sorted(b)) for a, b in repl)) == \
+           sorted([(' strasse ', [' st ', ' str ', ' strasse ']),
+                   ('strasse ', [' st ', ' str ', ' strasse ']),
+                   ('st ' , [' st ']),
+                   ('str ' , [' str ']),
+                   ('weg ', [' weg '])])

View File

@@ -2,10 +2,13 @@
 Tests for Legacy ICU tokenizer.
 """
 import shutil
+import yaml

 import pytest

 from nominatim.tokenizer import legacy_icu_tokenizer
+from nominatim.tokenizer.icu_name_processor import ICUNameProcessorRules
+from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
 from nominatim.db import properties
@@ -40,16 +43,10 @@ def tokenizer_factory(dsn, tmp_path, property_table,
 @pytest.fixture
 def db_prop(temp_db_conn):
     def _get_db_property(name):
-        return properties.get_property(temp_db_conn,
-                                       getattr(legacy_icu_tokenizer, name))
+        return properties.get_property(temp_db_conn, name)

     return _get_db_property

-@pytest.fixture
-def tokenizer_setup(tokenizer_factory, test_config):
-    tok = tokenizer_factory()
-    tok.init_new_db(test_config)
-
 @pytest.fixture
 def analyzer(tokenizer_factory, test_config, monkeypatch,
@@ -62,9 +59,16 @@ def analyzer(tokenizer_factory, test_config, monkeypatch,
     tok.init_new_db(test_config)
     monkeypatch.undo()

-    def _mk_analyser(trans=':: upper();', abbr=(('STREET', 'ST'), )):
-        tok.transliteration = trans
-        tok.abbreviations = abbr
+    def _mk_analyser(norm=("[[:Punctuation:][:Space:]]+ > ' '",), trans=(':: upper()',),
+                     suffixes=('gasse', ), abbr=('street => st', )):
+        cfgfile = tmp_path / 'analyser_test_config.yaml'
+        with cfgfile.open('w') as stream:
+            cfgstr = {'normalization' : list(norm),
+                      'transliteration' : list(trans),
+                      'compound_suffixes' : list(suffixes),
+                      'abbreviations' : list(abbr)}
+            yaml.dump(cfgstr, stream)
+        tok.naming_rules = ICUNameProcessorRules(loader=ICURuleLoader(cfgfile))

         return tok.name_analyzer()
@@ -72,10 +76,54 @@ def analyzer(tokenizer_factory, test_config, monkeypatch,
 @pytest.fixture
-def getorcreate_term_id(temp_db_cursor):
-    temp_db_cursor.execute("""CREATE OR REPLACE FUNCTION getorcreate_term_id(lookup_term TEXT)
-                              RETURNS INTEGER AS $$
-                                SELECT nextval('seq_word')::INTEGER; $$ LANGUAGE SQL""")
+def getorcreate_full_word(temp_db_cursor):
+    temp_db_cursor.execute("""CREATE OR REPLACE FUNCTION getorcreate_full_word(
+                                norm_term TEXT, lookup_terms TEXT[],
+                                OUT full_token INT,
+                                OUT partial_tokens INT[])
+  AS $$
+DECLARE
+  partial_terms TEXT[] = '{}'::TEXT[];
+  term TEXT;
+  term_id INTEGER;
+  term_count INTEGER;
+BEGIN
+  SELECT min(word_id) INTO full_token
+    FROM word WHERE word = norm_term and class is null and country_code is null;
+
+  IF full_token IS NULL THEN
+    full_token := nextval('seq_word');
+    INSERT INTO word (word_id, word_token, word, search_name_count)
+      SELECT full_token, ' ' || lookup_term, norm_term, 0 FROM unnest(lookup_terms) as lookup_term;
+  END IF;
+
+  FOR term IN SELECT unnest(string_to_array(unnest(lookup_terms), ' ')) LOOP
+    term := trim(term);
+    IF NOT (ARRAY[term] <@ partial_terms) THEN
+      partial_terms := partial_terms || term;
+    END IF;
+  END LOOP;
+
+  partial_tokens := '{}'::INT[];
+  FOR term IN SELECT unnest(partial_terms) LOOP
+    SELECT min(word_id), max(search_name_count) INTO term_id, term_count
+      FROM word WHERE word_token = term and class is null and country_code is null;
+
+    IF term_id IS NULL THEN
+      term_id := nextval('seq_word');
+      term_count := 0;
+      INSERT INTO word (word_id, word_token, search_name_count)
+        VALUES (term_id, term, 0);
+    END IF;
+
+    IF NOT (ARRAY[term_id] <@ partial_tokens) THEN
+      partial_tokens := partial_tokens || term_id;
+    END IF;
+  END LOOP;
+END;
+$$
+LANGUAGE plpgsql;
+                              """)

 @pytest.fixture
@@ -91,19 +139,23 @@ def test_init_new(tokenizer_factory, test_config, monkeypatch, db_prop):
     tok = tokenizer_factory()
     tok.init_new_db(test_config)

-    assert db_prop('DBCFG_NORMALIZATION') == ':: lower();'
-    assert db_prop('DBCFG_TRANSLITERATION') is not None
-    assert db_prop('DBCFG_ABBREVIATIONS') is not None
+    assert db_prop(legacy_icu_tokenizer.DBCFG_TERM_NORMALIZATION) == ':: lower();'
+    assert db_prop(legacy_icu_tokenizer.DBCFG_MAXWORDFREQ) is not None

-def test_init_from_project(tokenizer_setup, tokenizer_factory):
+def test_init_from_project(monkeypatch, test_config, tokenizer_factory):
+    monkeypatch.setenv('NOMINATIM_TERM_NORMALIZATION', ':: lower();')
+    monkeypatch.setenv('NOMINATIM_MAX_WORD_FREQUENCY', '90300')
+
     tok = tokenizer_factory()
+    tok.init_new_db(test_config)
+    monkeypatch.undo()
+
+    tok = tokenizer_factory()
     tok.init_from_project()

-    assert tok.normalization is not None
-    assert tok.transliteration is not None
-    assert tok.abbreviations is not None
+    assert tok.naming_rules is not None
+    assert tok.term_normalization == ':: lower();'
+    assert tok.max_word_frequency == '90300'

 def test_update_sql_functions(db_prop, temp_db_cursor,
@@ -114,7 +166,7 @@ def test_update_sql_functions(db_prop, temp_db_cursor,
     tok.init_new_db(test_config)
     monkeypatch.undo()

-    assert db_prop('DBCFG_MAXWORDFREQ') == '1133'
+    assert db_prop(legacy_icu_tokenizer.DBCFG_MAXWORDFREQ) == '1133'

     table_factory('test', 'txt TEXT')
@@ -127,16 +179,8 @@ def test_update_sql_functions(db_prop, temp_db_cursor,
     assert test_content == set((('1133', ), ))

-def test_make_standard_word(analyzer):
-    with analyzer(abbr=(('STREET', 'ST'), ('tiny', 't'))) as anl:
-        assert anl.make_standard_word('tiny street') == 'TINY ST'
-
-    with analyzer(abbr=(('STRASSE', 'STR'), ('STR', 'ST'))) as anl:
-        assert anl.make_standard_word('Hauptstrasse') == 'HAUPTST'
-
 def test_make_standard_hnr(analyzer):
-    with analyzer(abbr=(('IV', '4'),)) as anl:
+    with analyzer(abbr=('IV => 4',)) as anl:
         assert anl._make_standard_hnr('345') == '345'
         assert anl._make_standard_hnr('iv') == 'IV'
@@ -176,7 +220,7 @@ def test_update_special_phrase_empty_table(analyzer, word_table):
     assert word_table.get_special() \
            == {(' KÖNIG BEI', 'könig bei', 'amenity', 'royal', 'near'),
                (' KÖNIGE', 'könige', 'amenity', 'royal', None),
-               (' ST', 'street', 'highway', 'primary', 'in')}
+               (' STREET', 'street', 'highway', 'primary', 'in')}

 def test_update_special_phrase_delete_all(analyzer, word_table):
@@ -222,26 +266,42 @@ def test_update_special_phrase_modify(analyzer, word_table):
                (' GARDEN', 'garden', 'leisure', 'garden', 'near')}

-def test_process_place_names(analyzer, getorcreate_term_id):
-    with analyzer() as anl:
-        info = anl.process_place({'name' : {'name' : 'Soft bAr', 'ref': '34'}})
-
-    assert info['names'] == '{1,2,3,4,5}'
-
-
-@pytest.mark.parametrize('sep', [',' , ';'])
-def test_full_names_with_separator(analyzer, getorcreate_term_id, sep):
-    with analyzer() as anl:
-        names = anl._compute_full_names({'name' : sep.join(('New York', 'Big Apple'))})
-
-    assert names == set(('NEW YORK', 'BIG APPLE'))
-
-
-def test_full_names_with_bracket(analyzer, getorcreate_term_id):
-    with analyzer() as anl:
-        names = anl._compute_full_names({'name' : 'Houseboat (left)'})
-
-    assert names == set(('HOUSEBOAT (LEFT)', 'HOUSEBOAT'))
+class TestPlaceNames:
+
+    @pytest.fixture(autouse=True)
+    def setup(self, analyzer, getorcreate_full_word):
+        with analyzer() as anl:
+            self.analyzer = anl
+            yield anl
+
+    def expect_name_terms(self, info, *expected_terms):
+        tokens = self.analyzer.get_word_token_info(expected_terms)
+        for token in tokens:
+            assert token[2] is not None, "No token for {0}".format(token)
+
+        assert eval(info['names']) == set((t[2] for t in tokens))
+
+    def test_simple_names(self):
+        info = self.analyzer.process_place({'name' : {'name' : 'Soft bAr', 'ref': '34'}})
+
+        self.expect_name_terms(info, '#Soft bAr', '#34','Soft', 'bAr', '34')
+
+    @pytest.mark.parametrize('sep', [',' , ';'])
+    def test_names_with_separator(self, sep):
+        info = self.analyzer.process_place({'name' : {'name' : sep.join(('New York', 'Big Apple'))}})
+
+        self.expect_name_terms(info, '#New York', '#Big Apple',
+                               'new', 'york', 'big', 'apple')
+
+    def test_full_names_with_bracket(self):
+        info = self.analyzer.process_place({'name' : {'name' : 'Houseboat (left)'}})
+
+        self.expect_name_terms(info, '#Houseboat (left)', '#Houseboat',
+                               'houseboat', 'left')

 @pytest.mark.parametrize('pcode', ['12345', 'AB 123', '34-345'])