enable BDD tests for different tokenizers

The tokenizer to be used can be choosen with -DTOKENIZER.

Adapt all tests, so that they work with legacy_icu tokenizer.
Move lookup in word table to a function in the tokenizer.
Special phrases are temporarily imported from the wiki until
we have an implementation that can import from file. TIGER
tests do not work yet.
This commit is contained in:
Sarah Hoffmann
2021-05-05 10:00:34 +02:00
parent 18c99a5c5f
commit a263e54b94
7 changed files with 106 additions and 48 deletions

View File

@@ -271,6 +271,33 @@ class LegacyNameAnalyzer:
self.conn = None
@staticmethod
def get_word_token_info(conn, words):
""" Return token information for the given list of words.
If a word starts with # it is assumed to be a full name
otherwise is a partial name.
The function returns a list of tuples with
(original word, word token, word id).
The function is used for testing and debugging only
and not necessarily efficient.
"""
with conn.cursor() as cur:
cur.execute("""SELECT t.term, word_token, word_id
FROM word, (SELECT unnest(%s::TEXT[]) as term) t
WHERE word_token = (CASE
WHEN left(t.term, 1) = '#' THEN
' ' || make_standard_name(substring(t.term from 2))
ELSE
make_standard_name(t.term)
END)
and class is null and country_code is null""",
(words, ))
return [(r[0], r[1], r[2]) for r in cur]
def normalize(self, phrase):
""" Normalize the given phrase, i.e. remove all properties that
are irrelevant for search.