adapt tests for ICU tokenizer

This commit is contained in:
Sarah Hoffmann
2021-06-06 11:00:44 +02:00
parent 8413075249
commit 2e3c5d4c5b
8 changed files with 143 additions and 67 deletions

View File

@@ -76,7 +76,7 @@ class ICUNameProcessor:
""" Normalize the given name, i.e. remove all elements not relevant
for search.
"""
return self.normalizer.transliterate(name)
return self.normalizer.transliterate(name).strip()
def get_variants_ascii(self, norm_name):
""" Compute the spelling variants for the given normalized name
@@ -108,4 +108,4 @@ class ICUNameProcessor:
""" Return the normalized version of the name (including transliteration)
to be applied at search time.
"""
return self.search.transliterate(name)
return self.search.transliterate(' ' + name + ' ').strip()

View File

@@ -28,7 +28,7 @@ class ICURuleLoader:
def get_search_rules(self):
""" Returns the ICU rules to be used during search.
""" Return the ICU rules to be used during search.
The rules combine normalization, compound decomposition (including
abbreviated compounds) and transliteration.
"""
@@ -60,7 +60,7 @@ class ICURuleLoader:
return self.transliteration_rules
def get_replacement_pairs(self):
""" Returns the list of possible compound decompositions with
""" Return the list of possible compound decompositions with
application of abbreviations included.
The result is a list of pairs: the first item is the sequence to
replace, the second is a list of replacements.

View File

@@ -219,7 +219,7 @@ class LegacyICUNameAnalyzer:
self.conn = None
def get_word_token_info(self, conn, words):
def get_word_token_info(self, words):
""" Return token information for the given list of words.
If a word starts with # it is assumed to be a full name,
otherwise it is a partial name.
@@ -233,11 +233,11 @@ class LegacyICUNameAnalyzer:
tokens = {}
for word in words:
if word.startswith('#'):
tokens[word] = ' ' + self.name_processor.get_normalized(word[1:])
tokens[word] = ' ' + self.name_processor.get_search_normalized(word[1:])
else:
tokens[word] = self.name_processor.get_normalized(word)
tokens[word] = self.name_processor.get_search_normalized(word)
with conn.cursor() as cur:
with self.conn.cursor() as cur:
cur.execute("""SELECT word_token, word_id
FROM word, (SELECT unnest(%s::TEXT[]) as term) t
WHERE word_token = t.term
@@ -245,7 +245,7 @@ class LegacyICUNameAnalyzer:
(list(tokens.values()), ))
ids = {r[0]: r[1] for r in cur}
return [(k, v, ids[v]) for k, v in tokens.items()]
return [(k, v, ids.get(v, None)) for k, v in tokens.items()]
@staticmethod
@@ -308,7 +308,7 @@ class LegacyICUNameAnalyzer:
def update_special_phrases(self, phrases, should_replace):
""" Replace the search index for special phrases with the new phrases.
"""
norm_phrases = set(((self.name_processor.get_search_normalized(p[0]), p[1], p[2], p[3])
norm_phrases = set(((self.name_processor.get_normalized(p[0]), p[1], p[2], p[3])
for p in phrases))
with self.conn.cursor() as cur:

View File

@@ -271,8 +271,7 @@ class LegacyNameAnalyzer:
self.conn = None
@staticmethod
def get_word_token_info(conn, words):
def get_word_token_info(self, words):
""" Return token information for the given list of words.
If a word starts with # it is assumed to be a full name,
otherwise it is a partial name.
@@ -283,7 +282,7 @@ class LegacyNameAnalyzer:
The function is used for testing and debugging only
and is not necessarily efficient.
"""
with conn.cursor() as cur:
with self.conn.cursor() as cur:
cur.execute("""SELECT t.term, word_token, word_id
FROM word, (SELECT unnest(%s::TEXT[]) as term) t
WHERE word_token = (CASE