forked from hans/Nominatim
adapt tests for ICU tokenizer
This commit is contained in:
@@ -76,7 +76,7 @@ class ICUNameProcessor:
|
||||
""" Normalize the given name, i.e. remove all elements not relevant
|
||||
for search.
|
||||
"""
|
||||
return self.normalizer.transliterate(name)
|
||||
return self.normalizer.transliterate(name).strip()
|
||||
|
||||
def get_variants_ascii(self, norm_name):
|
||||
""" Compute the spelling variants for the given normalized name
|
||||
@@ -108,4 +108,4 @@ class ICUNameProcessor:
|
||||
""" Return the normalized version of the name (including transliteration)
|
||||
to be applied at search time.
|
||||
"""
|
||||
return self.search.transliterate(name)
|
||||
return self.search.transliterate(' ' + name + ' ').strip()
|
||||
|
||||
@@ -28,7 +28,7 @@ class ICURuleLoader:
|
||||
|
||||
|
||||
def get_search_rules(self):
|
||||
""" Returns the ICU rules to be used during search.
|
||||
""" Return the ICU rules to be used during search.
|
||||
The rules combine normalization, compound decomposition (including
|
||||
abbreviated compounds) and transliteration.
|
||||
"""
|
||||
@@ -60,7 +60,7 @@ class ICURuleLoader:
|
||||
return self.transliteration_rules
|
||||
|
||||
def get_replacement_pairs(self):
|
||||
""" Returns the list of possible compound decompositions with
|
||||
""" Return the list of possible compound decompositions with
|
||||
application of abbreviations included.
|
||||
The result is a list of pairs: the first item is the sequence to
|
||||
replace, the second is a list of replacements.
|
||||
|
||||
@@ -219,7 +219,7 @@ class LegacyICUNameAnalyzer:
|
||||
self.conn = None
|
||||
|
||||
|
||||
def get_word_token_info(self, conn, words):
|
||||
def get_word_token_info(self, words):
|
||||
""" Return token information for the given list of words.
|
||||
If a word starts with # it is assumed to be a full name
|
||||
otherwise is a partial name.
|
||||
@@ -233,11 +233,11 @@ class LegacyICUNameAnalyzer:
|
||||
tokens = {}
|
||||
for word in words:
|
||||
if word.startswith('#'):
|
||||
tokens[word] = ' ' + self.name_processor.get_normalized(word[1:])
|
||||
tokens[word] = ' ' + self.name_processor.get_search_normalized(word[1:])
|
||||
else:
|
||||
tokens[word] = self.name_processor.get_normalized(word)
|
||||
tokens[word] = self.name_processor.get_search_normalized(word)
|
||||
|
||||
with conn.cursor() as cur:
|
||||
with self.conn.cursor() as cur:
|
||||
cur.execute("""SELECT word_token, word_id
|
||||
FROM word, (SELECT unnest(%s::TEXT[]) as term) t
|
||||
WHERE word_token = t.term
|
||||
@@ -245,7 +245,7 @@ class LegacyICUNameAnalyzer:
|
||||
(list(tokens.values()), ))
|
||||
ids = {r[0]: r[1] for r in cur}
|
||||
|
||||
return [(k, v, ids[v]) for k, v in tokens.items()]
|
||||
return [(k, v, ids.get(v, None)) for k, v in tokens.items()]
|
||||
|
||||
|
||||
@staticmethod
|
||||
@@ -308,7 +308,7 @@ class LegacyICUNameAnalyzer:
|
||||
def update_special_phrases(self, phrases, should_replace):
|
||||
""" Replace the search index for special phrases with the new phrases.
|
||||
"""
|
||||
norm_phrases = set(((self.name_processor.get_search_normalized(p[0]), p[1], p[2], p[3])
|
||||
norm_phrases = set(((self.name_processor.get_normalized(p[0]), p[1], p[2], p[3])
|
||||
for p in phrases))
|
||||
|
||||
with self.conn.cursor() as cur:
|
||||
|
||||
@@ -271,8 +271,7 @@ class LegacyNameAnalyzer:
|
||||
self.conn = None
|
||||
|
||||
|
||||
@staticmethod
|
||||
def get_word_token_info(conn, words):
|
||||
def get_word_token_info(self, words):
|
||||
""" Return token information for the given list of words.
|
||||
If a word starts with # it is assumed to be a full name
|
||||
otherwise is a partial name.
|
||||
@@ -283,7 +282,7 @@ class LegacyNameAnalyzer:
|
||||
The function is used for testing and debugging only
|
||||
and not necessarily efficient.
|
||||
"""
|
||||
with conn.cursor() as cur:
|
||||
with self.conn.cursor() as cur:
|
||||
cur.execute("""SELECT t.term, word_token, word_id
|
||||
FROM word, (SELECT unnest(%s::TEXT[]) as term) t
|
||||
WHERE word_token = (CASE
|
||||
|
||||
Reference in New Issue
Block a user