complete tests for icu tokenizer

This commit is contained in:
Sarah Hoffmann
2021-06-10 17:18:23 +02:00
parent 32ca631b74
commit 9ff4f66f55
6 changed files with 205 additions and 48 deletions

View File

@@ -101,10 +101,19 @@ class ICUNameProcessor:
else:
pos += 1
if startpos == 0:
return [self.to_ascii.transliterate(norm_name)]
results = []
return [self.to_ascii.transliterate(v + baseform[startpos:pos]).strip() for v in variants]
if startpos == 0:
trans_name = self.to_ascii.transliterate(norm_name).strip()
if trans_name:
results.append(trans_name)
else:
for variant in variants:
trans_name = self.to_ascii.transliterate(variant + baseform[startpos:pos]).strip()
if trans_name:
results.append(trans_name)
return results
def get_search_normalized(self, name):

View File

@@ -123,7 +123,7 @@ class LegacyICUTokenizer:
"""
return LegacyICUNameAnalyzer(self.dsn, ICUNameProcessor(self.naming_rules))
# pylint: disable=missing-format-attribute
def _install_php(self, phpdir):
""" Install the php script for the tokenizer.
"""
@@ -134,7 +134,7 @@ class LegacyICUTokenizer:
@define('CONST_Term_Normalization_Rules', "{0.term_normalization}");
@define('CONST_Transliteration', "{0.naming_rules.search_rules}");
require_once('{1}/tokenizer/legacy_icu_tokenizer.php');
""".format(self, phpdir))) # pylint: disable=missing-format-attribute
""".format(self, phpdir)))
def _save_config(self, config):
@@ -166,9 +166,11 @@ class LegacyICUTokenizer:
cur.execute("SELECT svals(name) as v, count(*) FROM place GROUP BY v")
for name, cnt in cur:
terms = set()
for word in name_proc.get_variants_ascii(name_proc.get_normalized(name)):
for term in word.split():
words[term] += cnt
terms.update(word.split())
for term in terms:
words[term] += cnt
# copy them back into the word table
with CopyBuffer() as copystr:
@@ -446,6 +448,9 @@ class LegacyICUNameAnalyzer:
full, part = self._cache.names.get(norm_name, (None, None))
if full is None:
variants = self.name_processor.get_variants_ascii(norm_name)
if not variants:
continue
with self.conn.cursor() as cur:
cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
(norm_name, variants))
@@ -465,12 +470,13 @@ class LegacyICUNameAnalyzer:
given dictionary of names.
"""
full_names = set()
for name in (n for ns in names.values() for n in re.split('[;,]', ns)):
full_names.add(name.strip())
for name in (n.strip() for ns in names.values() for n in re.split('[;,]', ns)):
if name:
full_names.add(name)
brace_idx = name.find('(')
if brace_idx >= 0:
full_names.add(name[:brace_idx].strip())
brace_idx = name.find('(')
if brace_idx >= 0:
full_names.add(name[:brace_idx].strip())
return full_names