complete tests for icu tokenizer

This commit is contained in:
Sarah Hoffmann
2021-06-10 17:18:23 +02:00
parent 32ca631b74
commit 9ff4f66f55
6 changed files with 205 additions and 48 deletions

View File

@@ -16,12 +16,14 @@ def cfgfile(tmp_path, suffix='.yaml'):
content = dedent("""\
normalization:
- ":: NFD ()"
- "'🜳' > ' '"
- "[[:Nonspacing Mark:] [:Cf:]] >"
- ":: lower ()"
- "[[:Punctuation:][:Space:]]+ > ' '"
- ":: NFC ()"
transliteration:
- ":: Latin ()"
- "'🜵' > ' '"
""")
content += "compound_suffixes:\n"
content += '\n'.join((" - " + s for s in suffixes)) + '\n'
@@ -52,6 +54,17 @@ def test_simple_variants(cfgfile):
assert get_normalized_variants(proc, "hallo") == ['hallo']
def test_variants_empty(cfgfile):
    """Check the variants produced for the alchemical-symbol names.

    The configuration is written with an empty first argument and two
    replacement rules, one of which rewrites 'saint' to the symbol '🜵'.
    Both symbol inputs are expected to yield no variants at all, while
    'saint' is expected to yield only itself.
    """
    config_path = cfgfile([], ['saint => 🜵', 'street => st'])
    loader = ICURuleLoader(config_path)
    processor = ICUNameProcessor(ICUNameProcessorRules(loader=loader))

    # NOTE(review): presumably the symbols yield nothing because the
    # test configuration turns them into blanks - confirm against the
    # rules written by the cfgfile fixture.
    expectations = (
        ('🜵', []),
        ('🜳', []),
        ('saint', ['saint']),
    )
    for name, expected in expectations:
        assert get_normalized_variants(processor, name) == expected
def test_multiple_replacements(cfgfile):
fpath = cfgfile([], ['saint => s,st', 'street => st'])