extend ICU config to accommodate multiple analysers

Adds parsing of multiple variant lists from the configuration.
Every entry except one must have a unique 'id' parameter to
distinguish the entries. The entry without id is considered
the default. Currently only the list without an id is used
for analysis.
This commit is contained in:
Sarah Hoffmann
2021-10-04 16:40:28 +02:00
parent 5a36559834
commit 52847b61a3
5 changed files with 92 additions and 64 deletions

View File

@@ -69,10 +69,10 @@ def analyzer(tokenizer_factory, test_config, monkeypatch,
def _mk_analyser(norm=("[[:Punctuation:][:Space:]]+ > ' '",), trans=(':: upper()',),
variants=('~gasse -> gasse', 'street => st', ),
sanitizers=[]):
cfgstr = {'normalization' : list(norm),
'sanitizers' : sanitizers,
'transliteration' : list(trans),
'variants' : [ {'words': list(variants)}]}
cfgstr = {'normalization': list(norm),
'sanitizers': sanitizers,
'transliteration': list(trans),
'token-analysis': [{'variants': [{'words': list(variants)}]}]}
(test_config.project_dir / 'icu_tokenizer.yaml').write_text(yaml.dump(cfgstr))
tok.loader = ICURuleLoader(test_config)

View File

@@ -28,10 +28,10 @@ def cfgfile(def_config, tmp_path):
- ":: Latin ()"
- "'🜵' > ' '"
""")
content += "variants:\n - words:\n"
content += '\n'.join((" - " + s for s in variants)) + '\n'
content += "token-analysis:\n - variants:\n - words:\n"
content += '\n'.join((" - " + s for s in variants)) + '\n'
for k, v in kwargs:
content += " {}: {}\n".format(k, v)
content += " {}: {}\n".format(k, v)
(project_dir / 'icu_tokenizer.yaml').write_text(content)
return def_config

View File

@@ -34,8 +34,8 @@ def cfgrules(test_config):
- ":: Latin ()"
- "[[:Punctuation:][:Space:]]+ > ' '"
""")
content += "variants:\n - words:\n"
content += '\n'.join((" - " + s for s in variants)) + '\n'
content += "token-analysis:\n - variants:\n - words:\n"
content += '\n'.join((" - " + s for s in variants)) + '\n'
for k, v in kwargs:
content += " {}: {}\n".format(k, v)
(test_config.project_dir / 'icu_tokenizer.yaml').write_text(content)
@@ -49,20 +49,20 @@ def test_empty_rule_set(test_config):
(test_config.project_dir / 'icu_tokenizer.yaml').write_text(dedent("""\
normalization:
transliteration:
variants:
token-analysis:
- variants:
"""))
rules = ICURuleLoader(test_config)
assert rules.get_search_rules() == ''
assert rules.get_normalization_rules() == ''
assert rules.get_transliteration_rules() == ''
assert list(rules.get_replacement_pairs()) == []
CONFIG_SECTIONS = ('normalization', 'transliteration', 'variants')
CONFIG_SECTIONS = ('normalization', 'transliteration', 'token-analysis')
@pytest.mark.parametrize("section", CONFIG_SECTIONS)
def test_missing_section(section, test_config):
rule_cfg = { s: {} for s in CONFIG_SECTIONS if s != section}
rule_cfg = { s: [] for s in CONFIG_SECTIONS if s != section}
(test_config.project_dir / 'icu_tokenizer.yaml').write_text(yaml.dump(rule_cfg))
with pytest.raises(UsageError):
@@ -107,7 +107,8 @@ def test_transliteration_rules_from_file(test_config):
transliteration:
- "'ax' > 'b'"
- !include transliteration.yaml
variants:
token-analysis:
- variants:
"""))
transpath = test_config.project_dir / ('transliteration.yaml')
transpath.write_text('- "x > y"')
@@ -127,7 +128,7 @@ class TestGetReplacements:
def get_replacements(self, *variants):
loader = ICURuleLoader(self.cfgrules(*variants))
rules = loader.get_replacement_pairs()
rules = loader.analysis[None].variants
return set((v.source, v.replacement) for v in rules)