extend ICU config to accommodate multiple analysers

Adds parsing of multiple variant lists from the configuration.
Every entry except one must have a unique 'id' parameter to
distinguish the entries. The entry without id is considered
the default. Currently only the list without an id is used
for analysis.
This commit is contained in:
Sarah Hoffmann
2021-10-04 16:40:28 +02:00
parent 5a36559834
commit 52847b61a3
5 changed files with 92 additions and 64 deletions

View File

@@ -69,10 +69,10 @@ def analyzer(tokenizer_factory, test_config, monkeypatch,
def _mk_analyser(norm=("[[:Punctuation:][:Space:]]+ > ' '",), trans=(':: upper()',),
variants=('~gasse -> gasse', 'street => st', ),
sanitizers=[]):
cfgstr = {'normalization' : list(norm),
'sanitizers' : sanitizers,
'transliteration' : list(trans),
'variants' : [ {'words': list(variants)}]}
cfgstr = {'normalization': list(norm),
'sanitizers': sanitizers,
'transliteration': list(trans),
'token-analysis': [{'variants': [{'words': list(variants)}]}]}
(test_config.project_dir / 'icu_tokenizer.yaml').write_text(yaml.dump(cfgstr))
tok.loader = ICURuleLoader(test_config)