switch to a more flexible variant description format

The new format combines compound splitting and abbreviation.
It also allows rules to be restricted by additional conditions
(such as language or region). This latter ability is not used
yet.
This commit is contained in:
Sarah Hoffmann
2021-06-24 20:02:07 +02:00
parent a6aa6360e0
commit 62828fc5c1
8 changed files with 1207 additions and 1053 deletions

View File

@@ -60,13 +60,12 @@ def analyzer(tokenizer_factory, test_config, monkeypatch,
monkeypatch.undo()
def _mk_analyser(norm=("[[:Punctuation:][:Space:]]+ > ' '",), trans=(':: upper()',),
suffixes=('gasse', ), abbr=('street => st', )):
variants=('~gasse -> gasse', 'street => st', )):
cfgfile = tmp_path / 'analyser_test_config.yaml'
with cfgfile.open('w') as stream:
cfgstr = {'normalization' : list(norm),
'transliteration' : list(trans),
'compound_suffixes' : list(suffixes),
'abbreviations' : list(abbr)}
'variants' : [ {'words': list(variants)}]}
yaml.dump(cfgstr, stream)
tok.naming_rules = ICUNameProcessorRules(loader=ICURuleLoader(cfgfile))