make token analyzers configurable modules

Adds a mandatory 'analyzer' section to the token-analysis entries,
which defines which analyzer to use. Currently there is exactly
one analyzer, 'generic', which implements the former ICUNameProcessor.
This commit is contained in:
Sarah Hoffmann
2021-10-04 17:34:30 +02:00
parent 52847b61a3
commit 7cfcbacfc7
7 changed files with 49 additions and 29 deletions

View File

@@ -72,7 +72,8 @@ def analyzer(tokenizer_factory, test_config, monkeypatch,
cfgstr = {'normalization': list(norm),
'sanitizers': sanitizers,
'transliteration': list(trans),
'token-analysis': [{'variants': [{'words': list(variants)}]}]}
'token-analysis': [{'analyzer': 'generic',
'variants': [{'words': list(variants)}]}]}
(test_config.project_dir / 'icu_tokenizer.yaml').write_text(yaml.dump(cfgstr))
tok.loader = ICURuleLoader(test_config)