make token analyzers configurable modules

Adds a mandatory section 'analyzer' to the token-analysis entries which define, which analyser to use. Currently there is exactly one, generic, which implements the former ICUNameProcessor.
2021-10-04 17:34:30 +02:00
parent 52847b61a3
commit 7cfcbacfc7
7 changed files with 49 additions and 29 deletions
--- a/test/python/test_tokenizer_icu.py
+++ b/test/python/test_tokenizer_icu.py
@@ -72,7 +72,8 @@ def analyzer(tokenizer_factory, test_config, monkeypatch,
        cfgstr = {'normalization': list(norm),
                  'sanitizers': sanitizers,
                  'transliteration': list(trans),
-                  'token-analysis': [{'variants': [{'words': list(variants)}]}]}
+                  'token-analysis': [{'analyzer': 'generic',
+                                      'variants': [{'words': list(variants)}]}]}
        (test_config.project_dir / 'icu_tokenizer.yaml').write_text(yaml.dump(cfgstr))
        tok.loader = ICURuleLoader(test_config)