extend ICU config to accommodate multiple analysers

Adds parsing of multiple variant lists from the configuration.
Every entry except one must have a unique 'id' parameter to
distinguish the entries. The entry without id is considered
the default. Currently only the list without an id is used
for analysis.
This commit is contained in:
Sarah Hoffmann
2021-10-04 16:40:28 +02:00
parent 5a36559834
commit 52847b61a3
5 changed files with 92 additions and 64 deletions

View File

@@ -69,10 +69,10 @@ def analyzer(tokenizer_factory, test_config, monkeypatch,
def _mk_analyser(norm=("[[:Punctuation:][:Space:]]+ > ' '",), trans=(':: upper()',),
variants=('~gasse -> gasse', 'street => st', ),
sanitizers=[]):
cfgstr = {'normalization' : list(norm),
'sanitizers' : sanitizers,
'transliteration' : list(trans),
'variants' : [ {'words': list(variants)}]}
cfgstr = {'normalization': list(norm),
'sanitizers': sanitizers,
'transliteration': list(trans),
'token-analysis': [{'variants': [{'words': list(variants)}]}]}
(test_config.project_dir / 'icu_tokenizer.yaml').write_text(yaml.dump(cfgstr))
tok.loader = ICURuleLoader(test_config)

View File

@@ -28,10 +28,10 @@ def cfgfile(def_config, tmp_path):
- ":: Latin ()"
- "'🜵' > ' '"
""")
content += "variants:\n - words:\n"
content += '\n'.join((" - " + s for s in variants)) + '\n'
content += "token-analysis:\n - variants:\n - words:\n"
content += '\n'.join((" - " + s for s in variants)) + '\n'
for k, v in kwargs:
content += " {}: {}\n".format(k, v)
content += " {}: {}\n".format(k, v)
(project_dir / 'icu_tokenizer.yaml').write_text(content)
return def_config

View File

@@ -34,8 +34,8 @@ def cfgrules(test_config):
- ":: Latin ()"
- "[[:Punctuation:][:Space:]]+ > ' '"
""")
content += "variants:\n - words:\n"
content += '\n'.join((" - " + s for s in variants)) + '\n'
content += "token-analysis:\n - variants:\n - words:\n"
content += '\n'.join((" - " + s for s in variants)) + '\n'
for k, v in kwargs:
content += " {}: {}\n".format(k, v)
(test_config.project_dir / 'icu_tokenizer.yaml').write_text(content)
@@ -49,20 +49,20 @@ def test_empty_rule_set(test_config):
(test_config.project_dir / 'icu_tokenizer.yaml').write_text(dedent("""\
normalization:
transliteration:
variants:
token-analysis:
- variants:
"""))
rules = ICURuleLoader(test_config)
assert rules.get_search_rules() == ''
assert rules.get_normalization_rules() == ''
assert rules.get_transliteration_rules() == ''
assert list(rules.get_replacement_pairs()) == []
CONFIG_SECTIONS = ('normalization', 'transliteration', 'variants')
CONFIG_SECTIONS = ('normalization', 'transliteration', 'token-analysis')
@pytest.mark.parametrize("section", CONFIG_SECTIONS)
def test_missing_section(section, test_config):
rule_cfg = { s: {} for s in CONFIG_SECTIONS if s != section}
rule_cfg = { s: [] for s in CONFIG_SECTIONS if s != section}
(test_config.project_dir / 'icu_tokenizer.yaml').write_text(yaml.dump(rule_cfg))
with pytest.raises(UsageError):
@@ -107,7 +107,8 @@ def test_transliteration_rules_from_file(test_config):
transliteration:
- "'ax' > 'b'"
- !include transliteration.yaml
variants:
token-analysis:
- variants:
"""))
transpath = test_config.project_dir / ('transliteration.yaml')
transpath.write_text('- "x > y"')
@@ -127,7 +128,7 @@ class TestGetReplacements:
def get_replacements(self, *variants):
loader = ICURuleLoader(self.cfgrules(*variants))
rules = loader.get_replacement_pairs()
rules = loader.analysis[None].variants
return set((v.source, v.replacement) for v in rules)