extend ICU config to accommodate multiple analysers

Adds parsing of multiple variant lists from the configuration.
Every entry except one must have a unique 'id' parameter to
distinguish the entries. The entry without id is considered
the default. Currently only the list without an id is used
for analysis.
This commit is contained in:
Sarah Hoffmann
2021-10-04 16:40:28 +02:00
parent 5a36559834
commit 52847b61a3
5 changed files with 92 additions and 64 deletions

View File

@@ -43,12 +43,10 @@ class ICURuleLoader:
rules = config.load_sub_configuration('icu_tokenizer.yaml', rules = config.load_sub_configuration('icu_tokenizer.yaml',
config='TOKENIZER_CONFIG') config='TOKENIZER_CONFIG')
self.variants = set()
self.normalization_rules = self._cfg_to_icu_rules(rules, 'normalization') self.normalization_rules = self._cfg_to_icu_rules(rules, 'normalization')
self.transliteration_rules = self._cfg_to_icu_rules(rules, 'transliteration') self.transliteration_rules = self._cfg_to_icu_rules(rules, 'transliteration')
self.analysis_rules = self._get_section(rules, 'variants') self.analysis_rules = self._get_section(rules, 'token-analysis')
self._parse_variant_list() self._setup_analysis()
# Load optional sanitizer rule set. # Load optional sanitizer rule set.
self.sanitizer_rules = rules.get('sanitizers', []) self.sanitizer_rules = rules.get('sanitizers', [])
@@ -61,7 +59,7 @@ class ICURuleLoader:
self.normalization_rules = get_property(conn, DBCFG_IMPORT_NORM_RULES) self.normalization_rules = get_property(conn, DBCFG_IMPORT_NORM_RULES)
self.transliteration_rules = get_property(conn, DBCFG_IMPORT_TRANS_RULES) self.transliteration_rules = get_property(conn, DBCFG_IMPORT_TRANS_RULES)
self.analysis_rules = json.loads(get_property(conn, DBCFG_IMPORT_ANALYSIS_RULES)) self.analysis_rules = json.loads(get_property(conn, DBCFG_IMPORT_ANALYSIS_RULES))
self._parse_variant_list() self._setup_analysis()
def save_config_to_db(self, conn): def save_config_to_db(self, conn):
@@ -82,9 +80,8 @@ class ICURuleLoader:
def make_token_analysis(self): def make_token_analysis(self):
""" Create a token analyser from the reviouly loaded rules. """ Create a token analyser from the reviouly loaded rules.
""" """
return ICUNameProcessor(self.normalization_rules, return self.analysis[None].create(self.normalization_rules,
self.transliteration_rules, self.transliteration_rules)
self.variants)
def get_search_rules(self): def get_search_rules(self):
@@ -99,23 +96,37 @@ class ICURuleLoader:
rules.write(self.transliteration_rules) rules.write(self.transliteration_rules)
return rules.getvalue() return rules.getvalue()
def get_normalization_rules(self): def get_normalization_rules(self):
""" Return rules for normalisation of a term. """ Return rules for normalisation of a term.
""" """
return self.normalization_rules return self.normalization_rules
def get_transliteration_rules(self): def get_transliteration_rules(self):
""" Return the rules for converting a string into its asciii representation. """ Return the rules for converting a string into its asciii representation.
""" """
return self.transliteration_rules return self.transliteration_rules
def get_replacement_pairs(self):
""" Return the list of possible compound decompositions with def _setup_analysis(self):
application of abbreviations included. """ Process the rules used for creating the various token analyzers.
The result is a list of pairs: the first item is the sequence to
replace, the second is a list of replacements.
""" """
return self.variants self.analysis = {}
if not isinstance(self.analysis_rules, list):
raise UsageError("Configuration section 'token-analysis' must be a list.")
for section in self.analysis_rules:
name = section.get('id', None)
if name in self.analysis:
if name is None:
LOG.fatal("ICU tokenizer configuration has two default token analyzers.")
else:
LOG.fatal("ICU tokenizer configuration has two token "
"analyzers with id '%s'.", name)
UsageError("Syntax error in ICU tokenizer config.")
self.analysis[name] = TokenAnalyzerRule(section, self.normalization_rules)
@staticmethod @staticmethod
@@ -145,17 +156,32 @@ class ICURuleLoader:
return ';'.join(flatten_config_list(content, section)) + ';' return ';'.join(flatten_config_list(content, section)) + ';'
def _parse_variant_list(self): class TokenAnalyzerRule:
rules = self.analysis_rules """ Factory for a single analysis module. The class saves the configuration
and creates a new token analyzer on request.
"""
self.variants.clear() def __init__(self, rules, normalization_rules):
self._parse_variant_list(rules.get('variants'), normalization_rules)
def create(self, normalization_rules, transliteration_rules):
""" Create an analyzer from the given rules.
"""
return ICUNameProcessor(normalization_rules,
transliteration_rules,
self.variants)
def _parse_variant_list(self, rules, normalization_rules):
self.variants = set()
if not rules: if not rules:
return return
rules = flatten_config_list(rules, 'variants') rules = flatten_config_list(rules, 'variants')
vmaker = _VariantMaker(self.normalization_rules) vmaker = _VariantMaker(normalization_rules)
properties = [] properties = []
for section in rules: for section in rules:

View File

@@ -27,34 +27,35 @@ transliteration:
sanitizers: sanitizers:
- step: split-name-list - step: split-name-list
- step: strip-brace-terms - step: strip-brace-terms
variants: token-analysis:
- !include icu-rules/variants-bg.yaml - variants:
- !include icu-rules/variants-ca.yaml - !include icu-rules/variants-bg.yaml
- !include icu-rules/variants-cs.yaml - !include icu-rules/variants-ca.yaml
- !include icu-rules/variants-da.yaml - !include icu-rules/variants-cs.yaml
- !include icu-rules/variants-de.yaml - !include icu-rules/variants-da.yaml
- !include icu-rules/variants-el.yaml - !include icu-rules/variants-de.yaml
- !include icu-rules/variants-en.yaml - !include icu-rules/variants-el.yaml
- !include icu-rules/variants-es.yaml - !include icu-rules/variants-en.yaml
- !include icu-rules/variants-et.yaml - !include icu-rules/variants-es.yaml
- !include icu-rules/variants-eu.yaml - !include icu-rules/variants-et.yaml
- !include icu-rules/variants-fi.yaml - !include icu-rules/variants-eu.yaml
- !include icu-rules/variants-fr.yaml - !include icu-rules/variants-fi.yaml
- !include icu-rules/variants-gl.yaml - !include icu-rules/variants-fr.yaml
- !include icu-rules/variants-hu.yaml - !include icu-rules/variants-gl.yaml
- !include icu-rules/variants-it.yaml - !include icu-rules/variants-hu.yaml
- !include icu-rules/variants-ja.yaml - !include icu-rules/variants-it.yaml
- !include icu-rules/variants-mg.yaml - !include icu-rules/variants-ja.yaml
- !include icu-rules/variants-ms.yaml - !include icu-rules/variants-mg.yaml
- !include icu-rules/variants-nl.yaml - !include icu-rules/variants-ms.yaml
- !include icu-rules/variants-no.yaml - !include icu-rules/variants-nl.yaml
- !include icu-rules/variants-pl.yaml - !include icu-rules/variants-no.yaml
- !include icu-rules/variants-pt.yaml - !include icu-rules/variants-pl.yaml
- !include icu-rules/variants-ro.yaml - !include icu-rules/variants-pt.yaml
- !include icu-rules/variants-ru.yaml - !include icu-rules/variants-ro.yaml
- !include icu-rules/variants-sk.yaml - !include icu-rules/variants-ru.yaml
- !include icu-rules/variants-sl.yaml - !include icu-rules/variants-sk.yaml
- !include icu-rules/variants-sv.yaml - !include icu-rules/variants-sl.yaml
- !include icu-rules/variants-tr.yaml - !include icu-rules/variants-sv.yaml
- !include icu-rules/variants-uk.yaml - !include icu-rules/variants-tr.yaml
- !include icu-rules/variants-vi.yaml - !include icu-rules/variants-uk.yaml
- !include icu-rules/variants-vi.yaml

View File

@@ -69,10 +69,10 @@ def analyzer(tokenizer_factory, test_config, monkeypatch,
def _mk_analyser(norm=("[[:Punctuation:][:Space:]]+ > ' '",), trans=(':: upper()',), def _mk_analyser(norm=("[[:Punctuation:][:Space:]]+ > ' '",), trans=(':: upper()',),
variants=('~gasse -> gasse', 'street => st', ), variants=('~gasse -> gasse', 'street => st', ),
sanitizers=[]): sanitizers=[]):
cfgstr = {'normalization' : list(norm), cfgstr = {'normalization': list(norm),
'sanitizers' : sanitizers, 'sanitizers': sanitizers,
'transliteration' : list(trans), 'transliteration': list(trans),
'variants' : [ {'words': list(variants)}]} 'token-analysis': [{'variants': [{'words': list(variants)}]}]}
(test_config.project_dir / 'icu_tokenizer.yaml').write_text(yaml.dump(cfgstr)) (test_config.project_dir / 'icu_tokenizer.yaml').write_text(yaml.dump(cfgstr))
tok.loader = ICURuleLoader(test_config) tok.loader = ICURuleLoader(test_config)

View File

@@ -28,10 +28,10 @@ def cfgfile(def_config, tmp_path):
- ":: Latin ()" - ":: Latin ()"
- "'🜵' > ' '" - "'🜵' > ' '"
""") """)
content += "variants:\n - words:\n" content += "token-analysis:\n - variants:\n - words:\n"
content += '\n'.join((" - " + s for s in variants)) + '\n' content += '\n'.join((" - " + s for s in variants)) + '\n'
for k, v in kwargs: for k, v in kwargs:
content += " {}: {}\n".format(k, v) content += " {}: {}\n".format(k, v)
(project_dir / 'icu_tokenizer.yaml').write_text(content) (project_dir / 'icu_tokenizer.yaml').write_text(content)
return def_config return def_config

View File

@@ -34,8 +34,8 @@ def cfgrules(test_config):
- ":: Latin ()" - ":: Latin ()"
- "[[:Punctuation:][:Space:]]+ > ' '" - "[[:Punctuation:][:Space:]]+ > ' '"
""") """)
content += "variants:\n - words:\n" content += "token-analysis:\n - variants:\n - words:\n"
content += '\n'.join((" - " + s for s in variants)) + '\n' content += '\n'.join((" - " + s for s in variants)) + '\n'
for k, v in kwargs: for k, v in kwargs:
content += " {}: {}\n".format(k, v) content += " {}: {}\n".format(k, v)
(test_config.project_dir / 'icu_tokenizer.yaml').write_text(content) (test_config.project_dir / 'icu_tokenizer.yaml').write_text(content)
@@ -49,20 +49,20 @@ def test_empty_rule_set(test_config):
(test_config.project_dir / 'icu_tokenizer.yaml').write_text(dedent("""\ (test_config.project_dir / 'icu_tokenizer.yaml').write_text(dedent("""\
normalization: normalization:
transliteration: transliteration:
variants: token-analysis:
- variants:
""")) """))
rules = ICURuleLoader(test_config) rules = ICURuleLoader(test_config)
assert rules.get_search_rules() == '' assert rules.get_search_rules() == ''
assert rules.get_normalization_rules() == '' assert rules.get_normalization_rules() == ''
assert rules.get_transliteration_rules() == '' assert rules.get_transliteration_rules() == ''
assert list(rules.get_replacement_pairs()) == []
CONFIG_SECTIONS = ('normalization', 'transliteration', 'variants') CONFIG_SECTIONS = ('normalization', 'transliteration', 'token-analysis')
@pytest.mark.parametrize("section", CONFIG_SECTIONS) @pytest.mark.parametrize("section", CONFIG_SECTIONS)
def test_missing_section(section, test_config): def test_missing_section(section, test_config):
rule_cfg = { s: {} for s in CONFIG_SECTIONS if s != section} rule_cfg = { s: [] for s in CONFIG_SECTIONS if s != section}
(test_config.project_dir / 'icu_tokenizer.yaml').write_text(yaml.dump(rule_cfg)) (test_config.project_dir / 'icu_tokenizer.yaml').write_text(yaml.dump(rule_cfg))
with pytest.raises(UsageError): with pytest.raises(UsageError):
@@ -107,7 +107,8 @@ def test_transliteration_rules_from_file(test_config):
transliteration: transliteration:
- "'ax' > 'b'" - "'ax' > 'b'"
- !include transliteration.yaml - !include transliteration.yaml
variants: token-analysis:
- variants:
""")) """))
transpath = test_config.project_dir / ('transliteration.yaml') transpath = test_config.project_dir / ('transliteration.yaml')
transpath.write_text('- "x > y"') transpath.write_text('- "x > y"')
@@ -127,7 +128,7 @@ class TestGetReplacements:
def get_replacements(self, *variants): def get_replacements(self, *variants):
loader = ICURuleLoader(self.cfgrules(*variants)) loader = ICURuleLoader(self.cfgrules(*variants))
rules = loader.get_replacement_pairs() rules = loader.analysis[None].variants
return set((v.source, v.replacement) for v in rules) return set((v.source, v.replacement) for v in rules)