move generation of normalized token form to analyzer

This gives the analyzer more flexibility in choosing the normalized
form. In particular, an analyzer creating different variants can choose
the variant that will be used as the canonical form.
This commit is contained in:
Sarah Hoffmann
2022-02-15 12:15:18 +01:00
parent 691ec08586
commit 837d44391c
5 changed files with 21 additions and 9 deletions

View File

@@ -25,5 +25,5 @@ class ICUTokenAnalysis:
         self.search = Transliterator.createFromRules("icu_search",
                                                      norm_rules + trans_rules)
-        self.analysis = {name: arules.create(self.to_ascii, arules.config)
+        self.analysis = {name: arules.create(self.normalizer, self.to_ascii, arules.config)
                          for name, arules in analysis_rules.items()}

View File

@@ -561,7 +561,8 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
         for name in names:
             analyzer_id = name.get_attr('analyzer')
-            norm_name = self._normalized(name.name)
+            analyzer = self.token_analysis.analysis[analyzer_id]
+            norm_name = analyzer.normalize(name.name)
             if analyzer_id is None:
                 token_id = norm_name
             else:
@@ -569,7 +570,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
                 full, part = self._cache.names.get(token_id, (None, None))
                 if full is None:
-                    variants = self.token_analysis.analysis[analyzer_id].get_variants_ascii(norm_name)
+                    variants = analyzer.get_variants_ascii(norm_name)
                     if not variants:
                         continue

View File

@@ -47,10 +47,10 @@ def configure(rules, normalization_rules):
 ### Analysis section

-def create(transliterator, config):
+def create(normalizer, transliterator, config):
     """ Create a new token analysis instance for this module.
     """
-    return GenericTokenAnalysis(transliterator, config)
+    return GenericTokenAnalysis(normalizer, transliterator, config)


 class GenericTokenAnalysis:
@@ -58,7 +58,8 @@ class GenericTokenAnalysis:
     and provides the functions to apply the transformations.
     """

-    def __init__(self, to_ascii, config):
+    def __init__(self, norm, to_ascii, config):
+        self.norm = norm
         self.to_ascii = to_ascii
         self.variant_only = config['variant_only']
@@ -74,6 +75,13 @@ class GenericTokenAnalysis:
         self.mutations = [MutationVariantGenerator(*cfg) for cfg in config['mutations']]

+    def normalize(self, name):
+        """ Return the normalized form of the name. This is the standard form
+            from which possible variants for the name can be derived.
+        """
+        return self.norm.transliterate(name).strip()
+
     def get_variants_ascii(self, norm_name):
         """ Compute the spelling variants for the given normalized name
             and transliterate the result.

View File

@@ -32,8 +32,9 @@ def make_analyser(*variants, variant_only=False):
         rules['mode'] = 'variant-only'
     config = module.configure(rules, DEFAULT_NORMALIZATION)
     trans = Transliterator.createFromRules("test_trans", DEFAULT_TRANSLITERATION)
+    norm = Transliterator.createFromRules("test_norm", DEFAULT_NORMALIZATION)

-    return module.create(trans, config)
+    return module.create(norm, trans, config)


 def get_normalized_variants(proc, name):
@@ -45,8 +46,9 @@ def test_no_variants():
     rules = { 'analyzer': 'generic' }
     config = module.configure(rules, DEFAULT_NORMALIZATION)
     trans = Transliterator.createFromRules("test_trans", DEFAULT_TRANSLITERATION)
+    norm = Transliterator.createFromRules("test_norm", DEFAULT_NORMALIZATION)

-    proc = module.create(trans, config)
+    proc = module.create(norm, trans, config)

     assert get_normalized_variants(proc, '大德!') == ['dà dé']

View File

@@ -33,8 +33,9 @@ class TestMutationNoVariants:
         }
         config = module.configure(rules, DEFAULT_NORMALIZATION)
         trans = Transliterator.createFromRules("test_trans", DEFAULT_TRANSLITERATION)
+        norm = Transliterator.createFromRules("test_norm", DEFAULT_NORMALIZATION)

-        self.analysis = module.create(trans, config)
+        self.analysis = module.create(norm, trans, config)

     def variants(self, name):