icu tokenizer: move transliteration rules into a separate file

The tokenizer configuration has become difficult to handle
due to the additional manual transliteration rules. Allow
the rules to be kept in a separate file whose content is
handed to the ICU library as is.
This commit is contained in:
Sarah Hoffmann
2021-05-26 20:50:34 +02:00
parent de4fac33dc
commit 6ba00e6aee
4 changed files with 4958 additions and 4951 deletions

View File

@@ -58,7 +58,7 @@ class LegacyICUTokenizer:
cfgfile = config.config_dir / 'legacy_icu_tokenizer.json'
rules = json.loads(cfgfile.read_text())
self.transliteration = ';'.join(rules['normalization']) + ';'
self._load_transliteration(rules['normalization'], cfgfile.parent)
self.abbreviations = rules["abbreviations"]
self.normalization = config.TERM_NORMALIZATION
@@ -70,6 +70,12 @@ class LegacyICUTokenizer:
self._init_db_tables(config)
def _load_transliteration(self, rules, cfg_path):
    """ Set `self.transliteration` from the 'normalization' setting.

        `rules` is either a string or a sequence of rule strings.
        A string is taken as the name of a rule file relative to the
        directory `cfg_path`; the file content is used as is, except
        that newlines are replaced with spaces. A sequence is joined
        with ';' and terminated with a final ';'.
    """
    if isinstance(rules, str):
        # Decode explicitly as UTF-8: relying on the locale's default
        # encoding breaks on systems where that default is not UTF-8
        # as soon as the file contains non-ASCII characters.
        rule_text = (cfg_path / rules).read_text(encoding='utf-8')
        self.transliteration = rule_text.replace('\n', ' ')
    else:
        self.transliteration = ';'.join(rules) + ';'
def init_from_project(self):
""" Initialise the tokenizer from the project directory.
"""