forked from hans/Nominatim
icu tokenizer: move transliteration rules into a separate file
The tokenizer configuration has become difficult to handle because of the additional manual transliteration rules. Allow the rules to be kept in a separate rule file that is passed to the ICU library as is.
This commit is contained in:
@@ -58,7 +58,7 @@ class LegacyICUTokenizer:
|
||||
cfgfile = config.config_dir / 'legacy_icu_tokenizer.json'
|
||||
|
||||
rules = json.loads(cfgfile.read_text())
|
||||
self.transliteration = ';'.join(rules['normalization']) + ';'
|
||||
self._load_transliteration(rules['normalization'], cfgfile.parent)
|
||||
self.abbreviations = rules["abbreviations"]
|
||||
self.normalization = config.TERM_NORMALIZATION
|
||||
|
||||
@@ -70,6 +70,12 @@ class LegacyICUTokenizer:
|
||||
self._init_db_tables(config)
|
||||
|
||||
|
||||
def _load_transliteration(self, rules, cfg_path):
    """ Set `self.transliteration` from the given rule definition.

        When `rules` is a string, it is taken as the name of a rule file
        relative to `cfg_path`. The file content is used as is, except
        that newlines are replaced with spaces. Otherwise `rules` must be
        an iterable of rule strings, which are joined with ';' and
        terminated with a trailing ';'.
    """
    if isinstance(rules, str):
        # Decode explicitly as UTF-8 instead of relying on the locale's
        # default encoding, which would garble or reject non-ASCII rules
        # on systems where the default is not UTF-8.
        content = (cfg_path / rules).read_text(encoding='utf-8')
        self.transliteration = content.replace('\n', ' ')
    else:
        self.transliteration = ';'.join(rules) + ';'
|
||||
|
||||
def init_from_project(self):
|
||||
""" Initialise the tokenizer from the project directory.
|
||||
"""
|
||||
|
||||
Reference in New Issue
Block a user