icu tokenizer: move transliteration rules into a separate file

The tokenizer configuration has become difficult to handle
due to the additional manual transliteration rules. Allow
the rules to be kept in a separate file whose content is
passed to the ICU library as is.
This commit is contained in:
Sarah Hoffmann
2021-05-26 20:50:34 +02:00
parent de4fac33dc
commit 6ba00e6aee
4 changed files with 4958 additions and 4951 deletions

View File

@@ -259,4 +259,5 @@ install(FILES settings/env.defaults
settings/import-full.style settings/import-full.style
settings/import-extratags.style settings/import-extratags.style
settings/legacy_icu_tokenizer.json settings/legacy_icu_tokenizer.json
settings/icu_transliteration.rules
DESTINATION ${NOMINATIM_CONFIGDIR}) DESTINATION ${NOMINATIM_CONFIGDIR})

View File

@@ -58,7 +58,7 @@ class LegacyICUTokenizer:
cfgfile = config.config_dir / 'legacy_icu_tokenizer.json' cfgfile = config.config_dir / 'legacy_icu_tokenizer.json'
rules = json.loads(cfgfile.read_text()) rules = json.loads(cfgfile.read_text())
self.transliteration = ';'.join(rules['normalization']) + ';' self._load_transliteration(rules['normalization'], cfgfile.parent)
self.abbreviations = rules["abbreviations"] self.abbreviations = rules["abbreviations"]
self.normalization = config.TERM_NORMALIZATION self.normalization = config.TERM_NORMALIZATION
@@ -70,6 +70,12 @@ class LegacyICUTokenizer:
self._init_db_tables(config) self._init_db_tables(config)
def _load_transliteration(self, rules, cfg_path):
    """ Set `self.transliteration` from the 'normalization' setting.

        `rules` is either a string or an iterable of rule strings.
        A string is taken as the name of a rule file relative to the
        directory `cfg_path`; the file content is used with its
        newlines replaced by spaces. Otherwise the rules are joined
        with ';' and terminated with a final ';'.
    """
    if isinstance(rules, str):
        # Read explicitly as UTF-8 so that non-ASCII characters in the
        # rule file do not depend on the locale of the running process.
        content = (cfg_path / rules).read_text(encoding='utf-8')
        self.transliteration = content.replace('\n', ' ')
    else:
        self.transliteration = ';'.join(rules) + ';'
def init_from_project(self): def init_from_project(self):
""" Initialise the tokenizer from the project directory. """ Initialise the tokenizer from the project directory.
""" """

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff