mirror of
https://github.com/osm-search/Nominatim.git
synced 2026-03-09 03:24:06 +00:00
icu tokenizer: move transliteration rules into a separate file
The tokenizer configuration has become difficult to handle due to the additional manual transliteration rules. Allow the rules to be kept in a separate rule file whose content is passed to the ICU library as is.
This commit is contained in:
@@ -259,4 +259,5 @@ install(FILES settings/env.defaults
|
|||||||
settings/import-full.style
|
settings/import-full.style
|
||||||
settings/import-extratags.style
|
settings/import-extratags.style
|
||||||
settings/legacy_icu_tokenizer.json
|
settings/legacy_icu_tokenizer.json
|
||||||
|
settings/icu_transliteration.rules
|
||||||
DESTINATION ${NOMINATIM_CONFIGDIR})
|
DESTINATION ${NOMINATIM_CONFIGDIR})
|
||||||
|
|||||||
@@ -58,7 +58,7 @@ class LegacyICUTokenizer:
|
|||||||
cfgfile = config.config_dir / 'legacy_icu_tokenizer.json'
|
cfgfile = config.config_dir / 'legacy_icu_tokenizer.json'
|
||||||
|
|
||||||
rules = json.loads(cfgfile.read_text())
|
rules = json.loads(cfgfile.read_text())
|
||||||
self.transliteration = ';'.join(rules['normalization']) + ';'
|
self._load_transliteration(rules['normalization'], cfgfile.parent)
|
||||||
self.abbreviations = rules["abbreviations"]
|
self.abbreviations = rules["abbreviations"]
|
||||||
self.normalization = config.TERM_NORMALIZATION
|
self.normalization = config.TERM_NORMALIZATION
|
||||||
|
|
||||||
@@ -70,6 +70,12 @@ class LegacyICUTokenizer:
|
|||||||
self._init_db_tables(config)
|
self._init_db_tables(config)
|
||||||
|
|
||||||
|
|
||||||
|
def _load_transliteration(self, rules, cfg_path):
|
||||||
|
if isinstance(rules, str):
|
||||||
|
self.transliteration = (cfg_path / rules).read_text().replace('\n', ' ')
|
||||||
|
else:
|
||||||
|
self.transliteration = ';'.join(rules) + ';'
|
||||||
|
|
||||||
def init_from_project(self):
|
def init_from_project(self):
|
||||||
""" Initialise the tokenizer from the project directory.
|
""" Initialise the tokenizer from the project directory.
|
||||||
"""
|
"""
|
||||||
|
|||||||
4949
settings/icu_transliteration.rules
Normal file
4949
settings/icu_transliteration.rules
Normal file
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user