mirror of
https://github.com/osm-search/Nominatim.git
synced 2026-03-10 21:04:07 +00:00
use yaml tag syntax to mark include files
This commit is contained in:
@@ -5,6 +5,7 @@ import io
|
|||||||
import logging
|
import logging
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
import itertools
|
import itertools
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
import yaml
|
import yaml
|
||||||
from icu import Transliterator
|
from icu import Transliterator
|
||||||
@@ -13,6 +14,22 @@ from nominatim.errors import UsageError
|
|||||||
|
|
||||||
LOG = logging.getLogger()
|
LOG = logging.getLogger()
|
||||||
|
|
||||||
|
def _flatten_yaml_list(content):
    """ Flatten an arbitrarily nested list into a single flat list.

        Nested lists are expanded in place, depth first, so the relative
        order of the elements is preserved. Such nesting appears, for
        example, when one list entry is itself a list.

        Any falsy `content` (None, an empty list, an empty string, ...)
        yields an empty list. Any other value that is not a list raises
        a UsageError. Elements that are not lists are copied to the
        output unchanged and unchecked.
    """
    if not content:
        return []

    if not isinstance(content, list):
        raise UsageError("List expected in ICU yaml configuration.")

    output = []
    for ele in content:
        if isinstance(ele, list):
            # Recurse so that lists nested at any depth are expanded.
            output.extend(_flatten_yaml_list(ele))
        else:
            output.append(ele)

    return output
|
||||||
|
|
||||||
|
|
||||||
class ICURuleLoader:
|
class ICURuleLoader:
|
||||||
""" Compiler for ICU rules from a tokenizer configuration file.
|
""" Compiler for ICU rules from a tokenizer configuration file.
|
||||||
@@ -87,8 +104,20 @@ class ICURuleLoader:
|
|||||||
|
|
||||||
return [(k, list(synonyms[k])) for k in sorted_keys]
|
return [(k, list(synonyms[k])) for k in sorted_keys]
|
||||||
|
|
||||||
|
def _yaml_include_representer(self, loader, node):
    """ Handler for the '!include' tag: load the YAML file named by the
        tag's scalar value and return its parsed content in place of
        the node.

        Relative file names are resolved against the directory of the
        main configuration file. Absolute file names are used as they
        are.

        NOTE: despite its name, this function is used as a YAML
        constructor, not as a representer.
    """
    value = loader.construct_scalar(node)

    # Joining with an absolute path discards the left-hand side, so this
    # single expression covers relative and absolute include names alike.
    include_path = self.configfile.parent / value

    # YAML files are expected to be UTF-8. Do not depend on the locale's
    # default encoding, the included rules may contain non-ASCII text.
    return yaml.safe_load(include_path.read_text(encoding='utf-8'))
|
||||||
|
|
||||||
|
|
||||||
def _load_from_yaml(self):
|
def _load_from_yaml(self):
|
||||||
|
yaml.add_constructor('!include', self._yaml_include_representer,
|
||||||
|
Loader=yaml.SafeLoader)
|
||||||
rules = yaml.safe_load(self.configfile.read_text())
|
rules = yaml.safe_load(self.configfile.read_text())
|
||||||
|
|
||||||
self.normalization_rules = self._cfg_to_icu_rules(rules, 'normalization')
|
self.normalization_rules = self._cfg_to_icu_rules(rules, 'normalization')
|
||||||
@@ -121,10 +150,8 @@ class ICURuleLoader:
|
|||||||
if content is None:
|
if content is None:
|
||||||
return ''
|
return ''
|
||||||
|
|
||||||
if isinstance(content, str):
|
return ';'.join(_flatten_yaml_list(content)) + ';'
|
||||||
return (self.configfile.parent / content).read_text().replace('\n', ' ')
|
|
||||||
|
|
||||||
return ';'.join(content) + ';'
|
|
||||||
|
|
||||||
|
|
||||||
def _parse_compound_suffix_list(self, rules):
|
def _parse_compound_suffix_list(self, rules):
|
||||||
|
|||||||
4941
settings/icu-rules/extended-unicode-to-asccii.yaml
Normal file
4941
settings/icu-rules/extended-unicode-to-asccii.yaml
Normal file
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -5,7 +5,16 @@ normalization:
|
|||||||
- "ß > 'ss'" # German eszett is unambiguously equal to double ss
|
- "ß > 'ss'" # German eszett is unambiguously equal to double ss
|
||||||
- "[[:Punctuation:][:Space:]]+ > ' '"
|
- "[[:Punctuation:][:Space:]]+ > ' '"
|
||||||
- ":: NFC ()"
|
- ":: NFC ()"
|
||||||
transliteration: icu_transliteration.rules
|
transliteration:
|
||||||
|
- !include icu-rules/extended-unicode-to-asccii.yaml
|
||||||
|
- ":: Ascii ()"
|
||||||
|
- ":: NFD ()"
|
||||||
|
- "'' >"
|
||||||
|
- "[[:Nonspacing Mark:] [:Cf:]] >"
|
||||||
|
- "[^[:Ascii:]] >"
|
||||||
|
- ":: lower ()"
|
||||||
|
- "[[:Punctuation:][:Space:]]+ > ' '"
|
||||||
|
- ":: NFC ()"
|
||||||
compound_suffixes:
|
compound_suffixes:
|
||||||
# Danish
|
# Danish
|
||||||
- hal
|
- hal
|
||||||
|
|||||||
@@ -121,6 +121,26 @@ def test_get_transliteration_rules(cfgfile):
|
|||||||
assert trans.transliterate(" проспект-Prospekt ") == " prospekt Prospekt "
|
assert trans.transliterate(" проспект-Prospekt ") == " prospekt Prospekt "
|
||||||
|
|
||||||
|
|
||||||
|
def test_transliteration_rules_from_file(tmp_path):
    """ Transliteration rules pulled in with '!include' are loaded from
        a file next to the configuration and applied after the rules
        listed before the include.
    """
    cfgpath = tmp_path / 'test_config.yaml'
    cfgpath.write_text(dedent("""\
        normalization:
        transliteration:
            - "'ax' > 'b'"
            - !include transliteration.yaml
        compound_suffixes:
        abbreviations:
        """))
    # The include name is relative, so the file must sit in the same
    # directory as the configuration file.
    transpath = tmp_path / 'transliteration.yaml'
    transpath.write_text('- "x > y"')

    loader = ICURuleLoader(cfgpath)
    rules = loader.get_transliteration_rules()
    trans = Transliterator.createFromRules("test", rules)

    # 'ax' is rewritten by the inline rule, the remaining 'x' by the
    # included one.
    assert trans.transliterate(" axxt ") == " byt "
|
||||||
|
|
||||||
|
|
||||||
def test_get_replacement_pairs_multi_to(cfgfile):
|
def test_get_replacement_pairs_multi_to(cfgfile):
|
||||||
fpath = cfgfile(['Pfad', 'Strasse'],
|
fpath = cfgfile(['Pfad', 'Strasse'],
|
||||||
['Strasse => str,st'])
|
['Strasse => str,st'])
|
||||||
|
|||||||
Reference in New Issue
Block a user