use yaml tag syntax to mark include files

This commit is contained in:
Sarah Hoffmann
2021-06-20 23:45:33 +02:00
parent c4f6c06f44
commit a6aa6360e0
5 changed files with 5001 additions and 4953 deletions

View File

@@ -5,6 +5,7 @@ import io
import logging import logging
from collections import defaultdict from collections import defaultdict
import itertools import itertools
from pathlib import Path
import yaml import yaml
from icu import Transliterator from icu import Transliterator
@@ -13,6 +14,22 @@ from nominatim.errors import UsageError
LOG = logging.getLogger() LOG = logging.getLogger()
def _flatten_yaml_list(content):
if not content:
return []
if not isinstance(content, list):
raise UsageError("List expected in ICU yaml configuration.")
output = []
for ele in content:
if isinstance(ele, list):
output.extend(_flatten_yaml_list(ele))
else:
output.append(ele)
return output
class ICURuleLoader: class ICURuleLoader:
""" Compiler for ICU rules from a tokenizer configuration file. """ Compiler for ICU rules from a tokenizer configuration file.
@@ -87,8 +104,20 @@ class ICURuleLoader:
return [(k, list(synonyms[k])) for k in sorted_keys] return [(k, list(synonyms[k])) for k in sorted_keys]
def _yaml_include_representer(self, loader, node):
    """ Handler for the custom `!include` YAML tag.

        Despite its name, this function is used as a YAML *constructor*:
        it receives the scalar node carrying the tag, interprets the
        scalar value as a file name, loads that file as YAML and returns
        the parsed content in place of the tagged node.

        Relative file names are resolved against the directory of the
        main configuration file (`self.configfile`), absolute file names
        are used as they are.
    """
    value = loader.construct_scalar(node)

    # Joining a path with an absolute path yields the absolute path, so a
    # single join covers both the relative and the absolute case.
    include_path = self.configfile.parent / value

    # Decode explicitly as UTF-8: the rule files contain non-ASCII
    # characters and must not depend on the locale of the running process.
    return yaml.safe_load(include_path.read_text(encoding='utf-8'))
def _load_from_yaml(self): def _load_from_yaml(self):
yaml.add_constructor('!include', self._yaml_include_representer,
Loader=yaml.SafeLoader)
rules = yaml.safe_load(self.configfile.read_text()) rules = yaml.safe_load(self.configfile.read_text())
self.normalization_rules = self._cfg_to_icu_rules(rules, 'normalization') self.normalization_rules = self._cfg_to_icu_rules(rules, 'normalization')
@@ -121,10 +150,8 @@ class ICURuleLoader:
if content is None: if content is None:
return '' return ''
if isinstance(content, str): return ';'.join(_flatten_yaml_list(content)) + ';'
return (self.configfile.parent / content).read_text().replace('\n', ' ')
return ';'.join(content) + ';'
def _parse_compound_suffix_list(self, rules): def _parse_compound_suffix_list(self, rules):

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -5,7 +5,16 @@ normalization:
- "ß > 'ss'" # German eszett is unambiguously equal to double ss - "ß > 'ss'" # German eszett is unambiguously equal to double ss
- "[[:Punctuation:][:Space:]]+ > ' '" - "[[:Punctuation:][:Space:]]+ > ' '"
- ":: NFC ()" - ":: NFC ()"
transliteration: icu_transliteration.rules transliteration:
- !include icu-rules/extended-unicode-to-asccii.yaml
- ":: Ascii ()"
- ":: NFD ()"
- "'' >"
- "[[:Nonspacing Mark:] [:Cf:]] >"
- "[^[:Ascii:]] >"
- ":: lower ()"
- "[[:Punctuation:][:Space:]]+ > ' '"
- ":: NFC ()"
compound_suffixes: compound_suffixes:
# Danish # Danish
- hal - hal

View File

@@ -121,6 +121,26 @@ def test_get_transliteration_rules(cfgfile):
assert trans.transliterate(" проспект-Prospekt ") == " prospekt Prospekt " assert trans.transliterate(" проспект-Prospekt ") == " prospekt Prospekt "
# Check that an `!include` tag inside the transliteration list pulls in the
# rules from a second YAML file stored in the same directory as the main
# configuration file.
def test_transliteration_rules_from_file(tmp_path):
cfgpath = tmp_path / ('test_config.yaml')
cfgpath.write_text(dedent("""\
normalization:
transliteration:
- "'ax' > 'b'"
- !include transliteration.yaml
compound_suffixes:
abbreviations:
"""))
# The included file is referenced by a relative name, so it is written
# into the same directory as the configuration file.
transpath = tmp_path / ('transliteration.yaml')
transpath.write_text('- "x > y"')
loader = ICURuleLoader(cfgpath)
rules = loader.get_transliteration_rules()
trans = Transliterator.createFromRules("test", rules)
# "'ax' > 'b'" comes from the main file, "x > y" from the included file;
# the expected output requires both rules to be applied.
assert trans.transliterate(" axxt ") == " byt "
def test_get_replacement_pairs_multi_to(cfgfile): def test_get_replacement_pairs_multi_to(cfgfile):
fpath = cfgfile(['Pfad', 'Strasse'], fpath = cfgfile(['Pfad', 'Strasse'],
['Strasse => str,st']) ['Strasse => str,st'])