forked from hans/Nominatim
introduce generic YAML config loader
Adds a function to the Configuration class to load a YAML file. This generalises the search for the file, so that it now works the same way for all configuration files. Changes the search logic so that it is always possible to have a custom version of the configuration file in the project directory. Moves the ICU tokenizer to use the new load function.
This commit is contained in:
@@ -4,10 +4,8 @@ Helper class to create ICU rules from a configuration file.
|
||||
import io
|
||||
import logging
|
||||
import itertools
|
||||
from pathlib import Path
|
||||
import re
|
||||
|
||||
import yaml
|
||||
from icu import Transliterator
|
||||
|
||||
from nominatim.errors import UsageError
|
||||
@@ -15,17 +13,17 @@ import nominatim.tokenizer.icu_variants as variants
|
||||
|
||||
LOG = logging.getLogger()
|
||||
|
||||
def _flatten_yaml_list(content):
|
||||
def _flatten_config_list(content):
|
||||
if not content:
|
||||
return []
|
||||
|
||||
if not isinstance(content, list):
|
||||
raise UsageError("List expected in ICU yaml configuration.")
|
||||
raise UsageError("List expected in ICU configuration.")
|
||||
|
||||
output = []
|
||||
for ele in content:
|
||||
if isinstance(ele, list):
|
||||
output.extend(_flatten_yaml_list(ele))
|
||||
output.extend(_flatten_config_list(ele))
|
||||
else:
|
||||
output.append(ele)
|
||||
|
||||
@@ -48,14 +46,12 @@ class ICURuleLoader:
|
||||
""" Compiler for ICU rules from a tokenizer configuration file.
|
||||
"""
|
||||
|
||||
def __init__(self, configfile):
|
||||
self.configfile = configfile
|
||||
def __init__(self, rules):
|
||||
self.variants = set()
|
||||
|
||||
if configfile.suffix == '.yaml':
|
||||
self._load_from_yaml()
|
||||
else:
|
||||
raise UsageError("Unknown format of tokenizer configuration.")
|
||||
self.normalization_rules = self._cfg_to_icu_rules(rules, 'normalization')
|
||||
self.transliteration_rules = self._cfg_to_icu_rules(rules, 'transliteration')
|
||||
self._parse_variant_list(self._get_section(rules, 'variants'))
|
||||
|
||||
|
||||
def get_search_rules(self):
|
||||
@@ -88,34 +84,14 @@ class ICURuleLoader:
|
||||
"""
|
||||
return self.variants
|
||||
|
||||
def _yaml_include_representer(self, loader, node):
|
||||
value = loader.construct_scalar(node)
|
||||
|
||||
if Path(value).is_absolute():
|
||||
content = Path(value)
|
||||
else:
|
||||
content = (self.configfile.parent / value)
|
||||
|
||||
return yaml.safe_load(content.read_text(encoding='utf-8'))
|
||||
|
||||
|
||||
def _load_from_yaml(self):
|
||||
yaml.add_constructor('!include', self._yaml_include_representer,
|
||||
Loader=yaml.SafeLoader)
|
||||
rules = yaml.safe_load(self.configfile.read_text(encoding='utf-8'))
|
||||
|
||||
self.normalization_rules = self._cfg_to_icu_rules(rules, 'normalization')
|
||||
self.transliteration_rules = self._cfg_to_icu_rules(rules, 'transliteration')
|
||||
self._parse_variant_list(self._get_section(rules, 'variants'))
|
||||
|
||||
|
||||
def _get_section(self, rules, section):
|
||||
@staticmethod
|
||||
def _get_section(rules, section):
|
||||
""" Get the section named 'section' from the rules. If the section does
|
||||
not exist, raise a usage error with a meaningful message.
|
||||
"""
|
||||
if section not in rules:
|
||||
LOG.fatal("Section '%s' not found in tokenizer config '%s'.",
|
||||
section, str(self.configfile))
|
||||
LOG.fatal("Section '%s' not found in tokenizer config.", section)
|
||||
raise UsageError("Syntax error in tokenizer configuration file.")
|
||||
|
||||
return rules[section]
|
||||
@@ -133,7 +109,7 @@ class ICURuleLoader:
|
||||
if content is None:
|
||||
return ''
|
||||
|
||||
return ';'.join(_flatten_yaml_list(content)) + ';'
|
||||
return ';'.join(_flatten_config_list(content)) + ';'
|
||||
|
||||
|
||||
def _parse_variant_list(self, rules):
|
||||
@@ -142,7 +118,7 @@ class ICURuleLoader:
|
||||
if not rules:
|
||||
return
|
||||
|
||||
rules = _flatten_yaml_list(rules)
|
||||
rules = _flatten_config_list(rules)
|
||||
|
||||
vmaker = _VariantMaker(self.normalization_rules)
|
||||
|
||||
|
||||
@@ -8,7 +8,6 @@ import json
|
||||
import logging
|
||||
import re
|
||||
from textwrap import dedent
|
||||
from pathlib import Path
|
||||
|
||||
from nominatim.db.connection import connect
|
||||
from nominatim.db.properties import set_property, get_property
|
||||
@@ -49,12 +48,8 @@ class LegacyICUTokenizer(AbstractTokenizer):
|
||||
This copies all necessary data in the project directory to make
|
||||
sure the tokenizer remains stable even over updates.
|
||||
"""
|
||||
if config.TOKENIZER_CONFIG:
|
||||
cfgfile = Path(config.TOKENIZER_CONFIG)
|
||||
else:
|
||||
cfgfile = config.config_dir / 'icu_tokenizer.yaml'
|
||||
|
||||
loader = ICURuleLoader(cfgfile)
|
||||
loader = ICURuleLoader(config.load_sub_configuration('icu_tokenizer.yaml',
|
||||
config='TOKENIZER_CONFIG'))
|
||||
self.naming_rules = ICUNameProcessorRules(loader=loader)
|
||||
self.term_normalization = config.TERM_NORMALIZATION
|
||||
self.max_word_frequency = config.MAX_WORD_FREQUENCY
|
||||
|
||||
Reference in New Issue
Block a user