unify ICUNameProcessorRules and ICURuleLoader

There is no need for the additional layer of indirection that
the ICUNameProcessorRules class adds. The ICURuleLoader can
fill the database properties directly.
Sarah Hoffmann
2021-09-29 17:37:04 +02:00
parent 5e5addcdbf
commit 16daa57e47
14 changed files with 123 additions and 137 deletions
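The diffs below collapse a two-step construction into a single call. A minimal before/after sketch, pieced together from the hunks that follow (the internals of ICURuleLoader are not part of this commit's test changes and are assumed here):

    # before: the loader consumed pre-parsed rules and needed a wrapper object
    rules = ICUNameProcessorRules(loader=ICURuleLoader(rule_dict))
    proc = ICUNameProcessor(rules)

    # after: the loader reads icu_tokenizer.yaml from the project directory
    # and hands out the token analyzer directly
    proc = ICURuleLoader(config).make_token_analysis()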

View File

@@ -2,6 +2,7 @@
 Tokenizer for testing.
 """
 from nominatim.indexer.place_info import PlaceInfo
+from nominatim.config import Configuration
 
 def create(dsn, data_dir):
     """ Create a new instance of the tokenizer provided by this module.
@@ -22,7 +23,8 @@ class DummyTokenizer:
         self.init_state = "new"
 
-    def init_from_project(self):
+    def init_from_project(self, config):
+        assert isinstance(config, Configuration)
         assert self.init_state is None
         self.init_state = "loaded"
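For reference, the updated test double in one piece, a sketch assembled from the two hunks above; it encodes the new contract that init_from_project() receives the project Configuration:

    from nominatim.config import Configuration

    class DummyTokenizer:
        def init_from_project(self, config):
            # callers must now hand over the live project configuration
            assert isinstance(config, Configuration)
            assert self.init_state is None
            self.init_state = "loaded"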

View File

@@ -7,7 +7,6 @@ import yaml
 import pytest
 
 from nominatim.tokenizer import icu_tokenizer
-from nominatim.tokenizer.icu_name_processor import ICUNameProcessorRules
 from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
 from nominatim.db import properties
 from nominatim.db.sql_preprocessor import SQLPreprocessor
@@ -72,7 +71,8 @@ def analyzer(tokenizer_factory, test_config, monkeypatch,
     cfgstr = {'normalization' : list(norm),
               'transliteration' : list(trans),
               'variants' : [ {'words': list(variants)}]}
-    tok.naming_rules = ICUNameProcessorRules(loader=ICURuleLoader(cfgstr))
+    (test_config.project_dir / 'icu_tokenizer.yaml').write_text(yaml.dump(cfgstr))
+    tok.loader = ICURuleLoader(test_config)
 
     return tok.name_analyzer()
@@ -178,9 +178,9 @@ def test_init_from_project(monkeypatch, test_config, tokenizer_factory):
     monkeypatch.undo()
 
     tok = tokenizer_factory()
-    tok.init_from_project()
+    tok.init_from_project(test_config)
 
-    assert tok.naming_rules is not None
+    assert tok.loader is not None
     assert tok.term_normalization == ':: lower();'
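The fixture pattern above recurs throughout this commit: instead of passing a rule dict into the loader, the test serialises the rules to icu_tokenizer.yaml inside the project directory and lets the loader find the file via the configuration. As a sketch (cfgstr and test_config as in the hunk above):

    import yaml

    (test_config.project_dir / 'icu_tokenizer.yaml').write_text(yaml.dump(cfgstr))
    tok.loader = ICURuleLoader(test_config)  # picks up the file just written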

View File

@@ -4,15 +4,17 @@ Tests for import name normalisation and variant generation.
 from textwrap import dedent
 
 import pytest
 import yaml
 
 from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
-from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules
 from nominatim.errors import UsageError
 
 
 @pytest.fixture
-def cfgfile():
+def cfgfile(def_config, tmp_path):
+    project_dir = tmp_path / 'project_dir'
+    project_dir.mkdir()
+    def_config.project_dir = project_dir
+
     def _create_config(*variants, **kwargs):
         content = dedent("""\
         normalization:
@@ -30,7 +32,9 @@ def cfgfile():
         content += '\n'.join((" - " + s for s in variants)) + '\n'
         for k, v in kwargs.items():
             content += " {}: {}\n".format(k, v)
-        return yaml.safe_load(content)
+        (project_dir / 'icu_tokenizer.yaml').write_text(content)
+
+        return def_config
 
     return _create_config
@@ -40,10 +44,9 @@ def get_normalized_variants(proc, name):
 
 def test_variants_empty(cfgfile):
-    fpath = cfgfile('saint -> 🜵', 'street -> st')
+    config = cfgfile('saint -> 🜵', 'street -> st')
 
-    rules = ICUNameProcessorRules(loader=ICURuleLoader(fpath))
-    proc = ICUNameProcessor(rules)
+    proc = ICURuleLoader(config).make_token_analysis()
 
     assert get_normalized_variants(proc, '🜵') == []
     assert get_normalized_variants(proc, '🜳') == []
@@ -83,8 +86,8 @@ VARIANT_TESTS = [
 
 @pytest.mark.parametrize("rules,name,variants", VARIANT_TESTS)
 def test_variants(cfgfile, rules, name, variants):
-    fpath = cfgfile(*rules)
-    proc = ICUNameProcessor(ICUNameProcessorRules(loader=ICURuleLoader(fpath)))
+    config = cfgfile(*rules)
+    proc = ICURuleLoader(config).make_token_analysis()
 
     result = get_normalized_variants(proc, name)
@@ -93,10 +96,8 @@ def test_variants(cfgfile, rules, name, variants):
 
 def test_search_normalized(cfgfile):
-    fpath = cfgfile('~street => s,st', 'master => mstr')
-
-    rules = ICUNameProcessorRules(loader=ICURuleLoader(fpath))
-    proc = ICUNameProcessor(rules)
+    config = cfgfile('~street => s,st', 'master => mstr')
+    proc = ICURuleLoader(config).make_token_analysis()
 
     assert proc.get_search_normalized('Master Street') == 'master street'
     assert proc.get_search_normalized('Earnes St') == 'earnes st'
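All three tests now share one construction idiom: make_token_analysis() replaces the old ICUNameProcessor(ICUNameProcessorRules(...)) chain. A sketch of the analyzer surface these tests exercise (get_normalized_variants() is a helper defined earlier in this test module, not shown in the diff):

    proc = ICURuleLoader(config).make_token_analysis()
    proc.get_search_normalized('Master Street')  # -> 'master street'
    get_normalized_variants(proc, 'street')      # -> list of spelling variants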

View File

@@ -12,7 +12,16 @@ from nominatim.errors import UsageError
 from icu import Transliterator
 
 
 @pytest.fixture
-def cfgrules():
+def test_config(def_config, tmp_path):
+    project_dir = tmp_path / 'project_dir'
+    project_dir.mkdir()
+    def_config.project_dir = project_dir
+
+    return def_config
+
+
+@pytest.fixture
+def cfgrules(test_config):
     def _create_config(*variants, **kwargs):
         content = dedent("""\
         normalization:
@@ -29,19 +38,21 @@ def cfgrules():
         content += '\n'.join((" - " + s for s in variants)) + '\n'
         for k, v in kwargs.items():
             content += " {}: {}\n".format(k, v)
-        return yaml.safe_load(content)
+        (test_config.project_dir / 'icu_tokenizer.yaml').write_text(content)
+        return test_config
 
     return _create_config
 
 
-def test_empty_rule_set():
-    rule_cfg = yaml.safe_load(dedent("""\
+def test_empty_rule_set(test_config):
+    (test_config.project_dir / 'icu_tokenizer.yaml').write_text(dedent("""\
         normalization:
         transliteration:
         variants:
         """))
 
-    rules = ICURuleLoader(rule_cfg)
+    rules = ICURuleLoader(test_config)
 
     assert rules.get_search_rules() == ''
     assert rules.get_normalization_rules() == ''
     assert rules.get_transliteration_rules() == ''
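These three accessors are presumably also what the loader now writes to the database properties mentioned in the commit message; with an empty rule file each degrades to an empty string rather than failing. A sketch of the query side:

    loader = ICURuleLoader(test_config)
    loader.get_search_rules()           # '' for an empty rule set
    loader.get_normalization_rules()    # ''
    loader.get_transliteration_rules()  # ''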
@@ -50,11 +61,12 @@ def test_empty_rule_set():
 CONFIG_SECTIONS = ('normalization', 'transliteration', 'variants')
 
 @pytest.mark.parametrize("section", CONFIG_SECTIONS)
-def test_missing_section(section):
+def test_missing_section(section, test_config):
     rule_cfg = { s: {} for s in CONFIG_SECTIONS if s != section}
+    (test_config.project_dir / 'icu_tokenizer.yaml').write_text(yaml.dump(rule_cfg))
 
     with pytest.raises(UsageError):
-        ICURuleLoader(rule_cfg)
+        ICURuleLoader(test_config)
 
 
 def test_get_search_rules(cfgrules):
@@ -88,9 +100,8 @@ def test_get_transliteration_rules(cfgrules):
     assert trans.transliterate(" проспект-Prospekt ") == " prospekt Prospekt "
 
 
-def test_transliteration_rules_from_file(def_config, tmp_path):
-    def_config.project_dir = tmp_path
-    cfgpath = tmp_path / ('test_config.yaml')
+def test_transliteration_rules_from_file(test_config):
+    cfgpath = test_config.project_dir / ('icu_tokenizer.yaml')
     cfgpath.write_text(dedent("""\
         normalization:
         transliteration:
@@ -98,10 +109,10 @@ def test_transliteration_rules_from_file(def_config, tmp_path):
         - !include transliteration.yaml
         variants:
         """))
-    transpath = tmp_path / ('transliteration.yaml')
+    transpath = test_config.project_dir / ('transliteration.yaml')
     transpath.write_text('- "x > y"')
 
-    loader = ICURuleLoader(def_config.load_sub_configuration('test_config.yaml'))
+    loader = ICURuleLoader(test_config)
 
     rules = loader.get_transliteration_rules()
     trans = Transliterator.createFromRules("test", rules)
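Note the path change in this test: since the loader now reads icu_tokenizer.yaml relative to the project directory, the !include target has to live there as well. A sketch of the layout the test builds, assuming the tmp-path-backed project directory from the test_config fixture:

    # test_config.project_dir/icu_tokenizer.yaml   -> '- !include transliteration.yaml'
    # test_config.project_dir/transliteration.yaml -> '- "x > y"'
    loader = ICURuleLoader(test_config)
    rules = loader.get_transliteration_rules()  # contains the included 'x > y' rule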

View File

@@ -132,10 +132,10 @@ def test_init_module_custom(tokenizer_factory, test_config,
     assert not (test_config.project_dir / 'module').exists()
 
 
-def test_init_from_project(tokenizer_setup, tokenizer_factory):
+def test_init_from_project(tokenizer_setup, tokenizer_factory, test_config):
     tok = tokenizer_factory()
-    tok.init_from_project()
+    tok.init_from_project(test_config)
 
     assert tok.normalization is not None

View File

@@ -53,7 +53,7 @@ def test_check_tokenizer(temp_db_conn, def_config, monkeypatch,
                          check_result, state):
     class _TestTokenizer:
         @staticmethod
-        def check_database():
+        def check_database(_):
             return check_result
 
     monkeypatch.setattr(chkdb.tokenizer_factory, 'get_tokenizer_for_db',
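The stub mirrors a matching interface change: check_database() now takes an argument, presumably the configuration, in line with init_from_project() above. The test double simply ignores it:

    class _TestTokenizer:
        @staticmethod
        def check_database(_):
            # argument unused by the stub; real tokenizers receive it
            return check_result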