Mirror of https://github.com/osm-search/Nominatim.git
unify ICUNameProcessorRules and ICURuleLoader

There is no need for the additional layer of indirection that the ICUNameProcessorRules class adds. The ICURuleLoader can fill the database properties directly.
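In practice the change collapses a three-object construction into a single call. A minimal sketch of the before/after shape, assuming a Nominatim source tree on PYTHONPATH and a Configuration whose project_dir contains an icu_tokenizer.yaml (make_analyzer is an illustrative helper, not Nominatim API):

    # Sketch only: make_analyzer is illustrative, the imports are real.
    from nominatim.config import Configuration
    from nominatim.tokenizer.icu_rule_loader import ICURuleLoader

    def make_analyzer(config: Configuration):
        # Before this commit the tests went through an extra layer:
        #   rules = ICUNameProcessorRules(loader=ICURuleLoader(cfg))
        #   proc = ICUNameProcessor(rules)
        # Now the loader reads <project_dir>/icu_tokenizer.yaml itself
        # and hands out the analyzer directly:
        return ICURuleLoader(config).make_token_analysis()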
@@ -2,6 +2,7 @@
 Tokenizer for testing.
 """
 from nominatim.indexer.place_info import PlaceInfo
+from nominatim.config import Configuration

 def create(dsn, data_dir):
     """ Create a new instance of the tokenizer provided by this module.
@@ -22,7 +23,8 @@ class DummyTokenizer:
         self.init_state = "new"


-    def init_from_project(self):
+    def init_from_project(self, config):
+        assert isinstance(config, Configuration)
         assert self.init_state is None
         self.init_state = "loaded"

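The dummy tokenizer documents the widened tokenizer interface: init_from_project() now receives the project Configuration, so implementations can read their rules from the project directory instead of carrying pre-baked state. A stub honouring the new contract might look like this (StubTokenizer is illustrative; only the signature mirrors the diff):

    # Illustrative stub of the new tokenizer contract.
    from nominatim.config import Configuration

    class StubTokenizer:
        def __init__(self):
            self.init_state = None

        def init_from_project(self, config):
            # The caller now hands over the live project configuration.
            assert isinstance(config, Configuration)
            self.init_state = "loaded"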
@@ -7,7 +7,6 @@ import yaml
 import pytest

 from nominatim.tokenizer import icu_tokenizer
-from nominatim.tokenizer.icu_name_processor import ICUNameProcessorRules
 from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
 from nominatim.db import properties
 from nominatim.db.sql_preprocessor import SQLPreprocessor
@@ -72,7 +71,8 @@ def analyzer(tokenizer_factory, test_config, monkeypatch,
     cfgstr = {'normalization' : list(norm),
               'transliteration' : list(trans),
               'variants' : [ {'words': list(variants)}]}
-    tok.naming_rules = ICUNameProcessorRules(loader=ICURuleLoader(cfgstr))
+    (test_config.project_dir / 'icu_tokenizer.yaml').write_text(yaml.dump(cfgstr))
+    tok.loader = ICURuleLoader(test_config)

     return tok.name_analyzer()
@@ -178,9 +178,9 @@ def test_init_from_project(monkeypatch, test_config, tokenizer_factory):
     monkeypatch.undo()

     tok = tokenizer_factory()
-    tok.init_from_project()
+    tok.init_from_project(test_config)

-    assert tok.naming_rules is not None
+    assert tok.loader is not None
     assert tok.term_normalization == ':: lower();'

@@ -4,15 +4,17 @@ Tests for import name normalisation and variant generation.
 from textwrap import dedent

 import pytest
 import yaml

 from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
-from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules
-
 from nominatim.errors import UsageError

 @pytest.fixture
-def cfgfile():
+def cfgfile(def_config, tmp_path):
+    project_dir = tmp_path / 'project_dir'
+    project_dir.mkdir()
+    def_config.project_dir = project_dir
+
     def _create_config(*variants, **kwargs):
         content = dedent("""\
             normalization:
@@ -30,7 +32,9 @@ def cfgfile():
         content += '\n'.join((" - " + s for s in variants)) + '\n'
         for k, v in kwargs:
             content += " {}: {}\n".format(k, v)
-        return yaml.safe_load(content)
+        (project_dir / 'icu_tokenizer.yaml').write_text(content)
+
+        return def_config

     return _create_config
@@ -40,10 +44,9 @@ def get_normalized_variants(proc, name):


 def test_variants_empty(cfgfile):
-    fpath = cfgfile('saint -> 🜵', 'street -> st')
+    config = cfgfile('saint -> 🜵', 'street -> st')

-    rules = ICUNameProcessorRules(loader=ICURuleLoader(fpath))
-    proc = ICUNameProcessor(rules)
+    proc = ICURuleLoader(config).make_token_analysis()

     assert get_normalized_variants(proc, '🜵') == []
     assert get_normalized_variants(proc, '🜳') == []
@@ -83,8 +86,8 @@ VARIANT_TESTS = [

 @pytest.mark.parametrize("rules,name,variants", VARIANT_TESTS)
 def test_variants(cfgfile, rules, name, variants):
-    fpath = cfgfile(*rules)
-    proc = ICUNameProcessor(ICUNameProcessorRules(loader=ICURuleLoader(fpath)))
+    config = cfgfile(*rules)
+    proc = ICURuleLoader(config).make_token_analysis()

     result = get_normalized_variants(proc, name)
@@ -93,10 +96,8 @@ def test_variants(cfgfile, rules, name, variants):


 def test_search_normalized(cfgfile):
-    fpath = cfgfile('~street => s,st', 'master => mstr')
-
-    rules = ICUNameProcessorRules(loader=ICURuleLoader(fpath))
-    proc = ICUNameProcessor(rules)
+    config = cfgfile('~street => s,st', 'master => mstr')
+    proc = ICURuleLoader(config).make_token_analysis()

     assert proc.get_search_normalized('Master Street') == 'master street'
     assert proc.get_search_normalized('Earnes St') == 'earnes st'

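The fixture rework above follows one pattern throughout: instead of handing the loader a parsed YAML dict, each fixture writes icu_tokenizer.yaml into a throwaway project directory and returns the Configuration, so the tests exercise the same file-loading path as a real import. Roughly (project_config is an illustrative name; def_config is an existing fixture of Nominatim's test suite):

    # Sketch of the fixture pattern used in these tests.
    import pytest

    @pytest.fixture
    def project_config(def_config, tmp_path):
        # Point the configuration at a fresh project directory; test code
        # then drops an icu_tokenizer.yaml there for ICURuleLoader to find.
        project_dir = tmp_path / 'project_dir'
        project_dir.mkdir()
        def_config.project_dir = project_dir
        return def_config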
@@ -12,7 +12,16 @@ from nominatim.errors import UsageError
 from icu import Transliterator

 @pytest.fixture
-def cfgrules():
+def test_config(def_config, tmp_path):
+    project_dir = tmp_path / 'project_dir'
+    project_dir.mkdir()
+    def_config.project_dir = project_dir
+
+    return def_config
+
+
+@pytest.fixture
+def cfgrules(test_config):
     def _create_config(*variants, **kwargs):
         content = dedent("""\
             normalization:
@@ -29,19 +38,21 @@ def cfgrules():
         content += '\n'.join((" - " + s for s in variants)) + '\n'
         for k, v in kwargs:
             content += " {}: {}\n".format(k, v)
-        return yaml.safe_load(content)
+        (test_config.project_dir / 'icu_tokenizer.yaml').write_text(content)
+
+        return test_config

     return _create_config


-def test_empty_rule_set():
-    rule_cfg = yaml.safe_load(dedent("""\
+def test_empty_rule_set(test_config):
+    (test_config.project_dir / 'icu_tokenizer.yaml').write_text(dedent("""\
         normalization:
         transliteration:
         variants:
         """))

-    rules = ICURuleLoader(rule_cfg)
+    rules = ICURuleLoader(test_config)
     assert rules.get_search_rules() == ''
     assert rules.get_normalization_rules() == ''
     assert rules.get_transliteration_rules() == ''
@@ -50,11 +61,12 @@ def test_empty_rule_set():
 CONFIG_SECTIONS = ('normalization', 'transliteration', 'variants')

 @pytest.mark.parametrize("section", CONFIG_SECTIONS)
-def test_missing_section(section):
+def test_missing_section(section, test_config):
     rule_cfg = { s: {} for s in CONFIG_SECTIONS if s != section}
+    (test_config.project_dir / 'icu_tokenizer.yaml').write_text(yaml.dump(rule_cfg))

     with pytest.raises(UsageError):
-        ICURuleLoader(rule_cfg)
+        ICURuleLoader(test_config)


 def test_get_search_rules(cfgrules):
@@ -88,9 +100,8 @@ def test_get_transliteration_rules(cfgrules):
     assert trans.transliterate(" проспект-Prospekt ") == " prospekt Prospekt "


-def test_transliteration_rules_from_file(def_config, tmp_path):
-    def_config.project_dir = tmp_path
-    cfgpath = tmp_path / ('test_config.yaml')
+def test_transliteration_rules_from_file(test_config):
+    cfgpath = test_config.project_dir / ('icu_tokenizer.yaml')
     cfgpath.write_text(dedent("""\
         normalization:
         transliteration:
@@ -98,10 +109,10 @@ def test_transliteration_rules_from_file(def_config, tmp_path):
             - !include transliteration.yaml
         variants:
         """))
-    transpath = tmp_path / ('transliteration.yaml')
+    transpath = test_config.project_dir / ('transliteration.yaml')
     transpath.write_text('- "x > y"')

-    loader = ICURuleLoader(def_config.load_sub_configuration('test_config.yaml'))
+    loader = ICURuleLoader(test_config)
     rules = loader.get_transliteration_rules()
     trans = Transliterator.createFromRules("test", rules)

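Note why test_transliteration_rules_from_file now writes both files into the project directory: the rule file is loaded from there, and a relative !include path is then found next to it. A condensed sketch of the same round trip, assuming a Configuration set up as in the fixtures above (write_and_load is an illustrative helper):

    # Sketch: exercising the !include mechanism from a project directory.
    from textwrap import dedent
    from nominatim.tokenizer.icu_rule_loader import ICURuleLoader

    def write_and_load(config):
        (config.project_dir / 'icu_tokenizer.yaml').write_text(dedent("""\
            normalization:
            transliteration:
                - !include transliteration.yaml
            variants:
            """))
        # The included file sits next to the rule file in the project dir.
        (config.project_dir / 'transliteration.yaml').write_text('- "x > y"')
        return ICURuleLoader(config).get_transliteration_rules()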
@@ -132,10 +132,10 @@ def test_init_module_custom(tokenizer_factory, test_config,
     assert not (test_config.project_dir / 'module').exists()


-def test_init_from_project(tokenizer_setup, tokenizer_factory):
+def test_init_from_project(tokenizer_setup, tokenizer_factory, test_config):
     tok = tokenizer_factory()

-    tok.init_from_project()
+    tok.init_from_project(test_config)

     assert tok.normalization is not None

@@ -53,7 +53,7 @@ def test_check_tokenizer(temp_db_conn, def_config, monkeypatch,
                          check_result, state):
     class _TestTokenizer:
         @staticmethod
-        def check_database():
+        def check_database(_):
             return check_result

     monkeypatch.setattr(chkdb.tokenizer_factory, 'get_tokenizer_for_db',
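The final hunk is the knock-on change at the consumer end: the database check now calls check_database() with an argument, so even a static test double must accept one (discarded as _ above). A variant of the double that records it instead of ignoring it, under the assumption that the argument is the project Configuration:

    # Illustrative variant of the test double above; assumes the argument
    # passed to check_database() is the project Configuration.
    class _RecordingTokenizer:
        last_config = None

        @classmethod
        def check_database(cls, config):
            cls.last_config = config  # keep it around for assertions
            return None               # stand-in result, like check_result above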