unify ICUNameProcessorRules and ICURuleLoader

There is no need for the additional layer of indirection that
the ICUNameProcessorRules class adds. The ICURuleLoader can
fill the database properties directly.
This commit is contained in:
Sarah Hoffmann
2021-09-29 17:37:04 +02:00
parent 5e5addcdbf
commit 16daa57e47
14 changed files with 123 additions and 137 deletions

View File

@@ -14,7 +14,6 @@ from nominatim.db.properties import set_property, get_property
from nominatim.db.utils import CopyBuffer
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules
from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"
@@ -36,7 +35,7 @@ class LegacyICUTokenizer(AbstractTokenizer):
def __init__(self, dsn, data_dir):
self.dsn = dsn
self.data_dir = data_dir
self.naming_rules = None
self.loader = None
self.term_normalization = None
@@ -46,9 +45,8 @@ class LegacyICUTokenizer(AbstractTokenizer):
This copies all necessary data in the project directory to make
sure the tokenizer remains stable even over updates.
"""
loader = ICURuleLoader(config.load_sub_configuration('icu_tokenizer.yaml',
config='TOKENIZER_CONFIG'))
self.naming_rules = ICUNameProcessorRules(loader=loader)
self.loader = ICURuleLoader(config)
self.term_normalization = config.TERM_NORMALIZATION
self._install_php(config.lib_dir.php)
@@ -59,11 +57,13 @@ class LegacyICUTokenizer(AbstractTokenizer):
self._init_db_tables(config)
def init_from_project(self):
def init_from_project(self, config):
""" Initialise the tokenizer from the project directory.
"""
self.loader = ICURuleLoader(config)
with connect(self.dsn) as conn:
self.naming_rules = ICUNameProcessorRules(conn=conn)
self.loader.load_config_from_db(conn)
self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION)
@@ -81,12 +81,12 @@ class LegacyICUTokenizer(AbstractTokenizer):
sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')
def check_database(self):
def check_database(self, config):
""" Check that the tokenizer is set up correctly.
"""
self.init_from_project()
self.init_from_project(config)
if self.naming_rules is None:
if self.term_normalization is None:
return "Configuration for tokenizer 'icu' are missing."
return None
@@ -107,7 +107,7 @@ class LegacyICUTokenizer(AbstractTokenizer):
Analyzers are not thread-safe. You need to instantiate one per thread.
"""
return LegacyICUNameAnalyzer(self.dsn, ICUNameProcessor(self.naming_rules))
return LegacyICUNameAnalyzer(self.dsn, self.loader.make_token_analysis())
def _install_php(self, phpdir):
@@ -118,7 +118,7 @@ class LegacyICUTokenizer(AbstractTokenizer):
<?php
@define('CONST_Max_Word_Frequency', 10000000);
@define('CONST_Term_Normalization_Rules', "{self.term_normalization}");
@define('CONST_Transliteration', "{self.naming_rules.search_rules}");
@define('CONST_Transliteration', "{self.loader.get_search_rules()}");
require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))
@@ -127,8 +127,7 @@ class LegacyICUTokenizer(AbstractTokenizer):
database as database properties.
"""
with connect(self.dsn) as conn:
self.naming_rules.save_rules(conn)
self.loader.save_config_to_db(conn)
set_property(conn, DBCFG_TERM_NORMALIZATION, self.term_normalization)
@@ -163,7 +162,7 @@ class LegacyICUTokenizer(AbstractTokenizer):
""" Count the partial terms from the names in the place table.
"""
words = Counter()
name_proc = ICUNameProcessor(self.naming_rules)
name_proc = self.loader.make_token_analysis()
with conn.cursor(name="words") as cur:
cur.execute(""" SELECT v, count(*) FROM