move word table and normalisation SQL into tokenizer

Creating and populating the word table is now the responsibility
of the tokenizer.

The get_maxwordfreq() function has been replaced with a
simple template parameter to the SQL during function installation.
The number is read from the property table in the database to
ensure that it does not change after installation.
This commit is contained in:
Sarah Hoffmann
2021-04-22 22:47:34 +02:00
parent b5540dc35c
commit fbbdd31399
15 changed files with 117 additions and 53 deletions

View File

@@ -46,6 +46,7 @@ class UpdateRefresh:
@staticmethod
def run(args):
from ..tools import refresh
from ..tokenizer import factory as tokenizer_factory
if args.postcodes:
LOG.warning("Update postcodes centroid")
@@ -66,6 +67,8 @@ class UpdateRefresh:
with connect(args.config.get_libpq_dsn()) as conn:
refresh.create_functions(conn, args.config,
args.diffs, args.enable_debug_statements)
tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)
tokenizer.update_sql_functions(args.config)
if args.wiki_data:
data_path = Path(args.config.WIKIPEDIA_DATA_PATH

View File

@@ -100,15 +100,19 @@ class SetupAll:
if args.continue_at is None or args.continue_at == 'load-data':
LOG.warning('Initialise tables')
with connect(args.config.get_libpq_dsn()) as conn:
database_import.truncate_data_tables(conn, args.config.MAX_WORD_FREQUENCY)
database_import.truncate_data_tables(conn)
LOG.warning('Load data into placex table')
database_import.load_data(args.config.get_libpq_dsn(),
args.data_dir,
args.threads or psutil.cpu_count() or 1)
LOG.warning("Setting up tokenizer")
tokenizer = tokenizer_factory.create_tokenizer(args.config)
if args.continue_at is None or args.continue_at == 'load-data':
# (re)initialise the tokenizer data
tokenizer = tokenizer_factory.create_tokenizer(args.config)
else:
# just load the tokenizer
tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)
if args.continue_at is None or args.continue_at == 'load-data':
LOG.warning('Calculate postcodes')

View File

@@ -89,8 +89,6 @@ class SQLPreprocessor:
self.env.globals['db'] = db_info
self.env.globals['sql'] = _setup_postgres_sql(conn)
self.env.globals['postgres'] = _setup_postgresql_features(conn)
self.env.globals['modulepath'] = config.DATABASE_MODULE_PATH or \
str((config.project_dir / 'module').resolve())
def run_sql_file(self, conn, name, **kwargs):

View File

@@ -8,9 +8,12 @@ import psycopg2
from nominatim.db.connection import connect
from nominatim.db import properties
from nominatim.db import utils as db_utils
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.errors import UsageError
DBCFG_NORMALIZATION = "tokenizer_normalization"
DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
LOG = logging.getLogger()
@@ -53,6 +56,9 @@ def _install_module(config_module_path, src_dir, module_dir):
def _check_module(module_dir, conn):
""" Try to use the PostgreSQL module to confirm that it is correctly
installed and accessible from PostgreSQL.
"""
with conn.cursor() as cur:
try:
cur.execute("""CREATE FUNCTION nominatim_test_import_func(text)
@@ -91,7 +97,11 @@ class LegacyTokenizer:
with connect(self.dsn) as conn:
_check_module(module_dir, conn)
self._save_config(conn)
self._save_config(conn, config)
conn.commit()
self.update_sql_functions(config)
self._init_db_tables(config)
def init_from_project(self):
@@ -101,6 +111,19 @@ class LegacyTokenizer:
self.normalization = properties.get_property(conn, DBCFG_NORMALIZATION)
def update_sql_functions(self, config):
    """ Reimport the SQL functions for this tokenizer.

        The maximum word frequency is read back from the database
        properties rather than from the configuration, so that the
        value chosen at installation time is preserved.

        Parameters:
            config: project configuration; supplies DATABASE_MODULE_PATH
                    and project_dir for resolving the module location.
    """
    with connect(self.dsn) as conn:
        max_word_freq = properties.get_property(conn, DBCFG_MAXWORDFREQ)
        # Prefer an explicitly configured module path; otherwise fall
        # back to the 'module' directory inside the project directory.
        modulepath = config.DATABASE_MODULE_PATH or \
            str((config.project_dir / 'module').resolve())
        sqlp = SQLPreprocessor(conn, config)
        # max_word_freq and modulepath are template parameters
        # substituted into the SQL file before execution.
        sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer.sql',
                          max_word_freq=max_word_freq,
                          modulepath=modulepath)
def migrate_database(self, config):
""" Initialise the project directory of an existing database for
use with this tokenizer.
@@ -114,11 +137,25 @@ class LegacyTokenizer:
with connect(self.dsn) as conn:
_check_module(module_dir, conn)
self._save_config(conn)
self._save_config(conn, config)
def _save_config(self, conn):
def _init_db_tables(self, config):
    """ Set up the word table and fill it with pre-computed word
        frequencies.

        Parameters:
            config: project configuration; lib_dir.data must point at
                    the directory containing the shipped 'words.sql'.
    """
    with connect(self.dsn) as conn:
        sqlp = SQLPreprocessor(conn, config)
        sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_tables.sql')
        # Commit so the freshly created tables are visible to the
        # separate connection opened by execute_file() below.
        conn.commit()

    LOG.warning("Precomputing word tokens")
    # Bulk-load the pre-computed word frequency data shipped with the
    # source tree.
    db_utils.execute_file(self.dsn, config.lib_dir.data / 'words.sql')
def _save_config(self, conn, config):
    """ Save the configuration that needs to remain stable for the given
        database as database properties.

        Storing MAX_WORD_FREQUENCY in the database ensures the SQL
        functions keep using the value from installation time even if
        the project configuration changes later.

        Parameters:
            conn:   open database connection (caller is responsible
                    for committing).
            config: project configuration providing MAX_WORD_FREQUENCY.
    """
    properties.set_property(conn, DBCFG_NORMALIZATION, self.normalization)
    properties.set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)

View File

@@ -160,11 +160,10 @@ def create_partition_tables(conn, config):
sql.run_sql_file(conn, 'partition-tables.src.sql')
def truncate_data_tables(conn, max_word_frequency=None):
def truncate_data_tables(conn):
""" Truncate all data tables to prepare for a fresh load.
"""
with conn.cursor() as cur:
cur.execute('TRUNCATE word')
cur.execute('TRUNCATE placex')
cur.execute('TRUNCATE place_addressline')
cur.execute('TRUNCATE location_area')
@@ -183,23 +182,13 @@ def truncate_data_tables(conn, max_word_frequency=None):
for table in [r[0] for r in list(cur)]:
cur.execute('TRUNCATE ' + table)
if max_word_frequency is not None:
# Used by getorcreate_word_id to ignore frequent partial words.
cur.execute("""CREATE OR REPLACE FUNCTION get_maxwordfreq()
RETURNS integer AS $$
SELECT {} as maxwordfreq;
$$ LANGUAGE SQL IMMUTABLE
""".format(max_word_frequency))
conn.commit()
conn.commit()
_COPY_COLUMNS = 'osm_type, osm_id, class, type, name, admin_level, address, extratags, geometry'
def load_data(dsn, data_dir, threads):
def load_data(dsn, threads):
""" Copy data into the word and placex table.
"""
# Pre-calculate the most important terms in the word list.
db_utils.execute_file(dsn, data_dir / 'words.sql')
sel = selectors.DefaultSelector()
# Then copy data from place to placex in <threads - 1> chunks.
place_threads = max(1, threads - 1)

View File

@@ -49,6 +49,8 @@ def migrate(config, paths):
if has_run_migration:
LOG.warning('Updating SQL functions.')
refresh.create_functions(conn, config)
tokenizer = tokenizer_factory.get_tokenizer_for_db(config)
tokenizer.update_sql_functions(config)
properties.set_property(conn, 'database_version',
'{0[0]}.{0[1]}.{0[2]}-{0[3]}'.format(NOMINATIM_VERSION))