move word table and normalisation SQL into tokenizer

Creating and populating the word table is now the responsibility
of the tokenizer.

The get_maxwordfreq() SQL function has been replaced with a simple
template parameter that is substituted into the SQL when the functions
are installed. The value is taken from the parameter list stored in the
database to ensure that it does not change after installation.
This commit is contained in:
Sarah Hoffmann
2021-04-22 22:47:34 +02:00
parent b5540dc35c
commit fbbdd31399
15 changed files with 117 additions and 53 deletions

View File

@@ -160,11 +160,10 @@ def create_partition_tables(conn, config):
sql.run_sql_file(conn, 'partition-tables.src.sql')
def truncate_data_tables(conn, max_word_frequency=None):
def truncate_data_tables(conn):
""" Truncate all data tables to prepare for a fresh load.
"""
with conn.cursor() as cur:
cur.execute('TRUNCATE word')
cur.execute('TRUNCATE placex')
cur.execute('TRUNCATE place_addressline')
cur.execute('TRUNCATE location_area')
@@ -183,23 +182,13 @@ def truncate_data_tables(conn, max_word_frequency=None):
for table in [r[0] for r in list(cur)]:
cur.execute('TRUNCATE ' + table)
if max_word_frequency is not None:
# Used by getorcreate_word_id to ignore frequent partial words.
cur.execute("""CREATE OR REPLACE FUNCTION get_maxwordfreq()
RETURNS integer AS $$
SELECT {} as maxwordfreq;
$$ LANGUAGE SQL IMMUTABLE
""".format(max_word_frequency))
conn.commit()
conn.commit()
_COPY_COLUMNS = 'osm_type, osm_id, class, type, name, admin_level, address, extratags, geometry'
def load_data(dsn, data_dir, threads):
def load_data(dsn, threads):
""" Copy data into the word and placex table.
"""
# Pre-calculate the most important terms in the word list.
db_utils.execute_file(dsn, data_dir / 'words.sql')
sel = selectors.DefaultSelector()
# Then copy data from place to placex in <threads - 1> chunks.
place_threads = max(1, threads - 1)