move word table and normalisation SQL into tokenizer

Creating and populating the word table is now the responsibility
of the tokenizer.

The get_maxwordfreq() function has been replaced with a
simple template parameter to the SQL during function installation.
The number is taken from the parameter list in the database to
ensure that it is not changed after installation.
This commit is contained in:
Sarah Hoffmann
2021-04-22 22:47:34 +02:00
parent b5540dc35c
commit fbbdd31399
15 changed files with 117 additions and 53 deletions

View File

@@ -100,15 +100,19 @@ class SetupAll:
if args.continue_at is None or args.continue_at == 'load-data':
LOG.warning('Initialise tables')
with connect(args.config.get_libpq_dsn()) as conn:
database_import.truncate_data_tables(conn, args.config.MAX_WORD_FREQUENCY)
database_import.truncate_data_tables(conn)
LOG.warning('Load data into placex table')
database_import.load_data(args.config.get_libpq_dsn(),
args.data_dir,
args.threads or psutil.cpu_count() or 1)
LOG.warning("Setting up tokenizer")
tokenizer = tokenizer_factory.create_tokenizer(args.config)
if args.continue_at is None or args.continue_at == 'load-data':
# (re)initialise the tokenizer data
tokenizer = tokenizer_factory.create_tokenizer(args.config)
else:
# just load the tokenizer
tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)
if args.continue_at is None or args.continue_at == 'load-data':
LOG.warning('Calculate postcodes')