move word table and normalisation SQL into tokenizer

Creating and populating the word table is now the responsibility of the tokenizer. The get_maxwordfreq() function has been replaced with a simple template parameter to the SQL during function installation. The number is taken from the parameter list in the database to ensure that it is not changed after installation.
2026-03-08 02:54:08 +00:00 · 2021-04-22 22:47:34 +02:00
parent b5540dc35c
commit fbbdd31399
15 changed files with 117 additions and 53 deletions
--- a/nominatim/clicmd/setup.py
+++ b/nominatim/clicmd/setup.py
@@ -100,15 +100,19 @@ class SetupAll:
        if args.continue_at is None or args.continue_at == 'load-data':
            LOG.warning('Initialise tables')
            with connect(args.config.get_libpq_dsn()) as conn:
-                database_import.truncate_data_tables(conn, args.config.MAX_WORD_FREQUENCY)
+                database_import.truncate_data_tables(conn)

            LOG.warning('Load data into placex table')
            database_import.load_data(args.config.get_libpq_dsn(),
-                                      args.data_dir,
                                      args.threads or psutil.cpu_count() or 1)

        LOG.warning("Setting up tokenizer")
-        tokenizer = tokenizer_factory.create_tokenizer(args.config)
+        if args.continue_at is None or args.continue_at == 'load-data':
+            # (re)initialise the tokenizer data
+            tokenizer = tokenizer_factory.create_tokenizer(args.config)
+        else:
+            # just load the tokenizer
+            tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)

        if args.continue_at is None or args.continue_at == 'load-data':
            LOG.warning('Calculate postcodes')