new word table layout for icu tokenizer

The table now directly reflects the different token types. Extra information is saved in a JSON structure that can be extended dynamically in the future without affecting the table layout.
2026-02-26 11:08:13 +00:00 · 2021-07-20 10:27:06 +02:00
parent 34dcf02dee
commit 8377528952
2 changed files with 16 additions and 1 deletions
--- a/lib-sql/tokenizer/icu_tokenizer_tables.sql
+++ b/lib-sql/tokenizer/icu_tokenizer_tables.sql
@@ -0,0 +1,15 @@
 -- Word table for the ICU tokenizer. Each row holds one token; "type" names
 -- the kind of token and "info" carries any extra data as a JSON document, so
 -- new attributes can be added later without changing the table layout.
 -- The templated placeholders in this file are filled in by the SQL
 -- preprocessor before the statements are executed.
 DROP TABLE IF EXISTS word;
 CREATE TABLE word (
   word_id INTEGER,
   word_token text NOT NULL,
   type text NOT NULL,
   info jsonb
 ) {{db.tablespace.search_data}};
 -- B-tree index on the token text of the word table.
 CREATE INDEX idx_word_word_token ON word
     USING BTREE (word_token) {{db.tablespace.search_index}};
 -- The configured web user only gets read access to the table.
 GRANT SELECT ON word TO "{{config.DATABASE_WEBUSER}}";
 -- Sequence recreated together with the table; presumably the source of
 -- word_id values (TODO confirm against the tokenizer code).
 DROP SEQUENCE IF EXISTS seq_word;
 CREATE SEQUENCE seq_word START WITH 1;
 GRANT SELECT ON seq_word TO "{{config.DATABASE_WEBUSER}}";
--- a/nominatim/tokenizer/legacy_icu_tokenizer.py
+++ b/nominatim/tokenizer/legacy_icu_tokenizer.py
@@ -152,7 +152,7 @@ class LegacyICUTokenizer:
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
-            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_tables.sql')
+            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
            conn.commit()
            LOG.warning("Precomputing word tokens")