New word table layout for the ICU tokenizer

The table now directly reflects the different token types.
Extra information is saved in a JSON structure that may be
dynamically extended in the future without affecting the
table layout.
This commit is contained in:
Sarah Hoffmann
2021-07-20 10:27:06 +02:00
parent 34dcf02dee
commit 8377528952
2 changed files with 16 additions and 1 deletion

View File

@@ -152,7 +152,7 @@ class LegacyICUTokenizer:
"""
with connect(self.dsn) as conn:
sqlp = SQLPreprocessor(conn, config)
sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_tables.sql')
sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
conn.commit()
LOG.warning("Precomputing word tokens")