Merge pull request #3328 from lonvia/word-count-into-new-table

Recreate word table when refreshing counts
Sarah Hoffmann
2024-02-05 11:58:11 +01:00
committed by GitHub
10 changed files with 130 additions and 82 deletions
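In brief: word-count refresh no longer patches the info column of the live word table in place. The ICU tokenizer now writes a complete copy with recomputed counts into a new tmp_word table, recreates all indices on that copy, and finally renames it over the old table. Since index creation needs tablespace settings from the project configuration, update_statistics() now takes a Configuration argument throughout. A minimal sketch of the copy-and-swap pattern, assuming a plain psycopg2 connection (the real code below goes through Nominatim's own connection and SQL preprocessor helpers):

    # Minimal sketch of the copy-and-swap pattern adopted by this PR (table and
    # column names from the diff below; the connection parameters are assumed).
    import psycopg2

    conn = psycopg2.connect('dbname=nominatim')
    with conn.cursor() as cur:
        # Write a full copy; the live 'word' table stays queryable meanwhile.
        cur.execute("""CREATE TABLE tmp_word AS
                         SELECT word_id, word_token, type, word, info
                         FROM word""")  # the real query also merges new counts into info
        # ... recreate all indices on tmp_word here ...
        cur.execute("DROP TABLE word")
        cur.execute("ALTER TABLE tmp_word RENAME TO word")
    conn.commit()
    conn.close()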


@@ -1,40 +0,0 @@
--- SPDX-License-Identifier: GPL-2.0-only
---
--- This file is part of Nominatim. (https://nominatim.org)
---
--- Copyright (C) 2022 by the Nominatim developer community.
--- For a full list of authors see the git log.
-
-DROP TABLE IF EXISTS word;
-CREATE TABLE word (
-      word_id INTEGER,
-      word_token text NOT NULL,
-      type text NOT NULL,
-      word text,
-      info jsonb
-    ) {{db.tablespace.search_data}};
-
-CREATE INDEX idx_word_word_token ON word
-    USING BTREE (word_token) {{db.tablespace.search_index}};
--- Used when updating country names from the boundary relation.
-CREATE INDEX idx_word_country_names ON word
-    USING btree(word) {{db.tablespace.address_index}}
-    WHERE type = 'C';
--- Used when inserting new postcodes on updates.
-CREATE INDEX idx_word_postcodes ON word
-    USING btree(word) {{db.tablespace.address_index}}
-    WHERE type = 'P';
--- Used when inserting full words.
-CREATE INDEX idx_word_full_word ON word
-    USING btree(word) {{db.tablespace.address_index}}
-    WHERE type = 'W';
--- Used when inserting analyzed housenumbers (exclude old-style entries).
-CREATE INDEX idx_word_housenumbers ON word
-    USING btree(word) {{db.tablespace.address_index}}
-    WHERE type = 'H' and word is not null;
-
-GRANT SELECT ON word TO "{{config.DATABASE_WEBUSER}}";
-
-DROP SEQUENCE IF EXISTS seq_word;
-CREATE SEQUENCE seq_word start 1;
-GRANT SELECT ON seq_word to "{{config.DATABASE_WEBUSER}}";
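The DDL from this deleted file does not go away: it reappears below as Jinja2 template strings in the ICU tokenizer, parameterized over the table name so the same statements can build either word or its temporary replacement. A toy rendering example (plain jinja2, assuming nothing about Nominatim's own preprocessor environment):

    # Toy illustration of the templating that replaces this SQL file.
    import jinja2

    template = jinja2.Environment().from_string(
        'CREATE INDEX idx_{{table_name}}_word_token ON {{table_name}} (word_token)')
    print(template.render(table_name='tmp_word'))
    # CREATE INDEX idx_tmp_word_word_token ON tmp_word (word_token)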


@@ -110,7 +110,7 @@ class UpdateRefresh:
         if args.word_counts:
             LOG.warning('Recompute word statistics')
-            self._get_tokenizer(args.config).update_statistics()
+            self._get_tokenizer(args.config).update_statistics(args.config)

         if args.address_levels:
             LOG.warning('Updating address levels')


@@ -169,7 +169,7 @@ class SetupAll:
         tokenizer.finalize_import(args.config)

         LOG.warning('Recompute word counts')
-        tokenizer.update_statistics()
+        tokenizer.update_statistics(args.config)

         webdir = args.project_dir / 'website'
         LOG.warning('Setup website at %s', webdir)


@@ -90,6 +90,18 @@ class SQLPreprocessor:
         self.env.globals['postgres'] = _setup_postgresql_features(conn)

+    def run_string(self, conn: Connection, template: str, **kwargs: Any) -> None:
+        """ Execute the given SQL template string on the connection.
+            The keyword arguments may supply additional parameters
+            for preprocessing.
+        """
+        sql = self.env.from_string(template).render(**kwargs)
+
+        with conn.cursor() as cur:
+            cur.execute(sql)
+        conn.commit()
+
+
     def run_sql_file(self, conn: Connection, name: str, **kwargs: Any) -> None:
         """ Execute the given SQL file on the connection. The keyword arguments
             may supply additional parameters for preprocessing.
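For context, a hedged sketch of how the new run_string() helper is meant to be called; the dsn and config values are assumed here, and the keyword arguments are passed straight through to Jinja2:

    # Possible use of the new run_string() helper (sketch; dsn is assumed,
    # config is the project Configuration as elsewhere in this PR).
    from nominatim.db.connection import connect
    from nominatim.db.sql_preprocessor import SQLPreprocessor

    with connect('dbname=nominatim') as conn:
        sqlp = SQLPreprocessor(conn, config)
        sqlp.run_string(conn,
                        'GRANT SELECT ON {{table_name}} TO "{{config.DATABASE_WEBUSER}}"',
                        table_name='tmp_word')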


@@ -201,7 +201,7 @@ class AbstractTokenizer(ABC):
     @abstractmethod
-    def update_statistics(self) -> None:
+    def update_statistics(self, config: Configuration) -> None:
         """ Recompute any tokenizer statistics necessary for efficient lookup.
             This function is meant to be called from time to time by the user
             to improve performance. However, the tokenizer must not depend on
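All tokenizer implementations must now accept the project Configuration even when they have no use for it. Illustrative only, a minimal override satisfying the new signature (the other abstract methods of AbstractTokenizer are omitted, so this class is a sketch, not instantiable as written):

    # Illustrative-only conforming override; the legacy tokenizer below
    # ignores the argument by naming it '_'.
    from nominatim.config import Configuration
    from nominatim.tokenizer.base import AbstractTokenizer

    class NoStatsTokenizer(AbstractTokenizer):
        def update_statistics(self, config: Configuration) -> None:
            pass  # nothing to recompute for this hypothetical tokenizer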


@@ -31,6 +31,11 @@ DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"
 LOG = logging.getLogger()


+WORD_TYPES =(('country_names', 'C'),
+             ('postcodes', 'P'),
+             ('full_word', 'W'),
+             ('housenumbers', 'H'))
+
 def create(dsn: str, data_dir: Path) -> 'ICUTokenizer':
     """ Create a new instance of the tokenizer provided by this module.
     """
@@ -62,7 +67,8 @@ class ICUTokenizer(AbstractTokenizer):
         if init_db:
             self.update_sql_functions(config)
-            self._init_db_tables(config)
+            self._setup_db_tables(config, 'word')
+            self._create_base_indices(config, 'word')


     def init_from_project(self, config: Configuration) -> None:
@@ -80,9 +86,7 @@ class ICUTokenizer(AbstractTokenizer):
""" Do any required postprocessing to make the tokenizer data ready """ Do any required postprocessing to make the tokenizer data ready
for use. for use.
""" """
with connect(self.dsn) as conn: self._create_lookup_indices(config, 'word')
sqlp = SQLPreprocessor(conn, config)
sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')
def update_sql_functions(self, config: Configuration) -> None: def update_sql_functions(self, config: Configuration) -> None:
@@ -100,24 +104,35 @@ class ICUTokenizer(AbstractTokenizer):
             self.init_from_project(config)


-    def update_statistics(self) -> None:
+    def update_statistics(self, config: Configuration) -> None:
         """ Recompute frequencies for all name words.
         """
         with connect(self.dsn) as conn:
-            if conn.table_exists('search_name'):
-                with conn.cursor() as cur:
-                    cur.drop_table("word_frequencies")
-                    LOG.info("Computing word frequencies")
-                    cur.execute("""CREATE TEMP TABLE word_frequencies AS
-                                     SELECT unnest(name_vector) as id, count(*)
-                                     FROM search_name GROUP BY id""")
-                    cur.execute("CREATE INDEX ON word_frequencies(id)")
-                    LOG.info("Update word table with recomputed frequencies")
-                    cur.execute("""UPDATE word
-                                   SET info = info || jsonb_build_object('count', count)
-                                   FROM word_frequencies WHERE word_id = id""")
-                    cur.drop_table("word_frequencies")
+            if not conn.table_exists('search_name'):
+                return
+
+            with conn.cursor() as cur:
+                LOG.info('Computing word frequencies')
+                cur.drop_table('word_frequencies')
+                cur.execute("""CREATE TEMP TABLE word_frequencies AS
+                                 SELECT unnest(name_vector) as id, count(*)
+                                 FROM search_name GROUP BY id""")
+                cur.execute('CREATE INDEX ON word_frequencies(id)')
+                LOG.info('Update word table with recomputed frequencies')
+                cur.drop_table('tmp_word')
+                cur.execute("""CREATE TABLE tmp_word AS
+                                SELECT word_id, word_token, type, word,
+                                       (CASE WHEN wf.count is null THEN info
+                                          ELSE info || jsonb_build_object('count', wf.count)
+                                        END) as info
+                                FROM word LEFT JOIN word_frequencies wf
+                                     ON word.word_id = wf.id""")
+                cur.drop_table('word_frequencies')
             conn.commit()
+
+        self._create_base_indices(config, 'tmp_word')
+        self._create_lookup_indices(config, 'tmp_word')
+        self._move_temporary_word_table('tmp_word')


     def _cleanup_housenumbers(self) -> None:
@@ -219,16 +234,81 @@ class ICUTokenizer(AbstractTokenizer):
             self.loader.save_config_to_db(conn)


-    def _init_db_tables(self, config: Configuration) -> None:
+    def _setup_db_tables(self, config: Configuration, table_name: str) -> None:
+        """ Set up the word table and fill it with pre-computed word
+            frequencies.
+        """
+        with connect(self.dsn) as conn:
+            with conn.cursor() as cur:
+                cur.drop_table(table_name)
+            sqlp = SQLPreprocessor(conn, config)
+            sqlp.run_string(conn, """
+                CREATE TABLE {{table_name}} (
+                      word_id INTEGER,
+                      word_token text NOT NULL,
+                      type text NOT NULL,
+                      word text,
+                      info jsonb
+                    ) {{db.tablespace.search_data}};
+                GRANT SELECT ON {{table_name}} TO "{{config.DATABASE_WEBUSER}}";
+
+                DROP SEQUENCE IF EXISTS seq_{{table_name}};
+                CREATE SEQUENCE seq_{{table_name}} start 1;
+                GRANT SELECT ON seq_{{table_name}} to "{{config.DATABASE_WEBUSER}}";
+            """, table_name=table_name)
+
+
+    def _create_base_indices(self, config: Configuration, table_name: str) -> None:
         """ Set up the word table and fill it with pre-computed word
             frequencies.
         """
         with connect(self.dsn) as conn:
             sqlp = SQLPreprocessor(conn, config)
-            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
+            sqlp.run_string(conn,
+                            """CREATE INDEX idx_{{table_name}}_word_token ON {{table_name}}
+                               USING BTREE (word_token) {{db.tablespace.search_index}}""",
+                            table_name=table_name)
+            for name, ctype in WORD_TYPES:
+                sqlp.run_string(conn,
+                                """CREATE INDEX idx_{{table_name}}_{{idx_name}} ON {{table_name}}
+                                   USING BTREE (word) {{db.tablespace.address_index}}
+                                   WHERE type = '{{column_type}}'
+                                """,
+                                table_name=table_name, idx_name=name,
+                                column_type=ctype)
+
+
+    def _create_lookup_indices(self, config: Configuration, table_name: str) -> None:
+        """ Create additional indexes used when running the API.
+        """
+        with connect(self.dsn) as conn:
+            sqlp = SQLPreprocessor(conn, config)
+            # Index required for details lookup.
+            sqlp.run_string(conn, """
+                CREATE INDEX IF NOT EXISTS idx_{{table_name}}_word_id
+                  ON {{table_name}} USING BTREE (word_id) {{db.tablespace.search_index}}
+            """,
+            table_name=table_name)
+
+
+    def _move_temporary_word_table(self, old: str) -> None:
+        """ Rename all tables and indexes used by the tokenizer.
+        """
+        with connect(self.dsn) as conn:
+            with conn.cursor() as cur:
+                cur.drop_table('word')
+                cur.execute(f"ALTER TABLE {old} RENAME TO word")
+                for idx in ('word_token', 'word_id'):
+                    cur.execute(f"""ALTER INDEX idx_{old}_{idx}
+                                      RENAME TO idx_word_{idx}""")
+                for name, _ in WORD_TYPES:
+                    cur.execute(f"""ALTER INDEX idx_{old}_{name}
+                                      RENAME TO idx_word_{name}""")
             conn.commit()


 class ICUNameAnalyzer(AbstractAnalyzer):
     """ The ICU analyzer uses the ICU library for splitting names.


@@ -210,7 +210,7 @@ class LegacyTokenizer(AbstractTokenizer):
             self._save_config(conn, config)


-    def update_statistics(self) -> None:
+    def update_statistics(self, _: Configuration) -> None:
         """ Recompute the frequency of full words.
         """
         with connect(self.dsn) as conn:


@@ -38,10 +38,10 @@ class DummyTokenizer:
     def finalize_import(self, *args):
         self.finalize_import_called = True

-    def update_statistics(self):
+    def update_statistics(self, *args):
         self.update_statistics_called = True

-    def update_word_tokens(self):
+    def update_word_tokens(self, *args):
         self.update_word_tokens_called = True


@@ -7,7 +7,6 @@
""" """
Tests for ICU tokenizer. Tests for ICU tokenizer.
""" """
import shutil
import yaml import yaml
import itertools import itertools
@@ -32,8 +31,6 @@ def test_config(project_env, tmp_path):
     sqldir.mkdir()
     (sqldir / 'tokenizer').mkdir()
     (sqldir / 'tokenizer' / 'icu_tokenizer.sql').write_text("SELECT 'a'")
-    shutil.copy(str(project_env.lib_dir.sql / 'tokenizer' / 'icu_tokenizer_tables.sql'),
-                str(sqldir / 'tokenizer' / 'icu_tokenizer_tables.sql'))

     project_env.lib_dir.sql = sqldir
@@ -204,16 +201,14 @@ def test_update_sql_functions(db_prop, temp_db_cursor,
 def test_finalize_import(tokenizer_factory, temp_db_conn,
                          temp_db_cursor, test_config, sql_preprocessor_cfg):
-    func_file = test_config.lib_dir.sql / 'tokenizer' / 'legacy_tokenizer_indices.sql'
-    func_file.write_text("""CREATE FUNCTION test() RETURNS TEXT
-                            AS $$ SELECT 'b'::text $$ LANGUAGE SQL""")
-
     tok = tokenizer_factory()
     tok.init_new_db(test_config)

+    assert not temp_db_conn.index_exists('idx_word_word_id')
+
     tok.finalize_import(test_config)

-    temp_db_cursor.scalar('SELECT test()') == 'b'
+    assert temp_db_conn.index_exists('idx_word_word_id')


 def test_check_database(test_config, tokenizer_factory,
@@ -224,19 +219,20 @@ def test_check_database(test_config, tokenizer_factory,
     assert tok.check_database(test_config) is None


-def test_update_statistics_reverse_only(word_table, tokenizer_factory):
+def test_update_statistics_reverse_only(word_table, tokenizer_factory, test_config):
     tok = tokenizer_factory()
-    tok.update_statistics()
+    tok.update_statistics(test_config)


-def test_update_statistics(word_table, table_factory, temp_db_cursor, tokenizer_factory):
+def test_update_statistics(word_table, table_factory, temp_db_cursor,
+                           tokenizer_factory, test_config):
     word_table.add_full_word(1000, 'hello')
     table_factory('search_name',
                   'place_id BIGINT, name_vector INT[]',
                   [(12, [1000])])
     tok = tokenizer_factory()

-    tok.update_statistics()
+    tok.update_statistics(test_config)

     assert temp_db_cursor.scalar("""SELECT count(*) FROM word
                                     WHERE type = 'W' and


@@ -238,19 +238,19 @@ def test_check_database_bad_setup(test_config, tokenizer_factory, monkeypatch,
     assert tok.check_database(False) is not None


-def test_update_statistics_reverse_only(word_table, tokenizer_factory):
+def test_update_statistics_reverse_only(word_table, tokenizer_factory, test_config):
     tok = tokenizer_factory()
-    tok.update_statistics()
+    tok.update_statistics(test_config)


-def test_update_statistics(word_table, table_factory, temp_db_cursor, tokenizer_factory):
+def test_update_statistics(word_table, table_factory, temp_db_cursor, tokenizer_factory, test_config):
     word_table.add_full_word(1000, 'hello')
     table_factory('search_name',
                   'place_id BIGINT, name_vector INT[]',
                   [(12, [1000])])
     tok = tokenizer_factory()

-    tok.update_statistics()
+    tok.update_statistics(test_config)

     assert temp_db_cursor.scalar("""SELECT count(*) FROM word
                                     WHERE word_token like ' %' and