mirror of
https://github.com/osm-search/Nominatim.git
synced 2026-03-09 03:24:06 +00:00
Merge pull request #3328 from lonvia/word-count-into-new-table
Recreate word table when refreshing counts
This commit is contained in:
@@ -1,40 +0,0 @@
|
|||||||
-- SPDX-License-Identifier: GPL-2.0-only
|
|
||||||
--
|
|
||||||
-- This file is part of Nominatim. (https://nominatim.org)
|
|
||||||
--
|
|
||||||
-- Copyright (C) 2022 by the Nominatim developer community.
|
|
||||||
-- For a full list of authors see the git log.
|
|
||||||
|
|
||||||
DROP TABLE IF EXISTS word;
|
|
||||||
CREATE TABLE word (
|
|
||||||
word_id INTEGER,
|
|
||||||
word_token text NOT NULL,
|
|
||||||
type text NOT NULL,
|
|
||||||
word text,
|
|
||||||
info jsonb
|
|
||||||
) {{db.tablespace.search_data}};
|
|
||||||
|
|
||||||
CREATE INDEX idx_word_word_token ON word
|
|
||||||
USING BTREE (word_token) {{db.tablespace.search_index}};
|
|
||||||
-- Used when updating country names from the boundary relation.
|
|
||||||
CREATE INDEX idx_word_country_names ON word
|
|
||||||
USING btree(word) {{db.tablespace.address_index}}
|
|
||||||
WHERE type = 'C';
|
|
||||||
-- Used when inserting new postcodes on updates.
|
|
||||||
CREATE INDEX idx_word_postcodes ON word
|
|
||||||
USING btree(word) {{db.tablespace.address_index}}
|
|
||||||
WHERE type = 'P';
|
|
||||||
-- Used when inserting full words.
|
|
||||||
CREATE INDEX idx_word_full_word ON word
|
|
||||||
USING btree(word) {{db.tablespace.address_index}}
|
|
||||||
WHERE type = 'W';
|
|
||||||
-- Used when inserting analyzed housenumbers (exclude old-style entries).
|
|
||||||
CREATE INDEX idx_word_housenumbers ON word
|
|
||||||
USING btree(word) {{db.tablespace.address_index}}
|
|
||||||
WHERE type = 'H' and word is not null;
|
|
||||||
|
|
||||||
GRANT SELECT ON word TO "{{config.DATABASE_WEBUSER}}";
|
|
||||||
|
|
||||||
DROP SEQUENCE IF EXISTS seq_word;
|
|
||||||
CREATE SEQUENCE seq_word start 1;
|
|
||||||
GRANT SELECT ON seq_word to "{{config.DATABASE_WEBUSER}}";
|
|
||||||
@@ -110,7 +110,7 @@ class UpdateRefresh:
|
|||||||
|
|
||||||
if args.word_counts:
|
if args.word_counts:
|
||||||
LOG.warning('Recompute word statistics')
|
LOG.warning('Recompute word statistics')
|
||||||
self._get_tokenizer(args.config).update_statistics()
|
self._get_tokenizer(args.config).update_statistics(args.config)
|
||||||
|
|
||||||
if args.address_levels:
|
if args.address_levels:
|
||||||
LOG.warning('Updating address levels')
|
LOG.warning('Updating address levels')
|
||||||
|
|||||||
@@ -169,7 +169,7 @@ class SetupAll:
|
|||||||
tokenizer.finalize_import(args.config)
|
tokenizer.finalize_import(args.config)
|
||||||
|
|
||||||
LOG.warning('Recompute word counts')
|
LOG.warning('Recompute word counts')
|
||||||
tokenizer.update_statistics()
|
tokenizer.update_statistics(args.config)
|
||||||
|
|
||||||
webdir = args.project_dir / 'website'
|
webdir = args.project_dir / 'website'
|
||||||
LOG.warning('Setup website at %s', webdir)
|
LOG.warning('Setup website at %s', webdir)
|
||||||
|
|||||||
@@ -90,6 +90,18 @@ class SQLPreprocessor:
|
|||||||
self.env.globals['postgres'] = _setup_postgresql_features(conn)
|
self.env.globals['postgres'] = _setup_postgresql_features(conn)
|
||||||
|
|
||||||
|
|
||||||
|
def run_string(self, conn: Connection, template: str, **kwargs: Any) -> None:
|
||||||
|
""" Execute the given SQL template string on the connection.
|
||||||
|
The keyword arguments may supply additional parameters
|
||||||
|
for preprocessing.
|
||||||
|
"""
|
||||||
|
sql = self.env.from_string(template).render(**kwargs)
|
||||||
|
|
||||||
|
with conn.cursor() as cur:
|
||||||
|
cur.execute(sql)
|
||||||
|
conn.commit()
|
||||||
|
|
||||||
|
|
||||||
def run_sql_file(self, conn: Connection, name: str, **kwargs: Any) -> None:
|
def run_sql_file(self, conn: Connection, name: str, **kwargs: Any) -> None:
|
||||||
""" Execute the given SQL file on the connection. The keyword arguments
|
""" Execute the given SQL file on the connection. The keyword arguments
|
||||||
may supply additional parameters for preprocessing.
|
may supply additional parameters for preprocessing.
|
||||||
|
|||||||
@@ -201,7 +201,7 @@ class AbstractTokenizer(ABC):
|
|||||||
|
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def update_statistics(self) -> None:
|
def update_statistics(self, config: Configuration) -> None:
|
||||||
""" Recompute any tokenizer statistics necessary for efficient lookup.
|
""" Recompute any tokenizer statistics necessary for efficient lookup.
|
||||||
This function is meant to be called from time to time by the user
|
This function is meant to be called from time to time by the user
|
||||||
to improve performance. However, the tokenizer must not depend on
|
to improve performance. However, the tokenizer must not depend on
|
||||||
|
|||||||
@@ -31,6 +31,11 @@ DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"
|
|||||||
|
|
||||||
LOG = logging.getLogger()
|
LOG = logging.getLogger()
|
||||||
|
|
||||||
|
WORD_TYPES =(('country_names', 'C'),
|
||||||
|
('postcodes', 'P'),
|
||||||
|
('full_word', 'W'),
|
||||||
|
('housenumbers', 'H'))
|
||||||
|
|
||||||
def create(dsn: str, data_dir: Path) -> 'ICUTokenizer':
|
def create(dsn: str, data_dir: Path) -> 'ICUTokenizer':
|
||||||
""" Create a new instance of the tokenizer provided by this module.
|
""" Create a new instance of the tokenizer provided by this module.
|
||||||
"""
|
"""
|
||||||
@@ -62,7 +67,8 @@ class ICUTokenizer(AbstractTokenizer):
|
|||||||
|
|
||||||
if init_db:
|
if init_db:
|
||||||
self.update_sql_functions(config)
|
self.update_sql_functions(config)
|
||||||
self._init_db_tables(config)
|
self._setup_db_tables(config, 'word')
|
||||||
|
self._create_base_indices(config, 'word')
|
||||||
|
|
||||||
|
|
||||||
def init_from_project(self, config: Configuration) -> None:
|
def init_from_project(self, config: Configuration) -> None:
|
||||||
@@ -80,9 +86,7 @@ class ICUTokenizer(AbstractTokenizer):
|
|||||||
""" Do any required postprocessing to make the tokenizer data ready
|
""" Do any required postprocessing to make the tokenizer data ready
|
||||||
for use.
|
for use.
|
||||||
"""
|
"""
|
||||||
with connect(self.dsn) as conn:
|
self._create_lookup_indices(config, 'word')
|
||||||
sqlp = SQLPreprocessor(conn, config)
|
|
||||||
sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')
|
|
||||||
|
|
||||||
|
|
||||||
def update_sql_functions(self, config: Configuration) -> None:
|
def update_sql_functions(self, config: Configuration) -> None:
|
||||||
@@ -100,24 +104,35 @@ class ICUTokenizer(AbstractTokenizer):
|
|||||||
self.init_from_project(config)
|
self.init_from_project(config)
|
||||||
|
|
||||||
|
|
||||||
def update_statistics(self) -> None:
|
def update_statistics(self, config: Configuration) -> None:
|
||||||
""" Recompute frequencies for all name words.
|
""" Recompute frequencies for all name words.
|
||||||
"""
|
"""
|
||||||
with connect(self.dsn) as conn:
|
with connect(self.dsn) as conn:
|
||||||
if conn.table_exists('search_name'):
|
if not conn.table_exists('search_name'):
|
||||||
with conn.cursor() as cur:
|
return
|
||||||
cur.drop_table("word_frequencies")
|
|
||||||
LOG.info("Computing word frequencies")
|
with conn.cursor() as cur:
|
||||||
cur.execute("""CREATE TEMP TABLE word_frequencies AS
|
LOG.info('Computing word frequencies')
|
||||||
SELECT unnest(name_vector) as id, count(*)
|
cur.drop_table('word_frequencies')
|
||||||
FROM search_name GROUP BY id""")
|
cur.execute("""CREATE TEMP TABLE word_frequencies AS
|
||||||
cur.execute("CREATE INDEX ON word_frequencies(id)")
|
SELECT unnest(name_vector) as id, count(*)
|
||||||
LOG.info("Update word table with recomputed frequencies")
|
FROM search_name GROUP BY id""")
|
||||||
cur.execute("""UPDATE word
|
cur.execute('CREATE INDEX ON word_frequencies(id)')
|
||||||
SET info = info || jsonb_build_object('count', count)
|
LOG.info('Update word table with recomputed frequencies')
|
||||||
FROM word_frequencies WHERE word_id = id""")
|
cur.drop_table('tmp_word')
|
||||||
cur.drop_table("word_frequencies")
|
cur.execute("""CREATE TABLE tmp_word AS
|
||||||
|
SELECT word_id, word_token, type, word,
|
||||||
|
(CASE WHEN wf.count is null THEN info
|
||||||
|
ELSE info || jsonb_build_object('count', wf.count)
|
||||||
|
END) as info
|
||||||
|
FROM word LEFT JOIN word_frequencies wf
|
||||||
|
ON word.word_id = wf.id""")
|
||||||
|
cur.drop_table('word_frequencies')
|
||||||
conn.commit()
|
conn.commit()
|
||||||
|
self._create_base_indices(config, 'tmp_word')
|
||||||
|
self._create_lookup_indices(config, 'tmp_word')
|
||||||
|
self._move_temporary_word_table('tmp_word')
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def _cleanup_housenumbers(self) -> None:
|
def _cleanup_housenumbers(self) -> None:
|
||||||
@@ -219,16 +234,81 @@ class ICUTokenizer(AbstractTokenizer):
|
|||||||
self.loader.save_config_to_db(conn)
|
self.loader.save_config_to_db(conn)
|
||||||
|
|
||||||
|
|
||||||
def _init_db_tables(self, config: Configuration) -> None:
|
def _setup_db_tables(self, config: Configuration, table_name: str) -> None:
|
||||||
|
""" Create an empty word table and its id sequence and make both
|
||||||
|
readable by the web user.
|
||||||
|
"""
|
||||||
|
with connect(self.dsn) as conn:
|
||||||
|
with conn.cursor() as cur:
|
||||||
|
cur.drop_table(table_name)
|
||||||
|
sqlp = SQLPreprocessor(conn, config)
|
||||||
|
sqlp.run_string(conn, """
|
||||||
|
CREATE TABLE {{table_name}} (
|
||||||
|
word_id INTEGER,
|
||||||
|
word_token text NOT NULL,
|
||||||
|
type text NOT NULL,
|
||||||
|
word text,
|
||||||
|
info jsonb
|
||||||
|
) {{db.tablespace.search_data}};
|
||||||
|
GRANT SELECT ON {{table_name}} TO "{{config.DATABASE_WEBUSER}}";
|
||||||
|
|
||||||
|
DROP SEQUENCE IF EXISTS seq_{{table_name}};
|
||||||
|
CREATE SEQUENCE seq_{{table_name}} start 1;
|
||||||
|
GRANT SELECT ON seq_{{table_name}} to "{{config.DATABASE_WEBUSER}}";
|
||||||
|
""", table_name=table_name)
|
||||||
|
|
||||||
|
|
||||||
|
def _create_base_indices(self, config: Configuration, table_name: str) -> None:
|
||||||
""" Set up the word table and fill it with pre-computed word
|
""" Set up the word table and fill it with pre-computed word
|
||||||
frequencies.
|
frequencies.
|
||||||
"""
|
"""
|
||||||
with connect(self.dsn) as conn:
|
with connect(self.dsn) as conn:
|
||||||
sqlp = SQLPreprocessor(conn, config)
|
sqlp = SQLPreprocessor(conn, config)
|
||||||
sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
|
sqlp.run_string(conn,
|
||||||
|
"""CREATE INDEX idx_{{table_name}}_word_token ON {{table_name}}
|
||||||
|
USING BTREE (word_token) {{db.tablespace.search_index}}""",
|
||||||
|
table_name=table_name)
|
||||||
|
for name, ctype in WORD_TYPES:
|
||||||
|
sqlp.run_string(conn,
|
||||||
|
"""CREATE INDEX idx_{{table_name}}_{{idx_name}} ON {{table_name}}
|
||||||
|
USING BTREE (word) {{db.tablespace.address_index}}
|
||||||
|
WHERE type = '{{column_type}}'
|
||||||
|
""",
|
||||||
|
table_name=table_name, idx_name=name,
|
||||||
|
column_type=ctype)
|
||||||
|
|
||||||
|
|
||||||
|
def _create_lookup_indices(self, config: Configuration, table_name: str) -> None:
|
||||||
|
""" Create additional indexes used when running the API.
|
||||||
|
"""
|
||||||
|
with connect(self.dsn) as conn:
|
||||||
|
sqlp = SQLPreprocessor(conn, config)
|
||||||
|
# Index required for details lookup.
|
||||||
|
sqlp.run_string(conn, """
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_{{table_name}}_word_id
|
||||||
|
ON {{table_name}} USING BTREE (word_id) {{db.tablespace.search_index}}
|
||||||
|
""",
|
||||||
|
table_name=table_name)
|
||||||
|
|
||||||
|
|
||||||
|
def _move_temporary_word_table(self, old: str) -> None:
|
||||||
|
""" Rename all tables and indexes used by the tokenizer.
|
||||||
|
"""
|
||||||
|
with connect(self.dsn) as conn:
|
||||||
|
with conn.cursor() as cur:
|
||||||
|
cur.drop_table('word')
|
||||||
|
cur.execute(f"ALTER TABLE {old} RENAME TO word")
|
||||||
|
for idx in ('word_token', 'word_id'):
|
||||||
|
cur.execute(f"""ALTER INDEX idx_{old}_{idx}
|
||||||
|
RENAME TO idx_word_{idx}""")
|
||||||
|
for name, _ in WORD_TYPES:
|
||||||
|
cur.execute(f"""ALTER INDEX idx_{old}_{name}
|
||||||
|
RENAME TO idx_word_{name}""")
|
||||||
conn.commit()
|
conn.commit()
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class ICUNameAnalyzer(AbstractAnalyzer):
|
class ICUNameAnalyzer(AbstractAnalyzer):
|
||||||
""" The ICU analyzer uses the ICU library for splitting names.
|
""" The ICU analyzer uses the ICU library for splitting names.
|
||||||
|
|
||||||
|
|||||||
@@ -210,7 +210,7 @@ class LegacyTokenizer(AbstractTokenizer):
|
|||||||
self._save_config(conn, config)
|
self._save_config(conn, config)
|
||||||
|
|
||||||
|
|
||||||
def update_statistics(self) -> None:
|
def update_statistics(self, _: Configuration) -> None:
|
||||||
""" Recompute the frequency of full words.
|
""" Recompute the frequency of full words.
|
||||||
"""
|
"""
|
||||||
with connect(self.dsn) as conn:
|
with connect(self.dsn) as conn:
|
||||||
|
|||||||
@@ -38,10 +38,10 @@ class DummyTokenizer:
|
|||||||
def finalize_import(self, *args):
|
def finalize_import(self, *args):
|
||||||
self.finalize_import_called = True
|
self.finalize_import_called = True
|
||||||
|
|
||||||
def update_statistics(self):
|
def update_statistics(self, *args):
|
||||||
self.update_statistics_called = True
|
self.update_statistics_called = True
|
||||||
|
|
||||||
def update_word_tokens(self):
|
def update_word_tokens(self, *args):
|
||||||
self.update_word_tokens_called = True
|
self.update_word_tokens_called = True
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -7,7 +7,6 @@
|
|||||||
"""
|
"""
|
||||||
Tests for ICU tokenizer.
|
Tests for ICU tokenizer.
|
||||||
"""
|
"""
|
||||||
import shutil
|
|
||||||
import yaml
|
import yaml
|
||||||
import itertools
|
import itertools
|
||||||
|
|
||||||
@@ -32,8 +31,6 @@ def test_config(project_env, tmp_path):
|
|||||||
sqldir.mkdir()
|
sqldir.mkdir()
|
||||||
(sqldir / 'tokenizer').mkdir()
|
(sqldir / 'tokenizer').mkdir()
|
||||||
(sqldir / 'tokenizer' / 'icu_tokenizer.sql').write_text("SELECT 'a'")
|
(sqldir / 'tokenizer' / 'icu_tokenizer.sql').write_text("SELECT 'a'")
|
||||||
shutil.copy(str(project_env.lib_dir.sql / 'tokenizer' / 'icu_tokenizer_tables.sql'),
|
|
||||||
str(sqldir / 'tokenizer' / 'icu_tokenizer_tables.sql'))
|
|
||||||
|
|
||||||
project_env.lib_dir.sql = sqldir
|
project_env.lib_dir.sql = sqldir
|
||||||
|
|
||||||
@@ -204,16 +201,14 @@ def test_update_sql_functions(db_prop, temp_db_cursor,
|
|||||||
|
|
||||||
def test_finalize_import(tokenizer_factory, temp_db_conn,
|
def test_finalize_import(tokenizer_factory, temp_db_conn,
|
||||||
temp_db_cursor, test_config, sql_preprocessor_cfg):
|
temp_db_cursor, test_config, sql_preprocessor_cfg):
|
||||||
func_file = test_config.lib_dir.sql / 'tokenizer' / 'legacy_tokenizer_indices.sql'
|
|
||||||
func_file.write_text("""CREATE FUNCTION test() RETURNS TEXT
|
|
||||||
AS $$ SELECT 'b'::text $$ LANGUAGE SQL""")
|
|
||||||
|
|
||||||
tok = tokenizer_factory()
|
tok = tokenizer_factory()
|
||||||
tok.init_new_db(test_config)
|
tok.init_new_db(test_config)
|
||||||
|
|
||||||
|
assert not temp_db_conn.index_exists('idx_word_word_id')
|
||||||
|
|
||||||
tok.finalize_import(test_config)
|
tok.finalize_import(test_config)
|
||||||
|
|
||||||
temp_db_cursor.scalar('SELECT test()') == 'b'
|
assert temp_db_conn.index_exists('idx_word_word_id')
|
||||||
|
|
||||||
|
|
||||||
def test_check_database(test_config, tokenizer_factory,
|
def test_check_database(test_config, tokenizer_factory,
|
||||||
@@ -224,19 +219,20 @@ def test_check_database(test_config, tokenizer_factory,
|
|||||||
assert tok.check_database(test_config) is None
|
assert tok.check_database(test_config) is None
|
||||||
|
|
||||||
|
|
||||||
def test_update_statistics_reverse_only(word_table, tokenizer_factory):
|
def test_update_statistics_reverse_only(word_table, tokenizer_factory, test_config):
|
||||||
tok = tokenizer_factory()
|
tok = tokenizer_factory()
|
||||||
tok.update_statistics()
|
tok.update_statistics(test_config)
|
||||||
|
|
||||||
|
|
||||||
def test_update_statistics(word_table, table_factory, temp_db_cursor, tokenizer_factory):
|
def test_update_statistics(word_table, table_factory, temp_db_cursor,
|
||||||
|
tokenizer_factory, test_config):
|
||||||
word_table.add_full_word(1000, 'hello')
|
word_table.add_full_word(1000, 'hello')
|
||||||
table_factory('search_name',
|
table_factory('search_name',
|
||||||
'place_id BIGINT, name_vector INT[]',
|
'place_id BIGINT, name_vector INT[]',
|
||||||
[(12, [1000])])
|
[(12, [1000])])
|
||||||
tok = tokenizer_factory()
|
tok = tokenizer_factory()
|
||||||
|
|
||||||
tok.update_statistics()
|
tok.update_statistics(test_config)
|
||||||
|
|
||||||
assert temp_db_cursor.scalar("""SELECT count(*) FROM word
|
assert temp_db_cursor.scalar("""SELECT count(*) FROM word
|
||||||
WHERE type = 'W' and
|
WHERE type = 'W' and
|
||||||
|
|||||||
@@ -238,19 +238,19 @@ def test_check_database_bad_setup(test_config, tokenizer_factory, monkeypatch,
|
|||||||
assert tok.check_database(False) is not None
|
assert tok.check_database(False) is not None
|
||||||
|
|
||||||
|
|
||||||
def test_update_statistics_reverse_only(word_table, tokenizer_factory):
|
def test_update_statistics_reverse_only(word_table, tokenizer_factory, test_config):
|
||||||
tok = tokenizer_factory()
|
tok = tokenizer_factory()
|
||||||
tok.update_statistics()
|
tok.update_statistics(test_config)
|
||||||
|
|
||||||
|
|
||||||
def test_update_statistics(word_table, table_factory, temp_db_cursor, tokenizer_factory):
|
def test_update_statistics(word_table, table_factory, temp_db_cursor, tokenizer_factory, test_config):
|
||||||
word_table.add_full_word(1000, 'hello')
|
word_table.add_full_word(1000, 'hello')
|
||||||
table_factory('search_name',
|
table_factory('search_name',
|
||||||
'place_id BIGINT, name_vector INT[]',
|
'place_id BIGINT, name_vector INT[]',
|
||||||
[(12, [1000])])
|
[(12, [1000])])
|
||||||
tok = tokenizer_factory()
|
tok = tokenizer_factory()
|
||||||
|
|
||||||
tok.update_statistics()
|
tok.update_statistics(test_config)
|
||||||
|
|
||||||
assert temp_db_cursor.scalar("""SELECT count(*) FROM word
|
assert temp_db_cursor.scalar("""SELECT count(*) FROM word
|
||||||
WHERE word_token like ' %' and
|
WHERE word_token like ' %' and
|
||||||
|
|||||||
Reference in New Issue
Block a user