extend word statistics to address index

Word frequency in names is not sufficient to interpolate word
frequency in the address because names of towns, states, etc. are
much more frequently used than, say, street names.
This commit is contained in:
Sarah Hoffmann
2024-03-13 15:10:25 +01:00
parent 9c48726691
commit bb5de9b955
4 changed files with 88 additions and 19 deletions

View File

@@ -102,6 +102,7 @@ class Token(ABC):
     lookup_word: str
     is_indexed: bool
+    addr_count: int = 1

     @abstractmethod
     def get_category(self) -> Tuple[str, str]:

View File

@@ -110,7 +110,8 @@ class UpdateRefresh:
         if args.word_counts:
             LOG.warning('Recompute word statistics')
-            self._get_tokenizer(args.config).update_statistics(args.config)
+            self._get_tokenizer(args.config).update_statistics(args.config,
+                                                               threads=args.threads or 1)

         if args.address_levels:
             LOG.warning('Updating address levels')

View File

@@ -168,7 +168,7 @@ class SetupAll:
             tokenizer.finalize_import(args.config)

             LOG.warning('Recompute word counts')
-            tokenizer.update_statistics(args.config)
+            tokenizer.update_statistics(args.config, threads=num_threads)

             webdir = args.project_dir / 'website'
             LOG.warning('Setup website at %s', webdir)

View File

@@ -104,7 +104,7 @@ class ICUTokenizer(AbstractTokenizer):
         self.init_from_project(config)

-    def update_statistics(self, config: Configuration) -> None:
+    def update_statistics(self, config: Configuration, threads: int = 2) -> None:
         """ Recompute frequencies for all name words.
         """
         with connect(self.dsn) as conn:
@@ -112,22 +112,89 @@ class ICUTokenizer(AbstractTokenizer):
                 return

             with conn.cursor() as cur:
-                LOG.info('Computing word frequencies')
-                cur.drop_table('word_frequencies')
-                cur.execute("""CREATE TEMP TABLE word_frequencies AS
-                                 SELECT unnest(name_vector) as id, count(*)
-                                 FROM search_name GROUP BY id""")
-                cur.execute('CREATE INDEX ON word_frequencies(id)')
-                LOG.info('Update word table with recomputed frequencies')
-                cur.drop_table('tmp_word')
-                cur.execute("""CREATE TABLE tmp_word AS
-                                SELECT word_id, word_token, type, word,
-                                       (CASE WHEN wf.count is null THEN info
-                                          ELSE info || jsonb_build_object('count', wf.count)
-                                        END) as info
-                                FROM word LEFT JOIN word_frequencies wf
-                                     ON word.word_id = wf.id""")
-                cur.drop_table('word_frequencies')
+                cur.execute('ANALYSE search_name')
+                if threads > 1:
+                    cur.execute('SET max_parallel_workers_per_gather TO %s',
+                                (min(threads, 6),))
+
+                if conn.server_version_tuple() < (12, 0):
+                    LOG.info('Computing word frequencies')
+                    cur.drop_table('word_frequencies')
+                    cur.drop_table('addressword_frequencies')
+                    cur.execute("""CREATE TEMP TABLE word_frequencies AS
+                                     SELECT unnest(name_vector) as id, count(*)
+                                     FROM search_name GROUP BY id""")
+                    cur.execute('CREATE INDEX ON word_frequencies(id)')
+                    cur.execute("""CREATE TEMP TABLE addressword_frequencies AS
+                                     SELECT unnest(nameaddress_vector) as id, count(*)
+                                     FROM search_name GROUP BY id""")
+                    cur.execute('CREATE INDEX ON addressword_frequencies(id)')
+                    cur.execute("""CREATE OR REPLACE FUNCTION word_freq_update(wid INTEGER,
+                                                                               INOUT info JSONB)
+                                   AS $$
+                                   DECLARE rec RECORD;
+                                   BEGIN
+                                   IF info is null THEN
+                                     info = '{}'::jsonb;
+                                   END IF;
+                                   FOR rec IN SELECT count FROM word_frequencies WHERE id = wid
+                                   LOOP
+                                     info = info || jsonb_build_object('count', rec.count);
+                                   END LOOP;
+                                   FOR rec IN SELECT count FROM addressword_frequencies WHERE id = wid
+                                   LOOP
+                                     info = info || jsonb_build_object('addr_count', rec.count);
+                                   END LOOP;
+                                   IF info = '{}'::jsonb THEN
+                                     info = null;
+                                   END IF;
+                                   END;
+                                   $$ LANGUAGE plpgsql IMMUTABLE;
+                                """)
+                    LOG.info('Update word table with recomputed frequencies')
+                    cur.drop_table('tmp_word')
+                    cur.execute("""CREATE TABLE tmp_word AS
+                                    SELECT word_id, word_token, type, word,
+                                           word_freq_update(word_id, info) as info
+                                    FROM word
+                                """)
+                    cur.drop_table('word_frequencies')
+                    cur.drop_table('addressword_frequencies')
+                else:
+                    LOG.info('Computing word frequencies')
+                    cur.drop_table('word_frequencies')
+                    cur.execute("""
+                      CREATE TEMP TABLE word_frequencies AS
+                      WITH word_freq AS MATERIALIZED (
+                               SELECT unnest(name_vector) as id, count(*)
+                                     FROM search_name GROUP BY id),
+                           addr_freq AS MATERIALIZED (
+                               SELECT unnest(nameaddress_vector) as id, count(*)
+                                     FROM search_name GROUP BY id)
+                      SELECT coalesce(a.id, w.id) as id,
+                             (CASE WHEN w.count is null THEN '{}'::JSONB
+                                  ELSE jsonb_build_object('count', w.count) END
+                              ||
+                              CASE WHEN a.count is null THEN '{}'::JSONB
+                                  ELSE jsonb_build_object('addr_count', a.count) END) as info
+                      FROM word_freq w FULL JOIN addr_freq a ON a.id = w.id;
+                      """)
+                    cur.execute('CREATE UNIQUE INDEX ON word_frequencies(id) INCLUDE(info)')
+                    cur.execute('ANALYSE word_frequencies')
+                    LOG.info('Update word table with recomputed frequencies')
+                    cur.drop_table('tmp_word')
+                    cur.execute("""CREATE TABLE tmp_word AS
+                                    SELECT word_id, word_token, type, word,
+                                           (CASE WHEN wf.info is null THEN word.info
+                                            ELSE coalesce(word.info, '{}'::jsonb) || wf.info
+                                            END) as info
+                                    FROM word LEFT JOIN word_frequencies wf
+                                         ON word.word_id = wf.id
+                                """)
+                    cur.drop_table('word_frequencies')
+
+            with conn.cursor() as cur:
+                cur.execute('SET max_parallel_workers_per_gather TO 0')
+
             sqlp = SQLPreprocessor(conn, config)
             sqlp.run_string(conn,