do not save word counts of 1

A count of 1 is the default value, which is assumed whenever the count is
missing, so it does not need to be stored explicitly.
This commit is contained in:
Sarah Hoffmann
2025-03-20 20:01:46 +01:00
parent 46579f08e4
commit 1705bb5f57

View File

@@ -121,10 +121,10 @@ class ICUTokenizer(AbstractTokenizer):
SELECT unnest(nameaddress_vector) as id, count(*) SELECT unnest(nameaddress_vector) as id, count(*)
FROM search_name GROUP BY id) FROM search_name GROUP BY id)
SELECT coalesce(a.id, w.id) as id, SELECT coalesce(a.id, w.id) as id,
(CASE WHEN w.count is null THEN '{}'::JSONB (CASE WHEN w.count is null or w.count <= 1 THEN '{}'::JSONB
ELSE jsonb_build_object('count', w.count) END ELSE jsonb_build_object('count', w.count) END
|| ||
CASE WHEN a.count is null THEN '{}'::JSONB CASE WHEN a.count is null or a.count <= 1 THEN '{}'::JSONB
ELSE jsonb_build_object('addr_count', a.count) END) as info ELSE jsonb_build_object('addr_count', a.count) END) as info
FROM word_freq w FULL JOIN addr_freq a ON a.id = w.id; FROM word_freq w FULL JOIN addr_freq a ON a.id = w.id;
""") """)
@@ -134,9 +134,10 @@ class ICUTokenizer(AbstractTokenizer):
drop_tables(conn, 'tmp_word') drop_tables(conn, 'tmp_word')
cur.execute("""CREATE TABLE tmp_word AS cur.execute("""CREATE TABLE tmp_word AS
SELECT word_id, word_token, type, word, SELECT word_id, word_token, type, word,
(CASE WHEN wf.info is null THEN word.info coalesce(word.info, '{}'::jsonb)
ELSE coalesce(word.info, '{}'::jsonb) || wf.info - 'count' - 'addr_count' ||
END) as info coalesce(wf.info, '{}'::jsonb)
as info
FROM word LEFT JOIN word_frequencies wf FROM word LEFT JOIN word_frequencies wf
ON word.word_id = wf.id ON word.word_id = wf.id
""") """)