move abbreviation computation into import phase

This adds precomputation of abbreviated terms for names and removes
abbreviation of terms in the query. Basic import works but still
needs some thorough testing as well as speed improvements during
import.

New dependency for python library datrie.
This commit is contained in:
Sarah Hoffmann
2021-05-28 22:06:13 +02:00
parent 6ba00e6aee
commit 8413075249
10 changed files with 665 additions and 206 deletions

View File

@@ -87,25 +87,48 @@ $$ LANGUAGE SQL IMMUTABLE STRICT;
--------------- private functions ----------------------------------------------
CREATE OR REPLACE FUNCTION getorcreate_term_id(lookup_term TEXT)
RETURNS INTEGER
CREATE OR REPLACE FUNCTION getorcreate_full_word(norm_term TEXT, lookup_terms TEXT[],
OUT full_token INT,
OUT partial_tokens INT[])
AS $$
DECLARE
return_id INTEGER;
partial_terms TEXT[] = '{}'::TEXT[];
term TEXT;
term_id INTEGER;
term_count INTEGER;
BEGIN
SELECT min(word_id), max(search_name_count) INTO return_id, term_count
FROM word WHERE word_token = lookup_term and class is null and type is null;
SELECT min(word_id) INTO full_token
FROM word WHERE word = norm_term and class is null and country_code is null;
IF return_id IS NULL THEN
return_id := nextval('seq_word');
INSERT INTO word (word_id, word_token, search_name_count)
VALUES (return_id, lookup_term, 0);
ELSEIF left(lookup_term, 1) = ' ' and term_count > {{ max_word_freq }} THEN
return_id := 0;
IF full_token IS NULL THEN
full_token := nextval('seq_word');
INSERT INTO word (word_id, word_token, word, search_name_count)
SELECT full_token, ' ' || lookup_term, norm_term, 0 FROM unnest(lookup_terms) as lookup_term;
END IF;
RETURN return_id;
FOR term IN SELECT unnest(string_to_array(unnest(lookup_terms), ' ')) LOOP
term := trim(term);
IF NOT (ARRAY[term] <@ partial_terms) THEN
partial_terms := partial_terms || term;
END IF;
END LOOP;
partial_tokens := '{}'::INT[];
FOR term IN SELECT unnest(partial_terms) LOOP
SELECT min(word_id), max(search_name_count) INTO term_id, term_count
FROM word WHERE word_token = term and class is null and country_code is null;
IF term_id IS NULL THEN
term_id := nextval('seq_word');
term_count := 0;
INSERT INTO word (word_id, word_token, search_name_count)
VALUES (term_id, term, 0);
END IF;
IF term_count < {{ max_word_freq }} THEN
partial_tokens := array_merge(partial_tokens, ARRAY[term_id]);
END IF;
END LOOP;
END;
$$
LANGUAGE plpgsql;