move abbreviation computation into import phase

This adds precomputation of abbreviated terms for names and removes abbreviation of terms in the query. Basic import works but still needs some thorough testing as well as speed improvements during import. New dependency for python library datrie.
2026-02-14 18:37:58 +00:00 · 2021-05-28 22:06:13 +02:00
parent 6ba00e6aee
commit 8413075249
10 changed files with 665 additions and 206 deletions
--- a/lib-sql/tokenizer/legacy_icu_tokenizer.sql
+++ b/lib-sql/tokenizer/legacy_icu_tokenizer.sql
@@ -87,25 +87,48 @@ $$ LANGUAGE SQL IMMUTABLE STRICT;

 --------------- private functions ----------------------------------------------

-CREATE OR REPLACE FUNCTION getorcreate_term_id(lookup_term TEXT)
-  RETURNS INTEGER
+CREATE OR REPLACE FUNCTION getorcreate_full_word(norm_term TEXT, lookup_terms TEXT[],
+                                                 OUT full_token INT,
+                                                 OUT partial_tokens INT[])
  AS $$
 DECLARE
-  return_id INTEGER;
+  partial_terms TEXT[] = '{}'::TEXT[];
+  term TEXT;
+  term_id INTEGER;
  term_count INTEGER;
 BEGIN
-  SELECT min(word_id), max(search_name_count) INTO return_id, term_count
-    FROM word WHERE word_token = lookup_term and class is null and type is null;
+  SELECT min(word_id) INTO full_token
+    FROM word WHERE word = norm_term and class is null and country_code is null;

-  IF return_id IS NULL THEN
-    return_id := nextval('seq_word');
-    INSERT INTO word (word_id, word_token, search_name_count)
-      VALUES (return_id, lookup_term, 0);
-  ELSEIF left(lookup_term, 1) = ' ' and term_count > {{ max_word_freq }} THEN
-    return_id := 0;
+  IF full_token IS NULL THEN
+    full_token := nextval('seq_word');
+    INSERT INTO word (word_id, word_token, word, search_name_count)
+      SELECT full_token, ' ' || lookup_term, norm_term, 0 FROM unnest(lookup_terms) as lookup_term;
  END IF;

-  RETURN return_id;
+  FOR term IN SELECT unnest(string_to_array(unnest(lookup_terms), ' ')) LOOP
+    term := trim(term);
+    IF NOT (ARRAY[term] <@ partial_terms) THEN
+      partial_terms := partial_terms || term;
+    END IF;
+  END LOOP;
+
+  partial_tokens := '{}'::INT[];
+  FOR term IN SELECT unnest(partial_terms) LOOP
+    SELECT min(word_id), max(search_name_count) INTO term_id, term_count
+      FROM word WHERE word_token = term and class is null and country_code is null;
+
+    IF term_id IS NULL THEN
+      term_id := nextval('seq_word');
+      term_count := 0;
+      INSERT INTO word (word_id, word_token, search_name_count)
+        VALUES (term_id, term, 0);
+    END IF;
+
+    IF term_count < {{ max_word_freq }} THEN
+      partial_tokens := array_merge(partial_tokens, ARRAY[term_id]);
+    END IF;
+  END LOOP;
 END;
 $$
 LANGUAGE plpgsql;