Files
Nominatim/lib-sql/tokenizer/legacy_icu_tokenizer.sql
Sarah Hoffmann 1db098c05d reinstate word column in icu word table
Postgresql is very bad at creating statistics for jsonb
columns. The result is that the query planer tends to
use JIT for queries with a where over 'info' even when
there is an index.
2021-07-28 11:31:47 +02:00

158 lines
4.4 KiB
PL/PgSQL

-- Get tokens used for searching the given place.
--
-- These are the tokens that will be saved in the search_name table.
CREATE OR REPLACE FUNCTION token_get_name_search_tokens(info JSONB)
RETURNS INTEGER[]
AS $$
SELECT (info->>'names')::INTEGER[]
$$ LANGUAGE SQL IMMUTABLE STRICT;
-- Get tokens for matching the place name against others.
--
-- This should usually be restricted to full name tokens.
CREATE OR REPLACE FUNCTION token_get_name_match_tokens(info JSONB)
RETURNS INTEGER[]
AS $$
SELECT (info->>'names')::INTEGER[]
$$ LANGUAGE SQL IMMUTABLE STRICT;
-- Return the housenumber tokens applicable for the place.
CREATE OR REPLACE FUNCTION token_get_housenumber_search_tokens(info JSONB)
RETURNS INTEGER[]
AS $$
SELECT (info->>'hnr_tokens')::INTEGER[]
$$ LANGUAGE SQL IMMUTABLE STRICT;
-- Return the housenumber in the form that it can be matched during search.
CREATE OR REPLACE FUNCTION token_normalized_housenumber(info JSONB)
RETURNS TEXT
AS $$
SELECT info->>'hnr';
$$ LANGUAGE SQL IMMUTABLE STRICT;
CREATE OR REPLACE FUNCTION token_addr_street_match_tokens(info JSONB)
RETURNS INTEGER[]
AS $$
SELECT (info->>'street')::INTEGER[]
$$ LANGUAGE SQL IMMUTABLE STRICT;
CREATE OR REPLACE FUNCTION token_addr_place_match_tokens(info JSONB)
RETURNS INTEGER[]
AS $$
SELECT (info->>'place_match')::INTEGER[]
$$ LANGUAGE SQL IMMUTABLE STRICT;
CREATE OR REPLACE FUNCTION token_addr_place_search_tokens(info JSONB)
RETURNS INTEGER[]
AS $$
SELECT (info->>'place_search')::INTEGER[]
$$ LANGUAGE SQL IMMUTABLE STRICT;
DROP TYPE IF EXISTS token_addresstoken CASCADE;
CREATE TYPE token_addresstoken AS (
key TEXT,
match_tokens INT[],
search_tokens INT[]
);
CREATE OR REPLACE FUNCTION token_get_address_tokens(info JSONB)
RETURNS SETOF token_addresstoken
AS $$
SELECT key, (value->>1)::int[] as match_tokens,
(value->>0)::int[] as search_tokens
FROM jsonb_each(info->'addr');
$$ LANGUAGE SQL IMMUTABLE STRICT;
CREATE OR REPLACE FUNCTION token_normalized_postcode(postcode TEXT)
RETURNS TEXT
AS $$
SELECT CASE WHEN postcode SIMILAR TO '%(,|;)%' THEN NULL ELSE upper(trim(postcode))END;
$$ LANGUAGE SQL IMMUTABLE STRICT;
-- Return token info that should be saved permanently in the database.
CREATE OR REPLACE FUNCTION token_strip_info(info JSONB)
RETURNS JSONB
AS $$
SELECT NULL::JSONB;
$$ LANGUAGE SQL IMMUTABLE STRICT;
--------------- private functions ----------------------------------------------
CREATE OR REPLACE FUNCTION getorcreate_full_word(norm_term TEXT, lookup_terms TEXT[],
OUT full_token INT,
OUT partial_tokens INT[])
AS $$
DECLARE
partial_terms TEXT[] = '{}'::TEXT[];
term TEXT;
term_id INTEGER;
term_count INTEGER;
BEGIN
SELECT min(word_id) INTO full_token
FROM word WHERE word = norm_term and type = 'W';
IF full_token IS NULL THEN
full_token := nextval('seq_word');
INSERT INTO word (word_id, word_token, type, word, info)
SELECT full_token, lookup_term, 'W', norm_term,
json_build_object('count', 0)
FROM unnest(lookup_terms) as lookup_term;
END IF;
FOR term IN SELECT unnest(string_to_array(unnest(lookup_terms), ' ')) LOOP
term := trim(term);
IF NOT (ARRAY[term] <@ partial_terms) THEN
partial_terms := partial_terms || term;
END IF;
END LOOP;
partial_tokens := '{}'::INT[];
FOR term IN SELECT unnest(partial_terms) LOOP
SELECT min(word_id), max(info->>'count') INTO term_id, term_count
FROM word WHERE word_token = term and type = 'w';
IF term_id IS NULL THEN
term_id := nextval('seq_word');
term_count := 0;
INSERT INTO word (word_id, word_token, type, info)
VALUES (term_id, term, 'w', json_build_object('count', term_count));
END IF;
IF term_count < {{ max_word_freq }} THEN
partial_tokens := array_merge(partial_tokens, ARRAY[term_id]);
END IF;
END LOOP;
END;
$$
LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION getorcreate_hnr_id(lookup_term TEXT)
RETURNS INTEGER
AS $$
DECLARE
return_id INTEGER;
BEGIN
SELECT min(word_id) INTO return_id FROM word
WHERE word_token = lookup_term and type = 'H';
IF return_id IS NULL THEN
return_id := nextval('seq_word');
INSERT INTO word (word_id, word_token, type)
VALUES (return_id, lookup_term, 'H');
END IF;
RETURN return_id;
END;
$$
LANGUAGE plpgsql;