mirror of
https://github.com/osm-search/Nominatim.git
synced 2026-02-15 19:07:58 +00:00
Requires a second wrapper class for the word table with the new layout. This class is interface-compatible, so that later when the ICU tokenizer becomes the default, all tests that depend on behaviour of the default tokenizer can be switched to the other wrapper.
158 lines
4.4 KiB
PL/PgSQL
158 lines
4.4 KiB
PL/PgSQL
-- Return the tokens under which the given place can be searched by name.
--
-- These are the tokens that will be saved in the search_name table.
-- They are read from the 'names' field of the token info, whose text value
-- is interpreted as an integer array literal.
CREATE OR REPLACE FUNCTION token_get_name_search_tokens(info JSONB)
  RETURNS INTEGER[]
AS $$
  SELECT CAST(info->>'names' AS INTEGER[])
$$ LANGUAGE SQL IMMUTABLE STRICT;
|
|
|
|
|
|
-- Return the tokens used to match the name of this place against the
-- names of other places.
--
-- This should usually be restricted to full name tokens. This
-- implementation returns the content of the 'names' field of the token
-- info, interpreted as an integer array literal.
CREATE OR REPLACE FUNCTION token_get_name_match_tokens(info JSONB)
  RETURNS INTEGER[]
AS $$
  SELECT CAST(info->>'names' AS INTEGER[])
$$ LANGUAGE SQL IMMUTABLE STRICT;
|
|
|
|
|
|
-- Return the housenumber tokens that apply to the place.
--
-- The tokens are read from the 'hnr_tokens' field of the token info, whose
-- text value is interpreted as an integer array literal.
CREATE OR REPLACE FUNCTION token_get_housenumber_search_tokens(info JSONB)
  RETURNS INTEGER[]
AS $$
  SELECT CAST(info->>'hnr_tokens' AS INTEGER[])
$$ LANGUAGE SQL IMMUTABLE STRICT;
|
|
|
|
|
|
-- Return the housenumber in the form in which it can be matched during
-- search.
--
-- This is the 'hnr' field of the token info, returned as text. The result
-- is NULL when the field is absent.
CREATE OR REPLACE FUNCTION token_normalized_housenumber(info JSONB)
  RETURNS TEXT
AS $$
  SELECT info ->> 'hnr';
$$ LANGUAGE SQL IMMUTABLE STRICT;
|
|
|
|
|
|
-- Return the content of the 'street' field of the token info as an
-- integer array. The result is NULL when the field is absent.
CREATE OR REPLACE FUNCTION token_addr_street_match_tokens(info JSONB)
  RETURNS INTEGER[]
AS $$
  SELECT CAST(info->>'street' AS INTEGER[])
$$ LANGUAGE SQL IMMUTABLE STRICT;
|
|
|
|
|
|
-- Return the content of the 'place_match' field of the token info as an
-- integer array. The result is NULL when the field is absent.
CREATE OR REPLACE FUNCTION token_addr_place_match_tokens(info JSONB)
  RETURNS INTEGER[]
AS $$
  SELECT CAST(info->>'place_match' AS INTEGER[])
$$ LANGUAGE SQL IMMUTABLE STRICT;
|
|
|
|
|
|
-- Return the content of the 'place_search' field of the token info as an
-- integer array. The result is NULL when the field is absent.
CREATE OR REPLACE FUNCTION token_addr_place_search_tokens(info JSONB)
  RETURNS INTEGER[]
AS $$
  SELECT CAST(info->>'place_search' AS INTEGER[])
$$ LANGUAGE SQL IMMUTABLE STRICT;
|
|
|
|
|
|
-- Row type returned by token_get_address_tokens(): one key together with
-- its match and search token arrays.
--
-- Any existing definition is dropped first. CASCADE also removes database
-- objects that depend on the old type, so these need to be recreated.
DROP TYPE IF EXISTS token_addresstoken CASCADE;
CREATE TYPE token_addresstoken AS (
  key           TEXT,
  match_tokens  INTEGER[],
  search_tokens INTEGER[]
);
|
|
|
|
-- Expand the 'addr' object of the token info into one row per key.
--
-- Each value is read as a two-element JSON array: element 0 supplies the
-- search tokens and element 1 the match tokens, each given as the text of
-- an integer array literal.
CREATE OR REPLACE FUNCTION token_get_address_tokens(info JSONB)
  RETURNS SETOF token_addresstoken
AS $$
  SELECT addr.key,
         CAST(addr.value->>1 AS INTEGER[]) AS match_tokens,
         CAST(addr.value->>0 AS INTEGER[]) AS search_tokens
    FROM jsonb_each(info->'addr') AS addr;
$$ LANGUAGE SQL IMMUTABLE STRICT;
|
|
|
|
|
|
-- Normalize a postcode for storage and comparison.
--
-- A value containing a comma or a semicolon yields NULL. Any other value is
-- returned trimmed of surrounding spaces and converted to upper case.
CREATE OR REPLACE FUNCTION token_normalized_postcode(postcode TEXT)
  RETURNS TEXT
AS $$
  SELECT CASE
           WHEN postcode SIMILAR TO '%(,|;)%' THEN NULL
           ELSE upper(trim(postcode))
         END;
$$ LANGUAGE SQL IMMUTABLE STRICT;
|
|
|
|
|
|
-- Return the part of the token info that should be saved permanently in
-- the database.
--
-- This implementation keeps nothing: the result is always NULL.
CREATE OR REPLACE FUNCTION token_strip_info(info JSONB)
  RETURNS JSONB
AS $$
  SELECT CAST(NULL AS JSONB);
$$ LANGUAGE SQL IMMUTABLE STRICT;
|
|
|
|
--------------- private functions ----------------------------------------------
|
|
|
|
-- Look up or create the full-word token for a name and collect the
-- partial-word tokens of its terms.
--
-- Parameters:
--   norm_term     value compared against info->>'word' of the full-word
--                 rows (type 'W') to find an existing token
--   lookup_terms  lookup variants of the name; when the full word is new,
--                 one 'W' row is inserted per variant, all sharing the same
--                 word_id. The space-separated parts of the variants are the
--                 partial terms.
-- Output:
--   full_token      word_id of the full word
--   partial_tokens  word_ids of the partial terms (type 'w') whose count is
--                   below the max_word_freq threshold; missing partial
--                   terms are created with a count of 0
CREATE OR REPLACE FUNCTION getorcreate_full_word(norm_term TEXT, lookup_terms TEXT[],
                                                 OUT full_token INT,
                                                 OUT partial_tokens INT[])
  AS $$
DECLARE
  partial_terms TEXT[] = '{}'::TEXT[];
  term TEXT;
  term_id INTEGER;
  term_count INTEGER;
BEGIN
  SELECT min(word_id) INTO full_token
    FROM word WHERE info->>'word' = norm_term and type = 'W';

  IF full_token IS NULL THEN
    full_token := nextval('seq_word');
    INSERT INTO word (word_id, word_token, type, info)
      SELECT full_token, lookup_term, 'W',
             json_build_object('word', norm_term, 'count', 0)
        FROM unnest(lookup_terms) as lookup_term;
  END IF;

  -- Collect the distinct partial terms of all lookup variants.
  FOR term IN SELECT unnest(string_to_array(unnest(lookup_terms), ' ')) LOOP
    term := trim(term);
    IF NOT (ARRAY[term] <@ partial_terms) THEN
      partial_terms := partial_terms || term;
    END IF;
  END LOOP;

  partial_tokens := '{}'::INT[];
  FOR term IN SELECT unnest(partial_terms) LOOP
    -- info->>'count' yields TEXT. It must be cast before aggregating,
    -- otherwise max() compares lexicographically (e.g. '9' > '10') and the
    -- frequency check below works on the wrong value.
    SELECT min(word_id), max((info->>'count')::INTEGER) INTO term_id, term_count
      FROM word WHERE word_token = term and type = 'w';

    IF term_id IS NULL THEN
      term_id := nextval('seq_word');
      term_count := 0;
      INSERT INTO word (word_id, word_token, type, info)
        VALUES (term_id, term, 'w', json_build_object('count', term_count));
    END IF;

    -- Terms at or above the frequency threshold are left out of the result.
    IF term_count < {{ max_word_freq }} THEN
      partial_tokens := array_merge(partial_tokens, ARRAY[term_id]);
    END IF;
  END LOOP;
END;
$$
LANGUAGE plpgsql;
|
|
|
|
|
|
-- Return the word_id of the housenumber token (type 'H') for the given
-- lookup term, inserting a new row into the word table when none exists.
--
-- If several matching rows exist, the smallest word_id is returned. New
-- ids are drawn from the sequence seq_word.
CREATE OR REPLACE FUNCTION getorcreate_hnr_id(lookup_term TEXT)
  RETURNS INTEGER
  AS $$
DECLARE
  hnr_id INTEGER;
BEGIN
  SELECT min(w.word_id) INTO hnr_id
    FROM word w
   WHERE w.type = 'H' AND w.word_token = lookup_term;

  -- Token already known: nothing to create.
  IF hnr_id IS NOT NULL THEN
    RETURN hnr_id;
  END IF;

  hnr_id := nextval('seq_word');
  INSERT INTO word (word_id, word_token, type)
    VALUES (hnr_id, lookup_term, 'H');

  RETURN hnr_id;
END;
$$
LANGUAGE plpgsql;
|