Merge pull request #2454 from lonvia/sort-out-token-assignment-in-sql

ICU tokenizer: switch match method to using partial terms
This commit is contained in:
Sarah Hoffmann
2021-09-28 09:45:15 +02:00
committed by GitHub
11 changed files with 236 additions and 189 deletions

View File

@@ -43,7 +43,7 @@ LANGUAGE plpgsql STABLE;
-- find the parent road of the cut road parts
CREATE OR REPLACE FUNCTION get_interpolation_parent(street INTEGER[], place INTEGER[],
CREATE OR REPLACE FUNCTION get_interpolation_parent(token_info JSONB,
partition SMALLINT,
centroid GEOMETRY, geom GEOMETRY)
RETURNS BIGINT
@@ -52,7 +52,7 @@ DECLARE
parent_place_id BIGINT;
location RECORD;
BEGIN
parent_place_id := find_parent_for_address(street, place, partition, centroid);
parent_place_id := find_parent_for_address(token_info, partition, centroid);
IF parent_place_id is null THEN
FOR location IN SELECT place_id FROM placex
@@ -155,9 +155,8 @@ BEGIN
NEW.interpolationtype = NEW.address->'interpolation';
place_centroid := ST_PointOnSurface(NEW.linegeo);
NEW.parent_place_id = get_interpolation_parent(token_addr_street_match_tokens(NEW.token_info),
token_addr_place_match_tokens(NEW.token_info),
NEW.partition, place_centroid, NEW.linegeo);
NEW.parent_place_id = get_interpolation_parent(NEW.token_info, NEW.partition,
place_centroid, NEW.linegeo);
interpol_postcode := token_normalized_postcode(NEW.address->'postcode');

View File

@@ -66,7 +66,7 @@ LANGUAGE plpgsql STABLE;
CREATE OR REPLACE FUNCTION get_address_place(in_partition SMALLINT, feature GEOMETRY,
from_rank SMALLINT, to_rank SMALLINT,
extent FLOAT, tokens INT[])
extent FLOAT, token_info JSONB, key TEXT)
RETURNS nearfeaturecentr
AS $$
DECLARE
@@ -80,7 +80,7 @@ BEGIN
FROM location_area_large_{{ partition }}
WHERE geometry && ST_Expand(feature, extent)
AND rank_address between from_rank and to_rank
AND tokens && keywords
AND token_matches_address(token_info, key, keywords)
GROUP BY place_id, keywords, rank_address, rank_search, isguess, postcode, centroid
ORDER BY bool_or(ST_Intersects(geometry, feature)), distance LIMIT 1;
RETURN r;
@@ -148,18 +148,21 @@ LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION getNearestNamedRoadPlaceId(in_partition INTEGER,
point GEOMETRY,
isin_token INTEGER[])
token_info JSONB)
RETURNS BIGINT
AS $$
DECLARE
parent BIGINT;
BEGIN
IF not token_has_addr_street(token_info) THEN
RETURN NULL;
END IF;
{% for partition in db.partitions %}
IF in_partition = {{ partition }} THEN
SELECT place_id FROM search_name_{{ partition }}
INTO parent
WHERE name_vector && isin_token
WHERE token_matches_street(token_info, name_vector)
AND centroid && ST_Expand(point, 0.015)
AND address_rank between 26 and 27
ORDER BY ST_Distance(centroid, point) ASC limit 1;
@@ -174,19 +177,22 @@ LANGUAGE plpgsql STABLE;
CREATE OR REPLACE FUNCTION getNearestNamedPlacePlaceId(in_partition INTEGER,
point GEOMETRY,
isin_token INTEGER[])
token_info JSONB)
RETURNS BIGINT
AS $$
DECLARE
parent BIGINT;
BEGIN
IF not token_has_addr_place(token_info) THEN
RETURN NULL;
END IF;
{% for partition in db.partitions %}
IF in_partition = {{ partition }} THEN
SELECT place_id
INTO parent
FROM search_name_{{ partition }}
WHERE name_vector && isin_token
WHERE token_matches_place(token_info, name_vector)
AND centroid && ST_Expand(point, 0.04)
AND address_rank between 16 and 25
ORDER BY ST_Distance(centroid, point) ASC limit 1;

View File

@@ -104,8 +104,7 @@ CREATE OR REPLACE FUNCTION find_parent_for_poi(poi_osm_type CHAR(1),
poi_osm_id BIGINT,
poi_partition SMALLINT,
bbox GEOMETRY,
addr_street INTEGER[],
addr_place INTEGER[],
token_info JSONB,
is_place_addr BOOLEAN)
RETURNS BIGINT
AS $$
@@ -119,8 +118,7 @@ BEGIN
parent_place_id := find_associated_street(poi_osm_type, poi_osm_id);
IF parent_place_id is null THEN
parent_place_id := find_parent_for_address(addr_street, addr_place,
poi_partition, bbox);
parent_place_id := find_parent_for_address(token_info, poi_partition, bbox);
END IF;
IF parent_place_id is null and poi_osm_type = 'N' THEN
@@ -333,13 +331,14 @@ BEGIN
WHERE s.place_id = parent_place_id;
FOR addr_item IN
SELECT (get_addr_tag_rank(key, country)).*, match_tokens, search_tokens
FROM token_get_address_tokens(token_info)
WHERE not search_tokens <@ parent_address_vector
SELECT (get_addr_tag_rank(key, country)).*, key,
token_get_address_search_tokens(token_info, key) as search_tokens
FROM token_get_address_keys(token_info) as key
WHERE not token_get_address_search_tokens(token_info, key) <@ parent_address_vector
LOOP
addr_place := get_address_place(in_partition, geometry,
addr_item.from_rank, addr_item.to_rank,
addr_item.extent, addr_item.match_tokens);
addr_item.extent, token_info, addr_item.key);
IF addr_place is null THEN
-- No place found in OSM that matches. Make it at least searchable.
@@ -447,14 +446,16 @@ BEGIN
FOR location IN
SELECT (get_address_place(partition, geometry, from_rank, to_rank,
extent, match_tokens)).*, search_tokens
FROM (SELECT (get_addr_tag_rank(key, country)).*, match_tokens, search_tokens
FROM token_get_address_tokens(token_info)) x
extent, token_info, key)).*, key
FROM (SELECT (get_addr_tag_rank(key, country)).*, key
FROM token_get_address_keys(token_info) as key) x
ORDER BY rank_address, distance, isguess desc
LOOP
IF location.place_id is null THEN
{% if not db.reverse_only %}
nameaddress_vector := array_merge(nameaddress_vector, location.search_tokens);
nameaddress_vector := array_merge(nameaddress_vector,
token_get_address_search_tokens(token_info,
location.key));
{% endif %}
ELSE
{% if not db.reverse_only %}
@@ -689,9 +690,6 @@ DECLARE
parent_address_level SMALLINT;
place_address_level SMALLINT;
addr_street INTEGER[];
addr_place INTEGER[];
max_rank SMALLINT;
name_vector INTEGER[];
@@ -860,8 +858,6 @@ BEGIN
END IF;
NEW.housenumber := token_normalized_housenumber(NEW.token_info);
addr_street := token_addr_street_match_tokens(NEW.token_info);
addr_place := token_addr_place_match_tokens(NEW.token_info);
NEW.postcode := null;
@@ -907,7 +903,7 @@ BEGIN
NEW.parent_place_id := find_parent_for_poi(NEW.osm_type, NEW.osm_id,
NEW.partition,
ST_Envelope(NEW.geometry),
addr_street, addr_place,
NEW.token_info,
is_place_address);
-- If we found the road take a shortcut here.

View File

@@ -215,13 +215,12 @@ LANGUAGE plpgsql STABLE;
-- Find the parent of an address with addr:street/addr:place tag.
--
-- \param street Value of addr:street or NULL if tag is missing.
-- \param place Value of addr:place or NULL if tag is missing.
-- \param token_info Naming info with the address information.
-- \param partition Partition where to search the parent.
-- \param centroid Location of the address.
--
-- \return Place ID of the parent if one was found, NULL otherwise.
CREATE OR REPLACE FUNCTION find_parent_for_address(street INTEGER[], place INTEGER[],
CREATE OR REPLACE FUNCTION find_parent_for_address(token_info JSONB,
partition SMALLINT,
centroid GEOMETRY)
RETURNS BIGINT
@@ -229,30 +228,22 @@ CREATE OR REPLACE FUNCTION find_parent_for_address(street INTEGER[], place INTEG
DECLARE
parent_place_id BIGINT;
BEGIN
IF street is not null THEN
-- Check for addr:street attributes
-- Note that addr:street links can only be indexed, once the street itself is indexed
parent_place_id := getNearestNamedRoadPlaceId(partition, centroid, street);
IF parent_place_id is not null THEN
{% if debug %}RAISE WARNING 'Get parent form addr:street: %', parent_place_id;{% endif %}
RETURN parent_place_id;
END IF;
-- Check for addr:street attributes
parent_place_id := getNearestNamedRoadPlaceId(partition, centroid, token_info);
IF parent_place_id is not null THEN
{% if debug %}RAISE WARNING 'Get parent from addr:street: %', parent_place_id;{% endif %}
RETURN parent_place_id;
END IF;
-- Check for addr:place attributes.
IF place is not null THEN
parent_place_id := getNearestNamedPlacePlaceId(partition, centroid, place);
IF parent_place_id is not null THEN
{% if debug %}RAISE WARNING 'Get parent form addr:place: %', parent_place_id;{% endif %}
RETURN parent_place_id;
END IF;
END IF;
RETURN NULL;
parent_place_id := getNearestNamedPlacePlaceId(partition, centroid, token_info);
{% if debug %}RAISE WARNING 'Get parent from addr:place: %', parent_place_id;{% endif %}
RETURN parent_place_id;
END;
$$
LANGUAGE plpgsql STABLE;
CREATE OR REPLACE FUNCTION delete_location(OLD_place_id BIGINT)
RETURNS BOOLEAN
AS $$

View File

@@ -14,7 +14,6 @@ DECLARE
out_partition INTEGER;
out_parent_place_id BIGINT;
location RECORD;
address_street_word_ids INTEGER[];
BEGIN
@@ -54,13 +53,9 @@ BEGIN
place_centroid := ST_Centroid(linegeo);
out_partition := get_partition('us');
out_parent_place_id := null;
address_street_word_ids := token_addr_street_match_tokens(token_info);
IF address_street_word_ids IS NOT NULL THEN
out_parent_place_id := getNearestNamedRoadPlaceId(out_partition, place_centroid,
address_street_word_ids);
END IF;
out_parent_place_id := getNearestNamedRoadPlaceId(out_partition, place_centroid,
token_info);
IF out_parent_place_id IS NULL THEN
SELECT getNearestParallelRoadFeature(out_partition, linegeo)

View File

@@ -34,40 +34,59 @@ AS $$
$$ LANGUAGE SQL IMMUTABLE STRICT;
CREATE OR REPLACE FUNCTION token_addr_street_match_tokens(info JSONB)
RETURNS INTEGER[]
CREATE OR REPLACE FUNCTION token_has_addr_street(info JSONB)
RETURNS BOOLEAN
AS $$
SELECT (info->>'street')::INTEGER[]
SELECT info->>'street' is not null;
$$ LANGUAGE SQL IMMUTABLE;
CREATE OR REPLACE FUNCTION token_has_addr_place(info JSONB)
RETURNS BOOLEAN
AS $$
SELECT info->>'place' is not null;
$$ LANGUAGE SQL IMMUTABLE;
CREATE OR REPLACE FUNCTION token_matches_street(info JSONB, street_tokens INTEGER[])
RETURNS BOOLEAN
AS $$
SELECT (info->>'street')::INTEGER[] <@ street_tokens
$$ LANGUAGE SQL IMMUTABLE STRICT;
CREATE OR REPLACE FUNCTION token_addr_place_match_tokens(info JSONB)
RETURNS INTEGER[]
CREATE OR REPLACE FUNCTION token_matches_place(info JSONB, place_tokens INTEGER[])
RETURNS BOOLEAN
AS $$
SELECT (info->>'place_match')::INTEGER[]
SELECT (info->>'place')::INTEGER[] <@ place_tokens
$$ LANGUAGE SQL IMMUTABLE STRICT;
CREATE OR REPLACE FUNCTION token_addr_place_search_tokens(info JSONB)
RETURNS INTEGER[]
AS $$
SELECT (info->>'place_search')::INTEGER[]
SELECT (info->>'place')::INTEGER[]
$$ LANGUAGE SQL IMMUTABLE STRICT;
DROP TYPE IF EXISTS token_addresstoken CASCADE;
CREATE TYPE token_addresstoken AS (
key TEXT,
match_tokens INT[],
search_tokens INT[]
);
CREATE OR REPLACE FUNCTION token_get_address_tokens(info JSONB)
RETURNS SETOF token_addresstoken
CREATE OR REPLACE FUNCTION token_get_address_keys(info JSONB)
RETURNS SETOF TEXT
AS $$
SELECT key, (value->>1)::int[] as match_tokens,
(value->>0)::int[] as search_tokens
FROM jsonb_each(info->'addr');
SELECT * FROM jsonb_object_keys(info->'addr');
$$ LANGUAGE SQL IMMUTABLE STRICT;
CREATE OR REPLACE FUNCTION token_get_address_search_tokens(info JSONB, key TEXT)
RETURNS INTEGER[]
AS $$
SELECT (info->'addr'->>key)::INTEGER[];
$$ LANGUAGE SQL IMMUTABLE STRICT;
CREATE OR REPLACE FUNCTION token_matches_address(info JSONB, key TEXT, tokens INTEGER[])
RETURNS BOOLEAN
AS $$
SELECT (info->'addr'->>key)::INTEGER[] <@ tokens;
$$ LANGUAGE SQL IMMUTABLE STRICT;
@@ -127,15 +146,34 @@ BEGIN
VALUES (term_id, term, 'w', json_build_object('count', term_count));
END IF;
IF term_count < {{ max_word_freq }} THEN
partial_tokens := array_merge(partial_tokens, ARRAY[term_id]);
END IF;
partial_tokens := array_merge(partial_tokens, ARRAY[term_id]);
END LOOP;
END;
$$
LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION getorcreate_partial_word(partial TEXT)
RETURNS INTEGER
AS $$
DECLARE
token INTEGER;
BEGIN
SELECT min(word_id) INTO token
FROM word WHERE word_token = partial and type = 'w';
IF token IS NULL THEN
token := nextval('seq_word');
INSERT INTO word (word_id, word_token, type, info)
VALUES (token, partial, 'w', json_build_object('count', 0));
END IF;
RETURN token;
END;
$$
LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION getorcreate_hnr_id(lookup_term TEXT)
RETURNS INTEGER
AS $$

View File

@@ -34,17 +34,31 @@ AS $$
$$ LANGUAGE SQL IMMUTABLE STRICT;
CREATE OR REPLACE FUNCTION token_addr_street_match_tokens(info JSONB)
RETURNS INTEGER[]
CREATE OR REPLACE FUNCTION token_has_addr_street(info JSONB)
RETURNS BOOLEAN
AS $$
SELECT (info->>'street')::INTEGER[]
SELECT info->>'street' is not null;
$$ LANGUAGE SQL IMMUTABLE;
CREATE OR REPLACE FUNCTION token_has_addr_place(info JSONB)
RETURNS BOOLEAN
AS $$
SELECT info->>'place_match' is not null;
$$ LANGUAGE SQL IMMUTABLE;
CREATE OR REPLACE FUNCTION token_matches_street(info JSONB, street_tokens INTEGER[])
RETURNS BOOLEAN
AS $$
SELECT (info->>'street')::INTEGER[] && street_tokens
$$ LANGUAGE SQL IMMUTABLE STRICT;
CREATE OR REPLACE FUNCTION token_addr_place_match_tokens(info JSONB)
RETURNS INTEGER[]
CREATE OR REPLACE FUNCTION token_matches_place(info JSONB, place_tokens INTEGER[])
RETURNS BOOLEAN
AS $$
SELECT (info->>'place_match')::INTEGER[]
SELECT (info->>'place_match')::INTEGER[] && place_tokens
$$ LANGUAGE SQL IMMUTABLE STRICT;
@@ -55,19 +69,24 @@ AS $$
$$ LANGUAGE SQL IMMUTABLE STRICT;
DROP TYPE IF EXISTS token_addresstoken CASCADE;
CREATE TYPE token_addresstoken AS (
key TEXT,
match_tokens INT[],
search_tokens INT[]
);
CREATE OR REPLACE FUNCTION token_get_address_tokens(info JSONB)
RETURNS SETOF token_addresstoken
CREATE OR REPLACE FUNCTION token_get_address_keys(info JSONB)
RETURNS SETOF TEXT
AS $$
SELECT key, (value->>1)::int[] as match_tokens,
(value->>0)::int[] as search_tokens
FROM jsonb_each(info->'addr');
SELECT * FROM jsonb_object_keys(info->'addr');
$$ LANGUAGE SQL IMMUTABLE STRICT;
CREATE OR REPLACE FUNCTION token_get_address_search_tokens(info JSONB, key TEXT)
RETURNS INTEGER[]
AS $$
SELECT (info->'addr'->key->>0)::INTEGER[];
$$ LANGUAGE SQL IMMUTABLE STRICT;
CREATE OR REPLACE FUNCTION token_matches_address(info JSONB, key TEXT, tokens INTEGER[])
RETURNS BOOLEAN
AS $$
SELECT (info->'addr'->key->>1)::INTEGER[] && tokens;
$$ LANGUAGE SQL IMMUTABLE STRICT;