From 59fe74ddf6749d93c93e88b1aeff0eb59a8e03ec Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Wed, 22 Sep 2021 22:20:02 +0200 Subject: [PATCH 1/5] move name matching into tokenizer module Instead of requesting the match tokens from the tokenizer when looking for parent streets/places and address parts, hand in the saved tokens and ask if they match. This gives the tokenizer more freedom to decide how name matching should be done. --- lib-sql/functions/interpolation.sql | 9 ++-- lib-sql/functions/partition-functions.sql | 18 +++++--- lib-sql/functions/placex_triggers.sql | 32 ++++++------- lib-sql/functions/utils.sql | 31 +++++-------- lib-sql/tiger_import_start.sql | 9 +--- lib-sql/tokenizer/icu_tokenizer.sql | 55 +++++++++++++++-------- lib-sql/tokenizer/legacy_tokenizer.sql | 55 +++++++++++++++-------- 7 files changed, 117 insertions(+), 92 deletions(-) diff --git a/lib-sql/functions/interpolation.sql b/lib-sql/functions/interpolation.sql index 55e44dfd..4ef36f4f 100644 --- a/lib-sql/functions/interpolation.sql +++ b/lib-sql/functions/interpolation.sql @@ -43,7 +43,7 @@ LANGUAGE plpgsql STABLE; -- find the parent road of the cut road parts -CREATE OR REPLACE FUNCTION get_interpolation_parent(street INTEGER[], place INTEGER[], +CREATE OR REPLACE FUNCTION get_interpolation_parent(token_info JSONB, partition SMALLINT, centroid GEOMETRY, geom GEOMETRY) RETURNS BIGINT @@ -52,7 +52,7 @@ DECLARE parent_place_id BIGINT; location RECORD; BEGIN - parent_place_id := find_parent_for_address(street, place, partition, centroid); + parent_place_id := find_parent_for_address(token_info, partition, centroid); IF parent_place_id is null THEN FOR location IN SELECT place_id FROM placex @@ -155,9 +155,8 @@ BEGIN NEW.interpolationtype = NEW.address->'interpolation'; place_centroid := ST_PointOnSurface(NEW.linegeo); - NEW.parent_place_id = get_interpolation_parent(token_addr_street_match_tokens(NEW.token_info), - token_addr_place_match_tokens(NEW.token_info), - NEW.partition, place_centroid, NEW.linegeo); + NEW.parent_place_id = get_interpolation_parent(NEW.token_info, NEW.partition, + place_centroid, NEW.linegeo); interpol_postcode := token_normalized_postcode(NEW.address->'postcode'); diff --git a/lib-sql/functions/partition-functions.sql b/lib-sql/functions/partition-functions.sql index 53aba22c..97afec15 100644 --- a/lib-sql/functions/partition-functions.sql +++ b/lib-sql/functions/partition-functions.sql @@ -66,7 +66,7 @@ LANGUAGE plpgsql STABLE; CREATE OR REPLACE FUNCTION get_address_place(in_partition SMALLINT, feature GEOMETRY, from_rank SMALLINT, to_rank SMALLINT, - extent FLOAT, tokens INT[]) + extent FLOAT, token_info JSONB, key TEXT) RETURNS nearfeaturecentr AS $$ DECLARE @@ -80,7 +80,7 @@ BEGIN FROM location_area_large_{{ partition }} WHERE geometry && ST_Expand(feature, extent) AND rank_address between from_rank and to_rank - AND tokens && keywords + AND token_matches_address(token_info, key, keywords) GROUP BY place_id, keywords, rank_address, rank_search, isguess, postcode, centroid ORDER BY bool_or(ST_Intersects(geometry, feature)), distance LIMIT 1; RETURN r; @@ -148,18 +148,21 @@ LANGUAGE plpgsql; CREATE OR REPLACE FUNCTION getNearestNamedRoadPlaceId(in_partition INTEGER, point GEOMETRY, - isin_token INTEGER[]) + token_info JSONB) RETURNS BIGINT AS $$ DECLARE parent BIGINT; BEGIN + IF not token_has_addr_street(token_info) THEN + RETURN NULL; + END IF; {% for partition in db.partitions %} IF in_partition = {{ partition }} THEN SELECT place_id FROM search_name_{{ partition }} INTO parent 
- WHERE name_vector && isin_token + WHERE token_matches_street(token_info, name_vector) AND centroid && ST_Expand(point, 0.015) AND address_rank between 26 and 27 ORDER BY ST_Distance(centroid, point) ASC limit 1; @@ -174,19 +177,22 @@ LANGUAGE plpgsql STABLE; CREATE OR REPLACE FUNCTION getNearestNamedPlacePlaceId(in_partition INTEGER, point GEOMETRY, - isin_token INTEGER[]) + token_info JSONB) RETURNS BIGINT AS $$ DECLARE parent BIGINT; BEGIN + IF not token_has_addr_place(token_info) THEN + RETURN NULL; + END IF; {% for partition in db.partitions %} IF in_partition = {{ partition }} THEN SELECT place_id INTO parent FROM search_name_{{ partition }} - WHERE name_vector && isin_token + WHERE token_matches_place(token_info, name_vector) AND centroid && ST_Expand(point, 0.04) AND address_rank between 16 and 25 ORDER BY ST_Distance(centroid, point) ASC limit 1; diff --git a/lib-sql/functions/placex_triggers.sql b/lib-sql/functions/placex_triggers.sql index fa7156ec..9c2a67a1 100644 --- a/lib-sql/functions/placex_triggers.sql +++ b/lib-sql/functions/placex_triggers.sql @@ -104,8 +104,7 @@ CREATE OR REPLACE FUNCTION find_parent_for_poi(poi_osm_type CHAR(1), poi_osm_id BIGINT, poi_partition SMALLINT, bbox GEOMETRY, - addr_street INTEGER[], - addr_place INTEGER[], + token_info JSONB, is_place_addr BOOLEAN) RETURNS BIGINT AS $$ @@ -119,8 +118,7 @@ BEGIN parent_place_id := find_associated_street(poi_osm_type, poi_osm_id); IF parent_place_id is null THEN - parent_place_id := find_parent_for_address(addr_street, addr_place, - poi_partition, bbox); + parent_place_id := find_parent_for_address(token_info, poi_partition, bbox); END IF; IF parent_place_id is null and poi_osm_type = 'N' THEN @@ -333,13 +331,14 @@ BEGIN WHERE s.place_id = parent_place_id; FOR addr_item IN - SELECT (get_addr_tag_rank(key, country)).*, match_tokens, search_tokens - FROM token_get_address_tokens(token_info) - WHERE not search_tokens <@ parent_address_vector + SELECT (get_addr_tag_rank(key, country)).*, key, + token_get_address_search_tokens(token_info, key) as search_tokens + FROM token_get_address_keys(token_info) as key + WHERE not token_get_address_search_tokens(token_info, key) <@ parent_address_vector LOOP addr_place := get_address_place(in_partition, geometry, addr_item.from_rank, addr_item.to_rank, - addr_item.extent, addr_item.match_tokens); + addr_item.extent, token_info, addr_item.key); IF addr_place is null THEN -- No place found in OSM that matches. Make it at least searchable. 
@@ -447,14 +446,16 @@ BEGIN FOR location IN SELECT (get_address_place(partition, geometry, from_rank, to_rank, - extent, match_tokens)).*, search_tokens - FROM (SELECT (get_addr_tag_rank(key, country)).*, match_tokens, search_tokens - FROM token_get_address_tokens(token_info)) x + extent, token_info, key)).*, key + FROM (SELECT (get_addr_tag_rank(key, country)).*, key + FROM token_get_address_keys(token_info) as key) x ORDER BY rank_address, distance, isguess desc LOOP IF location.place_id is null THEN {% if not db.reverse_only %} - nameaddress_vector := array_merge(nameaddress_vector, location.search_tokens); + nameaddress_vector := array_merge(nameaddress_vector, + token_get_address_search_tokens(token_info, + location.key)); {% endif %} ELSE {% if not db.reverse_only %} @@ -689,9 +690,6 @@ DECLARE parent_address_level SMALLINT; place_address_level SMALLINT; - addr_street INTEGER[]; - addr_place INTEGER[]; - max_rank SMALLINT; name_vector INTEGER[]; @@ -860,8 +858,6 @@ BEGIN END IF; NEW.housenumber := token_normalized_housenumber(NEW.token_info); - addr_street := token_addr_street_match_tokens(NEW.token_info); - addr_place := token_addr_place_match_tokens(NEW.token_info); NEW.postcode := null; @@ -907,7 +903,7 @@ BEGIN NEW.parent_place_id := find_parent_for_poi(NEW.osm_type, NEW.osm_id, NEW.partition, ST_Envelope(NEW.geometry), - addr_street, addr_place, + NEW.token_info, is_place_address); -- If we found the road take a shortcut here. diff --git a/lib-sql/functions/utils.sql b/lib-sql/functions/utils.sql index c308d025..f7d2093c 100644 --- a/lib-sql/functions/utils.sql +++ b/lib-sql/functions/utils.sql @@ -215,13 +215,12 @@ LANGUAGE plpgsql STABLE; -- Find the parent of an address with addr:street/addr:place tag. -- --- \param street Value of addr:street or NULL if tag is missing. --- \param place Value of addr:place or NULL if tag is missing. +-- \param token_info Naming info with the address information. -- \param partition Partition where to search the parent. -- \param centroid Location of the address. -- -- \return Place ID of the parent if one was found, NULL otherwise. -CREATE OR REPLACE FUNCTION find_parent_for_address(street INTEGER[], place INTEGER[], +CREATE OR REPLACE FUNCTION find_parent_for_address(token_info JSONB, partition SMALLINT, centroid GEOMETRY) RETURNS BIGINT @@ -229,30 +228,22 @@ CREATE OR REPLACE FUNCTION find_parent_for_address(street INTEGER[], place INTEG DECLARE parent_place_id BIGINT; BEGIN - IF street is not null THEN - -- Check for addr:street attributes - -- Note that addr:street links can only be indexed, once the street itself is indexed - parent_place_id := getNearestNamedRoadPlaceId(partition, centroid, street); - IF parent_place_id is not null THEN - {% if debug %}RAISE WARNING 'Get parent form addr:street: %', parent_place_id;{% endif %} - RETURN parent_place_id; - END IF; + -- Check for addr:street attributes + parent_place_id := getNearestNamedRoadPlaceId(partition, centroid, token_info); + IF parent_place_id is not null THEN + {% if debug %}RAISE WARNING 'Get parent from addr:street: %', parent_place_id;{% endif %} + RETURN parent_place_id; END IF; -- Check for addr:place attributes. 
- IF place is not null THEN - parent_place_id := getNearestNamedPlacePlaceId(partition, centroid, place); - IF parent_place_id is not null THEN - {% if debug %}RAISE WARNING 'Get parent form addr:place: %', parent_place_id;{% endif %} - RETURN parent_place_id; - END IF; - END IF; - - RETURN NULL; + parent_place_id := getNearestNamedPlacePlaceId(partition, centroid, token_info); + {% if debug %}RAISE WARNING 'Get parent from addr:place: %', parent_place_id;{% endif %} + RETURN parent_place_id; END; $$ LANGUAGE plpgsql STABLE; + CREATE OR REPLACE FUNCTION delete_location(OLD_place_id BIGINT) RETURNS BOOLEAN AS $$ diff --git a/lib-sql/tiger_import_start.sql b/lib-sql/tiger_import_start.sql index faa4efbb..f344e174 100644 --- a/lib-sql/tiger_import_start.sql +++ b/lib-sql/tiger_import_start.sql @@ -14,7 +14,6 @@ DECLARE out_partition INTEGER; out_parent_place_id BIGINT; location RECORD; - address_street_word_ids INTEGER[]; BEGIN @@ -54,13 +53,9 @@ BEGIN place_centroid := ST_Centroid(linegeo); out_partition := get_partition('us'); - out_parent_place_id := null; - address_street_word_ids := token_addr_street_match_tokens(token_info); - IF address_street_word_ids IS NOT NULL THEN - out_parent_place_id := getNearestNamedRoadPlaceId(out_partition, place_centroid, - address_street_word_ids); - END IF; + out_parent_place_id := getNearestNamedRoadPlaceId(out_partition, place_centroid, + token_info); IF out_parent_place_id IS NULL THEN SELECT getNearestParallelRoadFeature(out_partition, linegeo) diff --git a/lib-sql/tokenizer/icu_tokenizer.sql b/lib-sql/tokenizer/icu_tokenizer.sql index ffe6648c..230cb2ea 100644 --- a/lib-sql/tokenizer/icu_tokenizer.sql +++ b/lib-sql/tokenizer/icu_tokenizer.sql @@ -34,17 +34,31 @@ AS $$ $$ LANGUAGE SQL IMMUTABLE STRICT; -CREATE OR REPLACE FUNCTION token_addr_street_match_tokens(info JSONB) - RETURNS INTEGER[] +CREATE OR REPLACE FUNCTION token_has_addr_street(info JSONB) + RETURNS BOOLEAN AS $$ - SELECT (info->>'street')::INTEGER[] + SELECT info->>'street' is not null; +$$ LANGUAGE SQL IMMUTABLE; + + +CREATE OR REPLACE FUNCTION token_has_addr_place(info JSONB) + RETURNS BOOLEAN +AS $$ + SELECT info->>'place_match' is not null; +$$ LANGUAGE SQL IMMUTABLE; + + +CREATE OR REPLACE FUNCTION token_matches_street(info JSONB, street_tokens INTEGER[]) + RETURNS BOOLEAN +AS $$ + SELECT (info->>'street')::INTEGER[] && street_tokens $$ LANGUAGE SQL IMMUTABLE STRICT; -CREATE OR REPLACE FUNCTION token_addr_place_match_tokens(info JSONB) - RETURNS INTEGER[] +CREATE OR REPLACE FUNCTION token_matches_place(info JSONB, place_tokens INTEGER[]) + RETURNS BOOLEAN AS $$ - SELECT (info->>'place_match')::INTEGER[] + SELECT (info->>'place_match')::INTEGER[] && place_tokens $$ LANGUAGE SQL IMMUTABLE STRICT; @@ -55,19 +69,24 @@ AS $$ $$ LANGUAGE SQL IMMUTABLE STRICT; -DROP TYPE IF EXISTS token_addresstoken CASCADE; -CREATE TYPE token_addresstoken AS ( - key TEXT, - match_tokens INT[], - search_tokens INT[] -); - -CREATE OR REPLACE FUNCTION token_get_address_tokens(info JSONB) - RETURNS SETOF token_addresstoken +CREATE OR REPLACE FUNCTION token_get_address_keys(info JSONB) + RETURNS SETOF TEXT AS $$ - SELECT key, (value->>1)::int[] as match_tokens, - (value->>0)::int[] as search_tokens - FROM jsonb_each(info->'addr'); + SELECT * FROM jsonb_object_keys(info->'addr'); +$$ LANGUAGE SQL IMMUTABLE STRICT; + + +CREATE OR REPLACE FUNCTION token_get_address_search_tokens(info JSONB, key TEXT) + RETURNS INTEGER[] +AS $$ + SELECT (info->'addr'->key->>0)::INTEGER[]; +$$ LANGUAGE SQL IMMUTABLE STRICT; + + 
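+-- Note on the layout assumed by the accessors above and below:
+-- info->'addr' maps each address key to a two-element array, with the
+-- search tokens at index 0 and the match tokens at index 1.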
+CREATE OR REPLACE FUNCTION token_matches_address(info JSONB, key TEXT, tokens INTEGER[]) + RETURNS BOOLEAN +AS $$ + SELECT (info->'addr'->key->>1)::INTEGER[] && tokens; $$ LANGUAGE SQL IMMUTABLE STRICT; diff --git a/lib-sql/tokenizer/legacy_tokenizer.sql b/lib-sql/tokenizer/legacy_tokenizer.sql index a2c6b520..2b734e6f 100644 --- a/lib-sql/tokenizer/legacy_tokenizer.sql +++ b/lib-sql/tokenizer/legacy_tokenizer.sql @@ -34,17 +34,31 @@ AS $$ $$ LANGUAGE SQL IMMUTABLE STRICT; -CREATE OR REPLACE FUNCTION token_addr_street_match_tokens(info JSONB) - RETURNS INTEGER[] +CREATE OR REPLACE FUNCTION token_has_addr_street(info JSONB) + RETURNS BOOLEAN AS $$ - SELECT (info->>'street')::INTEGER[] + SELECT info->>'street' is not null; +$$ LANGUAGE SQL IMMUTABLE; + + +CREATE OR REPLACE FUNCTION token_has_addr_place(info JSONB) + RETURNS BOOLEAN +AS $$ + SELECT info->>'place_match' is not null; +$$ LANGUAGE SQL IMMUTABLE; + + +CREATE OR REPLACE FUNCTION token_matches_street(info JSONB, street_tokens INTEGER[]) + RETURNS BOOLEAN +AS $$ + SELECT (info->>'street')::INTEGER[] && street_tokens $$ LANGUAGE SQL IMMUTABLE STRICT; -CREATE OR REPLACE FUNCTION token_addr_place_match_tokens(info JSONB) - RETURNS INTEGER[] +CREATE OR REPLACE FUNCTION token_matches_place(info JSONB, place_tokens INTEGER[]) + RETURNS BOOLEAN AS $$ - SELECT (info->>'place_match')::INTEGER[] + SELECT (info->>'place_match')::INTEGER[] && place_tokens $$ LANGUAGE SQL IMMUTABLE STRICT; @@ -55,19 +69,24 @@ AS $$ $$ LANGUAGE SQL IMMUTABLE STRICT; -DROP TYPE IF EXISTS token_addresstoken CASCADE; -CREATE TYPE token_addresstoken AS ( - key TEXT, - match_tokens INT[], - search_tokens INT[] -); - -CREATE OR REPLACE FUNCTION token_get_address_tokens(info JSONB) - RETURNS SETOF token_addresstoken +CREATE OR REPLACE FUNCTION token_get_address_keys(info JSONB) + RETURNS SETOF TEXT AS $$ - SELECT key, (value->>1)::int[] as match_tokens, - (value->>0)::int[] as search_tokens - FROM jsonb_each(info->'addr'); + SELECT * FROM jsonb_object_keys(info->'addr'); +$$ LANGUAGE SQL IMMUTABLE STRICT; + + +CREATE OR REPLACE FUNCTION token_get_address_search_tokens(info JSONB, key TEXT) + RETURNS INTEGER[] +AS $$ + SELECT (info->'addr'->key->>0)::INTEGER[]; +$$ LANGUAGE SQL IMMUTABLE STRICT; + + +CREATE OR REPLACE FUNCTION token_matches_address(info JSONB, key TEXT, tokens INTEGER[]) + RETURNS BOOLEAN +AS $$ + SELECT (info->'addr'->key->>1)::INTEGER[] && tokens; $$ LANGUAGE SQL IMMUTABLE STRICT; From c6fdcf9b0d0fca42d0d5f69f0fc469259e17ca24 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Wed, 22 Sep 2021 22:54:14 +0200 Subject: [PATCH 2/5] adapt documentation for SQL tokenizer interface --- docs/develop/Tokenizers.md | 61 +++++++++++++++++++++----------------- 1 file changed, 34 insertions(+), 27 deletions(-) diff --git a/docs/develop/Tokenizers.md b/docs/develop/Tokenizers.md index 529315e4..5282db1a 100644 --- a/docs/develop/Tokenizers.md +++ b/docs/develop/Tokenizers.md @@ -190,22 +190,21 @@ be listed with a semicolon as delimiter. Must be NULL when the place has no house numbers. ```sql -FUNCTION token_addr_street_match_tokens(info JSONB) RETURNS INTEGER[] +FUNCTION token_matches_street(info JSONB, street_tokens INTEGER[]) RETURNS BOOLEAN ``` -Return the match token IDs by which to search a matching street from the -`addr:street` tag. These IDs will be matched against the IDs supplied by -`token_get_name_match_tokens`. Must be NULL when the place has no `addr:street` -tag. 
+Check if the given tokens (previously saved from `token_get_name_match_tokens()`)
+match against the `addr:street` tag name. Must return either NULL or FALSE
+when the place has no `addr:street` tag.
 
 ```sql
-FUNCTION token_addr_place_match_tokens(info JSONB) RETURNS INTEGER[]
+FUNCTION token_matches_place(info JSONB, place_tokens INTEGER[]) RETURNS BOOLEAN
 ```
 
-Return the match token IDs by which to search a matching place from the
-`addr:place` tag. These IDs will be matched against the IDs supplied by
-`token_get_name_match_tokens`. Must be NULL when the place has no `addr:place`
-tag.
+Check if the given tokens (previously saved from `token_get_name_match_tokens()`)
+match against the `addr:place` tag name. Must return either NULL or FALSE
+when the place has no `addr:place` tag.
+
 
 ```sql
 FUNCTION token_addr_place_search_tokens(info JSONB) RETURNS INTEGER[]
@@ -216,26 +215,34 @@ are used for searches by address when no matching place can be found in the
 database. Must be NULL when the place has no `addr:place` tag.
 
 ```sql
-CREATE TYPE token_addresstoken AS (
-  key TEXT,
-  match_tokens INT[],
-  search_tokens INT[]
-);
-
-FUNCTION token_get_address_tokens(info JSONB) RETURNS SETOF token_addresstoken
+FUNCTION token_get_address_keys(info JSONB) RETURNS SETOF TEXT
 ```
 
-Return the match and search token IDs for explicit `addr:*` tags for the place
-other than `addr:street` and `addr:place`. For each address item there are
-three pieces of information returned:
+Return the set of keys for which address information is provided. This
+should correspond to the list of (relevant) `addr:*` tags with the `addr:`
+prefix removed or the keys used in the `address` dictionary of the place info.
 
- * _key_ contains the type of address item (city, county, etc.). This is the
-   key handed in with the `address` dictionary.
- * *match_tokens* is the list of token IDs used to find the corresponding
-   place object for the address part. The list is matched against the IDs
-   from `token_get_name_match_tokens`.
- * *search_tokens* is the list of token IDs under which to search the address
-   item. It is used when no corresponding place object was found.
+```sql
+FUNCTION token_get_address_search_tokens(info JSONB, key TEXT) RETURNS INTEGER[]
+```
+
+Return the array of search tokens for the given address part. `key` can be
+expected to be one of those returned with `token_get_address_keys()`. The
+search tokens are added to the address search vector of the place when no
+corresponding OSM object could be found for the given address part from which
+to copy the name information.
+
+```sql
+FUNCTION token_matches_address(info JSONB, key TEXT, tokens INTEGER[]) RETURNS BOOLEAN
+```
+
+Check if the given tokens match against the address part `key`.
+
+__Warning:__ the tokens that are handed in are the lists previously saved
+from `token_get_name_search_tokens()`, _not_ from the match token list. This
+is a historical oddity which will be fixed at some point in the future.
+Currently, tokenizers are encouraged to make sure that matching works against
+both the search token list and the match token list.
 
 ```sql
 FUNCTION token_normalized_postcode(postcode TEXT) RETURNS TEXT

From bd7c7ddad04627d2bf402ebf20d6a8413a331320 Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann
Date: Thu, 23 Sep 2021 16:57:24 +0200
Subject: [PATCH 3/5] icu tokenizer: switch to matching against partial names

When matching address parts from addr:* tags against place names, the
address names were so far converted to full names, which were then
compared to the place names.
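For illustration, with full-name matching (the token IDs below are
invented for the example), a tag addr:street=Grand Road was stored as
a single full-name match token and compared to the name vectors of
nearby streets with an array-overlap test:

    -- full-name matching as set up by the first patch in this series
    -- info->>'street' holds e.g. '{612}', 612 standing for '#grand road'
    SELECT (info->>'street')::INTEGER[] && street_tokens;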
This can become problematic with the new ICU tokenizer once we introduce creation of different variants depending on the place name context. It wouldn't be clear which variant to produce to get a match, so we would have to create all of them. To work around this issue, switch to using the partial terms for matching. This introduces a larger fuzziness between matches but that shouldn't be a problem because matching is always geographically restricted. The search terms created for address parts have a different problem: they are already created before we even know if they are going to be used. This can lead to spurious entries in the word table, which slows down searching. This problem can also be circumvented by using only partial terms for the search terms. In terms of searching that means that the address terms would not get the full-word boost, but given that the case where an address part does not exist as an OSM object should be the exception, this is likely acceptable. --- lib-sql/tokenizer/icu_tokenizer.sql | 37 +++++++++++---- nominatim/tokenizer/icu_tokenizer.py | 65 ++++++++++++++++---------- test/bdd/db/import/search_name.feature | 15 ------ 3 files changed, 68 insertions(+), 49 deletions(-) diff --git a/lib-sql/tokenizer/icu_tokenizer.sql b/lib-sql/tokenizer/icu_tokenizer.sql index 230cb2ea..6092319a 100644 --- a/lib-sql/tokenizer/icu_tokenizer.sql +++ b/lib-sql/tokenizer/icu_tokenizer.sql @@ -44,28 +44,28 @@ $$ LANGUAGE SQL IMMUTABLE; CREATE OR REPLACE FUNCTION token_has_addr_place(info JSONB) RETURNS BOOLEAN AS $$ - SELECT info->>'place_match' is not null; + SELECT info->>'place' is not null; $$ LANGUAGE SQL IMMUTABLE; CREATE OR REPLACE FUNCTION token_matches_street(info JSONB, street_tokens INTEGER[]) RETURNS BOOLEAN AS $$ - SELECT (info->>'street')::INTEGER[] && street_tokens + SELECT (info->>'street')::INTEGER[] <@ street_tokens $$ LANGUAGE SQL IMMUTABLE STRICT; CREATE OR REPLACE FUNCTION token_matches_place(info JSONB, place_tokens INTEGER[]) RETURNS BOOLEAN AS $$ - SELECT (info->>'place_match')::INTEGER[] && place_tokens + SELECT (info->>'place')::INTEGER[] <@ place_tokens $$ LANGUAGE SQL IMMUTABLE STRICT; CREATE OR REPLACE FUNCTION token_addr_place_search_tokens(info JSONB) RETURNS INTEGER[] AS $$ - SELECT (info->>'place_search')::INTEGER[] + SELECT (info->>'place')::INTEGER[] $$ LANGUAGE SQL IMMUTABLE STRICT; @@ -79,14 +79,14 @@ $$ LANGUAGE SQL IMMUTABLE STRICT; CREATE OR REPLACE FUNCTION token_get_address_search_tokens(info JSONB, key TEXT) RETURNS INTEGER[] AS $$ - SELECT (info->'addr'->key->>0)::INTEGER[]; + SELECT (info->'addr'->>key)::INTEGER[]; $$ LANGUAGE SQL IMMUTABLE STRICT; CREATE OR REPLACE FUNCTION token_matches_address(info JSONB, key TEXT, tokens INTEGER[]) RETURNS BOOLEAN AS $$ - SELECT (info->'addr'->key->>1)::INTEGER[] && tokens; + SELECT (info->'addr'->>key)::INTEGER[] <@ tokens; $$ LANGUAGE SQL IMMUTABLE STRICT; @@ -146,15 +146,34 @@ BEGIN VALUES (term_id, term, 'w', json_build_object('count', term_count)); END IF; - IF term_count < {{ max_word_freq }} THEN - partial_tokens := array_merge(partial_tokens, ARRAY[term_id]); - END IF; + partial_tokens := array_merge(partial_tokens, ARRAY[term_id]); END LOOP; END; $$ LANGUAGE plpgsql; +CREATE OR REPLACE FUNCTION getorcreate_partial_word(partial TEXT) + RETURNS INTEGER + AS $$ +DECLARE + token INTEGER; +BEGIN + SELECT min(word_id) INTO token + FROM word WHERE word_token = partial and type = 'w'; + + IF token IS NULL THEN + token := nextval('seq_word'); + INSERT INTO word (word_id, word_token, type, info) + 
       VALUES (token, partial, 'w', json_build_object('count', 0));
+  END IF;
+
+  RETURN token;
+END;
+$$
+LANGUAGE plpgsql;
+
+
 CREATE OR REPLACE FUNCTION getorcreate_hnr_id(lookup_term TEXT)
   RETURNS INTEGER
   AS $$
diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py
index 61263678..22f5e78f 100644
--- a/nominatim/tokenizer/icu_tokenizer.py
+++ b/nominatim/tokenizer/icu_tokenizer.py
@@ -17,7 +17,6 @@ from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
 from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules
 from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
 
-DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
 DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"
 
 LOG = logging.getLogger()
@@ -39,7 +38,6 @@ class LegacyICUTokenizer(AbstractTokenizer):
         self.data_dir = data_dir
         self.naming_rules = None
         self.term_normalization = None
-        self.max_word_frequency = None
 
 
     def init_new_db(self, config, init_db=True):
@@ -52,7 +50,6 @@
                                                config='TOKENIZER_CONFIG'))
         self.naming_rules = ICUNameProcessorRules(loader=loader)
         self.term_normalization = config.TERM_NORMALIZATION
-        self.max_word_frequency = config.MAX_WORD_FREQUENCY
 
         self._install_php(config.lib_dir.php)
         self._save_config(config)
@@ -68,7 +65,6 @@
         with connect(self.dsn) as conn:
             self.naming_rules = ICUNameProcessorRules(conn=conn)
             self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION)
-            self.max_word_frequency = get_property(conn, DBCFG_MAXWORDFREQ)
 
 
     def finalize_import(self, _):
@@ -81,10 +77,8 @@
         """ Reimport the SQL functions for this tokenizer.
         """
         with connect(self.dsn) as conn:
-            max_word_freq = get_property(conn, DBCFG_MAXWORDFREQ)
             sqlp = SQLPreprocessor(conn, config)
-            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql',
-                              max_word_freq=max_word_freq)
+            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')
 
 
     def check_database(self):
@@ -122,7 +116,7 @@
         php_file = self.data_dir / "tokenizer.php"
         php_file.write_text(dedent(f"""\

From: Sarah Hoffmann
Date: Mon, 27 Sep 2021 14:58:43 +0200
Subject: [PATCH 4/5] remove unused parameter

---
 nominatim/tokenizer/icu_tokenizer.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py
index 22f5e78f..5768fd35 100644
--- a/nominatim/tokenizer/icu_tokenizer.py
+++ b/nominatim/tokenizer/icu_tokenizer.py
@@ -52,7 +52,7 @@ class LegacyICUTokenizer(AbstractTokenizer):
         self.term_normalization = config.TERM_NORMALIZATION
 
         self._install_php(config.lib_dir.php)
-        self._save_config(config)
+        self._save_config()
 
         if init_db:
             self.update_sql_functions(config)
@@ -122,7 +122,7 @@
             require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))
 
-    def _save_config(self, config):
+    def _save_config(self):
         """ Save the configuration that needs to remain stable for
             the given database as database properties.
""" From 09c9fad6c31733065492306c9ee3c052e50ac115 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Mon, 27 Sep 2021 17:36:23 +0200 Subject: [PATCH 5/5] adapt tests to new ICU address token handling --- test/python/test_tokenizer_icu.py | 44 ++++++++++++++----------------- 1 file changed, 20 insertions(+), 24 deletions(-) diff --git a/test/python/test_tokenizer_icu.py b/test/python/test_tokenizer_icu.py index b7101c3f..ed079269 100644 --- a/test/python/test_tokenizer_icu.py +++ b/test/python/test_tokenizer_icu.py @@ -10,6 +10,7 @@ from nominatim.tokenizer import icu_tokenizer from nominatim.tokenizer.icu_name_processor import ICUNameProcessorRules from nominatim.tokenizer.icu_rule_loader import ICURuleLoader from nominatim.db import properties +from nominatim.db.sql_preprocessor import SQLPreprocessor from mock_icu_word_table import MockIcuWordTable @@ -76,6 +77,15 @@ def analyzer(tokenizer_factory, test_config, monkeypatch, return _mk_analyser +@pytest.fixture +def sql_functions(temp_db_conn, def_config, src_dir): + orig_sql = def_config.lib_dir.sql + def_config.lib_dir.sql = src_dir / 'lib-sql' + sqlproc = SQLPreprocessor(temp_db_conn, def_config) + sqlproc.run_sql_file(temp_db_conn, 'functions/utils.sql') + sqlproc.run_sql_file(temp_db_conn, 'tokenizer/icu_tokenizer.sql') + def_config.lib_dir.sql = orig_sql + @pytest.fixture def getorcreate_full_word(temp_db_cursor): @@ -144,7 +154,6 @@ def test_init_new(tokenizer_factory, test_config, monkeypatch, db_prop): tok.init_new_db(test_config) assert db_prop(icu_tokenizer.DBCFG_TERM_NORMALIZATION) == ':: lower();' - assert db_prop(icu_tokenizer.DBCFG_MAXWORDFREQ) is not None def test_init_word_table(tokenizer_factory, test_config, place_row, word_table): @@ -163,7 +172,6 @@ def test_init_word_table(tokenizer_factory, test_config, place_row, word_table): def test_init_from_project(monkeypatch, test_config, tokenizer_factory): monkeypatch.setenv('NOMINATIM_TERM_NORMALIZATION', ':: lower();') - monkeypatch.setenv('NOMINATIM_MAX_WORD_FREQUENCY', '90300') tok = tokenizer_factory() tok.init_new_db(test_config) monkeypatch.undo() @@ -173,23 +181,18 @@ def test_init_from_project(monkeypatch, test_config, tokenizer_factory): assert tok.naming_rules is not None assert tok.term_normalization == ':: lower();' - assert tok.max_word_frequency == '90300' def test_update_sql_functions(db_prop, temp_db_cursor, tokenizer_factory, test_config, table_factory, monkeypatch): - monkeypatch.setenv('NOMINATIM_MAX_WORD_FREQUENCY', '1133') tok = tokenizer_factory() tok.init_new_db(test_config) - monkeypatch.undo() - - assert db_prop(icu_tokenizer.DBCFG_MAXWORDFREQ) == '1133' table_factory('test', 'txt TEXT') func_file = test_config.lib_dir.sql / 'tokenizer' / 'icu_tokenizer.sql' - func_file.write_text("""INSERT INTO test VALUES ('{{max_word_freq}}')""") + func_file.write_text("""INSERT INTO test VALUES (1133)""") tok.update_sql_functions(test_config) @@ -304,7 +307,7 @@ def test_add_country_names_extend(analyzer, word_table): class TestPlaceNames: @pytest.fixture(autouse=True) - def setup(self, analyzer, getorcreate_full_word): + def setup(self, analyzer, sql_functions): with analyzer() as anl: self.analyzer = anl yield anl @@ -351,7 +354,7 @@ class TestPlaceNames: class TestPlaceAddress: @pytest.fixture(autouse=True) - def setup(self, analyzer, getorcreate_full_word): + def setup(self, analyzer, sql_functions): with analyzer(trans=(":: upper()", "'🜵' > ' '")) as anl: self.analyzer = anl yield anl @@ -424,7 +427,7 @@ class TestPlaceAddress: def 
test_process_place_street(self): info = self.process_address(street='Grand Road') - assert eval(info['street']) == self.name_token_set('#GRAND ROAD') + assert eval(info['street']) == self.name_token_set('GRAND', 'ROAD') def test_process_place_street_empty(self): @@ -436,16 +439,13 @@ class TestPlaceAddress: def test_process_place_place(self): info = self.process_address(place='Honu Lulu') - assert eval(info['place_search']) == self.name_token_set('#HONU LULU', - 'HONU', 'LULU') - assert eval(info['place_match']) == self.name_token_set('#HONU LULU') + assert eval(info['place']) == self.name_token_set('HONU', 'LULU') def test_process_place_place_empty(self): info = self.process_address(place='🜵') - assert 'place_search' not in info - assert 'place_match' not in info + assert 'place' not in info def test_process_place_address_terms(self): @@ -453,16 +453,12 @@ class TestPlaceAddress: suburb='Zwickau', street='Hauptstr', full='right behind the church') - city_full = self.name_token_set('#ZWICKAU') - city_all = self.name_token_set('#ZWICKAU', 'ZWICKAU') - state_full = self.name_token_set('#SACHSEN') - state_all = self.name_token_set('#SACHSEN', 'SACHSEN') + city = self.name_token_set('ZWICKAU') + state = self.name_token_set('SACHSEN') - result = {k: [eval(v[0]), eval(v[1])] for k,v in info['addr'].items()} + result = {k: eval(v) for k,v in info['addr'].items()} - assert result == {'city': [city_all, city_full], - 'suburb': [city_all, city_full], - 'state': [state_all, state_full]} + assert result == {'city': city, 'suburb': city, 'state': state} def test_process_place_address_terms_empty(self):