port legacy tokenizer to new postcode handling

Also documents the changes to the SQL functions of the tokenizer.
This commit is contained in:
Sarah Hoffmann
2022-06-08 08:19:55 +02:00
parent e86db3001f
commit 37b2c6a830
5 changed files with 16 additions and 17 deletions

View File

@@ -245,11 +245,11 @@ Currently, tokenizers are encouraged to make sure that matching works against
both the search token list and the match token list. both the search token list and the match token list.
```sql ```sql
FUNCTION token_normalized_postcode(postcode TEXT) RETURNS TEXT FUNCTION token_get_postcode(info JSONB) RETURNS TEXT
``` ```
Return the normalized version of the given postcode. This function must return the same value as the Python function `AbstractAnalyzer->normalize_postcode()`.
Return the postcode for the object, if any exists. The postcode must be in the form that should also be presented to the end-user.
```sql ```sql
FUNCTION token_strip_info(info JSONB) RETURNS JSONB FUNCTION token_strip_info(info JSONB) RETURNS JSONB

View File

@@ -97,13 +97,6 @@ AS $$
$$ LANGUAGE SQL IMMUTABLE STRICT; $$ LANGUAGE SQL IMMUTABLE STRICT;
CREATE OR REPLACE FUNCTION token_normalized_postcode(postcode TEXT)
RETURNS TEXT
AS $$
SELECT CASE WHEN postcode SIMILAR TO '%(,|;)%' THEN NULL ELSE upper(trim(postcode))END;
$$ LANGUAGE SQL IMMUTABLE STRICT;
CREATE OR REPLACE FUNCTION token_get_postcode(info JSONB) CREATE OR REPLACE FUNCTION token_get_postcode(info JSONB)
RETURNS TEXT RETURNS TEXT
AS $$ AS $$

View File

@@ -97,10 +97,10 @@ AS $$
$$ LANGUAGE SQL IMMUTABLE STRICT; $$ LANGUAGE SQL IMMUTABLE STRICT;
CREATE OR REPLACE FUNCTION token_normalized_postcode(postcode TEXT) CREATE OR REPLACE FUNCTION token_get_postcode(info JSONB)
RETURNS TEXT RETURNS TEXT
AS $$ AS $$
SELECT CASE WHEN postcode SIMILAR TO '%(,|;)%' THEN NULL ELSE upper(trim(postcode))END; SELECT info->>'postcode';
$$ LANGUAGE SQL IMMUTABLE STRICT; $$ LANGUAGE SQL IMMUTABLE STRICT;

View File

@@ -467,8 +467,9 @@ class LegacyNameAnalyzer(AbstractAnalyzer):
if key == 'postcode': if key == 'postcode':
# Make sure the normalized postcode is present in the word table. # Make sure the normalized postcode is present in the word table.
if re.search(r'[:,;]', value) is None: if re.search(r'[:,;]', value) is None:
self._cache.add_postcode(self.conn, norm_pc = self.normalize_postcode(value)
self.normalize_postcode(value)) token_info.set_postcode(norm_pc)
self._cache.add_postcode(self.conn, norm_pc)
elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'): elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
hnrs.append(value) hnrs.append(value)
elif key == 'street': elif key == 'street':
@@ -527,6 +528,11 @@ class _TokenInfo:
self.data['hnr_tokens'], self.data['hnr'] = cur.fetchone() self.data['hnr_tokens'], self.data['hnr'] = cur.fetchone()
def set_postcode(self, postcode):
""" Set or replace the postcode token with the given value.
"""
self.data['postcode'] = postcode
def add_street(self, conn, street): def add_street(self, conn, street):
""" Add addr:street match terms. """ Add addr:street match terms.
""" """

View File

@@ -186,17 +186,17 @@ def update_postcodes(dsn, project_dir, tokenizer):
# Recompute the list of valid postcodes from placex. # Recompute the list of valid postcodes from placex.
with conn.cursor(name="placex_postcodes") as cur: with conn.cursor(name="placex_postcodes") as cur:
cur.execute(""" cur.execute("""
SELECT cc as country_code, pc, ST_X(centroid), ST_Y(centroid) SELECT cc, pc, ST_X(centroid), ST_Y(centroid)
FROM (SELECT FROM (SELECT
COALESCE(plx.country_code, COALESCE(plx.country_code,
get_country_code(ST_Centroid(pl.geometry))) as cc, get_country_code(ST_Centroid(pl.geometry))) as cc,
token_normalized_postcode(pl.address->'postcode') as pc, pl.address->'postcode' as pc,
COALESCE(plx.centroid, ST_Centroid(pl.geometry)) as centroid COALESCE(plx.centroid, ST_Centroid(pl.geometry)) as centroid
FROM place AS pl LEFT OUTER JOIN placex AS plx FROM place AS pl LEFT OUTER JOIN placex AS plx
ON pl.osm_id = plx.osm_id AND pl.osm_type = plx.osm_type ON pl.osm_id = plx.osm_id AND pl.osm_type = plx.osm_type
WHERE pl.address ? 'postcode' AND pl.geometry IS NOT null) xx WHERE pl.address ? 'postcode' AND pl.geometry IS NOT null) xx
WHERE pc IS NOT null AND cc IS NOT null WHERE pc IS NOT null AND cc IS NOT null
ORDER BY country_code, pc""") ORDER BY cc, pc""")
collector = None collector = None