switch postcode tokens to new word table layout

This commit is contained in:
Sarah Hoffmann
2021-07-20 12:11:12 +02:00
parent 5ab0a63fd6
commit 5394b1fa1b
3 changed files with 26 additions and 17 deletions

View File

@@ -147,7 +147,7 @@ class Tokenizer
{ {
// Check which tokens we have, get the ID numbers.
// Country code and postcode are extracted as text from the `info`
// column with the ->> operator. Every entry of the select list must be
// comma-separated, including the one that ends the first fragment.
$sSQL = 'SELECT word_id, word_token, type,';
$sSQL .= " info->>'cc' as country, info->>'postcode' as postcode";
$sSQL .= ' FROM word WHERE word_token in (';
$sSQL .= join(',', $this->oDB->getDBQuotedList($aTokens)).')';
@@ -171,6 +171,16 @@ class Tokenizer
'H': // house number tokens 'H': // house number tokens
$oToken = new Token\HouseNumber($iId, $aWord['word_token']); $oToken = new Token\HouseNumber($iId, $aWord['word_token']);
break; break;
case 'P': // postcode tokens
    // Postcodes are not normalized, so they may have content
    // that makes SQL injection possible. Reject postcodes
    // that would need special escaping, i.e. those that
    // pg_escape_string() would return in a changed form.
    if ($aWord['postcode'] === null
        || pg_escape_string($aWord['postcode']) != $aWord['postcode']
    ) {
        // NOTE(review): inside a switch PHP treats a bare `continue`
        // like `break`. If the intent is to skip the current iteration
        // of an enclosing loop, this needs `continue 2` - confirm
        // against the loop, which is outside this view.
        continue;
    }
    $oToken = new Token\Postcode($iId, $aWord['postcode'], null);
    // Do not fall through into the default branch.
    break;
default: default:
continue; continue;
} }

View File

@@ -10,7 +10,12 @@ CREATE INDEX idx_word_word_token ON word
USING BTREE (word_token) {{db.tablespace.search_index}}; USING BTREE (word_token) {{db.tablespace.search_index}};
-- Used when updating country names from the boundary relation. -- Used when updating country names from the boundary relation.
CREATE INDEX idx_word_country_names ON word CREATE INDEX idx_word_country_names ON word
USING btree((info->>'cc')) WHERE type = 'C'; USING btree((info->>'cc')) {{db.tablespace.address_index}}
WHERE type = 'C';
-- Used when inserting new postcodes on updates.
-- Partial expression index: only rows with type = 'P' are indexed, keyed
-- by the text value of info->>'postcode'.
CREATE INDEX idx_word_postcodes ON word
    USING btree((info->>'postcode')) {{db.tablespace.address_index}}
    WHERE type = 'P';
GRANT SELECT ON word TO "{{config.DATABASE_WEBUSER}}"; GRANT SELECT ON word TO "{{config.DATABASE_WEBUSER}}";
DROP SEQUENCE IF EXISTS seq_word; DROP SEQUENCE IF EXISTS seq_word;

View File

@@ -276,8 +276,7 @@ class LegacyICUNameAnalyzer:
(SELECT pc, word FROM (SELECT pc, word FROM
(SELECT distinct(postcode) as pc FROM location_postcode) p (SELECT distinct(postcode) as pc FROM location_postcode) p
FULL JOIN FULL JOIN
(SELECT word FROM word (SELECT info->>'postcode' as word FROM word WHERE type = 'P') w
WHERE class ='place' and type = 'postcode') w
ON pc = word) x ON pc = word) x
WHERE pc is null or word is null""") WHERE pc is null or word is null""")
@@ -286,20 +285,16 @@ class LegacyICUNameAnalyzer:
if postcode is None: if postcode is None:
to_delete.append(word) to_delete.append(word)
else: else:
copystr.add( copystr.add(self.name_processor.get_search_normalized(postcode),
postcode, 'P', {'postcode': postcode})
' ' + self.name_processor.get_search_normalized(postcode),
'place', 'postcode', 0)
if to_delete: if to_delete:
# Remove the word entries collected in to_delete. Postcode rows are
# selected by type 'P' and matched on the postcode stored in the
# `info` column, like the other postcode statements of this layout.
cur.execute("""DELETE FROM WORD
               WHERE type = 'P' and info->>'postcode' = any(%s)
            """, (to_delete, ))
copystr.copy_out(cur, 'word', copystr.copy_out(cur, 'word',
columns=['word', 'word_token', 'class', 'type', columns=['word_token', 'type', 'info'])
'search_name_count'])
def update_special_phrases(self, phrases, should_replace): def update_special_phrases(self, phrases, should_replace):
@@ -503,14 +498,13 @@ class LegacyICUNameAnalyzer:
with self.conn.cursor() as cur: with self.conn.cursor() as cur:
# no word_id needed for postcodes # no word_id needed for postcodes
# Insert the postcode token unless a 'P' row for this postcode already
# exists. The JSON key must be a quoted string literal; an unquoted
# `postcode` would be parsed as a column reference.
cur.execute("""INSERT INTO word (word_token, type, info)
               (SELECT %s, 'P', json_build_object('postcode', pc)
                FROM (VALUES (%s)) as v(pc)
                WHERE NOT EXISTS
                 (SELECT * FROM word
                  WHERE type = 'P' and info->>'postcode' = pc))
            """, (term, postcode))
self._cache.postcodes.add(postcode) self._cache.postcodes.add(postcode)