diff --git a/lib-php/tokenizer/legacy_icu_tokenizer.php b/lib-php/tokenizer/legacy_icu_tokenizer.php
index 9bd9828c..7a900e5a 100644
--- a/lib-php/tokenizer/legacy_icu_tokenizer.php
+++ b/lib-php/tokenizer/legacy_icu_tokenizer.php
@@ -145,8 +145,7 @@ class Tokenizer
     private function addTokensFromDB(&$oValidTokens, $aTokens, $sNormQuery)
     {
         // Check which tokens we have, get the ID numbers
-        $sSQL = 'SELECT word_id, word_token, type,';
-        $sSQL .= " info->>'cc' as country, info->>'postcode' as postcode,";
+        $sSQL = 'SELECT word_id, word_token, type, word,';
         $sSQL .= " info->>'op' as operator,";
         $sSQL .= " info->>'class' as class, info->>'type' as ctype,";
         $sSQL .= " info->>'count' as count";
@@ -163,11 +162,14 @@ class Tokenizer
             switch ($aWord['type']) {
                 case 'C':  // country name tokens
-                    if ($aWord['country'] !== null
+                    if ($aWord['word'] !== null
                         && (!$this->aCountryRestriction
-                            || in_array($aWord['country'], $this->aCountryRestriction))
+                            || in_array($aWord['word'], $this->aCountryRestriction))
                     ) {
-                        $oValidTokens->addToken($sTok, new Token\Country($iId, $aWord['country']));
+                        $oValidTokens->addToken(
+                            $sTok,
+                            new Token\Country($iId, $aWord['word'])
+                        );
                     }
                     break;
 
                 case 'H':  // house number tokens
@@ -177,12 +179,15 @@ class Tokenizer
                     // Postcodes are not normalized, so they may have content
                     // that makes SQL injection possible. Reject postcodes
                     // that would need special escaping.
-                    if ($aWord['postcode'] !== null
-                        && pg_escape_string($aWord['postcode']) == $aWord['postcode']
+                    if ($aWord['word'] !== null
+                        && pg_escape_string($aWord['word']) == $aWord['word']
                     ) {
-                        $sNormPostcode = $this->normalizeString($aWord['postcode']);
+                        $sNormPostcode = $this->normalizeString($aWord['word']);
                         if (strpos($sNormQuery, $sNormPostcode) !== false) {
-                            $oValidTokens->addToken($sTok, new Token\Postcode($iId, $aWord['postcode'], null));
+                            $oValidTokens->addToken(
+                                $sTok,
+                                new Token\Postcode($iId, $aWord['word'], null)
+                            );
                         }
                     }
                     break;
@@ -192,7 +197,7 @@ class Tokenizer
                             $iId,
                             $aWord['class'],
                             $aWord['ctype'],
-                            (isset($aWord['op'])) ? Operator::NEAR : Operator::NONE
+                            (isset($aWord['operator'])) ? Operator::NEAR : Operator::NONE
                         ));
                     }
                     break;
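Note for reviewers: the net effect on table contents is that the primary lookup value of every token type moves out of the jsonb info column into the new dedicated word column, while auxiliary data stays in info. A minimal sketch of rows under the new layout (ids and values invented for illustration; only the column layout follows the patch):

    -- Illustrative rows only; the values are made up.
    INSERT INTO word (word_id, word_token, type, word, info) VALUES
      (1, 'de',     'C', 'de',     NULL),           -- country name token
      (2, '10117',  'P', '10117',  NULL),           -- postcode token
      (3, 'berlin', 'W', 'berlin', '{"count": 0}'), -- full word token
      (4, 'bar',    'S', 'bar',    '{"class": "amenity", "type": "bar", "op": null}');
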
diff --git a/lib-sql/tokenizer/icu_tokenizer_tables.sql b/lib-sql/tokenizer/icu_tokenizer_tables.sql
index 912ba768..7ec3c6f8 100644
--- a/lib-sql/tokenizer/icu_tokenizer_tables.sql
+++ b/lib-sql/tokenizer/icu_tokenizer_tables.sql
@@ -3,6 +3,7 @@ CREATE TABLE word (
   word_id INTEGER,
   word_token text NOT NULL,
   type text NOT NULL,
+  word text,
   info jsonb
 ) {{db.tablespace.search_data}};
 
@@ -10,15 +11,15 @@ CREATE INDEX idx_word_word_token ON word
     USING BTREE (word_token) {{db.tablespace.search_index}};
 -- Used when updating country names from the boundary relation.
 CREATE INDEX idx_word_country_names ON word
-    USING btree((info->>'cc')) {{db.tablespace.address_index}}
+    USING btree(word) {{db.tablespace.address_index}}
     WHERE type = 'C';
 -- Used when inserting new postcodes on updates.
 CREATE INDEX idx_word_postcodes ON word
-    USING btree((info->>'postcode')) {{db.tablespace.address_index}}
+    USING btree(word) {{db.tablespace.address_index}}
     WHERE type = 'P';
 -- Used when inserting full words.
 CREATE INDEX idx_word_full_word ON word
-    USING btree((info->>'word')) {{db.tablespace.address_index}}
+    USING btree(word) {{db.tablespace.address_index}}
     WHERE type = 'W';
 
 GRANT SELECT ON word TO "{{config.DATABASE_WEBUSER}}";
diff --git a/lib-sql/tokenizer/legacy_icu_tokenizer.sql b/lib-sql/tokenizer/legacy_icu_tokenizer.sql
index e021bf8b..ffe6648c 100644
--- a/lib-sql/tokenizer/legacy_icu_tokenizer.sql
+++ b/lib-sql/tokenizer/legacy_icu_tokenizer.sql
@@ -98,13 +98,13 @@ DECLARE
   term_count INTEGER;
 BEGIN
   SELECT min(word_id) INTO full_token
-    FROM word WHERE info->>'word' = norm_term and type = 'W';
+    FROM word WHERE word = norm_term and type = 'W';
 
   IF full_token IS NULL THEN
     full_token := nextval('seq_word');
-    INSERT INTO word (word_id, word_token, type, info)
-      SELECT full_token, lookup_term, 'W',
-             json_build_object('word', norm_term, 'count', 0)
+    INSERT INTO word (word_id, word_token, type, word, info)
+      SELECT full_token, lookup_term, 'W', norm_term,
+             json_build_object('count', 0)
        FROM unnest(lookup_terms) as lookup_term;
   END IF;
 
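Nothing in this patch migrates existing data: the new column and the rebuilt partial indexes only apply to freshly created word tables. Purely for illustration, a hand-rolled one-off migration for an existing database could look roughly like the following (hypothetical sketch, not part of the patch):

    -- Hypothetical migration sketch; the patch itself does not ship one.
    ALTER TABLE word ADD COLUMN word text;
    UPDATE word SET word = info->>'cc',       info = info - 'cc'       WHERE type = 'C';
    UPDATE word SET word = info->>'postcode', info = info - 'postcode' WHERE type = 'P';
    UPDATE word SET word = info->>'word',     info = info - 'word'     WHERE type IN ('W', 'S');

The three partial indexes would then have to be dropped and recreated against the plain column as defined above.
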
diff --git a/nominatim/tokenizer/legacy_icu_tokenizer.py b/nominatim/tokenizer/legacy_icu_tokenizer.py
index 3d7d752e..1f8096ff 100644
--- a/nominatim/tokenizer/legacy_icu_tokenizer.py
+++ b/nominatim/tokenizer/legacy_icu_tokenizer.py
@@ -278,7 +278,7 @@ class LegacyICUNameAnalyzer:
                            (SELECT pc, word FROM
                              (SELECT distinct(postcode) as pc FROM location_postcode) p
                              FULL JOIN
-                             (SELECT info->>'postcode' as word FROM word WHERE type = 'P') w
+                             (SELECT word FROM word WHERE type = 'P') w
                              ON pc = word) x
                            WHERE pc is null or word is null""")
 
@@ -288,15 +288,15 @@ class LegacyICUNameAnalyzer:
                         to_delete.append(word)
                     else:
                         copystr.add(self.name_processor.get_search_normalized(postcode),
-                                    'P', json.dumps({'postcode': postcode}))
+                                    'P', postcode)
 
                 if to_delete:
                     cur.execute("""DELETE FROM WORD
-                                   WHERE type ='P' and info->>'postcode' = any(%s)
+                                   WHERE type ='P' and word = any(%s)
                                 """, (to_delete, ))
 
                 copystr.copy_out(cur, 'word',
-                                 columns=['word_token', 'type', 'info'])
+                                 columns=['word_token', 'type', 'word'])
 
 
     def update_special_phrases(self, phrases, should_replace):
@@ -311,9 +311,9 @@ class LegacyICUNameAnalyzer:
         with self.conn.cursor() as cur:
             # Get the old phrases.
             existing_phrases = set()
-            cur.execute("SELECT info FROM word WHERE type = 'S'")
-            for (info, ) in cur:
-                existing_phrases.add((info['word'], info['class'], info['type'],
+            cur.execute("SELECT word, info FROM word WHERE type = 'S'")
+            for word, info in cur:
+                existing_phrases.add((word, info['class'], info['type'],
                                       info.get('op') or '-'))
 
             added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
@@ -337,13 +337,13 @@ class LegacyICUNameAnalyzer:
             for word, cls, typ, oper in to_add:
                 term = self.name_processor.get_search_normalized(word)
                 if term:
-                    copystr.add(term, 'S',
-                                json.dumps({'word': word, 'class': cls, 'type': typ,
+                    copystr.add(term, 'S', word,
+                                json.dumps({'class': cls, 'type': typ,
                                             'op': oper if oper in ('in', 'near') else None}))
                     added += 1
 
             copystr.copy_out(cursor, 'word',
-                             columns=['word_token', 'type', 'info'])
+                             columns=['word_token', 'type', 'word', 'info'])
 
         return added
 
@@ -358,7 +358,7 @@ class LegacyICUNameAnalyzer:
         if to_delete:
             cursor.execute_values(
                 """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
-                    WHERE info->>'word' = name
+                    WHERE type = 'S' and word = name
                          and info->>'class' = in_class and info->>'type' = in_type
                          and ((op = '-' and info->>'op' is null) or op = info->>'op')
                 """, to_delete)
@@ -378,14 +378,14 @@ class LegacyICUNameAnalyzer:
         with self.conn.cursor() as cur:
             # Get existing names
             cur.execute("""SELECT word_token FROM word
-                            WHERE type = 'C' and info->>'cc'= %s""",
+                            WHERE type = 'C' and word = %s""",
                         (country_code, ))
             word_tokens.difference_update((t[0] for t in cur))
 
             # Only add those names that are not yet in the list.
             if word_tokens:
-                cur.execute("""INSERT INTO word (word_token, type, info)
-                               (SELECT token, 'C', json_build_object('cc', %s)
+                cur.execute("""INSERT INTO word (word_token, type, word)
+                               (SELECT token, 'C', %s
                                 FROM unnest(%s) as token)
                             """, (country_code, list(word_tokens)))
 
@@ -503,12 +503,11 @@ class LegacyICUNameAnalyzer:
 
             with self.conn.cursor() as cur:
                 # no word_id needed for postcodes
-                cur.execute("""INSERT INTO word (word_token, type, info)
-                               (SELECT %s, 'P', json_build_object('postcode', pc)
-                                 FROM (VALUES (%s)) as v(pc)
+                cur.execute("""INSERT INTO word (word_token, type, word)
+                               (SELECT %s, 'P', pc FROM (VALUES (%s)) as v(pc)
                                 WHERE NOT EXISTS
                                         (SELECT * FROM word
-                                         WHERE type = 'P' and info->>'postcode' = pc))
+                                         WHERE type = 'P' and word = pc))
                             """, (term, postcode))
             self._cache.postcodes.add(postcode)
 
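One detail worth noting: now that all token types share the word column, the special-phrase DELETE above has to scope on type = 'S', otherwise it could hit full-word ('W') rows carrying the same name. For the country-name path, the round trip reduces to plain equality on the column; roughly, with the psycopg2 parameters inlined (country code and tokens invented for illustration):

    -- Tokens already stored for country 'de' (fetched first, so that
    -- only the missing ones are inserted):
    SELECT word_token FROM word WHERE type = 'C' and word = 'de';

    -- Insert of the tokens found missing; the patch passes them
    -- through unnest() as a parameter.
    INSERT INTO word (word_token, type, word)
      (SELECT token, 'C', 'de' FROM unnest(ARRAY['deutschland', 'germany']) as token);
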
word tokens for postcodes (?P.*)") def check_word_table_for_postcodes(context, exclude, postcodes): """ Check that the tokenizer produces postcode tokens for the given @@ -297,8 +281,7 @@ def check_word_table_for_postcodes(context, exclude, postcodes): with context.db.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur: if nctx.tokenizer == 'legacy_icu': - cur.execute("""SELECT info->>'postcode' FROM word - WHERE type = 'P' and info->>'postcode' = any(%s)""", + cur.execute("SELECT word FROM word WHERE type = 'P' and word = any(%s)", (plist,)) else: cur.execute("""SELECT word FROM word WHERE word = any(%s) diff --git a/test/python/mock_icu_word_table.py b/test/python/mock_icu_word_table.py index 3d457d0b..cde5e770 100644 --- a/test/python/mock_icu_word_table.py +++ b/test/python/mock_icu_word_table.py @@ -12,16 +12,16 @@ class MockIcuWordTable: cur.execute("""CREATE TABLE word (word_id INTEGER, word_token text NOT NULL, type text NOT NULL, + word text, info jsonb)""") conn.commit() def add_special(self, word_token, word, cls, typ, oper): with self.conn.cursor() as cur: - cur.execute("""INSERT INTO word (word_token, type, info) - VALUES (%s, 'S', - json_build_object('word', %s, - 'class', %s, + cur.execute("""INSERT INTO word (word_token, type, word, info) + VALUES (%s, 'S', %s, + json_build_object('class', %s, 'type', %s, 'op', %s)) """, (word_token, word, cls, typ, oper)) @@ -30,16 +30,16 @@ class MockIcuWordTable: def add_country(self, country_code, word_token): with self.conn.cursor() as cur: - cur.execute("""INSERT INTO word (word_token, type, info) - VALUES(%s, 'C', json_build_object('cc', %s))""", + cur.execute("""INSERT INTO word (word_token, type, word) + VALUES(%s, 'C', %s)""", (word_token, country_code)) self.conn.commit() def add_postcode(self, word_token, postcode): with self.conn.cursor() as cur: - cur.execute("""INSERT INTO word (word_token, type, info) - VALUES (%s, 'P', json_build_object('postcode', %s)) + cur.execute("""INSERT INTO word (word_token, type, word) + VALUES (%s, 'P', %s) """, (word_token, postcode)) self.conn.commit() @@ -56,8 +56,8 @@ class MockIcuWordTable: def get_special(self): with self.conn.cursor() as cur: - cur.execute("SELECT word_token, info FROM word WHERE type = 'S'") - result = set(((row[0], row[1]['word'], row[1]['class'], + cur.execute("SELECT word_token, info, word FROM word WHERE type = 'S'") + result = set(((row[0], row[2], row[1]['class'], row[1]['type'], row[1]['op']) for row in cur)) assert len(result) == cur.rowcount, "Word table has duplicates." return result @@ -65,7 +65,7 @@ class MockIcuWordTable: def get_country(self): with self.conn.cursor() as cur: - cur.execute("SELECT info->>'cc', word_token FROM word WHERE type = 'C'") + cur.execute("SELECT word, word_token FROM word WHERE type = 'C'") result = set((tuple(row) for row in cur)) assert len(result) == cur.rowcount, "Word table has duplicates." return result @@ -73,7 +73,7 @@ class MockIcuWordTable: def get_postcodes(self): with self.conn.cursor() as cur: - cur.execute("SELECT info->>'postcode' FROM word WHERE type = 'P'") + cur.execute("SELECT word FROM word WHERE type = 'P'") return set((row[0] for row in cur))