From 83775289523eda29fe8d82ff2e92c6faa5c76898 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Tue, 20 Jul 2021 10:27:06 +0200 Subject: [PATCH 01/14] new word table layout for icu tokenizer The table now directly reflects the different token types. Extra information is saved in a json structure that may be dynamically extended in the future without affecting the table layout. --- lib-sql/tokenizer/icu_tokenizer_tables.sql | 15 +++++++++++++++ nominatim/tokenizer/legacy_icu_tokenizer.py | 2 +- 2 files changed, 16 insertions(+), 1 deletion(-) create mode 100644 lib-sql/tokenizer/icu_tokenizer_tables.sql diff --git a/lib-sql/tokenizer/icu_tokenizer_tables.sql b/lib-sql/tokenizer/icu_tokenizer_tables.sql new file mode 100644 index 00000000..13e1bdb0 --- /dev/null +++ b/lib-sql/tokenizer/icu_tokenizer_tables.sql @@ -0,0 +1,15 @@ +DROP TABLE IF EXISTS word; +CREATE TABLE word_icu ( + word_id INTEGER, + word_token text NOT NULL, + type text NOT NULL, + info jsonb +) {{db.tablespace.search_data}}; + +CREATE INDEX idx_word_word_token ON word + USING BTREE (word_token) {{db.tablespace.search_index}}; +GRANT SELECT ON word TO "{{config.DATABASE_WEBUSER}}"; + +DROP SEQUENCE IF EXISTS seq_word; +CREATE SEQUENCE seq_word start 1; +GRANT SELECT ON seq_word to "{{config.DATABASE_WEBUSER}}"; diff --git a/nominatim/tokenizer/legacy_icu_tokenizer.py b/nominatim/tokenizer/legacy_icu_tokenizer.py index 6d3d11c1..59ad09aa 100644 --- a/nominatim/tokenizer/legacy_icu_tokenizer.py +++ b/nominatim/tokenizer/legacy_icu_tokenizer.py @@ -152,7 +152,7 @@ class LegacyICUTokenizer: """ with connect(self.dsn) as conn: sqlp = SQLPreprocessor(conn, config) - sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_tables.sql') + sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql') conn.commit() LOG.warning("Precomputing word tokens") From 1618aba5f282a27fc45af28c4eeebb6dcd28c332 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Tue, 20 Jul 2021 11:21:13 +0200 Subject: [PATCH 02/14] switch country name tokens to new word table layout --- lib-php/tokenizer/legacy_icu_tokenizer.php | 33 +++++++++++---------- lib-sql/tokenizer/icu_tokenizer_tables.sql | 3 ++ nominatim/tokenizer/legacy_icu_tokenizer.py | 18 +++++++---- 3 files changed, 33 insertions(+), 21 deletions(-) diff --git a/lib-php/tokenizer/legacy_icu_tokenizer.php b/lib-php/tokenizer/legacy_icu_tokenizer.php index eac964e4..ea445f23 100644 --- a/lib-php/tokenizer/legacy_icu_tokenizer.php +++ b/lib-php/tokenizer/legacy_icu_tokenizer.php @@ -146,8 +146,8 @@ class Tokenizer private function addTokensFromDB(&$oValidTokens, $aTokens, $sNormQuery) { // Check which tokens we have, get the ID numbers - $sSQL = 'SELECT word_id, word_token, word, class, type, country_code,'; - $sSQL .= ' operator, coalesce(search_name_count, 0) as count'; + $sSQL = 'SELECT word_id, word_token, type'; + $sSQL .= " info->>'cc' as country"; $sSQL .= ' FROM word WHERE word_token in ('; $sSQL .= join(',', $this->oDB->getDBQuotedList($aTokens)).')'; @@ -156,8 +156,20 @@ class Tokenizer $aDBWords = $this->oDB->getAll($sSQL, null, 'Could not get word tokens.'); foreach ($aDBWords as $aWord) { - $oToken = null; - $iId = (int) $aWord['word_id']; + switch ($aWord['type']) { + 'C': // country name tokens + if ($aWord['country'] === null + || ($this->aCountryRestriction + && !in_array($aWord['country'], $this->aCountryRestriction)) + ) { + continue; + } + $oToken = new Token\Country($iId, $aWord['country']) + break; + default: + continue; + } +/* $iId = (int) $aWord['word_id']; if ($aWord['class']) 
{ // Special terms need to appear in their normalized form. @@ -207,16 +219,9 @@ class Tokenizer $aWord['word_token'], (int) $aWord['count'] ); - } + }*/ - if ($oToken) { - // remove any leading spaces - if ($aWord['word_token'][0] == ' ') { - $oValidTokens->addToken(substr($aWord['word_token'], 1), $oToken); - } else { - $oValidTokens->addToken($aWord['word_token'], $oToken); - } - } + $oValidTokens->addToken($aWord['word_token'], $oToken); } } @@ -234,12 +239,10 @@ class Tokenizer for ($i = 0; $i < $iNumWords; $i++) { $sPhrase = $aWords[$i]; - $aTokens[' '.$sPhrase] = ' '.$sPhrase; $aTokens[$sPhrase] = $sPhrase; for ($j = $i + 1; $j < $iNumWords; $j++) { $sPhrase .= ' '.$aWords[$j]; - $aTokens[' '.$sPhrase] = ' '.$sPhrase; $aTokens[$sPhrase] = $sPhrase; } } diff --git a/lib-sql/tokenizer/icu_tokenizer_tables.sql b/lib-sql/tokenizer/icu_tokenizer_tables.sql index 13e1bdb0..1d70a9c3 100644 --- a/lib-sql/tokenizer/icu_tokenizer_tables.sql +++ b/lib-sql/tokenizer/icu_tokenizer_tables.sql @@ -8,6 +8,9 @@ CREATE TABLE word_icu ( CREATE INDEX idx_word_word_token ON word USING BTREE (word_token) {{db.tablespace.search_index}}; +-- Used when updating country names from the boundary relation. +CREATE INDEX idx_word_country_names ON word + USING btree((info->>'cc')) WHERE type = 'C'; GRANT SELECT ON word TO "{{config.DATABASE_WEBUSER}}"; DROP SEQUENCE IF EXISTS seq_word; diff --git a/nominatim/tokenizer/legacy_icu_tokenizer.py b/nominatim/tokenizer/legacy_icu_tokenizer.py index 59ad09aa..32dd6535 100644 --- a/nominatim/tokenizer/legacy_icu_tokenizer.py +++ b/nominatim/tokenizer/legacy_icu_tokenizer.py @@ -371,22 +371,28 @@ class LegacyICUNameAnalyzer: """ word_tokens = set() for name in self._compute_full_names(names): - if name: - word_tokens.add(' ' + self.name_processor.get_search_normalized(name)) + norm_name = self.name_processor.get_search_normalized(name) + if norm_name: + word_tokens.add(norm_name) with self.conn.cursor() as cur: # Get existing names - cur.execute("SELECT word_token FROM word WHERE country_code = %s", + cur.execute("""SELECT word_token FROM word + WHERE type = 'C' and info->>'cc'= %s""", (country_code, )) word_tokens.difference_update((t[0] for t in cur)) + # Only add those names that are not yet in the list. if word_tokens: - cur.execute("""INSERT INTO word (word_id, word_token, country_code, - search_name_count) - (SELECT nextval('seq_word'), token, %s, 0 + cur.execute("""INSERT INTO word (word_token, type, info) + (SELECT token, 'C', json_build_object('cc', %s) FROM unnest(%s) as token) """, (country_code, list(word_tokens))) + # No names are deleted at the moment. + # If deletion is made possible, then the static names from the + # initial 'country_name' table should be kept. + def process_place(self, place): """ Determine tokenizer information about the given place. 
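
As an illustration of the new layout (a sketch, not part of the patch
series): a country name entry now carries the type marker 'C' and keeps
the country code in the JSON info column, so all tokens for one country
can be fetched with a query of the form

    SELECT word_token FROM word WHERE type = 'C' AND info->>'cc' = 'ch';

where 'ch' stands for any country code.
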
From 5ab0a63fd6881f1b7273363e7f20562bc5e8dc39 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Tue, 20 Jul 2021 11:36:20 +0200 Subject: [PATCH 03/14] switch housenumber tokens to new word table layout --- lib-php/tokenizer/legacy_icu_tokenizer.php | 9 ++++++--- lib-sql/tokenizer/legacy_icu_tokenizer.sql | 10 ++++------ nominatim/tokenizer/legacy_icu_tokenizer.py | 3 ++- 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/lib-php/tokenizer/legacy_icu_tokenizer.php b/lib-php/tokenizer/legacy_icu_tokenizer.php index ea445f23..b2fc27c7 100644 --- a/lib-php/tokenizer/legacy_icu_tokenizer.php +++ b/lib-php/tokenizer/legacy_icu_tokenizer.php @@ -156,6 +156,8 @@ class Tokenizer $aDBWords = $this->oDB->getAll($sSQL, null, 'Could not get word tokens.'); foreach ($aDBWords as $aWord) { + $iId = (int) $aWord['word_id']; + switch ($aWord['type']) { 'C': // country name tokens if ($aWord['country'] === null @@ -166,12 +168,13 @@ class Tokenizer } $oToken = new Token\Country($iId, $aWord['country']) break; + 'H': // house number tokens + $oToken = new Token\HouseNumber($iId, $aWord['word_token']); + break; default: continue; } -/* $iId = (int) $aWord['word_id']; - - if ($aWord['class']) { +/* if ($aWord['class']) { // Special terms need to appear in their normalized form. // (postcodes are not normalized in the word table) $sNormWord = $this->normalizeString($aWord['word']); diff --git a/lib-sql/tokenizer/legacy_icu_tokenizer.sql b/lib-sql/tokenizer/legacy_icu_tokenizer.sql index 686137de..e9dcf4bc 100644 --- a/lib-sql/tokenizer/legacy_icu_tokenizer.sql +++ b/lib-sql/tokenizer/legacy_icu_tokenizer.sql @@ -140,15 +140,13 @@ CREATE OR REPLACE FUNCTION getorcreate_hnr_id(lookup_term TEXT) DECLARE return_id INTEGER; BEGIN - SELECT min(word_id) INTO return_id - FROM word - WHERE word_token = ' ' || lookup_term - and class = 'place' and type = 'house'; + SELECT min(word_id) INTO return_id FROM word + WHERE word_token = lookup_term and type = 'H'; IF return_id IS NULL THEN return_id := nextval('seq_word'); - INSERT INTO word (word_id, word_token, class, type, search_name_count) - VALUES (return_id, ' ' || lookup_term, 'place', 'house', 0); + INSERT INTO word (word_id, word_token, type) + VALUES (return_id, lookup_term, 'H'); END IF; RETURN return_id; diff --git a/nominatim/tokenizer/legacy_icu_tokenizer.py b/nominatim/tokenizer/legacy_icu_tokenizer.py index 32dd6535..9fbb9bb0 100644 --- a/nominatim/tokenizer/legacy_icu_tokenizer.py +++ b/nominatim/tokenizer/legacy_icu_tokenizer.py @@ -601,7 +601,8 @@ class _TokenCache: def get_hnr_tokens(self, conn, terms): """ Get token ids for a list of housenumbers, looking them up in the - database if necessary. + database if necessary. `terms` is an iterable of normalized + housenumbers. 
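+
+            Returns the list of token ids corresponding to the terms;
+            housenumbers not yet in the word table are created on the fly.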
""" tokens = [] askdb = [] From 5394b1fa1bd258745a9906c7df605310bc6cc021 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Tue, 20 Jul 2021 12:11:12 +0200 Subject: [PATCH 04/14] switch postcode tokens to new word table layout --- lib-php/tokenizer/legacy_icu_tokenizer.php | 12 ++++++++++- lib-sql/tokenizer/icu_tokenizer_tables.sql | 7 +++++- nominatim/tokenizer/legacy_icu_tokenizer.py | 24 ++++++++------------- 3 files changed, 26 insertions(+), 17 deletions(-) diff --git a/lib-php/tokenizer/legacy_icu_tokenizer.php b/lib-php/tokenizer/legacy_icu_tokenizer.php index b2fc27c7..2461a1fd 100644 --- a/lib-php/tokenizer/legacy_icu_tokenizer.php +++ b/lib-php/tokenizer/legacy_icu_tokenizer.php @@ -147,7 +147,7 @@ class Tokenizer { // Check which tokens we have, get the ID numbers $sSQL = 'SELECT word_id, word_token, type'; - $sSQL .= " info->>'cc' as country"; + $sSQL .= " info->>'cc' as country, info->>'postcode' as postcode"; $sSQL .= ' FROM word WHERE word_token in ('; $sSQL .= join(',', $this->oDB->getDBQuotedList($aTokens)).')'; @@ -171,6 +171,16 @@ class Tokenizer 'H': // house number tokens $oToken = new Token\HouseNumber($iId, $aWord['word_token']); break; + 'P': // postcode tokens + // Postcodes are not normalized, so they may have content + // that makes SQL injection possible. Reject postcodes + // that would need special escaping. + if ($aWord['postcode'] === null + || pg_escape_string($aWord['postcode']) == $aWord['postcode'] + ) { + continue; + } + $oToken = new Token\Postcode($iId, $aWord['postcode'], null); default: continue; } diff --git a/lib-sql/tokenizer/icu_tokenizer_tables.sql b/lib-sql/tokenizer/icu_tokenizer_tables.sql index 1d70a9c3..dd3ac477 100644 --- a/lib-sql/tokenizer/icu_tokenizer_tables.sql +++ b/lib-sql/tokenizer/icu_tokenizer_tables.sql @@ -10,7 +10,12 @@ CREATE INDEX idx_word_word_token ON word USING BTREE (word_token) {{db.tablespace.search_index}}; -- Used when updating country names from the boundary relation. CREATE INDEX idx_word_country_names ON word - USING btree((info->>'cc')) WHERE type = 'C'; + USING btree((info->>'cc')) {{db.tablespace.address_index}} + WHERE type = 'C'; +-- Used when inserting new postcodes on updates. 
+CREATE INDEX idx_word_postcodes ON word + USING btree((info->>'postcode')) {{db.tablespace.address_index}} + WHERE type = 'P' GRANT SELECT ON word TO "{{config.DATABASE_WEBUSER}}"; DROP SEQUENCE IF EXISTS seq_word; diff --git a/nominatim/tokenizer/legacy_icu_tokenizer.py b/nominatim/tokenizer/legacy_icu_tokenizer.py index 9fbb9bb0..e0fd3a02 100644 --- a/nominatim/tokenizer/legacy_icu_tokenizer.py +++ b/nominatim/tokenizer/legacy_icu_tokenizer.py @@ -276,8 +276,7 @@ class LegacyICUNameAnalyzer: (SELECT pc, word FROM (SELECT distinct(postcode) as pc FROM location_postcode) p FULL JOIN - (SELECT word FROM word - WHERE class ='place' and type = 'postcode') w + (SELECT info->>'postcode' as word FROM word WHERE type = 'P') w ON pc = word) x WHERE pc is null or word is null""") @@ -286,20 +285,16 @@ class LegacyICUNameAnalyzer: if postcode is None: to_delete.append(word) else: - copystr.add( - postcode, - ' ' + self.name_processor.get_search_normalized(postcode), - 'place', 'postcode', 0) + copystr.add(self.name_processor.get_search_normalized(postcode), + 'P', {'postcode': postcode}) if to_delete: cur.execute("""DELETE FROM WORD - WHERE class ='place' and type = 'postcode' - and word = any(%s) + WHERE class ='P' and info->>'postcode' = any(%s) """, (to_delete, )) copystr.copy_out(cur, 'word', - columns=['word', 'word_token', 'class', 'type', - 'search_name_count']) + columns=['word_token', 'type', 'info']) def update_special_phrases(self, phrases, should_replace): @@ -503,14 +498,13 @@ class LegacyICUNameAnalyzer: with self.conn.cursor() as cur: # no word_id needed for postcodes - cur.execute("""INSERT INTO word (word, word_token, class, type, - search_name_count) - (SELECT pc, %s, 'place', 'postcode', 0 + cur.execute("""INSERT INTO word (word_token, type, info) + (SELECT %s, 'P', json_build_object('postcode', pc) FROM (VALUES (%s)) as v(pc) WHERE NOT EXISTS (SELECT * FROM word - WHERE word = pc and class='place' and type='postcode')) - """, (' ' + term, postcode)) + WHERE type = 'P' and info->>postcode = pc)) + """, (term, postcode)) self._cache.postcodes.add(postcode) From 4342b28882249a6d460bb7f7e6f6708751497058 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Tue, 20 Jul 2021 21:11:01 +0200 Subject: [PATCH 05/14] switch special phrases to new word table format --- lib-php/tokenizer/legacy_icu_tokenizer.php | 23 +++++++++++++++++- nominatim/tokenizer/legacy_icu_tokenizer.py | 27 ++++++++++++--------- 2 files changed, 37 insertions(+), 13 deletions(-) diff --git a/lib-php/tokenizer/legacy_icu_tokenizer.php b/lib-php/tokenizer/legacy_icu_tokenizer.php index 2461a1fd..70358976 100644 --- a/lib-php/tokenizer/legacy_icu_tokenizer.php +++ b/lib-php/tokenizer/legacy_icu_tokenizer.php @@ -147,7 +147,9 @@ class Tokenizer { // Check which tokens we have, get the ID numbers $sSQL = 'SELECT word_id, word_token, type'; - $sSQL .= " info->>'cc' as country, info->>'postcode' as postcode"; + $sSQL .= " info->>'cc' as country, info->>'postcode' as postcode,"; + $sSQL .= " info->>'word' as word, info->>'op' as operator,"; + $sSQL .= " info->>'class' as class, info->>'type' as type"; $sSQL .= ' FROM word WHERE word_token in ('; $sSQL .= join(',', $this->oDB->getDBQuotedList($aTokens)).')'; @@ -180,7 +182,26 @@ class Tokenizer ) { continue; } + $sNormPostcode = $this->normalizeString($aWord['postcode']); + if (strpos($sNormQuery, $sNormPostcode) === false) { + continue; + } $oToken = new Token\Postcode($iId, $aWord['postcode'], null); + break; + 'S': // tokens for classification terms (special phrases) + if 
($aWord['class'] === null || $aWord['type'] === null
+                        || $aWord['word'] === null
+                        || strpos($sNormQuery, $aWord['word']) === false
+                    ) {
+                        continue;
+                    }
+                    $oToken = new Token\SpecialTerm(
+                        $iId,
+                        $aWord['class'],
+                        $aWord['type'],
+                        $aWord['op'] ? Operator::NEAR : Operator::NONE
+                    );
+                    break;
                 default:
                     continue;
             }
diff --git a/nominatim/tokenizer/legacy_icu_tokenizer.py b/nominatim/tokenizer/legacy_icu_tokenizer.py
index e0fd3a02..a645b598 100644
--- a/nominatim/tokenizer/legacy_icu_tokenizer.py
+++ b/nominatim/tokenizer/legacy_icu_tokenizer.py
@@ -299,6 +299,9 @@ class LegacyICUNameAnalyzer:

     def update_special_phrases(self, phrases, should_replace):
         """ Replace the search index for special phrases with the new phrases.
+            If `should_replace` is True, then the previous set of phrases will
+            be completely replaced. Otherwise the phrases are added to the
+            already existing ones.
         """
         norm_phrases = set(((self.name_processor.get_normalized(p[0]), p[1], p[2], p[3])
                             for p in phrases))
@@ -306,11 +309,10 @@ class LegacyICUNameAnalyzer:
         with self.conn.cursor() as cur:
             # Get the old phrases.
             existing_phrases = set()
-            cur.execute("""SELECT word, class, type, operator FROM word
-                           WHERE class != 'place'
-                                 OR (type != 'house' AND type != 'postcode')""")
-            for label, cls, typ, oper in cur:
-                existing_phrases.add((label, cls, typ, oper or '-'))
+            cur.execute("SELECT info FROM word WHERE type = 'S'")
+            for (info, ) in cur:
+                existing_phrases.add((info['word'], info['class'], info['type'],
+                                      info.get('op') or '-'))

             added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
             if should_replace:
@@ -333,13 +335,13 @@ class LegacyICUNameAnalyzer:
             for word, cls, typ, oper in to_add:
                 term = self.name_processor.get_search_normalized(word)
                 if term:
-                    copystr.add(word, ' ' + term, cls, typ,
-                                oper if oper in ('in', 'near') else None, 0)
+                    copystr.add(term, 'S',
+                                {'word': word, 'class': cls, 'type': typ,
+                                 'op': oper if oper in ('in', 'near') else None})
                     added += 1

             copystr.copy_out(cursor, 'word',
-                             columns=['word', 'word_token', 'class', 'type',
-                                      'operator', 'search_name_count'])
+                             columns=['word_token', 'type', 'info'])

         return added

@@ -354,9 +356,10 @@ class LegacyICUNameAnalyzer:
         if to_delete:
             cursor.execute_values(
                 """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
-                    WHERE word = name and class = in_class and type = in_type
-                          and ((op = '-' and operator is null) or op = operator)""",
-                to_delete)
+                    WHERE info->>'word' = name
+                          and info->>'class' = in_class and info->>'type' = in_type
+                          and ((op = '-' and info->>'op' is null) or op = info->>'op')
+                """, to_delete)

         return len(to_delete)


From 70f154be8b69d3b57eebd25eff225ee29ccc97ba Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann
Date: Wed, 21 Jul 2021 10:41:38 +0200
Subject: [PATCH 06/14] switch word tokens to new word table layout

---
 lib-php/tokenizer/legacy_icu_tokenizer.php  | 70 +++++----------------
 lib-sql/tokenizer/icu_tokenizer_tables.sql  |  7 ++-
 lib-sql/tokenizer/legacy_icu_tokenizer.sql  | 16 ++---
 nominatim/tokenizer/legacy_icu_tokenizer.py | 51 +++++++--------
 4 files changed, 58 insertions(+), 86 deletions(-)

diff --git a/lib-php/tokenizer/legacy_icu_tokenizer.php b/lib-php/tokenizer/legacy_icu_tokenizer.php
index 70358976..314bd27e 100644
--- a/lib-php/tokenizer/legacy_icu_tokenizer.php
+++ b/lib-php/tokenizer/legacy_icu_tokenizer.php
@@ -148,8 +148,9 @@ class Tokenizer
         // Check which tokens we have, get the ID numbers
         $sSQL = 'SELECT word_id, word_token, type';
         $sSQL .= " info->>'cc' as country, info->>'postcode' as
postcode,"; - $sSQL .= " info->>'word' as word, info->>'op' as operator,"; - $sSQL .= " info->>'class' as class, info->>'type' as type"; + $sSQL .= " info->>'op' as operator,"; + $sSQL .= " info->>'class' as class, info->>'type' as type,"; + $sSQL .= " info->>'count' as count"; $sSQL .= ' FROM word WHERE word_token in ('; $sSQL .= join(',', $this->oDB->getDBQuotedList($aTokens)).')'; @@ -190,8 +191,6 @@ class Tokenizer break; 'S': // tokens for classification terms (special phrases) if ($aWord['class'] === null || $aWord['type'] === null - || $aWord['word'] === null - || strpos($sNormQuery, $aWord['word']) === false ) { continue; } @@ -202,58 +201,23 @@ class Tokenizer $aWord['op'] ? Operator::NEAR : Operator::NONE ); break; + 'W': // full-word tokens + $oToken = new Token\Word( + $iId, + (int) $aWord['count'], + substr_count($aWord['word_token'], ' ') + ); + break; + 'w': // partial word terms + $oToken = new Token\Partial( + $iId, + $aWord['word_token'], + (int) $aWord['count'] + ); + break; default: continue; } -/* if ($aWord['class']) { - // Special terms need to appear in their normalized form. - // (postcodes are not normalized in the word table) - $sNormWord = $this->normalizeString($aWord['word']); - if ($aWord['word'] && strpos($sNormQuery, $sNormWord) === false) { - continue; - } - - if ($aWord['class'] == 'place' && $aWord['type'] == 'house') { - $oToken = new Token\HouseNumber($iId, trim($aWord['word_token'])); - } elseif ($aWord['class'] == 'place' && $aWord['type'] == 'postcode') { - if ($aWord['word'] - && pg_escape_string($aWord['word']) == $aWord['word'] - ) { - $oToken = new Token\Postcode( - $iId, - $aWord['word'], - $aWord['country_code'] - ); - } - } else { - // near and in operator the same at the moment - $oToken = new Token\SpecialTerm( - $iId, - $aWord['class'], - $aWord['type'], - $aWord['operator'] ? Operator::NEAR : Operator::NONE - ); - } - } elseif ($aWord['country_code']) { - // Filter country tokens that do not match restricted countries. - if (!$this->aCountryRestriction - || in_array($aWord['country_code'], $this->aCountryRestriction) - ) { - $oToken = new Token\Country($iId, $aWord['country_code']); - } - } elseif ($aWord['word_token'][0] == ' ') { - $oToken = new Token\Word( - $iId, - (int) $aWord['count'], - substr_count($aWord['word_token'], ' ') - ); - } else { - $oToken = new Token\Partial( - $iId, - $aWord['word_token'], - (int) $aWord['count'] - ); - }*/ $oValidTokens->addToken($aWord['word_token'], $oToken); } diff --git a/lib-sql/tokenizer/icu_tokenizer_tables.sql b/lib-sql/tokenizer/icu_tokenizer_tables.sql index dd3ac477..13b12797 100644 --- a/lib-sql/tokenizer/icu_tokenizer_tables.sql +++ b/lib-sql/tokenizer/icu_tokenizer_tables.sql @@ -15,7 +15,12 @@ CREATE INDEX idx_word_country_names ON word -- Used when inserting new postcodes on updates. CREATE INDEX idx_word_postcodes ON word USING btree((info->>'postcode')) {{db.tablespace.address_index}} - WHERE type = 'P' + WHERE type = 'P'; +-- Used when inserting full words. 
+CREATE INDEX idx_word_full_word ON word + USING btree((info->>'word')) {{db.tablespace.address_index}} + WHERE type = 'W'; + GRANT SELECT ON word TO "{{config.DATABASE_WEBUSER}}"; DROP SEQUENCE IF EXISTS seq_word; diff --git a/lib-sql/tokenizer/legacy_icu_tokenizer.sql b/lib-sql/tokenizer/legacy_icu_tokenizer.sql index e9dcf4bc..f4258f82 100644 --- a/lib-sql/tokenizer/legacy_icu_tokenizer.sql +++ b/lib-sql/tokenizer/legacy_icu_tokenizer.sql @@ -98,12 +98,14 @@ DECLARE term_count INTEGER; BEGIN SELECT min(word_id) INTO full_token - FROM word WHERE word = norm_term and class is null and country_code is null; + FROM word WHERE info->>'word' = norm_term and type = 'W'; IF full_token IS NULL THEN full_token := nextval('seq_word'); - INSERT INTO word (word_id, word_token, word, search_name_count) - SELECT full_token, ' ' || lookup_term, norm_term, 0 FROM unnest(lookup_terms) as lookup_term; + INSERT INTO word (word_id, word_token, info) + SELECT full_token, lookup_term, + json_build_object('word', norm_term, 'count', 0) + FROM unnest(lookup_terms) as lookup_term; END IF; FOR term IN SELECT unnest(string_to_array(unnest(lookup_terms), ' ')) LOOP @@ -115,14 +117,14 @@ BEGIN partial_tokens := '{}'::INT[]; FOR term IN SELECT unnest(partial_terms) LOOP - SELECT min(word_id), max(search_name_count) INTO term_id, term_count - FROM word WHERE word_token = term and class is null and country_code is null; + SELECT min(word_id), max(info->>'count') INTO term_id, term_count + FROM word WHERE word_token = term and type = 'w'; IF term_id IS NULL THEN term_id := nextval('seq_word'); term_count := 0; - INSERT INTO word (word_id, word_token, search_name_count) - VALUES (term_id, term, 0); + INSERT INTO word (word_id, word_token, info) + VALUES (term_id, term, json_build_object('count', term_count)); END IF; IF term_count < {{ max_word_freq }} THEN diff --git a/nominatim/tokenizer/legacy_icu_tokenizer.py b/nominatim/tokenizer/legacy_icu_tokenizer.py index a645b598..14fa5b60 100644 --- a/nominatim/tokenizer/legacy_icu_tokenizer.py +++ b/nominatim/tokenizer/legacy_icu_tokenizer.py @@ -74,13 +74,11 @@ class LegacyICUTokenizer: self.max_word_frequency = get_property(conn, DBCFG_MAXWORDFREQ) - def finalize_import(self, config): + def finalize_import(self, _): """ Do any required postprocessing to make the tokenizer data ready for use. """ - with connect(self.dsn) as conn: - sqlp = SQLPreprocessor(conn, config) - sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql') + pass def update_sql_functions(self, config): @@ -121,18 +119,17 @@ class LegacyICUTokenizer: """ return LegacyICUNameAnalyzer(self.dsn, ICUNameProcessor(self.naming_rules)) - # pylint: disable=missing-format-attribute + def _install_php(self, phpdir): """ Install the php script for the tokenizer. 
""" php_file = self.data_dir / "tokenizer.php" - php_file.write_text(dedent("""\ + php_file.write_text(dedent(f"""\ >'postcode' = any(%s) + WHERE type ='P' and info->>'postcode' = any(%s) """, (to_delete, )) copystr.copy_out(cur, 'word', From 6ad35aca4aadbf436e9dce776134f9ce4d2e69c6 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Wed, 21 Jul 2021 10:52:34 +0200 Subject: [PATCH 07/14] adapt special terms lookup to new word table --- lib-php/tokenizer/legacy_icu_tokenizer.php | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/lib-php/tokenizer/legacy_icu_tokenizer.php b/lib-php/tokenizer/legacy_icu_tokenizer.php index 314bd27e..796635ee 100644 --- a/lib-php/tokenizer/legacy_icu_tokenizer.php +++ b/lib-php/tokenizer/legacy_icu_tokenizer.php @@ -19,7 +19,7 @@ class Tokenizer public function checkStatus() { - $sSQL = "SELECT word_id FROM word WHERE word_token IN (' a')"; + $sSQL = "SELECT word_id FROM word WHERE word_token == 'a'"; $iWordID = $this->oDB->getOne($sSQL); if ($iWordID === false) { throw new Exception('Query failed', 703); @@ -55,9 +55,8 @@ class Tokenizer { $aResults = array(); - $sSQL = 'SELECT word_id, class, type FROM word '; - $sSQL .= ' WHERE word_token = \' \' || :term'; - $sSQL .= ' AND class is not null AND class not in (\'place\')'; + $sSQL = "SELECT word_id, info->>'class' as class, info->>'type' as type "; + $sSQL .= ' FROM word WHERE word_token = :term and type = \'S\''; Debug::printVar('Term', $sTerm); Debug::printSQL($sSQL); From eb6814d74e509fb49986989bb4c60539a2871d76 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Wed, 21 Jul 2021 11:37:14 +0200 Subject: [PATCH 08/14] convert word info column to json before copying --- nominatim/db/utils.py | 1 + nominatim/tokenizer/legacy_icu_tokenizer.py | 9 ++--- test/python/test_db_utils.py | 37 +++++++++++++++++++++ 3 files changed, 43 insertions(+), 4 deletions(-) diff --git a/nominatim/db/utils.py b/nominatim/db/utils.py index 9a4a41a5..bb7faa25 100644 --- a/nominatim/db/utils.py +++ b/nominatim/db/utils.py @@ -65,6 +65,7 @@ _SQL_TRANSLATION = {ord(u'\\'): u'\\\\', ord(u'\t'): u'\\t', ord(u'\n'): u'\\n'} + class CopyBuffer: """ Data collector for the copy_from command. """ diff --git a/nominatim/tokenizer/legacy_icu_tokenizer.py b/nominatim/tokenizer/legacy_icu_tokenizer.py index 14fa5b60..e019ef67 100644 --- a/nominatim/tokenizer/legacy_icu_tokenizer.py +++ b/nominatim/tokenizer/legacy_icu_tokenizer.py @@ -4,6 +4,7 @@ libICU instead of the PostgreSQL module. 
""" from collections import Counter import itertools +import json import logging import re from textwrap import dedent @@ -173,7 +174,7 @@ class LegacyICUTokenizer: # copy them back into the word table with CopyBuffer() as copystr: for k, v in words.items(): - copystr.add('w', k, {'count': v}) + copystr.add('w', k, json.dumps({'count': v})) with conn.cursor() as cur: copystr.copy_out(cur, 'word', @@ -287,7 +288,7 @@ class LegacyICUNameAnalyzer: to_delete.append(word) else: copystr.add(self.name_processor.get_search_normalized(postcode), - 'P', {'postcode': postcode}) + 'P', json.dumps({'postcode': postcode})) if to_delete: cur.execute("""DELETE FROM WORD @@ -337,8 +338,8 @@ class LegacyICUNameAnalyzer: term = self.name_processor.get_search_normalized(word) if term: copystr.add(term, 'S', - {'word': word, 'class': cls, 'type': typ, - 'op': oper if oper in ('in', 'near') else None}) + json.dumps({'word': word, 'class': cls, 'type': typ, + 'op': oper if oper in ('in', 'near') else None})) added += 1 copystr.copy_out(cursor, 'word', diff --git a/test/python/test_db_utils.py b/test/python/test_db_utils.py index 545cc58f..9eea7ed1 100644 --- a/test/python/test_db_utils.py +++ b/test/python/test_db_utils.py @@ -1,6 +1,8 @@ """ Tests for DB utility functions in db.utils """ +import json + import pytest import nominatim.db.utils as db_utils @@ -115,3 +117,38 @@ class TestCopyBuffer: +class TestCopyBufferJson: + TABLE_NAME = 'copytable' + + @pytest.fixture(autouse=True) + def setup_test_table(self, table_factory): + table_factory(self.TABLE_NAME, 'colA INT, colB JSONB') + + + def table_rows(self, cursor): + cursor.execute('SELECT * FROM ' + self.TABLE_NAME) + results = {k: v for k,v in cursor} + + assert len(results) == cursor.rowcount + + return results + + + def test_json_object(self, temp_db_cursor): + with db_utils.CopyBuffer() as buf: + buf.add(1, json.dumps({'test': 'value', 'number': 1})) + + buf.copy_out(temp_db_cursor, self.TABLE_NAME) + + assert self.table_rows(temp_db_cursor) == \ + {1: {'test': 'value', 'number': 1}} + + + def test_json_object_special_chras(self, temp_db_cursor): + with db_utils.CopyBuffer() as buf: + buf.add(1, json.dumps({'te\tst': 'va\nlue', 'nu"mber': None})) + + buf.copy_out(temp_db_cursor, self.TABLE_NAME) + + assert self.table_rows(temp_db_cursor) == \ + {1: {'te\tst': 'va\nlue', 'nu"mber': None}} From e42878eeda111457018684a3f60417c0ed6c5294 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Thu, 22 Jul 2021 17:24:43 +0200 Subject: [PATCH 09/14] adapt unit test for new word table Requires a second wrapper class for the word table with the new layout. This class is interface-compatible, so that later when the ICU tokenizer becomes the default, all tests that depend on behaviour of the default tokenizer can be switched to the other wrapper. 
--- lib-sql/tokenizer/icu_tokenizer_tables.sql | 2 +- lib-sql/tokenizer/legacy_icu_tokenizer.sql | 8 +- nominatim/tokenizer/legacy_icu_tokenizer.py | 22 +++--- test/python/mock_icu_word_table.py | 84 ++++++++++++++++++++ test/python/mock_legacy_word_table.py | 86 +++++++++++++++++++++ test/python/mocks.py | 85 +------------------- test/python/test_tokenizer_legacy_icu.py | 63 ++++++++------- 7 files changed, 225 insertions(+), 125 deletions(-) create mode 100644 test/python/mock_icu_word_table.py create mode 100644 test/python/mock_legacy_word_table.py diff --git a/lib-sql/tokenizer/icu_tokenizer_tables.sql b/lib-sql/tokenizer/icu_tokenizer_tables.sql index 13b12797..912ba768 100644 --- a/lib-sql/tokenizer/icu_tokenizer_tables.sql +++ b/lib-sql/tokenizer/icu_tokenizer_tables.sql @@ -1,5 +1,5 @@ DROP TABLE IF EXISTS word; -CREATE TABLE word_icu ( +CREATE TABLE word ( word_id INTEGER, word_token text NOT NULL, type text NOT NULL, diff --git a/lib-sql/tokenizer/legacy_icu_tokenizer.sql b/lib-sql/tokenizer/legacy_icu_tokenizer.sql index f4258f82..e021bf8b 100644 --- a/lib-sql/tokenizer/legacy_icu_tokenizer.sql +++ b/lib-sql/tokenizer/legacy_icu_tokenizer.sql @@ -102,8 +102,8 @@ BEGIN IF full_token IS NULL THEN full_token := nextval('seq_word'); - INSERT INTO word (word_id, word_token, info) - SELECT full_token, lookup_term, + INSERT INTO word (word_id, word_token, type, info) + SELECT full_token, lookup_term, 'W', json_build_object('word', norm_term, 'count', 0) FROM unnest(lookup_terms) as lookup_term; END IF; @@ -123,8 +123,8 @@ BEGIN IF term_id IS NULL THEN term_id := nextval('seq_word'); term_count := 0; - INSERT INTO word (word_id, word_token, info) - VALUES (term_id, term, json_build_object('count', term_count)); + INSERT INTO word (word_id, word_token, type, info) + VALUES (term_id, term, 'w', json_build_object('count', term_count)); END IF; IF term_count < {{ max_word_freq }} THEN diff --git a/nominatim/tokenizer/legacy_icu_tokenizer.py b/nominatim/tokenizer/legacy_icu_tokenizer.py index e019ef67..3d7d752e 100644 --- a/nominatim/tokenizer/legacy_icu_tokenizer.py +++ b/nominatim/tokenizer/legacy_icu_tokenizer.py @@ -236,17 +236,17 @@ class LegacyICUNameAnalyzer: partial_tokens[word] = self.name_processor.get_search_normalized(word) with self.conn.cursor() as cur: - cur.execute("""(SELECT word_token, word_id - FROM word WHERE word_token = ANY(%s) and type = 'W') - UNION - (SELECT word_token, word_id - FROM word WHERE word_token = ANY(%s) and type = 'w')""", - (list(full_tokens.values()), - list(partial_tokens.values()))) - ids = {r[0]: r[1] for r in cur} + cur.execute("""SELECT word_token, word_id + FROM word WHERE word_token = ANY(%s) and type = 'W' + """, (list(full_tokens.values()),)) + full_ids = {r[0]: r[1] for r in cur} + cur.execute("""SELECT word_token, word_id + FROM word WHERE word_token = ANY(%s) and type = 'w'""", + (list(partial_tokens.values()),)) + part_ids = {r[0]: r[1] for r in cur} - return [(k, v, ids.get(v, None)) for k, v in full_tokens.items()] \ - + [(k, v, ids.get(v, None)) for k, v in partial_tokens.items()] + return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \ + + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()] @staticmethod @@ -508,7 +508,7 @@ class LegacyICUNameAnalyzer: FROM (VALUES (%s)) as v(pc) WHERE NOT EXISTS (SELECT * FROM word - WHERE type = 'P' and info->>postcode = pc)) + WHERE type = 'P' and info->>'postcode' = pc)) """, (term, postcode)) self._cache.postcodes.add(postcode) diff --git 
a/test/python/mock_icu_word_table.py b/test/python/mock_icu_word_table.py
new file mode 100644
index 00000000..3d457d0b
--- /dev/null
+++ b/test/python/mock_icu_word_table.py
@@ -0,0 +1,84 @@
+"""
+ICU word table for testing with functions to prefill and test contents
+of the table.
+"""
+
+class MockIcuWordTable:
+    """ A word table for testing using the new ICU word table structure.
+    """
+    def __init__(self, conn):
+        self.conn = conn
+        with conn.cursor() as cur:
+            cur.execute("""CREATE TABLE word (word_id INTEGER,
+                                              word_token text NOT NULL,
+                                              type text NOT NULL,
+                                              info jsonb)""")
+
+        conn.commit()
+
+    def add_special(self, word_token, word, cls, typ, oper):
+        with self.conn.cursor() as cur:
+            cur.execute("""INSERT INTO word (word_token, type, info)
+                              VALUES (%s, 'S',
+                                      json_build_object('word', %s,
+                                                        'class', %s,
+                                                        'type', %s,
+                                                        'op', %s))
+                        """, (word_token, word, cls, typ, oper))
+        self.conn.commit()
+
+
+    def add_country(self, country_code, word_token):
+        with self.conn.cursor() as cur:
+            cur.execute("""INSERT INTO word (word_token, type, info)
+                           VALUES(%s, 'C', json_build_object('cc', %s))""",
+                        (word_token, country_code))
+        self.conn.commit()
+
+
+    def add_postcode(self, word_token, postcode):
+        with self.conn.cursor() as cur:
+            cur.execute("""INSERT INTO word (word_token, type, info)
+                              VALUES (%s, 'P', json_build_object('postcode', %s))
+                        """, (word_token, postcode))
+        self.conn.commit()
+
+
+    def count(self):
+        with self.conn.cursor() as cur:
+            return cur.scalar("SELECT count(*) FROM word")
+
+
+    def count_special(self):
+        with self.conn.cursor() as cur:
+            return cur.scalar("SELECT count(*) FROM word WHERE type = 'S'")
+
+
+    def get_special(self):
+        with self.conn.cursor() as cur:
+            cur.execute("SELECT word_token, info FROM word WHERE type = 'S'")
+            result = set(((row[0], row[1]['word'], row[1]['class'],
+                           row[1]['type'], row[1]['op']) for row in cur))
+            assert len(result) == cur.rowcount, "Word table has duplicates."
+            return result
+
+
+    def get_country(self):
+        with self.conn.cursor() as cur:
+            cur.execute("SELECT info->>'cc', word_token FROM word WHERE type = 'C'")
+            result = set((tuple(row) for row in cur))
+            assert len(result) == cur.rowcount, "Word table has duplicates."
+            return result
+
+
+    def get_postcodes(self):
+        with self.conn.cursor() as cur:
+            cur.execute("SELECT info->>'postcode' FROM word WHERE type = 'P'")
+            return set((row[0] for row in cur))
+
+
+    def get_partial_words(self):
+        with self.conn.cursor() as cur:
+            cur.execute("SELECT word_token, info FROM word WHERE type ='w'")
+            return set(((row[0], row[1]['count']) for row in cur))
+
diff --git a/test/python/mock_legacy_word_table.py b/test/python/mock_legacy_word_table.py
new file mode 100644
index 00000000..8baf3adc
--- /dev/null
+++ b/test/python/mock_legacy_word_table.py
@@ -0,0 +1,86 @@
+"""
+Legacy word table for testing with functions to prefill and test contents
+of the table.
+"""
+
+class MockLegacyWordTable:
+    """ A word table for testing using the legacy word table structure.
+ """ + def __init__(self, conn): + self.conn = conn + with conn.cursor() as cur: + cur.execute("""CREATE TABLE word (word_id INTEGER, + word_token text, + word text, + class text, + type text, + country_code varchar(2), + search_name_count INTEGER, + operator TEXT)""") + + conn.commit() + + def add_special(self, word_token, word, cls, typ, oper): + with self.conn.cursor() as cur: + cur.execute("""INSERT INTO word (word_token, word, class, type, operator) + VALUES (%s, %s, %s, %s, %s) + """, (word_token, word, cls, typ, oper)) + self.conn.commit() + + + def add_country(self, country_code, word_token): + with self.conn.cursor() as cur: + cur.execute("INSERT INTO word (word_token, country_code) VALUES(%s, %s)", + (word_token, country_code)) + self.conn.commit() + + + def add_postcode(self, word_token, postcode): + with self.conn.cursor() as cur: + cur.execute("""INSERT INTO word (word_token, word, class, type) + VALUES (%s, %s, 'place', 'postcode') + """, (word_token, postcode)) + self.conn.commit() + + + def count(self): + with self.conn.cursor() as cur: + return cur.scalar("SELECT count(*) FROM word") + + + def count_special(self): + with self.conn.cursor() as cur: + return cur.scalar("SELECT count(*) FROM word WHERE class != 'place'") + + + def get_special(self): + with self.conn.cursor() as cur: + cur.execute("""SELECT word_token, word, class, type, operator + FROM word WHERE class != 'place'""") + result = set((tuple(row) for row in cur)) + assert len(result) == cur.rowcount, "Word table has duplicates." + return result + + + def get_country(self): + with self.conn.cursor() as cur: + cur.execute("""SELECT country_code, word_token + FROM word WHERE country_code is not null""") + result = set((tuple(row) for row in cur)) + assert len(result) == cur.rowcount, "Word table has duplicates." + return result + + + def get_postcodes(self): + with self.conn.cursor() as cur: + cur.execute("""SELECT word FROM word + WHERE class = 'place' and type = 'postcode'""") + return set((row[0] for row in cur)) + + def get_partial_words(self): + with self.conn.cursor() as cur: + cur.execute("""SELECT word_token, search_name_count FROM word + WHERE class is null and country_code is null + and not word_token like ' %'""") + return set((tuple(row) for row in cur)) + diff --git a/test/python/mocks.py b/test/python/mocks.py index f9faaa93..7f7aaafc 100644 --- a/test/python/mocks.py +++ b/test/python/mocks.py @@ -7,6 +7,9 @@ import psycopg2.extras from nominatim.db import properties +# This must always point to the mock word table for the default tokenizer. +from mock_legacy_word_table import MockLegacyWordTable as MockWordTable + class MockParamCapture: """ Mock that records the parameters with which a function was called as well as the number of calls. @@ -24,88 +27,6 @@ class MockParamCapture: return self.return_value -class MockWordTable: - """ A word table for testing. 
- """ - def __init__(self, conn): - self.conn = conn - with conn.cursor() as cur: - cur.execute("""CREATE TABLE word (word_id INTEGER, - word_token text, - word text, - class text, - type text, - country_code varchar(2), - search_name_count INTEGER, - operator TEXT)""") - - conn.commit() - - def add_special(self, word_token, word, cls, typ, oper): - with self.conn.cursor() as cur: - cur.execute("""INSERT INTO word (word_token, word, class, type, operator) - VALUES (%s, %s, %s, %s, %s) - """, (word_token, word, cls, typ, oper)) - self.conn.commit() - - - def add_country(self, country_code, word_token): - with self.conn.cursor() as cur: - cur.execute("INSERT INTO word (word_token, country_code) VALUES(%s, %s)", - (word_token, country_code)) - self.conn.commit() - - - def add_postcode(self, word_token, postcode): - with self.conn.cursor() as cur: - cur.execute("""INSERT INTO word (word_token, word, class, type) - VALUES (%s, %s, 'place', 'postcode') - """, (word_token, postcode)) - self.conn.commit() - - - def count(self): - with self.conn.cursor() as cur: - return cur.scalar("SELECT count(*) FROM word") - - - def count_special(self): - with self.conn.cursor() as cur: - return cur.scalar("SELECT count(*) FROM word WHERE class != 'place'") - - - def get_special(self): - with self.conn.cursor() as cur: - cur.execute("""SELECT word_token, word, class, type, operator - FROM word WHERE class != 'place'""") - result = set((tuple(row) for row in cur)) - assert len(result) == cur.rowcount, "Word table has duplicates." - return result - - - def get_country(self): - with self.conn.cursor() as cur: - cur.execute("""SELECT country_code, word_token - FROM word WHERE country_code is not null""") - result = set((tuple(row) for row in cur)) - assert len(result) == cur.rowcount, "Word table has duplicates." - return result - - - def get_postcodes(self): - with self.conn.cursor() as cur: - cur.execute("""SELECT word FROM word - WHERE class = 'place' and type = 'postcode'""") - return set((row[0] for row in cur)) - - def get_partial_words(self): - with self.conn.cursor() as cur: - cur.execute("""SELECT word_token, search_name_count FROM word - WHERE class is null and country_code is null - and not word_token like ' %'""") - return set((tuple(row) for row in cur)) - - class MockPlacexTable: """ A placex table for testing. 
""" diff --git a/test/python/test_tokenizer_legacy_icu.py b/test/python/test_tokenizer_legacy_icu.py index 39fc9fb4..ed489662 100644 --- a/test/python/test_tokenizer_legacy_icu.py +++ b/test/python/test_tokenizer_legacy_icu.py @@ -11,6 +11,12 @@ from nominatim.tokenizer.icu_name_processor import ICUNameProcessorRules from nominatim.tokenizer.icu_rule_loader import ICURuleLoader from nominatim.db import properties +from mock_icu_word_table import MockIcuWordTable + +@pytest.fixture +def word_table(temp_db_conn): + return MockIcuWordTable(temp_db_conn) + @pytest.fixture def test_config(def_config, tmp_path): @@ -21,8 +27,8 @@ def test_config(def_config, tmp_path): sqldir.mkdir() (sqldir / 'tokenizer').mkdir() (sqldir / 'tokenizer' / 'legacy_icu_tokenizer.sql').write_text("SELECT 'a'") - shutil.copy(str(def_config.lib_dir.sql / 'tokenizer' / 'legacy_tokenizer_tables.sql'), - str(sqldir / 'tokenizer' / 'legacy_tokenizer_tables.sql')) + shutil.copy(str(def_config.lib_dir.sql / 'tokenizer' / 'icu_tokenizer_tables.sql'), + str(sqldir / 'tokenizer' / 'icu_tokenizer_tables.sql')) def_config.lib_dir.sql = sqldir @@ -88,12 +94,14 @@ DECLARE term_count INTEGER; BEGIN SELECT min(word_id) INTO full_token - FROM word WHERE word = norm_term and class is null and country_code is null; + FROM word WHERE info->>'word' = norm_term and type = 'W'; IF full_token IS NULL THEN full_token := nextval('seq_word'); - INSERT INTO word (word_id, word_token, word, search_name_count) - SELECT full_token, ' ' || lookup_term, norm_term, 0 FROM unnest(lookup_terms) as lookup_term; + INSERT INTO word (word_id, word_token, type, info) + SELECT full_token, lookup_term, 'W', + json_build_object('word', norm_term, 'count', 0) + FROM unnest(lookup_terms) as lookup_term; END IF; FOR term IN SELECT unnest(string_to_array(unnest(lookup_terms), ' ')) LOOP @@ -105,18 +113,18 @@ BEGIN partial_tokens := '{}'::INT[]; FOR term IN SELECT unnest(partial_terms) LOOP - SELECT min(word_id), max(search_name_count) INTO term_id, term_count - FROM word WHERE word_token = term and class is null and country_code is null; + SELECT min(word_id), max(info->>'count') INTO term_id, term_count + FROM word WHERE word_token = term and type = 'w'; IF term_id IS NULL THEN term_id := nextval('seq_word'); term_count := 0; - INSERT INTO word (word_id, word_token, search_name_count) - VALUES (term_id, term, 0); + INSERT INTO word (word_id, word_token, type, info) + VALUES (term_id, term, 'w', json_build_object('count', term_count)); END IF; IF NOT (ARRAY[term_id] <@ partial_tokens) THEN - partial_tokens := partial_tokens || term_id; + partial_tokens := partial_tokens || term_id; END IF; END LOOP; END; @@ -232,14 +240,14 @@ def test_update_special_phrase_empty_table(analyzer, word_table): ], True) assert word_table.get_special() \ - == {(' KÖNIG BEI', 'König bei', 'amenity', 'royal', 'near'), - (' KÖNIGE', 'Könige', 'amenity', 'royal', None), - (' STREET', 'street', 'highway', 'primary', 'in')} + == {('KÖNIG BEI', 'König bei', 'amenity', 'royal', 'near'), + ('KÖNIGE', 'Könige', 'amenity', 'royal', None), + ('STREET', 'street', 'highway', 'primary', 'in')} def test_update_special_phrase_delete_all(analyzer, word_table): - word_table.add_special(' FOO', 'foo', 'amenity', 'prison', 'in') - word_table.add_special(' BAR', 'bar', 'highway', 'road', None) + word_table.add_special('FOO', 'foo', 'amenity', 'prison', 'in') + word_table.add_special('BAR', 'bar', 'highway', 'road', None) assert word_table.count_special() == 2 @@ -250,8 +258,8 @@ def 
test_update_special_phrase_delete_all(analyzer, word_table): def test_update_special_phrases_no_replace(analyzer, word_table): - word_table.add_special(' FOO', 'foo', 'amenity', 'prison', 'in') - word_table.add_special(' BAR', 'bar', 'highway', 'road', None) + word_table.add_special('FOO', 'foo', 'amenity', 'prison', 'in') + word_table.add_special('BAR', 'bar', 'highway', 'road', None) assert word_table.count_special() == 2 @@ -262,8 +270,8 @@ def test_update_special_phrases_no_replace(analyzer, word_table): def test_update_special_phrase_modify(analyzer, word_table): - word_table.add_special(' FOO', 'foo', 'amenity', 'prison', 'in') - word_table.add_special(' BAR', 'bar', 'highway', 'road', None) + word_table.add_special('FOO', 'foo', 'amenity', 'prison', 'in') + word_table.add_special('BAR', 'bar', 'highway', 'road', None) assert word_table.count_special() == 2 @@ -275,25 +283,25 @@ def test_update_special_phrase_modify(analyzer, word_table): ], True) assert word_table.get_special() \ - == {(' PRISON', 'prison', 'amenity', 'prison', 'in'), - (' BAR', 'bar', 'highway', 'road', None), - (' GARDEN', 'garden', 'leisure', 'garden', 'near')} + == {('PRISON', 'prison', 'amenity', 'prison', 'in'), + ('BAR', 'bar', 'highway', 'road', None), + ('GARDEN', 'garden', 'leisure', 'garden', 'near')} def test_add_country_names_new(analyzer, word_table): with analyzer() as anl: anl.add_country_names('es', {'name': 'Espagña', 'name:en': 'Spain'}) - assert word_table.get_country() == {('es', ' ESPAGÑA'), ('es', ' SPAIN')} + assert word_table.get_country() == {('es', 'ESPAGÑA'), ('es', 'SPAIN')} def test_add_country_names_extend(analyzer, word_table): - word_table.add_country('ch', ' SCHWEIZ') + word_table.add_country('ch', 'SCHWEIZ') with analyzer() as anl: anl.add_country_names('ch', {'name': 'Schweiz', 'name:fr': 'Suisse'}) - assert word_table.get_country() == {('ch', ' SCHWEIZ'), ('ch', ' SUISSE')} + assert word_table.get_country() == {('ch', 'SCHWEIZ'), ('ch', 'SUISSE')} class TestPlaceNames: @@ -307,6 +315,7 @@ class TestPlaceNames: def expect_name_terms(self, info, *expected_terms): tokens = self.analyzer.get_word_token_info(expected_terms) + print (tokens) for token in tokens: assert token[2] is not None, "No token for {0}".format(token) @@ -316,7 +325,7 @@ class TestPlaceNames: def test_simple_names(self): info = self.analyzer.process_place({'name': {'name': 'Soft bAr', 'ref': '34'}}) - self.expect_name_terms(info, '#Soft bAr', '#34','Soft', 'bAr', '34') + self.expect_name_terms(info, '#Soft bAr', '#34', 'Soft', 'bAr', '34') @pytest.mark.parametrize('sep', [',' , ';']) @@ -339,7 +348,7 @@ class TestPlaceNames: 'country_feature': 'no'}) self.expect_name_terms(info, '#norge', 'norge') - assert word_table.get_country() == {('no', ' NORGE')} + assert word_table.get_country() == {('no', 'NORGE')} class TestPlaceAddress: From 324b1b5575ce1793d90cdb9837230f76acd8169e Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Sat, 24 Jul 2021 12:12:31 +0200 Subject: [PATCH 10/14] bdd tests: do not query word table directly The BDD tests cannot make assumptions about the structure of the word table anymore because it depends on the tokenizer. Use more abstract descriptions instead that ask for specific kinds of tokens. 
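
A scenario can now state its expectation independently of the table
layout, for example (as used in the postcode features below):

    And there are word tokens for postcodes 01982,4567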
--- lib-php/tokenizer/legacy_icu_tokenizer.php | 72 ++++++++++------------ test/bdd/db/import/postcodes.feature | 4 +- test/bdd/db/update/postcode.feature | 29 +++------ test/bdd/steps/steps_db_ops.py | 33 ++++++++++ 4 files changed, 74 insertions(+), 64 deletions(-) diff --git a/lib-php/tokenizer/legacy_icu_tokenizer.php b/lib-php/tokenizer/legacy_icu_tokenizer.php index 796635ee..9bd9828c 100644 --- a/lib-php/tokenizer/legacy_icu_tokenizer.php +++ b/lib-php/tokenizer/legacy_icu_tokenizer.php @@ -19,7 +19,7 @@ class Tokenizer public function checkStatus() { - $sSQL = "SELECT word_id FROM word WHERE word_token == 'a'"; + $sSQL = "SELECT word_id FROM word limit 1"; $iWordID = $this->oDB->getOne($sSQL); if ($iWordID === false) { throw new Exception('Query failed', 703); @@ -145,10 +145,10 @@ class Tokenizer private function addTokensFromDB(&$oValidTokens, $aTokens, $sNormQuery) { // Check which tokens we have, get the ID numbers - $sSQL = 'SELECT word_id, word_token, type'; + $sSQL = 'SELECT word_id, word_token, type,'; $sSQL .= " info->>'cc' as country, info->>'postcode' as postcode,"; $sSQL .= " info->>'op' as operator,"; - $sSQL .= " info->>'class' as class, info->>'type' as type,"; + $sSQL .= " info->>'class' as class, info->>'type' as ctype,"; $sSQL .= " info->>'count' as count"; $sSQL .= ' FROM word WHERE word_token in ('; $sSQL .= join(',', $this->oDB->getDBQuotedList($aTokens)).')'; @@ -159,66 +159,60 @@ class Tokenizer foreach ($aDBWords as $aWord) { $iId = (int) $aWord['word_id']; + $sTok = $aWord['word_token']; switch ($aWord['type']) { - 'C': // country name tokens - if ($aWord['country'] === null - || ($this->aCountryRestriction - && !in_array($aWord['country'], $this->aCountryRestriction)) + case 'C': // country name tokens + if ($aWord['country'] !== null + && (!$this->aCountryRestriction + || in_array($aWord['country'], $this->aCountryRestriction)) ) { - continue; + $oValidTokens->addToken($sTok, new Token\Country($iId, $aWord['country'])); } - $oToken = new Token\Country($iId, $aWord['country']) break; - 'H': // house number tokens - $oToken = new Token\HouseNumber($iId, $aWord['word_token']); + case 'H': // house number tokens + $oValidTokens->addToken($sTok, new Token\HouseNumber($iId, $aWord['word_token'])); break; - 'P': // postcode tokens + case 'P': // postcode tokens // Postcodes are not normalized, so they may have content // that makes SQL injection possible. Reject postcodes // that would need special escaping. - if ($aWord['postcode'] === null - || pg_escape_string($aWord['postcode']) == $aWord['postcode'] + if ($aWord['postcode'] !== null + && pg_escape_string($aWord['postcode']) == $aWord['postcode'] ) { - continue; + $sNormPostcode = $this->normalizeString($aWord['postcode']); + if (strpos($sNormQuery, $sNormPostcode) !== false) { + $oValidTokens->addToken($sTok, new Token\Postcode($iId, $aWord['postcode'], null)); + } } - $sNormPostcode = $this->normalizeString($aWord['postcode']); - if (strpos($sNormQuery, $sNormPostcode) === false) { - continue; - } - $oToken = new Token\Postcode($iId, $aWord['postcode'], null); break; - 'S': // tokens for classification terms (special phrases) - if ($aWord['class'] === null || $aWord['type'] === null - ) { - continue; + case 'S': // tokens for classification terms (special phrases) + if ($aWord['class'] !== null && $aWord['ctype'] !== null) { + $oValidTokens->addToken($sTok, new Token\SpecialTerm( + $iId, + $aWord['class'], + $aWord['ctype'], + (isset($aWord['op'])) ? 
Operator::NEAR : Operator::NONE + )); } - $oToken = new Token\SpecialTerm( - $iId, - $aWord['class'], - $aWord['type'], - $aWord['op'] ? Operator::NEAR : Operator::NONE - ); break; - 'W': // full-word tokens - $oToken = new Token\Word( + case 'W': // full-word tokens + $oValidTokens->addToken($sTok, new Token\Word( $iId, (int) $aWord['count'], substr_count($aWord['word_token'], ' ') - ); + )); break; - 'w': // partial word terms - $oToken = new Token\Partial( + case 'w': // partial word terms + $oValidTokens->addToken($sTok, new Token\Partial( $iId, $aWord['word_token'], (int) $aWord['count'] - ); + )); break; default: - continue; + break; } - - $oValidTokens->addToken($aWord['word_token'], $oToken); } } diff --git a/test/bdd/db/import/postcodes.feature b/test/bdd/db/import/postcodes.feature index 6102e99b..4c839db0 100644 --- a/test/bdd/db/import/postcodes.feature +++ b/test/bdd/db/import/postcodes.feature @@ -134,9 +134,7 @@ Feature: Import of postcodes Then location_postcode contains exactly | country | postcode | geometry | | de | 01982 | country:de | - And word contains - | word | class | type | - | 01982 | place | postcode | + And there are word tokens for postcodes 01982 Scenario: Different postcodes with the same normalization can both be found Given the places diff --git a/test/bdd/db/update/postcode.feature b/test/bdd/db/update/postcode.feature index 94550ffd..c2fb30ce 100644 --- a/test/bdd/db/update/postcode.feature +++ b/test/bdd/db/update/postcode.feature @@ -18,10 +18,7 @@ Feature: Update of postcode | country | postcode | geometry | | de | 01982 | country:de | | ch | 4567 | country:ch | - And word contains - | word | class | type | - | 01982 | place | postcode | - | 4567 | place | postcode | + And there are word tokens for postcodes 01982,4567 Scenario: When the last postcode is deleted, it is deleted from postcode and word Given the places @@ -34,12 +31,8 @@ Feature: Update of postcode Then location_postcode contains exactly | country | postcode | geometry | | ch | 4567 | country:ch | - And word contains not - | word | class | type | - | 01982 | place | postcode | - And word contains - | word | class | type | - | 4567 | place | postcode | + And there are word tokens for postcodes 4567 + And there are no word tokens for postcodes 01982 Scenario: A postcode is not deleted from postcode and word when it exist in another country Given the places @@ -52,9 +45,7 @@ Feature: Update of postcode Then location_postcode contains exactly | country | postcode | geometry | | ch | 01982 | country:ch | - And word contains - | word | class | type | - | 01982 | place | postcode | + And there are word tokens for postcodes 01982 Scenario: Updating a postcode is reflected in postcode table Given the places @@ -68,9 +59,7 @@ Feature: Update of postcode Then location_postcode contains exactly | country | postcode | geometry | | de | 20453 | country:de | - And word contains - | word | class | type | - | 20453 | place | postcode | + And there are word tokens for postcodes 20453 Scenario: When changing from a postcode type, the entry appears in placex When importing @@ -91,9 +80,7 @@ Feature: Update of postcode Then location_postcode contains exactly | country | postcode | geometry | | de | 20453 | country:de | - And word contains - | word | class | type | - | 20453 | place | postcode | + And there are word tokens for postcodes 20453 Scenario: When changing to a postcode type, the entry disappears from placex When importing @@ -114,6 +101,4 @@ Feature: Update of postcode Then location_postcode 
contains exactly
           | country | postcode | geometry |
           | de      | 01982    | country:de |
-        And word contains
-          | word  | class | type     |
-          | 01982 | place | postcode |
+        And there are word tokens for postcodes 01982
diff --git a/test/bdd/steps/steps_db_ops.py b/test/bdd/steps/steps_db_ops.py
index b4f0d853..be2789f3 100644
--- a/test/bdd/steps/steps_db_ops.py
+++ b/test/bdd/steps/steps_db_ops.py
@@ -281,6 +281,39 @@ def check_word_table(context, exclude):
         else:
             assert cur.rowcount > 0, "Row not in word table: %s" % '/'.join(values)

+
+@then("there are(?P<exclude> no)? word tokens for postcodes (?P<postcodes>.*)")
+def check_word_table_for_postcodes(context, exclude, postcodes):
+    """ Check that the tokenizer produces postcode tokens for the given
+        postcodes. The postcodes are a comma-separated list of postcodes.
+        Whitespace matters.
+    """
+    nctx = context.nominatim
+    tokenizer = tokenizer_factory.get_tokenizer_for_db(nctx.get_test_config())
+    with tokenizer.name_analyzer() as ana:
+        plist = [ana.normalize_postcode(p) for p in postcodes.split(',')]
+
+    plist.sort()
+
+    with context.db.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
+        if nctx.tokenizer == 'legacy_icu':
+            cur.execute("""SELECT info->>'postcode' FROM word
+                           WHERE type = 'P' and info->>'postcode' = any(%s)""",
+                        (plist,))
+        else:
+            cur.execute("""SELECT word FROM word WHERE word = any(%s)
+                           and class = 'place' and type = 'postcode'""",
+                        (plist,))
+
+        found = [row[0] for row in cur]
+        assert len(found) == len(set(found)), f"Duplicate rows for postcodes: {found}"
+
+    if exclude:
+        assert len(found) == 0, f"Unexpected postcodes: {found}"
+    else:
+        assert set(found) == set(plist), \
+            f"Missing postcodes {set(plist) - set(found)}. Found: {found}"
+
 @then("place_addressline contains")
 def check_place_addressline(context):
     """ Check the contents of the place_addressline table. Each row represents

From 1db098c05d54e50e1682747d446ef92f5ed0f9f6 Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann
Date: Sun, 25 Jul 2021 15:08:11 +0200
Subject: [PATCH 11/14] reinstate word column in icu word table

PostgreSQL is very bad at creating statistics for jsonb columns. The
result is that the query planner tends to use JIT for queries with a
where clause over 'info' even when there is an index.
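
With the dedicated text column back, the partial indexes go over plain
text again and a typical lookup takes the form (sketch)

    SELECT word_id FROM word WHERE word = '01982' and type = 'P';

for which the planner has proper column statistics.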
---
 lib-php/tokenizer/legacy_icu_tokenizer.php  | 25 +++++++++------
 lib-sql/tokenizer/icu_tokenizer_tables.sql  |  7 +++--
 lib-sql/tokenizer/legacy_icu_tokenizer.sql  |  8 ++---
 nominatim/tokenizer/legacy_icu_tokenizer.py | 35 ++++++++++-----------
 test/bdd/steps/steps_db_ops.py              | 19 +----------
 test/python/mock_icu_word_table.py          | 24 +++++++-------
 6 files changed, 53 insertions(+), 65 deletions(-)

diff --git a/lib-php/tokenizer/legacy_icu_tokenizer.php b/lib-php/tokenizer/legacy_icu_tokenizer.php
index 9bd9828c..7a900e5a 100644
--- a/lib-php/tokenizer/legacy_icu_tokenizer.php
+++ b/lib-php/tokenizer/legacy_icu_tokenizer.php
@@ -145,8 +145,7 @@ class Tokenizer
     private function addTokensFromDB(&$oValidTokens, $aTokens, $sNormQuery)
     {
         // Check which tokens we have, get the ID numbers
-        $sSQL = 'SELECT word_id, word_token, type,';
-        $sSQL .= " info->>'cc' as country, info->>'postcode' as postcode,";
+        $sSQL = 'SELECT word_id, word_token, type, word,';
         $sSQL .= " info->>'op' as operator,";
         $sSQL .= " info->>'class' as class, info->>'type' as ctype,";
         $sSQL .= " info->>'count' as count";
@@ -163,11 +162,14 @@
                 case 'C': // country name tokens
-                    if ($aWord['country'] !== null
+                    if ($aWord['word'] !== null
                         && (!$this->aCountryRestriction
-                            || in_array($aWord['country'], $this->aCountryRestriction))
+                            || in_array($aWord['word'], $this->aCountryRestriction))
                     ) {
-                        $oValidTokens->addToken($sTok, new Token\Country($iId, $aWord['country']));
+                        $oValidTokens->addToken(
+                            $sTok,
+                            new Token\Country($iId, $aWord['word'])
+                        );
                     }
                     break;
                 case 'H': // house number tokens
@@ -177,12 +179,15 @@
                     // Postcodes are not normalized, so they may have content
                     // that makes SQL injection possible. Reject postcodes
                     // that would need special escaping.
-                    if ($aWord['postcode'] !== null
-                        && pg_escape_string($aWord['postcode']) == $aWord['postcode']
+                    if ($aWord['word'] !== null
+                        && pg_escape_string($aWord['word']) == $aWord['word']
                     ) {
-                        $sNormPostcode = $this->normalizeString($aWord['postcode']);
+                        $sNormPostcode = $this->normalizeString($aWord['word']);
                         if (strpos($sNormQuery, $sNormPostcode) !== false) {
-                            $oValidTokens->addToken($sTok, new Token\Postcode($iId, $aWord['postcode'], null));
+                            $oValidTokens->addToken(
+                                $sTok,
+                                new Token\Postcode($iId, $aWord['word'], null)
+                            );
                         }
                     }
                     break;
@@ -192,7 +197,7 @@
                             $iId,
                             $aWord['class'],
                             $aWord['ctype'],
-                            (isset($aWord['op'])) ? Operator::NEAR : Operator::NONE
+                            (isset($aWord['operator'])) ? Operator::NEAR : Operator::NONE
                         ));
                     }
                     break;
diff --git a/lib-sql/tokenizer/icu_tokenizer_tables.sql b/lib-sql/tokenizer/icu_tokenizer_tables.sql
index 912ba768..7ec3c6f8 100644
--- a/lib-sql/tokenizer/icu_tokenizer_tables.sql
+++ b/lib-sql/tokenizer/icu_tokenizer_tables.sql
@@ -3,6 +3,7 @@ CREATE TABLE word (
   word_id INTEGER,
   word_token text NOT NULL,
   type text NOT NULL,
+  word text,
   info jsonb
 ) {{db.tablespace.search_data}};
 
@@ -10,15 +11,15 @@ CREATE INDEX idx_word_word_token ON word
     USING BTREE (word_token) {{db.tablespace.search_index}};
 -- Used when updating country names from the boundary relation.
 CREATE INDEX idx_word_country_names ON word
-    USING btree((info->>'cc')) {{db.tablespace.address_index}}
+    USING btree(word) {{db.tablespace.address_index}}
     WHERE type = 'C';
 -- Used when inserting new postcodes on updates.
 CREATE INDEX idx_word_postcodes ON word
-    USING btree((info->>'postcode')) {{db.tablespace.address_index}}
+    USING btree(word) {{db.tablespace.address_index}}
     WHERE type = 'P';
 -- Used when inserting full words.
 CREATE INDEX idx_word_full_word ON word
-    USING btree((info->>'word')) {{db.tablespace.address_index}}
+    USING btree(word) {{db.tablespace.address_index}}
     WHERE type = 'W';
 
 GRANT SELECT ON word TO "{{config.DATABASE_WEBUSER}}";
diff --git a/lib-sql/tokenizer/legacy_icu_tokenizer.sql b/lib-sql/tokenizer/legacy_icu_tokenizer.sql
index e021bf8b..ffe6648c 100644
--- a/lib-sql/tokenizer/legacy_icu_tokenizer.sql
+++ b/lib-sql/tokenizer/legacy_icu_tokenizer.sql
@@ -98,13 +98,13 @@ DECLARE
   term_count INTEGER;
 BEGIN
   SELECT min(word_id) INTO full_token
-    FROM word WHERE info->>'word' = norm_term and type = 'W';
+    FROM word WHERE word = norm_term and type = 'W';
 
   IF full_token IS NULL THEN
     full_token := nextval('seq_word');
-    INSERT INTO word (word_id, word_token, type, info)
-      SELECT full_token, lookup_term, 'W',
-             json_build_object('word', norm_term, 'count', 0)
+    INSERT INTO word (word_id, word_token, type, word, info)
+      SELECT full_token, lookup_term, 'W', norm_term,
+             json_build_object('count', 0)
         FROM unnest(lookup_terms) as lookup_term;
   END IF;
diff --git a/nominatim/tokenizer/legacy_icu_tokenizer.py b/nominatim/tokenizer/legacy_icu_tokenizer.py
index 3d7d752e..1f8096ff 100644
--- a/nominatim/tokenizer/legacy_icu_tokenizer.py
+++ b/nominatim/tokenizer/legacy_icu_tokenizer.py
@@ -278,7 +278,7 @@ class LegacyICUNameAnalyzer:
                             (SELECT pc, word FROM
                               (SELECT distinct(postcode) as pc FROM location_postcode) p
                               FULL JOIN
-                              (SELECT info->>'postcode' as word FROM word WHERE type = 'P') w
+                              (SELECT word FROM word WHERE type = 'P') w
                               ON pc = word) x
                            WHERE pc is null or word is null""")
 
@@ -288,15 +288,15 @@
                         to_delete.append(word)
                     else:
                         copystr.add(self.name_processor.get_search_normalized(postcode),
-                                    'P', json.dumps({'postcode': postcode}))
+                                    'P', postcode)
 
                 if to_delete:
                     cur.execute("""DELETE FROM WORD
-                                   WHERE type ='P' and info->>'postcode' = any(%s)
+                                   WHERE type ='P' and word = any(%s)
                                 """, (to_delete, ))
 
                 copystr.copy_out(cur, 'word',
-                                 columns=['word_token', 'type', 'info'])
+                                 columns=['word_token', 'type', 'word'])
 
 
     def update_special_phrases(self, phrases, should_replace):
@@ -311,9 +311,9 @@
         with self.conn.cursor() as cur:
             # Get the old phrases.
             existing_phrases = set()
-            cur.execute("SELECT info FROM word WHERE type = 'S'")
-            for (info, ) in cur:
-                existing_phrases.add((info['word'], info['class'], info['type'],
+            cur.execute("SELECT word, info FROM word WHERE type = 'S'")
+            for word, info in cur:
+                existing_phrases.add((word, info['class'], info['type'],
                                       info.get('op') or '-'))
 
             added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
@@ -337,13 +337,13 @@
             for word, cls, typ, oper in to_add:
                 term = self.name_processor.get_search_normalized(word)
                 if term:
-                    copystr.add(term, 'S',
-                                json.dumps({'word': word, 'class': cls, 'type': typ,
+                    copystr.add(term, 'S', word,
+                                json.dumps({'class': cls, 'type': typ,
                                             'op': oper if oper in ('in', 'near') else None}))
                     added += 1
 
         copystr.copy_out(cursor, 'word',
-                         columns=['word_token', 'type', 'info'])
+                         columns=['word_token', 'type', 'word', 'info'])
 
         return added
 
@@ -358,7 +358,7 @@
         if to_delete:
             cursor.execute_values(
                 """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
-                    WHERE info->>'word' = name
+                    WHERE type = 'S' and word = name
                           and info->>'class' = in_class and info->>'type' = in_type
                           and ((op = '-' and info->>'op' is null) or op = info->>'op')
                 """, to_delete)
@@ -378,14 +378,14 @@
         with self.conn.cursor() as cur:
             # Get existing names
             cur.execute("""SELECT word_token FROM word
-                           WHERE type = 'C' and info->>'cc'= %s""",
+                           WHERE type = 'C' and word = %s""",
                         (country_code, ))
             word_tokens.difference_update((t[0] for t in cur))
 
             # Only add those names that are not yet in the list.
             if word_tokens:
-                cur.execute("""INSERT INTO word (word_token, type, info)
-                               (SELECT token, 'C', json_build_object('cc', %s)
+                cur.execute("""INSERT INTO word (word_token, type, word)
+                               (SELECT token, 'C', %s
                                 FROM unnest(%s) as token)
                             """, (country_code, list(word_tokens)))
 
@@ -503,12 +503,11 @@
         with self.conn.cursor() as cur:
             # no word_id needed for postcodes
-            cur.execute("""INSERT INTO word (word_token, type, info)
-                           (SELECT %s, 'P', json_build_object('postcode', pc)
-                            FROM (VALUES (%s)) as v(pc)
+            cur.execute("""INSERT INTO word (word_token, type, word)
+                           (SELECT %s, 'P', pc FROM (VALUES (%s)) as v(pc)
                             WHERE NOT EXISTS
                                   (SELECT * FROM word
-                                   WHERE type = 'P' and info->>'postcode' = pc))
+                                   WHERE type = 'P' and word = pc))
                        """, (term, postcode))
             self._cache.postcodes.add(postcode)
diff --git a/test/bdd/steps/steps_db_ops.py b/test/bdd/steps/steps_db_ops.py
index be2789f3..ac61fc67 100644
--- a/test/bdd/steps/steps_db_ops.py
+++ b/test/bdd/steps/steps_db_ops.py
@@ -266,22 +266,6 @@ def check_location_postcode(context):
             db_row.assert_row(row, ('country', 'postcode'))
 
-@then("word contains(?P<exclude> not)?")
-def check_word_table(context, exclude):
-    """ Check the contents of the word table. Each row represents a table row
-        and all data must match. Data not present in the expected table, may
-        be arbitry. The rows are identified via all given columns.
-    """
-    with context.db.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
-        for row in context.table:
-            wheres = ' AND '.join(["{} = %s".format(h) for h in row.headings])
-            cur.execute("SELECT * from word WHERE " + wheres, list(row.cells))
-            if exclude:
-                assert cur.rowcount == 0, "Row still in word table: %s" % '/'.join(values)
-            else:
-                assert cur.rowcount > 0, "Row not in word table: %s" % '/'.join(values)
-
-
 @then("there are(?P<exclude> no)? word tokens for postcodes (?P<postcodes>.*)")
 def check_word_table_for_postcodes(context, exclude, postcodes):
     """ Check that the tokenizer produces postcode tokens for the given
@@ -297,8 +281,7 @@ def check_word_table_for_postcodes(context, exclude, postcodes):
 
     with context.db.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
         if nctx.tokenizer == 'legacy_icu':
-            cur.execute("""SELECT info->>'postcode' FROM word
-                           WHERE type = 'P' and info->>'postcode' = any(%s)""",
+            cur.execute("SELECT word FROM word WHERE type = 'P' and word = any(%s)",
                         (plist,))
         else:
             cur.execute("""SELECT word FROM word WHERE word = any(%s)
diff --git a/test/python/mock_icu_word_table.py b/test/python/mock_icu_word_table.py
index 3d457d0b..cde5e770 100644
--- a/test/python/mock_icu_word_table.py
+++ b/test/python/mock_icu_word_table.py
@@ -12,16 +12,16 @@ class MockIcuWordTable:
             cur.execute("""CREATE TABLE word (word_id INTEGER,
                                               word_token text NOT NULL,
                                               type text NOT NULL,
+                                              word text,
                                               info jsonb)""")
         conn.commit()
 
     def add_special(self, word_token, word, cls, typ, oper):
         with self.conn.cursor() as cur:
-            cur.execute("""INSERT INTO word (word_token, type, info)
-                           VALUES (%s, 'S',
-                                   json_build_object('word', %s,
-                                                     'class', %s,
+            cur.execute("""INSERT INTO word (word_token, type, word, info)
+                           VALUES (%s, 'S', %s,
+                                   json_build_object('class', %s,
                                                      'type', %s,
                                                      'op', %s))
                         """, (word_token, word, cls, typ, oper))
@@ -30,16 +30,16 @@
 
     def add_country(self, country_code, word_token):
         with self.conn.cursor() as cur:
-            cur.execute("""INSERT INTO word (word_token, type, info)
-                           VALUES(%s, 'C', json_build_object('cc', %s))""",
+            cur.execute("""INSERT INTO word (word_token, type, word)
+                           VALUES(%s, 'C', %s)""",
                         (word_token, country_code))
         self.conn.commit()
 
 
     def add_postcode(self, word_token, postcode):
         with self.conn.cursor() as cur:
-            cur.execute("""INSERT INTO word (word_token, type, info)
-                           VALUES (%s, 'P', json_build_object('postcode', %s))
+            cur.execute("""INSERT INTO word (word_token, type, word)
+                           VALUES (%s, 'P', %s)
                         """, (word_token, postcode))
         self.conn.commit()
 
@@ -56,8 +56,8 @@
     def get_special(self):
         with self.conn.cursor() as cur:
-            cur.execute("SELECT word_token, info FROM word WHERE type = 'S'")
-            result = set(((row[0], row[1]['word'], row[1]['class'],
+            cur.execute("SELECT word_token, info, word FROM word WHERE type = 'S'")
+            result = set(((row[0], row[2], row[1]['class'],
                            row[1]['type'], row[1]['op']) for row in cur))
             assert len(result) == cur.rowcount, "Word table has duplicates."
             return result
 
@@ -65,7 +65,7 @@
     def get_country(self):
         with self.conn.cursor() as cur:
-            cur.execute("SELECT info->>'cc', word_token FROM word WHERE type = 'C'")
+            cur.execute("SELECT word, word_token FROM word WHERE type = 'C'")
             result = set((tuple(row) for row in cur))
             assert len(result) == cur.rowcount, "Word table has duplicates."
             return result
 
@@ -73,7 +73,7 @@
 
     def get_postcodes(self):
         with self.conn.cursor() as cur:
-            cur.execute("SELECT info->>'postcode' FROM word WHERE type = 'P'")
+            cur.execute("SELECT word FROM word WHERE type = 'P'")
             return set((row[0] for row in cur))

From 001b2aa9f9e3476d3b70a1a2a408ea4f0a7786fc Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann
Date: Sun, 25 Jul 2021 15:13:49 +0200
Subject: [PATCH 12/14] fix linting issues in PHP

---
 lib-php/tokenizer/legacy_icu_tokenizer.php | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib-php/tokenizer/legacy_icu_tokenizer.php b/lib-php/tokenizer/legacy_icu_tokenizer.php
index 7a900e5a..ef79769c 100644
--- a/lib-php/tokenizer/legacy_icu_tokenizer.php
+++ b/lib-php/tokenizer/legacy_icu_tokenizer.php
@@ -19,7 +19,7 @@ class Tokenizer
 
     public function checkStatus()
     {
-        $sSQL = "SELECT word_id FROM word limit 1";
+        $sSQL = 'SELECT word_id FROM word limit 1';
         $iWordID = $this->oDB->getOne($sSQL);
         if ($iWordID === false) {
             throw new Exception('Query failed', 703);

From d48793c22cd2625d5390364dfb0ec04a2cc8d0f9 Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann
Date: Sun, 25 Jul 2021 15:30:47 +0200
Subject: [PATCH 13/14] fix Python linting errors

---
 nominatim/tokenizer/legacy_icu_tokenizer.py | 41 ++++++++++++---------
 1 file changed, 24 insertions(+), 17 deletions(-)

diff --git a/nominatim/tokenizer/legacy_icu_tokenizer.py b/nominatim/tokenizer/legacy_icu_tokenizer.py
index 1f8096ff..a887ae28 100644
--- a/nominatim/tokenizer/legacy_icu_tokenizer.py
+++ b/nominatim/tokenizer/legacy_icu_tokenizer.py
@@ -79,7 +79,6 @@ class LegacyICUTokenizer:
         """ Do any required postprocessing to make the tokenizer data ready
            for use.
         """
-        pass
 
 
     def update_sql_functions(self, config):
@@ -156,25 +155,12 @@
             LOG.warning("Precomputing word tokens")
 
             # get partial words and their frequencies
-            words = Counter()
-            name_proc = ICUNameProcessor(self.naming_rules)
-            with conn.cursor(name="words") as cur:
-                cur.execute(""" SELECT v, count(*) FROM
-                                  (SELECT svals(name) as v FROM place)x
-                                WHERE length(v) < 75 GROUP BY v""")
-
-                for name, cnt in cur:
-                    terms = set()
-                    for word in name_proc.get_variants_ascii(name_proc.get_normalized(name)):
-                        if ' ' in word:
-                            terms.update(word.split())
-                    for term in terms:
-                        words[term] += cnt
+            words = self._count_partial_terms(conn)
 
             # copy them back into the word table
             with CopyBuffer() as copystr:
-                for k, v in words.items():
-                    copystr.add('w', k, json.dumps({'count': v}))
+                for term, cnt in words.items():
+                    copystr.add('w', term, json.dumps({'count': cnt}))
 
                 with conn.cursor() as cur:
                     copystr.copy_out(cur, 'word',
@@ -184,6 +170,27 @@
             conn.commit()
 
 
+    def _count_partial_terms(self, conn):
+        """ Count the partial terms from the names in the place table.
+        """
+        words = Counter()
+        name_proc = ICUNameProcessor(self.naming_rules)
+
+        with conn.cursor(name="words") as cur:
+            cur.execute(""" SELECT v, count(*) FROM
+                              (SELECT svals(name) as v FROM place)x
+                            WHERE length(v) < 75 GROUP BY v""")
+
+            for name, cnt in cur:
+                terms = set()
+                for word in name_proc.get_variants_ascii(name_proc.get_normalized(name)):
+                    if ' ' in word:
+                        terms.update(word.split())
+                for term in terms:
+                    words[term] += cnt
+
+        return words
+
 
 class LegacyICUNameAnalyzer:
     """ The legacy analyzer uses the ICU library for splitting names.
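Taken together, the patches above leave the word table with the searchable
key in the plain word column and only auxiliary attributes in info. As a
rough sketch of the resulting rows, with all values invented for
illustration (real rows are written by the tokenizer, never by hand):

    INSERT INTO word (word_id, word_token, type, word, info) VALUES
      (1, 'germany',     'C', 'de',          NULL),              -- country name
      (2, '01982',       'P', '01982',       NULL),              -- postcode
      (3, 'main street', 'W', 'main street', '{"count": 10}'),   -- full word
      (4, 'main',        'w', NULL,          '{"count": 23}'),   -- partial word
      (5, 'bar',         'S', 'bar',
          '{"class": "amenity", "type": "bar", "op": "in"}');    -- special phrase

Country, postcode and full-word lookups can then go through the partial
indexes on word from patch 11, while class, type and operator of special
phrases stay in the json blob.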
From fdff5791884da75cabe9e9393f1ac8a6021ab464 Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann
Date: Sun, 25 Jul 2021 16:29:04 +0200
Subject: [PATCH 14/14] php: force use of global Exception class

---
 lib-php/init-website.php                   | 6 +++---
 lib-php/tokenizer/legacy_icu_tokenizer.php | 4 ++--
 lib-php/tokenizer/legacy_tokenizer.php     | 8 ++++----
 lib-php/website/details.php                | 4 ++--
 4 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/lib-php/init-website.php b/lib-php/init-website.php
index d6cc8a24..6f3b5545 100644
--- a/lib-php/init-website.php
+++ b/lib-php/init-website.php
@@ -12,7 +12,7 @@ require_once(CONST_Debug ? 'DebugHtml.php' : 'DebugNone.php');
 
 function userError($sMsg)
 {
-    throw new Exception($sMsg, 400);
+    throw new \Exception($sMsg, 400);
 }
 
@@ -37,7 +37,7 @@
 function shutdown_exception_handler_xml()
 {
     $error = error_get_last();
     if ($error !== null && $error['type'] === E_ERROR) {
-        exception_handler_xml(new Exception($error['message'], 500));
+        exception_handler_xml(new \Exception($error['message'], 500));
     }
 }
 
@@ -45,7 +45,7 @@
 function shutdown_exception_handler_json()
 {
     $error = error_get_last();
     if ($error !== null && $error['type'] === E_ERROR) {
-        exception_handler_json(new Exception($error['message'], 500));
+        exception_handler_json(new \Exception($error['message'], 500));
     }
 }
 
diff --git a/lib-php/tokenizer/legacy_icu_tokenizer.php b/lib-php/tokenizer/legacy_icu_tokenizer.php
index ef79769c..3751e821 100644
--- a/lib-php/tokenizer/legacy_icu_tokenizer.php
+++ b/lib-php/tokenizer/legacy_icu_tokenizer.php
@@ -22,10 +22,10 @@ class Tokenizer
         $sSQL = 'SELECT word_id FROM word limit 1';
         $iWordID = $this->oDB->getOne($sSQL);
         if ($iWordID === false) {
-            throw new Exception('Query failed', 703);
+            throw new \Exception('Query failed', 703);
         }
         if (!$iWordID) {
-            throw new Exception('No value', 704);
+            throw new \Exception('No value', 704);
         }
     }
 
diff --git a/lib-php/tokenizer/legacy_tokenizer.php b/lib-php/tokenizer/legacy_tokenizer.php
index 064b4166..570b8828 100644
--- a/lib-php/tokenizer/legacy_tokenizer.php
+++ b/lib-php/tokenizer/legacy_tokenizer.php
@@ -19,20 +19,20 @@ class Tokenizer
     {
         $sStandardWord = $this->oDB->getOne("SELECT make_standard_name('a')");
         if ($sStandardWord === false) {
-            throw new Exception('Module failed', 701);
+            throw new \Exception('Module failed', 701);
         }
 
         if ($sStandardWord != 'a') {
-            throw new Exception('Module call failed', 702);
+            throw new \Exception('Module call failed', 702);
         }
 
         $sSQL = "SELECT word_id FROM word WHERE word_token IN (' a')";
         $iWordID = $this->oDB->getOne($sSQL);
         if ($iWordID === false) {
-            throw new Exception('Query failed', 703);
+            throw new \Exception('Query failed', 703);
         }
         if (!$iWordID) {
-            throw new Exception('No value', 704);
+            throw new \Exception('No value', 704);
         }
     }
 
diff --git a/lib-php/website/details.php b/lib-php/website/details.php
index c16725e2..0d67ec83 100644
--- a/lib-php/website/details.php
+++ b/lib-php/website/details.php
@@ -83,7 +83,7 @@ if ($sOsmType && $iOsmId > 0) {
     }
 
     if ($sPlaceId === false) {
-        throw new Exception('No place with that OSM ID found.', 404);
+        throw new \Exception('No place with that OSM ID found.', 404);
     }
 } else {
     if ($sPlaceId === false) {
@@ -146,7 +146,7 @@ $sSQL .= " WHERE place_id = $iPlaceID";
 $aPointDetails = $oDB->getRow($sSQL, null, 'Could not get details of place object.');
 
 if (!$aPointDetails) {
-    throw new Exception('No place with that place ID found.', 404);
+    throw new \Exception('No place with that place ID found.', 404);
 }
 
 $aPointDetails['localname'] = $aPointDetails['localname']?$aPointDetails['localname']:$aPointDetails['housenumber'];
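For context on this last patch: the lib-php sources are declared inside a
PHP namespace, and an unqualified class name in a namespace resolves
within that namespace first. 'new Exception(...)' would therefore look
for a namespaced Exception class and abort with a fatal class-not-found
error instead of throwing. A self-contained sketch of the behaviour, with
the namespace name chosen for illustration:

    <?php
    namespace Nominatim;

    // Without the leading backslash this line would reference
    // Nominatim\Exception, which does not exist; the backslash forces
    // resolution to the global PHP Exception class.
    try {
        throw new \Exception('explicitly the global Exception class', 400);
    } catch (\Exception $e) {
        echo $e->getCode() . ': ' . $e->getMessage() . "\n";
    }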