diff --git a/lib-php/tokenizer/legacy_icu_tokenizer.php b/lib-php/tokenizer/legacy_icu_tokenizer.php index 796635ee..9bd9828c 100644 --- a/lib-php/tokenizer/legacy_icu_tokenizer.php +++ b/lib-php/tokenizer/legacy_icu_tokenizer.php @@ -19,7 +19,7 @@ class Tokenizer public function checkStatus() { - $sSQL = "SELECT word_id FROM word WHERE word_token == 'a'"; + $sSQL = "SELECT word_id FROM word limit 1"; $iWordID = $this->oDB->getOne($sSQL); if ($iWordID === false) { throw new Exception('Query failed', 703); @@ -145,10 +145,10 @@ class Tokenizer private function addTokensFromDB(&$oValidTokens, $aTokens, $sNormQuery) { // Check which tokens we have, get the ID numbers - $sSQL = 'SELECT word_id, word_token, type'; + $sSQL = 'SELECT word_id, word_token, type,'; $sSQL .= " info->>'cc' as country, info->>'postcode' as postcode,"; $sSQL .= " info->>'op' as operator,"; - $sSQL .= " info->>'class' as class, info->>'type' as type,"; + $sSQL .= " info->>'class' as class, info->>'type' as ctype,"; $sSQL .= " info->>'count' as count"; $sSQL .= ' FROM word WHERE word_token in ('; $sSQL .= join(',', $this->oDB->getDBQuotedList($aTokens)).')'; @@ -159,66 +159,60 @@ class Tokenizer foreach ($aDBWords as $aWord) { $iId = (int) $aWord['word_id']; + $sTok = $aWord['word_token']; switch ($aWord['type']) { - 'C': // country name tokens - if ($aWord['country'] === null - || ($this->aCountryRestriction - && !in_array($aWord['country'], $this->aCountryRestriction)) + case 'C': // country name tokens + if ($aWord['country'] !== null + && (!$this->aCountryRestriction + || in_array($aWord['country'], $this->aCountryRestriction)) ) { - continue; + $oValidTokens->addToken($sTok, new Token\Country($iId, $aWord['country'])); } - $oToken = new Token\Country($iId, $aWord['country']) break; - 'H': // house number tokens - $oToken = new Token\HouseNumber($iId, $aWord['word_token']); + case 'H': // house number tokens + $oValidTokens->addToken($sTok, new Token\HouseNumber($iId, $aWord['word_token'])); 
break; - 'P': // postcode tokens + case 'P': // postcode tokens // Postcodes are not normalized, so they may have content // that makes SQL injection possible. Reject postcodes // that would need special escaping. - if ($aWord['postcode'] === null - || pg_escape_string($aWord['postcode']) == $aWord['postcode'] + if ($aWord['postcode'] !== null + && pg_escape_string($aWord['postcode']) == $aWord['postcode'] ) { - continue; + $sNormPostcode = $this->normalizeString($aWord['postcode']); + if (strpos($sNormQuery, $sNormPostcode) !== false) { + $oValidTokens->addToken($sTok, new Token\Postcode($iId, $aWord['postcode'], null)); + } } - $sNormPostcode = $this->normalizeString($aWord['postcode']); - if (strpos($sNormQuery, $sNormPostcode) === false) { - continue; - } - $oToken = new Token\Postcode($iId, $aWord['postcode'], null); break; - 'S': // tokens for classification terms (special phrases) - if ($aWord['class'] === null || $aWord['type'] === null - ) { - continue; + case 'S': // tokens for classification terms (special phrases) + if ($aWord['class'] !== null && $aWord['ctype'] !== null) { + $oValidTokens->addToken($sTok, new Token\SpecialTerm( + $iId, + $aWord['class'], + $aWord['ctype'], + (isset($aWord['op'])) ? Operator::NEAR : Operator::NONE + )); } - $oToken = new Token\SpecialTerm( - $iId, - $aWord['class'], - $aWord['type'], - $aWord['op'] ? 
Operator::NEAR : Operator::NONE - ); break; - 'W': // full-word tokens - $oToken = new Token\Word( + case 'W': // full-word tokens + $oValidTokens->addToken($sTok, new Token\Word( $iId, (int) $aWord['count'], substr_count($aWord['word_token'], ' ') - ); + )); break; - 'w': // partial word terms - $oToken = new Token\Partial( + case 'w': // partial word terms + $oValidTokens->addToken($sTok, new Token\Partial( $iId, $aWord['word_token'], (int) $aWord['count'] - ); + )); break; default: - continue; + break; } - - $oValidTokens->addToken($aWord['word_token'], $oToken); } } diff --git a/test/bdd/db/import/postcodes.feature b/test/bdd/db/import/postcodes.feature index 6102e99b..4c839db0 100644 --- a/test/bdd/db/import/postcodes.feature +++ b/test/bdd/db/import/postcodes.feature @@ -134,9 +134,7 @@ Feature: Import of postcodes Then location_postcode contains exactly | country | postcode | geometry | | de | 01982 | country:de | - And word contains - | word | class | type | - | 01982 | place | postcode | + And there are word tokens for postcodes 01982 Scenario: Different postcodes with the same normalization can both be found Given the places diff --git a/test/bdd/db/update/postcode.feature b/test/bdd/db/update/postcode.feature index 94550ffd..c2fb30ce 100644 --- a/test/bdd/db/update/postcode.feature +++ b/test/bdd/db/update/postcode.feature @@ -18,10 +18,7 @@ Feature: Update of postcode | country | postcode | geometry | | de | 01982 | country:de | | ch | 4567 | country:ch | - And word contains - | word | class | type | - | 01982 | place | postcode | - | 4567 | place | postcode | + And there are word tokens for postcodes 01982,4567 Scenario: When the last postcode is deleted, it is deleted from postcode and word Given the places @@ -34,12 +31,8 @@ Feature: Update of postcode Then location_postcode contains exactly | country | postcode | geometry | | ch | 4567 | country:ch | - And word contains not - | word | class | type | - | 01982 | place | postcode | - And word contains 
- | word | class | type | - | 4567 | place | postcode | + And there are word tokens for postcodes 4567 + And there are no word tokens for postcodes 01982 Scenario: A postcode is not deleted from postcode and word when it exist in another country Given the places @@ -52,9 +45,7 @@ Feature: Update of postcode Then location_postcode contains exactly | country | postcode | geometry | | ch | 01982 | country:ch | - And word contains - | word | class | type | - | 01982 | place | postcode | + And there are word tokens for postcodes 01982 Scenario: Updating a postcode is reflected in postcode table Given the places @@ -68,9 +59,7 @@ Feature: Update of postcode Then location_postcode contains exactly | country | postcode | geometry | | de | 20453 | country:de | - And word contains - | word | class | type | - | 20453 | place | postcode | + And there are word tokens for postcodes 20453 Scenario: When changing from a postcode type, the entry appears in placex When importing @@ -91,9 +80,7 @@ Feature: Update of postcode Then location_postcode contains exactly | country | postcode | geometry | | de | 20453 | country:de | - And word contains - | word | class | type | - | 20453 | place | postcode | + And there are word tokens for postcodes 20453 Scenario: When changing to a postcode type, the entry disappears from placex When importing @@ -114,6 +101,4 @@ Feature: Update of postcode Then location_postcode contains exactly | country | postcode | geometry | | de | 01982 | country:de | - And word contains - | word | class | type | - | 01982 | place | postcode | + And there are word tokens for postcodes 01982 diff --git a/test/bdd/steps/steps_db_ops.py b/test/bdd/steps/steps_db_ops.py index b4f0d853..be2789f3 100644 --- a/test/bdd/steps/steps_db_ops.py +++ b/test/bdd/steps/steps_db_ops.py @@ -281,6 +281,39 @@ def check_word_table(context, exclude): else: assert cur.rowcount > 0, "Row not in word table: %s" % '/'.join(values) + +@then("there are(?P<exclude> no)?
word tokens for postcodes (?P<postcodes>.*)") +def check_word_table_for_postcodes(context, exclude, postcodes): + """ Check that the tokenizer produces postcode tokens for the given + postcodes. The postcodes are a comma-separated list of postcodes. + Whitespace matters. + """ + nctx = context.nominatim + tokenizer = tokenizer_factory.get_tokenizer_for_db(nctx.get_test_config()) + with tokenizer.name_analyzer() as ana: + plist = [ana.normalize_postcode(p) for p in postcodes.split(',')] + + plist.sort() + + with context.db.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur: + if nctx.tokenizer == 'legacy_icu': + cur.execute("""SELECT info->>'postcode' FROM word + WHERE type = 'P' and info->>'postcode' = any(%s)""", + (plist,)) + else: + cur.execute("""SELECT word FROM word WHERE word = any(%s) + and class = 'place' and type = 'postcode'""", + (plist,)) + + found = [row[0] for row in cur] + assert len(found) == len(set(found)), f"Duplicate rows for postcodes: {found}" + + if exclude: + assert len(found) == 0, f"Unexpected postcodes: {found}" + else: + assert set(found) == set(plist), \ + f"Missing postcodes {set(plist) - set(found)}. Found: {found}" + @then("place_addressline contains") def check_place_addressline(context): """ Check the contents of the place_addressline table. Each row represents