bdd tests: do not query word table directly

The BDD tests cannot make assumptions about the structure of the
word table anymore because it depends on the tokenizer. Use more
abstract descriptions instead that ask for specific kinds of
tokens.
This commit is contained in:
Sarah Hoffmann
2021-07-24 12:12:31 +02:00
parent e42878eeda
commit 324b1b5575
4 changed files with 74 additions and 64 deletions

View File

@@ -19,7 +19,7 @@ class Tokenizer
public function checkStatus() public function checkStatus()
{ {
$sSQL = "SELECT word_id FROM word WHERE word_token == 'a'"; $sSQL = "SELECT word_id FROM word limit 1";
$iWordID = $this->oDB->getOne($sSQL); $iWordID = $this->oDB->getOne($sSQL);
if ($iWordID === false) { if ($iWordID === false) {
throw new Exception('Query failed', 703); throw new Exception('Query failed', 703);
@@ -145,10 +145,10 @@ class Tokenizer
private function addTokensFromDB(&$oValidTokens, $aTokens, $sNormQuery) private function addTokensFromDB(&$oValidTokens, $aTokens, $sNormQuery)
{ {
// Check which tokens we have, get the ID numbers // Check which tokens we have, get the ID numbers
$sSQL = 'SELECT word_id, word_token, type'; $sSQL = 'SELECT word_id, word_token, type,';
$sSQL .= " info->>'cc' as country, info->>'postcode' as postcode,"; $sSQL .= " info->>'cc' as country, info->>'postcode' as postcode,";
$sSQL .= " info->>'op' as operator,"; $sSQL .= " info->>'op' as operator,";
$sSQL .= " info->>'class' as class, info->>'type' as type,"; $sSQL .= " info->>'class' as class, info->>'type' as ctype,";
$sSQL .= " info->>'count' as count"; $sSQL .= " info->>'count' as count";
$sSQL .= ' FROM word WHERE word_token in ('; $sSQL .= ' FROM word WHERE word_token in (';
$sSQL .= join(',', $this->oDB->getDBQuotedList($aTokens)).')'; $sSQL .= join(',', $this->oDB->getDBQuotedList($aTokens)).')';
@@ -159,66 +159,60 @@ class Tokenizer
foreach ($aDBWords as $aWord) { foreach ($aDBWords as $aWord) {
$iId = (int) $aWord['word_id']; $iId = (int) $aWord['word_id'];
$sTok = $aWord['word_token'];
switch ($aWord['type']) { switch ($aWord['type']) {
'C': // country name tokens case 'C': // country name tokens
if ($aWord['country'] === null if ($aWord['country'] !== null
|| ($this->aCountryRestriction && (!$this->aCountryRestriction
&& !in_array($aWord['country'], $this->aCountryRestriction)) || in_array($aWord['country'], $this->aCountryRestriction))
) { ) {
continue; $oValidTokens->addToken($sTok, new Token\Country($iId, $aWord['country']));
} }
$oToken = new Token\Country($iId, $aWord['country'])
break; break;
'H': // house number tokens case 'H': // house number tokens
$oToken = new Token\HouseNumber($iId, $aWord['word_token']); $oValidTokens->addToken($sTok, new Token\HouseNumber($iId, $aWord['word_token']));
break; break;
'P': // postcode tokens case 'P': // postcode tokens
// Postcodes are not normalized, so they may have content // Postcodes are not normalized, so they may have content
// that makes SQL injection possible. Reject postcodes // that makes SQL injection possible. Reject postcodes
// that would need special escaping. // that would need special escaping.
if ($aWord['postcode'] === null if ($aWord['postcode'] !== null
|| pg_escape_string($aWord['postcode']) == $aWord['postcode'] && pg_escape_string($aWord['postcode']) == $aWord['postcode']
) { ) {
continue; $sNormPostcode = $this->normalizeString($aWord['postcode']);
if (strpos($sNormQuery, $sNormPostcode) !== false) {
$oValidTokens->addToken($sTok, new Token\Postcode($iId, $aWord['postcode'], null));
}
} }
$sNormPostcode = $this->normalizeString($aWord['postcode']);
if (strpos($sNormQuery, $sNormPostcode) === false) {
continue;
}
$oToken = new Token\Postcode($iId, $aWord['postcode'], null);
break; break;
'S': // tokens for classification terms (special phrases) case 'S': // tokens for classification terms (special phrases)
if ($aWord['class'] === null || $aWord['type'] === null if ($aWord['class'] !== null && $aWord['ctype'] !== null) {
) { $oValidTokens->addToken($sTok, new Token\SpecialTerm(
continue; $iId,
$aWord['class'],
$aWord['ctype'],
(isset($aWord['op'])) ? Operator::NEAR : Operator::NONE
));
} }
$oToken = new Token\SpecialTerm(
$iId,
$aWord['class'],
$aWord['type'],
$aWord['op'] ? Operator::NEAR : Operator::NONE
);
break; break;
'W': // full-word tokens case 'W': // full-word tokens
$oToken = new Token\Word( $oValidTokens->addToken($sTok, new Token\Word(
$iId, $iId,
(int) $aWord['count'], (int) $aWord['count'],
substr_count($aWord['word_token'], ' ') substr_count($aWord['word_token'], ' ')
); ));
break; break;
'w': // partial word terms case 'w': // partial word terms
$oToken = new Token\Partial( $oValidTokens->addToken($sTok, new Token\Partial(
$iId, $iId,
$aWord['word_token'], $aWord['word_token'],
(int) $aWord['count'] (int) $aWord['count']
); ));
break; break;
default: default:
continue; break;
} }
$oValidTokens->addToken($aWord['word_token'], $oToken);
} }
} }

View File

@@ -134,9 +134,7 @@ Feature: Import of postcodes
Then location_postcode contains exactly Then location_postcode contains exactly
| country | postcode | geometry | | country | postcode | geometry |
| de | 01982 | country:de | | de | 01982 | country:de |
And word contains And there are word tokens for postcodes 01982
| word | class | type |
| 01982 | place | postcode |
Scenario: Different postcodes with the same normalization can both be found Scenario: Different postcodes with the same normalization can both be found
Given the places Given the places

View File

@@ -18,10 +18,7 @@ Feature: Update of postcode
| country | postcode | geometry | | country | postcode | geometry |
| de | 01982 | country:de | | de | 01982 | country:de |
| ch | 4567 | country:ch | | ch | 4567 | country:ch |
And word contains And there are word tokens for postcodes 01982,4567
| word | class | type |
| 01982 | place | postcode |
| 4567 | place | postcode |
Scenario: When the last postcode is deleted, it is deleted from postcode and word Scenario: When the last postcode is deleted, it is deleted from postcode and word
Given the places Given the places
@@ -34,12 +31,8 @@ Feature: Update of postcode
Then location_postcode contains exactly Then location_postcode contains exactly
| country | postcode | geometry | | country | postcode | geometry |
| ch | 4567 | country:ch | | ch | 4567 | country:ch |
And word contains not And there are word tokens for postcodes 4567
| word | class | type | And there are no word tokens for postcodes 01982
| 01982 | place | postcode |
And word contains
| word | class | type |
| 4567 | place | postcode |
Scenario: A postcode is not deleted from postcode and word when it exists in another country Scenario: A postcode is not deleted from postcode and word when it exists in another country
Given the places Given the places
@@ -52,9 +45,7 @@ Feature: Update of postcode
Then location_postcode contains exactly Then location_postcode contains exactly
| country | postcode | geometry | | country | postcode | geometry |
| ch | 01982 | country:ch | | ch | 01982 | country:ch |
And word contains And there are word tokens for postcodes 01982
| word | class | type |
| 01982 | place | postcode |
Scenario: Updating a postcode is reflected in postcode table Scenario: Updating a postcode is reflected in postcode table
Given the places Given the places
@@ -68,9 +59,7 @@ Feature: Update of postcode
Then location_postcode contains exactly Then location_postcode contains exactly
| country | postcode | geometry | | country | postcode | geometry |
| de | 20453 | country:de | | de | 20453 | country:de |
And word contains And there are word tokens for postcodes 20453
| word | class | type |
| 20453 | place | postcode |
Scenario: When changing from a postcode type, the entry appears in placex Scenario: When changing from a postcode type, the entry appears in placex
When importing When importing
@@ -91,9 +80,7 @@ Feature: Update of postcode
Then location_postcode contains exactly Then location_postcode contains exactly
| country | postcode | geometry | | country | postcode | geometry |
| de | 20453 | country:de | | de | 20453 | country:de |
And word contains And there are word tokens for postcodes 20453
| word | class | type |
| 20453 | place | postcode |
Scenario: When changing to a postcode type, the entry disappears from placex Scenario: When changing to a postcode type, the entry disappears from placex
When importing When importing
@@ -114,6 +101,4 @@ Feature: Update of postcode
Then location_postcode contains exactly Then location_postcode contains exactly
| country | postcode | geometry | | country | postcode | geometry |
| de | 01982 | country:de | | de | 01982 | country:de |
And word contains And there are word tokens for postcodes 01982
| word | class | type |
| 01982 | place | postcode |

View File

@@ -281,6 +281,39 @@ def check_word_table(context, exclude):
else: else:
assert cur.rowcount > 0, "Row not in word table: %s" % '/'.join(values) assert cur.rowcount > 0, "Row not in word table: %s" % '/'.join(values)
@then("there are(?P<exclude> no)? word tokens for postcodes (?P<postcodes>.*)")
def check_word_table_for_postcodes(context, exclude, postcodes):
    """ Check that the tokenizer produces postcode tokens for the given
        postcodes. The postcodes are a comma-separated list of postcodes.
        Whitespace matters.
    """
    nominatim_env = context.nominatim
    tokenizer = tokenizer_factory.get_tokenizer_for_db(nominatim_env.get_test_config())

    # Normalize the expected postcodes exactly the way the tokenizer would.
    with tokenizer.name_analyzer() as analyzer:
        expected = sorted(analyzer.normalize_postcode(p) for p in postcodes.split(','))

    # The word-table layout depends on the tokenizer in use, so choose the
    # matching query before hitting the database.
    if nominatim_env.tokenizer == 'legacy_icu':
        sql = """SELECT info->>'postcode' FROM word
                      WHERE type = 'P' and info->>'postcode' = any(%s)"""
    else:
        sql = """SELECT word FROM word WHERE word = any(%s)
                     and class = 'place' and type = 'postcode'"""

    with context.db.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
        cur.execute(sql, (expected,))
        found = [row[0] for row in cur]
        assert len(found) == len(set(found)), f"Duplicate rows for postcodes: {found}"

    if exclude:
        assert len(found) == 0, f"Unexpected postcodes: {found}"
    else:
        assert set(found) == set(expected), \
            f"Missing postcodes {set(expected) - set(found)}. Found: {found}"
@then("place_addressline contains") @then("place_addressline contains")
def check_place_addressline(context): def check_place_addressline(context):
""" Check the contents of the place_addressline table. Each row represents """ Check the contents of the place_addressline table. Each row represents