move special hack for US states to legacy tokenizer

The hack for IL, AL and LA is only needed because these abbreviations
are removed by the legacy tokenizer as stop words. There is no need
to keep the hack for future tokenizers. Therefore, move it into the
legacy tokenizer's token extraction function.
This commit is contained in:
Sarah Hoffmann
2021-08-17 14:28:55 +02:00
parent 5f2b9e317a
commit f00b8dd1c3
3 changed files with 19 additions and 8 deletions

View File

@@ -87,6 +87,23 @@ class Tokenizer
$sNormQuery .= ','.$this->normalizeString($oPhrase->getPhrase());
$sSQL .= 'make_standard_name(:' .$iPhrase.') as p'.$iPhrase.',';
$aParams[':'.$iPhrase] = $oPhrase->getPhrase();
// Conflicts between US state abbreviations and various words
// for 'the' in different languages
switch (strtolower($oPhrase->getPhrase())) {
case 'il':
$aParams[':'.$iPhrase] = 'illinois';
break;
case 'al':
$aParams[':'.$iPhrase] = 'alabama';
break;
case 'la':
$aParams[':'.$iPhrase] = 'louisiana';
break;
default:
$aParams[':'.$iPhrase] = $oPhrase->getPhrase();
break;
}
}
$sSQL = substr($sSQL, 0, -1);