php: make word list a first-class object

This separates the logic of creating word sets from the Phrase
class. A tokenizer may now derive the word sets any way it
likes. The SimpleWordList class provides a standard implementation
for splitting phrases on spaces.
This commit is contained in:
Sarah Hoffmann
2021-08-12 11:09:46 +02:00
parent 0fb8eade13
commit 1147b83b22
6 changed files with 265 additions and 273 deletions

View File

@@ -2,6 +2,8 @@
namespace Nominatim;
require_once(CONST_LibDir.'/SimpleWordList.php');
class Tokenizer
{
private $oDB;
@@ -99,13 +101,14 @@ class Tokenizer
$aWordLists = array();
$aTokens = array();
foreach ($aNormPhrases as $sPhrase) {
if (strlen($sPhrase) > 0) {
$aWords = explode(' ', $sPhrase);
Tokenizer::addTokens($aTokens, $aWords);
$aWordLists[] = $aWords;
} else {
$aWordLists[] = array();
$oWordList = new SimpleWordList($sPhrase);
foreach ($oWordList->getTokens() as $sToken) {
$aTokens[' '.$sToken] = ' '.$sToken;
$aTokens[$sToken] = $sToken;
}
$aWordLists[] = $oWordList;
}
Debug::printVar('Tokens', $aTokens);
@@ -114,7 +117,7 @@ class Tokenizer
$oValidTokens = $this->computeValidTokens($aTokens, $sNormQuery);
foreach ($aPhrases as $iPhrase => $oPhrase) {
$oPhrase->computeWordSets($aWordLists[$iPhrase], $oValidTokens);
$oPhrase->setWordSets($aWordLists[$iPhrase]->getWordSets($oValidTokens));
}
return $oValidTokens;
@@ -226,29 +229,4 @@ class Tokenizer
}
}
}
/**
 * Register every contiguous run of words as a token.
 *
 * For each start position, the words from that position onwards are
 * joined with single spaces, one word at a time. Each resulting string
 * is stored in the token list twice: once as is and once prefixed with
 * a space. Keys and values are identical, so repeated strings collapse
 * into one entry.
 *
 * @param string[] $aTokens List of tokens to extend (modified in place).
 * @param string[] $aWords  Words of the phrase, indexed from 0.
 *
 * @return void
 */
private static function addTokens(&$aTokens, $aWords)
{
    $iTotal = count($aWords);

    for ($iStart = 0; $iStart < $iTotal; ++$iStart) {
        $sJoined = null;

        for ($iEnd = $iStart; $iEnd < $iTotal; ++$iEnd) {
            if ($iEnd === $iStart) {
                // A run of length one is the word itself.
                $sJoined = $aWords[$iEnd];
            } else {
                // Grow the run by one more word.
                $sJoined = $sJoined.' '.$aWords[$iEnd];
            }

            $sSpaced = ' '.$sJoined;
            $aTokens[$sSpaced] = $sSpaced;
            $aTokens[$sJoined] = $sJoined;
        }
    }
}
}