fix warming for ICU tokenizer

Running the warm-up search requests requires querying
the most frequent words. This must be done via the tokenizer
to honor the different formats of the word table.
This commit is contained in:
Sarah Hoffmann
2021-10-25 13:08:16 +02:00
parent 13e7398566
commit 16cc395f78
3 changed files with 23 additions and 1 deletions

View File

@@ -86,8 +86,13 @@ if (!$aResult['reverse-only']) {
if ($bVerbose) {
echo "\n";
}
$oTokenizer = new \Nominatim\Tokenizer($oDB);
$aWords = $oTokenizer->mostFrequentWords(1000);
$sSQL = 'SELECT word FROM word WHERE word is not null ORDER BY search_name_count DESC LIMIT 1000';
foreach ($oDB->getCol($sSQL) as $sWord) {
foreach ($aWords as $sWord) {
if ($bVerbose) {
echo "$sWord = ";
}

View File

@@ -40,6 +40,15 @@ class Tokenizer
return $this->oNormalizer->transliterate($sTerm);
}
/**
 * Get the most frequent words from the word table.
 *
 * Selects the `word` column of all rows with type 'W', ordered by the
 * 'count' entry of `info`, highest first.
 *
 * @param int $iNum Maximum number of words to return. The value is
 *                  appended to the SQL statement verbatim, so callers
 *                  must pass a trusted integer.
 *
 * @return mixed Whatever getCol() of the database object yields for
 *               the query.
 */
public function mostFrequentWords($iNum)
{
    // The trailing space keeps the WHERE clause and ORDER BY apart once
    // the two fragments are concatenated.
    $sSQL = "SELECT word FROM word WHERE type = 'W' ";
    $sSQL .= "ORDER BY info->'count' DESC LIMIT ".$iNum;

    return $this->oDB->getCol($sSQL);
}
private function makeStandardWord($sTerm)
{
return trim($this->oTransliterator->transliterate(' '.$sTerm.' '));

View File

@@ -48,6 +48,14 @@ class Tokenizer
}
/**
 * Get the most frequent words from the word table.
 *
 * Selects the `word` column of all rows with a non-null word, ordered
 * by `search_name_count`, highest first.
 *
 * @param int $iNum Maximum number of words to return. The value is
 *                  appended to the SQL statement verbatim.
 *
 * @return mixed Whatever getCol() of the database object yields for
 *               the query.
 */
public function mostFrequentWords($iNum)
{
    // Build the statement clause by clause; joining with a single space
    // yields the complete query.
    $aClauses = array(
        'SELECT word FROM word WHERE word is not null',
        'ORDER BY search_name_count DESC LIMIT '.$iNum
    );

    return $this->oDB->getCol(implode(' ', $aClauses));
}
public function tokensForSpecialTerm($sTerm)
{
$aResults = array();