move abbreviation computation into import phase

This adds precomputation of abbreviated terms for names at import time
and removes the abbreviation of terms in the query. Basic import works,
but it still needs thorough testing as well as speed improvements
during import.

Adds a new dependency on the Python library datrie.
This commit is contained in:
Sarah Hoffmann
2021-05-28 22:06:13 +02:00
parent 6ba00e6aee
commit 8413075249
10 changed files with 665 additions and 206 deletions

View File

@@ -47,9 +47,7 @@ class Tokenizer
private function makeStandardWord($sTerm)
{
$sNorm = ' '.$this->oTransliterator->transliterate($sTerm).' ';
return trim(str_replace(CONST_Abbreviations[0], CONST_Abbreviations[1], $sNorm));
return trim($this->oTransliterator->transliterate(' '.$sTerm.' '));
}
@@ -90,6 +88,7 @@ class Tokenizer
foreach ($aPhrases as $iPhrase => $oPhrase) {
$sNormQuery .= ','.$this->normalizeString($oPhrase->getPhrase());
$sPhrase = $this->makeStandardWord($oPhrase->getPhrase());
Debug::printVar('Phrase', $sPhrase);
if (strlen($sPhrase) > 0) {
$aWords = explode(' ', $sPhrase);
Tokenizer::addTokens($aTokens, $aWords);