Rework word set computation

Switch from a recursive algorithm for computing the word sets
to an iterative one that benefits from caching intermediate
results. This considerably reduces the amount of memory needed,
so that the depth restriction can be dropped. To ensure that
the number of word sets remains manageable, only sets up to
a certain length are accepted and only a certain number of
total word sets. If word sets need to be dropped, we drop
the ones with more words per word set first.

To further reduce the number of potential word sets, the valid
tokens are looked up first and then only word sets containing
valid tokens are computed.

Fixes #1403, #1404 and #654.
This commit is contained in:
Sarah Hoffmann
2019-06-29 18:22:31 +02:00
parent 09e7f0d013
commit 38a99856c0
4 changed files with 143 additions and 46 deletions

View File

@@ -348,10 +348,7 @@ class Geocode
$aNewPhraseSearches = array(); $aNewPhraseSearches = array();
$sPhraseType = $bIsStructured ? $oPhrase->getPhraseType() : ''; $sPhraseType = $bIsStructured ? $oPhrase->getPhraseType() : '';
foreach ($oPhrase->getWordSets() as $iWordSet => $aWordset) { foreach ($oPhrase->getWordSets() as $aWordset) {
// Too many permutations - too expensive
if ($iWordSet > 120) break;
$aWordsetSearches = $aSearches; $aWordsetSearches = $aSearches;
// Add all words from this wordset // Add all words from this wordset
@@ -641,7 +638,6 @@ class Geocode
} }
} }
Debug::printDebugTable('Phrases', $aPhrases);
Debug::printVar('Tokens', $aTokens); Debug::printVar('Tokens', $aTokens);
$oValidTokens = new TokenList(); $oValidTokens = new TokenList();
@@ -686,6 +682,11 @@ class Geocode
Debug::printGroupTable('Valid Tokens', $oValidTokens->debugInfo()); Debug::printGroupTable('Valid Tokens', $oValidTokens->debugInfo());
foreach ($aPhrases as $oPhrase) {
$oPhrase->computeWordSets($oValidTokens);
}
Debug::printDebugTable('Phrases', $aPhrases);
Debug::newSection('Search candidates'); Debug::newSection('Search candidates');
$aGroupedSearches = $this->getGroupedSearches($aSearches, $aPhrases, $oValidTokens, $bStructuredPhrases); $aGroupedSearches = $this->getGroupedSearches($aSearches, $aPhrases, $oValidTokens, $bStructuredPhrases);

View File

@@ -9,7 +9,8 @@ namespace Nominatim;
*/ */
class Phrase class Phrase
{ {
const MAX_DEPTH = 7; public const MAX_WORDSET_LEN = 20;
public const MAX_WORDSETS = 100;
// Complete phrase as a string. // Complete phrase as a string.
private $sPhrase; private $sPhrase;
@@ -20,13 +21,24 @@ class Phrase
// Possible segmentations of the phrase. // Possible segmentations of the phrase.
private $aWordSets; private $aWordSets;
/**
 * Comparison callback that orders arrays by their number of elements,
 * shortest first. Used with usort() to rank word sets.
 *
 * @param array $aA First array.
 * @param array $aB Second array.
 *
 * @return int -1 if $aA has fewer elements than $aB, 1 if it has more,
 *             0 if both have the same number of elements.
 */
public static function cmpByArraylen($aA, $aB)
{
    return count($aA) <=> count($aB);
}
public function __construct($sPhrase, $sPhraseType) public function __construct($sPhrase, $sPhraseType)
{ {
$this->sPhrase = trim($sPhrase); $this->sPhrase = trim($sPhrase);
$this->sPhraseType = $sPhraseType; $this->sPhraseType = $sPhraseType;
$this->aWords = explode(' ', $this->sPhrase); $this->aWords = explode(' ', $this->sPhrase);
$this->aWordSets = $this->createWordSets($this->aWords, 0);
} }
/** /**
@@ -60,10 +72,17 @@ class Phrase
*/ */
public function addTokens(&$aTokens) public function addTokens(&$aTokens)
{ {
foreach ($this->aWordSets as $aSet) { $iNumWords = count($this->aWords);
foreach ($aSet as $sWord) {
$aTokens[' '.$sWord] = ' '.$sWord; for ($i = 0; $i < $iNumWords; $i++) {
$aTokens[$sWord] = $sWord; $sPhrase = $this->aWords[$i];
$aTokens[' '.$sPhrase] = ' '.$sPhrase;
$aTokens[$sPhrase] = $sPhrase;
for ($j = $i + 1; $j < $iNumWords; $j++) {
$sPhrase .= ' '.$this->aWords[$j];
$aTokens[' '.$sPhrase] = ' '.$sPhrase;
$aTokens[$sPhrase] = $sPhrase;
} }
} }
} }
@@ -75,44 +94,59 @@ class Phrase
*/ */
public function invertWordSets() public function invertWordSets()
{ {
$this->aWordSets = $this->createInverseWordSets($this->aWords, 0); foreach ($this->aWordSets as $i => $aSet) {
$this->aWordSets[$i] = array_reverse($aSet);
}
} }
private function createWordSets($aWords, $iDepth) public function computeWordSets($oTokens)
{ {
$aResult = array(array(join(' ', $aWords))); $iNumWords = count($this->aWords);
$sFirstToken = ''; // Caches the word set for the partial phrase up to word i.
if ($iDepth < Phrase::MAX_DEPTH) { $aSetCache = array_fill(0, $iNumWords, array());
while (count($aWords) > 1) {
$sWord = array_shift($aWords); // Initialise first element of cache. There can only be the word.
$sFirstToken .= ($sFirstToken?' ':'').$sWord; if ($oTokens->containsAny($this->aWords[0])) {
$aRest = $this->createWordSets($aWords, $iDepth + 1); $aSetCache[0][] = array($this->aWords[0]);
foreach ($aRest as $aSet) { }
$aResult[] = array_merge(array($sFirstToken), $aSet);
// Now do the next elements using what we already have.
for ($i = 1; $i < $iNumWords; $i++) {
for ($j = $i; $j > 0; $j--) {
$sPartial = $j == $i ? $this->aWords[$j] : $this->aWords[$j].' '.$sPartial;
if (!empty($aSetCache[$j - 1]) && $oTokens->containsAny($sPartial)) {
$aPartial = array($sPartial);
foreach ($aSetCache[$j - 1] as $aSet) {
if (count($aSet) < Phrase::MAX_WORDSET_LEN) {
$aSetCache[$i][] = array_merge($aSet, $aPartial);
}
}
if (count($aSetCache[$i]) > 2 * Phrase::MAX_WORDSETS) {
usort(
$aSetCache[$i],
array('\Nominatim\Phrase', 'cmpByArraylen')
);
$aSetCache[$i] = array_slice(
$aSetCache[$i],
0,
Phrase::MAX_WORDSETS
);
}
} }
} }
// finally the current full phrase
$sPartial = $this->aWords[0].' '.$sPartial;
if ($oTokens->containsAny($sPartial)) {
$aSetCache[$i][] = array($sPartial);
}
} }
return $aResult; $this->aWordSets = $aSetCache[$iNumWords - 1];
usort($this->aWordSets, array('\Nominatim\Phrase', 'cmpByArraylen'));
$this->aWordSets = array_slice($this->aWordSets, 0, Phrase::MAX_WORDSETS);
} }
private function createInverseWordSets($aWords, $iDepth)
{
$aResult = array(array(join(' ', $aWords)));
$sFirstToken = '';
if ($iDepth < Phrase::MAX_DEPTH) {
while (count($aWords) > 1) {
$sWord = array_pop($aWords);
$sFirstToken = $sWord.($sFirstToken?' ':'').$sFirstToken;
$aRest = $this->createInverseWordSets($aWords, $iDepth + 1);
foreach ($aRest as $aSet) {
$aResult[] = array_merge(array($sFirstToken), $aSet);
}
}
}
return $aResult;
}
public function debugInfo() public function debugInfo()
{ {

View File

@@ -55,6 +55,18 @@ class TokenList
return isset($this->aTokens[$sWord]); return isset($this->aTokens[$sWord]);
} }
/**
 * Check if there are partial or full tokens for the given word.
 *
 * The token list is probed twice: once with the word as given and once
 * with the word prefixed by a single space.
 *
 * @param string $sWord Token word to look for.
 *
 * @return bool True if at least one of the two lookups finds an entry.
 */
public function containsAny($sWord)
{
    foreach (array($sWord, ' '.$sWord) as $sKey) {
        if (isset($this->aTokens[$sKey])) {
            return true;
        }
    }

    return false;
}
/** /**
* Get the list of tokens for the given token word. * Get the list of tokens for the given token word.
* *

View File

@@ -4,6 +4,29 @@ namespace Nominatim;
require_once(CONST_BasePath.'/lib/Phrase.php'); require_once(CONST_BasePath.'/lib/Phrase.php');
/**
 * Token set stub for tests that treats every term as known.
 */
class TokensFullSet
{
    /**
     * Report that a token exists for the given term.
     *
     * @param string $sTerm Term to look up; the value is not inspected.
     *
     * @return bool Always true.
     */
    public function containsAny($sTerm)
    {
        return true;
    }
}
// phpcs:ignore PSR1.Classes.ClassDeclaration.MultipleClasses
/**
 * Token set stub for tests that only knows an explicit list of terms.
 */
class TokensPartialSet
{
    // Known terms stored as array keys so that lookups can use isset().
    // Declared explicitly: assigning an undeclared property creates a
    // dynamic property, which is deprecated as of PHP 8.2.
    private $aTokens;

    /**
     * @param string[] $aTokens List of terms that should be reported as known.
     */
    public function __construct($aTokens)
    {
        // Flip the list so each term becomes a key.
        $this->aTokens = array_flip($aTokens);
    }

    /**
     * Check whether the given term is one of the known terms.
     *
     * @param string $sTerm Term to look up (exact match).
     *
     * @return bool True if the term was in the list given to the constructor.
     */
    public function containsAny($sTerm)
    {
        return isset($this->aTokens[$sTerm]);
    }
}
// phpcs:ignore PSR1.Classes.ClassDeclaration.MultipleClasses
class PhraseTest extends \PHPUnit\Framework\TestCase class PhraseTest extends \PHPUnit\Framework\TestCase
{ {
@@ -21,6 +44,7 @@ class PhraseTest extends \PHPUnit\Framework\TestCase
public function testEmptyPhrase() public function testEmptyPhrase()
{ {
$oPhrase = new Phrase('', ''); $oPhrase = new Phrase('', '');
$oPhrase->computeWordSets(new TokensFullSet());
$this->assertEquals( $this->assertEquals(
array(array('')), array(array('')),
@@ -32,6 +56,7 @@ class PhraseTest extends \PHPUnit\Framework\TestCase
public function testSingleWordPhrase() public function testSingleWordPhrase()
{ {
$oPhrase = new Phrase('a', ''); $oPhrase = new Phrase('a', '');
$oPhrase->computeWordSets(new TokensFullSet());
$this->assertEquals( $this->assertEquals(
'(a)', '(a)',
@@ -43,20 +68,23 @@ class PhraseTest extends \PHPUnit\Framework\TestCase
public function testMultiWordPhrase() public function testMultiWordPhrase()
{ {
$oPhrase = new Phrase('a b', ''); $oPhrase = new Phrase('a b', '');
$oPhrase->computeWordSets(new TokensFullSet());
$this->assertEquals( $this->assertEquals(
'(a b),(a|b)', '(a b),(a|b)',
$this->serializeSets($oPhrase->getWordSets()) $this->serializeSets($oPhrase->getWordSets())
); );
$oPhrase = new Phrase('a b c', ''); $oPhrase = new Phrase('a b c', '');
$oPhrase->computeWordSets(new TokensFullSet());
$this->assertEquals( $this->assertEquals(
'(a b c),(a|b c),(a|b|c),(a b|c)', '(a b c),(a|b c),(a b|c),(a|b|c)',
$this->serializeSets($oPhrase->getWordSets()) $this->serializeSets($oPhrase->getWordSets())
); );
$oPhrase = new Phrase('a b c d', ''); $oPhrase = new Phrase('a b c d', '');
$oPhrase->computeWordSets(new TokensFullSet());
$this->assertEquals( $this->assertEquals(
'(a b c d),(a|b c d),(a|b|c d),(a|b|c|d),(a|b c|d),(a b|c d),(a b|c|d),(a b c|d)', '(a b c d),(a b c|d),(a b|c d),(a|b c d),(a b|c|d),(a|b c|d),(a|b|c d),(a|b|c|d)',
$this->serializeSets($oPhrase->getWordSets()) $this->serializeSets($oPhrase->getWordSets())
); );
} }
@@ -65,25 +93,47 @@ class PhraseTest extends \PHPUnit\Framework\TestCase
public function testInverseWordSets() public function testInverseWordSets()
{ {
$oPhrase = new Phrase('a b c', ''); $oPhrase = new Phrase('a b c', '');
$oPhrase->computeWordSets(new TokensFullSet());
$oPhrase->invertWordSets(); $oPhrase->invertWordSets();
$this->assertEquals( $this->assertEquals(
'(a b c),(c|a b),(c|b|a),(b c|a)', '(a b c),(b c|a),(c|a b),(c|b|a)',
$this->serializeSets($oPhrase->getWordSets()) $this->serializeSets($oPhrase->getWordSets())
); );
} }
public function testMaxDepth() public function testMaxWordSets()
{ {
$oPhrase = new Phrase(join(' ', array_fill(0, 4, 'a')), ''); $oPhrase = new Phrase(join(' ', array_fill(0, 4, 'a')), '');
$oPhrase->computeWordSets(new TokensFullSet());
$this->assertEquals(8, count($oPhrase->getWordSets())); $this->assertEquals(8, count($oPhrase->getWordSets()));
$oPhrase->invertWordSets(); $oPhrase->invertWordSets();
$this->assertEquals(8, count($oPhrase->getWordSets())); $this->assertEquals(8, count($oPhrase->getWordSets()));
$oPhrase = new Phrase(join(' ', array_fill(0, 18, 'a')), ''); $oPhrase = new Phrase(join(' ', array_fill(0, 18, 'a')), '');
$this->assertEquals(41226, count($oPhrase->getWordSets())); $oPhrase->computeWordSets(new TokensFullSet());
$this->assertEquals(Phrase::MAX_WORDSETS, count($oPhrase->getWordSets()));
$oPhrase->invertWordSets(); $oPhrase->invertWordSets();
$this->assertEquals(41226, count($oPhrase->getWordSets())); $this->assertEquals(Phrase::MAX_WORDSETS, count($oPhrase->getWordSets()));
}
/**
 * Word sets must only be built from terms that have a token.
 */
public function testPartialTokensShortTerm()
{
    // Neither 'c' nor any term starting with 'a b' is known, so only
    // segmentations made of 'a', 'b c', 'b c d' and 'd' can remain.
    $aKnownTerms = array('a', 'b', 'd', 'b c', 'b c d');

    $oPhrase = new Phrase('a b c d', '');
    $oPhrase->computeWordSets(new TokensPartialSet($aKnownTerms));

    $sActual = $this->serializeSets($oPhrase->getWordSets());
    $this->assertEquals('(a|b c d),(a|b c|d)', $sActual);
}
public function testPartialTokensLongTerm()
{
$oPhrase = new Phrase(join(' ', array_fill(0, 18, 'a')), '');
$oPhrase->computeWordSets(new TokensPartialSet(array('a', 'a a a a a')));
$this->assertEquals(80, count($oPhrase->getWordSets()));
} }
} }