mirror of
https://github.com/osm-search/Nominatim.git
synced 2026-02-15 19:07:58 +00:00
Rework word set computation
Switch from a recursive algorithm for computing the word sets to an iterative one that benefits from caching intermediate results. This considerably reduces the amount of memory needed, so that the depth restriction can be dropped. To ensure that the number of word sets remains manageable, only sets up to a certain length are accepted, and the total number of word sets is capped. If word sets need to be dropped, the ones with more words per word set are dropped first. To further reduce the number of potential word sets, the valid tokens are looked up first and then only word sets containing valid tokens are computed. Fixes #1403, #1404 and #654.
This commit is contained in:
@@ -4,6 +4,29 @@ namespace Nominatim;
|
||||
|
||||
require_once(CONST_BasePath.'/lib/Phrase.php');
|
||||
|
||||
/**
 * Token-set stand-in for tests that reports every term as known.
 *
 * It is handed to Phrase::computeWordSets() when a test does not want
 * any term to be rejected.
 */
class TokensFullSet
{
    /**
     * Report whether the given term is part of the set.
     *
     * @param string $sTerm Term to look up (ignored).
     *
     * @return bool Always true.
     */
    public function containsAny($sTerm)
    {
        return true;
    }
}
|
||||
|
||||
// phpcs:ignore PSR1.Classes.ClassDeclaration.MultipleClasses
|
||||
/**
 * Token-set stand-in for tests that knows only an explicit list of terms.
 */
class TokensPartialSet
{
    /**
     * Known terms, stored as array keys so that lookups can use isset().
     *
     * Declared explicitly: assigning to an undeclared property creates a
     * dynamic property, which is deprecated as of PHP 8.2.
     *
     * @var array
     */
    private $aTokens;

    /**
     * @param array $aTokens List of terms the set should contain.
     */
    public function __construct($aTokens)
    {
        // Flip the list so the terms become keys.
        $this->aTokens = array_flip($aTokens);
    }

    /**
     * Report whether the given term is part of the set.
     *
     * @param string $sTerm Term to look up.
     *
     * @return bool True if the term was passed to the constructor.
     */
    public function containsAny($sTerm)
    {
        return isset($this->aTokens[$sTerm]);
    }
}
|
||||
|
||||
// phpcs:ignore PSR1.Classes.ClassDeclaration.MultipleClasses
|
||||
class PhraseTest extends \PHPUnit\Framework\TestCase
|
||||
{
|
||||
|
||||
@@ -21,6 +44,7 @@ class PhraseTest extends \PHPUnit\Framework\TestCase
|
||||
public function testEmptyPhrase()
|
||||
{
|
||||
$oPhrase = new Phrase('', '');
|
||||
$oPhrase->computeWordSets(new TokensFullSet());
|
||||
|
||||
$this->assertEquals(
|
||||
array(array('')),
|
||||
@@ -32,6 +56,7 @@ class PhraseTest extends \PHPUnit\Framework\TestCase
|
||||
public function testSingleWordPhrase()
|
||||
{
|
||||
$oPhrase = new Phrase('a', '');
|
||||
$oPhrase->computeWordSets(new TokensFullSet());
|
||||
|
||||
$this->assertEquals(
|
||||
'(a)',
|
||||
@@ -43,20 +68,23 @@ class PhraseTest extends \PHPUnit\Framework\TestCase
|
||||
/**
 * Word sets of two-, three- and four-word phrases when every term is
 * accepted by the token set.
 *
 * The expected strings list the sets with fewer terms first. The stale
 * pre-change expectation strings that the diff view had left next to the
 * current ones are removed, so each assertion compares exactly one
 * expected value against the serialized result.
 */
public function testMultiWordPhrase()
{
    $oPhrase = new Phrase('a b', '');
    $oPhrase->computeWordSets(new TokensFullSet());
    $this->assertEquals(
        '(a b),(a|b)',
        $this->serializeSets($oPhrase->getWordSets())
    );

    $oPhrase = new Phrase('a b c', '');
    $oPhrase->computeWordSets(new TokensFullSet());
    $this->assertEquals(
        '(a b c),(a|b c),(a b|c),(a|b|c)',
        $this->serializeSets($oPhrase->getWordSets())
    );

    $oPhrase = new Phrase('a b c d', '');
    $oPhrase->computeWordSets(new TokensFullSet());
    $this->assertEquals(
        '(a b c d),(a b c|d),(a b|c d),(a|b c d),(a b|c|d),(a|b c|d),(a|b|c d),(a|b|c|d)',
        $this->serializeSets($oPhrase->getWordSets())
    );
}
|
||||
@@ -65,25 +93,47 @@ class PhraseTest extends \PHPUnit\Framework\TestCase
|
||||
/**
 * Word sets of a three-word phrase after invertWordSets() has been applied.
 *
 * The stale pre-change expectation string that the diff view had left next
 * to the current one is removed, so the assertion compares exactly one
 * expected value against the serialized result.
 */
public function testInverseWordSets()
{
    $oPhrase = new Phrase('a b c', '');
    $oPhrase->computeWordSets(new TokensFullSet());
    $oPhrase->invertWordSets();

    $this->assertEquals(
        '(a b c),(b c|a),(c|a b),(c|b|a)',
        $this->serializeSets($oPhrase->getWordSets())
    );
}
|
||||
|
||||
|
||||
/**
 * The number of word sets is capped at Phrase::MAX_WORDSETS.
 *
 * A four-word phrase yields 8 sets, before and after inversion. An
 * 18-word phrase is expected to yield exactly Phrase::MAX_WORDSETS sets,
 * again before and after inversion.
 *
 * The superseded method header (testMaxDepth) and the old fixed-count
 * assertions that the diff view had left in place are removed.
 */
public function testMaxWordSets()
{
    $oPhrase = new Phrase(join(' ', array_fill(0, 4, 'a')), '');
    $oPhrase->computeWordSets(new TokensFullSet());
    $this->assertEquals(8, count($oPhrase->getWordSets()));
    $oPhrase->invertWordSets();
    $this->assertEquals(8, count($oPhrase->getWordSets()));

    $oPhrase = new Phrase(join(' ', array_fill(0, 18, 'a')), '');
    $oPhrase->computeWordSets(new TokensFullSet());
    $this->assertEquals(Phrase::MAX_WORDSETS, count($oPhrase->getWordSets()));
    $oPhrase->invertWordSets();
    $this->assertEquals(Phrase::MAX_WORDSETS, count($oPhrase->getWordSets()));
}
|
||||
|
||||
|
||||
/**
 * With a restricted token set, a four-word phrase is expected to yield
 * only the two splits whose terms are all in that set.
 */
public function testPartialTokensShortTerm()
{
    $aKnownTokens = array('a', 'b', 'd', 'b c', 'b c d');

    $oPhrase = new Phrase('a b c d', '');
    $oPhrase->computeWordSets(new TokensPartialSet($aKnownTokens));

    $sSerialized = $this->serializeSets($oPhrase->getWordSets());
    $this->assertEquals('(a|b c d),(a|b c|d)', $sSerialized);
}
|
||||
|
||||
|
||||
/**
 * With a restricted token set, an 18-word phrase of identical words is
 * expected to yield exactly 80 word sets.
 */
public function testPartialTokensLongTerm()
{
    $sQuery = implode(' ', array_fill(0, 18, 'a'));
    $oTokens = new TokensPartialSet(array('a', 'a a a a a'));

    $oPhrase = new Phrase($sQuery, '');
    $oPhrase->computeWordSets($oTokens);

    $iSetCount = count($oPhrase->getWordSets());
    $this->assertEquals(80, $iSetCount);
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user