move tokenization in query into tokenizer

This commit is contained in:
Sarah Hoffmann
2021-04-28 14:08:24 +02:00
parent 3eb4d88057
commit 044bb6afa5
7 changed files with 315 additions and 311 deletions

View File

@@ -44,19 +44,16 @@ class PhraseTest extends \PHPUnit\Framework\TestCase
public function testEmptyPhrase()
{
$oPhrase = new Phrase('', '');
$oPhrase->computeWordSets(new TokensFullSet());
$oPhrase->computeWordSets(array(), new TokensFullSet());
$this->assertEquals(
array(array('')),
$oPhrase->getWordSets()
);
$this->assertNull($oPhrase->getWordSets());
}
public function testSingleWordPhrase()
{
$oPhrase = new Phrase('a', '');
$oPhrase->computeWordSets(new TokensFullSet());
$oPhrase->computeWordSets(array('a'), new TokensFullSet());
$this->assertEquals(
'(a)',
@@ -68,21 +65,21 @@ class PhraseTest extends \PHPUnit\Framework\TestCase
public function testMultiWordPhrase()
{
$oPhrase = new Phrase('a b', '');
$oPhrase->computeWordSets(new TokensFullSet());
$oPhrase->computeWordSets(array('a', 'b'), new TokensFullSet());
$this->assertEquals(
'(a b),(a|b)',
$this->serializeSets($oPhrase->getWordSets())
);
$oPhrase = new Phrase('a b c', '');
$oPhrase->computeWordSets(new TokensFullSet());
$oPhrase->computeWordSets(array('a', 'b', 'c'), new TokensFullSet());
$this->assertEquals(
'(a b c),(a|b c),(a b|c),(a|b|c)',
$this->serializeSets($oPhrase->getWordSets())
);
$oPhrase = new Phrase('a b c d', '');
$oPhrase->computeWordSets(new TokensFullSet());
$oPhrase->computeWordSets(array('a', 'b', 'c', 'd'), new TokensFullSet());
$this->assertEquals(
'(a b c d),(a b c|d),(a b|c d),(a|b c d),(a b|c|d),(a|b c|d),(a|b|c d),(a|b|c|d)',
$this->serializeSets($oPhrase->getWordSets())
@@ -93,7 +90,7 @@ class PhraseTest extends \PHPUnit\Framework\TestCase
public function testInverseWordSets()
{
$oPhrase = new Phrase('a b c', '');
$oPhrase->computeWordSets(new TokensFullSet());
$oPhrase->computeWordSets(array('a', 'b', 'c'), new TokensFullSet());
$oPhrase->invertWordSets();
$this->assertEquals(
@@ -105,14 +102,16 @@ class PhraseTest extends \PHPUnit\Framework\TestCase
public function testMaxWordSets()
{
$oPhrase = new Phrase(join(' ', array_fill(0, 4, 'a')), '');
$oPhrase->computeWordSets(new TokensFullSet());
$aWords = array_fill(0, 4, 'a');
$oPhrase = new Phrase(join(' ', $aWords), '');
$oPhrase->computeWordSets($aWords, new TokensFullSet());
$this->assertEquals(8, count($oPhrase->getWordSets()));
$oPhrase->invertWordSets();
$this->assertEquals(8, count($oPhrase->getWordSets()));
$oPhrase = new Phrase(join(' ', array_fill(0, 18, 'a')), '');
$oPhrase->computeWordSets(new TokensFullSet());
$aWords = array_fill(0, 18, 'a');
$oPhrase = new Phrase(join(' ', $aWords), '');
$oPhrase->computeWordSets($aWords, new TokensFullSet());
$this->assertEquals(100, count($oPhrase->getWordSets()));
$oPhrase->invertWordSets();
$this->assertEquals(100, count($oPhrase->getWordSets()));
@@ -122,7 +121,7 @@ class PhraseTest extends \PHPUnit\Framework\TestCase
public function testPartialTokensShortTerm()
{
$oPhrase = new Phrase('a b c d', '');
$oPhrase->computeWordSets(new TokensPartialSet(array('a', 'b', 'd', 'b c', 'b c d')));
$oPhrase->computeWordSets(array('a', 'b', 'c', 'd'), new TokensPartialSet(array('a', 'b', 'd', 'b c', 'b c d')));
$this->assertEquals(
'(a|b c d),(a|b c|d)',
$this->serializeSets($oPhrase->getWordSets())
@@ -132,8 +131,9 @@ class PhraseTest extends \PHPUnit\Framework\TestCase
public function testPartialTokensLongTerm()
{
$oPhrase = new Phrase(join(' ', array_fill(0, 18, 'a')), '');
$oPhrase->computeWordSets(new TokensPartialSet(array('a', 'a a a a a')));
$aWords = array_fill(0, 18, 'a');
$oPhrase = new Phrase(join(' ', $aWords), '');
$oPhrase->computeWordSets($aWords, new TokensPartialSet(array('a', 'a a a a a')));
$this->assertEquals(80, count($oPhrase->getWordSets()));
}
}

View File

@@ -49,88 +49,4 @@ class TokenTest extends \PHPUnit\Framework\TestCase
$this->assertFalse($TL->contains('unknownword'));
$this->assertEquals(array(), $TL->get('unknownword'));
}
/**
 * Feeds a six-word query through TokenList::addTokensFromDB() against a
 * stubbed database and checks the Token objects that end up in the list.
 *
 * The stub answers getAll() with canned word rows, one per known word that
 * appears in the SQL text, so no real database is needed.
 */
public function testAddress()
{
// The test only passes if output matching this pattern is printed while it runs.
// NOTE(review): presumably debug output emitted by addTokensFromDB - confirm.
$this->expectOutputRegex('/<p><tt>/');
// Mock of Nominatim\DB in which only getAll() and getDBQuotedList() are replaced.
$oDbStub = $this->getMockBuilder(Nominatim\DB::class)
->setMethods(array('getAll', 'getDBQuotedList'))
->getMock();
// Stubbed quoting: wraps every value in single quotes, without any escaping.
$oDbStub->method('getDBQuotedList')
->will($this->returnCallback(function ($aVals) {
return array_map(function ($sVal) {
return "'".$sVal."'";
}, $aVals);
}));
// Stubbed query: for each known word found anywhere in the SQL string, append
// one row built by $this->wordResult() (helper defined outside this method).
// All rows share word_id 999.
$oDbStub->method('getAll')
->will($this->returnCallback(function ($sql) {
$aResults = array();
// Row with class 'place' / type 'house'; asserted below as a HouseNumber token.
if (preg_match('/1051/', $sql)) {
$aResults[] = $this->wordResult(array(
'word_id' => 999,
'word_token' => '1051',
'class' => 'place',
'type' => 'house'
));
}
// Row with an operator flag; asserted below as a SpecialTerm token.
if (preg_match('/hauptstr/', $sql)) {
$aResults[] = $this->wordResult(array(
'word_id' => 999,
'word_token' => 'hauptstr',
'class' => 'place',
'type' => 'street',
'operator' => true
));
}
// Row with class 'place' / type 'postcode'; asserted below as a Postcode token.
if (preg_match('/64286/', $sql)) {
$aResults[] = $this->wordResult(array(
'word_id' => 999,
'word_token' => '64286',
'word' => '64286',
'class' => 'place',
'type' => 'postcode'
));
}
// Row carrying only a count; asserted below as a plain Word token.
if (preg_match('/darmstadt/', $sql)) {
$aResults[] = $this->wordResult(array(
'word_id' => 999,
'word_token' => 'darmstadt',
'count' => 533
));
}
// Row with country code 'de'; asserted below as a Country token.
if (preg_match('/alemagne/', $sql)) {
$aResults[] = $this->wordResult(array(
'word_id' => 999,
'word_token' => 'alemagne',
'country_code' => 'de',
));
}
// Row with country code 'mx'; no assertion below looks this word up.
if (preg_match('/mexico/', $sql)) {
$aResults[] = $this->wordResult(array(
'word_id' => 999,
'word_token' => 'mexico',
'country_code' => 'mx',
));
}
return $aResults;
}));
// NOTE(review): 'mx' is not in this list, which is presumably why 'mexico'
// produces no token - confirm against addTokensFromDB.
$aCountryCodes = array('de', 'fr');
$sNormQuery = '1051 hauptstr 64286 darmstadt alemagne mexico';
$aTokens = explode(' ', $sNormQuery);
$TL = new TokenList;
$TL->addTokensFromDB($oDbStub, $aTokens, $aCountryCodes, $sNormQuery, $this->oNormalizer);
// Six words went in, five entries are expected in the list.
$this->assertEquals(5, $TL->count());
$this->assertEquals(array(new Token\HouseNumber(999, '1051')), $TL->get('1051'));
$this->assertEquals(array(new Token\Country(999, 'de')), $TL->get('alemagne'));
$this->assertEquals(array(new Token\Postcode(999, '64286')), $TL->get('64286'));
$this->assertEquals(array(new Token\Word(999, true, 533, 0)), $TL->get('darmstadt'));
$this->assertEquals(array(new Token\SpecialTerm(999, 'place', 'street', true)), $TL->get('hauptstr'));
}
}

View File

@@ -0,0 +1,17 @@
<?php
namespace Nominatim;
/**
 * Skeleton tokenizer class in the Nominatim namespace.
 *
 * As shown here it only keeps hold of a database handle and offers a
 * status hook whose body is still empty.
 */
class Tokenizer
{
// Database handle, reference-bound in the constructor.
// NOTE(review): presumably a Nominatim\DB instance - confirm against callers.
private $oDB;
/**
 * Bind the tokenizer to a database handle.
 *
 * @param mixed $oDB Taken by reference, so callers must pass a variable.
 */
public function __construct(&$oDB)
{
// Reference assignment: the property aliases the caller's variable
// instead of receiving its own copy of the value.
$this->oDB =& $oDB;
}
/**
 * Status hook. The body is empty, so a call does nothing and
 * evaluates to null.
 */
public function checkStatus()
{
}
}