introduce a separate token type for partials

This means that the leading space, which was used to tell full
words apart from partial words, can be removed.
This commit is contained in:
Sarah Hoffmann
2021-07-13 16:54:51 +02:00
parent bc8b2d4ae0
commit 6070c3d1d5
5 changed files with 62 additions and 14 deletions

View File

@@ -7,6 +7,7 @@ require_once(CONST_LibDir.'/TokenHousenumber.php');
require_once(CONST_LibDir.'/TokenPostcode.php');
require_once(CONST_LibDir.'/TokenSpecialTerm.php');
require_once(CONST_LibDir.'/TokenWord.php');
require_once(CONST_LibDir.'/TokenPartial.php');
require_once(CONST_LibDir.'/SpecialSearchOperator.php');
/**

31
lib-php/TokenPartial.php Normal file
View File

@@ -0,0 +1,31 @@
<?php
namespace Nominatim\Token;
/**
 * A partial word token.
 *
 * Carries the database id of the token together with the number of
 * times it appears in the database.
 */
class Partial
{
    /// Database word id, if applicable.
    public $iId;
    /// Number of appearances in the database.
    public $iSearchNameCount;

    /**
     * Create a new partial token.
     *
     * @param mixed $iId              Database word id; stored unchanged.
     * @param mixed $iSearchNameCount Number of appearances in the database;
     *                                stored unchanged.
     */
    public function __construct($iId, $iSearchNameCount)
    {
        $this->iSearchNameCount = $iSearchNameCount;
        $this->iId = $iId;
    }

    /**
     * Describe the token for debugging output.
     *
     * @return array Array with the keys 'ID', 'Type' (always 'partial')
     *               and 'Info', where 'Info' holds the 'count'.
     */
    public function debugInfo()
    {
        $aInfo = ['count' => $this->iSearchNameCount];

        return [
            'ID' => $this->iId,
            'Type' => 'partial',
            'Info' => $aInfo,
        ];
    }
}

View File

@@ -9,17 +9,14 @@ class Word
{
/// Database word id, if applicable.
public $iId;
/// If true, the word may represent only part of a place name.
public $bPartial;
/// Number of appearances in the database.
public $iSearchNameCount;
/// Number of terms in the word.
public $iTermCount;
public function __construct($iId, $bPartial, $iSearchNameCount, $iTermCount)
public function __construct($iId, $iSearchNameCount, $iTermCount)
{
$this->iId = $iId;
$this->bPartial = $bPartial;
$this->iSearchNameCount = $iSearchNameCount;
$this->iTermCount = $iTermCount;
}
@@ -30,8 +27,8 @@ class Word
'ID' => $this->iId,
'Type' => 'word',
'Info' => array(
'partial' => $this->bPartial,
'count' => $this->iSearchNameCount
'count' => $this->iSearchNameCount,
'terms' => $this->iTermCount
)
);
}

View File

@@ -195,17 +195,27 @@ class Tokenizer
) {
$oToken = new Token\Country($iId, $aWord['country_code']);
}
} elseif ($aWord['word_token'][0] == ' ') {
$oToken = new Token\Word(
$iId,
$aWord['word_token'][0] != ' ',
(int) $aWord['count'],
substr_count($aWord['word_token'], ' ')
);
} else {
$oToken = new Token\Word(
$oToken = new Token\Partial(
$iId,
$aWord['word_token'][0] != ' ',
(int) $aWord['count'],
substr_count($aWord['word_token'], ' ')
(int) $aWord['count']
);
}
if ($oToken) {
$oValidTokens->addToken($aWord['word_token'], $oToken);
// remove any leading spaces
if ($aWord['word_token'][0] == ' ') {
$oValidTokens->addToken(substr($aWord['word_token'], 1), $oToken);
} else {
$oValidTokens->addToken($aWord['word_token'], $oToken);
}
}
}
}

View File

@@ -212,17 +212,26 @@ class Tokenizer
) {
$oToken = new Token\Country($iId, $aWord['country_code']);
}
} else {
} elseif ($aWord['word_token'][0] == ' ') {
$oToken = new Token\Word(
$iId,
$aWord['word_token'][0] != ' ',
(int) $aWord['count'],
substr_count($aWord['word_token'], ' ')
);
} else {
$oToken = new Token\Partial(
$iId,
(int) $aWord['count']
);
}
if ($oToken) {
$oValidTokens->addToken($aWord['word_token'], $oToken);
// remove any leading spaces
if ($aWord['word_token'][0] == ' ') {
$oValidTokens->addToken(substr($aWord['word_token'], 1), $oToken);
} else {
$oValidTokens->addToken($aWord['word_token'], $oToken);
}
}
}
}