remove special status of partial tokens

Full-word tokens are no longer marked by a space at the
beginning of the token. Use the new Partial token category
instead. This removes a couple of special cases that we
don't really need.

The word table still stores the leading space for compatibility
reasons, so the tokenizer code needs to strip it when loading
the tokens.
This commit is contained in:
Sarah Hoffmann
2021-07-14 22:17:17 +02:00
parent 6070c3d1d5
commit 143ff14466
5 changed files with 34 additions and 69 deletions

View File

@@ -355,15 +355,15 @@ class Geocode
$aNewWordsetSearches = array(); $aNewWordsetSearches = array();
foreach ($aWordsetSearches as $oCurrentSearch) { foreach ($aWordsetSearches as $oCurrentSearch) {
// Tokens with full name matches. foreach ($oValidTokens->get($sToken) as $oSearchTerm) {
foreach ($oValidTokens->get(' '.$sToken) as $oSearchTerm) { $aNewSearches = $oCurrentSearch->extendWithSearchTerm(
$aNewSearches = $oCurrentSearch->extendWithFullTerm( $sToken,
$oSearchTerm, $oSearchTerm,
$sPhraseType, $sPhraseType,
$iToken == 0 && $iPhrase == 0, $iToken == 0 && $iPhrase == 0,
$iPhrase == 0,
$iToken + 1 == count($aWordset) $iToken + 1 == count($aWordset)
&& $iPhrase + 1 == count($aPhrases) && $iPhrase + 1 == count($aPhrases),
$iPhrase
); );
foreach ($aNewSearches as $oSearch) { foreach ($aNewSearches as $oSearch) {
@@ -372,27 +372,6 @@ class Geocode
} }
} }
} }
// Look for partial matches.
// Note that there is no point in adding country terms here
// because country is omitted in the address.
if ($sPhraseType != 'country') {
// Allow searching for a word - but at extra cost
foreach ($oValidTokens->get($sToken) as $oSearchTerm) {
$aNewSearches = $oCurrentSearch->extendWithPartialTerm(
$sToken,
$oSearchTerm,
(bool) $sPhraseType,
$iPhrase,
$oValidTokens->get(' '.$sToken)
);
foreach ($aNewSearches as $oSearch) {
if ($oSearch->getRank() < $this->iMaxRank) {
$aNewWordsetSearches[] = $oSearch;
}
}
}
}
} }
// Sort and cut // Sort and cut
usort($aNewWordsetSearches, array('Nominatim\SearchDescription', 'bySearchRank')); usort($aNewWordsetSearches, array('Nominatim\SearchDescription', 'bySearchRank'));

View File

@@ -152,17 +152,17 @@ class SearchDescription
/** /**
* Derive new searches by adding a full term to the existing search. * Derive new searches by adding a full term to the existing search.
* *
* @param string $sToken Term for the token.
* @param object $oSearchTerm Description of the token. * @param object $oSearchTerm Description of the token.
* @param string $sPhraseType Type of phrase the token is contained in. * @param string $sPhraseType Type of phrase the token is contained in.
* @param bool $bFirstToken True if the token is at the beginning of the * @param bool $bFirstToken True if the token is at the beginning of the
* query. * query.
* @param bool $bFirstPhrase True if the token is in the first phrase of
* the query.
* @param bool $bLastToken True if the token is at the end of the query. * @param bool $bLastToken True if the token is at the end of the query.
* @param integer $iPhrase Number of the phrase the token is in.
* *
* @return SearchDescription[] List of derived search descriptions. * @return SearchDescription[] List of derived search descriptions.
*/ */
public function extendWithFullTerm($oSearchTerm, $sPhraseType, $bFirstToken, $bFirstPhrase, $bLastToken) public function extendWithSearchTerm($sToken, $oSearchTerm, $sPhraseType, $bFirstToken, $bLastToken, $iPhrase)
{ {
$aNewSearches = array(); $aNewSearches = array();
@@ -295,8 +295,8 @@ class SearchDescription
// of the phrase. In structured search the name must forcably in // of the phrase. In structured search the name must forcably in
// the first phrase. In unstructured search it may be in a later // the first phrase. In unstructured search it may be in a later
// phrase when the first phrase is a house number. // phrase when the first phrase is a house number.
if (!empty($this->aName) || !($bFirstPhrase || $sPhraseType == '')) { if (!empty($this->aName) || !($iPhrase == 0 || $sPhraseType == '')) {
if (($sPhraseType == '' || !$bFirstPhrase) && $oSearchTerm->iTermCount > 1) { if (($sPhraseType == '' || $iPhrase > 0) && $oSearchTerm->iTermCount > 1) {
$oSearch = clone $this; $oSearch = clone $this;
$oSearch->iNamePhrase = -1; $oSearch->iNamePhrase = -1;
$oSearch->iSearchRank += 1; $oSearch->iSearchRank += 1;
@@ -314,6 +314,16 @@ class SearchDescription
} }
$aNewSearches[] = $oSearch; $aNewSearches[] = $oSearch;
} }
} elseif ($sPhraseType != 'country'
&& is_a($oSearchTerm, '\Nominatim\Token\Partial')
&& strpos($sToken, ' ') === false
) {
$aNewSearches = $this->extendWithPartialTerm(
$sToken,
$oSearchTerm,
(bool) $sPhraseType,
$iPhrase
);
} }
return $aNewSearches; return $aNewSearches;
@@ -326,20 +336,11 @@ class SearchDescription
* @param object $oSearchTerm Description of the token. * @param object $oSearchTerm Description of the token.
* @param bool $bStructuredPhrases True if the search is structured. * @param bool $bStructuredPhrases True if the search is structured.
* @param integer $iPhrase Number of the phrase the token is in. * @param integer $iPhrase Number of the phrase the token is in.
* @param array[] $aFullTokens List of full term tokens with the
* same name.
* *
* @return SearchDescription[] List of derived search descriptions. * @return SearchDescription[] List of derived search descriptions.
*/ */
public function extendWithPartialTerm($sToken, $oSearchTerm, $bStructuredPhrases, $iPhrase, $aFullTokens) private function extendWithPartialTerm($sToken, $oSearchTerm, $bStructuredPhrases, $iPhrase)
{ {
// Only allow name terms.
if (!(is_a($oSearchTerm, '\Nominatim\Token\Word'))
|| strpos($sToken, ' ') !== false
) {
return array();
}
$aNewSearches = array(); $aNewSearches = array();
$iWordID = $oSearchTerm->iId; $iWordID = $oSearchTerm->iId;
@@ -355,9 +356,6 @@ class SearchDescription
$oSearch->aAddress[$iWordID] = $iWordID; $oSearch->aAddress[$iWordID] = $iWordID;
} else { } else {
$oSearch->aAddressNonSearch[$iWordID] = $iWordID; $oSearch->aAddressNonSearch[$iWordID] = $iWordID;
if (!empty($aFullTokens)) {
$oSearch->iSearchRank++;
}
} }
$aNewSearches[] = $oSearch; $aNewSearches[] = $oSearch;
} }
@@ -385,9 +383,6 @@ class SearchDescription
} }
$oSearch->aName[$iWordID] = $iWordID; $oSearch->aName[$iWordID] = $iWordID;
} else { } else {
if (!empty($aFullTokens)) {
$oSearch->iSearchRank++;
}
$oSearch->aNameNonSearch[$iWordID] = $iWordID; $oSearch->aNameNonSearch[$iWordID] = $iWordID;
} }
$oSearch->iNamePhrase = $iPhrase; $oSearch->iNamePhrase = $iPhrase;

View File

@@ -18,15 +18,6 @@ require_once(CONST_LibDir.'/SpecialSearchOperator.php');
* tokens do not have a common base class. All tokens need to have a field * tokens do not have a common base class. All tokens need to have a field
* with the word id that points to an entry in the `word` database table * with the word id that points to an entry in the `word` database table
* but otherwise the information saved about a token can be very different. * but otherwise the information saved about a token can be very different.
*
* There are two different kinds of token words: full words and partial terms.
*
* Full words start with a space. They represent a complete name of a place.
* All special tokens are normally full words.
*
* Partial terms have no space at the beginning. They may represent a part of
* a name of a place (e.g. in the name 'World Trade Center' a partial term
* would be 'Trade' or 'Trade Center'). They are only used in TokenWord.
*/ */
class TokenList class TokenList
{ {
@@ -65,7 +56,7 @@ class TokenList
*/ */
public function containsAny($sWord) public function containsAny($sWord)
{ {
return isset($this->aTokens[$sWord]) || isset($this->aTokens[' '.$sWord]); return isset($this->aTokens[$sWord]);
} }
/** /**
@@ -87,7 +78,7 @@ class TokenList
foreach ($this->aTokens as $aTokenList) { foreach ($this->aTokens as $aTokenList) {
foreach ($aTokenList as $oToken) { foreach ($aTokenList as $oToken) {
if (is_a($oToken, '\Nominatim\Token\Word') && !$oToken->bPartial) { if (is_a($oToken, '\Nominatim\Token\Word')) {
$ids[$oToken->iId] = $oToken->iId; $ids[$oToken->iId] = $oToken->iId;
} }
} }

View File

@@ -120,7 +120,7 @@ class Tokenizer
// Try more interpretations for Tokens that could not be matched. // Try more interpretations for Tokens that could not be matched.
foreach ($aTokens as $sToken) { foreach ($aTokens as $sToken) {
if ($sToken[0] == ' ' && !$oValidTokens->contains($sToken)) { if ($sToken[0] != ' ' && !$oValidTokens->contains($sToken)) {
if (preg_match('/^([0-9]{5}) [0-9]{4}$/', $sToken, $aData)) { if (preg_match('/^([0-9]{5}) [0-9]{4}$/', $sToken, $aData)) {
// US ZIP+4 codes - merge in the 5-digit ZIP code // US ZIP+4 codes - merge in the 5-digit ZIP code
$oValidTokens->addToken( $oValidTokens->addToken(

View File

@@ -137,7 +137,7 @@ class Tokenizer
// Try more interpretations for Tokens that could not be matched. // Try more interpretations for Tokens that could not be matched.
foreach ($aTokens as $sToken) { foreach ($aTokens as $sToken) {
if ($sToken[0] == ' ' && !$oValidTokens->contains($sToken)) { if ($sToken[0] != ' ' && !$oValidTokens->contains($sToken)) {
if (preg_match('/^([0-9]{5}) [0-9]{4}$/', $sToken, $aData)) { if (preg_match('/^([0-9]{5}) [0-9]{4}$/', $sToken, $aData)) {
// US ZIP+4 codes - merge in the 5-digit ZIP code // US ZIP+4 codes - merge in the 5-digit ZIP code
$oValidTokens->addToken( $oValidTokens->addToken(