remove special status of partial tokens

Full-word tokens are no longer marked by a space at the beginning of the token. Use the new Partial token category instead. This removes a couple of special cases that we don't really need. The word table still has the space for compatibility reasons, so the tokenizer code needs to get rid of it when loading the tokens.
2026-03-12 22:04:07 +00:00 · 2021-07-14 22:17:17 +02:00
parent 6070c3d1d5
commit 143ff14466
5 changed files with 34 additions and 69 deletions
--- a/lib-php/Geocode.php
+++ b/lib-php/Geocode.php
@@ -355,15 +355,15 @@ class Geocode
                    $aNewWordsetSearches = array();
                    foreach ($aWordsetSearches as $oCurrentSearch) {
-                        // Tokens with full name matches.
+                        foreach ($oValidTokens->get($sToken) as $oSearchTerm) {
-                        foreach ($oValidTokens->get(' '.$sToken) as $oSearchTerm) {
+                            $aNewSearches = $oCurrentSearch->extendWithSearchTerm(
-                            $aNewSearches = $oCurrentSearch->extendWithFullTerm(
+                                $sToken,
                                $oSearchTerm,
                                $sPhraseType,
                                $iToken == 0 && $iPhrase == 0,
                                $iPhrase == 0,
                                $iToken + 1 == count($aWordset)
-                                  && $iPhrase + 1 == count($aPhrases)
+                                  && $iPhrase + 1 == count($aPhrases),
                                $iPhrase
                            );
                            foreach ($aNewSearches as $oSearch) {
@@ -372,27 +372,6 @@ class Geocode
                                }
                            }
                        }
                        // Look for partial matches.
                        // Note that there is no point in adding country terms here
                        // because country is omitted in the address.
                        if ($sPhraseType != 'country') {
                            // Allow searching for a word - but at extra cost
                            foreach ($oValidTokens->get($sToken) as $oSearchTerm) {
                                $aNewSearches = $oCurrentSearch->extendWithPartialTerm(
                                    $sToken,
                                    $oSearchTerm,
                                    (bool) $sPhraseType,
                                    $iPhrase,
                                    $oValidTokens->get(' '.$sToken)
                                );
                                foreach ($aNewSearches as $oSearch) {
                                    if ($oSearch->getRank() < $this->iMaxRank) {
                                        $aNewWordsetSearches[] = $oSearch;
                                    }
                                }
                            }
                        }
                    }
                    // Sort and cut
                    usort($aNewWordsetSearches, array('Nominatim\SearchDescription', 'bySearchRank'));
--- a/lib-php/SearchDescription.php
+++ b/lib-php/SearchDescription.php
@@ -152,17 +152,17 @@ class SearchDescription
    /**
     * Derive new searches by adding a full term to the existing search.
     *
     * @param string  $sToken       Term for the token.
     * @param object  $oSearchTerm  Description of the token.
     * @param string  $sPhraseType  Type of phrase the token is contained in.
     * @param bool    $bFirstToken  True if the token is at the beginning of the
     *                              query.
     * @param bool   $bFirstPhrase True if the token is in the first phrase of
     *                             the query.
     * @param bool    $bLastToken   True if the token is at the end of the query.
     * @param integer $iPhrase      Number of the phrase the token is in.
     *
     * @return SearchDescription[] List of derived search descriptions.
     */
-    public function extendWithFullTerm($oSearchTerm, $sPhraseType, $bFirstToken, $bFirstPhrase, $bLastToken)
+    public function extendWithSearchTerm($sToken, $oSearchTerm, $sPhraseType, $bFirstToken, $bLastToken, $iPhrase)
    {
        $aNewSearches = array();
@@ -295,8 +295,8 @@ class SearchDescription
            // of the phrase. In structured search the name must forcably in
            // the first phrase. In unstructured search it may be in a later
            // phrase when the first phrase is a house number.
-            if (!empty($this->aName) || !($bFirstPhrase || $sPhraseType == '')) {
+            if (!empty($this->aName) || !($iPhrase == 0 || $sPhraseType == '')) {
-                if (($sPhraseType == '' || !$bFirstPhrase) && $oSearchTerm->iTermCount > 1) {
+                if (($sPhraseType == '' || $iPhrase > 0) && $oSearchTerm->iTermCount > 1) {
                    $oSearch = clone $this;
                    $oSearch->iNamePhrase = -1;
                    $oSearch->iSearchRank += 1;
@@ -314,6 +314,16 @@ class SearchDescription
                }
                $aNewSearches[] = $oSearch;
            }
        } elseif ($sPhraseType != 'country'
                  && is_a($oSearchTerm, '\Nominatim\Token\Partial')
                  && strpos($sToken, ' ') === false
        ) {
            $aNewSearches = $this->extendWithPartialTerm(
                $sToken,
                $oSearchTerm,
                (bool) $sPhraseType,
                $iPhrase
            );
        }
        return $aNewSearches;
@@ -326,20 +336,11 @@ class SearchDescription
     * @param object  $oSearchTerm        Description of the token.
     * @param bool    $bStructuredPhrases True if the search is structured.
     * @param integer $iPhrase            Number of the phrase the token is in.
     * @param array[] $aFullTokens        List of full term tokens with the
     *                                    same name.
     *
     * @return SearchDescription[] List of derived search descriptions.
     */
-    public function extendWithPartialTerm($sToken, $oSearchTerm, $bStructuredPhrases, $iPhrase, $aFullTokens)
+    private function extendWithPartialTerm($sToken, $oSearchTerm, $bStructuredPhrases, $iPhrase)
    {
        // Only allow name terms.
        if (!(is_a($oSearchTerm, '\Nominatim\Token\Word'))
            || strpos($sToken, ' ') !== false
        ) {
            return array();
        }
        $aNewSearches = array();
        $iWordID = $oSearchTerm->iId;
@@ -355,9 +356,6 @@ class SearchDescription
                $oSearch->aAddress[$iWordID] = $iWordID;
            } else {
                $oSearch->aAddressNonSearch[$iWordID] = $iWordID;
                if (!empty($aFullTokens)) {
                    $oSearch->iSearchRank++;
                }
            }
            $aNewSearches[] = $oSearch;
        }
@@ -385,9 +383,6 @@ class SearchDescription
                }
                $oSearch->aName[$iWordID] = $iWordID;
            } else {
                if (!empty($aFullTokens)) {
                    $oSearch->iSearchRank++;
                }
                $oSearch->aNameNonSearch[$iWordID] = $iWordID;
            }
            $oSearch->iNamePhrase = $iPhrase;
--- a/lib-php/TokenList.php
+++ b/lib-php/TokenList.php
@@ -18,15 +18,6 @@ require_once(CONST_LibDir.'/SpecialSearchOperator.php');
 * tokens do not have a common base class. All tokens need to have a field
 * with the word id that points to an entry in the `word` database table
 * but otherwise the information saved about a token can be very different.
 *
 * There are two different kinds of token words: full words and partial terms.
 *
 * Full words start with a space. They represent a complete name of a place.
 * All special tokens are normally full words.
 *
 * Partial terms have no space at the beginning. They may represent a part of
 * a name of a place (e.g. in the name 'World Trade Center' a partial term
 * would be 'Trade' or 'Trade Center'). They are only used in TokenWord.
 */
 class TokenList
 {
@@ -65,7 +56,7 @@ class TokenList
     */
    public function containsAny($sWord)
    {
-        return isset($this->aTokens[$sWord]) || isset($this->aTokens[' '.$sWord]);
+        return isset($this->aTokens[$sWord]);
    }
    /**
@@ -87,7 +78,7 @@ class TokenList
        foreach ($this->aTokens as $aTokenList) {
            foreach ($aTokenList as $oToken) {
-                if (is_a($oToken, '\Nominatim\Token\Word') && !$oToken->bPartial) {
+                if (is_a($oToken, '\Nominatim\Token\Word')) {
                    $ids[$oToken->iId] = $oToken->iId;
                }
            }
--- a/lib-php/tokenizer/legacy_icu_tokenizer.php
+++ b/lib-php/tokenizer/legacy_icu_tokenizer.php
@@ -120,7 +120,7 @@ class Tokenizer
            // Try more interpretations for Tokens that could not be matched.
            foreach ($aTokens as $sToken) {
-                if ($sToken[0] == ' ' && !$oValidTokens->contains($sToken)) {
+                if ($sToken[0] != ' ' && !$oValidTokens->contains($sToken)) {
                    if (preg_match('/^([0-9]{5}) [0-9]{4}$/', $sToken, $aData)) {
                        // US ZIP+4 codes - merge in the 5-digit ZIP code
                        $oValidTokens->addToken(
--- a/lib-php/tokenizer/legacy_tokenizer.php
+++ b/lib-php/tokenizer/legacy_tokenizer.php
@@ -137,7 +137,7 @@ class Tokenizer
            // Try more interpretations for Tokens that could not be matched.
            foreach ($aTokens as $sToken) {
-                if ($sToken[0] == ' ' && !$oValidTokens->contains($sToken)) {
+                if ($sToken[0] != ' ' && !$oValidTokens->contains($sToken)) {
                    if (preg_match('/^([0-9]{5}) [0-9]{4}$/', $sToken, $aData)) {
                        // US ZIP+4 codes - merge in the 5-digit ZIP code
                        $oValidTokens->addToken(