improve handling of multi-word partials in SearchDescription

Multi-word partial terms had an undue advantage over separate partial terms because they only needed to pay the penalty once. This changes the behaviour by setting the penalty according to the number of words in the token, which should get rid of search interpretations with a low chance of matching. This also fixes the handling of exact term matching: we now match against all exact terms of the query, not just the few that were collected while building the interpretations. Also adds a penalty for very short postcodes.
2020-11-25 11:44:25 +01:00
parent f21853ea9d
commit 0f87da017f
6 changed files with 50 additions and 15 deletions
--- a/lib/TokenList.php
+++ b/lib/TokenList.php
@@ -80,6 +80,21 @@ class TokenList
        return isset($this->aTokens[$sWord]) ? $this->aTokens[$sWord] : array();
    }

+    /** Return an id => id array of every non-partial \Nominatim\Token\Word token in the list. */
+    public function getFullWordIDs()
+    {
+        $aFullWordIds = array();
+        foreach ($this->aTokens as $aCandidates) {
+            foreach ($aCandidates as $oCandidate) {
+                if (!is_a($oCandidate, '\Nominatim\Token\Word') || $oCandidate->bPartial) {
+                    continue;
+                }
+                $aFullWordIds[$oCandidate->iId] = $oCandidate->iId;
+            }
+        }
+        return $aFullWordIds;
+    }
+
    /**
     * Add token information from the word table in the database.
     *
@@ -151,7 +166,8 @@ class TokenList
                $oToken = new Token\Word(
                    $iId,
                    $aWord['word_token'][0] != ' ',
-                    (int) $aWord['count']
+                    (int) $aWord['count'],
+                    substr_count($aWord['word_token'], ' ')
                );
            }