From 023f94b066c11628ecc87ca9876dbe5ec4136776 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Thu, 12 Oct 2017 22:37:44 +0200 Subject: [PATCH 1/8] convert phrase array to class --- lib/Geocode.php | 50 +++++++++++---------------- lib/Phrase.php | 92 +++++++++++++++++++++++++++++++++++++++++++++++++ lib/lib.php | 48 -------------------------- 3 files changed, 112 insertions(+), 78 deletions(-) create mode 100644 lib/Phrase.php diff --git a/lib/Geocode.php b/lib/Geocode.php index 16919bb8..399139f0 100644 --- a/lib/Geocode.php +++ b/lib/Geocode.php @@ -3,6 +3,7 @@ namespace Nominatim; require_once(CONST_BasePath.'/lib/PlaceLookup.php'); +require_once(CONST_BasePath.'/lib/Phrase.php'); require_once(CONST_BasePath.'/lib/ReverseGeocode.php'); require_once(CONST_BasePath.'/lib/SearchDescription.php'); require_once(CONST_BasePath.'/lib/SearchContext.php'); @@ -668,7 +669,7 @@ class Geocode return $aSearchResults; } - public function getGroupedSearches($aSearches, $aPhraseTypes, $aPhrases, $aValidTokens, $aWordFrequencyScores, $bStructuredPhrases, $sNormQuery) + public function getGroupedSearches($aSearches, $aPhrases, $aValidTokens, $aWordFrequencyScores, $bIsStructured, $sNormQuery) { /* Calculate all searches using aValidTokens i.e. @@ -683,15 +684,11 @@ class Geocode */ $iGlobalRank = 0; - foreach ($aPhrases as $iPhrase => $aPhrase) { + foreach ($aPhrases as $iPhrase => $oPhrase) { $aNewPhraseSearches = array(); - if ($bStructuredPhrases) { - $sPhraseType = $aPhraseTypes[$iPhrase]; - } else { - $sPhraseType = ''; - } + $sPhraseType = $bIsStructured ? 
$oPhrase->getPhraseType() : ''; - foreach ($aPhrase['wordsets'] as $iWordSet => $aWordset) { + foreach ($oPhrase->getWordSets() as $iWordSet => $aWordset) { // Too many permutations - too expensive if ($iWordSet > 120) break; @@ -746,7 +743,7 @@ class Geocode foreach ($aValidTokens[$sToken] as $aSearchTerm) { $aNewSearches = $oCurrentSearch->extendWithPartialTerm( $aSearchTerm, - $bStructuredPhrases, + $bIsStructured, $iPhrase, $aWordFrequencyScores, isset($aValidTokens[' '.$sToken]) ? $aValidTokens[' '.$sToken] : array() @@ -955,10 +952,10 @@ class Geocode // Split query into phrases // Commas are used to reduce the search space by indicating where phrases split if ($this->aStructuredQuery) { - $aPhrases = $this->aStructuredQuery; + $aInPhrases = $this->aStructuredQuery; $bStructuredPhrases = true; } else { - $aPhrases = explode(',', $sQuery); + $aInPhrases = explode(',', $sQuery); $bStructuredPhrases = false; } @@ -967,25 +964,19 @@ class Geocode // Get all 'sets' of words // Generate a complete list of all $aTokens = array(); - foreach ($aPhrases as $iPhrase => $sPhrase) { - $aPhrase = chksql( - $this->oDB->getRow("SELECT make_standard_name('".pg_escape_string($sPhrase)."') as string"), + $aPhrases = array(); + foreach ($aInPhrases as $iPhrase => $sPhrase) { + $sPhrase = chksql( + $this->oDB->getOne('SELECT make_standard_name('.getDBQuoted($sPhrase).')'), "Cannot normalize query string (is it a UTF-8 string?)" ); - if (trim($aPhrase['string'])) { - $aPhrases[$iPhrase] = $aPhrase; - $aPhrases[$iPhrase]['words'] = explode(' ', $aPhrases[$iPhrase]['string']); - $aPhrases[$iPhrase]['wordsets'] = getWordSets($aPhrases[$iPhrase]['words'], 0); - $aTokens = array_merge($aTokens, getTokensFromSets($aPhrases[$iPhrase]['wordsets'])); - } else { - unset($aPhrases[$iPhrase]); + if (trim($sPhrase)) { + $oPhrase = new Phrase($sPhrase, is_string($iPhrase) ? 
$iPhrase : ''); + $oPhrase->addTokens($aTokens); + $aPhrases[] = $oPhrase; } } - // Reindex phrases - we make assumptions later on that they are numerically keyed in order - $aPhraseTypes = array_keys($aPhrases); - $aPhrases = array_values($aPhrases); - if (sizeof($aTokens)) { // Check which tokens we have, get the ID numbers $sSQL = 'SELECT word_id, word_token, word, class, type, country_code, operator, search_name_count'; @@ -1046,19 +1037,18 @@ class Geocode // Any words that have failed completely? // TODO: suggestions - $aGroupedSearches = $this->getGroupedSearches($aSearches, $aPhraseTypes, $aPhrases, $aValidTokens, $aWordFrequencyScores, $bStructuredPhrases, $sNormQuery); + $aGroupedSearches = $this->getGroupedSearches($aSearches, $aPhrases, $aValidTokens, $aWordFrequencyScores, $bStructuredPhrases, $sNormQuery); if ($this->bReverseInPlan) { // Reverse phrase array and also reverse the order of the wordsets in // the first and final phrase. Don't bother about phrases in the middle // because order in the address doesn't matter. 
$aPhrases = array_reverse($aPhrases); - $aPhrases[0]['wordsets'] = getInverseWordSets($aPhrases[0]['words'], 0); + $aPhrases[0]->invertWordSets(); if (sizeof($aPhrases) > 1) { - $aFinalPhrase = end($aPhrases); - $aPhrases[sizeof($aPhrases)-1]['wordsets'] = getInverseWordSets($aFinalPhrase['words'], 0); + $aPhrases[sizeof($aPhrases)-1]->invertWordSets(); } - $aReverseGroupedSearches = $this->getGroupedSearches($aSearches, null, $aPhrases, $aValidTokens, $aWordFrequencyScores, false, $sNormQuery); + $aReverseGroupedSearches = $this->getGroupedSearches($aSearches, $aPhrases, $aValidTokens, $aWordFrequencyScores, false, $sNormQuery); foreach ($aGroupedSearches as $aSearches) { foreach ($aSearches as $aSearch) { diff --git a/lib/Phrase.php b/lib/Phrase.php new file mode 100644 index 00000000..23b9e3ca --- /dev/null +++ b/lib/Phrase.php @@ -0,0 +1,92 @@ +sPhrase = trim($sPhrase); + $this->sPhraseType = $sPhraseType; + $this->aWords = explode(' ', $this->sPhrase); + $this->aWordSets = $this->createWordSets($this->aWords, 0); + } + + public function getPhraseType() + { + return $this->sPhraseType; + } + + public function getWordSets() + { + return $this->aWordSets; + } + + public function addTokens(&$aTokens) + { + foreach ($this->aWordSets as $aSet) { + foreach ($aSet as $sWord) { + $aTokens[' '.$sWord] = ' '.$sWord; + $aTokens[$sWord] = $sWord; + } + } + } + + public function invertWordSets() + { + $this->aWordSets = $this->createInverseWordSets($this->aWords, 0); + } + + private function createWordSets($aWords, $iDepth) + { + $aResult = array(array(join(' ', $aWords))); + $sFirstToken = ''; + if ($iDepth < Phrase::MAX_DEPTH) { + while (sizeof($aWords) > 1) { + $sWord = array_shift($aWords); + $sFirstToken .= ($sFirstToken?' 
':'').$sWord; + $aRest = $this->createWordSets($aWords, $iDepth + 1); + foreach ($aRest as $aSet) { + $aResult[] = array_merge(array($sFirstToken), $aSet); + } + } + } + + return $aResult; + } + + public function createInverseWordSets($aWords, $iDepth) + { + $aResult = array(array(join(' ', $aWords))); + $sFirstToken = ''; + if ($iDepth < Phrase::MAX_DEPTH) { + while (sizeof($aWords) > 1) { + $sWord = array_pop($aWords); + $sFirstToken = $sWord.($sFirstToken?' ':'').$sFirstToken; + $aRest = $this->createInverseWordSets($aWords, $iDepth + 1); + foreach ($aRest as $aSet) { + $aResult[] = array_merge(array($sFirstToken), $aSet); + } + } + } + + return $aResult; + } +}; diff --git a/lib/lib.php b/lib/lib.php index b5fbee3e..76775d6c 100644 --- a/lib/lib.php +++ b/lib/lib.php @@ -60,54 +60,6 @@ function byImportance($a, $b) } -function getWordSets($aWords, $iDepth) -{ - $aResult = array(array(join(' ', $aWords))); - $sFirstToken = ''; - if ($iDepth < 7) { - while (sizeof($aWords) > 1) { - $sWord = array_shift($aWords); - $sFirstToken .= ($sFirstToken?' ':'').$sWord; - $aRest = getWordSets($aWords, $iDepth+1); - foreach ($aRest as $aSet) { - $aResult[] = array_merge(array($sFirstToken), $aSet); - } - } - } - return $aResult; -} - -function getInverseWordSets($aWords, $iDepth) -{ - $aResult = array(array(join(' ', $aWords))); - $sFirstToken = ''; - if ($iDepth < 8) { - while (sizeof($aWords) > 1) { - $sWord = array_pop($aWords); - $sFirstToken = $sWord.($sFirstToken?' 
':'').$sFirstToken; - $aRest = getInverseWordSets($aWords, $iDepth+1); - foreach ($aRest as $aSet) { - $aResult[] = array_merge(array($sFirstToken), $aSet); - } - } - } - return $aResult; -} - - -function getTokensFromSets($aSets) -{ - $aTokens = array(); - foreach ($aSets as $aSet) { - foreach ($aSet as $sWord) { - $aTokens[' '.$sWord] = ' '.$sWord; - $aTokens[$sWord] = $sWord; - } - } - return $aTokens; -} - - function getClassTypes() { return array( From 77abe882ab160669d5206f94426353883750857f Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Thu, 12 Oct 2017 22:59:07 +0200 Subject: [PATCH 2/8] take frequency scores from token description No need to hand them in separately. --- lib/Geocode.php | 7 +++---- lib/SearchDescription.php | 8 +++----- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/lib/Geocode.php b/lib/Geocode.php index 399139f0..bf0782f1 100644 --- a/lib/Geocode.php +++ b/lib/Geocode.php @@ -669,7 +669,7 @@ class Geocode return $aSearchResults; } - public function getGroupedSearches($aSearches, $aPhrases, $aValidTokens, $aWordFrequencyScores, $bIsStructured, $sNormQuery) + public function getGroupedSearches($aSearches, $aPhrases, $aValidTokens, $bIsStructured, $sNormQuery) { /* Calculate all searches using aValidTokens i.e. @@ -745,7 +745,6 @@ class Geocode $aSearchTerm, $bIsStructured, $iPhrase, - $aWordFrequencyScores, isset($aValidTokens[' '.$sToken]) ? $aValidTokens[' '.$sToken] : array() ); @@ -1037,7 +1036,7 @@ class Geocode // Any words that have failed completely? 
// TODO: suggestions - $aGroupedSearches = $this->getGroupedSearches($aSearches, $aPhrases, $aValidTokens, $aWordFrequencyScores, $bStructuredPhrases, $sNormQuery); + $aGroupedSearches = $this->getGroupedSearches($aSearches, $aPhrases, $aValidTokens, $bStructuredPhrases, $sNormQuery); if ($this->bReverseInPlan) { // Reverse phrase array and also reverse the order of the wordsets in @@ -1048,7 +1047,7 @@ class Geocode if (sizeof($aPhrases) > 1) { $aPhrases[sizeof($aPhrases)-1]->invertWordSets(); } - $aReverseGroupedSearches = $this->getGroupedSearches($aSearches, $aPhrases, $aValidTokens, $aWordFrequencyScores, false, $sNormQuery); + $aReverseGroupedSearches = $this->getGroupedSearches($aSearches, $aPhrases, $aValidTokens, false, $sNormQuery); foreach ($aGroupedSearches as $aSearches) { foreach ($aSearches as $aSearch) { diff --git a/lib/SearchDescription.php b/lib/SearchDescription.php index 1f3765ab..143d2864 100644 --- a/lib/SearchDescription.php +++ b/lib/SearchDescription.php @@ -333,14 +333,12 @@ class SearchDescription * @param mixed[] $aSearchTerm Description of the token. * @param bool $bStructuredPhrases True if the search is structured. * @param integer $iPhrase Number of the phrase the token is in. - * @param mixed[] $aWordFrequencyScores Number of times tokens appears - * overall in a planet database. * @param array[] $aFullTokens List of full term tokens with the * same name. * * @return SearchDescription[] List of derived search descriptions. */ - public function extendWithPartialTerm($aSearchTerm, $bStructuredPhrases, $iPhrase, &$aWordFrequencyScores, $aFullTokens) + public function extendWithPartialTerm($aSearchTerm, $bStructuredPhrases, $iPhrase, $aFullTokens) { // Only allow name terms. 
if (!(isset($aSearchTerm['word_id']) && $aSearchTerm['word_id'])) { @@ -354,7 +352,7 @@ class SearchDescription && sizeof($this->aName) && strpos($aSearchTerm['word_token'], ' ') === false ) { - if ($aWordFrequencyScores[$iWordID] < CONST_Max_Word_Frequency) { + if ($aSearchTerm['search_name_count'] + 1 < CONST_Max_Word_Frequency) { $oSearch = clone $this; $oSearch->iSearchRank++; $oSearch->aAddress[$iWordID] = $iWordID; @@ -397,7 +395,7 @@ class SearchDescription if (preg_match('#^[0-9]+$#', $aSearchTerm['word_token'])) { $oSearch->iSearchRank += 2; } - if ($aWordFrequencyScores[$iWordID] < CONST_Max_Word_Frequency) { + if ($aSearchTerm['search_name_count'] + 1 < CONST_Max_Word_Frequency) { $oSearch->aName[$iWordID] = $iWordID; } else { $oSearch->aNameNonSearch[$iWordID] = $iWordID; From c700421aa7a5509d096d6064379b2793ed191d69 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Fri, 13 Oct 2017 21:23:45 +0200 Subject: [PATCH 3/8] add documentation for Phrase --- lib/Phrase.php | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/lib/Phrase.php b/lib/Phrase.php index 23b9e3ca..0fe1d313 100644 --- a/lib/Phrase.php +++ b/lib/Phrase.php @@ -29,16 +29,35 @@ class Phrase $this->aWordSets = $this->createWordSets($this->aWords, 0); } + /** + * Return the element type of the phrase. + * + * @return string Phrase type if the phrase comes from a structured query + * or empty string otherwise. + */ public function getPhraseType() { return $this->sPhraseType; } + /** + * Return the array of possible segmentations of the phrase. + * + * @return string[][] Array of segmentations, each consisting of an + * array of terms. + */ public function getWordSets() { return $this->aWordSets; } + /** + * Add the tokens from this phrase to the given list of tokens. + * + * @param string[] $aTokens List of tokens to append. 
+ * + * @return void + */ public function addTokens(&$aTokens) { foreach ($this->aWordSets as $aSet) { @@ -49,6 +68,11 @@ class Phrase } } + /** + * Invert the set of possible segmentations. + * + * @return void + */ public function invertWordSets() { $this->aWordSets = $this->createInverseWordSets($this->aWords, 0); @@ -72,7 +96,7 @@ class Phrase return $aResult; } - public function createInverseWordSets($aWords, $iDepth) + private function createInverseWordSets($aWords, $iDepth) { $aResult = array(array(join(' ', $aWords))); $sFirstToken = ''; From 9ef2370a2a6e6383b2e166896bfa379e5faff485 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Fri, 13 Oct 2017 21:34:13 +0200 Subject: [PATCH 4/8] remove unused $aPossibleMainWordIDs array --- lib/Geocode.php | 2 -- 1 file changed, 2 deletions(-) diff --git a/lib/Geocode.php b/lib/Geocode.php index bf0782f1..9115be0b 100644 --- a/lib/Geocode.php +++ b/lib/Geocode.php @@ -989,7 +989,6 @@ class Geocode $this->oDB->getAll($sSQL), "Could not get word tokens." ); - $aPossibleMainWordIDs = array(); $aWordFrequencyScores = array(); foreach ($aDatabaseWords as $aToken) { // Very special case - require 2 letter country param to match the country code found @@ -1004,7 +1003,6 @@ class Geocode } else { $aValidTokens[$aToken['word_token']] = array($aToken); } - if (!$aToken['class'] && !$aToken['country_code']) $aPossibleMainWordIDs[$aToken['word_id']] = 1; $aWordFrequencyScores[$aToken['word_id']] = $aToken['search_name_count'] + 1; } if (CONST_Debug) var_Dump($aPhrases, $aValidTokens); From 77b76ae51bc4751c02e58678e9003fa9838abcf1 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Fri, 13 Oct 2017 22:23:39 +0200 Subject: [PATCH 5/8] simplify cross-check of country tokens Drop country tokens that do not match the country code list early. Remove in turn the special country code check for structured phrases. It is sufficient to do this during word list building. 
--- lib/Geocode.php | 9 +++++---- lib/SearchDescription.php | 18 ++++++++---------- 2 files changed, 13 insertions(+), 14 deletions(-) diff --git a/lib/Geocode.php b/lib/Geocode.php index 9115be0b..faf9e770 100644 --- a/lib/Geocode.php +++ b/lib/Geocode.php @@ -802,7 +802,7 @@ class Geocode // Revisit searches, drop bad searches and give penalty to unlikely combinations. $aGroupedSearches = array(); foreach ($aSearches as $oSearch) { - if (!$oSearch->isValidSearch($this->aCountryCodes)) { + if (!$oSearch->isValidSearch()) { continue; } @@ -991,9 +991,10 @@ class Geocode ); $aWordFrequencyScores = array(); foreach ($aDatabaseWords as $aToken) { - // Very special case - require 2 letter country param to match the country code found - if ($bStructuredPhrases && $aToken['country_code'] && !empty($this->aStructuredQuery['country']) - && strlen($this->aStructuredQuery['country']) == 2 && strtolower($this->aStructuredQuery['country']) != $aToken['country_code'] + // Filter country tokens that do not match restricted countries. + if ($this->aCountryCodes + && $aToken['country_code'] + && !in_array($aToken['country_code'], $this->aCountryCodes) ) { continue; } diff --git a/lib/SearchDescription.php b/lib/SearchDescription.php index 143d2864..ea7c96ef 100644 --- a/lib/SearchDescription.php +++ b/lib/SearchDescription.php @@ -155,22 +155,17 @@ class SearchDescription /** * Check if the combination of parameters is sensible. * - * @param string[] $aCountryCodes List of country codes. - * * @return bool True, if the search looks valid. 
*/ - public function isValidSearch(&$aCountryCodes) + public function isValidSearch() { if (!sizeof($this->aName)) { if ($this->sHouseNumber) { return false; } - } - if ($aCountryCodes - && $this->sCountryCode - && !in_array($this->sCountryCode, $aCountryCodes) - ) { - return false; + if (!$this->sClass && !$this->sCountryCode) { + return false; + } } return true; @@ -302,7 +297,10 @@ class SearchDescription $oSearch->setPoiSearch($iOp, $aSearchTerm['class'], $aSearchTerm['type']); $aNewSearches[] = $oSearch; } - } elseif (isset($aSearchTerm['word_id']) && $aSearchTerm['word_id']) { + } elseif (isset($aSearchTerm['word_id']) + && $aSearchTerm['word_id'] + && $sPhraseType != 'country' + ) { $iWordID = $aSearchTerm['word_id']; if (sizeof($this->aName)) { if (($sPhraseType == '' || !$bFirstPhrase) From 00265af528652d6c1bb32cf0694d71e8c5603f39 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Fri, 13 Oct 2017 23:04:12 +0200 Subject: [PATCH 6/8] move word recheck into token collection Drop tokens for special and postcode searches already when collecting them for ValidTokens when they cannot be found in the normalized query. --- lib/Geocode.php | 23 +++++++++++------------ lib/SearchDescription.php | 19 +++++-------------- 2 files changed, 16 insertions(+), 26 deletions(-) diff --git a/lib/Geocode.php b/lib/Geocode.php index faf9e770..f7f97593 100644 --- a/lib/Geocode.php +++ b/lib/Geocode.php @@ -669,7 +669,7 @@ class Geocode return $aSearchResults; } - public function getGroupedSearches($aSearches, $aPhrases, $aValidTokens, $bIsStructured, $sNormQuery) + public function getGroupedSearches($aSearches, $aPhrases, $aValidTokens, $bIsStructured) { /* Calculate all searches using aValidTokens i.e. @@ -707,17 +707,8 @@ class Geocode // If the token is valid if (isset($aValidTokens[' '.$sToken])) { foreach ($aValidTokens[' '.$sToken] as $aSearchTerm) { - // Recheck if the original word shows up in the query. 
- $bWordInQuery = false; - if (isset($aSearchTerm['word']) && $aSearchTerm['word']) { - $bWordInQuery = strpos( - $sNormQuery, - $this->normTerm($aSearchTerm['word']) - ) !== false; - } $aNewSearches = $oCurrentSearch->extendWithFullTerm( $aSearchTerm, - $bWordInQuery, isset($aValidTokens[$sToken]) && strpos($sToken, ' ') === false, $sPhraseType, @@ -999,6 +990,14 @@ class Geocode continue; } + // Special terms need to appear in their normalized form. + if ($aToken['word'] && $aToken['class']) { + $sNormWord = $this->normTerm($aToken['word']); + if (strpos($sNormQuery, $sNormWord) === false) { + continue; + } + } + if (isset($aValidTokens[$aToken['word_token']])) { $aValidTokens[$aToken['word_token']][] = $aToken; } else { @@ -1035,7 +1034,7 @@ class Geocode // Any words that have failed completely? // TODO: suggestions - $aGroupedSearches = $this->getGroupedSearches($aSearches, $aPhrases, $aValidTokens, $bStructuredPhrases, $sNormQuery); + $aGroupedSearches = $this->getGroupedSearches($aSearches, $aPhrases, $aValidTokens, $bStructuredPhrases); if ($this->bReverseInPlan) { // Reverse phrase array and also reverse the order of the wordsets in @@ -1046,7 +1045,7 @@ class Geocode if (sizeof($aPhrases) > 1) { $aPhrases[sizeof($aPhrases)-1]->invertWordSets(); } - $aReverseGroupedSearches = $this->getGroupedSearches($aSearches, $aPhrases, $aValidTokens, false, $sNormQuery); + $aReverseGroupedSearches = $this->getGroupedSearches($aSearches, $aPhrases, $aValidTokens, false); foreach ($aGroupedSearches as $aSearches) { foreach ($aSearches as $aSearch) { diff --git a/lib/SearchDescription.php b/lib/SearchDescription.php index ea7c96ef..1325a463 100644 --- a/lib/SearchDescription.php +++ b/lib/SearchDescription.php @@ -178,8 +178,6 @@ class SearchDescription * Derive new searches by adding a full term to the existing search. * * @param mixed[] $aSearchTerm Description of the token. 
- * @param bool $bWordInQuery True, if the normalised version of the word - * is contained in the query. * @param bool $bHasPartial True if there are also tokens of partial terms * with the same name. * @param string $sPhraseType Type of phrase the token is contained in. @@ -193,7 +191,7 @@ class SearchDescription * * @return SearchDescription[] List of derived search descriptions. */ - public function extendWithFullTerm($aSearchTerm, $bWordInQuery, $bHasPartial, $sPhraseType, $bFirstToken, $bFirstPhrase, $bLastToken, &$iGlobalRank) + public function extendWithFullTerm($aSearchTerm, $bHasPartial, $sPhraseType, $bFirstToken, $bFirstPhrase, $bLastToken, &$iGlobalRank) { $aNewSearches = array(); @@ -224,7 +222,8 @@ class SearchDescription // We need to try the case where the postal code is the primary element // (i.e. no way to tell if it is (postalcode, city) OR (city, postalcode) // so try both. - if (!$this->sPostcode && $bWordInQuery + if (!$this->sPostcode + && $aSearchTerm['word'] && pg_escape_string($aSearchTerm['word']) == $aSearchTerm['word'] ) { // If we have structured search or this is the first term, @@ -273,16 +272,8 @@ class SearchDescription } $aNewSearches[] = $oSearch; } - } elseif ($sPhraseType == '' - && $aSearchTerm['class'] !== '' && $aSearchTerm['class'] !== null - ) { - // require a normalized exact match of the term - // if we have the normalizer version of the query - // available - if ($this->iOperator == Operator::NONE - && (isset($aSearchTerm['word']) && $aSearchTerm['word']) - && $bWordInQuery - ) { + } elseif ($sPhraseType == '' && $aSearchTerm['class']) { + if ($this->iOperator == Operator::NONE) { $oSearch = clone $this; $oSearch->iSearchRank++; From cdf8c678988cdad7d10cc52b5ece22196c2491f9 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Fri, 13 Oct 2017 23:11:09 +0200 Subject: [PATCH 7/8] fix CodeSniffer offences --- lib/Geocode.php | 3 +-- lib/Phrase.php | 4 ++-- lib/SearchDescription.php | 10 +++++----- 3 files changed, 8 
insertions(+), 9 deletions(-) diff --git a/lib/Geocode.php b/lib/Geocode.php index f7f97593..be543012 100644 --- a/lib/Geocode.php +++ b/lib/Geocode.php @@ -1275,8 +1275,7 @@ class Geocode $aResult['name'] = $aResult['langaddress']; - if ($oCtx->hasNearPoint()) - { + if ($oCtx->hasNearPoint()) { $aResult['importance'] = 0.001; $aResult['foundorder'] = $aResult['addressimportance']; } else { diff --git a/lib/Phrase.php b/lib/Phrase.php index 0fe1d313..b39079d9 100644 --- a/lib/Phrase.php +++ b/lib/Phrase.php @@ -9,7 +9,7 @@ namespace Nominatim; */ class Phrase { - CONST MAX_DEPTH = 7; + const MAX_DEPTH = 7; // Complete phrase as a string. private $sPhrase; @@ -113,4 +113,4 @@ class Phrase return $aResult; } -}; +} diff --git a/lib/SearchDescription.php b/lib/SearchDescription.php index 1325a463..eba5f6a9 100644 --- a/lib/SearchDescription.php +++ b/lib/SearchDescription.php @@ -319,11 +319,11 @@ class SearchDescription /** * Derive new searches by adding a partial term to the existing search. * - * @param mixed[] $aSearchTerm Description of the token. - * @param bool $bStructuredPhrases True if the search is structured. - * @param integer $iPhrase Number of the phrase the token is in. - * @param array[] $aFullTokens List of full term tokens with the - * same name. + * @param mixed[] $aSearchTerm Description of the token. + * @param bool $bStructuredPhrases True if the search is structured. + * @param integer $iPhrase Number of the phrase the token is in. + * @param array[] $aFullTokens List of full term tokens with the + * same name. * * @return SearchDescription[] List of derived search descriptions. 
*/ From 5c18d6865d18ab4ff9e13acda1c9ca5cf04252c4 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Sat, 14 Oct 2017 20:28:52 +0200 Subject: [PATCH 8/8] adapt unit tests to new Phrase class --- test/php/Nominatim/NominatimTest.php | 70 ---------------------- test/php/Nominatim/PhraseTest.php | 87 ++++++++++++++++++++++++++++ 2 files changed, 87 insertions(+), 70 deletions(-) create mode 100644 test/php/Nominatim/PhraseTest.php diff --git a/test/php/Nominatim/NominatimTest.php b/test/php/Nominatim/NominatimTest.php index 33bb6d32..cae3ebb8 100644 --- a/test/php/Nominatim/NominatimTest.php +++ b/test/php/Nominatim/NominatimTest.php @@ -66,76 +66,6 @@ class NominatimTest extends \PHPUnit_Framework_TestCase } - public function testGetWordSets() - { - // given an array of arrays like - // array( array('a','b'), array('c','d') ) - // returns a summary as string: '(a|b),(c|d)' - - - function serializeSets($aSets) - { - $aParts = array(); - foreach ($aSets as $aSet) { - $aParts[] = '(' . join('|', $aSet) . 
')'; - } - return join(',', $aParts); - } - - $this->assertEquals( - array(array('')), - getWordSets(array(), 0) - ); - - $this->assertEquals( - '(a)', - serializeSets(getWordSets(array("a"), 0)) - ); - - $this->assertEquals( - '(a b),(a|b)', - serializeSets(getWordSets(array('a', 'b'), 0)) - ); - - $this->assertEquals( - '(a b c),(a|b c),(a|b|c),(a b|c)', - serializeSets(getWordSets(array('a', 'b', 'c'), 0)) - ); - - $this->assertEquals( - '(a b c d),(a|b c d),(a|b|c d),(a|b|c|d),(a|b c|d),(a b|c d),(a b|c|d),(a b c|d)', - serializeSets(getWordSets(array('a', 'b', 'c', 'd'), 0)) - ); - - - // Inverse - $this->assertEquals( - '(a b c),(c|a b),(c|b|a),(b c|a)', - serializeSets(getInverseWordSets(array('a', 'b', 'c'), 0)) - ); - - - // make sure we don't create too many sets - // 4 words => 8 sets - // 10 words => 511 sets - // 15 words => 12911 sets - // 18 words => 65536 sets - // 20 words => 169766 sets - // 22 words => 401930 sets - // 28 words => 3505699 sets (needs more than 4GB via 'phpunit -d memory_limit=' to run) - $this->assertEquals( - 8, - count(getWordSets(array_fill(0, 4, 'a'), 0)) - ); - - - $this->assertEquals( - 41226, - count(getWordSets(array_fill(0, 18, 'a'), 0)) - ); - } - - public function testCreatePointsAroundCenter() { // you might say we're creating a circle diff --git a/test/php/Nominatim/PhraseTest.php b/test/php/Nominatim/PhraseTest.php new file mode 100644 index 00000000..db8d8b50 --- /dev/null +++ b/test/php/Nominatim/PhraseTest.php @@ -0,0 +1,87 @@ +assertEquals( + array(array('')), + $oPhrase->getWordSets() + ); + } + + + public function testSingleWordPhrase() + { + $oPhrase = new Phrase('a', ''); + + $this->assertEquals( + '(a)', + $this->serializeSets($oPhrase->getWordSets()) + ); + } + + + public function testMultiWordPhrase() + { + $oPhrase = new Phrase('a b', ''); + $this->assertEquals( + '(a b),(a|b)', + $this->serializeSets($oPhrase->getWordSets()) + ); + + $oPhrase = new Phrase('a b c', ''); + $this->assertEquals( + '(a b 
c),(a|b c),(a|b|c),(a b|c)', + $this->serializeSets($oPhrase->getWordSets()) + ); + + $oPhrase = new Phrase('a b c d', ''); + $this->assertEquals( + '(a b c d),(a|b c d),(a|b|c d),(a|b|c|d),(a|b c|d),(a b|c d),(a b|c|d),(a b c|d)', + $this->serializeSets($oPhrase->getWordSets()) + ); + } + + + public function testInverseWordSets() + { + $oPhrase = new Phrase('a b c', ''); + $oPhrase->invertWordSets(); + + $this->assertEquals( + '(a b c),(c|a b),(c|b|a),(b c|a)', + $this->serializeSets($oPhrase->getWordSets()) + ); + } + + + public function testMaxDepth() + { + $oPhrase = new Phrase(join(' ', array_fill(0, 4, 'a')), ''); + $this->assertEquals(8, count($oPhrase->getWordSets())); + $oPhrase->invertWordSets(); + $this->assertEquals(8, count($oPhrase->getWordSets())); + + $oPhrase = new Phrase(join(' ', array_fill(0, 18, 'a')), ''); + $this->assertEquals(41226, count($oPhrase->getWordSets())); + $oPhrase->invertWordSets(); + $this->assertEquals(41226, count($oPhrase->getWordSets())); + } +}