forked from hans/Nominatim
Merge pull request #814 from lonvia/phrase-as-a-class
Make phrases a class and add early checking of token validity
This commit is contained in:
@@ -3,6 +3,7 @@
|
|||||||
namespace Nominatim;
|
namespace Nominatim;
|
||||||
|
|
||||||
require_once(CONST_BasePath.'/lib/PlaceLookup.php');
|
require_once(CONST_BasePath.'/lib/PlaceLookup.php');
|
||||||
|
require_once(CONST_BasePath.'/lib/Phrase.php');
|
||||||
require_once(CONST_BasePath.'/lib/ReverseGeocode.php');
|
require_once(CONST_BasePath.'/lib/ReverseGeocode.php');
|
||||||
require_once(CONST_BasePath.'/lib/SearchDescription.php');
|
require_once(CONST_BasePath.'/lib/SearchDescription.php');
|
||||||
require_once(CONST_BasePath.'/lib/SearchContext.php');
|
require_once(CONST_BasePath.'/lib/SearchContext.php');
|
||||||
@@ -668,7 +669,7 @@ class Geocode
|
|||||||
return $aSearchResults;
|
return $aSearchResults;
|
||||||
}
|
}
|
||||||
|
|
||||||
public function getGroupedSearches($aSearches, $aPhraseTypes, $aPhrases, $aValidTokens, $aWordFrequencyScores, $bStructuredPhrases, $sNormQuery)
|
public function getGroupedSearches($aSearches, $aPhrases, $aValidTokens, $bIsStructured)
|
||||||
{
|
{
|
||||||
/*
|
/*
|
||||||
Calculate all searches using aValidTokens i.e.
|
Calculate all searches using aValidTokens i.e.
|
||||||
@@ -683,15 +684,11 @@ class Geocode
|
|||||||
*/
|
*/
|
||||||
$iGlobalRank = 0;
|
$iGlobalRank = 0;
|
||||||
|
|
||||||
foreach ($aPhrases as $iPhrase => $aPhrase) {
|
foreach ($aPhrases as $iPhrase => $oPhrase) {
|
||||||
$aNewPhraseSearches = array();
|
$aNewPhraseSearches = array();
|
||||||
if ($bStructuredPhrases) {
|
$sPhraseType = $bIsStructured ? $oPhrase->getPhraseType() : '';
|
||||||
$sPhraseType = $aPhraseTypes[$iPhrase];
|
|
||||||
} else {
|
|
||||||
$sPhraseType = '';
|
|
||||||
}
|
|
||||||
|
|
||||||
foreach ($aPhrase['wordsets'] as $iWordSet => $aWordset) {
|
foreach ($oPhrase->getWordSets() as $iWordSet => $aWordset) {
|
||||||
// Too many permutations - too expensive
|
// Too many permutations - too expensive
|
||||||
if ($iWordSet > 120) break;
|
if ($iWordSet > 120) break;
|
||||||
|
|
||||||
@@ -710,17 +707,8 @@ class Geocode
|
|||||||
// If the token is valid
|
// If the token is valid
|
||||||
if (isset($aValidTokens[' '.$sToken])) {
|
if (isset($aValidTokens[' '.$sToken])) {
|
||||||
foreach ($aValidTokens[' '.$sToken] as $aSearchTerm) {
|
foreach ($aValidTokens[' '.$sToken] as $aSearchTerm) {
|
||||||
// Recheck if the original word shows up in the query.
|
|
||||||
$bWordInQuery = false;
|
|
||||||
if (isset($aSearchTerm['word']) && $aSearchTerm['word']) {
|
|
||||||
$bWordInQuery = strpos(
|
|
||||||
$sNormQuery,
|
|
||||||
$this->normTerm($aSearchTerm['word'])
|
|
||||||
) !== false;
|
|
||||||
}
|
|
||||||
$aNewSearches = $oCurrentSearch->extendWithFullTerm(
|
$aNewSearches = $oCurrentSearch->extendWithFullTerm(
|
||||||
$aSearchTerm,
|
$aSearchTerm,
|
||||||
$bWordInQuery,
|
|
||||||
isset($aValidTokens[$sToken])
|
isset($aValidTokens[$sToken])
|
||||||
&& strpos($sToken, ' ') === false,
|
&& strpos($sToken, ' ') === false,
|
||||||
$sPhraseType,
|
$sPhraseType,
|
||||||
@@ -746,9 +734,8 @@ class Geocode
|
|||||||
foreach ($aValidTokens[$sToken] as $aSearchTerm) {
|
foreach ($aValidTokens[$sToken] as $aSearchTerm) {
|
||||||
$aNewSearches = $oCurrentSearch->extendWithPartialTerm(
|
$aNewSearches = $oCurrentSearch->extendWithPartialTerm(
|
||||||
$aSearchTerm,
|
$aSearchTerm,
|
||||||
$bStructuredPhrases,
|
$bIsStructured,
|
||||||
$iPhrase,
|
$iPhrase,
|
||||||
$aWordFrequencyScores,
|
|
||||||
isset($aValidTokens[' '.$sToken]) ? $aValidTokens[' '.$sToken] : array()
|
isset($aValidTokens[' '.$sToken]) ? $aValidTokens[' '.$sToken] : array()
|
||||||
);
|
);
|
||||||
|
|
||||||
@@ -806,7 +793,7 @@ class Geocode
|
|||||||
// Revisit searches, drop bad searches and give penalty to unlikely combinations.
|
// Revisit searches, drop bad searches and give penalty to unlikely combinations.
|
||||||
$aGroupedSearches = array();
|
$aGroupedSearches = array();
|
||||||
foreach ($aSearches as $oSearch) {
|
foreach ($aSearches as $oSearch) {
|
||||||
if (!$oSearch->isValidSearch($this->aCountryCodes)) {
|
if (!$oSearch->isValidSearch()) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -955,10 +942,10 @@ class Geocode
|
|||||||
// Split query into phrases
|
// Split query into phrases
|
||||||
// Commas are used to reduce the search space by indicating where phrases split
|
// Commas are used to reduce the search space by indicating where phrases split
|
||||||
if ($this->aStructuredQuery) {
|
if ($this->aStructuredQuery) {
|
||||||
$aPhrases = $this->aStructuredQuery;
|
$aInPhrases = $this->aStructuredQuery;
|
||||||
$bStructuredPhrases = true;
|
$bStructuredPhrases = true;
|
||||||
} else {
|
} else {
|
||||||
$aPhrases = explode(',', $sQuery);
|
$aInPhrases = explode(',', $sQuery);
|
||||||
$bStructuredPhrases = false;
|
$bStructuredPhrases = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -967,25 +954,19 @@ class Geocode
|
|||||||
// Get all 'sets' of words
|
// Get all 'sets' of words
|
||||||
// Generate a complete list of all
|
// Generate a complete list of all
|
||||||
$aTokens = array();
|
$aTokens = array();
|
||||||
foreach ($aPhrases as $iPhrase => $sPhrase) {
|
$aPhrases = array();
|
||||||
$aPhrase = chksql(
|
foreach ($aInPhrases as $iPhrase => $sPhrase) {
|
||||||
$this->oDB->getRow("SELECT make_standard_name('".pg_escape_string($sPhrase)."') as string"),
|
$sPhrase = chksql(
|
||||||
|
$this->oDB->getOne('SELECT make_standard_name('.getDBQuoted($sPhrase).')'),
|
||||||
"Cannot normalize query string (is it a UTF-8 string?)"
|
"Cannot normalize query string (is it a UTF-8 string?)"
|
||||||
);
|
);
|
||||||
if (trim($aPhrase['string'])) {
|
if (trim($sPhrase)) {
|
||||||
$aPhrases[$iPhrase] = $aPhrase;
|
$oPhrase = new Phrase($sPhrase, is_string($iPhrase) ? $iPhrase : '');
|
||||||
$aPhrases[$iPhrase]['words'] = explode(' ', $aPhrases[$iPhrase]['string']);
|
$oPhrase->addTokens($aTokens);
|
||||||
$aPhrases[$iPhrase]['wordsets'] = getWordSets($aPhrases[$iPhrase]['words'], 0);
|
$aPhrases[] = $oPhrase;
|
||||||
$aTokens = array_merge($aTokens, getTokensFromSets($aPhrases[$iPhrase]['wordsets']));
|
|
||||||
} else {
|
|
||||||
unset($aPhrases[$iPhrase]);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Reindex phrases - we make assumptions later on that they are numerically keyed in order
|
|
||||||
$aPhraseTypes = array_keys($aPhrases);
|
|
||||||
$aPhrases = array_values($aPhrases);
|
|
||||||
|
|
||||||
if (sizeof($aTokens)) {
|
if (sizeof($aTokens)) {
|
||||||
// Check which tokens we have, get the ID numbers
|
// Check which tokens we have, get the ID numbers
|
||||||
$sSQL = 'SELECT word_id, word_token, word, class, type, country_code, operator, search_name_count';
|
$sSQL = 'SELECT word_id, word_token, word, class, type, country_code, operator, search_name_count';
|
||||||
@@ -999,22 +980,29 @@ class Geocode
|
|||||||
$this->oDB->getAll($sSQL),
|
$this->oDB->getAll($sSQL),
|
||||||
"Could not get word tokens."
|
"Could not get word tokens."
|
||||||
);
|
);
|
||||||
$aPossibleMainWordIDs = array();
|
|
||||||
$aWordFrequencyScores = array();
|
$aWordFrequencyScores = array();
|
||||||
foreach ($aDatabaseWords as $aToken) {
|
foreach ($aDatabaseWords as $aToken) {
|
||||||
// Very special case - require 2 letter country param to match the country code found
|
// Filter country tokens that do not match restricted countries.
|
||||||
if ($bStructuredPhrases && $aToken['country_code'] && !empty($this->aStructuredQuery['country'])
|
if ($this->aCountryCodes
|
||||||
&& strlen($this->aStructuredQuery['country']) == 2 && strtolower($this->aStructuredQuery['country']) != $aToken['country_code']
|
&& $aToken['country_code']
|
||||||
|
&& !in_array($aToken['country_code'], $this->aCountryCodes)
|
||||||
) {
|
) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Special terms need to appear in their normalized form.
|
||||||
|
if ($aToken['word'] && $aToken['class']) {
|
||||||
|
$sNormWord = $this->normTerm($aToken['word']);
|
||||||
|
if (strpos($sNormQuery, $sNormWord) === false) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (isset($aValidTokens[$aToken['word_token']])) {
|
if (isset($aValidTokens[$aToken['word_token']])) {
|
||||||
$aValidTokens[$aToken['word_token']][] = $aToken;
|
$aValidTokens[$aToken['word_token']][] = $aToken;
|
||||||
} else {
|
} else {
|
||||||
$aValidTokens[$aToken['word_token']] = array($aToken);
|
$aValidTokens[$aToken['word_token']] = array($aToken);
|
||||||
}
|
}
|
||||||
if (!$aToken['class'] && !$aToken['country_code']) $aPossibleMainWordIDs[$aToken['word_id']] = 1;
|
|
||||||
$aWordFrequencyScores[$aToken['word_id']] = $aToken['search_name_count'] + 1;
|
$aWordFrequencyScores[$aToken['word_id']] = $aToken['search_name_count'] + 1;
|
||||||
}
|
}
|
||||||
if (CONST_Debug) var_Dump($aPhrases, $aValidTokens);
|
if (CONST_Debug) var_Dump($aPhrases, $aValidTokens);
|
||||||
@@ -1046,19 +1034,18 @@ class Geocode
|
|||||||
// Any words that have failed completely?
|
// Any words that have failed completely?
|
||||||
// TODO: suggestions
|
// TODO: suggestions
|
||||||
|
|
||||||
$aGroupedSearches = $this->getGroupedSearches($aSearches, $aPhraseTypes, $aPhrases, $aValidTokens, $aWordFrequencyScores, $bStructuredPhrases, $sNormQuery);
|
$aGroupedSearches = $this->getGroupedSearches($aSearches, $aPhrases, $aValidTokens, $bStructuredPhrases);
|
||||||
|
|
||||||
if ($this->bReverseInPlan) {
|
if ($this->bReverseInPlan) {
|
||||||
// Reverse phrase array and also reverse the order of the wordsets in
|
// Reverse phrase array and also reverse the order of the wordsets in
|
||||||
// the first and final phrase. Don't bother about phrases in the middle
|
// the first and final phrase. Don't bother about phrases in the middle
|
||||||
// because order in the address doesn't matter.
|
// because order in the address doesn't matter.
|
||||||
$aPhrases = array_reverse($aPhrases);
|
$aPhrases = array_reverse($aPhrases);
|
||||||
$aPhrases[0]['wordsets'] = getInverseWordSets($aPhrases[0]['words'], 0);
|
$aPhrases[0]->invertWordSets();
|
||||||
if (sizeof($aPhrases) > 1) {
|
if (sizeof($aPhrases) > 1) {
|
||||||
$aFinalPhrase = end($aPhrases);
|
$aPhrases[sizeof($aPhrases)-1]->invertWordSets();
|
||||||
$aPhrases[sizeof($aPhrases)-1]['wordsets'] = getInverseWordSets($aFinalPhrase['words'], 0);
|
|
||||||
}
|
}
|
||||||
$aReverseGroupedSearches = $this->getGroupedSearches($aSearches, null, $aPhrases, $aValidTokens, $aWordFrequencyScores, false, $sNormQuery);
|
$aReverseGroupedSearches = $this->getGroupedSearches($aSearches, $aPhrases, $aValidTokens, false);
|
||||||
|
|
||||||
foreach ($aGroupedSearches as $aSearches) {
|
foreach ($aGroupedSearches as $aSearches) {
|
||||||
foreach ($aSearches as $aSearch) {
|
foreach ($aSearches as $aSearch) {
|
||||||
@@ -1288,8 +1275,7 @@ class Geocode
|
|||||||
|
|
||||||
$aResult['name'] = $aResult['langaddress'];
|
$aResult['name'] = $aResult['langaddress'];
|
||||||
|
|
||||||
if ($oCtx->hasNearPoint())
|
if ($oCtx->hasNearPoint()) {
|
||||||
{
|
|
||||||
$aResult['importance'] = 0.001;
|
$aResult['importance'] = 0.001;
|
||||||
$aResult['foundorder'] = $aResult['addressimportance'];
|
$aResult['foundorder'] = $aResult['addressimportance'];
|
||||||
} else {
|
} else {
|
||||||
|
|||||||
116
lib/Phrase.php
Normal file
116
lib/Phrase.php
Normal file
@@ -0,0 +1,116 @@
|
|||||||
|
<?php
|
||||||
|
|
||||||
|
namespace Nominatim;
|
||||||
|
|
||||||
|
/**
 * Segment of a query string.
 *
 * The parts of a query string are usually separated by commas. A phrase
 * keeps its space-separated words together with the possible ways of
 * cutting that word sequence into consecutive terms (the word sets).
 */
class Phrase
{
    // Maximum recursion depth when cutting a phrase into terms. Once it is
    // reached the remaining words are kept together as a single term, which
    // limits the number of word sets created for long phrases.
    const MAX_DEPTH = 7;

    // Complete phrase as a string.
    private $sPhrase;
    // Element type for structured searches.
    private $sPhraseType;
    // Space-separated words of the phrase.
    private $aWords;
    // Possible segmentations of the phrase.
    private $aWordSets;


    /**
     * Create a phrase and compute its (forward) segmentations.
     *
     * @param string $sPhrase     Text of the phrase. Surrounding whitespace
     *                            is trimmed.
     * @param string $sPhraseType Element type when the phrase comes from a
     *                            structured query, empty string otherwise.
     */
    public function __construct($sPhrase, $sPhraseType)
    {
        $this->sPhrase = trim($sPhrase);
        $this->sPhraseType = $sPhraseType;
        // Split on runs of spaces and drop empty pieces. A plain
        // explode(' ', ...) turns consecutive spaces into empty words,
        // which end up as '' and ' ' entries in the token list.
        // An empty phrase yields no words and thus the single word set
        // array('').
        $this->aWords = preg_split('/ +/', $this->sPhrase, -1, PREG_SPLIT_NO_EMPTY);
        $this->aWordSets = $this->createWordSets($this->aWords, 0);
    }

    /**
     * Return the element type of the phrase.
     *
     * @return string Phrase type if the phrase comes from a structured query
     *                or empty string otherwise.
     */
    public function getPhraseType()
    {
        return $this->sPhraseType;
    }

    /**
     * Return the array of possible segmentations of the phrase.
     *
     * @return string[][] Array of segmentations, each consisting of an
     *                    array of terms.
     */
    public function getWordSets()
    {
        return $this->aWordSets;
    }

    /**
     * Add the tokens from this phrase to the given list of tokens.
     *
     * Every term of every word set is added twice: once prefixed with a
     * space and once as is. Entries are keyed by their own value, so a
     * token that is already in the list is not duplicated.
     *
     * @param string[] $aTokens List of tokens to append. Modified in place.
     *
     * @return void
     */
    public function addTokens(&$aTokens)
    {
        foreach ($this->aWordSets as $aSet) {
            foreach ($aSet as $sWord) {
                $aTokens[' '.$sWord] = ' '.$sWord;
                $aTokens[$sWord] = $sWord;
            }
        }
    }

    /**
     * Invert the set of possible segmentations.
     *
     * Recomputes the word sets so that terms are cut off from the end of
     * the phrase and listed first.
     *
     * @return void
     */
    public function invertWordSets()
    {
        $this->aWordSets = $this->createInverseWordSets($this->aWords, 0);
    }

    /**
     * Build all segmentations of a word list, cutting terms off the front.
     *
     * @param string[] $aWords Words to be segmented.
     * @param integer  $iDepth Current recursion depth.
     *
     * @return string[][] List of word sets. The first set always consists
     *                    of the single term with all words joined.
     */
    private function createWordSets($aWords, $iDepth)
    {
        $aResult = array(array(join(' ', $aWords)));
        $sFirstToken = '';
        if ($iDepth < self::MAX_DEPTH) {
            while (sizeof($aWords) > 1) {
                // Grow the leading term by one word and combine it with
                // every segmentation of the remaining words.
                $sWord = array_shift($aWords);
                $sFirstToken .= ($sFirstToken?' ':'').$sWord;
                $aRest = $this->createWordSets($aWords, $iDepth + 1);
                foreach ($aRest as $aSet) {
                    $aResult[] = array_merge(array($sFirstToken), $aSet);
                }
            }
        }

        return $aResult;
    }

    /**
     * Build all segmentations of a word list, cutting terms off the end.
     *
     * The term taken from the end of the phrase is listed first in each
     * resulting word set.
     *
     * @param string[] $aWords Words to be segmented.
     * @param integer  $iDepth Current recursion depth.
     *
     * @return string[][] List of word sets. The first set always consists
     *                    of the single term with all words joined.
     */
    private function createInverseWordSets($aWords, $iDepth)
    {
        $aResult = array(array(join(' ', $aWords)));
        $sFirstToken = '';
        if ($iDepth < self::MAX_DEPTH) {
            while (sizeof($aWords) > 1) {
                // Grow the trailing term by one word (keeping the words in
                // their original order) and combine it with every inverse
                // segmentation of the words before it.
                $sWord = array_pop($aWords);
                $sFirstToken = $sWord.($sFirstToken?' ':'').$sFirstToken;
                $aRest = $this->createInverseWordSets($aWords, $iDepth + 1);
                foreach ($aRest as $aSet) {
                    $aResult[] = array_merge(array($sFirstToken), $aSet);
                }
            }
        }

        return $aResult;
    }
}
|
||||||
@@ -155,22 +155,17 @@ class SearchDescription
|
|||||||
/**
|
/**
|
||||||
* Check if the combination of parameters is sensible.
|
* Check if the combination of parameters is sensible.
|
||||||
*
|
*
|
||||||
* @param string[] $aCountryCodes List of country codes.
|
|
||||||
*
|
|
||||||
* @return bool True, if the search looks valid.
|
* @return bool True, if the search looks valid.
|
||||||
*/
|
*/
|
||||||
public function isValidSearch(&$aCountryCodes)
|
public function isValidSearch()
|
||||||
{
|
{
|
||||||
if (!sizeof($this->aName)) {
|
if (!sizeof($this->aName)) {
|
||||||
if ($this->sHouseNumber) {
|
if ($this->sHouseNumber) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
if (!$this->sClass && !$this->sCountryCode) {
|
||||||
if ($aCountryCodes
|
return false;
|
||||||
&& $this->sCountryCode
|
}
|
||||||
&& !in_array($this->sCountryCode, $aCountryCodes)
|
|
||||||
) {
|
|
||||||
return false;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
@@ -183,8 +178,6 @@ class SearchDescription
|
|||||||
* Derive new searches by adding a full term to the existing search.
|
* Derive new searches by adding a full term to the existing search.
|
||||||
*
|
*
|
||||||
* @param mixed[] $aSearchTerm Description of the token.
|
* @param mixed[] $aSearchTerm Description of the token.
|
||||||
* @param bool $bWordInQuery True, if the normalised version of the word
|
|
||||||
* is contained in the query.
|
|
||||||
* @param bool $bHasPartial True if there are also tokens of partial terms
|
* @param bool $bHasPartial True if there are also tokens of partial terms
|
||||||
* with the same name.
|
* with the same name.
|
||||||
* @param string $sPhraseType Type of phrase the token is contained in.
|
* @param string $sPhraseType Type of phrase the token is contained in.
|
||||||
@@ -198,7 +191,7 @@ class SearchDescription
|
|||||||
*
|
*
|
||||||
* @return SearchDescription[] List of derived search descriptions.
|
* @return SearchDescription[] List of derived search descriptions.
|
||||||
*/
|
*/
|
||||||
public function extendWithFullTerm($aSearchTerm, $bWordInQuery, $bHasPartial, $sPhraseType, $bFirstToken, $bFirstPhrase, $bLastToken, &$iGlobalRank)
|
public function extendWithFullTerm($aSearchTerm, $bHasPartial, $sPhraseType, $bFirstToken, $bFirstPhrase, $bLastToken, &$iGlobalRank)
|
||||||
{
|
{
|
||||||
$aNewSearches = array();
|
$aNewSearches = array();
|
||||||
|
|
||||||
@@ -229,7 +222,8 @@ class SearchDescription
|
|||||||
// We need to try the case where the postal code is the primary element
|
// We need to try the case where the postal code is the primary element
|
||||||
// (i.e. no way to tell if it is (postalcode, city) OR (city, postalcode)
|
// (i.e. no way to tell if it is (postalcode, city) OR (city, postalcode)
|
||||||
// so try both.
|
// so try both.
|
||||||
if (!$this->sPostcode && $bWordInQuery
|
if (!$this->sPostcode
|
||||||
|
&& $aSearchTerm['word']
|
||||||
&& pg_escape_string($aSearchTerm['word']) == $aSearchTerm['word']
|
&& pg_escape_string($aSearchTerm['word']) == $aSearchTerm['word']
|
||||||
) {
|
) {
|
||||||
// If we have structured search or this is the first term,
|
// If we have structured search or this is the first term,
|
||||||
@@ -278,16 +272,8 @@ class SearchDescription
|
|||||||
}
|
}
|
||||||
$aNewSearches[] = $oSearch;
|
$aNewSearches[] = $oSearch;
|
||||||
}
|
}
|
||||||
} elseif ($sPhraseType == ''
|
} elseif ($sPhraseType == '' && $aSearchTerm['class']) {
|
||||||
&& $aSearchTerm['class'] !== '' && $aSearchTerm['class'] !== null
|
if ($this->iOperator == Operator::NONE) {
|
||||||
) {
|
|
||||||
// require a normalized exact match of the term
|
|
||||||
// if we have the normalizer version of the query
|
|
||||||
// available
|
|
||||||
if ($this->iOperator == Operator::NONE
|
|
||||||
&& (isset($aSearchTerm['word']) && $aSearchTerm['word'])
|
|
||||||
&& $bWordInQuery
|
|
||||||
) {
|
|
||||||
$oSearch = clone $this;
|
$oSearch = clone $this;
|
||||||
$oSearch->iSearchRank++;
|
$oSearch->iSearchRank++;
|
||||||
|
|
||||||
@@ -302,7 +288,10 @@ class SearchDescription
|
|||||||
$oSearch->setPoiSearch($iOp, $aSearchTerm['class'], $aSearchTerm['type']);
|
$oSearch->setPoiSearch($iOp, $aSearchTerm['class'], $aSearchTerm['type']);
|
||||||
$aNewSearches[] = $oSearch;
|
$aNewSearches[] = $oSearch;
|
||||||
}
|
}
|
||||||
} elseif (isset($aSearchTerm['word_id']) && $aSearchTerm['word_id']) {
|
} elseif (isset($aSearchTerm['word_id'])
|
||||||
|
&& $aSearchTerm['word_id']
|
||||||
|
&& $sPhraseType != 'country'
|
||||||
|
) {
|
||||||
$iWordID = $aSearchTerm['word_id'];
|
$iWordID = $aSearchTerm['word_id'];
|
||||||
if (sizeof($this->aName)) {
|
if (sizeof($this->aName)) {
|
||||||
if (($sPhraseType == '' || !$bFirstPhrase)
|
if (($sPhraseType == '' || !$bFirstPhrase)
|
||||||
@@ -330,17 +319,15 @@ class SearchDescription
|
|||||||
/**
|
/**
|
||||||
* Derive new searches by adding a partial term to the existing search.
|
* Derive new searches by adding a partial term to the existing search.
|
||||||
*
|
*
|
||||||
* @param mixed[] $aSearchTerm Description of the token.
|
* @param mixed[] $aSearchTerm Description of the token.
|
||||||
* @param bool $bStructuredPhrases True if the search is structured.
|
* @param bool $bStructuredPhrases True if the search is structured.
|
||||||
* @param integer $iPhrase Number of the phrase the token is in.
|
* @param integer $iPhrase Number of the phrase the token is in.
|
||||||
* @param mixed[] $aWordFrequencyScores Number of times tokens appears
|
* @param array[] $aFullTokens List of full term tokens with the
|
||||||
* overall in a planet database.
|
* same name.
|
||||||
* @param array[] $aFullTokens List of full term tokens with the
|
|
||||||
* same name.
|
|
||||||
*
|
*
|
||||||
* @return SearchDescription[] List of derived search descriptions.
|
* @return SearchDescription[] List of derived search descriptions.
|
||||||
*/
|
*/
|
||||||
public function extendWithPartialTerm($aSearchTerm, $bStructuredPhrases, $iPhrase, &$aWordFrequencyScores, $aFullTokens)
|
public function extendWithPartialTerm($aSearchTerm, $bStructuredPhrases, $iPhrase, $aFullTokens)
|
||||||
{
|
{
|
||||||
// Only allow name terms.
|
// Only allow name terms.
|
||||||
if (!(isset($aSearchTerm['word_id']) && $aSearchTerm['word_id'])) {
|
if (!(isset($aSearchTerm['word_id']) && $aSearchTerm['word_id'])) {
|
||||||
@@ -354,7 +341,7 @@ class SearchDescription
|
|||||||
&& sizeof($this->aName)
|
&& sizeof($this->aName)
|
||||||
&& strpos($aSearchTerm['word_token'], ' ') === false
|
&& strpos($aSearchTerm['word_token'], ' ') === false
|
||||||
) {
|
) {
|
||||||
if ($aWordFrequencyScores[$iWordID] < CONST_Max_Word_Frequency) {
|
if ($aSearchTerm['search_name_count'] + 1 < CONST_Max_Word_Frequency) {
|
||||||
$oSearch = clone $this;
|
$oSearch = clone $this;
|
||||||
$oSearch->iSearchRank++;
|
$oSearch->iSearchRank++;
|
||||||
$oSearch->aAddress[$iWordID] = $iWordID;
|
$oSearch->aAddress[$iWordID] = $iWordID;
|
||||||
@@ -397,7 +384,7 @@ class SearchDescription
|
|||||||
if (preg_match('#^[0-9]+$#', $aSearchTerm['word_token'])) {
|
if (preg_match('#^[0-9]+$#', $aSearchTerm['word_token'])) {
|
||||||
$oSearch->iSearchRank += 2;
|
$oSearch->iSearchRank += 2;
|
||||||
}
|
}
|
||||||
if ($aWordFrequencyScores[$iWordID] < CONST_Max_Word_Frequency) {
|
if ($aSearchTerm['search_name_count'] + 1 < CONST_Max_Word_Frequency) {
|
||||||
$oSearch->aName[$iWordID] = $iWordID;
|
$oSearch->aName[$iWordID] = $iWordID;
|
||||||
} else {
|
} else {
|
||||||
$oSearch->aNameNonSearch[$iWordID] = $iWordID;
|
$oSearch->aNameNonSearch[$iWordID] = $iWordID;
|
||||||
|
|||||||
48
lib/lib.php
48
lib/lib.php
@@ -60,54 +60,6 @@ function byImportance($a, $b)
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
function getWordSets($aWords, $iDepth)
|
|
||||||
{
|
|
||||||
$aResult = array(array(join(' ', $aWords)));
|
|
||||||
$sFirstToken = '';
|
|
||||||
if ($iDepth < 7) {
|
|
||||||
while (sizeof($aWords) > 1) {
|
|
||||||
$sWord = array_shift($aWords);
|
|
||||||
$sFirstToken .= ($sFirstToken?' ':'').$sWord;
|
|
||||||
$aRest = getWordSets($aWords, $iDepth+1);
|
|
||||||
foreach ($aRest as $aSet) {
|
|
||||||
$aResult[] = array_merge(array($sFirstToken), $aSet);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return $aResult;
|
|
||||||
}
|
|
||||||
|
|
||||||
function getInverseWordSets($aWords, $iDepth)
|
|
||||||
{
|
|
||||||
$aResult = array(array(join(' ', $aWords)));
|
|
||||||
$sFirstToken = '';
|
|
||||||
if ($iDepth < 8) {
|
|
||||||
while (sizeof($aWords) > 1) {
|
|
||||||
$sWord = array_pop($aWords);
|
|
||||||
$sFirstToken = $sWord.($sFirstToken?' ':'').$sFirstToken;
|
|
||||||
$aRest = getInverseWordSets($aWords, $iDepth+1);
|
|
||||||
foreach ($aRest as $aSet) {
|
|
||||||
$aResult[] = array_merge(array($sFirstToken), $aSet);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return $aResult;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
function getTokensFromSets($aSets)
|
|
||||||
{
|
|
||||||
$aTokens = array();
|
|
||||||
foreach ($aSets as $aSet) {
|
|
||||||
foreach ($aSet as $sWord) {
|
|
||||||
$aTokens[' '.$sWord] = ' '.$sWord;
|
|
||||||
$aTokens[$sWord] = $sWord;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return $aTokens;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
function getClassTypes()
|
function getClassTypes()
|
||||||
{
|
{
|
||||||
return array(
|
return array(
|
||||||
|
|||||||
@@ -66,76 +66,6 @@ class NominatimTest extends \PHPUnit_Framework_TestCase
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public function testGetWordSets()
|
|
||||||
{
|
|
||||||
// given an array of arrays like
|
|
||||||
// array( array('a','b'), array('c','d') )
|
|
||||||
// returns a summary as string: '(a|b),(c|d)'
|
|
||||||
|
|
||||||
|
|
||||||
function serializeSets($aSets)
|
|
||||||
{
|
|
||||||
$aParts = array();
|
|
||||||
foreach ($aSets as $aSet) {
|
|
||||||
$aParts[] = '(' . join('|', $aSet) . ')';
|
|
||||||
}
|
|
||||||
return join(',', $aParts);
|
|
||||||
}
|
|
||||||
|
|
||||||
$this->assertEquals(
|
|
||||||
array(array('')),
|
|
||||||
getWordSets(array(), 0)
|
|
||||||
);
|
|
||||||
|
|
||||||
$this->assertEquals(
|
|
||||||
'(a)',
|
|
||||||
serializeSets(getWordSets(array("a"), 0))
|
|
||||||
);
|
|
||||||
|
|
||||||
$this->assertEquals(
|
|
||||||
'(a b),(a|b)',
|
|
||||||
serializeSets(getWordSets(array('a', 'b'), 0))
|
|
||||||
);
|
|
||||||
|
|
||||||
$this->assertEquals(
|
|
||||||
'(a b c),(a|b c),(a|b|c),(a b|c)',
|
|
||||||
serializeSets(getWordSets(array('a', 'b', 'c'), 0))
|
|
||||||
);
|
|
||||||
|
|
||||||
$this->assertEquals(
|
|
||||||
'(a b c d),(a|b c d),(a|b|c d),(a|b|c|d),(a|b c|d),(a b|c d),(a b|c|d),(a b c|d)',
|
|
||||||
serializeSets(getWordSets(array('a', 'b', 'c', 'd'), 0))
|
|
||||||
);
|
|
||||||
|
|
||||||
|
|
||||||
// Inverse
|
|
||||||
$this->assertEquals(
|
|
||||||
'(a b c),(c|a b),(c|b|a),(b c|a)',
|
|
||||||
serializeSets(getInverseWordSets(array('a', 'b', 'c'), 0))
|
|
||||||
);
|
|
||||||
|
|
||||||
|
|
||||||
// make sure we don't create too many sets
|
|
||||||
// 4 words => 8 sets
|
|
||||||
// 10 words => 511 sets
|
|
||||||
// 15 words => 12911 sets
|
|
||||||
// 18 words => 65536 sets
|
|
||||||
// 20 words => 169766 sets
|
|
||||||
// 22 words => 401930 sets
|
|
||||||
// 28 words => 3505699 sets (needs more than 4GB via 'phpunit -d memory_limit=' to run)
|
|
||||||
$this->assertEquals(
|
|
||||||
8,
|
|
||||||
count(getWordSets(array_fill(0, 4, 'a'), 0))
|
|
||||||
);
|
|
||||||
|
|
||||||
|
|
||||||
$this->assertEquals(
|
|
||||||
41226,
|
|
||||||
count(getWordSets(array_fill(0, 18, 'a'), 0))
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public function testCreatePointsAroundCenter()
|
public function testCreatePointsAroundCenter()
|
||||||
{
|
{
|
||||||
// you might say we're creating a circle
|
// you might say we're creating a circle
|
||||||
|
|||||||
87
test/php/Nominatim/PhraseTest.php
Normal file
87
test/php/Nominatim/PhraseTest.php
Normal file
@@ -0,0 +1,87 @@
|
|||||||
|
<?php
|
||||||
|
|
||||||
|
namespace Nominatim;
|
||||||
|
|
||||||
|
require_once '../../lib/Phrase.php';
|
||||||
|
|
||||||
|
class PhraseTest extends \PHPUnit_Framework_TestCase
{
    /**
     * Render word sets as a compact string for comparison.
     *
     * The terms of a set are joined with '|', each set is wrapped in
     * parentheses and the sets are separated by commas, so that
     * array(array('a', 'b'), array('c', 'd')) becomes '(a|b),(c|d)'.
     */
    private function serializeSets($aSets)
    {
        $aRendered = array_map(
            function ($aTerms) {
                return '('.implode('|', $aTerms).')';
            },
            $aSets
        );

        return implode(',', $aRendered);
    }


    // An empty phrase has exactly one word set holding one empty term.
    public function testEmptyPhrase()
    {
        $oEmpty = new Phrase('', '');
        $aExpected = array(array(''));

        $this->assertEquals($aExpected, $oEmpty->getWordSets());
    }


    // A single word cannot be split any further.
    public function testSingleWordPhrase()
    {
        $oSingle = new Phrase('a', '');
        $sActual = $this->serializeSets($oSingle->getWordSets());

        $this->assertEquals('(a)', $sActual);
    }


    // Phrases of two to four words yield every forward segmentation.
    public function testMultiWordPhrase()
    {
        $aCases = array(
                   'a b' => '(a b),(a|b)',
                   'a b c' => '(a b c),(a|b c),(a|b|c),(a b|c)',
                   'a b c d' => '(a b c d),(a|b c d),(a|b|c d),(a|b|c|d),(a|b c|d),(a b|c d),(a b|c|d),(a b c|d)'
                  );

        foreach ($aCases as $sInput => $sExpected) {
            $oCurrent = new Phrase($sInput, '');
            $this->assertEquals(
                $sExpected,
                $this->serializeSets($oCurrent->getWordSets())
            );
        }
    }


    // After inversion the terms are cut off from the end of the phrase.
    public function testInverseWordSets()
    {
        $oInverted = new Phrase('a b c', '');
        $oInverted->invertWordSets();
        $sActual = $this->serializeSets($oInverted->getWordSets());

        $this->assertEquals('(a b c),(c|a b),(c|b|a),(b c|a)', $sActual);
    }


    // The number of word sets is limited for long phrases, in both the
    // forward and the inverted direction.
    public function testMaxDepth()
    {
        // number of words => expected number of word sets
        $aExpectedCounts = array(
                            4 => 8,
                            18 => 41226
                           );

        foreach ($aExpectedCounts as $iWords => $iSets) {
            $oLong = new Phrase(implode(' ', array_fill(0, $iWords, 'a')), '');
            $this->assertEquals($iSets, count($oLong->getWordSets()));
            $oLong->invertWordSets();
            $this->assertEquals($iSets, count($oLong->getWordSets()));
        }
    }
}
|
||||||
Reference in New Issue
Block a user