convert phrase array to class

This commit is contained in:
Sarah Hoffmann
2017-10-12 22:37:44 +02:00
parent 7ea1ef3feb
commit 023f94b066
3 changed files with 112 additions and 78 deletions

View File

@@ -3,6 +3,7 @@
namespace Nominatim; namespace Nominatim;
require_once(CONST_BasePath.'/lib/PlaceLookup.php'); require_once(CONST_BasePath.'/lib/PlaceLookup.php');
require_once(CONST_BasePath.'/lib/Phrase.php');
require_once(CONST_BasePath.'/lib/ReverseGeocode.php'); require_once(CONST_BasePath.'/lib/ReverseGeocode.php');
require_once(CONST_BasePath.'/lib/SearchDescription.php'); require_once(CONST_BasePath.'/lib/SearchDescription.php');
require_once(CONST_BasePath.'/lib/SearchContext.php'); require_once(CONST_BasePath.'/lib/SearchContext.php');
@@ -668,7 +669,7 @@ class Geocode
return $aSearchResults; return $aSearchResults;
} }
public function getGroupedSearches($aSearches, $aPhraseTypes, $aPhrases, $aValidTokens, $aWordFrequencyScores, $bStructuredPhrases, $sNormQuery) public function getGroupedSearches($aSearches, $aPhrases, $aValidTokens, $aWordFrequencyScores, $bIsStructured, $sNormQuery)
{ {
/* /*
Calculate all searches using aValidTokens i.e. Calculate all searches using aValidTokens i.e.
@@ -683,15 +684,11 @@ class Geocode
*/ */
$iGlobalRank = 0; $iGlobalRank = 0;
foreach ($aPhrases as $iPhrase => $aPhrase) { foreach ($aPhrases as $iPhrase => $oPhrase) {
$aNewPhraseSearches = array(); $aNewPhraseSearches = array();
if ($bStructuredPhrases) { $sPhraseType = $bIsStructured ? $oPhrase->getPhraseType() : '';
$sPhraseType = $aPhraseTypes[$iPhrase];
} else {
$sPhraseType = '';
}
foreach ($aPhrase['wordsets'] as $iWordSet => $aWordset) { foreach ($oPhrase->getWordSets() as $iWordSet => $aWordset) {
// Too many permutations - too expensive // Too many permutations - too expensive
if ($iWordSet > 120) break; if ($iWordSet > 120) break;
@@ -746,7 +743,7 @@ class Geocode
foreach ($aValidTokens[$sToken] as $aSearchTerm) { foreach ($aValidTokens[$sToken] as $aSearchTerm) {
$aNewSearches = $oCurrentSearch->extendWithPartialTerm( $aNewSearches = $oCurrentSearch->extendWithPartialTerm(
$aSearchTerm, $aSearchTerm,
$bStructuredPhrases, $bIsStructured,
$iPhrase, $iPhrase,
$aWordFrequencyScores, $aWordFrequencyScores,
isset($aValidTokens[' '.$sToken]) ? $aValidTokens[' '.$sToken] : array() isset($aValidTokens[' '.$sToken]) ? $aValidTokens[' '.$sToken] : array()
@@ -955,10 +952,10 @@ class Geocode
// Split query into phrases // Split query into phrases
// Commas are used to reduce the search space by indicating where phrases split // Commas are used to reduce the search space by indicating where phrases split
if ($this->aStructuredQuery) { if ($this->aStructuredQuery) {
$aPhrases = $this->aStructuredQuery; $aInPhrases = $this->aStructuredQuery;
$bStructuredPhrases = true; $bStructuredPhrases = true;
} else { } else {
$aPhrases = explode(',', $sQuery); $aInPhrases = explode(',', $sQuery);
$bStructuredPhrases = false; $bStructuredPhrases = false;
} }
@@ -967,25 +964,19 @@ class Geocode
// Get all 'sets' of words // Get all 'sets' of words
// Generate a complete list of all // Generate a complete list of all
$aTokens = array(); $aTokens = array();
foreach ($aPhrases as $iPhrase => $sPhrase) { $aPhrases = array();
$aPhrase = chksql( foreach ($aInPhrases as $iPhrase => $sPhrase) {
$this->oDB->getRow("SELECT make_standard_name('".pg_escape_string($sPhrase)."') as string"), $sPhrase = chksql(
$this->oDB->getOne('SELECT make_standard_name('.getDBQuoted($sPhrase).')'),
"Cannot normalize query string (is it a UTF-8 string?)" "Cannot normalize query string (is it a UTF-8 string?)"
); );
if (trim($aPhrase['string'])) { if (trim($sPhrase)) {
$aPhrases[$iPhrase] = $aPhrase; $oPhrase = new Phrase($sPhrase, is_string($iPhrase) ? $iPhrase : '');
$aPhrases[$iPhrase]['words'] = explode(' ', $aPhrases[$iPhrase]['string']); $oPhrase->addTokens($aTokens);
$aPhrases[$iPhrase]['wordsets'] = getWordSets($aPhrases[$iPhrase]['words'], 0); $aPhrases[] = $oPhrase;
$aTokens = array_merge($aTokens, getTokensFromSets($aPhrases[$iPhrase]['wordsets']));
} else {
unset($aPhrases[$iPhrase]);
} }
} }
// Reindex phrases - we make assumptions later on that they are numerically keyed in order
$aPhraseTypes = array_keys($aPhrases);
$aPhrases = array_values($aPhrases);
if (sizeof($aTokens)) { if (sizeof($aTokens)) {
// Check which tokens we have, get the ID numbers // Check which tokens we have, get the ID numbers
$sSQL = 'SELECT word_id, word_token, word, class, type, country_code, operator, search_name_count'; $sSQL = 'SELECT word_id, word_token, word, class, type, country_code, operator, search_name_count';
@@ -1046,19 +1037,18 @@ class Geocode
// Any words that have failed completely? // Any words that have failed completely?
// TODO: suggestions // TODO: suggestions
$aGroupedSearches = $this->getGroupedSearches($aSearches, $aPhraseTypes, $aPhrases, $aValidTokens, $aWordFrequencyScores, $bStructuredPhrases, $sNormQuery); $aGroupedSearches = $this->getGroupedSearches($aSearches, $aPhrases, $aValidTokens, $aWordFrequencyScores, $bStructuredPhrases, $sNormQuery);
if ($this->bReverseInPlan) { if ($this->bReverseInPlan) {
// Reverse phrase array and also reverse the order of the wordsets in // Reverse phrase array and also reverse the order of the wordsets in
// the first and final phrase. Don't bother about phrases in the middle // the first and final phrase. Don't bother about phrases in the middle
// because order in the address doesn't matter. // because order in the address doesn't matter.
$aPhrases = array_reverse($aPhrases); $aPhrases = array_reverse($aPhrases);
$aPhrases[0]['wordsets'] = getInverseWordSets($aPhrases[0]['words'], 0); $aPhrases[0]->invertWordSets();
if (sizeof($aPhrases) > 1) { if (sizeof($aPhrases) > 1) {
$aFinalPhrase = end($aPhrases); $aPhrases[sizeof($aPhrases)-1]->invertWordSets();
$aPhrases[sizeof($aPhrases)-1]['wordsets'] = getInverseWordSets($aFinalPhrase['words'], 0);
} }
$aReverseGroupedSearches = $this->getGroupedSearches($aSearches, null, $aPhrases, $aValidTokens, $aWordFrequencyScores, false, $sNormQuery); $aReverseGroupedSearches = $this->getGroupedSearches($aSearches, $aPhrases, $aValidTokens, $aWordFrequencyScores, false, $sNormQuery);
foreach ($aGroupedSearches as $aSearches) { foreach ($aGroupedSearches as $aSearches) {
foreach ($aSearches as $aSearch) { foreach ($aSearches as $aSearch) {

92
lib/Phrase.php Normal file
View File

@@ -0,0 +1,92 @@
<?php
namespace Nominatim;
/**
 * Segment of a query string.
 *
 * The parts of a query string are usually separated by commas.
 */
class Phrase
{
    // Maximum recursion depth when computing word sets.
    const MAX_DEPTH = 7;

    // Complete phrase as a string.
    private $sPhrase;
    // Element type for structured searches.
    private $sPhraseType;
    // Space-separated words of the phrase.
    private $aWords;
    // Possible segmentations of the phrase.
    private $aWordSets;

    /**
     * Create a phrase and precompute its word sets.
     *
     * @param string $sPhrase     Phrase text; surrounding whitespace is trimmed
     *                            before it is split on single spaces.
     * @param string $sPhraseType Element type of the phrase for structured searches.
     */
    public function __construct($sPhrase, $sPhraseType)
    {
        $this->sPhrase = trim($sPhrase);
        $this->sPhraseType = $sPhraseType;
        $this->aWords = explode(' ', $this->sPhrase);
        $this->aWordSets = $this->createWordSets($this->aWords, 0);
    }

    /**
     * Return the element type that was passed to the constructor.
     *
     * @return string
     */
    public function getPhraseType()
    {
        return $this->sPhraseType;
    }

    /**
     * Return the current list of word sets.
     *
     * Each word set is a list of tokens; each token is one or more
     * words of the phrase joined by single spaces.
     *
     * @return array[]
     */
    public function getWordSets()
    {
        return $this->aWordSets;
    }

    /**
     * Add every token of every word set to the given token list.
     *
     * Each token is stored twice, keyed by its own value: once verbatim
     * and once prefixed with a single space.
     *
     * @param array $aTokens Token list, modified in place.
     *
     * @return void
     */
    public function addTokens(&$aTokens)
    {
        foreach ($this->aWordSets as $aSet) {
            foreach ($aSet as $sWord) {
                $aTokens[' '.$sWord] = ' '.$sWord;
                $aTokens[$sWord] = $sWord;
            }
        }
    }

    /**
     * Replace the word sets with the ones built from the end of the phrase.
     *
     * @return void
     */
    public function invertWordSets()
    {
        $this->aWordSets = $this->createInverseWordSets($this->aWords, 0);
    }

    /**
     * Build the segmentations of a word list, growing the leading token
     * from the front.
     *
     * The first entry of the result is always the complete word list
     * joined into a single token. Splitting stops once $iDepth reaches
     * MAX_DEPTH.
     *
     * @param string[] $aWords Words to segment.
     * @param int      $iDepth Current recursion depth.
     *
     * @return array[] List of word sets.
     */
    private function createWordSets($aWords, $iDepth)
    {
        $aResult = array(array(join(' ', $aWords)));
        $sFirstToken = '';
        if ($iDepth < self::MAX_DEPTH) {
            while (sizeof($aWords) > 1) {
                $sWord = array_shift($aWords);
                // Compare against '' explicitly: the string '0' is falsy in
                // PHP, so a truthiness test would drop the separator after
                // a leading word '0'.
                $sFirstToken .= ($sFirstToken !== '' ? ' ' : '').$sWord;
                $aRest = $this->createWordSets($aWords, $iDepth + 1);
                foreach ($aRest as $aSet) {
                    $aResult[] = array_merge(array($sFirstToken), $aSet);
                }
            }
        }
        return $aResult;
    }

    /**
     * Build the segmentations of a word list, growing the leading token
     * from the back.
     *
     * Works like createWordSets() but takes words off the end of the list,
     * so each word set starts with the trailing words of the phrase.
     *
     * NOTE(review): this method is public while createWordSets() is private;
     * visibility is left unchanged here - confirm whether that is intended.
     *
     * @param string[] $aWords Words to segment.
     * @param int      $iDepth Current recursion depth.
     *
     * @return array[] List of word sets.
     */
    public function createInverseWordSets($aWords, $iDepth)
    {
        $aResult = array(array(join(' ', $aWords)));
        $sFirstToken = '';
        if ($iDepth < self::MAX_DEPTH) {
            while (sizeof($aWords) > 1) {
                $sWord = array_pop($aWords);
                // Explicit '' comparison so that a trailing word '0' still
                // gets a separator in front of it.
                $sFirstToken = $sWord.($sFirstToken !== '' ? ' ' : '').$sFirstToken;
                $aRest = $this->createInverseWordSets($aWords, $iDepth + 1);
                foreach ($aRest as $aSet) {
                    $aResult[] = array_merge(array($sFirstToken), $aSet);
                }
            }
        }
        return $aResult;
    }
}

View File

@@ -60,54 +60,6 @@ function byImportance($a, $b)
} }
/**
 * Build the segmentations of a word list, growing the leading token
 * from the front.
 *
 * The first entry of the result is always the complete word list joined
 * into a single token. Splitting stops once $iDepth reaches 7.
 *
 * @param string[] $aWords Words to segment.
 * @param int      $iDepth Current recursion depth.
 *
 * @return array[] List of word sets.
 */
function getWordSets($aWords, $iDepth)
{
    $aResult = array(array(join(' ', $aWords)));
    $sFirstToken = '';
    if ($iDepth < 7) {
        while (sizeof($aWords) > 1) {
            $sWord = array_shift($aWords);
            // Compare against '' explicitly: the string '0' is falsy in PHP,
            // so a truthiness test would drop the separator after a leading
            // word '0'.
            $sFirstToken .= ($sFirstToken !== '' ? ' ' : '').$sWord;
            $aRest = getWordSets($aWords, $iDepth + 1);
            foreach ($aRest as $aSet) {
                $aResult[] = array_merge(array($sFirstToken), $aSet);
            }
        }
    }
    return $aResult;
}
/**
 * Build the segmentations of a word list, growing the leading token
 * from the back.
 *
 * Words are taken off the end of the list, so each word set starts with
 * the trailing words. The first entry of the result is always the complete
 * word list joined into a single token. Splitting stops once $iDepth
 * reaches 8.
 *
 * @param string[] $aWords Words to segment.
 * @param int      $iDepth Current recursion depth.
 *
 * @return array[] List of word sets.
 */
function getInverseWordSets($aWords, $iDepth)
{
    $aResult = array(array(join(' ', $aWords)));
    $sFirstToken = '';
    if ($iDepth < 8) {
        while (sizeof($aWords) > 1) {
            $sWord = array_pop($aWords);
            // Explicit '' comparison so that a trailing word '0' still gets
            // a separator in front of it ('0' is falsy in PHP).
            $sFirstToken = $sWord.($sFirstToken !== '' ? ' ' : '').$sFirstToken;
            $aRest = getInverseWordSets($aWords, $iDepth + 1);
            foreach ($aRest as $aSet) {
                $aResult[] = array_merge(array($sFirstToken), $aSet);
            }
        }
    }
    return $aResult;
}
/**
 * Collect the distinct tokens found in a list of word sets.
 *
 * Every token is recorded twice, keyed by its own value: first with a
 * single leading space, then verbatim.
 *
 * @param array[] $aSets List of word sets (lists of token strings).
 *
 * @return array Map of token => token.
 */
function getTokensFromSets($aSets)
{
    $aCollected = array();
    foreach ($aSets as $aTokenList) {
        foreach ($aTokenList as $sToken) {
            $sSpacedToken = ' '.$sToken;
            $aCollected[$sSpacedToken] = $sSpacedToken;
            $aCollected[$sToken] = $sToken;
        }
    }

    return $aCollected;
}
function getClassTypes() function getClassTypes()
{ {
return array( return array(