diff --git a/lib/Geocode.php b/lib/Geocode.php index ec8eb348..17aaf826 100644 --- a/lib/Geocode.php +++ b/lib/Geocode.php @@ -653,7 +653,7 @@ class Geocode return $aSearchResults; } - public function getGroupedSearches($aSearches, $aPhraseTypes, $aPhrases, $aValidTokens, $aWordFrequencyScores, $bStructuredPhrases) + public function getGroupedSearches($aSearches, $aPhraseTypes, $aPhrases, $aValidTokens, $aWordFrequencyScores, $bStructuredPhrases, $sNormQuery) { /* Calculate all searches using aValidTokens i.e. @@ -752,13 +752,19 @@ class Geocode */ } } elseif ($sPhraseType == '' && $aSearchTerm['class'] !== '' && $aSearchTerm['class'] !== null) { - if ($aSearch['sClass'] === '') { - $aSearch['sOperator'] = $aSearchTerm['operator']; + // require a normalized exact match of the term + // if we have the normalizer version of the query + // available + if ($aSearch['sClass'] === '' + && ($sNormQuery === null || !($aSearchTerm['word'] && strpos($sNormQuery, $aSearchTerm['word']) === false))) { $aSearch['sClass'] = $aSearchTerm['class']; $aSearch['sType'] = $aSearchTerm['type']; - if (sizeof($aSearch['aName'])) $aSearch['sOperator'] = 'name'; - else $aSearch['sOperator'] = 'near'; // near = in for the moment - if (strlen($aSearchTerm['operator']) == 0) $aSearch['iSearchRank'] += 1; + if ($aSearchTerm['operator'] == '') { + $aSearch['sOperator'] = sizeof($aSearch['aName']) ? 
'name' : 'near'; + $aSearch['iSearchRank'] += 2; + } else { + $aSearch['sOperator'] = 'near'; // near = in for the moment + } if ($aSearch['iSearchRank'] < $this->iMaxRank) $aNewWordsetSearches[] = $aSearch; } @@ -913,6 +919,13 @@ class Geocode { if (!$this->sQuery && !$this->aStructuredQuery) return array(); + $oNormalizer = \Transliterator::createFromRules(CONST_Term_Normalization_Rules); + if ($oNormalizer !== null) { + $sNormQuery = $oNormalizer->transliterate($this->sQuery); + } else { + $sNormQuery = null; + } + $sLanguagePrefArraySQL = "ARRAY[".join(',', array_map("getDBQuoted", $this->aLangPrefOrder))."]"; $sCountryCodesSQL = false; if ($this->aCountryCodes) { @@ -1139,7 +1152,7 @@ class Geocode // array with: placeid => -1 | tiger-housenumber $aResultPlaceIDs = array(); - $aGroupedSearches = $this->getGroupedSearches($aSearches, $aPhraseTypes, $aPhrases, $aValidTokens, $aWordFrequencyScores, $bStructuredPhrases); + $aGroupedSearches = $this->getGroupedSearches($aSearches, $aPhraseTypes, $aPhrases, $aValidTokens, $aWordFrequencyScores, $bStructuredPhrases, $sNormQuery); if ($this->bReverseInPlan) { // Reverse phrase array and also reverse the order of the wordsets in @@ -1151,7 +1164,7 @@ class Geocode $aFinalPhrase = end($aPhrases); $aPhrases[sizeof($aPhrases)-1]['wordsets'] = getInverseWordSets($aFinalPhrase['words'], 0); } - $aReverseGroupedSearches = $this->getGroupedSearches($aSearches, null, $aPhrases, $aValidTokens, $aWordFrequencyScores, false); + $aReverseGroupedSearches = $this->getGroupedSearches($aSearches, null, $aPhrases, $aValidTokens, $aWordFrequencyScores, false, $sNormQuery); foreach ($aGroupedSearches as $aSearches) { foreach ($aSearches as $aSearch) { diff --git a/settings/defaults.php b/settings/defaults.php index 16711542..9f694c89 100644 --- a/settings/defaults.php +++ b/settings/defaults.php @@ -17,6 +17,10 @@ if (isset($_GET['debug']) && $_GET['debug']) @define('CONST_Debug', true); // codes, to restrict import to a subset of 
languages. // Currently only affects the import of country names and special phrases. @define('CONST_Languages', false); +// Rules for normalizing terms before comparing them. +// The default is to remove accents and punctuation and to lower-case the +// term. Spaces are kept but collapsed to one standard space. +@define('CONST_Term_Normalization_Rules', ":: NFD (); [:Nonspacing Mark:] >; :: lower (); [[:Punctuation:][:Space:]]+ > ' '; :: NFC ();"); // Set to false to avoid importing extra postcodes for the US. @define('CONST_Use_Extra_US_Postcodes', true); diff --git a/utils/specialphrases.php b/utils/specialphrases.php index 15616976..1a4a51d7 100755 --- a/utils/specialphrases.php +++ b/utils/specialphrases.php @@ -19,7 +19,7 @@ getCmdOpt($_SERVER['argv'], $aCMDOptions, $aCMDResult, true, true); include(CONST_InstallPath.'/settings/phrase_settings.php'); if ($aCMDResult['wiki-import']) { - $oNormalizer = Transliterator::createFromRules(":: NFD (); [:Nonspacing Mark:] >; :: lower (); [[:Punctuation:][:Space:]]+ > ' '; :: NFC ();"); + $oNormalizer = Transliterator::createFromRules(CONST_Term_Normalization_Rules); $aPairs = array(); $sLanguageIn = CONST_Languages ? CONST_Languages : @@ -32,7 +32,11 @@ if ($aCMDResult['wiki-import']) { if (preg_match_all('#\\| ([^|]+) \\|\\| ([^|]+) \\|\\| ([^|]+) \\|\\| ([^|]+) \\|\\| ([\\-YN])#', $sWikiPageXML, $aMatches, PREG_SET_ORDER)) { foreach ($aMatches as $aMatch) { $sLabel = trim($aMatch[1]); - $sTrans = pg_escape_string($oNormalizer->transliterate($sLabel)); + if ($oNormalizer !== null) { + $sTrans = pg_escape_string($oNormalizer->transliterate($sLabel)); + } else { + $sTrans = null; + } $sClass = trim($aMatch[2]); $sType = trim($aMatch[3]); // hack around a bug where building=yes was imported with