From e3fb706c65505ce3ad79fe6a2d94c11cc77bf67a Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Thu, 1 Jun 2017 19:34:16 +0200 Subject: [PATCH 1/3] add normalized version of special search terms on import Requires the PHP bindings for libicu, so add that as a requirement. --- docs/Installation.md | 1 + sql/functions.sql | 12 ++++++------ utils/specialphrases.php | 8 +++++--- vagrant/install-on-centos-7.sh | 2 +- vagrant/install-on-travis-ci.sh | 2 +- vagrant/install-on-ubuntu-16.sh | 2 +- 6 files changed, 15 insertions(+), 12 deletions(-) diff --git a/docs/Installation.md b/docs/Installation.md index 41f76df1..88f32ada 100644 --- a/docs/Installation.md +++ b/docs/Installation.md @@ -39,6 +39,7 @@ For running Nominatim: * [PostGIS](http://postgis.refractions.net) (2.0 or later) * [PHP](http://php.net) (5.4 or later) * PHP-pgsql + * PHP-intl (bundled with PHP) * [PEAR::DB](http://pear.php.net/package/DB) * a webserver (apache or nginx are recommended) diff --git a/sql/functions.sql b/sql/functions.sql index 6cc42803..da496a10 100644 --- a/sql/functions.sql +++ b/sql/functions.sql @@ -101,7 +101,7 @@ END; $$ LANGUAGE plpgsql; -CREATE OR REPLACE FUNCTION getorcreate_amenity(lookup_word TEXT, lookup_class text, lookup_type text) +CREATE OR REPLACE FUNCTION getorcreate_amenity(lookup_word TEXT, normalized_word TEXT, lookup_class text, lookup_type text) RETURNS INTEGER AS $$ DECLARE @@ -109,17 +109,17 @@ DECLARE return_word_id INTEGER; BEGIN lookup_token := ' '||trim(lookup_word); - SELECT min(word_id) FROM word WHERE word_token = lookup_token and class=lookup_class and type = lookup_type into return_word_id; + SELECT min(word_id) FROM word WHERE word_token = lookup_token and word=normalized_word and class=lookup_class and type = lookup_type into return_word_id; IF return_word_id IS NULL THEN return_word_id := nextval('seq_word'); - INSERT INTO word VALUES (return_word_id, lookup_token, null, lookup_class, lookup_type, null, 0); + INSERT INTO word VALUES 
(return_word_id, lookup_token, normalized_word, lookup_class, lookup_type, null, 0); END IF; RETURN return_word_id; END; $$ LANGUAGE plpgsql; -CREATE OR REPLACE FUNCTION getorcreate_amenityoperator(lookup_word TEXT, lookup_class text, lookup_type text, op text) +CREATE OR REPLACE FUNCTION getorcreate_amenityoperator(lookup_word TEXT, normalized_word TEXT, lookup_class text, lookup_type text, op text) RETURNS INTEGER AS $$ DECLARE @@ -127,10 +127,10 @@ DECLARE return_word_id INTEGER; BEGIN lookup_token := ' '||trim(lookup_word); - SELECT min(word_id) FROM word WHERE word_token = lookup_token and class=lookup_class and type = lookup_type and operator = op into return_word_id; + SELECT min(word_id) FROM word WHERE word_token = lookup_token and word=normalized_word and class=lookup_class and type = lookup_type and operator = op into return_word_id; IF return_word_id IS NULL THEN return_word_id := nextval('seq_word'); - INSERT INTO word VALUES (return_word_id, lookup_token, null, lookup_class, lookup_type, null, 0, op); + INSERT INTO word VALUES (return_word_id, lookup_token, normalized_word, lookup_class, lookup_type, null, 0, op); END IF; RETURN return_word_id; END; diff --git a/utils/specialphrases.php b/utils/specialphrases.php index 50522fc2..15616976 100755 --- a/utils/specialphrases.php +++ b/utils/specialphrases.php @@ -19,6 +19,7 @@ getCmdOpt($_SERVER['argv'], $aCMDOptions, $aCMDResult, true, true); include(CONST_InstallPath.'/settings/phrase_settings.php'); if ($aCMDResult['wiki-import']) { + $oNormalizer = Transliterator::createFromRules(":: NFD (); [:Nonspacing Mark:] >; :: lower (); [[:Punctuation:][:Space:]]+ > ' '; :: NFC ();"); $aPairs = array(); $sLanguageIn = CONST_Languages ? 
CONST_Languages : @@ -31,6 +32,7 @@ if ($aCMDResult['wiki-import']) { if (preg_match_all('#\\| ([^|]+) \\|\\| ([^|]+) \\|\\| ([^|]+) \\|\\| ([^|]+) \\|\\| ([\\-YN])#', $sWikiPageXML, $aMatches, PREG_SET_ORDER)) { foreach ($aMatches as $aMatch) { $sLabel = trim($aMatch[1]); + $sTrans = pg_escape_string($oNormalizer->transliterate($sLabel)); $sClass = trim($aMatch[2]); $sType = trim($aMatch[3]); // hack around a bug where building=yes was imported with @@ -57,13 +59,13 @@ if ($aCMDResult['wiki-import']) { switch (trim($aMatch[4])) { case 'near': - echo "select getorcreate_amenityoperator(make_standard_name('".pg_escape_string($sLabel)."'), '$sClass', '$sType', 'near');\n"; + echo "select getorcreate_amenityoperator(make_standard_name('".pg_escape_string($sLabel)."'), '$sTrans', '$sClass', '$sType', 'near');\n"; break; case 'in': - echo "select getorcreate_amenityoperator(make_standard_name('".pg_escape_string($sLabel)."'), '$sClass', '$sType', 'in');\n"; + echo "select getorcreate_amenityoperator(make_standard_name('".pg_escape_string($sLabel)."'), '$sTrans', '$sClass', '$sType', 'in');\n"; break; default: - echo "select getorcreate_amenity(make_standard_name('".pg_escape_string($sLabel)."'), '$sClass', '$sType');\n"; + echo "select getorcreate_amenity(make_standard_name('".pg_escape_string($sLabel)."'), '$sTrans', '$sClass', '$sType');\n"; break; } } diff --git a/vagrant/install-on-centos-7.sh b/vagrant/install-on-centos-7.sh index 8aeedcc6..8b283ef6 100755 --- a/vagrant/install-on-centos-7.sh +++ b/vagrant/install-on-centos-7.sh @@ -21,7 +21,7 @@ sudo yum install -y postgresql-server postgresql-contrib postgresql-devel postgis postgis-utils \ git cmake make gcc gcc-c++ libtool policycoreutils-python \ - php-pgsql php php-pear php-pear-DB libpqxx-devel proj-epsg \ + php-pgsql php php-pear php-pear-DB php-intl libpqxx-devel proj-epsg \ bzip2-devel proj-devel geos-devel libxml2-devel boost-devel expat-devel zlib-devel # If you want to run the test suite, you need to 
install the following diff --git a/vagrant/install-on-travis-ci.sh b/vagrant/install-on-travis-ci.sh index 44faa614..b2d9a326 100755 --- a/vagrant/install-on-travis-ci.sh +++ b/vagrant/install-on-travis-ci.sh @@ -16,7 +16,7 @@ sudo apt-get install -y -qq libboost-dev libboost-system-dev \ libboost-filesystem-dev libexpat1-dev zlib1g-dev libxml2-dev\ libbz2-dev libpq-dev libgeos-c1 libgeos++-dev libproj-dev \ postgresql-server-dev-9.6 postgresql-9.6-postgis-2.3 postgresql-contrib-9.6 \ - apache2 php5 php5-pgsql php-pear php-db + apache2 php5 php5-pgsql php-pear php-db php-intl sudo apt-get install -y -qq python3-dev python3-pip python3-psycopg2 phpunit php5-cgi diff --git a/vagrant/install-on-ubuntu-16.sh b/vagrant/install-on-ubuntu-16.sh index c347923f..11f80a3e 100755 --- a/vagrant/install-on-ubuntu-16.sh +++ b/vagrant/install-on-ubuntu-16.sh @@ -28,7 +28,7 @@ export DEBIAN_FRONTEND=noninteractive #DOCS: libbz2-dev libpq-dev libgeos-dev libgeos++-dev libproj-dev \ postgresql-server-dev-9.5 postgresql-9.5-postgis-2.2 postgresql-contrib-9.5 \ apache2 php php-pgsql libapache2-mod-php php-pear php-db \ - git + php-intl git # If you want to run the test suite, you need to install the following # additional packages: From 54393addd38726e4f02643591e2579b5da7085fd Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Thu, 1 Jun 2017 21:40:23 +0200 Subject: [PATCH 2/3] disregard special phrases that do not match fully Compare the normalized terms imported with the special terms script with the normalized version of the query string. Disregard them if they cannot be found. This avoids a significant number of mismatches due to transliteration issues. The match will only be done when a normalized word has been set making this change backwards compatible with older databases. 
--- lib/Geocode.php | 29 +++++++++++++++++++++-------- settings/defaults.php | 4 ++++ utils/specialphrases.php | 8 ++++++-- 3 files changed, 31 insertions(+), 10 deletions(-) diff --git a/lib/Geocode.php b/lib/Geocode.php index ec8eb348..17aaf826 100644 --- a/lib/Geocode.php +++ b/lib/Geocode.php @@ -653,7 +653,7 @@ class Geocode return $aSearchResults; } - public function getGroupedSearches($aSearches, $aPhraseTypes, $aPhrases, $aValidTokens, $aWordFrequencyScores, $bStructuredPhrases) + public function getGroupedSearches($aSearches, $aPhraseTypes, $aPhrases, $aValidTokens, $aWordFrequencyScores, $bStructuredPhrases, $sNormQuery) { /* Calculate all searches using aValidTokens i.e. @@ -752,13 +752,19 @@ class Geocode */ } } elseif ($sPhraseType == '' && $aSearchTerm['class'] !== '' && $aSearchTerm['class'] !== null) { - if ($aSearch['sClass'] === '') { - $aSearch['sOperator'] = $aSearchTerm['operator']; + // require a normalized exact match of the term + // if we have the normalized version of the query + // available + if ($aSearch['sClass'] === '' + && ($sNormQuery === null || !($aSearchTerm['word'] && strpos($sNormQuery, $aSearchTerm['word']) === false))) { $aSearch['sClass'] = $aSearchTerm['class']; $aSearch['sType'] = $aSearchTerm['type']; - if (sizeof($aSearch['aName'])) $aSearch['sOperator'] = 'name'; - else $aSearch['sOperator'] = 'near'; // near = in for the moment - if (strlen($aSearchTerm['operator']) == 0) $aSearch['iSearchRank'] += 1; + if ($aSearchTerm['operator'] == '') { + $aSearch['sOperator'] = sizeof($aSearch['aName']) ? 
'name' : 'near'; + $aSearch['iSearchRank'] += 2; + } else { + $aSearch['sOperator'] = 'near'; // near = in for the moment + } if ($aSearch['iSearchRank'] < $this->iMaxRank) $aNewWordsetSearches[] = $aSearch; } @@ -913,6 +919,13 @@ class Geocode { if (!$this->sQuery && !$this->aStructuredQuery) return array(); + $oNormalizer = \Transliterator::createFromRules(CONST_Term_Normalization_Rules); + if ($oNormalizer !== null) { + $sNormQuery = $oNormalizer->transliterate($this->sQuery); + } else { + $sNormQuery = null; + } + $sLanguagePrefArraySQL = "ARRAY[".join(',', array_map("getDBQuoted", $this->aLangPrefOrder))."]"; $sCountryCodesSQL = false; if ($this->aCountryCodes) { @@ -1139,7 +1152,7 @@ class Geocode // array with: placeid => -1 | tiger-housenumber $aResultPlaceIDs = array(); - $aGroupedSearches = $this->getGroupedSearches($aSearches, $aPhraseTypes, $aPhrases, $aValidTokens, $aWordFrequencyScores, $bStructuredPhrases); + $aGroupedSearches = $this->getGroupedSearches($aSearches, $aPhraseTypes, $aPhrases, $aValidTokens, $aWordFrequencyScores, $bStructuredPhrases, $sNormQuery); if ($this->bReverseInPlan) { // Reverse phrase array and also reverse the order of the wordsets in @@ -1151,7 +1164,7 @@ class Geocode $aFinalPhrase = end($aPhrases); $aPhrases[sizeof($aPhrases)-1]['wordsets'] = getInverseWordSets($aFinalPhrase['words'], 0); } - $aReverseGroupedSearches = $this->getGroupedSearches($aSearches, null, $aPhrases, $aValidTokens, $aWordFrequencyScores, false); + $aReverseGroupedSearches = $this->getGroupedSearches($aSearches, null, $aPhrases, $aValidTokens, $aWordFrequencyScores, false, $sNormQuery); foreach ($aGroupedSearches as $aSearches) { foreach ($aSearches as $aSearch) { diff --git a/settings/defaults.php b/settings/defaults.php index 16711542..9f694c89 100644 --- a/settings/defaults.php +++ b/settings/defaults.php @@ -17,6 +17,10 @@ if (isset($_GET['debug']) && $_GET['debug']) @define('CONST_Debug', true); // codes, to restrict import to a subset of 
languages. // Currently only affects the import of country names and special phrases. @define('CONST_Languages', false); +// Rules for normalizing terms before they are compared. +// The default is to remove accents and punctuation and to lower-case the +// term. Spaces are kept but collapsed to one standard space. +@define('CONST_Term_Normalization_Rules', ":: NFD (); [:Nonspacing Mark:] >; :: lower (); [[:Punctuation:][:Space:]]+ > ' '; :: NFC ();"); // Set to false to avoid importing extra postcodes for the US. @define('CONST_Use_Extra_US_Postcodes', true); diff --git a/utils/specialphrases.php b/utils/specialphrases.php index 15616976..1a4a51d7 100755 --- a/utils/specialphrases.php +++ b/utils/specialphrases.php @@ -19,7 +19,7 @@ getCmdOpt($_SERVER['argv'], $aCMDOptions, $aCMDResult, true, true); include(CONST_InstallPath.'/settings/phrase_settings.php'); if ($aCMDResult['wiki-import']) { - $oNormalizer = Transliterator::createFromRules(":: NFD (); [:Nonspacing Mark:] >; :: lower (); [[:Punctuation:][:Space:]]+ > ' '; :: NFC ();"); + $oNormalizer = Transliterator::createFromRules(CONST_Term_Normalization_Rules); $aPairs = array(); $sLanguageIn = CONST_Languages ? 
CONST_Languages : @@ -32,7 +32,11 @@ if ($aCMDResult['wiki-import']) { if (preg_match_all('#\\| ([^|]+) \\|\\| ([^|]+) \\|\\| ([^|]+) \\|\\| ([^|]+) \\|\\| ([\\-YN])#', $sWikiPageXML, $aMatches, PREG_SET_ORDER)) { foreach ($aMatches as $aMatch) { $sLabel = trim($aMatch[1]); - $sTrans = pg_escape_string($oNormalizer->transliterate($sLabel)); + if ($oNormalizer !== null) { + $sTrans = pg_escape_string($oNormalizer->transliterate($sLabel)); + } else { + $sTrans = null; + } $sClass = trim($aMatch[2]); $sType = trim($aMatch[3]); // hack around a bug where building=yes was imported with From af81c6aa9496e9dbdcc3fb96c9f78c73dad934bc Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Sun, 4 Jun 2017 11:35:11 +0200 Subject: [PATCH 3/3] add special terms import to travis testing and fixup libs --- .travis.yml | 1 + Vagrantfile | 11 ++++++++++- vagrant/install-on-travis-ci.sh | 2 +- 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index 68d5be4e..5efc9f08 100644 --- a/.travis.yml +++ b/.travis.yml @@ -19,6 +19,7 @@ script: - cd $TRAVIS_BUILD_DIR/build - if [[ $TEST_SUITE == "monaco" ]]; then wget --no-verbose --output-document=../data/monaco.osm.pbf http://download.geofabrik.de/europe/monaco-latest.osm.pbf; fi - if [[ $TEST_SUITE == "monaco" ]]; then ./utils/setup.php --osm-file ../data/monaco.osm.pbf --osm2pgsql-cache 1000 --all 2>&1 | grep -v 'ETA (seconds)'; fi + - if [[ $TEST_SUITE == "monaco" ]]; then ./utils/specialphrases.php --wiki-import | psql -d test_api_nominatim >/dev/null; fi - cd $TRAVIS_BUILD_DIR/test/php - if [[ $TEST_SUITE == "tests" ]]; then phpunit ./ ; fi - if [[ $TEST_SUITE == "tests" ]]; then phpcs --report-width=120 */**.php ; fi diff --git a/Vagrantfile b/Vagrantfile index 15d66e9e..b9d618e2 100644 --- a/Vagrantfile +++ b/Vagrantfile @@ -23,7 +23,16 @@ Vagrant.configure("2") do |config| end end - config.vm.define "centos" do |sub| + config.vm.define "travis" do |sub| + sub.vm.box = "bento/ubuntu-14.04" + 
sub.vm.provision :shell do |s| + s.path = "vagrant/install-on-travis-ci.sh" + s.privileged = false + s.args = [checkout] + end + end + + config.vm.define "centos" do |sub| sub.vm.box = "bento/centos-7.2" sub.vm.provision :shell do |s| s.path = "vagrant/install-on-centos-7.sh" diff --git a/vagrant/install-on-travis-ci.sh b/vagrant/install-on-travis-ci.sh index b2d9a326..ec0a92da 100755 --- a/vagrant/install-on-travis-ci.sh +++ b/vagrant/install-on-travis-ci.sh @@ -16,7 +16,7 @@ sudo apt-get install -y -qq libboost-dev libboost-system-dev \ libboost-filesystem-dev libexpat1-dev zlib1g-dev libxml2-dev\ libbz2-dev libpq-dev libgeos-c1 libgeos++-dev libproj-dev \ postgresql-server-dev-9.6 postgresql-9.6-postgis-2.3 postgresql-contrib-9.6 \ - apache2 php5 php5-pgsql php-pear php-db php-intl + apache2 php5 php5-pgsql php-pear php-db php5-intl sudo apt-get install -y -qq python3-dev python3-pip python3-psycopg2 phpunit php5-cgi