Merge pull request #1245 from lonvia/address-levels-from-json

Make rank assignments configurable
This commit is contained in:
Sarah Hoffmann
2018-12-01 21:43:53 +01:00
committed by GitHub
11 changed files with 356 additions and 149 deletions

View File

@@ -22,7 +22,7 @@ script:
- if [[ $TEST_SUITE == "tests" ]]; then phpunit ./ ; fi
- cd $TRAVIS_BUILD_DIR/test/bdd
- # behave --format=progress3 api
- if [[ $TEST_SUITE == "tests" ]]; then behave --format=progress3 db ; fi
- if [[ $TEST_SUITE == "tests" ]]; then behave -DREMOVE_TEMPLATE=1 --format=progress3 db ; fi
- if [[ $TEST_SUITE == "tests" ]]; then behave --format=progress3 osm2pgsql ; fi
- cd $TRAVIS_BUILD_DIR/build
- if [[ $TEST_SUITE == "monaco" ]]; then wget --no-verbose --output-document=../data/monaco.osm.pbf http://download.geofabrik.de/europe/monaco-latest.osm.pbf; fi

89
docs/develop/Ranking.md Normal file
View File

@@ -0,0 +1,89 @@
# Place Ranking in Nominatim
Nominatim uses two metrics to rank a place: search rank and address rank.
Both can be assigned a value between 0 and 30. They serve slightly
different purposes, which are explained in this chapter.
## Search rank
The search rank describes the extent and importance of a place. It is used
when ranking search results. Simply put, if there are two results for a
search query which are otherwise equal, then the result with the _lower_
search rank will appear higher in the result list.
Search ranks are not so important these days because many well-known
places use the Wikipedia importance ranking instead.
## Address rank
The address rank describes where a place shows up in an address hierarchy.
Usually only administrative boundaries and place nodes and areas are
eligible to be part of an address. All other objects have an address rank
of 0.
Note that the search rank of a place plays a role in the address computation
as well. When collecting the places that should make up the address parts
then only places are taken into account that have a lower address rank than
the search rank of the base object.
## Rank configuration
Search and address ranks are assigned to a place when it is first imported
into the database. There are a few hard-coded rules for the assignment:
* postcodes follow special rules according to their length
* boundaries that are not areas and railway=rail are dropped completely
* the following are always search rank 30 and address rank 0:
* highway nodes
* landuse that is not an area
Other than that, the ranks can be freely assigned via the json file
defined with `CONST_Address_Level_Config` according to their type and
the country they are in.
The address level configuration must consist of an array of configuration
entries, each containing a tag definition and an optional country array:
```
[ {
"tags" : {
"place" : {
"county" : 12,
"city" : 16
},
"landuse" : {
"residential" : 22,
"" : 30
}
}
},
{
"countries" : [ "ca", "us" ],
"tags" : {
"boundary" : {
"administrative8" : 18,
"administrative9" : 20
},
"landuse" : {
"residential" : [22, 0]
}
}
}
]
```
The `countries` field contains a list of countries (as ISO 3166-1 alpha 2 code)
for which the definition applies. When the field is omitted, then the
definition is used as a fallback, when nothing more specific for a given
country exists.
`tags` contains the ranks for key/value pairs. The ranks can be either a
single number, in which case it is used as both search and address rank, or
a tuple of search and address rank (in that order). The value may be left empty.
Then the rank is used when no more specific value is found for the given
key.
Countries and key/value combinations may appear in multiple definitions. Just
make sure that each combination of country/key/value appears only once per
file. Otherwise the import will fail with a UNIQUE INDEX constraint
violation.

View File

@@ -20,6 +20,7 @@ pages:
- 'Troubleshooting' : 'admin/Faq.md'
- 'Developers Guide':
- 'Overview' : 'develop/overview.md'
- 'Place Ranking' : 'develop/Ranking.md'
- 'External Data Sources':
- 'Overview' : 'data-sources/overview.md'
- 'US Census (Tiger)': 'data-sources/US-Tiger.md'

View File

@@ -0,0 +1,98 @@
<?php
namespace Nominatim\Setup;
/**
 * Parses an address level description.
 *
 * The description is a JSON file holding a list of entries. Each entry has
 * a 'tags' map (key => value => rank or [search rank, address rank]) and an
 * optional 'countries' list. The parsed content can be written into a
 * database table with createTable().
 */
class AddressLevelParser
{
    /** Decoded content of the description file (list of entries). */
    private $aLevels;

    /**
     * Read and decode the description file.
     *
     * Problems are reported through fail(), a project helper defined outside
     * this file (presumably it aborts the script - confirm).
     *
     * @param string $sDescriptionFile Path of the JSON file to load.
     */
    public function __construct($sDescriptionFile)
    {
        $sJson = file_get_contents($sDescriptionFile);
        if ($sJson === false) {
            // Report the real cause instead of letting json_decode() turn
            // the read failure into a misleading syntax error.
            fail('Cannot read address level configuration file '.$sDescriptionFile);
        }

        $this->aLevels = json_decode($sJson, true);
        if (!$this->aLevels) {
            // A falsy result is either a decoding error or a valid but
            // empty document; json_last_error() tells the two apart.
            switch (json_last_error()) {
                case JSON_ERROR_NONE:
                    break;
                case JSON_ERROR_DEPTH:
                    fail('JSON error - Maximum stack depth exceeded');
                    break;
                case JSON_ERROR_STATE_MISMATCH:
                    fail('JSON error - Underflow or the modes mismatch');
                    break;
                case JSON_ERROR_CTRL_CHAR:
                    fail('JSON error - Unexpected control character found');
                    break;
                case JSON_ERROR_SYNTAX:
                    fail('JSON error - Syntax error, malformed JSON');
                    break;
                case JSON_ERROR_UTF8:
                    fail('JSON error - Malformed UTF-8 characters, possibly incorrectly encoded');
                    break;
                default:
                    fail('JSON error - Unknown error');
                    break;
            }
        }

        if (!is_array($this->aLevels)) {
            // Valid JSON that is a scalar or null cannot be iterated later.
            fail('JSON error - Address level configuration must be a list of entries');
        }
    }

    /**
     * Dump the description into a database table.
     *
     * @param object $oDB    Database connection to use.
     * @param string $sTable Name of table to create.
     *
     * @return null
     *
     * A new table is created. Any previously existing table is dropped.
     * The table has the following columns:
     * country_code, class, type, rank_search, rank_address.
     * When the description contains no rank definitions, the table is
     * created but left empty.
     */
    public function createTable($oDB, $sTable)
    {
        chksql($oDB->query('DROP TABLE IF EXISTS '.$sTable));

        $sSql = 'CREATE TABLE '.$sTable;
        $sSql .= '(country_code varchar(2), class TEXT, type TEXT,';
        $sSql .= ' rank_search SMALLINT, rank_address SMALLINT)';
        chksql($oDB->query($sSql));

        $sSql = 'CREATE UNIQUE INDEX ON '.$sTable.'(country_code, class, type)';
        chksql($oDB->query($sSql));

        $aRows = $this->buildValueRows();
        if (empty($aRows)) {
            // 'INSERT ... VALUES' without any row is not valid SQL.
            return;
        }

        chksql($oDB->query('INSERT INTO '.$sTable.' VALUES '.join(',', $aRows)));
    }

    /**
     * Convert the parsed description into SQL value tuples.
     *
     * @return string[] One '(country,class,type,rank_search,rank_address)'
     *                  tuple per country/key/value combination.
     */
    private function buildValueRows()
    {
        $aRows = array();

        foreach ($this->aLevels as $aLevel) {
            if (empty($aLevel['tags'])) {
                // Nothing to insert for an entry without tag definitions.
                continue;
            }

            // Keyed by country so that repeated codes yield a single row.
            $aCountries = array();
            if (isset($aLevel['countries'])) {
                foreach ($aLevel['countries'] as $sCountry) {
                    $aCountries[$sCountry] = getDBQuoted($sCountry);
                }
            } else {
                // No country restriction: the entry is the global fallback.
                $aCountries['NULL'] = 'NULL';
            }

            foreach ($aLevel['tags'] as $sKey => $aValues) {
                foreach ($aValues as $sValue => $mRanks) {
                    // Only the empty value means "any other value". Compare
                    // explicitly because a value of "0" is falsy in PHP.
                    $aFields = array(
                                getDBQuoted($sKey),
                                ((string) $sValue === '') ? 'NULL' : getDBQuoted($sValue)
                               );
                    if (is_array($mRanks)) {
                        // Tuple of search rank and address rank.
                        $aFields[] = (string) $mRanks[0];
                        $aFields[] = (string) $mRanks[1];
                    } else {
                        // A single number is used for both ranks.
                        $aFields[] = (string) $mRanks;
                        $aFields[] = (string) $mRanks;
                    }

                    $sTail = ','.join(',', $aFields).')';
                    foreach ($aCountries as $sQuotedCountry) {
                        $aRows[] = '('.$sQuotedCountry.$sTail;
                    }
                }
            }
        }

        return $aRows;
    }
}

View File

@@ -2,6 +2,8 @@
namespace Nominatim\Setup;
require_once(CONST_BasePath.'/lib/setup/AddressLevelParser.php');
class SetupFunctions
{
protected $iCacheMemory;
@@ -271,6 +273,9 @@ class SetupFunctions
if ($bReverseOnly) {
$this->pgExec('DROP TABLE search_name');
}
$oAlParser = new AddressLevelParser(CONST_Address_Level_Config);
$oAlParser->createTable($this->oDB, 'address_levels');
}
public function createPartitionTables()

View File

@@ -0,0 +1,85 @@
[
{ "tags" : {
"place" : {
"sea" : [2, 0],
"continent" : [2, 0],
"country" : [4, 4],
"state" : [8, 8],
"region" : [18, 0],
"county" : 12,
"city" : 16,
"island" : [17, 0],
"town" : [18, 16],
"village" : [19, 16],
"hamlet" : [19, 16],
"municipality" : [19, 16],
"district" : [19, 16],
"unincorporated_area" : [19, 16],
"borough" : [19, 16],
"suburb" : 20,
"croft" : 20,
"subdivision" : 20,
"isolated_dwelling" : 20,
"farm" : [20, 0],
"locality" : [20, 0],
"islet" : [20, 0],
"mountain_pass" : [20, 0],
"neighbourhood" : 22,
"houses" : [28, 0]
},
"boundary" : {
"administrative2" : 4,
"administrative3" : 6,
"administrative4" : 8,
"administrative5" : 10,
"administrative6" : 12,
"administrative7" : 14,
"administrative8" : 16,
"administrative9" : 18,
"administrative10" : 20,
"administrative11" : 22,
"administrative12" : 24
},
"landuse" : {
"residential" : 22,
"farm" : 22,
"farmyard" : 22,
"industrial" : 22,
"commercial" : 22,
"allotments" : 22,
"retail" : 22,
"" : [22, 0]
},
"leisure" : {
"park" : [24, 0]
},
"natural" : {
"peak" : [18, 0],
"volcano" : [18, 0],
"mountain_range" : [18, 0],
"sea" : [4, 0]
},
"waterway" : {
"" : [17, 0]
},
"highway" : {
"" : 26,
"service" : 27,
"cycleway" : 27,
"path" : 27,
"footway" : 27,
"steps" : 27,
"bridleway" : 27,
"motorway_link" : 27,
"primary_link" : 27,
"trunk_link" : 27,
"secondary_link" : 27,
"tertiary_link" : 27
},
"mountain_pass" : {
"" : [20, 0]
}
}
}
]

View File

@@ -49,6 +49,7 @@ if (isset($_GET['debug']) && $_GET['debug']) @define('CONST_Debug', true);
@define('CONST_Pyosmium_Binary', '@PYOSMIUM_PATH@');
@define('CONST_Tiger_Data_Path', CONST_ExtraDataPath.'/tiger');
@define('CONST_Wikipedia_Data_Path', CONST_ExtraDataPath);
@define('CONST_Address_Level_Config', CONST_BasePath.'/settings/address-levels.json');
// osm2pgsql settings
@define('CONST_Osm2pgsql_Flatnode_File', null);

View File

@@ -801,11 +801,12 @@ DECLARE
i INTEGER;
postcode TEXT;
result BOOLEAN;
is_area BOOLEAN;
country_code VARCHAR(2);
default_language VARCHAR(10);
diameter FLOAT;
classtable TEXT;
line RECORD;
classtype TEXT;
BEGIN
--DEBUG: RAISE WARNING '% % % %',NEW.osm_type,NEW.osm_id,NEW.class,NEW.type;
@@ -832,148 +833,71 @@ BEGIN
IF NEW.osm_type = 'X' THEN
-- E'X'ternal records should already be in the right format so do nothing
ELSE
NEW.rank_search := 30;
NEW.rank_address := NEW.rank_search;
is_area := ST_GeometryType(NEW.geometry) IN ('ST_Polygon','ST_MultiPolygon');
-- By doing in postgres we have the country available to us - currently only used for postcode
IF NEW.class in ('place','boundary') AND NEW.type in ('postcode','postal_code') THEN
IF NEW.class in ('place','boundary')
AND NEW.type in ('postcode','postal_code') THEN
IF NEW.address IS NULL OR NOT NEW.address ? 'postcode' THEN
-- most likely just a part of a multipolygon postcode boundary, throw it away
RETURN NULL;
END IF;
NEW.name := hstore('ref', NEW.address->'postcode');
SELECT * FROM get_postcode_rank(NEW.country_code, NEW.address->'postcode')
INTO NEW.rank_search, NEW.rank_address;
IF NOT ST_GeometryType(NEW.geometry) IN ('ST_Polygon','ST_MultiPolygon') THEN
NEW.rank_address := 0;
END IF;
ELSEIF NEW.class = 'place' THEN
IF NEW.type in ('continent', 'sea') THEN
NEW.rank_search := 2;
NEW.rank_address := 0;
NEW.country_code := NULL;
ELSEIF NEW.type in ('country') THEN
NEW.rank_search := 4;
IF ST_GeometryType(NEW.geometry) IN ('ST_Polygon','ST_MultiPolygon') THEN
NEW.rank_address := NEW.rank_search;
ELSE
NEW.rank_address := 0;
END IF;
ELSEIF NEW.type in ('state') THEN
NEW.rank_search := 8;
IF ST_GeometryType(NEW.geometry) IN ('ST_Polygon','ST_MultiPolygon') THEN
NEW.rank_address := NEW.rank_search;
ELSE
NEW.rank_address := 0;
END IF;
ELSEIF NEW.type in ('region') THEN
NEW.rank_search := 18; -- dropped from previous value of 10
NEW.rank_address := 0; -- So badly miss-used that better to just drop it!
ELSEIF NEW.type in ('county') THEN
NEW.rank_search := 12;
NEW.rank_address := NEW.rank_search;
ELSEIF NEW.type in ('city') THEN
NEW.rank_search := 16;
NEW.rank_address := NEW.rank_search;
ELSEIF NEW.type in ('island') THEN
NEW.rank_search := 17;
NEW.rank_address := 0;
ELSEIF NEW.type in ('town') THEN
NEW.rank_search := 18;
NEW.rank_address := 16;
ELSEIF NEW.type in ('village','hamlet','municipality','district','unincorporated_area','borough') THEN
NEW.rank_search := 19;
NEW.rank_address := 16;
ELSEIF NEW.type in ('suburb','croft','subdivision','isolated_dwelling') THEN
NEW.rank_search := 20;
NEW.rank_address := NEW.rank_search;
ELSEIF NEW.type in ('farm','locality','islet','mountain_pass') THEN
NEW.rank_search := 20;
NEW.rank_address := 0;
-- Irish townlands, tagged as place=locality and locality=townland
IF (NEW.extratags -> 'locality') = 'townland' THEN
NEW.rank_address := 20;
END IF;
ELSEIF NEW.type in ('neighbourhood') THEN
NEW.rank_search := 22;
NEW.rank_address := 22;
ELSEIF NEW.type in ('house','building') THEN
NEW.rank_search := 30;
NEW.rank_address := NEW.rank_search;
ELSEIF NEW.type in ('houses') THEN
-- can't guarantee all required nodes loaded yet due to caching in osm2pgsql
NEW.rank_search := 28;
NEW.rank_address := 0;
IF NEW.address IS NULL OR NOT NEW.address ? 'postcode' THEN
-- most likely just a part of a multipolygon postcode boundary, throw it away
RETURN NULL;
END IF;
ELSEIF NEW.class = 'boundary' THEN
IF ST_GeometryType(NEW.geometry) NOT IN ('ST_Polygon','ST_MultiPolygon') THEN
-- RAISE WARNING 'invalid boundary %',NEW.osm_id;
NEW.name := hstore('ref', NEW.address->'postcode');
SELECT * FROM get_postcode_rank(NEW.country_code, NEW.address->'postcode')
INTO NEW.rank_search, NEW.rank_address;
IF NOT is_area THEN
NEW.rank_address := 0;
END IF;
ELSEIF NEW.class = 'boundary' AND NOT is_area THEN
return NULL;
END IF;
NEW.rank_search := NEW.admin_level * 2;
IF NEW.type = 'administrative' THEN
NEW.rank_address := NEW.rank_search;
ELSE
NEW.rank_address := 0;
END IF;
ELSEIF NEW.class = 'landuse' AND ST_GeometryType(NEW.geometry) in ('ST_Polygon','ST_MultiPolygon') THEN
NEW.rank_search := 22;
IF NEW.type in ('residential', 'farm', 'farmyard', 'industrial', 'commercial', 'allotments', 'retail') THEN
NEW.rank_address := NEW.rank_search;
ELSE
NEW.rank_address := 0;
END IF;
ELSEIF NEW.class = 'leisure' and NEW.type in ('park') THEN
NEW.rank_search := 24;
NEW.rank_address := 0;
ELSEIF NEW.class = 'natural' and NEW.type in ('peak','volcano','mountain_range') THEN
NEW.rank_search := 18;
NEW.rank_address := 0;
ELSEIF NEW.class = 'natural' and NEW.type = 'sea' THEN
NEW.rank_search := 4;
NEW.rank_address := NEW.rank_search;
-- any feature more than 5 square miles is probably worth indexing
ELSEIF ST_GeometryType(NEW.geometry) in ('ST_Polygon','ST_MultiPolygon') AND ST_Area(NEW.geometry) > 0.1 THEN
NEW.rank_search := 22;
NEW.rank_address := 0;
ELSEIF NEW.class = 'railway' AND NEW.type in ('rail') THEN
RETURN NULL;
ELSEIF NEW.class = 'waterway' THEN
IF NEW.osm_type = 'R' THEN
NEW.rank_search := 16;
return NULL;
ELSEIF NEW.osm_type = 'N' AND NEW.class = 'highway' THEN
NEW.rank_search = 30;
NEW.rank_address = 0;
ELSEIF NEW.class = 'landuse' AND NOT is_area THEN
NEW.rank_search = 30;
NEW.rank_address = 0;
ELSE
-- do table lookup stuff
IF NEW.class = 'boundary' and NEW.type = 'administrative' THEN
classtype = NEW.type || NEW.admin_level::TEXT;
ELSE
NEW.rank_search := 17;
classtype = NEW.type;
END IF;
SELECT l.rank_search, l.rank_address FROM address_levels l
WHERE (l.country_code = NEW.country_code or l.country_code is NULL)
AND l.class = NEW.class AND (l.type = classtype or l.type is NULL)
ORDER BY l.country_code, l.class, l.type LIMIT 1
INTO NEW.rank_search, NEW.rank_address;
IF NEW.rank_search is NULL THEN
NEW.rank_search := 30;
END IF;
IF NEW.rank_address is NULL THEN
NEW.rank_address := 30;
END IF;
NEW.rank_address := 0;
ELSEIF NEW.class = 'highway' AND NEW.osm_type != 'N' AND NEW.type in ('service','cycleway','path','footway','steps','bridleway','motorway_link','primary_link','trunk_link','secondary_link','tertiary_link') THEN
NEW.rank_search := 27;
NEW.rank_address := NEW.rank_search;
ELSEIF NEW.class = 'highway' AND NEW.osm_type != 'N' THEN
NEW.rank_search := 26;
NEW.rank_address := NEW.rank_search;
ELSEIF NEW.class = 'mountain_pass' THEN
NEW.rank_search := 20;
NEW.rank_address := 0;
END IF;
END IF;
-- some postcorrections
IF NEW.class = 'place' THEN
IF NEW.type in ('continent', 'sea', 'country', 'state') AND NEW.osm_type = 'N' THEN
NEW.rank_address := 0;
END IF;
ELSEIF NEW.class = 'waterway' AND NEW.osm_type = 'R' THEN
-- Slightly promote waterway relations so that they are processed
-- before their members.
NEW.rank_search := NEW.rank_search - 1;
END IF;
IF NEW.rank_search > 30 THEN
NEW.rank_search := 30;
END IF;
IF (NEW.extratags -> 'capital') = 'yes' THEN
NEW.rank_search := NEW.rank_search - 1;
END IF;
IF NEW.rank_address > 30 THEN
NEW.rank_address := 30;
END IF;
IF (NEW.extratags -> 'capital') = 'yes' THEN
NEW.rank_search := NEW.rank_search - 1;
END IF;
-- a country code make no sense below rank 4 (country)

View File

@@ -26,8 +26,8 @@ Feature: Import into placex
| R1 | boundary | administrative | 2 | de | (-100 40, -101 40, -101 41, -100 41, -100 40) |
When importing
Then placex contains
| object | addr+country | country_code |
| R1 | de | de |
| object | rank_search| addr+country | country_code |
| R1 | 4 | de | de |
Scenario: Illegal country code tag for countries is ignored
Given the named places
@@ -157,9 +157,6 @@ Feature: Import into placex
| N36 | place | house |
| N37 | place | building |
| N38 | place | houses |
And the named places
| osm | class | type | extra+locality |
| N100 | place | locality | townland |
And the named places
| osm | class | type | extra+capital |
| N101 | place | city | yes |
@@ -191,7 +188,6 @@ Feature: Import into placex
| N32 | 20 | 0 |
| N33 | 20 | 0 |
| N34 | 20 | 0 |
| N100 | 20 | 20 |
| N101 | 15 | 16 |
| N35 | 22 | 22 |
| N36 | 30 | 30 |
@@ -222,8 +218,8 @@ Feature: Import into placex
| object | rank_search | rank_address |
| R20 | 4 | 4 |
| R21 | 30 | 30 |
| R22 | 12 | 0 |
| R23 | 20 | 0 |
| R22 | 30 | 30 |
| R23 | 30 | 30 |
| R40 | 4 | 4 |
| R41 | 8 | 8 |
@@ -243,7 +239,7 @@ Feature: Import into placex
When importing
Then placex contains
| object | rank_search | rank_address |
| N1 | 30 | 30 |
| N1 | 30 | 0 |
| W1 | 26 | 26 |
| W2 | 26 | 26 |
| W3 | 26 | 26 |
@@ -264,11 +260,11 @@ Feature: Import into placex
When importing
Then placex contains
| object | rank_search | rank_address |
| N2 | 30 | 30 |
| W2 | 30 | 30 |
| N2 | 30 | 0 |
| W2 | 30 | 0 |
| W4 | 22 | 22 |
| R2 | 22 | 22 |
| R3 | 22 | 0 |
| R3 | 22 | 0 |
Scenario: rank and inclusion of naturals
Given the named places
@@ -292,8 +288,8 @@ Feature: Import into placex
| N5 | 30 | 30 |
| W2 | 18 | 0 |
| R3 | 18 | 0 |
| R4 | 22 | 0 |
| R5 | 4 | 4 |
| R6 | 4 | 4 |
| R4 | 30 | 30 |
| R5 | 4 | 0 |
| R6 | 4 | 0 |
| W3 | 30 | 30 |

View File

@@ -34,7 +34,7 @@ Feature: Update of simple objects
When importing
Then placex contains
| object | rank_address |
| R1 | 0 |
| R1 | 30 |
| W1 | 30 |
When marking for delete R1,W1
Then placex has no entry for W1
@@ -103,4 +103,4 @@ Feature: Update of simple objects
| W1 | boundary | historic | Haha | 5 | (1, 2, 4, 3, 1) |
Then placex contains
| object | rank_address |
| W1 | 0 |
| W1 | 30 |

View File

@@ -5,6 +5,7 @@ require_once(dirname(dirname(__FILE__)).'/settings/settings.php');
require_once(CONST_BasePath.'/lib/init-cmd.php');
require_once(CONST_BasePath.'/lib/setup_functions.php');
require_once(CONST_BasePath.'/lib/setup/SetupClass.php');
require_once(CONST_BasePath.'/lib/setup/AddressLevelParser.php');
ini_set('memory_limit', '800M');
@@ -42,6 +43,7 @@ $aCMDOptions
array('deduplicate', '', 0, 1, 0, 0, 'bool', 'Deduplicate tokens'),
array('recompute-word-counts', '', 0, 1, 0, 0, 'bool', 'Compute frequency of full-word search terms'),
array('update-address-levels', '', 0, 1, 0, 0, 'bool', 'Reimport address level configuration (EXPERT)'),
array('no-npi', '', 0, 1, 0, 0, 'bool', '(obsolete)'),
);
@@ -307,6 +309,12 @@ if ($aResult['index']) {
runWithEnv($sCmd, $aProcEnv);
}
if ($aResult['update-address-levels']) {
echo 'Updating address levels from '.CONST_Address_Level_Config.".\n";
$oAlParser = new \Nominatim\Setup\AddressLevelParser(CONST_Address_Level_Config);
$oAlParser->createTable($oDB, 'address_levels');
}
if ($aResult['import-osmosis'] || $aResult['import-osmosis-all']) {
//
if (strpos(CONST_Replication_Url, 'download.geofabrik.de') !== false && CONST_Replication_Update_Interval < 86400) {