mirror of
https://github.com/osm-search/Nominatim.git
synced 2026-03-11 05:14:07 +00:00
ignore frequent partial search terms
Adds word counts from a full planet to the word table. A new configuration option, CONST_Max_Word_Frequency, makes it possible to take the word count into account: the value set at import time determines whether a word is added to the search_name table, while the value at runtime determines whether a single term is used for partial search or simply ignored.
This commit is contained in:
49646
data/words.sql
Normal file
49646
data/words.sql
Normal file
File diff suppressed because it is too large
Load Diff
@@ -5,6 +5,7 @@
|
|||||||
// General settings
|
// General settings
|
||||||
@define('CONST_Debug', false);
|
@define('CONST_Debug', false);
|
||||||
@define('CONST_Database_DSN', 'pgsql://@/nominatim');
|
@define('CONST_Database_DSN', 'pgsql://@/nominatim');
|
||||||
|
@define('CONST_Max_Word_Frequency', '50000');
|
||||||
|
|
||||||
// Paths
|
// Paths
|
||||||
@define('CONST_Postgresql_Version', '9.1');
|
@define('CONST_Postgresql_Version', '9.1');
|
||||||
|
|||||||
@@ -83,18 +83,24 @@ END;
|
|||||||
$$
|
$$
|
||||||
LANGUAGE 'plpgsql' IMMUTABLE;
|
LANGUAGE 'plpgsql' IMMUTABLE;
|
||||||
|
|
||||||
|
-- returns NULL if the word already exists and its search_name_count exceeds get_maxwordfreq() (i.e. the word is too common)
|
||||||
CREATE OR REPLACE FUNCTION getorcreate_word_id(lookup_word TEXT)
|
CREATE OR REPLACE FUNCTION getorcreate_word_id(lookup_word TEXT)
|
||||||
RETURNS INTEGER
|
RETURNS INTEGER
|
||||||
AS $$
|
AS $$
|
||||||
DECLARE
|
DECLARE
|
||||||
lookup_token TEXT;
|
lookup_token TEXT;
|
||||||
return_word_id INTEGER;
|
return_word_id INTEGER;
|
||||||
|
count INTEGER;
|
||||||
BEGIN
|
BEGIN
|
||||||
lookup_token := trim(lookup_word);
|
lookup_token := trim(lookup_word);
|
||||||
SELECT min(word_id) FROM word WHERE word_token = lookup_token and class is null and type is null into return_word_id;
|
SELECT min(word_id), max(search_name_count) FROM word WHERE word_token = lookup_token and class is null and type is null into return_word_id, count;
|
||||||
IF return_word_id IS NULL THEN
|
IF return_word_id IS NULL THEN
|
||||||
return_word_id := nextval('seq_word');
|
return_word_id := nextval('seq_word');
|
||||||
INSERT INTO word VALUES (return_word_id, lookup_token, regexp_replace(lookup_token,E'([^0-9])\\1+',E'\\1','g'), null, null, null, null, 0, null);
|
INSERT INTO word VALUES (return_word_id, lookup_token, regexp_replace(lookup_token,E'([^0-9])\\1+',E'\\1','g'), null, null, null, null, 0, null);
|
||||||
|
ELSE
|
||||||
|
IF count > get_maxwordfreq() THEN
|
||||||
|
return_word_id := NULL;
|
||||||
|
END IF;
|
||||||
END IF;
|
END IF;
|
||||||
RETURN return_word_id;
|
RETURN return_word_id;
|
||||||
END;
|
END;
|
||||||
@@ -317,7 +323,7 @@ BEGIN
|
|||||||
FOR j IN 1..array_upper(words, 1) LOOP
|
FOR j IN 1..array_upper(words, 1) LOOP
|
||||||
IF (words[j] != '') THEN
|
IF (words[j] != '') THEN
|
||||||
w = getorcreate_word_id(words[j]);
|
w = getorcreate_word_id(words[j]);
|
||||||
IF NOT (ARRAY[w] <@ result) THEN
|
IF w IS NOT NULL AND NOT (ARRAY[w] <@ result) THEN
|
||||||
result := result || w;
|
result := result || w;
|
||||||
END IF;
|
END IF;
|
||||||
END IF;
|
END IF;
|
||||||
@@ -330,7 +336,7 @@ BEGIN
|
|||||||
s := make_standard_name(words[j]);
|
s := make_standard_name(words[j]);
|
||||||
IF s != '' THEN
|
IF s != '' THEN
|
||||||
w := getorcreate_word_id(s);
|
w := getorcreate_word_id(s);
|
||||||
IF NOT (ARRAY[w] <@ result) THEN
|
IF w IS NOT NULL AND NOT (ARRAY[w] <@ result) THEN
|
||||||
result := result || w;
|
result := result || w;
|
||||||
END IF;
|
END IF;
|
||||||
END IF;
|
END IF;
|
||||||
@@ -379,7 +385,7 @@ BEGIN
|
|||||||
FOR j IN 1..array_upper(words, 1) LOOP
|
FOR j IN 1..array_upper(words, 1) LOOP
|
||||||
IF (words[j] != '') THEN
|
IF (words[j] != '') THEN
|
||||||
w = getorcreate_word_id(words[j]);
|
w = getorcreate_word_id(words[j]);
|
||||||
IF NOT (ARRAY[w] <@ result) THEN
|
IF w IS NOT NULL AND NOT (ARRAY[w] <@ result) THEN
|
||||||
result := result || w;
|
result := result || w;
|
||||||
END IF;
|
END IF;
|
||||||
END IF;
|
END IF;
|
||||||
@@ -392,7 +398,7 @@ BEGIN
|
|||||||
s := make_standard_name(words[j]);
|
s := make_standard_name(words[j]);
|
||||||
IF s != '' THEN
|
IF s != '' THEN
|
||||||
w := getorcreate_word_id(s);
|
w := getorcreate_word_id(s);
|
||||||
IF NOT (ARRAY[w] <@ result) THEN
|
IF w IS NOT NULL AND NOT (ARRAY[w] <@ result) THEN
|
||||||
result := result || w;
|
result := result || w;
|
||||||
END IF;
|
END IF;
|
||||||
END IF;
|
END IF;
|
||||||
|
|||||||
@@ -271,7 +271,7 @@
|
|||||||
|
|
||||||
if ($aCMDResult['load-data'] || $aCMDResult['all'])
|
if ($aCMDResult['load-data'] || $aCMDResult['all'])
|
||||||
{
|
{
|
||||||
echo "Load Data\n";
|
echo "Drop old Data\n";
|
||||||
$bDidSomething = true;
|
$bDidSomething = true;
|
||||||
|
|
||||||
$oDB =& getDB();
|
$oDB =& getDB();
|
||||||
@@ -307,17 +307,18 @@
|
|||||||
echo '.';
|
echo '.';
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// used by getorcreate_word_id to ignore frequent partial words
|
||||||
|
if (!pg_query($oDB->connection, 'CREATE OR REPLACE FUNCTION get_maxwordfreq() RETURNS integer AS $$ SELECT '.CONST_Max_Word_Frequency.' as maxwordfreq; $$ LANGUAGE SQL IMMUTABLE')) fail(pg_last_error($oDB->connection));
|
||||||
|
echo ".\n";
|
||||||
|
|
||||||
// pre-create the word list
|
// pre-create the word list
|
||||||
if (!$aCMDResult['disable-token-precalc'])
|
if (!$aCMDResult['disable-token-precalc'])
|
||||||
{
|
{
|
||||||
if (!pg_query($oDB->connection, 'select count(make_keywords(v)) from (select distinct svals(name) as v from place) as w where v is not null;')) fail(pg_last_error($oDB->connection));
|
echo "Loading word list\n";
|
||||||
echo '.';
|
pgsqlRunScriptFile(CONST_BasePath.'/data/words.sql');
|
||||||
if (!pg_query($oDB->connection, 'select count(make_keywords(v)) from (select distinct postcode as v from place) as w where v is not null;')) fail(pg_last_error($oDB->connection));
|
|
||||||
echo '.';
|
|
||||||
if (!pg_query($oDB->connection, 'select count(getorcreate_housenumber_id(v)) from (select distinct housenumber as v from place where housenumber is not null) as w;')) fail(pg_last_error($oDB->connection));
|
|
||||||
echo '.';
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
echo "Load Data\n";
|
||||||
$aDBInstances = array();
|
$aDBInstances = array();
|
||||||
for($i = 0; $i < $iInstances; $i++)
|
for($i = 0; $i < $iInstances; $i++)
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -312,6 +312,7 @@
|
|||||||
// Check which tokens we have, get the ID numbers
|
// Check which tokens we have, get the ID numbers
|
||||||
$sSQL = 'select word_id,word_token, word, class, type, location, country_code, operator';
|
$sSQL = 'select word_id,word_token, word, class, type, location, country_code, operator';
|
||||||
$sSQL .= ' from word where word_token in ('.join(',',array_map("getDBQuoted",$aTokens)).')';
|
$sSQL .= ' from word where word_token in ('.join(',',array_map("getDBQuoted",$aTokens)).')';
|
||||||
|
$sSQL .= ' and search_name_count < '.CONST_Max_Word_Frequency;
|
||||||
$sSQL .= ' and (class is null or class not in (\'highway\'))';
|
$sSQL .= ' and (class is null or class not in (\'highway\'))';
|
||||||
// $sSQL .= ' group by word_token, word, class, type, location, country_code';
|
// $sSQL .= ' group by word_token, word, class, type, location, country_code';
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user