ignore frequent partial search terms

Adds word counts from a full planet to the word table. There is a
new configuration option, CONST_Max_Word_Frequency, which allows the
word count to be taken into account: the value that was set on import
is used to determine whether a word is added to the search_name table.
The value at runtime determines whether a single term should be
used for partial search or simply be ignored.
This commit is contained in:
Sarah Hoffmann
2012-09-25 00:36:34 +02:00
parent f1063497ef
commit e73e67001e
5 changed files with 49667 additions and 12 deletions

49646
data/words.sql Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -5,6 +5,7 @@
// General settings // General settings
@define('CONST_Debug', false); @define('CONST_Debug', false);
@define('CONST_Database_DSN', 'pgsql://@/nominatim'); @define('CONST_Database_DSN', 'pgsql://@/nominatim');
@define('CONST_Max_Word_Frequency', '50000');
// Paths // Paths
@define('CONST_Postgresql_Version', '9.1'); @define('CONST_Postgresql_Version', '9.1');

View File

@@ -83,18 +83,24 @@ END;
$$ $$
LANGUAGE 'plpgsql' IMMUTABLE; LANGUAGE 'plpgsql' IMMUTABLE;
-- returns NULL if the word is too common
CREATE OR REPLACE FUNCTION getorcreate_word_id(lookup_word TEXT) CREATE OR REPLACE FUNCTION getorcreate_word_id(lookup_word TEXT)
RETURNS INTEGER RETURNS INTEGER
AS $$ AS $$
DECLARE DECLARE
lookup_token TEXT; lookup_token TEXT;
return_word_id INTEGER; return_word_id INTEGER;
count INTEGER;
BEGIN BEGIN
lookup_token := trim(lookup_word); lookup_token := trim(lookup_word);
SELECT min(word_id) FROM word WHERE word_token = lookup_token and class is null and type is null into return_word_id; SELECT min(word_id), max(search_name_count) FROM word WHERE word_token = lookup_token and class is null and type is null into return_word_id, count;
IF return_word_id IS NULL THEN IF return_word_id IS NULL THEN
return_word_id := nextval('seq_word'); return_word_id := nextval('seq_word');
INSERT INTO word VALUES (return_word_id, lookup_token, regexp_replace(lookup_token,E'([^0-9])\\1+',E'\\1','g'), null, null, null, null, 0, null); INSERT INTO word VALUES (return_word_id, lookup_token, regexp_replace(lookup_token,E'([^0-9])\\1+',E'\\1','g'), null, null, null, null, 0, null);
ELSE
IF count > get_maxwordfreq() THEN
return_word_id := NULL;
END IF;
END IF; END IF;
RETURN return_word_id; RETURN return_word_id;
END; END;
@@ -317,7 +323,7 @@ BEGIN
FOR j IN 1..array_upper(words, 1) LOOP FOR j IN 1..array_upper(words, 1) LOOP
IF (words[j] != '') THEN IF (words[j] != '') THEN
w = getorcreate_word_id(words[j]); w = getorcreate_word_id(words[j]);
IF NOT (ARRAY[w] <@ result) THEN IF w IS NOT NULL AND NOT (ARRAY[w] <@ result) THEN
result := result || w; result := result || w;
END IF; END IF;
END IF; END IF;
@@ -330,7 +336,7 @@ BEGIN
s := make_standard_name(words[j]); s := make_standard_name(words[j]);
IF s != '' THEN IF s != '' THEN
w := getorcreate_word_id(s); w := getorcreate_word_id(s);
IF NOT (ARRAY[w] <@ result) THEN IF w IS NOT NULL AND NOT (ARRAY[w] <@ result) THEN
result := result || w; result := result || w;
END IF; END IF;
END IF; END IF;
@@ -379,7 +385,7 @@ BEGIN
FOR j IN 1..array_upper(words, 1) LOOP FOR j IN 1..array_upper(words, 1) LOOP
IF (words[j] != '') THEN IF (words[j] != '') THEN
w = getorcreate_word_id(words[j]); w = getorcreate_word_id(words[j]);
IF NOT (ARRAY[w] <@ result) THEN IF w IS NOT NULL AND NOT (ARRAY[w] <@ result) THEN
result := result || w; result := result || w;
END IF; END IF;
END IF; END IF;
@@ -392,7 +398,7 @@ BEGIN
s := make_standard_name(words[j]); s := make_standard_name(words[j]);
IF s != '' THEN IF s != '' THEN
w := getorcreate_word_id(s); w := getorcreate_word_id(s);
IF NOT (ARRAY[w] <@ result) THEN IF w IS NOT NULL AND NOT (ARRAY[w] <@ result) THEN
result := result || w; result := result || w;
END IF; END IF;
END IF; END IF;

View File

@@ -271,7 +271,7 @@
if ($aCMDResult['load-data'] || $aCMDResult['all']) if ($aCMDResult['load-data'] || $aCMDResult['all'])
{ {
echo "Load Data\n"; echo "Drop old Data\n";
$bDidSomething = true; $bDidSomething = true;
$oDB =& getDB(); $oDB =& getDB();
@@ -307,17 +307,18 @@
echo '.'; echo '.';
} }
// used by getorcreate_word_id to ignore frequent partial words
if (!pg_query($oDB->connection, 'CREATE OR REPLACE FUNCTION get_maxwordfreq() RETURNS integer AS $$ SELECT '.CONST_Max_Word_Frequency.' as maxwordfreq; $$ LANGUAGE SQL IMMUTABLE')) fail(pg_last_error($oDB->connection));
echo ".\n";
// pre-create the word list // pre-create the word list
if (!$aCMDResult['disable-token-precalc']) if (!$aCMDResult['disable-token-precalc'])
{ {
if (!pg_query($oDB->connection, 'select count(make_keywords(v)) from (select distinct svals(name) as v from place) as w where v is not null;')) fail(pg_last_error($oDB->connection)); echo "Loading word list\n";
echo '.'; pgsqlRunScriptFile(CONST_BasePath.'/data/words.sql');
if (!pg_query($oDB->connection, 'select count(make_keywords(v)) from (select distinct postcode as v from place) as w where v is not null;')) fail(pg_last_error($oDB->connection));
echo '.';
if (!pg_query($oDB->connection, 'select count(getorcreate_housenumber_id(v)) from (select distinct housenumber as v from place where housenumber is not null) as w;')) fail(pg_last_error($oDB->connection));
echo '.';
} }
echo "Load Data\n";
$aDBInstances = array(); $aDBInstances = array();
for($i = 0; $i < $iInstances; $i++) for($i = 0; $i < $iInstances; $i++)
{ {

View File

@@ -312,6 +312,7 @@
// Check which tokens we have, get the ID numbers // Check which tokens we have, get the ID numbers
$sSQL = 'select word_id,word_token, word, class, type, location, country_code, operator'; $sSQL = 'select word_id,word_token, word, class, type, location, country_code, operator';
$sSQL .= ' from word where word_token in ('.join(',',array_map("getDBQuoted",$aTokens)).')'; $sSQL .= ' from word where word_token in ('.join(',',array_map("getDBQuoted",$aTokens)).')';
$sSQL .= ' and search_name_count < '.CONST_Max_Word_Frequency;
$sSQL .= ' and (class is null or class not in (\'highway\'))'; $sSQL .= ' and (class is null or class not in (\'highway\'))';
// $sSQL .= ' group by word_token, word, class, type, location, country_code'; // $sSQL .= ' group by word_token, word, class, type, location, country_code';