mirror of
https://github.com/osm-search/Nominatim.git
synced 2026-03-11 13:24:07 +00:00
reinstate word column in icu word table
PostgreSQL is very bad at creating statistics for jsonb columns. The result is that the query planner tends to use JIT for queries with a WHERE clause over 'info' even when there is an index.
This commit is contained in:
@@ -145,8 +145,7 @@ class Tokenizer
|
|||||||
private function addTokensFromDB(&$oValidTokens, $aTokens, $sNormQuery)
|
private function addTokensFromDB(&$oValidTokens, $aTokens, $sNormQuery)
|
||||||
{
|
{
|
||||||
// Check which tokens we have, get the ID numbers
|
// Check which tokens we have, get the ID numbers
|
||||||
$sSQL = 'SELECT word_id, word_token, type,';
|
$sSQL = 'SELECT word_id, word_token, type, word,';
|
||||||
$sSQL .= " info->>'cc' as country, info->>'postcode' as postcode,";
|
|
||||||
$sSQL .= " info->>'op' as operator,";
|
$sSQL .= " info->>'op' as operator,";
|
||||||
$sSQL .= " info->>'class' as class, info->>'type' as ctype,";
|
$sSQL .= " info->>'class' as class, info->>'type' as ctype,";
|
||||||
$sSQL .= " info->>'count' as count";
|
$sSQL .= " info->>'count' as count";
|
||||||
@@ -163,11 +162,14 @@ class Tokenizer
|
|||||||
|
|
||||||
switch ($aWord['type']) {
|
switch ($aWord['type']) {
|
||||||
case 'C': // country name tokens
|
case 'C': // country name tokens
|
||||||
if ($aWord['country'] !== null
|
if ($aWord['word'] !== null
|
||||||
&& (!$this->aCountryRestriction
|
&& (!$this->aCountryRestriction
|
||||||
|| in_array($aWord['country'], $this->aCountryRestriction))
|
|| in_array($aWord['word'], $this->aCountryRestriction))
|
||||||
) {
|
) {
|
||||||
$oValidTokens->addToken($sTok, new Token\Country($iId, $aWord['country']));
|
$oValidTokens->addToken(
|
||||||
|
$sTok,
|
||||||
|
new Token\Country($iId, $aWord['word'])
|
||||||
|
);
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case 'H': // house number tokens
|
case 'H': // house number tokens
|
||||||
@@ -177,12 +179,15 @@ class Tokenizer
|
|||||||
// Postcodes are not normalized, so they may have content
|
// Postcodes are not normalized, so they may have content
|
||||||
// that makes SQL injection possible. Reject postcodes
|
// that makes SQL injection possible. Reject postcodes
|
||||||
// that would need special escaping.
|
// that would need special escaping.
|
||||||
if ($aWord['postcode'] !== null
|
if ($aWord['word'] !== null
|
||||||
&& pg_escape_string($aWord['postcode']) == $aWord['postcode']
|
&& pg_escape_string($aWord['word']) == $aWord['word']
|
||||||
) {
|
) {
|
||||||
$sNormPostcode = $this->normalizeString($aWord['postcode']);
|
$sNormPostcode = $this->normalizeString($aWord['word']);
|
||||||
if (strpos($sNormQuery, $sNormPostcode) !== false) {
|
if (strpos($sNormQuery, $sNormPostcode) !== false) {
|
||||||
$oValidTokens->addToken($sTok, new Token\Postcode($iId, $aWord['postcode'], null));
|
$oValidTokens->addToken(
|
||||||
|
$sTok,
|
||||||
|
new Token\Postcode($iId, $aWord['word'], null)
|
||||||
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
@@ -192,7 +197,7 @@ class Tokenizer
|
|||||||
$iId,
|
$iId,
|
||||||
$aWord['class'],
|
$aWord['class'],
|
||||||
$aWord['ctype'],
|
$aWord['ctype'],
|
||||||
(isset($aWord['op'])) ? Operator::NEAR : Operator::NONE
|
(isset($aWord['operator'])) ? Operator::NEAR : Operator::NONE
|
||||||
));
|
));
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
|||||||
@@ -3,6 +3,7 @@ CREATE TABLE word (
|
|||||||
word_id INTEGER,
|
word_id INTEGER,
|
||||||
word_token text NOT NULL,
|
word_token text NOT NULL,
|
||||||
type text NOT NULL,
|
type text NOT NULL,
|
||||||
|
word text,
|
||||||
info jsonb
|
info jsonb
|
||||||
) {{db.tablespace.search_data}};
|
) {{db.tablespace.search_data}};
|
||||||
|
|
||||||
@@ -10,15 +11,15 @@ CREATE INDEX idx_word_word_token ON word
|
|||||||
USING BTREE (word_token) {{db.tablespace.search_index}};
|
USING BTREE (word_token) {{db.tablespace.search_index}};
|
||||||
-- Used when updating country names from the boundary relation.
|
-- Used when updating country names from the boundary relation.
|
||||||
CREATE INDEX idx_word_country_names ON word
|
CREATE INDEX idx_word_country_names ON word
|
||||||
USING btree((info->>'cc')) {{db.tablespace.address_index}}
|
USING btree(word) {{db.tablespace.address_index}}
|
||||||
WHERE type = 'C';
|
WHERE type = 'C';
|
||||||
-- Used when inserting new postcodes on updates.
|
-- Used when inserting new postcodes on updates.
|
||||||
CREATE INDEX idx_word_postcodes ON word
|
CREATE INDEX idx_word_postcodes ON word
|
||||||
USING btree((info->>'postcode')) {{db.tablespace.address_index}}
|
USING btree(word) {{db.tablespace.address_index}}
|
||||||
WHERE type = 'P';
|
WHERE type = 'P';
|
||||||
-- Used when inserting full words.
|
-- Used when inserting full words.
|
||||||
CREATE INDEX idx_word_full_word ON word
|
CREATE INDEX idx_word_full_word ON word
|
||||||
USING btree((info->>'word')) {{db.tablespace.address_index}}
|
USING btree(word) {{db.tablespace.address_index}}
|
||||||
WHERE type = 'W';
|
WHERE type = 'W';
|
||||||
|
|
||||||
GRANT SELECT ON word TO "{{config.DATABASE_WEBUSER}}";
|
GRANT SELECT ON word TO "{{config.DATABASE_WEBUSER}}";
|
||||||
|
|||||||
@@ -98,13 +98,13 @@ DECLARE
|
|||||||
term_count INTEGER;
|
term_count INTEGER;
|
||||||
BEGIN
|
BEGIN
|
||||||
SELECT min(word_id) INTO full_token
|
SELECT min(word_id) INTO full_token
|
||||||
FROM word WHERE info->>'word' = norm_term and type = 'W';
|
FROM word WHERE word = norm_term and type = 'W';
|
||||||
|
|
||||||
IF full_token IS NULL THEN
|
IF full_token IS NULL THEN
|
||||||
full_token := nextval('seq_word');
|
full_token := nextval('seq_word');
|
||||||
INSERT INTO word (word_id, word_token, type, info)
|
INSERT INTO word (word_id, word_token, type, word, info)
|
||||||
SELECT full_token, lookup_term, 'W',
|
SELECT full_token, lookup_term, 'W', norm_term,
|
||||||
json_build_object('word', norm_term, 'count', 0)
|
json_build_object('count', 0)
|
||||||
FROM unnest(lookup_terms) as lookup_term;
|
FROM unnest(lookup_terms) as lookup_term;
|
||||||
END IF;
|
END IF;
|
||||||
|
|
||||||
|
|||||||
@@ -278,7 +278,7 @@ class LegacyICUNameAnalyzer:
|
|||||||
(SELECT pc, word FROM
|
(SELECT pc, word FROM
|
||||||
(SELECT distinct(postcode) as pc FROM location_postcode) p
|
(SELECT distinct(postcode) as pc FROM location_postcode) p
|
||||||
FULL JOIN
|
FULL JOIN
|
||||||
(SELECT info->>'postcode' as word FROM word WHERE type = 'P') w
|
(SELECT word FROM word WHERE type = 'P') w
|
||||||
ON pc = word) x
|
ON pc = word) x
|
||||||
WHERE pc is null or word is null""")
|
WHERE pc is null or word is null""")
|
||||||
|
|
||||||
@@ -288,15 +288,15 @@ class LegacyICUNameAnalyzer:
|
|||||||
to_delete.append(word)
|
to_delete.append(word)
|
||||||
else:
|
else:
|
||||||
copystr.add(self.name_processor.get_search_normalized(postcode),
|
copystr.add(self.name_processor.get_search_normalized(postcode),
|
||||||
'P', json.dumps({'postcode': postcode}))
|
'P', postcode)
|
||||||
|
|
||||||
if to_delete:
|
if to_delete:
|
||||||
cur.execute("""DELETE FROM WORD
|
cur.execute("""DELETE FROM WORD
|
||||||
WHERE type ='P' and info->>'postcode' = any(%s)
|
WHERE type ='P' and word = any(%s)
|
||||||
""", (to_delete, ))
|
""", (to_delete, ))
|
||||||
|
|
||||||
copystr.copy_out(cur, 'word',
|
copystr.copy_out(cur, 'word',
|
||||||
columns=['word_token', 'type', 'info'])
|
columns=['word_token', 'type', 'word'])
|
||||||
|
|
||||||
|
|
||||||
def update_special_phrases(self, phrases, should_replace):
|
def update_special_phrases(self, phrases, should_replace):
|
||||||
@@ -311,9 +311,9 @@ class LegacyICUNameAnalyzer:
|
|||||||
with self.conn.cursor() as cur:
|
with self.conn.cursor() as cur:
|
||||||
# Get the old phrases.
|
# Get the old phrases.
|
||||||
existing_phrases = set()
|
existing_phrases = set()
|
||||||
cur.execute("SELECT info FROM word WHERE type = 'S'")
|
cur.execute("SELECT word, info FROM word WHERE type = 'S'")
|
||||||
for (info, ) in cur:
|
for word, info in cur:
|
||||||
existing_phrases.add((info['word'], info['class'], info['type'],
|
existing_phrases.add((word, info['class'], info['type'],
|
||||||
info.get('op') or '-'))
|
info.get('op') or '-'))
|
||||||
|
|
||||||
added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
|
added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
|
||||||
@@ -337,13 +337,13 @@ class LegacyICUNameAnalyzer:
|
|||||||
for word, cls, typ, oper in to_add:
|
for word, cls, typ, oper in to_add:
|
||||||
term = self.name_processor.get_search_normalized(word)
|
term = self.name_processor.get_search_normalized(word)
|
||||||
if term:
|
if term:
|
||||||
copystr.add(term, 'S',
|
copystr.add(term, 'S', word,
|
||||||
json.dumps({'word': word, 'class': cls, 'type': typ,
|
json.dumps({'class': cls, 'type': typ,
|
||||||
'op': oper if oper in ('in', 'near') else None}))
|
'op': oper if oper in ('in', 'near') else None}))
|
||||||
added += 1
|
added += 1
|
||||||
|
|
||||||
copystr.copy_out(cursor, 'word',
|
copystr.copy_out(cursor, 'word',
|
||||||
columns=['word_token', 'type', 'info'])
|
columns=['word_token', 'type', 'word', 'info'])
|
||||||
|
|
||||||
return added
|
return added
|
||||||
|
|
||||||
@@ -358,7 +358,7 @@ class LegacyICUNameAnalyzer:
|
|||||||
if to_delete:
|
if to_delete:
|
||||||
cursor.execute_values(
|
cursor.execute_values(
|
||||||
""" DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
|
""" DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
|
||||||
WHERE info->>'word' = name
|
WHERE type = 'S' and word = name
|
||||||
and info->>'class' = in_class and info->>'type' = in_type
|
and info->>'class' = in_class and info->>'type' = in_type
|
||||||
and ((op = '-' and info->>'op' is null) or op = info->>'op')
|
and ((op = '-' and info->>'op' is null) or op = info->>'op')
|
||||||
""", to_delete)
|
""", to_delete)
|
||||||
@@ -378,14 +378,14 @@ class LegacyICUNameAnalyzer:
|
|||||||
with self.conn.cursor() as cur:
|
with self.conn.cursor() as cur:
|
||||||
# Get existing names
|
# Get existing names
|
||||||
cur.execute("""SELECT word_token FROM word
|
cur.execute("""SELECT word_token FROM word
|
||||||
WHERE type = 'C' and info->>'cc'= %s""",
|
WHERE type = 'C' and word = %s""",
|
||||||
(country_code, ))
|
(country_code, ))
|
||||||
word_tokens.difference_update((t[0] for t in cur))
|
word_tokens.difference_update((t[0] for t in cur))
|
||||||
|
|
||||||
# Only add those names that are not yet in the list.
|
# Only add those names that are not yet in the list.
|
||||||
if word_tokens:
|
if word_tokens:
|
||||||
cur.execute("""INSERT INTO word (word_token, type, info)
|
cur.execute("""INSERT INTO word (word_token, type, word)
|
||||||
(SELECT token, 'C', json_build_object('cc', %s)
|
(SELECT token, 'C', %s
|
||||||
FROM unnest(%s) as token)
|
FROM unnest(%s) as token)
|
||||||
""", (country_code, list(word_tokens)))
|
""", (country_code, list(word_tokens)))
|
||||||
|
|
||||||
@@ -503,12 +503,11 @@ class LegacyICUNameAnalyzer:
|
|||||||
|
|
||||||
with self.conn.cursor() as cur:
|
with self.conn.cursor() as cur:
|
||||||
# no word_id needed for postcodes
|
# no word_id needed for postcodes
|
||||||
cur.execute("""INSERT INTO word (word_token, type, info)
|
cur.execute("""INSERT INTO word (word_token, type, word)
|
||||||
(SELECT %s, 'P', json_build_object('postcode', pc)
|
(SELECT %s, 'P', pc FROM (VALUES (%s)) as v(pc)
|
||||||
FROM (VALUES (%s)) as v(pc)
|
|
||||||
WHERE NOT EXISTS
|
WHERE NOT EXISTS
|
||||||
(SELECT * FROM word
|
(SELECT * FROM word
|
||||||
WHERE type = 'P' and info->>'postcode' = pc))
|
WHERE type = 'P' and word = pc))
|
||||||
""", (term, postcode))
|
""", (term, postcode))
|
||||||
self._cache.postcodes.add(postcode)
|
self._cache.postcodes.add(postcode)
|
||||||
|
|
||||||
|
|||||||
@@ -266,22 +266,6 @@ def check_location_postcode(context):
|
|||||||
|
|
||||||
db_row.assert_row(row, ('country', 'postcode'))
|
db_row.assert_row(row, ('country', 'postcode'))
|
||||||
|
|
||||||
@then("word contains(?P<exclude> not)?")
|
|
||||||
def check_word_table(context, exclude):
|
|
||||||
""" Check the contents of the word table. Each row represents a table row
|
|
||||||
and all data must match. Data not present in the expected table, may
|
|
||||||
be arbitry. The rows are identified via all given columns.
|
|
||||||
"""
|
|
||||||
with context.db.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
|
|
||||||
for row in context.table:
|
|
||||||
wheres = ' AND '.join(["{} = %s".format(h) for h in row.headings])
|
|
||||||
cur.execute("SELECT * from word WHERE " + wheres, list(row.cells))
|
|
||||||
if exclude:
|
|
||||||
assert cur.rowcount == 0, "Row still in word table: %s" % '/'.join(values)
|
|
||||||
else:
|
|
||||||
assert cur.rowcount > 0, "Row not in word table: %s" % '/'.join(values)
|
|
||||||
|
|
||||||
|
|
||||||
@then("there are(?P<exclude> no)? word tokens for postcodes (?P<postcodes>.*)")
|
@then("there are(?P<exclude> no)? word tokens for postcodes (?P<postcodes>.*)")
|
||||||
def check_word_table_for_postcodes(context, exclude, postcodes):
|
def check_word_table_for_postcodes(context, exclude, postcodes):
|
||||||
""" Check that the tokenizer produces postcode tokens for the given
|
""" Check that the tokenizer produces postcode tokens for the given
|
||||||
@@ -297,8 +281,7 @@ def check_word_table_for_postcodes(context, exclude, postcodes):
|
|||||||
|
|
||||||
with context.db.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
|
with context.db.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
|
||||||
if nctx.tokenizer == 'legacy_icu':
|
if nctx.tokenizer == 'legacy_icu':
|
||||||
cur.execute("""SELECT info->>'postcode' FROM word
|
cur.execute("SELECT word FROM word WHERE type = 'P' and word = any(%s)",
|
||||||
WHERE type = 'P' and info->>'postcode' = any(%s)""",
|
|
||||||
(plist,))
|
(plist,))
|
||||||
else:
|
else:
|
||||||
cur.execute("""SELECT word FROM word WHERE word = any(%s)
|
cur.execute("""SELECT word FROM word WHERE word = any(%s)
|
||||||
|
|||||||
@@ -12,16 +12,16 @@ class MockIcuWordTable:
|
|||||||
cur.execute("""CREATE TABLE word (word_id INTEGER,
|
cur.execute("""CREATE TABLE word (word_id INTEGER,
|
||||||
word_token text NOT NULL,
|
word_token text NOT NULL,
|
||||||
type text NOT NULL,
|
type text NOT NULL,
|
||||||
|
word text,
|
||||||
info jsonb)""")
|
info jsonb)""")
|
||||||
|
|
||||||
conn.commit()
|
conn.commit()
|
||||||
|
|
||||||
def add_special(self, word_token, word, cls, typ, oper):
|
def add_special(self, word_token, word, cls, typ, oper):
|
||||||
with self.conn.cursor() as cur:
|
with self.conn.cursor() as cur:
|
||||||
cur.execute("""INSERT INTO word (word_token, type, info)
|
cur.execute("""INSERT INTO word (word_token, type, word, info)
|
||||||
VALUES (%s, 'S',
|
VALUES (%s, 'S', %s,
|
||||||
json_build_object('word', %s,
|
json_build_object('class', %s,
|
||||||
'class', %s,
|
|
||||||
'type', %s,
|
'type', %s,
|
||||||
'op', %s))
|
'op', %s))
|
||||||
""", (word_token, word, cls, typ, oper))
|
""", (word_token, word, cls, typ, oper))
|
||||||
@@ -30,16 +30,16 @@ class MockIcuWordTable:
|
|||||||
|
|
||||||
def add_country(self, country_code, word_token):
|
def add_country(self, country_code, word_token):
|
||||||
with self.conn.cursor() as cur:
|
with self.conn.cursor() as cur:
|
||||||
cur.execute("""INSERT INTO word (word_token, type, info)
|
cur.execute("""INSERT INTO word (word_token, type, word)
|
||||||
VALUES(%s, 'C', json_build_object('cc', %s))""",
|
VALUES(%s, 'C', %s)""",
|
||||||
(word_token, country_code))
|
(word_token, country_code))
|
||||||
self.conn.commit()
|
self.conn.commit()
|
||||||
|
|
||||||
|
|
||||||
def add_postcode(self, word_token, postcode):
|
def add_postcode(self, word_token, postcode):
|
||||||
with self.conn.cursor() as cur:
|
with self.conn.cursor() as cur:
|
||||||
cur.execute("""INSERT INTO word (word_token, type, info)
|
cur.execute("""INSERT INTO word (word_token, type, word)
|
||||||
VALUES (%s, 'P', json_build_object('postcode', %s))
|
VALUES (%s, 'P', %s)
|
||||||
""", (word_token, postcode))
|
""", (word_token, postcode))
|
||||||
self.conn.commit()
|
self.conn.commit()
|
||||||
|
|
||||||
@@ -56,8 +56,8 @@ class MockIcuWordTable:
|
|||||||
|
|
||||||
def get_special(self):
|
def get_special(self):
|
||||||
with self.conn.cursor() as cur:
|
with self.conn.cursor() as cur:
|
||||||
cur.execute("SELECT word_token, info FROM word WHERE type = 'S'")
|
cur.execute("SELECT word_token, info, word FROM word WHERE type = 'S'")
|
||||||
result = set(((row[0], row[1]['word'], row[1]['class'],
|
result = set(((row[0], row[2], row[1]['class'],
|
||||||
row[1]['type'], row[1]['op']) for row in cur))
|
row[1]['type'], row[1]['op']) for row in cur))
|
||||||
assert len(result) == cur.rowcount, "Word table has duplicates."
|
assert len(result) == cur.rowcount, "Word table has duplicates."
|
||||||
return result
|
return result
|
||||||
@@ -65,7 +65,7 @@ class MockIcuWordTable:
|
|||||||
|
|
||||||
def get_country(self):
|
def get_country(self):
|
||||||
with self.conn.cursor() as cur:
|
with self.conn.cursor() as cur:
|
||||||
cur.execute("SELECT info->>'cc', word_token FROM word WHERE type = 'C'")
|
cur.execute("SELECT word, word_token FROM word WHERE type = 'C'")
|
||||||
result = set((tuple(row) for row in cur))
|
result = set((tuple(row) for row in cur))
|
||||||
assert len(result) == cur.rowcount, "Word table has duplicates."
|
assert len(result) == cur.rowcount, "Word table has duplicates."
|
||||||
return result
|
return result
|
||||||
@@ -73,7 +73,7 @@ class MockIcuWordTable:
|
|||||||
|
|
||||||
def get_postcodes(self):
|
def get_postcodes(self):
|
||||||
with self.conn.cursor() as cur:
|
with self.conn.cursor() as cur:
|
||||||
cur.execute("SELECT info->>'postcode' FROM word WHERE type = 'P'")
|
cur.execute("SELECT word FROM word WHERE type = 'P'")
|
||||||
return set((row[0] for row in cur))
|
return set((row[0] for row in cur))
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user