Merge pull request #2408 from lonvia/icu-change-word-table-layout

Change table layout of word table for ICU tokenizer
This commit is contained in:
Sarah Hoffmann
2021-07-28 14:28:49 +02:00
committed by GitHub
16 changed files with 483 additions and 306 deletions

View File

@@ -12,7 +12,7 @@ require_once(CONST_Debug ? 'DebugHtml.php' : 'DebugNone.php');
function userError($sMsg) function userError($sMsg)
{ {
throw new Exception($sMsg, 400); throw new \Exception($sMsg, 400);
} }
@@ -37,7 +37,7 @@ function shutdown_exception_handler_xml()
{ {
$error = error_get_last(); $error = error_get_last();
if ($error !== null && $error['type'] === E_ERROR) { if ($error !== null && $error['type'] === E_ERROR) {
exception_handler_xml(new Exception($error['message'], 500)); exception_handler_xml(new \Exception($error['message'], 500));
} }
} }
@@ -45,7 +45,7 @@ function shutdown_exception_handler_json()
{ {
$error = error_get_last(); $error = error_get_last();
if ($error !== null && $error['type'] === E_ERROR) { if ($error !== null && $error['type'] === E_ERROR) {
exception_handler_json(new Exception($error['message'], 500)); exception_handler_json(new \Exception($error['message'], 500));
} }
} }

View File

@@ -19,13 +19,13 @@ class Tokenizer
public function checkStatus() public function checkStatus()
{ {
$sSQL = "SELECT word_id FROM word WHERE word_token IN (' a')"; $sSQL = 'SELECT word_id FROM word limit 1';
$iWordID = $this->oDB->getOne($sSQL); $iWordID = $this->oDB->getOne($sSQL);
if ($iWordID === false) { if ($iWordID === false) {
throw new Exception('Query failed', 703); throw new \Exception('Query failed', 703);
} }
if (!$iWordID) { if (!$iWordID) {
throw new Exception('No value', 704); throw new \Exception('No value', 704);
} }
} }
@@ -55,9 +55,8 @@ class Tokenizer
{ {
$aResults = array(); $aResults = array();
$sSQL = 'SELECT word_id, class, type FROM word '; $sSQL = "SELECT word_id, info->>'class' as class, info->>'type' as type ";
$sSQL .= ' WHERE word_token = \' \' || :term'; $sSQL .= ' FROM word WHERE word_token = :term and type = \'S\'';
$sSQL .= ' AND class is not null AND class not in (\'place\')';
Debug::printVar('Term', $sTerm); Debug::printVar('Term', $sTerm);
Debug::printSQL($sSQL); Debug::printSQL($sSQL);
@@ -146,8 +145,10 @@ class Tokenizer
private function addTokensFromDB(&$oValidTokens, $aTokens, $sNormQuery) private function addTokensFromDB(&$oValidTokens, $aTokens, $sNormQuery)
{ {
// Check which tokens we have, get the ID numbers // Check which tokens we have, get the ID numbers
$sSQL = 'SELECT word_id, word_token, word, class, type, country_code,'; $sSQL = 'SELECT word_id, word_token, type, word,';
$sSQL .= ' operator, coalesce(search_name_count, 0) as count'; $sSQL .= " info->>'op' as operator,";
$sSQL .= " info->>'class' as class, info->>'type' as ctype,";
$sSQL .= " info->>'count' as count";
$sSQL .= ' FROM word WHERE word_token in ('; $sSQL .= ' FROM word WHERE word_token in (';
$sSQL .= join(',', $this->oDB->getDBQuotedList($aTokens)).')'; $sSQL .= join(',', $this->oDB->getDBQuotedList($aTokens)).')';
@@ -156,66 +157,66 @@ class Tokenizer
$aDBWords = $this->oDB->getAll($sSQL, null, 'Could not get word tokens.'); $aDBWords = $this->oDB->getAll($sSQL, null, 'Could not get word tokens.');
foreach ($aDBWords as $aWord) { foreach ($aDBWords as $aWord) {
$oToken = null;
$iId = (int) $aWord['word_id']; $iId = (int) $aWord['word_id'];
$sTok = $aWord['word_token'];
if ($aWord['class']) { switch ($aWord['type']) {
// Special terms need to appear in their normalized form. case 'C': // country name tokens
// (postcodes are not normalized in the word table) if ($aWord['word'] !== null
$sNormWord = $this->normalizeString($aWord['word']); && (!$this->aCountryRestriction
if ($aWord['word'] && strpos($sNormQuery, $sNormWord) === false) { || in_array($aWord['word'], $this->aCountryRestriction))
continue;
}
if ($aWord['class'] == 'place' && $aWord['type'] == 'house') {
$oToken = new Token\HouseNumber($iId, trim($aWord['word_token']));
} elseif ($aWord['class'] == 'place' && $aWord['type'] == 'postcode') {
if ($aWord['word']
&& pg_escape_string($aWord['word']) == $aWord['word']
) { ) {
$oToken = new Token\Postcode( $oValidTokens->addToken(
$iId, $sTok,
$aWord['word'], new Token\Country($iId, $aWord['word'])
$aWord['country_code']
); );
} }
} else { break;
// near and in operator the same at the moment case 'H': // house number tokens
$oToken = new Token\SpecialTerm( $oValidTokens->addToken($sTok, new Token\HouseNumber($iId, $aWord['word_token']));
break;
case 'P': // postcode tokens
// Postcodes are not normalized, so they may have content
// that makes SQL injection possible. Reject postcodes
// that would need special escaping.
if ($aWord['word'] !== null
&& pg_escape_string($aWord['word']) == $aWord['word']
) {
$sNormPostcode = $this->normalizeString($aWord['word']);
if (strpos($sNormQuery, $sNormPostcode) !== false) {
$oValidTokens->addToken(
$sTok,
new Token\Postcode($iId, $aWord['word'], null)
);
}
}
break;
case 'S': // tokens for classification terms (special phrases)
if ($aWord['class'] !== null && $aWord['ctype'] !== null) {
$oValidTokens->addToken($sTok, new Token\SpecialTerm(
$iId,
$aWord['class'],
$aWord['ctype'],
(isset($aWord['operator'])) ? Operator::NEAR : Operator::NONE
));
}
break;
case 'W': // full-word tokens
$oValidTokens->addToken($sTok, new Token\Word(
$iId, $iId,
$aWord['class'], (int) $aWord['count'],
$aWord['type'], substr_count($aWord['word_token'], ' ')
$aWord['operator'] ? Operator::NEAR : Operator::NONE ));
); break;
} case 'w': // partial word terms
} elseif ($aWord['country_code']) { $oValidTokens->addToken($sTok, new Token\Partial(
// Filter country tokens that do not match restricted countries. $iId,
if (!$this->aCountryRestriction $aWord['word_token'],
|| in_array($aWord['country_code'], $this->aCountryRestriction) (int) $aWord['count']
) { ));
$oToken = new Token\Country($iId, $aWord['country_code']); break;
} default:
} elseif ($aWord['word_token'][0] == ' ') { break;
$oToken = new Token\Word(
$iId,
(int) $aWord['count'],
substr_count($aWord['word_token'], ' ')
);
} else {
$oToken = new Token\Partial(
$iId,
$aWord['word_token'],
(int) $aWord['count']
);
}
if ($oToken) {
// remove any leading spaces
if ($aWord['word_token'][0] == ' ') {
$oValidTokens->addToken(substr($aWord['word_token'], 1), $oToken);
} else {
$oValidTokens->addToken($aWord['word_token'], $oToken);
}
} }
} }
} }
@@ -234,12 +235,10 @@ class Tokenizer
for ($i = 0; $i < $iNumWords; $i++) { for ($i = 0; $i < $iNumWords; $i++) {
$sPhrase = $aWords[$i]; $sPhrase = $aWords[$i];
$aTokens[' '.$sPhrase] = ' '.$sPhrase;
$aTokens[$sPhrase] = $sPhrase; $aTokens[$sPhrase] = $sPhrase;
for ($j = $i + 1; $j < $iNumWords; $j++) { for ($j = $i + 1; $j < $iNumWords; $j++) {
$sPhrase .= ' '.$aWords[$j]; $sPhrase .= ' '.$aWords[$j];
$aTokens[' '.$sPhrase] = ' '.$sPhrase;
$aTokens[$sPhrase] = $sPhrase; $aTokens[$sPhrase] = $sPhrase;
} }
} }

View File

@@ -19,20 +19,20 @@ class Tokenizer
{ {
$sStandardWord = $this->oDB->getOne("SELECT make_standard_name('a')"); $sStandardWord = $this->oDB->getOne("SELECT make_standard_name('a')");
if ($sStandardWord === false) { if ($sStandardWord === false) {
throw new Exception('Module failed', 701); throw new \Exception('Module failed', 701);
} }
if ($sStandardWord != 'a') { if ($sStandardWord != 'a') {
throw new Exception('Module call failed', 702); throw new \Exception('Module call failed', 702);
} }
$sSQL = "SELECT word_id FROM word WHERE word_token IN (' a')"; $sSQL = "SELECT word_id FROM word WHERE word_token IN (' a')";
$iWordID = $this->oDB->getOne($sSQL); $iWordID = $this->oDB->getOne($sSQL);
if ($iWordID === false) { if ($iWordID === false) {
throw new Exception('Query failed', 703); throw new \Exception('Query failed', 703);
} }
if (!$iWordID) { if (!$iWordID) {
throw new Exception('No value', 704); throw new \Exception('No value', 704);
} }
} }

View File

@@ -83,7 +83,7 @@ if ($sOsmType && $iOsmId > 0) {
} }
if ($sPlaceId === false) { if ($sPlaceId === false) {
throw new Exception('No place with that OSM ID found.', 404); throw new \Exception('No place with that OSM ID found.', 404);
} }
} else { } else {
if ($sPlaceId === false) { if ($sPlaceId === false) {
@@ -146,7 +146,7 @@ $sSQL .= " WHERE place_id = $iPlaceID";
$aPointDetails = $oDB->getRow($sSQL, null, 'Could not get details of place object.'); $aPointDetails = $oDB->getRow($sSQL, null, 'Could not get details of place object.');
if (!$aPointDetails) { if (!$aPointDetails) {
throw new Exception('No place with that place ID found.', 404); throw new \Exception('No place with that place ID found.', 404);
} }
$aPointDetails['localname'] = $aPointDetails['localname']?$aPointDetails['localname']:$aPointDetails['housenumber']; $aPointDetails['localname'] = $aPointDetails['localname']?$aPointDetails['localname']:$aPointDetails['housenumber'];

View File

@@ -0,0 +1,29 @@
-- Word table layout for the ICU tokenizer.
-- One row per token. The one-letter `type` code distinguishes the kind
-- of token (as used by the partial indexes below and the PHP tokenizer):
--   'C' country name, 'H' house number, 'P' postcode,
--   'S' special phrase (classification term),
--   'W' full word, 'w' partial word.
-- `word` carries the original (normalized) word where needed; `info`
-- holds extra attributes as JSON (e.g. 'count' for words,
-- 'class'/'type'/'op' for special phrases).
DROP TABLE IF EXISTS word;
CREATE TABLE word (
  word_id INTEGER,
  word_token text NOT NULL,
  type text NOT NULL,
  word text,
  info jsonb
) {{db.tablespace.search_data}};

-- Main lookup index, used at query time to resolve token strings.
CREATE INDEX idx_word_word_token ON word
  USING BTREE (word_token) {{db.tablespace.search_index}};

-- Used when updating country names from the boundary relation.
CREATE INDEX idx_word_country_names ON word
  USING btree(word) {{db.tablespace.address_index}}
  WHERE type = 'C';

-- Used when inserting new postcodes on updates.
CREATE INDEX idx_word_postcodes ON word
  USING btree(word) {{db.tablespace.address_index}}
  WHERE type = 'P';

-- Used when inserting full words.
CREATE INDEX idx_word_full_word ON word
  USING btree(word) {{db.tablespace.address_index}}
  WHERE type = 'W';

-- The web frontend only ever reads from the word table.
GRANT SELECT ON word TO "{{config.DATABASE_WEBUSER}}";

-- Sequence used to allocate new word_ids (via nextval('seq_word')
-- in the getorcreate_* SQL functions).
DROP SEQUENCE IF EXISTS seq_word;
CREATE SEQUENCE seq_word start 1;
GRANT SELECT ON seq_word to "{{config.DATABASE_WEBUSER}}";

View File

@@ -98,12 +98,14 @@ DECLARE
term_count INTEGER; term_count INTEGER;
BEGIN BEGIN
SELECT min(word_id) INTO full_token SELECT min(word_id) INTO full_token
FROM word WHERE word = norm_term and class is null and country_code is null; FROM word WHERE word = norm_term and type = 'W';
IF full_token IS NULL THEN IF full_token IS NULL THEN
full_token := nextval('seq_word'); full_token := nextval('seq_word');
INSERT INTO word (word_id, word_token, word, search_name_count) INSERT INTO word (word_id, word_token, type, word, info)
SELECT full_token, ' ' || lookup_term, norm_term, 0 FROM unnest(lookup_terms) as lookup_term; SELECT full_token, lookup_term, 'W', norm_term,
json_build_object('count', 0)
FROM unnest(lookup_terms) as lookup_term;
END IF; END IF;
FOR term IN SELECT unnest(string_to_array(unnest(lookup_terms), ' ')) LOOP FOR term IN SELECT unnest(string_to_array(unnest(lookup_terms), ' ')) LOOP
@@ -115,14 +117,14 @@ BEGIN
partial_tokens := '{}'::INT[]; partial_tokens := '{}'::INT[];
FOR term IN SELECT unnest(partial_terms) LOOP FOR term IN SELECT unnest(partial_terms) LOOP
SELECT min(word_id), max(search_name_count) INTO term_id, term_count SELECT min(word_id), max(info->>'count') INTO term_id, term_count
FROM word WHERE word_token = term and class is null and country_code is null; FROM word WHERE word_token = term and type = 'w';
IF term_id IS NULL THEN IF term_id IS NULL THEN
term_id := nextval('seq_word'); term_id := nextval('seq_word');
term_count := 0; term_count := 0;
INSERT INTO word (word_id, word_token, search_name_count) INSERT INTO word (word_id, word_token, type, info)
VALUES (term_id, term, 0); VALUES (term_id, term, 'w', json_build_object('count', term_count));
END IF; END IF;
IF term_count < {{ max_word_freq }} THEN IF term_count < {{ max_word_freq }} THEN
@@ -140,15 +142,13 @@ CREATE OR REPLACE FUNCTION getorcreate_hnr_id(lookup_term TEXT)
DECLARE DECLARE
return_id INTEGER; return_id INTEGER;
BEGIN BEGIN
SELECT min(word_id) INTO return_id SELECT min(word_id) INTO return_id FROM word
FROM word WHERE word_token = lookup_term and type = 'H';
WHERE word_token = ' ' || lookup_term
and class = 'place' and type = 'house';
IF return_id IS NULL THEN IF return_id IS NULL THEN
return_id := nextval('seq_word'); return_id := nextval('seq_word');
INSERT INTO word (word_id, word_token, class, type, search_name_count) INSERT INTO word (word_id, word_token, type)
VALUES (return_id, ' ' || lookup_term, 'place', 'house', 0); VALUES (return_id, lookup_term, 'H');
END IF; END IF;
RETURN return_id; RETURN return_id;

View File

@@ -65,6 +65,7 @@ _SQL_TRANSLATION = {ord(u'\\'): u'\\\\',
ord(u'\t'): u'\\t', ord(u'\t'): u'\\t',
ord(u'\n'): u'\\n'} ord(u'\n'): u'\\n'}
class CopyBuffer: class CopyBuffer:
""" Data collector for the copy_from command. """ Data collector for the copy_from command.
""" """

View File

@@ -4,6 +4,7 @@ libICU instead of the PostgreSQL module.
""" """
from collections import Counter from collections import Counter
import itertools import itertools
import json
import logging import logging
import re import re
from textwrap import dedent from textwrap import dedent
@@ -74,13 +75,10 @@ class LegacyICUTokenizer:
self.max_word_frequency = get_property(conn, DBCFG_MAXWORDFREQ) self.max_word_frequency = get_property(conn, DBCFG_MAXWORDFREQ)
def finalize_import(self, config): def finalize_import(self, _):
""" Do any required postprocessing to make the tokenizer data ready """ Do any required postprocessing to make the tokenizer data ready
for use. for use.
""" """
with connect(self.dsn) as conn:
sqlp = SQLPreprocessor(conn, config)
sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')
def update_sql_functions(self, config): def update_sql_functions(self, config):
@@ -121,18 +119,17 @@ class LegacyICUTokenizer:
""" """
return LegacyICUNameAnalyzer(self.dsn, ICUNameProcessor(self.naming_rules)) return LegacyICUNameAnalyzer(self.dsn, ICUNameProcessor(self.naming_rules))
# pylint: disable=missing-format-attribute
def _install_php(self, phpdir): def _install_php(self, phpdir):
""" Install the php script for the tokenizer. """ Install the php script for the tokenizer.
""" """
php_file = self.data_dir / "tokenizer.php" php_file = self.data_dir / "tokenizer.php"
php_file.write_text(dedent("""\ php_file.write_text(dedent(f"""\
<?php <?php
@define('CONST_Max_Word_Frequency', {0.max_word_frequency}); @define('CONST_Max_Word_Frequency', {self.max_word_frequency});
@define('CONST_Term_Normalization_Rules', "{0.term_normalization}"); @define('CONST_Term_Normalization_Rules', "{self.term_normalization}");
@define('CONST_Transliteration', "{0.naming_rules.search_rules}"); @define('CONST_Transliteration', "{self.naming_rules.search_rules}");
require_once('{1}/tokenizer/legacy_icu_tokenizer.php'); require_once('{phpdir}/tokenizer/legacy_icu_tokenizer.php');"""))
""".format(self, phpdir)))
def _save_config(self, config): def _save_config(self, config):
@@ -152,40 +149,48 @@ class LegacyICUTokenizer:
""" """
with connect(self.dsn) as conn: with connect(self.dsn) as conn:
sqlp = SQLPreprocessor(conn, config) sqlp = SQLPreprocessor(conn, config)
sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_tables.sql') sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
conn.commit() conn.commit()
LOG.warning("Precomputing word tokens") LOG.warning("Precomputing word tokens")
# get partial words and their frequencies # get partial words and their frequencies
words = Counter() words = self._count_partial_terms(conn)
name_proc = ICUNameProcessor(self.naming_rules)
with conn.cursor(name="words") as cur:
cur.execute(""" SELECT v, count(*) FROM
(SELECT svals(name) as v FROM place)x
WHERE length(v) < 75 GROUP BY v""")
for name, cnt in cur:
terms = set()
for word in name_proc.get_variants_ascii(name_proc.get_normalized(name)):
if ' ' in word:
terms.update(word.split())
for term in terms:
words[term] += cnt
# copy them back into the word table # copy them back into the word table
with CopyBuffer() as copystr: with CopyBuffer() as copystr:
for args in words.items(): for term, cnt in words.items():
copystr.add(*args) copystr.add('w', term, json.dumps({'count': cnt}))
with conn.cursor() as cur: with conn.cursor() as cur:
copystr.copy_out(cur, 'word', copystr.copy_out(cur, 'word',
columns=['word_token', 'search_name_count']) columns=['type', 'word_token', 'info'])
cur.execute("""UPDATE word SET word_id = nextval('seq_word') cur.execute("""UPDATE word SET word_id = nextval('seq_word')
WHERE word_id is null""") WHERE word_id is null and type = 'w'""")
conn.commit() conn.commit()
def _count_partial_terms(self, conn):
""" Count the partial terms from the names in the place table.
"""
words = Counter()
name_proc = ICUNameProcessor(self.naming_rules)
with conn.cursor(name="words") as cur:
cur.execute(""" SELECT v, count(*) FROM
(SELECT svals(name) as v FROM place)x
WHERE length(v) < 75 GROUP BY v""")
for name, cnt in cur:
terms = set()
for word in name_proc.get_variants_ascii(name_proc.get_normalized(name)):
if ' ' in word:
terms.update(word.split())
for term in terms:
words[term] += cnt
return words
class LegacyICUNameAnalyzer: class LegacyICUNameAnalyzer:
""" The legacy analyzer uses the ICU library for splitting names. """ The legacy analyzer uses the ICU library for splitting names.
@@ -229,22 +234,26 @@ class LegacyICUNameAnalyzer:
The function is used for testing and debugging only The function is used for testing and debugging only
and not necessarily efficient. and not necessarily efficient.
""" """
tokens = {} full_tokens = {}
partial_tokens = {}
for word in words: for word in words:
if word.startswith('#'): if word.startswith('#'):
tokens[word] = ' ' + self.name_processor.get_search_normalized(word[1:]) full_tokens[word] = self.name_processor.get_search_normalized(word[1:])
else: else:
tokens[word] = self.name_processor.get_search_normalized(word) partial_tokens[word] = self.name_processor.get_search_normalized(word)
with self.conn.cursor() as cur: with self.conn.cursor() as cur:
cur.execute("""SELECT word_token, word_id cur.execute("""SELECT word_token, word_id
FROM word, (SELECT unnest(%s::TEXT[]) as term) t FROM word WHERE word_token = ANY(%s) and type = 'W'
WHERE word_token = t.term """, (list(full_tokens.values()),))
and class is null and country_code is null""", full_ids = {r[0]: r[1] for r in cur}
(list(tokens.values()), )) cur.execute("""SELECT word_token, word_id
ids = {r[0]: r[1] for r in cur} FROM word WHERE word_token = ANY(%s) and type = 'w'""",
(list(partial_tokens.values()),))
part_ids = {r[0]: r[1] for r in cur}
return [(k, v, ids.get(v, None)) for k, v in tokens.items()] return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
+ [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]
@staticmethod @staticmethod
@@ -276,8 +285,7 @@ class LegacyICUNameAnalyzer:
(SELECT pc, word FROM (SELECT pc, word FROM
(SELECT distinct(postcode) as pc FROM location_postcode) p (SELECT distinct(postcode) as pc FROM location_postcode) p
FULL JOIN FULL JOIN
(SELECT word FROM word (SELECT word FROM word WHERE type = 'P') w
WHERE class ='place' and type = 'postcode') w
ON pc = word) x ON pc = word) x
WHERE pc is null or word is null""") WHERE pc is null or word is null""")
@@ -286,24 +294,23 @@ class LegacyICUNameAnalyzer:
if postcode is None: if postcode is None:
to_delete.append(word) to_delete.append(word)
else: else:
copystr.add( copystr.add(self.name_processor.get_search_normalized(postcode),
postcode, 'P', postcode)
' ' + self.name_processor.get_search_normalized(postcode),
'place', 'postcode', 0)
if to_delete: if to_delete:
cur.execute("""DELETE FROM WORD cur.execute("""DELETE FROM WORD
WHERE class ='place' and type = 'postcode' WHERE type ='P' and word = any(%s)
and word = any(%s)
""", (to_delete, )) """, (to_delete, ))
copystr.copy_out(cur, 'word', copystr.copy_out(cur, 'word',
columns=['word', 'word_token', 'class', 'type', columns=['word_token', 'type', 'word'])
'search_name_count'])
def update_special_phrases(self, phrases, should_replace): def update_special_phrases(self, phrases, should_replace):
""" Replace the search index for special phrases with the new phrases. """ Replace the search index for special phrases with the new phrases.
If `should_replace` is True, then the previous set of will be
completely replaced. Otherwise the phrases are added to the
already existing ones.
""" """
norm_phrases = set(((self.name_processor.get_normalized(p[0]), p[1], p[2], p[3]) norm_phrases = set(((self.name_processor.get_normalized(p[0]), p[1], p[2], p[3])
for p in phrases)) for p in phrases))
@@ -311,11 +318,10 @@ class LegacyICUNameAnalyzer:
with self.conn.cursor() as cur: with self.conn.cursor() as cur:
# Get the old phrases. # Get the old phrases.
existing_phrases = set() existing_phrases = set()
cur.execute("""SELECT word, class, type, operator FROM word cur.execute("SELECT word, info FROM word WHERE type = 'S'")
WHERE class != 'place' for word, info in cur:
OR (type != 'house' AND type != 'postcode')""") existing_phrases.add((word, info['class'], info['type'],
for label, cls, typ, oper in cur: info.get('op') or '-'))
existing_phrases.add((label, cls, typ, oper or '-'))
added = self._add_special_phrases(cur, norm_phrases, existing_phrases) added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
if should_replace: if should_replace:
@@ -338,13 +344,13 @@ class LegacyICUNameAnalyzer:
for word, cls, typ, oper in to_add: for word, cls, typ, oper in to_add:
term = self.name_processor.get_search_normalized(word) term = self.name_processor.get_search_normalized(word)
if term: if term:
copystr.add(word, ' ' + term, cls, typ, copystr.add(term, 'S', word,
oper if oper in ('in', 'near') else None, 0) json.dumps({'class': cls, 'type': typ,
'op': oper if oper in ('in', 'near') else None}))
added += 1 added += 1
copystr.copy_out(cursor, 'word', copystr.copy_out(cursor, 'word',
columns=['word', 'word_token', 'class', 'type', columns=['word_token', 'type', 'word', 'info'])
'operator', 'search_name_count'])
return added return added
@@ -359,9 +365,10 @@ class LegacyICUNameAnalyzer:
if to_delete: if to_delete:
cursor.execute_values( cursor.execute_values(
""" DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op) """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
WHERE word = name and class = in_class and type = in_type WHERE type = 'S' and word = name
and ((op = '-' and operator is null) or op = operator)""", and info->>'class' = in_class and info->>'type' = in_type
to_delete) and ((op = '-' and info->>'op' is null) or op = info->>'op')
""", to_delete)
return len(to_delete) return len(to_delete)
@@ -371,22 +378,28 @@ class LegacyICUNameAnalyzer:
""" """
word_tokens = set() word_tokens = set()
for name in self._compute_full_names(names): for name in self._compute_full_names(names):
if name: norm_name = self.name_processor.get_search_normalized(name)
word_tokens.add(' ' + self.name_processor.get_search_normalized(name)) if norm_name:
word_tokens.add(norm_name)
with self.conn.cursor() as cur: with self.conn.cursor() as cur:
# Get existing names # Get existing names
cur.execute("SELECT word_token FROM word WHERE country_code = %s", cur.execute("""SELECT word_token FROM word
WHERE type = 'C' and word = %s""",
(country_code, )) (country_code, ))
word_tokens.difference_update((t[0] for t in cur)) word_tokens.difference_update((t[0] for t in cur))
# Only add those names that are not yet in the list.
if word_tokens: if word_tokens:
cur.execute("""INSERT INTO word (word_id, word_token, country_code, cur.execute("""INSERT INTO word (word_token, type, word)
search_name_count) (SELECT token, 'C', %s
(SELECT nextval('seq_word'), token, %s, 0
FROM unnest(%s) as token) FROM unnest(%s) as token)
""", (country_code, list(word_tokens))) """, (country_code, list(word_tokens)))
# No names are deleted at the moment.
# If deletion is made possible, then the static names from the
# initial 'country_name' table should be kept.
def process_place(self, place): def process_place(self, place):
""" Determine tokenizer information about the given place. """ Determine tokenizer information about the given place.
@@ -497,14 +510,12 @@ class LegacyICUNameAnalyzer:
with self.conn.cursor() as cur: with self.conn.cursor() as cur:
# no word_id needed for postcodes # no word_id needed for postcodes
cur.execute("""INSERT INTO word (word, word_token, class, type, cur.execute("""INSERT INTO word (word_token, type, word)
search_name_count) (SELECT %s, 'P', pc FROM (VALUES (%s)) as v(pc)
(SELECT pc, %s, 'place', 'postcode', 0
FROM (VALUES (%s)) as v(pc)
WHERE NOT EXISTS WHERE NOT EXISTS
(SELECT * FROM word (SELECT * FROM word
WHERE word = pc and class='place' and type='postcode')) WHERE type = 'P' and word = pc))
""", (' ' + term, postcode)) """, (term, postcode))
self._cache.postcodes.add(postcode) self._cache.postcodes.add(postcode)
@@ -595,7 +606,8 @@ class _TokenCache:
def get_hnr_tokens(self, conn, terms): def get_hnr_tokens(self, conn, terms):
""" Get token ids for a list of housenumbers, looking them up in the """ Get token ids for a list of housenumbers, looking them up in the
database if necessary. database if necessary. `terms` is an iterable of normalized
housenumbers.
""" """
tokens = [] tokens = []
askdb = [] askdb = []

View File

@@ -134,9 +134,7 @@ Feature: Import of postcodes
Then location_postcode contains exactly Then location_postcode contains exactly
| country | postcode | geometry | | country | postcode | geometry |
| de | 01982 | country:de | | de | 01982 | country:de |
And word contains And there are word tokens for postcodes 01982
| word | class | type |
| 01982 | place | postcode |
Scenario: Different postcodes with the same normalization can both be found Scenario: Different postcodes with the same normalization can both be found
Given the places Given the places

View File

@@ -18,10 +18,7 @@ Feature: Update of postcode
| country | postcode | geometry | | country | postcode | geometry |
| de | 01982 | country:de | | de | 01982 | country:de |
| ch | 4567 | country:ch | | ch | 4567 | country:ch |
And word contains And there are word tokens for postcodes 01982,4567
| word | class | type |
| 01982 | place | postcode |
| 4567 | place | postcode |
Scenario: When the last postcode is deleted, it is deleted from postcode and word Scenario: When the last postcode is deleted, it is deleted from postcode and word
Given the places Given the places
@@ -34,12 +31,8 @@ Feature: Update of postcode
Then location_postcode contains exactly Then location_postcode contains exactly
| country | postcode | geometry | | country | postcode | geometry |
| ch | 4567 | country:ch | | ch | 4567 | country:ch |
And word contains not And there are word tokens for postcodes 4567
| word | class | type | And there are no word tokens for postcodes 01982
| 01982 | place | postcode |
And word contains
| word | class | type |
| 4567 | place | postcode |
Scenario: A postcode is not deleted from postcode and word when it exist in another country Scenario: A postcode is not deleted from postcode and word when it exist in another country
Given the places Given the places
@@ -52,9 +45,7 @@ Feature: Update of postcode
Then location_postcode contains exactly Then location_postcode contains exactly
| country | postcode | geometry | | country | postcode | geometry |
| ch | 01982 | country:ch | | ch | 01982 | country:ch |
And word contains And there are word tokens for postcodes 01982
| word | class | type |
| 01982 | place | postcode |
Scenario: Updating a postcode is reflected in postcode table Scenario: Updating a postcode is reflected in postcode table
Given the places Given the places
@@ -68,9 +59,7 @@ Feature: Update of postcode
Then location_postcode contains exactly Then location_postcode contains exactly
| country | postcode | geometry | | country | postcode | geometry |
| de | 20453 | country:de | | de | 20453 | country:de |
And word contains And there are word tokens for postcodes 20453
| word | class | type |
| 20453 | place | postcode |
Scenario: When changing from a postcode type, the entry appears in placex Scenario: When changing from a postcode type, the entry appears in placex
When importing When importing
@@ -91,9 +80,7 @@ Feature: Update of postcode
Then location_postcode contains exactly Then location_postcode contains exactly
| country | postcode | geometry | | country | postcode | geometry |
| de | 20453 | country:de | | de | 20453 | country:de |
And word contains And there are word tokens for postcodes 20453
| word | class | type |
| 20453 | place | postcode |
Scenario: When changing to a postcode type, the entry disappears from placex Scenario: When changing to a postcode type, the entry disappears from placex
When importing When importing
@@ -114,6 +101,4 @@ Feature: Update of postcode
Then location_postcode contains exactly Then location_postcode contains exactly
| country | postcode | geometry | | country | postcode | geometry |
| de | 01982 | country:de | | de | 01982 | country:de |
And word contains And there are word tokens for postcodes 01982
| word | class | type |
| 01982 | place | postcode |

View File

@@ -266,20 +266,36 @@ def check_location_postcode(context):
db_row.assert_row(row, ('country', 'postcode')) db_row.assert_row(row, ('country', 'postcode'))
@then("word contains(?P<exclude> not)?") @then("there are(?P<exclude> no)? word tokens for postcodes (?P<postcodes>.*)")
def check_word_table(context, exclude): def check_word_table_for_postcodes(context, exclude, postcodes):
""" Check the contents of the word table. Each row represents a table row """ Check that the tokenizer produces postcode tokens for the given
and all data must match. Data not present in the expected table, may postcodes. The postcodes are a comma-separated list of postcodes.
be arbitry. The rows are identified via all given columns. Whitespace matters.
""" """
nctx = context.nominatim
tokenizer = tokenizer_factory.get_tokenizer_for_db(nctx.get_test_config())
with tokenizer.name_analyzer() as ana:
plist = [ana.normalize_postcode(p) for p in postcodes.split(',')]
plist.sort()
with context.db.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur: with context.db.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
for row in context.table: if nctx.tokenizer == 'legacy_icu':
wheres = ' AND '.join(["{} = %s".format(h) for h in row.headings]) cur.execute("SELECT word FROM word WHERE type = 'P' and word = any(%s)",
cur.execute("SELECT * from word WHERE " + wheres, list(row.cells)) (plist,))
if exclude: else:
assert cur.rowcount == 0, "Row still in word table: %s" % '/'.join(values) cur.execute("""SELECT word FROM word WHERE word = any(%s)
else: and class = 'place' and type = 'postcode'""",
assert cur.rowcount > 0, "Row not in word table: %s" % '/'.join(values) (plist,))
found = [row[0] for row in cur]
assert len(found) == len(set(found)), f"Duplicate rows for postcodes: {found}"
if exclude:
assert len(found) == 0, f"Unexpected postcodes: {found}"
else:
assert set(found) == set(plist), \
f"Missing postcodes {set(plist) - set(found)}. Found: {found}"
@then("place_addressline contains") @then("place_addressline contains")
def check_place_addressline(context): def check_place_addressline(context):

View File

@@ -0,0 +1,84 @@
"""
Legacy word table for testing with functions to prefil and test contents
of the table.
"""
class MockIcuWordTable:
    """ A word table for testing that uses the ICU tokenizer table layout
        (word_id, word_token, type, word, info jsonb).

        The one-letter type codes match the ones the ICU tokenizer writes:
        'S' = special phrase, 'C' = country name, 'P' = postcode,
        'w' = partial word (full words 'W' are not created by this mock).
    """

    def __init__(self, conn):
        """ Create an empty ICU-style word table on the given connection. """
        self.conn = conn
        with conn.cursor() as cur:
            cur.execute("""CREATE TABLE word (word_id INTEGER,
                                              word_token text NOT NULL,
                                              type text NOT NULL,
                                              word text,
                                              info jsonb)""")
        conn.commit()

    def add_special(self, word_token, word, cls, typ, oper):
        """ Insert a special phrase row ('S'); class, type and operator go
            into the jsonb info column.
        """
        with self.conn.cursor() as cur:
            cur.execute("""INSERT INTO word (word_token, type, word, info)
                              VALUES (%s, 'S', %s,
                                      json_build_object('class', %s,
                                                        'type', %s,
                                                        'op', %s))
                        """, (word_token, word, cls, typ, oper))
        self.conn.commit()

    def add_country(self, country_code, word_token):
        """ Insert a country name row ('C'); the country code is stored in
            the word column.
        """
        with self.conn.cursor() as cur:
            cur.execute("""INSERT INTO word (word_token, type, word)
                           VALUES(%s, 'C', %s)""",
                        (word_token, country_code))
        self.conn.commit()

    def add_postcode(self, word_token, postcode):
        """ Insert a postcode row ('P'). """
        with self.conn.cursor() as cur:
            cur.execute("""INSERT INTO word (word_token, type, word)
                              VALUES (%s, 'P', %s)
                        """, (word_token, postcode))
        self.conn.commit()

    def count(self):
        """ Return the total number of rows in the word table. """
        with self.conn.cursor() as cur:
            return cur.scalar("SELECT count(*) FROM word")

    def count_special(self):
        """ Return the number of special phrase rows. """
        with self.conn.cursor() as cur:
            return cur.scalar("SELECT count(*) FROM word WHERE type = 'S'")

    def get_special(self):
        """ Return all special phrases as a set of
            (token, word, class, type, op) tuples, asserting uniqueness.
        """
        with self.conn.cursor() as cur:
            cur.execute("SELECT word_token, info, word FROM word WHERE type = 'S'")
            result = set(((row[0], row[2], row[1]['class'],
                           row[1]['type'], row[1]['op']) for row in cur))
            assert len(result) == cur.rowcount, "Word table has duplicates."
        return result

    def get_country(self):
        """ Return all country rows as a set of (country_code, token)
            tuples, asserting uniqueness.
        """
        with self.conn.cursor() as cur:
            cur.execute("SELECT word, word_token FROM word WHERE type = 'C'")
            result = set((tuple(row) for row in cur))
            assert len(result) == cur.rowcount, "Word table has duplicates."
        return result

    def get_postcodes(self):
        """ Return the set of all postcodes in the table. """
        with self.conn.cursor() as cur:
            cur.execute("SELECT word FROM word WHERE type = 'P'")
            return set((row[0] for row in cur))

    def get_partial_words(self):
        """ Return all partial words as a set of (token, count) tuples;
            the count is read from the jsonb info column.
        """
        with self.conn.cursor() as cur:
            cur.execute("SELECT word_token, info FROM word WHERE type ='w'")
            return set(((row[0], row[1]['count']) for row in cur))

View File

@@ -0,0 +1,86 @@
"""
Legacy word table for testing with functions to prefil and test contents
of the table.
"""
class MockLegacyWordTable:
    """ A word table for testing that follows the legacy word table layout
        with dedicated word/class/type/country_code/search_name_count/operator
        columns.
    """

    def __init__(self, conn):
        """ Set up an empty legacy-style word table on the given connection. """
        self.conn = conn
        with conn.cursor() as cur:
            cur.execute("""CREATE TABLE word (word_id INTEGER,
                                              word_token text,
                                              word text,
                                              class text,
                                              type text,
                                              country_code varchar(2),
                                              search_name_count INTEGER,
                                              operator TEXT)""")
        conn.commit()

    def add_special(self, word_token, word, cls, typ, oper):
        """ Add one special phrase entry. """
        with self.conn.cursor() as cur:
            cur.execute("""INSERT INTO word (word_token, word, class, type, operator)
                              VALUES (%s, %s, %s, %s, %s)
                        """, (word_token, word, cls, typ, oper))
        self.conn.commit()

    def add_country(self, country_code, word_token):
        """ Add one country name entry. """
        with self.conn.cursor() as cur:
            cur.execute("INSERT INTO word (word_token, country_code) VALUES(%s, %s)",
                        (word_token, country_code))
        self.conn.commit()

    def add_postcode(self, word_token, postcode):
        """ Add one postcode entry (class 'place', type 'postcode'). """
        with self.conn.cursor() as cur:
            cur.execute("""INSERT INTO word (word_token, word, class, type)
                              VALUES (%s, %s, 'place', 'postcode')
                        """, (word_token, postcode))
        self.conn.commit()

    def count(self):
        """ Return how many rows the word table holds in total. """
        with self.conn.cursor() as cur:
            return cur.scalar("SELECT count(*) FROM word")

    def count_special(self):
        """ Return how many special phrase rows the table holds. """
        with self.conn.cursor() as cur:
            return cur.scalar("SELECT count(*) FROM word WHERE class != 'place'")

    def get_special(self):
        """ Return the special phrases as a set of
            (token, word, class, type, operator) tuples and check that
            there are no duplicate rows.
        """
        with self.conn.cursor() as cur:
            cur.execute("""SELECT word_token, word, class, type, operator
                           FROM word WHERE class != 'place'""")
            phrases = {tuple(row) for row in cur}
            assert len(phrases) == cur.rowcount, "Word table has duplicates."
        return phrases

    def get_country(self):
        """ Return the country entries as a set of (country_code, token)
            tuples and check that there are no duplicate rows.
        """
        with self.conn.cursor() as cur:
            cur.execute("""SELECT country_code, word_token
                           FROM word WHERE country_code is not null""")
            countries = {tuple(row) for row in cur}
            assert len(countries) == cur.rowcount, "Word table has duplicates."
        return countries

    def get_postcodes(self):
        """ Return the set of all postcodes stored in the table. """
        with self.conn.cursor() as cur:
            cur.execute("""SELECT word FROM word
                           WHERE class = 'place' and type = 'postcode'""")
            return {row[0] for row in cur}

    def get_partial_words(self):
        """ Return the partial words as a set of (token, count) tuples.
            Partial words have no class/country and no leading blank in
            the token.
        """
        with self.conn.cursor() as cur:
            cur.execute("""SELECT word_token, search_name_count FROM word
                           WHERE class is null and country_code is null
                                 and not word_token like ' %'""")
            return {tuple(row) for row in cur}

View File

@@ -7,6 +7,9 @@ import psycopg2.extras
from nominatim.db import properties from nominatim.db import properties
# This must always point to the mock word table for the default tokenizer.
from mock_legacy_word_table import MockLegacyWordTable as MockWordTable
class MockParamCapture: class MockParamCapture:
""" Mock that records the parameters with which a function was called """ Mock that records the parameters with which a function was called
as well as the number of calls. as well as the number of calls.
@@ -24,88 +27,6 @@ class MockParamCapture:
return self.return_value return self.return_value
class MockWordTable:
    """ A mock word table for testing, laid out like the legacy tokenizer's
        word table.
    """

    def __init__(self, conn):
        """ Create the empty word table on the supplied DB connection. """
        self.conn = conn
        with conn.cursor() as cur:
            cur.execute("""CREATE TABLE word (word_id INTEGER,
                                              word_token text,
                                              word text,
                                              class text,
                                              type text,
                                              country_code varchar(2),
                                              search_name_count INTEGER,
                                              operator TEXT)""")
        conn.commit()

    def add_special(self, word_token, word, cls, typ, oper):
        """ Insert a single special phrase row. """
        with self.conn.cursor() as cur:
            cur.execute("""INSERT INTO word (word_token, word, class, type, operator)
                              VALUES (%s, %s, %s, %s, %s)
                        """, (word_token, word, cls, typ, oper))
        self.conn.commit()

    def add_country(self, country_code, word_token):
        """ Insert a single country name row. """
        with self.conn.cursor() as cur:
            cur.execute("INSERT INTO word (word_token, country_code) VALUES(%s, %s)",
                        (word_token, country_code))
        self.conn.commit()

    def add_postcode(self, word_token, postcode):
        """ Insert a single postcode row (class 'place', type 'postcode'). """
        with self.conn.cursor() as cur:
            cur.execute("""INSERT INTO word (word_token, word, class, type)
                              VALUES (%s, %s, 'place', 'postcode')
                        """, (word_token, postcode))
        self.conn.commit()

    def count(self):
        """ Total number of rows currently in the word table. """
        with self.conn.cursor() as cur:
            return cur.scalar("SELECT count(*) FROM word")

    def count_special(self):
        """ Number of special phrase rows currently in the word table. """
        with self.conn.cursor() as cur:
            return cur.scalar("SELECT count(*) FROM word WHERE class != 'place'")

    def get_special(self):
        """ Fetch all special phrases as (token, word, class, type, operator)
            tuples; duplicates cause an assertion failure.
        """
        with self.conn.cursor() as cur:
            cur.execute("""SELECT word_token, word, class, type, operator
                           FROM word WHERE class != 'place'""")
            entries = set(tuple(row) for row in cur)
            assert len(entries) == cur.rowcount, "Word table has duplicates."
        return entries

    def get_country(self):
        """ Fetch all country rows as (country_code, token) tuples;
            duplicates cause an assertion failure.
        """
        with self.conn.cursor() as cur:
            cur.execute("""SELECT country_code, word_token
                           FROM word WHERE country_code is not null""")
            entries = set(tuple(row) for row in cur)
            assert len(entries) == cur.rowcount, "Word table has duplicates."
        return entries

    def get_postcodes(self):
        """ Fetch the set of all stored postcodes. """
        with self.conn.cursor() as cur:
            cur.execute("""SELECT word FROM word
                           WHERE class = 'place' and type = 'postcode'""")
            return set(row[0] for row in cur)

    def get_partial_words(self):
        """ Fetch all partial words (no class, no country, token without
            leading blank) as (token, search_name_count) tuples.
        """
        with self.conn.cursor() as cur:
            cur.execute("""SELECT word_token, search_name_count FROM word
                           WHERE class is null and country_code is null
                                 and not word_token like ' %'""")
            return set(tuple(row) for row in cur)
class MockPlacexTable: class MockPlacexTable:
""" A placex table for testing. """ A placex table for testing.
""" """

View File

@@ -1,6 +1,8 @@
""" """
Tests for DB utility functions in db.utils Tests for DB utility functions in db.utils
""" """
import json
import pytest import pytest
import nominatim.db.utils as db_utils import nominatim.db.utils as db_utils
@@ -115,3 +117,38 @@ class TestCopyBuffer:
class TestCopyBufferJson:
    """ Tests for CopyBuffer round-tripping data into a JSONB column. """
    TABLE_NAME = 'copytable'

    @pytest.fixture(autouse=True)
    def setup_test_table(self, table_factory):
        # Target table with an INT key column and a JSONB payload column.
        table_factory(self.TABLE_NAME, 'colA INT, colB JSONB')

    def table_rows(self, cursor):
        """ Return the table contents as a dict colA -> colB, asserting
            that the keys are unique.
        """
        cursor.execute('SELECT * FROM ' + self.TABLE_NAME)
        results = {k: v for k, v in cursor}

        assert len(results) == cursor.rowcount

        return results

    def test_json_object(self, temp_db_cursor):
        """ A plain JSON object must survive the copy unchanged. """
        with db_utils.CopyBuffer() as buf:
            buf.add(1, json.dumps({'test': 'value', 'number': 1}))

            buf.copy_out(temp_db_cursor, self.TABLE_NAME)

        assert self.table_rows(temp_db_cursor) == \
                   {1: {'test': 'value', 'number': 1}}

    # Renamed from 'test_json_object_special_chras' (typo in original).
    def test_json_object_special_chars(self, temp_db_cursor):
        """ JSON containing characters that are special to the COPY text
            format (tab, newline, double quote) must be escaped correctly.
        """
        with db_utils.CopyBuffer() as buf:
            buf.add(1, json.dumps({'te\tst': 'va\nlue', 'nu"mber': None}))

            buf.copy_out(temp_db_cursor, self.TABLE_NAME)

        assert self.table_rows(temp_db_cursor) == \
                   {1: {'te\tst': 'va\nlue', 'nu"mber': None}}

View File

@@ -11,6 +11,12 @@ from nominatim.tokenizer.icu_name_processor import ICUNameProcessorRules
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
from nominatim.db import properties from nominatim.db import properties
from mock_icu_word_table import MockIcuWordTable
@pytest.fixture
def word_table(temp_db_conn):
    """ Provide a mock ICU-style word table bound to the temporary
        test database connection.
    """
    table = MockIcuWordTable(temp_db_conn)
    return table
@pytest.fixture @pytest.fixture
def test_config(def_config, tmp_path): def test_config(def_config, tmp_path):
@@ -21,8 +27,8 @@ def test_config(def_config, tmp_path):
sqldir.mkdir() sqldir.mkdir()
(sqldir / 'tokenizer').mkdir() (sqldir / 'tokenizer').mkdir()
(sqldir / 'tokenizer' / 'legacy_icu_tokenizer.sql').write_text("SELECT 'a'") (sqldir / 'tokenizer' / 'legacy_icu_tokenizer.sql').write_text("SELECT 'a'")
shutil.copy(str(def_config.lib_dir.sql / 'tokenizer' / 'legacy_tokenizer_tables.sql'), shutil.copy(str(def_config.lib_dir.sql / 'tokenizer' / 'icu_tokenizer_tables.sql'),
str(sqldir / 'tokenizer' / 'legacy_tokenizer_tables.sql')) str(sqldir / 'tokenizer' / 'icu_tokenizer_tables.sql'))
def_config.lib_dir.sql = sqldir def_config.lib_dir.sql = sqldir
@@ -88,12 +94,14 @@ DECLARE
term_count INTEGER; term_count INTEGER;
BEGIN BEGIN
SELECT min(word_id) INTO full_token SELECT min(word_id) INTO full_token
FROM word WHERE word = norm_term and class is null and country_code is null; FROM word WHERE info->>'word' = norm_term and type = 'W';
IF full_token IS NULL THEN IF full_token IS NULL THEN
full_token := nextval('seq_word'); full_token := nextval('seq_word');
INSERT INTO word (word_id, word_token, word, search_name_count) INSERT INTO word (word_id, word_token, type, info)
SELECT full_token, ' ' || lookup_term, norm_term, 0 FROM unnest(lookup_terms) as lookup_term; SELECT full_token, lookup_term, 'W',
json_build_object('word', norm_term, 'count', 0)
FROM unnest(lookup_terms) as lookup_term;
END IF; END IF;
FOR term IN SELECT unnest(string_to_array(unnest(lookup_terms), ' ')) LOOP FOR term IN SELECT unnest(string_to_array(unnest(lookup_terms), ' ')) LOOP
@@ -105,18 +113,18 @@ BEGIN
partial_tokens := '{}'::INT[]; partial_tokens := '{}'::INT[];
FOR term IN SELECT unnest(partial_terms) LOOP FOR term IN SELECT unnest(partial_terms) LOOP
SELECT min(word_id), max(search_name_count) INTO term_id, term_count SELECT min(word_id), max(info->>'count') INTO term_id, term_count
FROM word WHERE word_token = term and class is null and country_code is null; FROM word WHERE word_token = term and type = 'w';
IF term_id IS NULL THEN IF term_id IS NULL THEN
term_id := nextval('seq_word'); term_id := nextval('seq_word');
term_count := 0; term_count := 0;
INSERT INTO word (word_id, word_token, search_name_count) INSERT INTO word (word_id, word_token, type, info)
VALUES (term_id, term, 0); VALUES (term_id, term, 'w', json_build_object('count', term_count));
END IF; END IF;
IF NOT (ARRAY[term_id] <@ partial_tokens) THEN IF NOT (ARRAY[term_id] <@ partial_tokens) THEN
partial_tokens := partial_tokens || term_id; partial_tokens := partial_tokens || term_id;
END IF; END IF;
END LOOP; END LOOP;
END; END;
@@ -232,14 +240,14 @@ def test_update_special_phrase_empty_table(analyzer, word_table):
], True) ], True)
assert word_table.get_special() \ assert word_table.get_special() \
== {(' KÖNIG BEI', 'König bei', 'amenity', 'royal', 'near'), == {('KÖNIG BEI', 'König bei', 'amenity', 'royal', 'near'),
(' KÖNIGE', 'Könige', 'amenity', 'royal', None), ('KÖNIGE', 'Könige', 'amenity', 'royal', None),
(' STREET', 'street', 'highway', 'primary', 'in')} ('STREET', 'street', 'highway', 'primary', 'in')}
def test_update_special_phrase_delete_all(analyzer, word_table): def test_update_special_phrase_delete_all(analyzer, word_table):
word_table.add_special(' FOO', 'foo', 'amenity', 'prison', 'in') word_table.add_special('FOO', 'foo', 'amenity', 'prison', 'in')
word_table.add_special(' BAR', 'bar', 'highway', 'road', None) word_table.add_special('BAR', 'bar', 'highway', 'road', None)
assert word_table.count_special() == 2 assert word_table.count_special() == 2
@@ -250,8 +258,8 @@ def test_update_special_phrase_delete_all(analyzer, word_table):
def test_update_special_phrases_no_replace(analyzer, word_table): def test_update_special_phrases_no_replace(analyzer, word_table):
word_table.add_special(' FOO', 'foo', 'amenity', 'prison', 'in') word_table.add_special('FOO', 'foo', 'amenity', 'prison', 'in')
word_table.add_special(' BAR', 'bar', 'highway', 'road', None) word_table.add_special('BAR', 'bar', 'highway', 'road', None)
assert word_table.count_special() == 2 assert word_table.count_special() == 2
@@ -262,8 +270,8 @@ def test_update_special_phrases_no_replace(analyzer, word_table):
def test_update_special_phrase_modify(analyzer, word_table): def test_update_special_phrase_modify(analyzer, word_table):
word_table.add_special(' FOO', 'foo', 'amenity', 'prison', 'in') word_table.add_special('FOO', 'foo', 'amenity', 'prison', 'in')
word_table.add_special(' BAR', 'bar', 'highway', 'road', None) word_table.add_special('BAR', 'bar', 'highway', 'road', None)
assert word_table.count_special() == 2 assert word_table.count_special() == 2
@@ -275,25 +283,25 @@ def test_update_special_phrase_modify(analyzer, word_table):
], True) ], True)
assert word_table.get_special() \ assert word_table.get_special() \
== {(' PRISON', 'prison', 'amenity', 'prison', 'in'), == {('PRISON', 'prison', 'amenity', 'prison', 'in'),
(' BAR', 'bar', 'highway', 'road', None), ('BAR', 'bar', 'highway', 'road', None),
(' GARDEN', 'garden', 'leisure', 'garden', 'near')} ('GARDEN', 'garden', 'leisure', 'garden', 'near')}
def test_add_country_names_new(analyzer, word_table): def test_add_country_names_new(analyzer, word_table):
with analyzer() as anl: with analyzer() as anl:
anl.add_country_names('es', {'name': 'Espagña', 'name:en': 'Spain'}) anl.add_country_names('es', {'name': 'Espagña', 'name:en': 'Spain'})
assert word_table.get_country() == {('es', ' ESPAGÑA'), ('es', ' SPAIN')} assert word_table.get_country() == {('es', 'ESPAGÑA'), ('es', 'SPAIN')}
def test_add_country_names_extend(analyzer, word_table): def test_add_country_names_extend(analyzer, word_table):
word_table.add_country('ch', ' SCHWEIZ') word_table.add_country('ch', 'SCHWEIZ')
with analyzer() as anl: with analyzer() as anl:
anl.add_country_names('ch', {'name': 'Schweiz', 'name:fr': 'Suisse'}) anl.add_country_names('ch', {'name': 'Schweiz', 'name:fr': 'Suisse'})
assert word_table.get_country() == {('ch', ' SCHWEIZ'), ('ch', ' SUISSE')} assert word_table.get_country() == {('ch', 'SCHWEIZ'), ('ch', 'SUISSE')}
class TestPlaceNames: class TestPlaceNames:
@@ -307,6 +315,7 @@ class TestPlaceNames:
def expect_name_terms(self, info, *expected_terms): def expect_name_terms(self, info, *expected_terms):
tokens = self.analyzer.get_word_token_info(expected_terms) tokens = self.analyzer.get_word_token_info(expected_terms)
print (tokens)
for token in tokens: for token in tokens:
assert token[2] is not None, "No token for {0}".format(token) assert token[2] is not None, "No token for {0}".format(token)
@@ -316,7 +325,7 @@ class TestPlaceNames:
def test_simple_names(self): def test_simple_names(self):
info = self.analyzer.process_place({'name': {'name': 'Soft bAr', 'ref': '34'}}) info = self.analyzer.process_place({'name': {'name': 'Soft bAr', 'ref': '34'}})
self.expect_name_terms(info, '#Soft bAr', '#34','Soft', 'bAr', '34') self.expect_name_terms(info, '#Soft bAr', '#34', 'Soft', 'bAr', '34')
@pytest.mark.parametrize('sep', [',' , ';']) @pytest.mark.parametrize('sep', [',' , ';'])
@@ -339,7 +348,7 @@ class TestPlaceNames:
'country_feature': 'no'}) 'country_feature': 'no'})
self.expect_name_terms(info, '#norge', 'norge') self.expect_name_terms(info, '#norge', 'norge')
assert word_table.get_country() == {('no', ' NORGE')} assert word_table.get_country() == {('no', 'NORGE')}
class TestPlaceAddress: class TestPlaceAddress: