enable BDD tests for different tokenizers

The tokenizer to be used can be chosen with -DTOKENIZER. All tests are adapted so that they also work with the legacy_icu tokenizer, and the lookup in the word table is moved into a function of the tokenizer. Special phrases are temporarily imported from the wiki until there is an implementation that can import them from a file. TIGER tests do not work yet.
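The new TOKENIZER option travels through behave's userdata mechanism: a define such as -DTOKENIZER=legacy_icu on the behave command line surfaces in context.config.userdata and overrides the TOKENIZER default added to userconfig below. A minimal sketch of that flow, assuming behave's standard before_all hook (illustrative, not the literal test code):

    # behave collects -DKEY=VALUE defines into context.config.userdata;
    # merging them over the defaults lets -DTOKENIZER=legacy_icu replace
    # the 'TOKENIZER': None entry defined in userconfig.
    def before_all(context):
        userconfig.update(context.config.userdata)
        context.nominatim = NominatimEnvironment(userconfig)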
@@ -35,7 +35,7 @@ def create(dsn, data_dir):
 class LegacyICUTokenizer:
     """ This tokenizer uses libICU to covert names and queries to ASCII.
         Otherwise it uses the same algorithms and data structures as the
-        normalization routines in Nominatm 3.
+        normalization routines in Nominatim 3.
     """
 
     def __init__(self, dsn, data_dir):
@@ -228,6 +228,35 @@ class LegacyICUNameAnalyzer:
         self.conn = None
 
 
+    def get_word_token_info(self, conn, words):
+        """ Return token information for the given list of words.
+            If a word starts with # it is assumed to be a full name
+            otherwise is a partial name.
+
+            The function returns a list of tuples with
+            (original word, word token, word id).
+
+            The function is used for testing and debugging only
+            and not necessarily efficient.
+        """
+        tokens = {}
+        for word in words:
+            if word.startswith('#'):
+                tokens[word] = ' ' + self.make_standard_word(word[1:])
+            else:
+                tokens[word] = self.make_standard_word(word)
+
+        with conn.cursor() as cur:
+            cur.execute("""SELECT word_token, word_id
+                           FROM word, (SELECT unnest(%s::TEXT[]) as term) t
+                           WHERE word_token = t.term
+                                 and class is null and country_code is null""",
+                        (list(tokens.values()), ))
+            ids = {r[0]: r[1] for r in cur}
+
+        return [(k, v, ids[v]) for k, v in tokens.items()]
+
+
     def normalize(self, phrase):
         """ Normalize the given phrase, i.e. remove all properties that
             are irrelevant for search.
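A usage sketch for the new helper, assuming an open psycopg2 connection conn and a tokenizer obtained from the tokenizer factory (values are illustrative):

    with tokenizer.name_analyzer() as analyzer:
        # '#'-prefixed words are looked up as full names, which carry a
        # leading blank in word_token; plain words are partial names.
        for word, token, word_id in analyzer.get_word_token_info(
                conn, ['#Main Street', 'main']):
            print(word, repr(token), word_id)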
@@ -236,7 +265,7 @@ class LegacyICUNameAnalyzer:
 
     @functools.lru_cache(maxsize=1024)
     def make_standard_word(self, name):
-        """ Create the normalised version of the name.
+        """ Create the normalised version of the input.
         """
         norm = ' ' + self.transliterator.transliterate(name) + ' '
         for full, abbr in self.abbreviations:
@@ -333,24 +362,25 @@ class LegacyICUNameAnalyzer:
         """
         full_names = set((self.make_standard_word(n) for n in names))
         full_names.discard('')
-        self._add_normalised_country_names(country_code, full_names)
+        self._add_normalized_country_names(country_code, full_names)
 
 
-    def _add_normalised_country_names(self, country_code, names):
+    def _add_normalized_country_names(self, country_code, names):
         """ Add names for the given country to the search index.
         """
+        word_tokens = set((' ' + name for name in names))
         with self.conn.cursor() as cur:
             # Get existing names
             cur.execute("SELECT word_token FROM word WHERE country_code = %s",
                         (country_code, ))
-            new_names = names.difference((t[0] for t in cur))
+            word_tokens.difference_update((t[0] for t in cur))
 
-            if new_names:
+            if word_tokens:
                 cur.execute("""INSERT INTO word (word_id, word_token, country_code,
                                                  search_name_count)
                                (SELECT nextval('seq_word'), token, '{}', 0
                                 FROM unnest(%s) as token)
-                            """.format(country_code), (list(new_names),))
+                            """.format(country_code), (list(word_tokens),))
 
 
     def process_place(self, place):
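Note the bookkeeping change: country name tokens are now built up front with the leading blank that marks full-name tokens in the word table, so the set difference runs on token form rather than on raw names. A small illustration with made-up values:

    names = {'germany', 'deutschland'}
    word_tokens = set(' ' + name for name in names)
    # word_tokens == {' germany', ' deutschland'}; removing the word_token
    # values already present in the table leaves only the tokens to insert.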
@@ -371,7 +401,7 @@ class LegacyICUNameAnalyzer:
 
         country_feature = place.get('country_feature')
         if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
-            self._add_normalised_country_names(country_feature.lower(),
+            self._add_normalized_country_names(country_feature.lower(),
                                                full_names)
 
         address = place.get('address')
@@ -271,6 +271,33 @@ class LegacyNameAnalyzer:
         self.conn = None
 
 
+    @staticmethod
+    def get_word_token_info(conn, words):
+        """ Return token information for the given list of words.
+            If a word starts with # it is assumed to be a full name
+            otherwise is a partial name.
+
+            The function returns a list of tuples with
+            (original word, word token, word id).
+
+            The function is used for testing and debugging only
+            and not necessarily efficient.
+        """
+        with conn.cursor() as cur:
+            cur.execute("""SELECT t.term, word_token, word_id
+                           FROM word, (SELECT unnest(%s::TEXT[]) as term) t
+                           WHERE word_token = (CASE
+                                   WHEN left(t.term, 1) = '#' THEN
+                                     ' ' || make_standard_name(substring(t.term from 2))
+                                   ELSE
+                                     make_standard_name(t.term)
+                                   END)
+                                 and class is null and country_code is null""",
+                        (words, ))
+
+            return [(r[0], r[1], r[2]) for r in cur]
+
+
     def normalize(self, phrase):
         """ Normalize the given phrase, i.e. remove all properties that
             are irrelevant for search.
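Unlike the ICU variant, the legacy analyzer delegates normalization to the make_standard_name() SQL function inside the query itself, which is why the method can be a @staticmethod without Python-side state. It is called the same way as the ICU version; a one-line sketch under the same assumptions as above:

    info = LegacyNameAnalyzer.get_word_token_info(conn, ['#Main Street', 'main'])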
@@ -163,7 +163,7 @@ Feature: Search queries
         Then exactly 0 results are returned
 
     Scenario: Ignore country searches when query is restricted to countries
-        When sending json search query "de"
+        When sending json search query "fr"
           | countrycodes |
           | li |
         Then exactly 0 results are returned
@@ -55,6 +55,4 @@ Feature: Import and search of names
           | Вологда |
           | Αθήνα |
           | القاهرة |
-          | រាជធានីភ្នំពេញ |
           | 東京都 |
-          | ပုဗ္ဗသီရိ |
@@ -20,6 +20,7 @@ userconfig = {
     'API_TEST_DB' : 'test_api_nominatim',
     'API_TEST_FILE' : (TEST_BASE_DIR / 'testdb' / 'apidb-test-data.pbf').resolve(),
     'SERVER_MODULE_PATH' : None,
+    'TOKENIZER' : None, # Test with a custom tokenizer
     'PHPCOV' : False, # set to output directory to enable code coverage
 }
 
@@ -28,6 +28,7 @@ class NominatimEnvironment:
         self.test_db = config['TEST_DB']
         self.api_test_db = config['API_TEST_DB']
         self.api_test_file = config['API_TEST_FILE']
+        self.tokenizer = config['TOKENIZER']
         self.server_module_path = config['SERVER_MODULE_PATH']
         self.reuse_template = not config['REMOVE_TEMPLATE']
         self.keep_scenario_db = config['KEEP_TEST_DB']
@@ -96,6 +97,8 @@ class NominatimEnvironment:
         self.test_env['NOMINATIM_DATABASE_MODULE_SRC_PATH'] = str((self.build_dir / 'module').resolve())
         self.test_env['NOMINATIM_OSM2PGSQL_BINARY'] = str((self.build_dir / 'osm2pgsql' / 'osm2pgsql').resolve())
         self.test_env['NOMINATIM_NOMINATIM_TOOL'] = str((self.build_dir / 'nominatim').resolve())
+        if self.tokenizer is not None:
+            self.test_env['NOMINATIM_TOKENIZER'] = self.tokenizer
 
         if self.server_module_path:
             self.test_env['NOMINATIM_DATABASE_MODULE_PATH'] = self.server_module_path
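Exporting NOMINATIM_TOKENIZER into test_env is what makes every nominatim subprocess started during setup pick up the requested tokenizer. A sketch of the mechanism (the env-dict pattern, not the literal test code; the file name is made up):

    import os
    import subprocess

    env = dict(os.environ)
    env['NOMINATIM_TOKENIZER'] = 'legacy_icu'
    # any tool launched with this environment sees the override
    subprocess.run(['nominatim', 'import', '--osm-file', 'test.pbf'],
                   env=env, check=True)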
@@ -189,11 +192,19 @@ class NominatimEnvironment:
 
         try:
             self.run_nominatim('import', '--osm-file', str(self.api_test_file))
-            self.run_nominatim('add-data', '--tiger-data', str((testdata / 'tiger').resolve()))
+            if self.tokenizer != 'legacy_icu':
+                self.run_nominatim('add-data', '--tiger-data', str((testdata / 'tiger').resolve()))
             self.run_nominatim('freeze')
 
-            phrase_file = str((testdata / 'specialphrases_testdb.sql').resolve())
-            run_script(['psql', '-d', self.api_test_db, '-f', phrase_file])
+            if self.tokenizer != 'legacy_icu':
+                phrase_file = str((testdata / 'specialphrases_testdb.sql').resolve())
+                run_script(['psql', '-d', self.api_test_db, '-f', phrase_file])
+            else:
+                # XXX Temporary use the wiki while there is no CSV import
+                # available.
+                self.test_env['NOMINATIM_LANGUAGES'] = 'en'
+                self.run_nominatim('special-phrases', '--import-from-wiki')
+                del self.test_env['NOMINATIM_LANGUAGES']
         except:
             self.db_drop_database(self.api_test_db)
             raise
@@ -199,44 +199,35 @@ def check_search_name_contents(context, exclude):
         have an identifier of the form '<NRW><osm id>[:<class>]'. All
         expected rows are expected to be present with at least one database row.
     """
-    with context.db.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
-        for row in context.table:
-            nid = NominatimID(row['object'])
-            nid.row_by_place_id(cur, 'search_name',
-                                ['ST_X(centroid) as cx', 'ST_Y(centroid) as cy'])
-            assert cur.rowcount > 0, "No rows found for " + row['object']
-
-            for res in cur:
-                db_row = DBRow(nid, res, context)
-                for name, value in zip(row.headings, row.cells):
-                    if name in ('name_vector', 'nameaddress_vector'):
-                        items = [x.strip() for x in value.split(',')]
-                        with context.db.cursor() as subcur:
-                            subcur.execute(""" SELECT word_id, word_token
-                                               FROM word, (SELECT unnest(%s::TEXT[]) as term) t
-                                               WHERE word_token = make_standard_name(t.term)
-                                                     and class is null and country_code is null
-                                                     and operator is null
-                                              UNION
-                                               SELECT word_id, word_token
-                                               FROM word, (SELECT unnest(%s::TEXT[]) as term) t
-                                               WHERE word_token = ' ' || make_standard_name(t.term)
-                                                     and class is null and country_code is null
-                                                     and operator is null
-                                           """,
-                                           (list(filter(lambda x: not x.startswith('#'), items)),
-                                            list(filter(lambda x: x.startswith('#'), items))))
+    tokenizer = tokenizer_factory.get_tokenizer_for_db(context.nominatim.get_test_config())
+
+    with tokenizer.name_analyzer() as analyzer:
+        with context.db.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
+            for row in context.table:
+                nid = NominatimID(row['object'])
+                nid.row_by_place_id(cur, 'search_name',
+                                    ['ST_X(centroid) as cx', 'ST_Y(centroid) as cy'])
+                assert cur.rowcount > 0, "No rows found for " + row['object']
+
+                for res in cur:
+                    db_row = DBRow(nid, res, context)
+                    for name, value in zip(row.headings, row.cells):
+                        if name in ('name_vector', 'nameaddress_vector'):
+                            items = [x.strip() for x in value.split(',')]
+                            tokens = analyzer.get_word_token_info(context.db, items)
+
                             if not exclude:
-                                assert subcur.rowcount >= len(items), \
-                                       "No word entry found for {}. Entries found: {!s}".format(value, subcur.rowcount)
-                            for wid in subcur:
-                                present = wid[0] in res[name]
+                                assert len(tokens) >= len(items), \
+                                       "No word entry found for {}. Entries found: {!s}".format(value, len(tokens))
+                            for word, token, wid in tokens:
                                 if exclude:
-                                    assert not present, "Found term for {}/{}: {}".format(row['object'], name, wid[1])
+                                    assert wid not in res[name], \
+                                           "Found term for {}/{}: {}".format(nid, name, wid)
                                 else:
-                                    assert present, "Missing term for {}/{}: {}".format(row['object'], name, wid[1])
-                    elif name != 'object':
-                        assert db_row.contains(name, value), db_row.assert_msg(name, value)
+                                    assert wid in res[name], \
+                                           "Missing term for {}/{}: {}".format(nid, name, wid)
+                        elif name != 'object':
+                            assert db_row.contains(name, value), db_row.assert_msg(name, value)
 
 @then("search_name has no entry for (?P<oid>.*)")
 def check_search_name_has_entry(context, oid):
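With the lookup behind the tokenizer's name_analyzer() interface, the step no longer hard-codes the word-table SQL of one tokenizer, and the same scenario tables work for both implementations. The core assertion reduces to the following (a sketch with made-up values):

    # one search_name row and the token triples for the expected terms
    res = {'name_vector': [1023, 1024]}
    tokens = [('main', ' main', 1023)]       # (word, token, word_id)
    for word, token, wid in tokens:
        assert wid in res['name_vector'], \
               "Missing term for name_vector: {}".format(wid)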