enable BDD tests for different tokenizers

The tokenizer to be used can be choosen with -DTOKENIZER. Adapt all tests, so that they work with legacy_icu tokenizer. Move lookup in word table to a function in the tokenizer. Special phrases are temporarily imported from the wiki until we have an implementation that can import from file. TIGER tests do not work yet.
2026-02-16 15:47:58 +00:00 · 2021-05-05 10:00:34 +02:00
parent 18c99a5c5f
commit a263e54b94
7 changed files with 106 additions and 48 deletions
--- a/test/bdd/api/search/queries.feature
+++ b/test/bdd/api/search/queries.feature
@@ -163,7 +163,7 @@ Feature: Search queries
        Then exactly 0 results are returned

    Scenario: Ignore country searches when query is restricted to countries
-        When sending json search query "de"
+        When sending json search query "fr"
            | countrycodes |
            | li  |
        Then exactly 0 results are returned
--- a/test/bdd/db/import/naming.feature
+++ b/test/bdd/db/import/naming.feature
@@ -55,6 +55,4 @@ Feature: Import and search of names
        | Вологда |
        | Αθήνα |
        | القاهرة |
-        | រាជធានីភ្នំពេញ |
        | 東京都 |
-        | ပုဗ္ဗသီရိ |
--- a/test/bdd/environment.py
+++ b/test/bdd/environment.py
@@ -20,6 +20,7 @@ userconfig = {
    'API_TEST_DB' : 'test_api_nominatim',
    'API_TEST_FILE'  : (TEST_BASE_DIR / 'testdb' / 'apidb-test-data.pbf').resolve(),
    'SERVER_MODULE_PATH' : None,
+    'TOKENIZER' : None, # Test with a custom tokenizer
    'PHPCOV' : False, # set to output directory to enable code coverage
 }

--- a/test/bdd/steps/nominatim_environment.py
+++ b/test/bdd/steps/nominatim_environment.py
@@ -28,6 +28,7 @@ class NominatimEnvironment:
        self.test_db = config['TEST_DB']
        self.api_test_db = config['API_TEST_DB']
        self.api_test_file = config['API_TEST_FILE']
+        self.tokenizer = config['TOKENIZER']
        self.server_module_path = config['SERVER_MODULE_PATH']
        self.reuse_template = not config['REMOVE_TEMPLATE']
        self.keep_scenario_db = config['KEEP_TEST_DB']
@@ -96,6 +97,8 @@ class NominatimEnvironment:
        self.test_env['NOMINATIM_DATABASE_MODULE_SRC_PATH'] = str((self.build_dir / 'module').resolve())
        self.test_env['NOMINATIM_OSM2PGSQL_BINARY'] = str((self.build_dir / 'osm2pgsql' / 'osm2pgsql').resolve())
        self.test_env['NOMINATIM_NOMINATIM_TOOL'] = str((self.build_dir / 'nominatim').resolve())
+        if self.tokenizer is not None:
+            self.test_env['NOMINATIM_TOKENIZER'] = self.tokenizer

        if self.server_module_path:
            self.test_env['NOMINATIM_DATABASE_MODULE_PATH'] = self.server_module_path
@@ -189,11 +192,19 @@ class NominatimEnvironment:

                try:
                    self.run_nominatim('import', '--osm-file', str(self.api_test_file))
-                    self.run_nominatim('add-data', '--tiger-data', str((testdata / 'tiger').resolve()))
+                    if self.tokenizer != 'legacy_icu':
+                        self.run_nominatim('add-data', '--tiger-data', str((testdata / 'tiger').resolve()))
                    self.run_nominatim('freeze')

-                    phrase_file = str((testdata / 'specialphrases_testdb.sql').resolve())
-                    run_script(['psql', '-d', self.api_test_db, '-f', phrase_file])
+                    if self.tokenizer != 'legacy_icu':
+                        phrase_file = str((testdata / 'specialphrases_testdb.sql').resolve())
+                        run_script(['psql', '-d', self.api_test_db, '-f', phrase_file])
+                    else:
+                        # XXX Temporary use the wiki while there is no CSV import
+                        # available.
+                        self.test_env['NOMINATIM_LANGUAGES'] = 'en'
+                        self.run_nominatim('special-phrases', '--import-from-wiki')
+                        del self.test_env['NOMINATIM_LANGUAGES']
                except:
                    self.db_drop_database(self.api_test_db)
                    raise
--- a/test/bdd/steps/steps_db_ops.py
+++ b/test/bdd/steps/steps_db_ops.py
@@ -199,44 +199,35 @@ def check_search_name_contents(context, exclude):
        have an identifier of the form '<NRW><osm id>[:<class>]'. All
        expected rows are expected to be present with at least one database row.
    """
-    with context.db.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
-        for row in context.table:
-            nid = NominatimID(row['object'])
-            nid.row_by_place_id(cur, 'search_name',
-                                ['ST_X(centroid) as cx', 'ST_Y(centroid) as cy'])
-            assert cur.rowcount > 0, "No rows found for " + row['object']
+    tokenizer = tokenizer_factory.get_tokenizer_for_db(context.nominatim.get_test_config())
+
+    with tokenizer.name_analyzer() as analyzer:
+        with context.db.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
+            for row in context.table:
+                nid = NominatimID(row['object'])
+                nid.row_by_place_id(cur, 'search_name',
+                                    ['ST_X(centroid) as cx', 'ST_Y(centroid) as cy'])
+                assert cur.rowcount > 0, "No rows found for " + row['object']
+
+                for res in cur:
+                    db_row = DBRow(nid, res, context)
+                    for name, value in zip(row.headings, row.cells):
+                        if name in ('name_vector', 'nameaddress_vector'):
+                            items = [x.strip() for x in value.split(',')]
+                            tokens = analyzer.get_word_token_info(context.db, items)

-            for res in cur:
-                db_row = DBRow(nid, res, context)
-                for name, value in zip(row.headings, row.cells):
-                    if name in ('name_vector', 'nameaddress_vector'):
-                        items = [x.strip() for x in value.split(',')]
-                        with context.db.cursor() as subcur:
-                            subcur.execute(""" SELECT word_id, word_token
-                                               FROM word, (SELECT unnest(%s::TEXT[]) as term) t
-                                               WHERE word_token = make_standard_name(t.term)
-                                                     and class is null and country_code is null
-                                                     and operator is null
-                                              UNION
-                                               SELECT word_id, word_token
-                                               FROM word, (SELECT unnest(%s::TEXT[]) as term) t
-                                               WHERE word_token = ' ' || make_standard_name(t.term)
-                                                     and class is null and country_code is null
-                                                     and operator is null
-                                           """,
-                                           (list(filter(lambda x: not x.startswith('#'), items)),
-                                            list(filter(lambda x: x.startswith('#'), items))))
                            if not exclude:
-                                assert subcur.rowcount >= len(items), \
-                                    "No word entry found for {}. Entries found: {!s}".format(value, subcur.rowcount)
-                            for wid in subcur:
-                                present = wid[0] in res[name]
+                                assert len(tokens) >= len(items), \
+                                       "No word entry found for {}. Entries found: {!s}".format(value, len(tokens))
+                            for word, token, wid in tokens:
                                if exclude:
-                                    assert not present, "Found term for {}/{}: {}".format(row['object'], name, wid[1])
+                                    assert wid not in res[name], \
+                                           "Found term for {}/{}: {}".format(nid, name, wid)
                                else:
-                                    assert present, "Missing term for {}/{}: {}".format(row['object'], name, wid[1])
-                    elif name != 'object':
-                        assert db_row.contains(name, value), db_row.assert_msg(name, value)
+                                    assert wid in res[name], \
+                                           "Missing term for {}/{}: {}".format(nid, name, wid)
+                        elif name != 'object':
+                            assert db_row.contains(name, value), db_row.assert_msg(name, value)

@then("search_name has no entry for (?P<oid>.*)")
 def check_search_name_has_entry(context, oid):