enable BDD tests for different tokenizers

The tokenizer to be used can be chosen with -DTOKENIZER. All tests are adapted so that they also work with the legacy_icu tokenizer, and the lookup in the word table is moved into a function of the tokenizer. Special phrases are temporarily imported from the wiki until there is an implementation that can import them from a file. TIGER tests do not work yet.
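The new TOKENIZER option travels through behave's userdata mechanism: a define such as -DTOKENIZER=legacy_icu on the behave command line surfaces in context.config.userdata and overrides the TOKENIZER default added to userconfig below. A minimal sketch of that flow, assuming behave's standard before_all hook (illustrative, not the literal test code):

    # behave collects -DKEY=VALUE defines into context.config.userdata;
    # merging them over the defaults lets -DTOKENIZER=legacy_icu replace
    # the 'TOKENIZER': None entry defined in userconfig.
    def before_all(context):
        userconfig.update(context.config.userdata)
        context.nominatim = NominatimEnvironment(userconfig)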
@@ -35,7 +35,7 @@ def create(dsn, data_dir):
 class LegacyICUTokenizer:
     """ This tokenizer uses libICU to covert names and queries to ASCII.
         Otherwise it uses the same algorithms and data structures as the
-        normalization routines in Nominatm 3.
+        normalization routines in Nominatim 3.
     """
 
     def __init__(self, dsn, data_dir):
@@ -228,6 +228,35 @@ class LegacyICUNameAnalyzer:
         self.conn = None
 
 
+    def get_word_token_info(self, conn, words):
+        """ Return token information for the given list of words.
+            If a word starts with # it is assumed to be a full name
+            otherwise is a partial name.
+
+            The function returns a list of tuples with
+            (original word, word token, word id).
+
+            The function is used for testing and debugging only
+            and not necessarily efficient.
+        """
+        tokens = {}
+        for word in words:
+            if word.startswith('#'):
+                tokens[word] = ' ' + self.make_standard_word(word[1:])
+            else:
+                tokens[word] = self.make_standard_word(word)
+
+        with conn.cursor() as cur:
+            cur.execute("""SELECT word_token, word_id
+                           FROM word, (SELECT unnest(%s::TEXT[]) as term) t
+                           WHERE word_token = t.term
+                                 and class is null and country_code is null""",
+                        (list(tokens.values()), ))
+            ids = {r[0]: r[1] for r in cur}
+
+        return [(k, v, ids[v]) for k, v in tokens.items()]
+
+
     def normalize(self, phrase):
         """ Normalize the given phrase, i.e. remove all properties that
             are irrelevant for search.
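A usage sketch for the new helper, assuming an open psycopg2 connection conn and a tokenizer obtained from the tokenizer factory (values are illustrative):

    with tokenizer.name_analyzer() as analyzer:
        # '#'-prefixed words are looked up as full names, which carry a
        # leading blank in word_token; plain words are partial names.
        for word, token, word_id in analyzer.get_word_token_info(
                conn, ['#Main Street', 'main']):
            print(word, repr(token), word_id)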
@@ -236,7 +265,7 @@ class LegacyICUNameAnalyzer:
 
     @functools.lru_cache(maxsize=1024)
     def make_standard_word(self, name):
-        """ Create the normalised version of the name.
+        """ Create the normalised version of the input.
         """
         norm = ' ' + self.transliterator.transliterate(name) + ' '
         for full, abbr in self.abbreviations:
@@ -333,24 +362,25 @@ class LegacyICUNameAnalyzer:
         """
         full_names = set((self.make_standard_word(n) for n in names))
         full_names.discard('')
-        self._add_normalised_country_names(country_code, full_names)
+        self._add_normalized_country_names(country_code, full_names)
 
 
-    def _add_normalised_country_names(self, country_code, names):
+    def _add_normalized_country_names(self, country_code, names):
         """ Add names for the given country to the search index.
         """
+        word_tokens = set((' ' + name for name in names))
         with self.conn.cursor() as cur:
             # Get existing names
             cur.execute("SELECT word_token FROM word WHERE country_code = %s",
                         (country_code, ))
-            new_names = names.difference((t[0] for t in cur))
+            word_tokens.difference_update((t[0] for t in cur))
 
-            if new_names:
+            if word_tokens:
                 cur.execute("""INSERT INTO word (word_id, word_token, country_code,
                                                  search_name_count)
                                (SELECT nextval('seq_word'), token, '{}', 0
                                 FROM unnest(%s) as token)
-                            """.format(country_code), (list(new_names),))
+                            """.format(country_code), (list(word_tokens),))
 
 
     def process_place(self, place):
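Note the bookkeeping change: country name tokens are now built up front with the leading blank that marks full-name tokens in the word table, so the set difference runs on token form rather than on raw names. A small illustration with made-up values:

    names = {'germany', 'deutschland'}
    word_tokens = set(' ' + name for name in names)
    # word_tokens == {' germany', ' deutschland'}; removing the word_token
    # values already present in the table leaves only the tokens to insert.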
@@ -371,7 +401,7 @@ class LegacyICUNameAnalyzer:
 
         country_feature = place.get('country_feature')
         if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
-            self._add_normalised_country_names(country_feature.lower(),
+            self._add_normalized_country_names(country_feature.lower(),
                                                full_names)
 
         address = place.get('address')
@@ -271,6 +271,33 @@ class LegacyNameAnalyzer:
         self.conn = None
 
 
+    @staticmethod
+    def get_word_token_info(conn, words):
+        """ Return token information for the given list of words.
+            If a word starts with # it is assumed to be a full name
+            otherwise is a partial name.
+
+            The function returns a list of tuples with
+            (original word, word token, word id).
+
+            The function is used for testing and debugging only
+            and not necessarily efficient.
+        """
+        with conn.cursor() as cur:
+            cur.execute("""SELECT t.term, word_token, word_id
+                           FROM word, (SELECT unnest(%s::TEXT[]) as term) t
+                           WHERE word_token = (CASE
+                                   WHEN left(t.term, 1) = '#' THEN
+                                     ' ' || make_standard_name(substring(t.term from 2))
+                                   ELSE
+                                     make_standard_name(t.term)
+                                   END)
+                                 and class is null and country_code is null""",
+                        (words, ))
+
+            return [(r[0], r[1], r[2]) for r in cur]
+
+
     def normalize(self, phrase):
         """ Normalize the given phrase, i.e. remove all properties that
             are irrelevant for search.
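Unlike the ICU variant, the legacy analyzer delegates normalization to the make_standard_name() SQL function inside the query itself, which is why the method can be a @staticmethod without Python-side state. It is called the same way as the ICU version; a one-line sketch under the same assumptions as above:

    info = LegacyNameAnalyzer.get_word_token_info(conn, ['#Main Street', 'main'])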
@@ -163,7 +163,7 @@ Feature: Search queries
         Then exactly 0 results are returned
 
     Scenario: Ignore country searches when query is restricted to countries
-        When sending json search query "de"
+        When sending json search query "fr"
           | countrycodes |
           | li |
         Then exactly 0 results are returned
@@ -55,6 +55,4 @@ Feature: Import and search of names
           | Вологда |
           | Αθήνα |
           | القاهرة |
-          | រាជធានីភ្នំពេញ |
           | 東京都 |
-          | ပုဗ္ဗသီရိ |
@@ -20,6 +20,7 @@ userconfig = {
     'API_TEST_DB' : 'test_api_nominatim',
     'API_TEST_FILE' : (TEST_BASE_DIR / 'testdb' / 'apidb-test-data.pbf').resolve(),
     'SERVER_MODULE_PATH' : None,
+    'TOKENIZER' : None, # Test with a custom tokenizer
     'PHPCOV' : False, # set to output directory to enable code coverage
 }
 
@@ -28,6 +28,7 @@ class NominatimEnvironment:
         self.test_db = config['TEST_DB']
         self.api_test_db = config['API_TEST_DB']
         self.api_test_file = config['API_TEST_FILE']
+        self.tokenizer = config['TOKENIZER']
         self.server_module_path = config['SERVER_MODULE_PATH']
         self.reuse_template = not config['REMOVE_TEMPLATE']
         self.keep_scenario_db = config['KEEP_TEST_DB']
@@ -96,6 +97,8 @@ class NominatimEnvironment:
         self.test_env['NOMINATIM_DATABASE_MODULE_SRC_PATH'] = str((self.build_dir / 'module').resolve())
         self.test_env['NOMINATIM_OSM2PGSQL_BINARY'] = str((self.build_dir / 'osm2pgsql' / 'osm2pgsql').resolve())
         self.test_env['NOMINATIM_NOMINATIM_TOOL'] = str((self.build_dir / 'nominatim').resolve())
+        if self.tokenizer is not None:
+            self.test_env['NOMINATIM_TOKENIZER'] = self.tokenizer
 
         if self.server_module_path:
             self.test_env['NOMINATIM_DATABASE_MODULE_PATH'] = self.server_module_path
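Exporting NOMINATIM_TOKENIZER into test_env is what makes every nominatim subprocess started during setup pick up the requested tokenizer. A sketch of the mechanism (the env-dict pattern, not the literal test code; the file name is made up):

    import os
    import subprocess

    env = dict(os.environ)
    env['NOMINATIM_TOKENIZER'] = 'legacy_icu'
    # any tool launched with this environment sees the override
    subprocess.run(['nominatim', 'import', '--osm-file', 'test.pbf'],
                   env=env, check=True)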
@@ -189,11 +192,19 @@ class NominatimEnvironment:
 
         try:
             self.run_nominatim('import', '--osm-file', str(self.api_test_file))
-            self.run_nominatim('add-data', '--tiger-data', str((testdata / 'tiger').resolve()))
+            if self.tokenizer != 'legacy_icu':
+                self.run_nominatim('add-data', '--tiger-data', str((testdata / 'tiger').resolve()))
             self.run_nominatim('freeze')
 
-            phrase_file = str((testdata / 'specialphrases_testdb.sql').resolve())
-            run_script(['psql', '-d', self.api_test_db, '-f', phrase_file])
+            if self.tokenizer != 'legacy_icu':
+                phrase_file = str((testdata / 'specialphrases_testdb.sql').resolve())
+                run_script(['psql', '-d', self.api_test_db, '-f', phrase_file])
+            else:
+                # XXX Temporary use the wiki while there is no CSV import
+                # available.
+                self.test_env['NOMINATIM_LANGUAGES'] = 'en'
+                self.run_nominatim('special-phrases', '--import-from-wiki')
+                del self.test_env['NOMINATIM_LANGUAGES']
         except:
             self.db_drop_database(self.api_test_db)
             raise
@@ -199,44 +199,35 @@ def check_search_name_contents(context, exclude):
         have an identifier of the form '<NRW><osm id>[:<class>]'. All
         expected rows are expected to be present with at least one database row.
     """
-    with context.db.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
-        for row in context.table:
-            nid = NominatimID(row['object'])
-            nid.row_by_place_id(cur, 'search_name',
-                                ['ST_X(centroid) as cx', 'ST_Y(centroid) as cy'])
-            assert cur.rowcount > 0, "No rows found for " + row['object']
-
-            for res in cur:
-                db_row = DBRow(nid, res, context)
-                for name, value in zip(row.headings, row.cells):
-                    if name in ('name_vector', 'nameaddress_vector'):
-                        items = [x.strip() for x in value.split(',')]
-                        with context.db.cursor() as subcur:
-                            subcur.execute(""" SELECT word_id, word_token
-                                               FROM word, (SELECT unnest(%s::TEXT[]) as term) t
-                                               WHERE word_token = make_standard_name(t.term)
-                                                     and class is null and country_code is null
-                                                     and operator is null
-                                              UNION
-                                               SELECT word_id, word_token
-                                               FROM word, (SELECT unnest(%s::TEXT[]) as term) t
-                                               WHERE word_token = ' ' || make_standard_name(t.term)
-                                                     and class is null and country_code is null
-                                                     and operator is null
-                                           """,
-                                           (list(filter(lambda x: not x.startswith('#'), items)),
-                                            list(filter(lambda x: x.startswith('#'), items))))
+    tokenizer = tokenizer_factory.get_tokenizer_for_db(context.nominatim.get_test_config())
+
+    with tokenizer.name_analyzer() as analyzer:
+        with context.db.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
+            for row in context.table:
+                nid = NominatimID(row['object'])
+                nid.row_by_place_id(cur, 'search_name',
+                                    ['ST_X(centroid) as cx', 'ST_Y(centroid) as cy'])
+                assert cur.rowcount > 0, "No rows found for " + row['object']
+
+                for res in cur:
+                    db_row = DBRow(nid, res, context)
+                    for name, value in zip(row.headings, row.cells):
+                        if name in ('name_vector', 'nameaddress_vector'):
+                            items = [x.strip() for x in value.split(',')]
+                            tokens = analyzer.get_word_token_info(context.db, items)
+
                             if not exclude:
-                                assert subcur.rowcount >= len(items), \
-                                       "No word entry found for {}. Entries found: {!s}".format(value, subcur.rowcount)
-                            for wid in subcur:
-                                present = wid[0] in res[name]
+                                assert len(tokens) >= len(items), \
+                                       "No word entry found for {}. Entries found: {!s}".format(value, len(tokens))
+                            for word, token, wid in tokens:
                                 if exclude:
-                                    assert not present, "Found term for {}/{}: {}".format(row['object'], name, wid[1])
+                                    assert wid not in res[name], \
+                                           "Found term for {}/{}: {}".format(nid, name, wid)
                                 else:
-                                    assert present, "Missing term for {}/{}: {}".format(row['object'], name, wid[1])
-                    elif name != 'object':
-                        assert db_row.contains(name, value), db_row.assert_msg(name, value)
+                                    assert wid in res[name], \
+                                           "Missing term for {}/{}: {}".format(nid, name, wid)
+                        elif name != 'object':
+                            assert db_row.contains(name, value), db_row.assert_msg(name, value)
 
 @then("search_name has no entry for (?P<oid>.*)")
 def check_search_name_has_entry(context, oid):
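With the lookup behind the tokenizer's name_analyzer() interface, the step no longer hard-codes the word-table SQL of one tokenizer, and the same scenario tables work for both implementations. The core assertion reduces to the following (a sketch with made-up values):

    # one search_name row and the token triples for the expected terms
    res = {'name_vector': [1023, 1024]}
    tokens = [('main', ' main', 1023)]       # (word, token, word_id)
    for word, token, wid in tokens:
        assert wid in res['name_vector'], \
               "Missing term for name_vector: {}".format(wid)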