enable BDD tests for different tokenizers

The tokenizer to be used can be choosen with -DTOKENIZER. Adapt all tests, so that they work with legacy_icu tokenizer. Move lookup in word table to a function in the tokenizer. Special phrases are temporarily imported from the wiki until we have an implementation that can import from file. TIGER tests do not work yet.
2021-05-05 10:00:34 +02:00
parent 18c99a5c5f
commit a263e54b94
7 changed files with 106 additions and 48 deletions
--- a/nominatim/tokenizer/legacy_tokenizer.py
+++ b/nominatim/tokenizer/legacy_tokenizer.py
@@ -271,6 +271,33 @@ class LegacyNameAnalyzer:
            self.conn = None


+    @staticmethod
+    def get_word_token_info(conn, words):
+        """ Return token information for the given list of words.
+            If a word starts with # it is assumed to be a full name
+            otherwise is a partial name.
+
+            The function returns a list of tuples with
+            (original word, word token, word id).
+
+            The function is used for testing and debugging only
+            and not necessarily efficient.
+        """
+        with conn.cursor() as cur:
+            cur.execute("""SELECT t.term, word_token, word_id
+                           FROM word, (SELECT unnest(%s::TEXT[]) as term) t
+                           WHERE word_token = (CASE
+                                   WHEN left(t.term, 1) = '#' THEN
+                                     ' ' || make_standard_name(substring(t.term from 2))
+                                   ELSE
+                                     make_standard_name(t.term)
+                                   END)
+                                 and class is null and country_code is null""",
+                        (words, ))
+
+            return [(r[0], r[1], r[2]) for r in cur]
+
+
    def normalize(self, phrase):
        """ Normalize the given phrase, i.e. remove all properties that
            are irrelevant for search.