adapt tests for ICU tokenizer

2021-06-06 11:00:44 +02:00
parent 8413075249
commit 2e3c5d4c5b
8 changed files with 143 additions and 67 deletions
--- a/nominatim/tokenizer/icu_name_processor.py
+++ b/nominatim/tokenizer/icu_name_processor.py
@@ -76,7 +76,7 @@ class ICUNameProcessor:
        """ Normalize the given name, i.e. remove all elements not relevant
            for search.
        """
-        return self.normalizer.transliterate(name)
+        return self.normalizer.transliterate(name).strip()

    def get_variants_ascii(self, norm_name):
        """ Compute the spelling variants for the given normalized name
@@ -108,4 +108,4 @@ class ICUNameProcessor:
        """ Return the normalized version of the name (including transliteration)
            to be applied at search time.
        """
-        return self.search.transliterate(name)
+        return self.search.transliterate(' ' + name + ' ').strip()
--- a/nominatim/tokenizer/icu_rule_loader.py
+++ b/nominatim/tokenizer/icu_rule_loader.py
@@ -28,7 +28,7 @@ class ICURuleLoader:


    def get_search_rules(self):
-        """ Returns the ICU rules to be used during search.
+        """ Return the ICU rules to be used during search.
            The rules combine normalization, compound decomposition (including
            abbreviated compounds) and transliteration.
        """
@@ -60,7 +60,7 @@ class ICURuleLoader:
        return self.transliteration_rules

    def get_replacement_pairs(self):
-        """ Returns the list of possible compound decompositions with
+        """ Return the list of possible compound decompositions with
            application of abbreviations included.
            The result is a list of pairs: the first item is the sequence to
            replace, the second is a list of replacements.
--- a/nominatim/tokenizer/legacy_icu_tokenizer.py
+++ b/nominatim/tokenizer/legacy_icu_tokenizer.py
@@ -219,7 +219,7 @@ class LegacyICUNameAnalyzer:
            self.conn = None


-    def get_word_token_info(self, conn, words):
+    def get_word_token_info(self, words):
        """ Return token information for the given list of words.
            If a word starts with # it is assumed to be a full name
            otherwise is a partial name.
@@ -233,11 +233,11 @@ class LegacyICUNameAnalyzer:
        tokens = {}
        for word in words:
            if word.startswith('#'):
-                tokens[word] = ' ' + self.name_processor.get_normalized(word[1:])
+                tokens[word] = ' ' + self.name_processor.get_search_normalized(word[1:])
            else:
-                tokens[word] = self.name_processor.get_normalized(word)
+                tokens[word] = self.name_processor.get_search_normalized(word)

-        with conn.cursor() as cur:
+        with self.conn.cursor() as cur:
            cur.execute("""SELECT word_token, word_id
                           FROM word, (SELECT unnest(%s::TEXT[]) as term) t
                           WHERE word_token = t.term
@@ -245,7 +245,7 @@ class LegacyICUNameAnalyzer:
                        (list(tokens.values()), ))
            ids = {r[0]: r[1] for r in cur}

-        return [(k, v, ids[v]) for k, v in tokens.items()]
+        return [(k, v, ids.get(v, None)) for k, v in tokens.items()]


    @staticmethod
@@ -308,7 +308,7 @@ class LegacyICUNameAnalyzer:
    def update_special_phrases(self, phrases, should_replace):
        """ Replace the search index for special phrases with the new phrases.
        """
-        norm_phrases = set(((self.name_processor.get_search_normalized(p[0]), p[1], p[2], p[3])
+        norm_phrases = set(((self.name_processor.get_normalized(p[0]), p[1], p[2], p[3])
                            for p in phrases))

        with self.conn.cursor() as cur:
--- a/nominatim/tokenizer/legacy_tokenizer.py
+++ b/nominatim/tokenizer/legacy_tokenizer.py
@@ -271,8 +271,7 @@ class LegacyNameAnalyzer:
            self.conn = None


-    @staticmethod
-    def get_word_token_info(conn, words):
+    def get_word_token_info(self, words):
        """ Return token information for the given list of words.
            If a word starts with # it is assumed to be a full name
            otherwise is a partial name.
@@ -283,7 +282,7 @@ class LegacyNameAnalyzer:
            The function is used for testing and debugging only
            and not necessarily efficient.
        """
-        with conn.cursor() as cur:
+        with self.conn.cursor() as cur:
            cur.execute("""SELECT t.term, word_token, word_id
                           FROM word, (SELECT unnest(%s::TEXT[]) as term) t
                           WHERE word_token = (CASE