add address counts to tokens

Author: Sarah Hoffmann
Date:   2024-03-15 10:54:13 +01:00
parent bb5de9b955
commit 07b7fd1dbb
10 changed files with 32 additions and 21 deletions

==============================

@@ -18,7 +18,8 @@ class MyToken(query.Token):


 def mktoken(tid: int):
-    return MyToken(3.0, tid, 1, 'foo', True)
+    return MyToken(penalty=3.0, token=tid, count=1, addr_count=1,
+                   lookup_word='foo', is_indexed=True)


 @pytest.mark.parametrize('ptype,ttype', [('NONE', 'WORD'),
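For orientation: the keyword names in the new call mirror the fields of the token base class. Below is a minimal sketch of what that dataclass presumably looks like after this commit; the field names come from the diff itself, while the field order is inferred from the positional calls further down and the comments are assumptions:

import dataclasses
from abc import ABC


@dataclasses.dataclass
class Token(ABC):
    """ Sketch of the token base type. Field names are taken from the
        keyword arguments in the diff; the real class carries more
        behaviour than shown here.
    """
    penalty: float     # assumed: extra ranking penalty for using the token
    token: int         # assumed: the word id
    count: int         # assumed: occurrences of the token as a name
    addr_count: int    # newly added counter: occurrences as an address part
    lookup_word: str
    is_indexed: bool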

==============================

@@ -31,7 +31,9 @@ def make_query(*args):
     for end, ttype, tinfo in tlist:
         for tid, word in tinfo:
             q.add_token(TokenRange(start, end), ttype,
-                        MyToken(0.5 if ttype == TokenType.PARTIAL else 0.0, tid, 1, word, True))
+                        MyToken(penalty=0.5 if ttype == TokenType.PARTIAL else 0.0,
+                                token=tid, count=1, addr_count=1,
+                                lookup_word=word, is_indexed=True))

     return q
@@ -395,14 +397,14 @@ def make_counted_searches(name_part, name_full, address_part, address_full,
     q.add_node(BreakType.END, PhraseType.NONE)

     q.add_token(TokenRange(0, 1), TokenType.PARTIAL,
-                MyToken(0.5, 1, name_part, 'name_part', True))
+                MyToken(0.5, 1, name_part, 1, 'name_part', True))
     q.add_token(TokenRange(0, 1), TokenType.WORD,
-                MyToken(0, 101, name_full, 'name_full', True))
+                MyToken(0, 101, name_full, 1, 'name_full', True))

     for i in range(num_address_parts):
         q.add_token(TokenRange(i + 1, i + 2), TokenType.PARTIAL,
-                    MyToken(0.5, 2, address_part, 'address_part', True))
+                    MyToken(0.5, 2, address_part, 1, 'address_part', True))
         q.add_token(TokenRange(i + 1, i + 2), TokenType.WORD,
-                    MyToken(0, 102, address_full, 'address_full', True))
+                    MyToken(0, 102, address_full, 1, 'address_full', True))

     builder = SearchBuilder(q, SearchDetails())
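Spelled out against the field order assumed in the sketch above, the bare `1` inserted into each positional call is the new `addr_count`:

# MyToken(0.5,      1,      name_part,  1,           'name_part',   True)
#         penalty   token   count       addr_count   lookup_word    is_indexed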

==============================

@@ -19,7 +19,8 @@ class MyToken(Token):

 def make_query(*args):
     q = QueryStruct([Phrase(args[0][1], '')])
-    dummy = MyToken(3.0, 45, 1, 'foo', True)
+    dummy = MyToken(penalty=3.0, token=45, count=1, addr_count=1,
+                    lookup_word='foo', is_indexed=True)

     for btype, ptype, _ in args[1:]:
         q.add_node(btype, ptype)

==============================

@@ -32,16 +32,16 @@ class DummyTokenizer:
         self.update_statistics_called = False
         self.update_word_tokens_called = False

-    def update_sql_functions(self, *args):
+    def update_sql_functions(self, *args, **kwargs):
         self.update_sql_functions_called = True

-    def finalize_import(self, *args):
+    def finalize_import(self, *args, **kwargs):
         self.finalize_import_called = True

-    def update_statistics(self, *args):
+    def update_statistics(self, *args, **kwargs):
         self.update_statistics_called = True

-    def update_word_tokens(self, *args):
+    def update_word_tokens(self, *args, **kwargs):
         self.update_word_tokens_called = True
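Widening the stubs to `*args, **kwargs` keeps this dummy tokenizer compatible with whatever keyword arguments callers start forwarding. A hypothetical call that the old `*args`-only stub would have rejected with a TypeError:

tok = DummyTokenizer()
# 'threads' is an invented keyword argument, used purely for illustration;
# the widened stub absorbs it instead of raising.
tok.update_statistics('some-config', threads=4)
assert tok.update_statistics_called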

==============================

@@ -227,16 +227,20 @@ def test_update_statistics_reverse_only(word_table, tokenizer_factory, test_config):

 def test_update_statistics(word_table, table_factory, temp_db_cursor,
                            tokenizer_factory, test_config):
     word_table.add_full_word(1000, 'hello')
+    word_table.add_full_word(1001, 'bye')
     table_factory('search_name',
-                  'place_id BIGINT, name_vector INT[]',
-                  [(12, [1000])])
+                  'place_id BIGINT, name_vector INT[], nameaddress_vector INT[]',
+                  [(12, [1000], [1001])])
     tok = tokenizer_factory()
     tok.update_statistics(test_config)

-    assert temp_db_cursor.scalar("""SELECT count(*) FROM word
-                                    WHERE type = 'W' and
-                                          (info->>'count')::int > 0""") > 0
+    assert temp_db_cursor.scalar("""SELECT count(*) FROM word
+                                    WHERE type = 'W' and word_id = 1000 and
+                                          (info->>'count')::int > 0""") == 1
+    assert temp_db_cursor.scalar("""SELECT count(*) FROM word
+                                    WHERE type = 'W' and word_id = 1001 and
+                                          (info->>'addr_count')::int > 0""") == 1


 def test_normalize_postcode(analyzer):
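The reworked test seeds one word that occurs only as a name (1000) and one that occurs only as an address part (1001), then expects exactly one row per counter. Conceptually, the statistics update reduces to two aggregations over search_name; the SQL below is a sketch of that idea, not the tokenizer's actual implementation:

# Feeds (info->>'count'): frequency of each word id as a name.
NAME_COUNTS = """SELECT unnest(name_vector) AS word_id, count(*)
                   FROM search_name GROUP BY word_id"""

# Feeds (info->>'addr_count'): frequency of each word id as an address part.
ADDR_COUNTS = """SELECT unnest(nameaddress_vector) AS word_id, count(*)
                   FROM search_name GROUP BY word_id"""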