add address counts to tokens

Author: Sarah Hoffmann
Date:   2024-03-15 10:54:13 +01:00
parent bb5de9b955
commit 07b7fd1dbb
10 changed files with 32 additions and 21 deletions

==============================

@@ -18,7 +18,8 @@ class MyToken(query.Token):


 def mktoken(tid: int):
-    return MyToken(3.0, tid, 1, 'foo', True)
+    return MyToken(penalty=3.0, token=tid, count=1, addr_count=1,
+                   lookup_word='foo', is_indexed=True)


 @pytest.mark.parametrize('ptype,ttype', [('NONE', 'WORD'),
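For orientation: the keyword names in the new call mirror the fields of the token base class. Below is a minimal sketch of what that dataclass presumably looks like after this commit; the field names come from the diff itself, while the field order is inferred from the positional calls further down and the comments are assumptions:

import dataclasses
from abc import ABC


@dataclasses.dataclass
class Token(ABC):
    """ Sketch of the token base type. Field names are taken from the
        keyword arguments in the diff; the real class carries more
        behaviour than shown here.
    """
    penalty: float     # assumed: extra ranking penalty for using the token
    token: int         # assumed: the word id
    count: int         # assumed: occurrences of the token as a name
    addr_count: int    # newly added counter: occurrences as an address part
    lookup_word: str
    is_indexed: bool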

==============================

@@ -31,7 +31,9 @@ def make_query(*args):
     for end, ttype, tinfo in tlist:
         for tid, word in tinfo:
             q.add_token(TokenRange(start, end), ttype,
-                        MyToken(0.5 if ttype == TokenType.PARTIAL else 0.0, tid, 1, word, True))
+                        MyToken(penalty=0.5 if ttype == TokenType.PARTIAL else 0.0,
+                                token=tid, count=1, addr_count=1,
+                                lookup_word=word, is_indexed=True))

     return q
@@ -395,14 +397,14 @@ def make_counted_searches(name_part, name_full, address_part, address_full,
     q.add_node(BreakType.END, PhraseType.NONE)

     q.add_token(TokenRange(0, 1), TokenType.PARTIAL,
-                MyToken(0.5, 1, name_part, 'name_part', True))
+                MyToken(0.5, 1, name_part, 1, 'name_part', True))
     q.add_token(TokenRange(0, 1), TokenType.WORD,
-                MyToken(0, 101, name_full, 'name_full', True))
+                MyToken(0, 101, name_full, 1, 'name_full', True))

     for i in range(num_address_parts):
         q.add_token(TokenRange(i + 1, i + 2), TokenType.PARTIAL,
-                    MyToken(0.5, 2, address_part, 'address_part', True))
+                    MyToken(0.5, 2, address_part, 1, 'address_part', True))
         q.add_token(TokenRange(i + 1, i + 2), TokenType.WORD,
-                    MyToken(0, 102, address_full, 'address_full', True))
+                    MyToken(0, 102, address_full, 1, 'address_full', True))

     builder = SearchBuilder(q, SearchDetails())
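Spelled out against the field order assumed in the sketch above, the bare `1` inserted into each positional call is the new `addr_count`:

# MyToken(0.5,      1,      name_part,  1,           'name_part',   True)
#         penalty   token   count       addr_count   lookup_word    is_indexed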

==============================

@@ -19,7 +19,8 @@ class MyToken(Token):

 def make_query(*args):
     q = QueryStruct([Phrase(args[0][1], '')])
-    dummy = MyToken(3.0, 45, 1, 'foo', True)
+    dummy = MyToken(penalty=3.0, token=45, count=1, addr_count=1,
+                    lookup_word='foo', is_indexed=True)

     for btype, ptype, _ in args[1:]:
         q.add_node(btype, ptype)

==============================

@@ -32,16 +32,16 @@ class DummyTokenizer:
         self.update_statistics_called = False
         self.update_word_tokens_called = False

-    def update_sql_functions(self, *args):
+    def update_sql_functions(self, *args, **kwargs):
         self.update_sql_functions_called = True

-    def finalize_import(self, *args):
+    def finalize_import(self, *args, **kwargs):
         self.finalize_import_called = True

-    def update_statistics(self, *args):
+    def update_statistics(self, *args, **kwargs):
         self.update_statistics_called = True

-    def update_word_tokens(self, *args):
+    def update_word_tokens(self, *args, **kwargs):
         self.update_word_tokens_called = True
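Widening the stubs to `*args, **kwargs` keeps this dummy tokenizer compatible with whatever keyword arguments callers start forwarding. A hypothetical call that the old `*args`-only stub would have rejected with a TypeError:

tok = DummyTokenizer()
# 'threads' is an invented keyword argument, used purely for illustration;
# the widened stub absorbs it instead of raising.
tok.update_statistics('some-config', threads=4)
assert tok.update_statistics_called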

==============================

@@ -227,16 +227,20 @@ def test_update_statistics_reverse_only(word_table, tokenizer_factory, test_config):

 def test_update_statistics(word_table, table_factory, temp_db_cursor,
                            tokenizer_factory, test_config):
     word_table.add_full_word(1000, 'hello')
+    word_table.add_full_word(1001, 'bye')
     table_factory('search_name',
-                  'place_id BIGINT, name_vector INT[]',
-                  [(12, [1000])])
+                  'place_id BIGINT, name_vector INT[], nameaddress_vector INT[]',
+                  [(12, [1000], [1001])])
     tok = tokenizer_factory()
     tok.update_statistics(test_config)

-    assert temp_db_cursor.scalar("""SELECT count(*) FROM word
-                                    WHERE type = 'W' and
-                                          (info->>'count')::int > 0""") > 0
+    assert temp_db_cursor.scalar("""SELECT count(*) FROM word
+                                    WHERE type = 'W' and word_id = 1000 and
+                                          (info->>'count')::int > 0""") == 1
+    assert temp_db_cursor.scalar("""SELECT count(*) FROM word
+                                    WHERE type = 'W' and word_id = 1001 and
+                                          (info->>'addr_count')::int > 0""") == 1


 def test_normalize_postcode(analyzer):
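The reworked test seeds one word that occurs only as a name (1000) and one that occurs only as an address part (1001), then expects exactly one row per counter. Conceptually, the statistics update reduces to two aggregations over search_name; the SQL below is a sketch of that idea, not the tokenizer's actual implementation:

# Feeds (info->>'count'): frequency of each word id as a name.
NAME_COUNTS = """SELECT unnest(name_vector) AS word_id, count(*)
                   FROM search_name GROUP BY word_id"""

# Feeds (info->>'addr_count'): frequency of each word id as an address part.
ADDR_COUNTS = """SELECT unnest(nameaddress_vector) AS word_id, count(*)
                   FROM search_name GROUP BY word_id"""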