do not run near queries on qualifier words

There is too much potential for confusion (e.g. 'Rio Grande' read
as 'river near Grande') for too little gain. Use near phrases
instead.
This commit is contained in:
Sarah Hoffmann
2024-01-06 17:49:58 +01:00
parent f03ec3ea12
commit 10a5424a71
2 changed files with 2 additions and 7 deletions

View File

@@ -8,7 +8,6 @@
Implementation of query analysis for the ICU tokenizer. Implementation of query analysis for the ICU tokenizer.
""" """
from typing import Tuple, Dict, List, Optional, NamedTuple, Iterator, Any, cast from typing import Tuple, Dict, List, Optional, NamedTuple, Iterator, Any, cast
from copy import copy
from collections import defaultdict from collections import defaultdict
import dataclasses import dataclasses
import difflib import difflib
@@ -188,10 +187,6 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
query.add_token(trange, qmod.TokenType.NEAR_ITEM, token) query.add_token(trange, qmod.TokenType.NEAR_ITEM, token)
else: else:
query.add_token(trange, qmod.TokenType.QUALIFIER, token) query.add_token(trange, qmod.TokenType.QUALIFIER, token)
if trange.start == 0 or trange.end == query.num_token_slots():
token = copy(token)
token.penalty += 0.1 * (query.num_token_slots())
query.add_token(trange, qmod.TokenType.NEAR_ITEM, token)
else: else:
query.add_token(trange, DB_TO_TOKEN_TYPE[row.type], token) query.add_token(trange, DB_TO_TOKEN_TYPE[row.type], token)

View File

@@ -148,9 +148,9 @@ async def test_qualifier_words(conn):
query = await ana.analyze_query(make_phrase('foo BAR foo BAR foo')) query = await ana.analyze_query(make_phrase('foo BAR foo BAR foo'))
assert query.num_token_slots() == 5 assert query.num_token_slots() == 5
assert set(t.ttype for t in query.nodes[0].starting) == {TokenType.NEAR_ITEM, TokenType.QUALIFIER} assert set(t.ttype for t in query.nodes[0].starting) == {TokenType.QUALIFIER}
assert set(t.ttype for t in query.nodes[2].starting) == {TokenType.QUALIFIER} assert set(t.ttype for t in query.nodes[2].starting) == {TokenType.QUALIFIER}
assert set(t.ttype for t in query.nodes[4].starting) == {TokenType.NEAR_ITEM, TokenType.QUALIFIER} assert set(t.ttype for t in query.nodes[4].starting) == {TokenType.QUALIFIER}
@pytest.mark.asyncio @pytest.mark.asyncio