remove word_number counting for phrases

We can just examine the break types to know if we are dealing
with a partial token.
This commit is contained in:
Sarah Hoffmann
2025-02-20 17:36:50 +01:00
parent adabfee3be
commit abc911079e

View File

@@ -50,15 +50,16 @@ PENALTY_IN_TOKEN_BREAK = {
 @dataclasses.dataclass
 class QueryPart:
     """ Normalized and transliterated form of a single term in the query.
         When the term came out of a split during the transliteration,
         the normalized string is the full word before transliteration.
-        The word number keeps track of the word before transliteration
-        and can be used to identify partial transliterated terms.
+        Check the subsequent break type to figure out if the word is
+        continued.
         Penalty is the break penalty for the break following the token.
     """
     token: str
     normalized: str
-    word_number: int
     penalty: float
@@ -256,7 +257,6 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
         parts: QueryParts = []
         phrase_start = 0
         words = defaultdict(list)
-        wordnr = 0
         for phrase in query.source:
             query.nodes[-1].ptype = phrase.ptype
             phrase_split = re.split('([ :-])', phrase.text)
@@ -271,12 +271,11 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
                 if trans:
                     for term in trans.split(' '):
                         if term:
-                            parts.append(QueryPart(term, word, wordnr,
+                            parts.append(QueryPart(term, word,
                                                    PENALTY_IN_TOKEN_BREAK[qmod.BreakType.TOKEN]))
                             query.add_node(qmod.BreakType.TOKEN, phrase.ptype)
                 query.nodes[-1].btype = qmod.BreakType(breakchar)
             parts[-1].penalty = PENALTY_IN_TOKEN_BREAK[qmod.BreakType(breakchar)]
-            wordnr += 1
 
         for word, wrange in yield_words(parts, phrase_start):
             words[word].append(wrange)
@@ -323,7 +322,7 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
             elif tlist.ttype not in (qmod.TokenType.COUNTRY, qmod.TokenType.PARTIAL):
                 norm = parts[i].normalized
                 for j in range(i + 1, tlist.end):
-                    if parts[j - 1].word_number != parts[j].word_number:
+                    if node.btype != qmod.BreakType.TOKEN:
                         norm += ' ' + parts[j].normalized
                 for token in tlist.tokens:
                     cast(ICUToken, token).rematch(norm)