query analyzer for ICU tokenizer

This commit is contained in:
Sarah Hoffmann
2023-05-22 08:46:19 +02:00
parent ff66595f7a
commit 004883bdb1
5 changed files with 547 additions and 4 deletions

View File

@@ -7,7 +7,7 @@
"""
Datastructures for a tokenized query.
"""
from typing import List, Tuple, Optional, NamedTuple
from typing import List, Tuple, Optional, NamedTuple, Iterator
from abc import ABC, abstractmethod
import dataclasses
import enum
@@ -124,6 +124,13 @@ class TokenList:
tokens: List[Token]
def add_penalty(self, penalty: float) -> None:
""" Add the given penalty to all tokens in the list.
"""
for token in self.tokens:
token.penalty += penalty
@dataclasses.dataclass
class QueryNode:
    """ A node of the query representing a break between terms.
@@ -226,6 +233,14 @@ class QueryStruct:
for i in range(trange.start, trange.end)]
def iter_token_lists(self) -> Iterator[Tuple[int, QueryNode, TokenList]]:
    """ Yield a (node index, node, token list) triple for every
        token list attached to any node of this query, in node order.
    """
    idx = 0
    for node in self.nodes:
        for token_list in node.starting:
            yield idx, node, token_list
        idx += 1
def find_lookup_word_by_id(self, token: int) -> str:
""" Find the first token with the given token ID and return
its lookup word. Returns 'None' if no such token exists.