add normalised country name to word table

Country tokens now follow the usual convention of having the
normalized version in the word column and the extra info about the
country code in the info column.
This commit is contained in:
Sarah Hoffmann
2025-11-27 12:00:47 +01:00
parent f2a122c5c0
commit 81c6cb72e6
7 changed files with 67 additions and 32 deletions

View File

@@ -59,12 +59,16 @@ class ICUToken(qmod.Token):
assert self.info
return self.info.get('class', ''), self.info.get('type', '')
def rematch(self, norm: str) -> None:
def get_country(self) -> str:
assert self.info
return cast(str, self.info.get('cc', ''))
def match_penalty(self, norm: str) -> float:
""" Check how well the token matches the given normalized string
and add a penalty, if necessary.
"""
if not self.lookup_word:
return
return 0.0
seq = difflib.SequenceMatcher(a=self.lookup_word, b=norm)
distance = 0
@@ -75,7 +79,7 @@ class ICUToken(qmod.Token):
distance += max((ato-afrom), (bto-bfrom))
elif tag != 'equal':
distance += abs((ato-afrom) - (bto-bfrom))
self.penalty += (distance/len(self.lookup_word))
return (distance/len(self.lookup_word))
@staticmethod
def from_db_row(row: SaRow) -> 'ICUToken':
@@ -330,9 +334,10 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
norm = ''.join(f"{n.term_normalized}{'' if n.btype == qmod.BREAK_TOKEN else ' '}"
for n in query.nodes[start + 1:end + 1]).strip()
for ttype, tokens in tlist.items():
if ttype != qmod.TOKEN_COUNTRY:
for token in tokens:
cast(ICUToken, token).rematch(norm)
for token in tokens:
itok = cast(ICUToken, token)
itok.penalty += itok.match_penalty(norm) * \
(1 if ttype in (qmod.TOKEN_WORD, qmod.TOKEN_PARTIAL) else 2)
def compute_break_penalties(self, query: qmod.QueryStruct) -> None:
""" Set the break penalties for the nodes in the query.