add normalised country name to word table

Country tokens now follow the usual convention of having the
normalized version in the word column and the extra info about the
country code in the info column.
This commit is contained in:
Sarah Hoffmann
2025-11-27 12:00:47 +01:00
parent f2a122c5c0
commit 81c6cb72e6
7 changed files with 67 additions and 32 deletions

View File

@@ -244,6 +244,21 @@ class SearchData:
setattr(self, field, wstrs)
def set_countries(self, tokens: List[Token]) -> None:
    """ Fill the countries field from the given country tokens.

        Every country code ends up in the result exactly once, weighted
        with the smallest penalty found among its tokens. The smallest
        penalty overall is added to the global penalty and subtracted
        from each country penalty, so that the best country has a
        penalty of 0. Nothing is changed when the token list is empty.
    """
    if not tokens:
        return

    base = min(token.penalty for token in tokens)
    self.penalty += base

    # Maps country code -> smallest penalty relative to `base`.
    best: dict[str, float] = {}
    for token in tokens:
        code = token.get_country()
        relative = token.penalty - base
        # 10000 is the placeholder used for a country not seen before.
        best[code] = min(relative, best.get(code, 10000))

    self.countries = WeightedStrings(list(best), list(best.values()))
def set_qualifiers(self, tokens: List[Token]) -> None:
""" Set the qulaifier field from the given tokens.
"""