mirror of
https://github.com/osm-search/Nominatim.git
synced 2026-03-06 18:14:16 +00:00
add normalised country name to word table
Country tokens now follow the usual convention of having the normalized version in the word column and the extra info about the country code in the info column.
This commit is contained in:
@@ -374,7 +374,7 @@ class SearchBuilder:
|
||||
tokens = self.get_country_tokens(assignment.country)
|
||||
if not tokens:
|
||||
return None
|
||||
sdata.set_strings('countries', tokens)
|
||||
sdata.set_countries(tokens)
|
||||
sdata.penalty += self.query.get_in_word_penalty(assignment.country)
|
||||
elif self.details.countries:
|
||||
sdata.countries = dbf.WeightedStrings(self.details.countries,
|
||||
|
||||
@@ -244,6 +244,21 @@ class SearchData:
|
||||
|
||||
setattr(self, field, wstrs)
|
||||
|
||||
def set_countries(self, tokens: List[Token]) -> None:
    """ Populate the 'countries' WeightedStrings field from country tokens.

        Duplicate country codes are collapsed, keeping the smallest
        penalty seen for each code. The smallest penalty over all
        tokens is moved into the global penalty so that the per-country
        penalties start at 0.
    """
    if tokens:
        # Shift the minimum token penalty into the global penalty.
        base = min(tok.penalty for tok in tokens)
        self.penalty += base

        # Deduplicate by country code, keeping the lowest shifted penalty.
        best: dict[str, float] = {}
        for tok in tokens:
            code = tok.get_country()
            best[code] = min(tok.penalty - base, best.get(code, 10000))

        self.countries = WeightedStrings(list(best), list(best.values()))
|
||||
|
||||
def set_qualifiers(self, tokens: List[Token]) -> None:
|
||||
""" Set the qulaifier field from the given tokens.
|
||||
"""
|
||||
|
||||
@@ -59,12 +59,16 @@ class ICUToken(qmod.Token):
|
||||
assert self.info
|
||||
return self.info.get('class', ''), self.info.get('type', '')
|
||||
|
||||
def rematch(self, norm: str) -> None:
|
||||
def get_country(self) -> str:
    """ Return the country code from the token's info payload.

        The info dict must be present; the code is read from the 'cc'
        key, falling back to the empty string when absent.
    """
    info = self.info
    assert info
    return cast(str, info.get('cc', ''))
|
||||
|
||||
def match_penalty(self, norm: str) -> float:
|
||||
""" Check how well the token matches the given normalized string
|
||||
and add a penalty, if necessary.
|
||||
"""
|
||||
if not self.lookup_word:
|
||||
return
|
||||
return 0.0
|
||||
|
||||
seq = difflib.SequenceMatcher(a=self.lookup_word, b=norm)
|
||||
distance = 0
|
||||
@@ -75,7 +79,7 @@ class ICUToken(qmod.Token):
|
||||
distance += max((ato-afrom), (bto-bfrom))
|
||||
elif tag != 'equal':
|
||||
distance += abs((ato-afrom) - (bto-bfrom))
|
||||
self.penalty += (distance/len(self.lookup_word))
|
||||
return (distance/len(self.lookup_word))
|
||||
|
||||
@staticmethod
|
||||
def from_db_row(row: SaRow) -> 'ICUToken':
|
||||
@@ -330,9 +334,10 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
|
||||
norm = ''.join(f"{n.term_normalized}{'' if n.btype == qmod.BREAK_TOKEN else ' '}"
|
||||
for n in query.nodes[start + 1:end + 1]).strip()
|
||||
for ttype, tokens in tlist.items():
|
||||
if ttype != qmod.TOKEN_COUNTRY:
|
||||
for token in tokens:
|
||||
cast(ICUToken, token).rematch(norm)
|
||||
for token in tokens:
|
||||
itok = cast(ICUToken, token)
|
||||
itok.penalty += itok.match_penalty(norm) * \
|
||||
(1 if ttype in (qmod.TOKEN_WORD, qmod.TOKEN_PARTIAL) else 2)
|
||||
|
||||
def compute_break_penalties(self, query: qmod.QueryStruct) -> None:
|
||||
""" Set the break penalties for the nodes in the query.
|
||||
|
||||
@@ -127,6 +127,12 @@ class Token(ABC):
|
||||
category objects.
|
||||
"""
|
||||
|
||||
@abstractmethod
def get_country(self) -> str:
    """ Return the country code this token is associated with
        (currently for country tokens only).
    """
|
||||
|
||||
|
||||
@dataclasses.dataclass
|
||||
class TokenRange:
|
||||
|
||||
Reference in New Issue
Block a user