add normalised country name to word table

Country tokens now follow the usual convention of having the
normalized version in the word column and the country code as extra
information in the info column.
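
For illustration, using the values from the tests further down, a country
entry for Switzerland changes shape like this (old layout first):

word_token | type | word    | info
-----------+------+---------+--------------
SCHWEIZ    | C    | ch      | null
SCHWEIZ    | C    | Schweiz | {"cc": "ch"}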
Author: Sarah Hoffmann
Date:   2025-11-27 12:00:47 +01:00
parent f2a122c5c0
commit 81c6cb72e6

7 changed files with 67 additions and 32 deletions


@@ -374,7 +374,7 @@ class SearchBuilder:
             tokens = self.get_country_tokens(assignment.country)
             if not tokens:
                 return None
-            sdata.set_strings('countries', tokens)
+            sdata.set_countries(tokens)
             sdata.penalty += self.query.get_in_word_penalty(assignment.country)
         elif self.details.countries:
             sdata.countries = dbf.WeightedStrings(self.details.countries,


@@ -244,6 +244,21 @@ class SearchData:
             setattr(self, field, wstrs)
 
+    def set_countries(self, tokens: List[Token]) -> None:
+        """ Set the WeightedStrings properties for countries. Multiple
+            entries for the same country are deduplicated and the minimum
+            penalty is used. Adapts the global penalty, so that the
+            minimum penalty is 0.
+        """
+        if tokens:
+            min_penalty = min(t.penalty for t in tokens)
+            self.penalty += min_penalty
+            countries: dict[str, float] = {}
+            for t in tokens:
+                cc = t.get_country()
+                countries[cc] = min(t.penalty - min_penalty, countries.get(cc, 10000))
+            self.countries = WeightedStrings(list(countries.keys()), list(countries.values()))
+
     def set_qualifiers(self, tokens: List[Token]) -> None:
         """ Set the qualifier field from the given tokens.
         """


@@ -59,12 +59,16 @@ class ICUToken(qmod.Token):
         assert self.info
         return self.info.get('class', ''), self.info.get('type', '')
 
-    def rematch(self, norm: str) -> None:
+    def get_country(self) -> str:
+        assert self.info
+        return cast(str, self.info.get('cc', ''))
+
+    def match_penalty(self, norm: str) -> float:
         """ Check how well the token matches the given normalized string
             and add a penalty, if necessary.
         """
         if not self.lookup_word:
-            return
+            return 0.0
 
         seq = difflib.SequenceMatcher(a=self.lookup_word, b=norm)
         distance = 0
@@ -75,7 +79,7 @@ class ICUToken(qmod.Token):
                 distance += max((ato-afrom), (bto-bfrom))
             elif tag != 'equal':
                 distance += abs((ato-afrom) - (bto-bfrom))
-        self.penalty += (distance/len(self.lookup_word))
+        return (distance/len(self.lookup_word))
 
     @staticmethod
     def from_db_row(row: SaRow) -> 'ICUToken':
@@ -330,9 +334,10 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
             norm = ''.join(f"{n.term_normalized}{'' if n.btype == qmod.BREAK_TOKEN else ' '}"
                            for n in query.nodes[start + 1:end + 1]).strip()
             for ttype, tokens in tlist.items():
-                if ttype != qmod.TOKEN_COUNTRY:
-                    for token in tokens:
-                        cast(ICUToken, token).rematch(norm)
+                for token in tokens:
+                    itok = cast(ICUToken, token)
+                    itok.penalty += itok.match_penalty(norm) * \
+                        (1 if ttype in (qmod.TOKEN_WORD, qmod.TOKEN_PARTIAL) else 2)
 
     def compute_break_penalties(self, query: qmod.QueryStruct) -> None:
         """ Set the break penalties for the nodes in the query.


@@ -127,6 +127,12 @@ class Token(ABC):
             category objects.
         """
 
+    @abstractmethod
+    def get_country(self) -> str:
+        """ Return the country code this token is associated with
+            (currently for country tokens only).
+        """
+
 
 @dataclasses.dataclass
 class TokenRange:

@@ -475,20 +475,23 @@ class ICUNameAnalyzer(AbstractAnalyzer):
         assert self.conn is not None
 
         word_tokens = set()
         for name in names:
-            norm_name = self._search_normalized(name.name)
-            if norm_name:
-                word_tokens.add(norm_name)
+            norm_name = self._normalized(name.name)
+            token_name = self._search_normalized(name.name)
+            if norm_name and token_name:
+                word_tokens.add((token_name, norm_name))
 
         with self.conn.cursor() as cur:
             # Get existing names
-            cur.execute("""SELECT word_token, coalesce(info ? 'internal', false) as is_internal
+            cur.execute("""SELECT word_token,
+                                  word as lookup,
+                                  coalesce(info ? 'internal', false) as is_internal
                              FROM word
-                             WHERE type = 'C' and word = %s""",
+                             WHERE type = 'C' and info->>'cc' = %s""",
                         (country_code, ))
             # internal/external names
-            existing_tokens: Dict[bool, Set[str]] = {True: set(), False: set()}
+            existing_tokens: Dict[bool, Set[Tuple[str, str]]] = {True: set(), False: set()}
             for word in cur:
-                existing_tokens[word[1]].add(word[0])
+                existing_tokens[word[2]].add((word[0], word[1]))
 
             # Delete names that no longer exist.
             gone_tokens = existing_tokens[internal] - word_tokens
@@ -496,10 +499,10 @@ class ICUNameAnalyzer(AbstractAnalyzer):
                 gone_tokens.update(existing_tokens[False] & word_tokens)
             if gone_tokens:
                 cur.execute("""DELETE FROM word
-                               USING unnest(%s::text[]) as token
-                               WHERE type = 'C' and word = %s
-                                     and word_token = token""",
-                            (list(gone_tokens), country_code))
+                               USING jsonb_array_elements(%s) as data
+                               WHERE type = 'C' and info->>'cc' = %s
+                                     and word_token = data->>0 and word = data->>1""",
+                            (Jsonb(list(gone_tokens)), country_code))
 
             # Only add those names that are not yet in the list.
             new_tokens = word_tokens - existing_tokens[True]
@@ -508,15 +511,17 @@ class ICUNameAnalyzer(AbstractAnalyzer):
             if new_tokens:
                 if internal:
                     sql = """INSERT INTO word (word_token, type, word, info)
-                             (SELECT token, 'C', %s, '{"internal": "yes"}'
-                                FROM unnest(%s::text[]) as token)
+                             (SELECT data->>0, 'C', data->>1,
+                                     jsonb_build_object('internal', 'yes', 'cc', %s::text)
+                                FROM jsonb_array_elements(%s) as data)
                           """
                 else:
-                    sql = """INSERT INTO word (word_token, type, word)
-                             (SELECT token, 'C', %s
-                                FROM unnest(%s::text[]) as token)
+                    sql = """INSERT INTO word (word_token, type, word, info)
+                             (SELECT data->>0, 'C', data->>1,
+                                     jsonb_build_object('cc', %s::text)
+                                FROM jsonb_array_elements(%s) as data)
                           """
-                cur.execute(sql, (country_code, list(new_tokens)))
+                cur.execute(sql, (country_code, Jsonb(list(new_tokens))))
 
     def process_place(self, place: PlaceInfo) -> Mapping[str, Any]:
         """ Determine tokenizer information about the given place.


@@ -10,6 +10,8 @@ of the table.
 """
 from nominatim_db.db.connection import execute_scalar
 
+from psycopg.types.json import Jsonb
+
 
 class MockIcuWordTable:
     """ A word table for testing using legacy word table structure.
@@ -42,11 +44,11 @@ class MockIcuWordTable:
                        """, (word_token, word, cls, typ, oper))
         self.conn.commit()
 
-    def add_country(self, country_code, word_token):
+    def add_country(self, country_code, word_token, lookup):
         with self.conn.cursor() as cur:
-            cur.execute("""INSERT INTO word (word_token, type, word)
-                           VALUES(%s, 'C', %s)""",
-                        (word_token, country_code))
+            cur.execute("""INSERT INTO word (word_token, type, word, info)
+                           VALUES(%s, 'C', %s, %s)""",
+                        (word_token, lookup, Jsonb({'cc': country_code})))
         self.conn.commit()
 
     def add_postcode(self, word_token, postcode):
@@ -93,7 +95,7 @@ class MockIcuWordTable:
 
     def get_country(self):
         with self.conn.cursor() as cur:
-            cur.execute("SELECT word, word_token FROM word WHERE type = 'C'")
+            cur.execute("SELECT info->>'cc', word_token, word FROM word WHERE type = 'C'")
             result = set((tuple(row) for row in cur))
             assert len(result) == cur.rowcount, "Word table has duplicates."
             return result


@@ -343,16 +343,18 @@ def test_add_country_names_new(analyzer, word_table):
     with analyzer() as anl:
         anl.add_country_names('es', {'name': 'Espagña', 'name:en': 'Spain'})
 
-    assert word_table.get_country() == {('es', 'ESPAGÑA'), ('es', 'SPAIN')}
+    assert word_table.get_country() == {('es', 'ESPAGÑA', 'Espagña'),
+                                        ('es', 'SPAIN', 'Spain')}
 
 
 def test_add_country_names_extend(analyzer, word_table):
-    word_table.add_country('ch', 'SCHWEIZ')
+    word_table.add_country('ch', 'SCHWEIZ', 'Schweiz')
 
     with analyzer() as anl:
         anl.add_country_names('ch', {'name': 'Schweiz', 'name:fr': 'Suisse'})
 
-    assert word_table.get_country() == {('ch', 'SCHWEIZ'), ('ch', 'SUISSE')}
+    assert word_table.get_country() == {('ch', 'SCHWEIZ', 'Schweiz'),
+                                        ('ch', 'SUISSE', 'Suisse')}
 
 
 class TestPlaceNames: