Merge pull request #3894 from lonvia/country-names-with-word-lookup

Add normalized form of country names to country tokens in word table
This commit is contained in:
Sarah Hoffmann
2025-12-01 14:54:24 +01:00
committed by GitHub
14 changed files with 104 additions and 37 deletions

View File

@@ -15,7 +15,7 @@ classifiers = [
"Operating System :: OS Independent", "Operating System :: OS Independent",
] ]
dependencies = [ dependencies = [
"psycopg", "psycopg<3.3",
"python-dotenv", "python-dotenv",
"jinja2", "jinja2",
"pyYAML>=5.1", "pyYAML>=5.1",

View File

@@ -374,7 +374,7 @@ class SearchBuilder:
tokens = self.get_country_tokens(assignment.country) tokens = self.get_country_tokens(assignment.country)
if not tokens: if not tokens:
return None return None
sdata.set_strings('countries', tokens) sdata.set_countries(tokens)
sdata.penalty += self.query.get_in_word_penalty(assignment.country) sdata.penalty += self.query.get_in_word_penalty(assignment.country)
elif self.details.countries: elif self.details.countries:
sdata.countries = dbf.WeightedStrings(self.details.countries, sdata.countries = dbf.WeightedStrings(self.details.countries,

View File

@@ -244,6 +244,21 @@ class SearchData:
setattr(self, field, wstrs) setattr(self, field, wstrs)
def set_countries(self, tokens: List[Token]) -> None:
""" Set the WeightedStrings properties for countries. Multiple
entries for the same country are deduplicated and the minimum
penalty is used. Adapts the global penalty, so that the
minimum penalty is 0.
"""
if tokens:
min_penalty = min(t.penalty for t in tokens)
self.penalty += min_penalty
countries: dict[str, float] = {}
for t in tokens:
cc = t.get_country()
countries[cc] = min(t.penalty - min_penalty, countries.get(cc, 10000))
self.countries = WeightedStrings(list(countries.keys()), list(countries.values()))
def set_qualifiers(self, tokens: List[Token]) -> None: def set_qualifiers(self, tokens: List[Token]) -> None:
""" Set the qulaifier field from the given tokens. """ Set the qulaifier field from the given tokens.
""" """

View File

@@ -59,12 +59,16 @@ class ICUToken(qmod.Token):
assert self.info assert self.info
return self.info.get('class', ''), self.info.get('type', '') return self.info.get('class', ''), self.info.get('type', '')
def rematch(self, norm: str) -> None: def get_country(self) -> str:
assert self.info
return cast(str, self.info.get('cc', ''))
def match_penalty(self, norm: str) -> float:
""" Check how well the token matches the given normalized string """ Check how well the token matches the given normalized string
and add a penalty, if necessary. and add a penalty, if necessary.
""" """
if not self.lookup_word: if not self.lookup_word:
return return 0.0
seq = difflib.SequenceMatcher(a=self.lookup_word, b=norm) seq = difflib.SequenceMatcher(a=self.lookup_word, b=norm)
distance = 0 distance = 0
@@ -75,7 +79,7 @@ class ICUToken(qmod.Token):
distance += max((ato-afrom), (bto-bfrom)) distance += max((ato-afrom), (bto-bfrom))
elif tag != 'equal': elif tag != 'equal':
distance += abs((ato-afrom) - (bto-bfrom)) distance += abs((ato-afrom) - (bto-bfrom))
self.penalty += (distance/len(self.lookup_word)) return (distance/len(self.lookup_word))
@staticmethod @staticmethod
def from_db_row(row: SaRow) -> 'ICUToken': def from_db_row(row: SaRow) -> 'ICUToken':
@@ -330,9 +334,10 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
norm = ''.join(f"{n.term_normalized}{'' if n.btype == qmod.BREAK_TOKEN else ' '}" norm = ''.join(f"{n.term_normalized}{'' if n.btype == qmod.BREAK_TOKEN else ' '}"
for n in query.nodes[start + 1:end + 1]).strip() for n in query.nodes[start + 1:end + 1]).strip()
for ttype, tokens in tlist.items(): for ttype, tokens in tlist.items():
if ttype != qmod.TOKEN_COUNTRY: for token in tokens:
for token in tokens: itok = cast(ICUToken, token)
cast(ICUToken, token).rematch(norm) itok.penalty += itok.match_penalty(norm) * \
(1 if ttype in (qmod.TOKEN_WORD, qmod.TOKEN_PARTIAL) else 2)
def compute_break_penalties(self, query: qmod.QueryStruct) -> None: def compute_break_penalties(self, query: qmod.QueryStruct) -> None:
""" Set the break penalties for the nodes in the query. """ Set the break penalties for the nodes in the query.

View File

@@ -127,6 +127,12 @@ class Token(ABC):
category objects. category objects.
""" """
@abstractmethod
def get_country(self) -> str:
""" Return the country code this tojen is associated with
(currently for country tokens only).
"""
@dataclasses.dataclass @dataclasses.dataclass
class TokenRange: class TokenRange:

View File

@@ -475,20 +475,23 @@ class ICUNameAnalyzer(AbstractAnalyzer):
assert self.conn is not None assert self.conn is not None
word_tokens = set() word_tokens = set()
for name in names: for name in names:
norm_name = self._search_normalized(name.name) norm_name = self._normalized(name.name)
if norm_name: token_name = self._search_normalized(name.name)
word_tokens.add(norm_name) if norm_name and token_name:
word_tokens.add((token_name, norm_name))
with self.conn.cursor() as cur: with self.conn.cursor() as cur:
# Get existing names # Get existing names
cur.execute("""SELECT word_token, coalesce(info ? 'internal', false) as is_internal cur.execute("""SELECT word_token,
word as lookup,
coalesce(info ? 'internal', false) as is_internal
FROM word FROM word
WHERE type = 'C' and word = %s""", WHERE type = 'C' and info->>'cc' = %s""",
(country_code, )) (country_code, ))
# internal/external names # internal/external names
existing_tokens: Dict[bool, Set[str]] = {True: set(), False: set()} existing_tokens: Dict[bool, Set[Tuple[str, str]]] = {True: set(), False: set()}
for word in cur: for word in cur:
existing_tokens[word[1]].add(word[0]) existing_tokens[word[2]].add((word[0], word[1]))
# Delete names that no longer exist. # Delete names that no longer exist.
gone_tokens = existing_tokens[internal] - word_tokens gone_tokens = existing_tokens[internal] - word_tokens
@@ -496,10 +499,10 @@ class ICUNameAnalyzer(AbstractAnalyzer):
gone_tokens.update(existing_tokens[False] & word_tokens) gone_tokens.update(existing_tokens[False] & word_tokens)
if gone_tokens: if gone_tokens:
cur.execute("""DELETE FROM word cur.execute("""DELETE FROM word
USING unnest(%s::text[]) as token USING jsonb_array_elements(%s) as data
WHERE type = 'C' and word = %s WHERE type = 'C' and info->>'cc' = %s
and word_token = token""", and word_token = data->>0 and word = data->>1""",
(list(gone_tokens), country_code)) (Jsonb(list(gone_tokens)), country_code))
# Only add those names that are not yet in the list. # Only add those names that are not yet in the list.
new_tokens = word_tokens - existing_tokens[True] new_tokens = word_tokens - existing_tokens[True]
@@ -508,15 +511,17 @@ class ICUNameAnalyzer(AbstractAnalyzer):
if new_tokens: if new_tokens:
if internal: if internal:
sql = """INSERT INTO word (word_token, type, word, info) sql = """INSERT INTO word (word_token, type, word, info)
(SELECT token, 'C', %s, '{"internal": "yes"}' (SELECT data->>0, 'C', data->>1,
FROM unnest(%s::text[]) as token) jsonb_build_object('internal', 'yes', 'cc', %s::text)
FROM jsonb_array_elements(%s) as data)
""" """
else: else:
sql = """INSERT INTO word (word_token, type, word) sql = """INSERT INTO word (word_token, type, word, info)
(SELECT token, 'C', %s (SELECT data->>0, 'C', data->>1,
FROM unnest(%s::text[]) as token) jsonb_build_object('cc', %s::text)
FROM jsonb_array_elements(%s) as data)
""" """
cur.execute(sql, (country_code, list(new_tokens))) cur.execute(sql, (country_code, Jsonb(list(new_tokens))))
def process_place(self, place: PlaceInfo) -> Mapping[str, Any]: def process_place(self, place: PlaceInfo) -> Mapping[str, Any]:
""" Determine tokenizer information about the given place. """ Determine tokenizer information about the given place.

View File

@@ -2,7 +2,7 @@
# #
# This file is part of Nominatim. (https://nominatim.org) # This file is part of Nominatim. (https://nominatim.org)
# #
# Copyright (C) 2024 by the Nominatim developer community. # Copyright (C) 2025 by the Nominatim developer community.
# For a full list of authors see the git log. # For a full list of authors see the git log.
""" """
Functions for database migration to newer software versions. Functions for database migration to newer software versions.
@@ -18,6 +18,7 @@ from ..db.connection import connect, Connection, \
from ..db.sql_preprocessor import SQLPreprocessor from ..db.sql_preprocessor import SQLPreprocessor
from ..version import NominatimVersion, NOMINATIM_VERSION, parse_version from ..version import NominatimVersion, NOMINATIM_VERSION, parse_version
from ..tokenizer import factory as tokenizer_factory from ..tokenizer import factory as tokenizer_factory
from ..data.country_info import create_country_names, setup_country_config
from . import refresh from . import refresh
LOG = logging.getLogger() LOG = logging.getLogger()
@@ -156,3 +157,25 @@ def create_place_entrance_table(conn: Connection, config: Configuration, **_: An
CREATE UNIQUE INDEX place_entrance_osm_id_idx ON place_entrance CREATE UNIQUE INDEX place_entrance_osm_id_idx ON place_entrance
USING BTREE (osm_id); USING BTREE (osm_id);
""") """)
@_migration(5, 2, 99, 1)
def convert_country_tokens(conn: Connection, config: Configuration, **_: Any) -> None:
""" Convert country word tokens
Country tokens now save the country in the info field instead of the
word. This migration removes all country tokens from the word table
and reimports the default country name. This means that custom names
are lost. If you need them back, invalidate the OSM objects containing
the names by setting indexed_status to 2 and then reindex the database.
"""
tokenizer = tokenizer_factory.get_tokenizer_for_db(config)
# There is only one tokenizer at the time of migration, so we make
# some assumptions here about the structure of the database. This will
# fail if somebody has written a custom tokenizer.
with conn.cursor() as cur:
cur.execute("DELETE FROM word WHERE type = 'C'")
conn.commit()
setup_country_config(config)
create_country_names(conn, tokenizer, config.get_str_list('LANGUAGES'))

View File

@@ -55,7 +55,7 @@ def parse_version(version: str) -> NominatimVersion:
return NominatimVersion(*[int(x) for x in parts[:2] + parts[2].split('-')]) return NominatimVersion(*[int(x) for x in parts[:2] + parts[2].split('-')])
NOMINATIM_VERSION = parse_version('5.2.0-0') NOMINATIM_VERSION = parse_version('5.2.99-0')
POSTGRESQL_REQUIRED_VERSION = (12, 0) POSTGRESQL_REQUIRED_VERSION = (12, 0)
POSTGIS_REQUIRED_VERSION = (3, 0) POSTGIS_REQUIRED_VERSION = (3, 0)

View File

@@ -17,6 +17,9 @@ class MyToken(query.Token):
def get_category(self): def get_category(self):
return 'this', 'that' return 'this', 'that'
def get_country(self):
return 'cc'
def mktoken(tid: int): def mktoken(tid: int):
return MyToken(penalty=3.0, token=tid, count=1, addr_count=1, return MyToken(penalty=3.0, token=tid, count=1, addr_count=1,

View File

@@ -21,6 +21,9 @@ class MyToken(Token):
def get_category(self): def get_category(self):
return 'this', 'that' return 'this', 'that'
def get_country(self):
return self.lookup_word
def make_query(*args): def make_query(*args):
q = QueryStruct([Phrase(qmod.PHRASE_ANY, '')]) q = QueryStruct([Phrase(qmod.PHRASE_ANY, '')])

View File

@@ -20,6 +20,9 @@ class MyToken(Token):
def get_category(self): def get_category(self):
return 'this', 'that' return 'this', 'that'
def get_country(self):
return 'cc'
def make_query(*args): def make_query(*args):
q = QueryStruct([Phrase(args[0][1], '')]) q = QueryStruct([Phrase(args[0][1], '')])

View File

@@ -99,7 +99,7 @@ def test_address_simple_places(apiobj, frontend, atype, address, search):
def test_address_country(apiobj, frontend): def test_address_country(apiobj, frontend):
apiobj.add_word_table([(None, 'ro', 'C', 'ro', None)]) apiobj.add_word_table([(None, 'ro', 'C', 'ro', {'cc': 'ro'})])
apiobj.add_country('ro', 'POLYGON((0 0, 0 1, 1 1, 1 0, 0 0))') apiobj.add_country('ro', 'POLYGON((0 0, 0 1, 1 1, 1 0, 0 0))')
apiobj.add_country_name('ro', {'name': 'România'}) apiobj.add_country_name('ro', {'name': 'România'})

View File

@@ -10,6 +10,8 @@ of the table.
""" """
from nominatim_db.db.connection import execute_scalar from nominatim_db.db.connection import execute_scalar
from psycopg.types.json import Jsonb
class MockIcuWordTable: class MockIcuWordTable:
""" A word table for testing using legacy word table structure. """ A word table for testing using legacy word table structure.
@@ -42,11 +44,11 @@ class MockIcuWordTable:
""", (word_token, word, cls, typ, oper)) """, (word_token, word, cls, typ, oper))
self.conn.commit() self.conn.commit()
def add_country(self, country_code, word_token): def add_country(self, country_code, word_token, lookup):
with self.conn.cursor() as cur: with self.conn.cursor() as cur:
cur.execute("""INSERT INTO word (word_token, type, word) cur.execute("""INSERT INTO word (word_token, type, word, info)
VALUES(%s, 'C', %s)""", VALUES(%s, 'C', %s, %s)""",
(word_token, country_code)) (word_token, lookup, Jsonb({'cc': country_code})))
self.conn.commit() self.conn.commit()
def add_postcode(self, word_token, postcode): def add_postcode(self, word_token, postcode):
@@ -93,7 +95,7 @@ class MockIcuWordTable:
def get_country(self): def get_country(self):
with self.conn.cursor() as cur: with self.conn.cursor() as cur:
cur.execute("SELECT word, word_token FROM word WHERE type = 'C'") cur.execute("SELECT info->>'cc', word_token, word FROM word WHERE type = 'C'")
result = set((tuple(row) for row in cur)) result = set((tuple(row) for row in cur))
assert len(result) == cur.rowcount, "Word table has duplicates." assert len(result) == cur.rowcount, "Word table has duplicates."
return result return result

View File

@@ -343,16 +343,18 @@ def test_add_country_names_new(analyzer, word_table):
with analyzer() as anl: with analyzer() as anl:
anl.add_country_names('es', {'name': 'Espagña', 'name:en': 'Spain'}) anl.add_country_names('es', {'name': 'Espagña', 'name:en': 'Spain'})
assert word_table.get_country() == {('es', 'ESPAGÑA'), ('es', 'SPAIN')} assert word_table.get_country() == {('es', 'ESPAGÑA', 'Espagña'),
('es', 'SPAIN', 'Spain')}
def test_add_country_names_extend(analyzer, word_table): def test_add_country_names_extend(analyzer, word_table):
word_table.add_country('ch', 'SCHWEIZ') word_table.add_country('ch', 'SCHWEIZ', 'Schweiz')
with analyzer() as anl: with analyzer() as anl:
anl.add_country_names('ch', {'name': 'Schweiz', 'name:fr': 'Suisse'}) anl.add_country_names('ch', {'name': 'Schweiz', 'name:fr': 'Suisse'})
assert word_table.get_country() == {('ch', 'SCHWEIZ'), ('ch', 'SUISSE')} assert word_table.get_country() == {('ch', 'SCHWEIZ', 'Schweiz'),
('ch', 'SUISSE', 'Suisse')}
class TestPlaceNames: class TestPlaceNames:
@@ -403,7 +405,7 @@ class TestPlaceNames:
info = self.analyzer.process_place(place) info = self.analyzer.process_place(place)
self.expect_name_terms(info, '#norge', 'norge') self.expect_name_terms(info, '#norge', 'norge')
assert word_table.get_country() == {('no', 'NORGE')} assert word_table.get_country() == {('no', 'NORGE', 'Norge')}
class TestPlaceAddress: class TestPlaceAddress: