From 81c6cb72e62f8003d330892906522958a24878e1 Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann
Date: Thu, 27 Nov 2025 12:00:47 +0100
Subject: [PATCH 1/4] add normalised country name to word table

Country tokens now follow the usual convention of having the normalized
version in the word column and the extra info about the country code in
the info column.
---
 src/nominatim_api/search/db_search_builder.py |  2 +-
 src/nominatim_api/search/db_search_fields.py  | 15 +++++++
 src/nominatim_api/search/icu_tokenizer.py     | 17 +++++---
 src/nominatim_api/search/query.py             |  6 +++
 src/nominatim_db/tokenizer/icu_tokenizer.py   | 39 +++++++++++--------
 test/python/mock_icu_word_table.py            | 12 +++---
 test/python/tokenizer/test_icu.py             |  8 ++--
 7 files changed, 67 insertions(+), 32 deletions(-)

diff --git a/src/nominatim_api/search/db_search_builder.py b/src/nominatim_api/search/db_search_builder.py
index f90c6d7f..591d32ca 100644
--- a/src/nominatim_api/search/db_search_builder.py
+++ b/src/nominatim_api/search/db_search_builder.py
@@ -374,7 +374,7 @@ class SearchBuilder:
             tokens = self.get_country_tokens(assignment.country)
             if not tokens:
                 return None
-            sdata.set_strings('countries', tokens)
+            sdata.set_countries(tokens)
             sdata.penalty += self.query.get_in_word_penalty(assignment.country)
         elif self.details.countries:
             sdata.countries = dbf.WeightedStrings(self.details.countries,
diff --git a/src/nominatim_api/search/db_search_fields.py b/src/nominatim_api/search/db_search_fields.py
index 669e2a5e..70b8ad7b 100644
--- a/src/nominatim_api/search/db_search_fields.py
+++ b/src/nominatim_api/search/db_search_fields.py
@@ -244,6 +244,21 @@ class SearchData:
 
         setattr(self, field, wstrs)
 
+    def set_countries(self, tokens: List[Token]) -> None:
+        """ Set the WeightedStrings properties for countries. Multiple
+            entries for the same country are deduplicated and the minimum
+            penalty is used. Adjusts the global penalty so that the
+            minimum penalty is 0.
+        """
+        if tokens:
+            min_penalty = min(t.penalty for t in tokens)
+            self.penalty += min_penalty
+            countries: dict[str, float] = {}
+            for t in tokens:
+                cc = t.get_country()
+                countries[cc] = min(t.penalty - min_penalty, countries.get(cc, 10000))
+            self.countries = WeightedStrings(list(countries.keys()), list(countries.values()))
+
     def set_qualifiers(self, tokens: List[Token]) -> None:
         """ Set the qulaifier field from the given tokens.
         """
diff --git a/src/nominatim_api/search/icu_tokenizer.py b/src/nominatim_api/search/icu_tokenizer.py
index 4ab85fd3..50c133a0 100644
--- a/src/nominatim_api/search/icu_tokenizer.py
+++ b/src/nominatim_api/search/icu_tokenizer.py
@@ -59,12 +59,16 @@ class ICUToken(qmod.Token):
         assert self.info
         return self.info.get('class', ''), self.info.get('type', '')
 
-    def rematch(self, norm: str) -> None:
+    def get_country(self) -> str:
+        assert self.info
+        return cast(str, self.info.get('cc', ''))
+
+    def match_penalty(self, norm: str) -> float:
         """ Check how well the token matches the given normalized string
             and add a penalty, if necessary.
""" if not self.lookup_word: - return + return 0.0 seq = difflib.SequenceMatcher(a=self.lookup_word, b=norm) distance = 0 @@ -75,7 +79,7 @@ class ICUToken(qmod.Token): distance += max((ato-afrom), (bto-bfrom)) elif tag != 'equal': distance += abs((ato-afrom) - (bto-bfrom)) - self.penalty += (distance/len(self.lookup_word)) + return (distance/len(self.lookup_word)) @staticmethod def from_db_row(row: SaRow) -> 'ICUToken': @@ -330,9 +334,10 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer): norm = ''.join(f"{n.term_normalized}{'' if n.btype == qmod.BREAK_TOKEN else ' '}" for n in query.nodes[start + 1:end + 1]).strip() for ttype, tokens in tlist.items(): - if ttype != qmod.TOKEN_COUNTRY: - for token in tokens: - cast(ICUToken, token).rematch(norm) + for token in tokens: + itok = cast(ICUToken, token) + itok.penalty += itok.match_penalty(norm) * \ + (1 if ttype in (qmod.TOKEN_WORD, qmod.TOKEN_PARTIAL) else 2) def compute_break_penalties(self, query: qmod.QueryStruct) -> None: """ Set the break penalties for the nodes in the query. diff --git a/src/nominatim_api/search/query.py b/src/nominatim_api/search/query.py index f64dd1db..c08e6499 100644 --- a/src/nominatim_api/search/query.py +++ b/src/nominatim_api/search/query.py @@ -127,6 +127,12 @@ class Token(ABC): category objects. """ + @abstractmethod + def get_country(self) -> str: + """ Return the country code this tojen is associated with + (currently for country tokens only). + """ + @dataclasses.dataclass class TokenRange: diff --git a/src/nominatim_db/tokenizer/icu_tokenizer.py b/src/nominatim_db/tokenizer/icu_tokenizer.py index b7fa8682..5d90bb27 100644 --- a/src/nominatim_db/tokenizer/icu_tokenizer.py +++ b/src/nominatim_db/tokenizer/icu_tokenizer.py @@ -475,20 +475,23 @@ class ICUNameAnalyzer(AbstractAnalyzer): assert self.conn is not None word_tokens = set() for name in names: - norm_name = self._search_normalized(name.name) - if norm_name: - word_tokens.add(norm_name) + norm_name = self._normalized(name.name) + token_name = self._search_normalized(name.name) + if norm_name and token_name: + word_tokens.add((token_name, norm_name)) with self.conn.cursor() as cur: # Get existing names - cur.execute("""SELECT word_token, coalesce(info ? 'internal', false) as is_internal + cur.execute("""SELECT word_token, + word as lookup, + coalesce(info ? 'internal', false) as is_internal FROM word - WHERE type = 'C' and word = %s""", + WHERE type = 'C' and info->>'cc' = %s""", (country_code, )) # internal/external names - existing_tokens: Dict[bool, Set[str]] = {True: set(), False: set()} + existing_tokens: Dict[bool, Set[Tuple[str, str]]] = {True: set(), False: set()} for word in cur: - existing_tokens[word[1]].add(word[0]) + existing_tokens[word[2]].add((word[0], word[1])) # Delete names that no longer exist. gone_tokens = existing_tokens[internal] - word_tokens @@ -496,10 +499,10 @@ class ICUNameAnalyzer(AbstractAnalyzer): gone_tokens.update(existing_tokens[False] & word_tokens) if gone_tokens: cur.execute("""DELETE FROM word - USING unnest(%s::text[]) as token - WHERE type = 'C' and word = %s - and word_token = token""", - (list(gone_tokens), country_code)) + USING jsonb_array_elements(%s) as data + WHERE type = 'C' and info->>'cc' = %s + and word_token = data->>0 and word = data->>1""", + (Jsonb(list(gone_tokens)), country_code)) # Only add those names that are not yet in the list. 
             new_tokens = word_tokens - existing_tokens[True]
@@ -508,15 +511,17 @@ class ICUNameAnalyzer(AbstractAnalyzer):
             if new_tokens:
                 if internal:
                     sql = """INSERT INTO word (word_token, type, word, info)
-                             (SELECT token, 'C', %s, '{"internal": "yes"}'
-                                FROM unnest(%s::text[]) as token)
+                             (SELECT data->>0, 'C', data->>1,
+                                     jsonb_build_object('internal', 'yes', 'cc', %s::text)
+                                FROM jsonb_array_elements(%s) as data)
                           """
                 else:
-                    sql = """INSERT INTO word (word_token, type, word)
-                             (SELECT token, 'C', %s
-                                FROM unnest(%s::text[]) as token)
+                    sql = """INSERT INTO word (word_token, type, word, info)
+                             (SELECT data->>0, 'C', data->>1,
+                                     jsonb_build_object('cc', %s::text)
+                                FROM jsonb_array_elements(%s) as data)
                           """
-            cur.execute(sql, (country_code, list(new_tokens)))
+            cur.execute(sql, (country_code, Jsonb(list(new_tokens))))
 
     def process_place(self, place: PlaceInfo) -> Mapping[str, Any]:
         """ Determine tokenizer information about the given place.
diff --git a/test/python/mock_icu_word_table.py b/test/python/mock_icu_word_table.py
index b26025a0..083246cb 100644
--- a/test/python/mock_icu_word_table.py
+++ b/test/python/mock_icu_word_table.py
@@ -10,6 +10,8 @@ of the table.
 """
 from nominatim_db.db.connection import execute_scalar
 
+from psycopg.types.json import Jsonb
+
 
 class MockIcuWordTable:
     """ A word table for testing using legacy word table structure.
@@ -42,11 +44,11 @@ class MockIcuWordTable:
                       """, (word_token, word, cls, typ, oper))
         self.conn.commit()
 
-    def add_country(self, country_code, word_token):
+    def add_country(self, country_code, word_token, lookup):
         with self.conn.cursor() as cur:
-            cur.execute("""INSERT INTO word (word_token, type, word)
-                           VALUES(%s, 'C', %s)""",
-                        (word_token, country_code))
+            cur.execute("""INSERT INTO word (word_token, type, word, info)
+                           VALUES(%s, 'C', %s, %s)""",
+                        (word_token, lookup, Jsonb({'cc': country_code})))
         self.conn.commit()
 
     def add_postcode(self, word_token, postcode):
@@ -93,7 +95,7 @@ class MockIcuWordTable:
 
     def get_country(self):
         with self.conn.cursor() as cur:
-            cur.execute("SELECT word, word_token FROM word WHERE type = 'C'")
+            cur.execute("SELECT info->>'cc', word_token, word FROM word WHERE type = 'C'")
             result = set((tuple(row) for row in cur))
             assert len(result) == cur.rowcount, "Word table has duplicates."
            return result
diff --git a/test/python/tokenizer/test_icu.py b/test/python/tokenizer/test_icu.py
index 6d2e9ce7..39796822 100644
--- a/test/python/tokenizer/test_icu.py
+++ b/test/python/tokenizer/test_icu.py
@@ -343,16 +343,18 @@ def test_add_country_names_new(analyzer, word_table):
     with analyzer() as anl:
         anl.add_country_names('es', {'name': 'Espagña', 'name:en': 'Spain'})
 
-    assert word_table.get_country() == {('es', 'ESPAGÑA'), ('es', 'SPAIN')}
+    assert word_table.get_country() == {('es', 'ESPAGÑA', 'Espagña'),
+                                        ('es', 'SPAIN', 'Spain')}
 
 
 def test_add_country_names_extend(analyzer, word_table):
-    word_table.add_country('ch', 'SCHWEIZ')
+    word_table.add_country('ch', 'SCHWEIZ', 'Schweiz')
 
     with analyzer() as anl:
         anl.add_country_names('ch', {'name': 'Schweiz', 'name:fr': 'Suisse'})
 
-    assert word_table.get_country() == {('ch', 'SCHWEIZ'), ('ch', 'SUISSE')}
+    assert word_table.get_country() == {('ch', 'SCHWEIZ', 'Schweiz'),
+                                        ('ch', 'SUISSE', 'Suisse')}
 
 
 class TestPlaceNames:

From 9447c90b09c995710ec2a1ee4347e276d69acdf9 Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann
Date: Thu, 27 Nov 2025 16:05:49 +0100
Subject: [PATCH 2/4] adapt tests to new country token format

---
 test/python/api/search/test_api_search_query.py  | 3 +++
 test/python/api/search/test_db_search_builder.py | 3 +++
 test/python/api/search/test_token_assignment.py  | 3 +++
 test/python/api/test_api_search.py               | 2 +-
 test/python/tokenizer/test_icu.py                | 2 +-
 5 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/test/python/api/search/test_api_search_query.py b/test/python/api/search/test_api_search_query.py
index ea3b9772..1f229111 100644
--- a/test/python/api/search/test_api_search_query.py
+++ b/test/python/api/search/test_api_search_query.py
@@ -17,6 +17,9 @@ class MyToken(query.Token):
     def get_category(self):
         return 'this', 'that'
 
+    def get_country(self):
+        return 'cc'
+
 
 def mktoken(tid: int):
     return MyToken(penalty=3.0, token=tid, count=1, addr_count=1,
diff --git a/test/python/api/search/test_db_search_builder.py b/test/python/api/search/test_db_search_builder.py
index d304e197..18beb6f2 100644
--- a/test/python/api/search/test_db_search_builder.py
+++ b/test/python/api/search/test_db_search_builder.py
@@ -21,6 +21,9 @@ class MyToken(Token):
     def get_category(self):
         return 'this', 'that'
 
+    def get_country(self):
+        return self.lookup_word
+
 
 def make_query(*args):
     q = QueryStruct([Phrase(qmod.PHRASE_ANY, '')])
diff --git a/test/python/api/search/test_token_assignment.py b/test/python/api/search/test_token_assignment.py
index 0b2d7cb9..bc47d358 100644
--- a/test/python/api/search/test_token_assignment.py
+++ b/test/python/api/search/test_token_assignment.py
@@ -20,6 +20,9 @@ class MyToken(Token):
     def get_category(self):
         return 'this', 'that'
 
+    def get_country(self):
+        return 'cc'
+
 
 def make_query(*args):
     q = QueryStruct([Phrase(args[0][1], '')])
diff --git a/test/python/api/test_api_search.py b/test/python/api/test_api_search.py
index 59a83aa9..04317dec 100644
--- a/test/python/api/test_api_search.py
+++ b/test/python/api/test_api_search.py
@@ -99,7 +99,7 @@ def test_address_simple_places(apiobj, frontend, atype, address, search):
 
 
 def test_address_country(apiobj, frontend):
-    apiobj.add_word_table([(None, 'ro', 'C', 'ro', None)])
+    apiobj.add_word_table([(None, 'ro', 'C', 'ro', {'cc': 'ro'})])
     apiobj.add_country('ro', 'POLYGON((0 0, 0 1, 1 1, 1 0, 0 0))')
     apiobj.add_country_name('ro', {'name': 'România'})
 
diff --git a/test/python/tokenizer/test_icu.py b/test/python/tokenizer/test_icu.py
index 39796822..cf4140c9 100644
--- a/test/python/tokenizer/test_icu.py
+++ b/test/python/tokenizer/test_icu.py
@@ -405,7 +405,7 @@ class TestPlaceNames:
         info = self.analyzer.process_place(place)
 
         self.expect_name_terms(info, '#norge', 'norge')
-        assert word_table.get_country() == {('no', 'NORGE')}
+        assert word_table.get_country() == {('no', 'NORGE', 'Norge')}
 
 
 class TestPlaceAddress:

From cd1b1736a9b9215a468274e034ce13b6d390aa2d Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann
Date: Thu, 27 Nov 2025 16:51:29 +0100
Subject: [PATCH 3/4] add migration for changed country token format

---
 src/nominatim_db/tools/migration.py | 25 ++++++++++++++++++++++++-
 src/nominatim_db/version.py         |  2 +-
 2 files changed, 25 insertions(+), 2 deletions(-)

diff --git a/src/nominatim_db/tools/migration.py b/src/nominatim_db/tools/migration.py
index 5763a694..ab6860f2 100644
--- a/src/nominatim_db/tools/migration.py
+++ b/src/nominatim_db/tools/migration.py
@@ -2,7 +2,7 @@
 #
 # This file is part of Nominatim. (https://nominatim.org)
 #
-# Copyright (C) 2024 by the Nominatim developer community.
+# Copyright (C) 2025 by the Nominatim developer community.
 # For a full list of authors see the git log.
 """
 Functions for database migration to newer software versions.
@@ -18,6 +18,7 @@ from ..db.connection import connect, Connection, \
 from ..db.sql_preprocessor import SQLPreprocessor
 from ..version import NominatimVersion, NOMINATIM_VERSION, parse_version
 from ..tokenizer import factory as tokenizer_factory
+from ..data.country_info import create_country_names, setup_country_config
 from . import refresh
 
 LOG = logging.getLogger()
@@ -156,3 +157,25 @@ def create_place_entrance_table(conn: Connection, config: Configuration, **_: An
         CREATE UNIQUE INDEX place_entrance_osm_id_idx
           ON place_entrance USING BTREE (osm_id);
     """)
+
+
+@_migration(5, 2, 99, 1)
+def convert_country_tokens(conn: Connection, config: Configuration, **_: Any) -> None:
+    """ Convert country word tokens
+
+        Country tokens now save the country in the info field instead of the
+        word. This migration removes all country tokens from the word table
+        and reimports the default country names. This means that custom names
+        are lost. If you need them back, invalidate the OSM objects containing
+        the names by setting indexed_status to 2 and then reindex the database.
+    """
+    tokenizer = tokenizer_factory.get_tokenizer_for_db(config)
+    # There is only one tokenizer at the time of migration, so we make
+    # some assumptions here about the structure of the database. This will
+    # fail if somebody has written a custom tokenizer.
+    with conn.cursor() as cur:
+        cur.execute("DELETE FROM word WHERE type = 'C'")
+    conn.commit()
+
+    setup_country_config(config)
+    create_country_names(conn, tokenizer, config.get_str_list('LANGUAGES'))
diff --git a/src/nominatim_db/version.py b/src/nominatim_db/version.py
index e035ec57..1c9f5e87 100644
--- a/src/nominatim_db/version.py
+++ b/src/nominatim_db/version.py
@@ -55,7 +55,7 @@ def parse_version(version: str) -> NominatimVersion:
     return NominatimVersion(*[int(x) for x in parts[:2] + parts[2].split('-')])
 
 
-NOMINATIM_VERSION = parse_version('5.2.0-0')
+NOMINATIM_VERSION = parse_version('5.2.99-0')
 
 POSTGRESQL_REQUIRED_VERSION = (12, 0)
 POSTGIS_REQUIRED_VERSION = (3, 0)

From 23db1ab9816f26002fef8cb52e487e77afaea301 Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann
Date: Mon, 1 Dec 2025 14:23:36 +0100
Subject: [PATCH 4/4] avoid most recent psycopg 3.3 release

---
 packaging/nominatim-db/pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/packaging/nominatim-db/pyproject.toml b/packaging/nominatim-db/pyproject.toml
index 80eec85f..6df9560a 100644
--- a/packaging/nominatim-db/pyproject.toml
+++ b/packaging/nominatim-db/pyproject.toml
@@ -15,7 +15,7 @@ classifiers = [
     "Operating System :: OS Independent",
 ]
 dependencies = [
-    "psycopg",
+    "psycopg<3.3",
     "python-dotenv",
     "jinja2",
     "pyYAML>=5.1",
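
Illustrative note (not part of the patches): after patch 1, a country entry
in the word table keeps the normalized name in the word column and the
country code in the info column. Sketched below with the values from the
adapted tests above, using the column order word_token/type/word/info of the
mock word table:

# Shape of a country row in the word table, before and after the series.
old_row = ('SCHWEIZ', 'C', 'ch', None)               # country code kept in word
new_row = ('SCHWEIZ', 'C', 'Schweiz', {'cc': 'ch'})  # normalized name in word,
                                                     # country code in info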
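
Likewise illustrative: a self-contained sketch of the deduplication that
SearchData.set_countries() in patch 1 performs. Token objects are reduced
here to (country_code, penalty) tuples and the WeightedStrings wrapper is
left out; the real method obtains the code via Token.get_country():

# Mirror of the set_countries() logic on simplified input.
tokens = [('ch', 0.3), ('ch', 0.1), ('de', 0.4)]

# The smallest token penalty moves into the global search penalty ...
min_penalty = min(p for _, p in tokens)   # 0.1

# ... and every country keeps its best penalty relative to that minimum;
# 10000 acts as "+infinity" for countries that have not been seen yet.
countries: dict[str, float] = {}
for cc, p in tokens:
    countries[cc] = min(p - min_penalty, countries.get(cc, 10000))

print(countries)   # {'ch': 0.0, 'de': 0.3} (up to float rounding)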