Compare commits


23 Commits

Author SHA1 Message Date
Sarah Hoffmann
96d04e3a2e Merge pull request #3894 from lonvia/country-names-with-word-lookup
Add normalized form of country names to country tokens in word table
2025-12-01 14:54:24 +01:00
Sarah Hoffmann
23db1ab981 avoid most recent psycopg 3.3 release 2025-12-01 14:23:36 +01:00
Sarah Hoffmann
cd1b1736a9 add migration for changed country token format 2025-12-01 13:10:18 +01:00
Sarah Hoffmann
9447c90b09 adapt tests to new country token format 2025-12-01 13:10:18 +01:00
Sarah Hoffmann
81c6cb72e6 add normalised country name to word table
Country tokens now follow the usual convention of having the
normalized version in the word column and the extra info about the
country code in the info column.
2025-12-01 13:10:18 +01:00
Sarah Hoffmann
f2a122c5c0 Merge pull request #3893 from lonvia/nature-reserve
Prefer leisure=nature_reserve as main tag over boundary=protected_area
2025-12-01 11:36:17 +01:00
Sarah Hoffmann
57ef0e1f98 prefer leisure=nature_reserve as main tag 2025-12-01 09:47:55 +01:00
Sarah Hoffmann
922667b650 Merge pull request #3892 from daishu0000/master
Add success message to setup.log: related to #3891
2025-11-30 14:13:51 +01:00
Sarah Hoffmann
fba803167c fix imprecise import 2025-11-30 11:50:55 +01:00
daishu0000
782df52ea0 Add success message to db log 2025-11-30 01:53:40 +08:00
Sarah Hoffmann
c36da68a48 Merge pull request #3890 from mtmail/remove-nat-name
Skip nat_name in default import
2025-11-28 14:13:30 +01:00
marc tobias
716de13bc9 Skip nat_name in default import 2025-11-28 11:35:35 +01:00
Sarah Hoffmann
1df56d7548 Merge pull request #3889 from lonvia/improve-linkage-code
Small improvements to place linking code
2025-11-26 22:11:11 +01:00
Sarah Hoffmann
9cfef7a31a prefer wikidata over name match when linking 2025-11-26 17:44:47 +01:00
Sarah Hoffmann
139678f367 fix linkage removal when nothing has changed 2025-11-26 17:03:19 +01:00
Sarah Hoffmann
e578c60ff4 Merge pull request #3874 from vytas7/falcon-4.2-typing
Adapt type annotations to Falcon App type changes
2025-11-16 16:12:35 +01:00
Vytautas Liuolia
7b4a3c8500 Add from __future__ import annotations to delay evaluation 2025-11-16 14:41:25 +01:00
Vytautas Liuolia
7751f9a6b6 Adapt type annotations to Falcon App type changes
See also: https://falcon.readthedocs.io/en/latest/api/typing.html#generic-app-types
2025-11-10 20:09:17 +01:00
Sarah Hoffmann
303ac42b47 Merge pull request #3862 from mtmail/skip-all-zero-postcodes
Postcode sanitizer now skips values which are only zeros
2025-10-31 10:36:05 +01:00
Sarah Hoffmann
6a2d2daad5 Merge pull request #3863 from lonvia/improve-bdd-test-names
Add custom pytest collector for BDD feature files
2025-10-31 10:19:56 +01:00
Sarah Hoffmann
a51c771107 disable improved BDD test naming for pytest < 8
Needs the improved test collector introduced in pytest 8.0.
2025-10-30 20:50:00 +01:00
Sarah Hoffmann
55547723bf add custom pytest collector for BDD feature files 2025-10-30 17:56:23 +01:00
marc tobias
362088775f postcode sanitizer skips postcodes which are only zeros 2025-10-30 13:45:29 +01:00
26 changed files with 234 additions and 69 deletions

View File

@@ -52,6 +52,15 @@ To run the functional tests, do
pytest test/bdd
You can run a single feature file using expression matching:
pytest test/bdd -k osm2pgsql/import/entrances.feature
This even works for running single tests by adding the line number of the
scenario header, like this:
pytest test/bdd -k 'osm2pgsql/import/entrances.feature and L4'
The BDD tests create their own databases. You can set the names of these databases
through configuration variables in your `pytest.ini`:
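A hypothetical sketch of such a `pytest.ini` (the option names below are illustrative stand-ins, not confirmed configuration keys; check the BDD conftest for the actual names):

```ini
[pytest]
# Hypothetical option names for the databases created by the BDD tests.
nominatim_test_db = test_nominatim
nominatim_api_test_db = test_api_nominatim
```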

View File

@@ -117,6 +117,7 @@ module.MAIN_TAGS.all_boundaries = {
boundary = {'named',
place = 'delete',
land_area = 'delete',
protected_area = 'fallback',
postal_code = 'always'},
landuse = 'fallback',
place = 'always'
@@ -198,7 +199,7 @@ module.MAIN_TAGS_POIS = function (group)
no = group},
landuse = {cemetery = 'always'},
leisure = {'always',
nature_reserve = 'fallback',
nature_reserve = 'named',
swimming_pool = 'named',
garden = 'named',
common = 'named',
@@ -321,7 +322,6 @@ module.NAME_TAGS = {}
module.NAME_TAGS.core = {main = {'name', 'name:*',
'int_name', 'int_name:*',
'nat_name', 'nat_name:*',
'reg_name', 'reg_name:*',
'loc_name', 'loc_name:*',
'old_name', 'old_name:*',

View File

@@ -341,6 +341,22 @@ BEGIN
END IF;
END IF;
IF bnd.extratags ? 'wikidata' THEN
FOR linked_placex IN
SELECT * FROM placex
WHERE placex.class = 'place' AND placex.osm_type = 'N'
AND placex.extratags ? 'wikidata' -- needed to select right index
AND placex.extratags->'wikidata' = bnd.extratags->'wikidata'
AND (placex.linked_place_id is null or placex.linked_place_id = bnd.place_id)
AND placex.rank_search < 26
AND _st_covers(bnd.geometry, placex.geometry)
ORDER BY lower(name->'name') = bnd_name desc
LOOP
{% if debug %}RAISE WARNING 'Found wikidata-matching place node %', linked_placex.osm_id;{% endif %}
RETURN linked_placex;
END LOOP;
END IF;
-- If extratags has a place tag, look for linked nodes by their place type.
-- Area and node still have to have the same name.
IF bnd.extratags ? 'place' and bnd.extratags->'place' != 'postcode'
@@ -361,22 +377,6 @@ BEGIN
END LOOP;
END IF;
IF bnd.extratags ? 'wikidata' THEN
FOR linked_placex IN
SELECT * FROM placex
WHERE placex.class = 'place' AND placex.osm_type = 'N'
AND placex.extratags ? 'wikidata' -- needed to select right index
AND placex.extratags->'wikidata' = bnd.extratags->'wikidata'
AND (placex.linked_place_id is null or placex.linked_place_id = bnd.place_id)
AND placex.rank_search < 26
AND _st_covers(bnd.geometry, placex.geometry)
ORDER BY lower(name->'name') = bnd_name desc
LOOP
{% if debug %}RAISE WARNING 'Found wikidata-matching place node %', linked_placex.osm_id;{% endif %}
RETURN linked_placex;
END LOOP;
END IF;
-- Name searches can be done for ways as well as relations
IF bnd_name is not null THEN
{% if debug %}RAISE WARNING 'Looking for nodes with matching names';{% endif %}
@@ -874,7 +874,7 @@ BEGIN
-- Remove linkage, if we have computed a different new linkee.
UPDATE placex SET linked_place_id = null, indexed_status = 2
WHERE linked_place_id = NEW.place_id
and (linked_place is null or linked_place_id != linked_place);
and (linked_place is null or place_id != linked_place);
-- update not necessary for osmline, because linked_place_id does not exist
-- Postcodes are just here to compute the centroids. They are not searchable

View File

@@ -15,7 +15,7 @@ classifiers = [
"Operating System :: OS Independent",
]
dependencies = [
"psycopg",
"psycopg<3.3",
"python-dotenv",
"jinja2",
"pyYAML>=5.1",

View File

@@ -10,7 +10,7 @@ Helper classes and functions for formatting results into API responses.
from typing import Type, TypeVar, Dict, List, Callable, Any, Mapping, Optional, cast
from collections import defaultdict
from pathlib import Path
import importlib
import importlib.util
from .server.content_types import CONTENT_JSON

View File

@@ -374,7 +374,7 @@ class SearchBuilder:
tokens = self.get_country_tokens(assignment.country)
if not tokens:
return None
sdata.set_strings('countries', tokens)
sdata.set_countries(tokens)
sdata.penalty += self.query.get_in_word_penalty(assignment.country)
elif self.details.countries:
sdata.countries = dbf.WeightedStrings(self.details.countries,

View File

@@ -244,6 +244,21 @@ class SearchData:
setattr(self, field, wstrs)
def set_countries(self, tokens: List[Token]) -> None:
""" Set the WeightedStrings properties for countries. Multiple
entries for the same country are deduplicated and the minimum
penalty is used. Adapts the global penalty, so that the
minimum penalty is 0.
"""
if tokens:
min_penalty = min(t.penalty for t in tokens)
self.penalty += min_penalty
countries: dict[str, float] = {}
for t in tokens:
cc = t.get_country()
countries[cc] = min(t.penalty - min_penalty, countries.get(cc, 10000))
self.countries = WeightedStrings(list(countries.keys()), list(countries.values()))
def set_qualifiers(self, tokens: List[Token]) -> None:
""" Set the qulaifier field from the given tokens.
"""

View File

@@ -59,12 +59,16 @@ class ICUToken(qmod.Token):
assert self.info
return self.info.get('class', ''), self.info.get('type', '')
def rematch(self, norm: str) -> None:
def get_country(self) -> str:
assert self.info
return cast(str, self.info.get('cc', ''))
def match_penalty(self, norm: str) -> float:
""" Check how well the token matches the given normalized string
and add a penalty, if necessary.
"""
if not self.lookup_word:
return
return 0.0
seq = difflib.SequenceMatcher(a=self.lookup_word, b=norm)
distance = 0
@@ -75,7 +79,7 @@ class ICUToken(qmod.Token):
distance += max((ato-afrom), (bto-bfrom))
elif tag != 'equal':
distance += abs((ato-afrom) - (bto-bfrom))
self.penalty += (distance/len(self.lookup_word))
return (distance/len(self.lookup_word))
@staticmethod
def from_db_row(row: SaRow) -> 'ICUToken':
@@ -330,9 +334,10 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
norm = ''.join(f"{n.term_normalized}{'' if n.btype == qmod.BREAK_TOKEN else ' '}"
for n in query.nodes[start + 1:end + 1]).strip()
for ttype, tokens in tlist.items():
if ttype != qmod.TOKEN_COUNTRY:
for token in tokens:
cast(ICUToken, token).rematch(norm)
for token in tokens:
itok = cast(ICUToken, token)
itok.penalty += itok.match_penalty(norm) * \
(1 if ttype in (qmod.TOKEN_WORD, qmod.TOKEN_PARTIAL) else 2)
def compute_break_penalties(self, query: qmod.QueryStruct) -> None:
""" Set the break penalties for the nodes in the query.

View File

@@ -127,6 +127,12 @@ class Token(ABC):
category objects.
"""
@abstractmethod
def get_country(self) -> str:
""" Return the country code this tojen is associated with
(currently for country tokens only).
"""
@dataclasses.dataclass
class TokenRange:

View File

@@ -7,6 +7,8 @@
"""
Server implementation using the falcon webserver framework.
"""
from __future__ import annotations
from typing import Optional, Mapping, Any, List, cast
from pathlib import Path
import asyncio
@@ -161,7 +163,7 @@ class APIMiddleware:
def __init__(self, project_dir: Path, environ: Optional[Mapping[str, str]]) -> None:
self.api = NominatimAPIAsync(project_dir, environ)
self.app: Optional[App] = None
self.app: Optional[App[Request, Response]] = None
@property
def config(self) -> Configuration:
@@ -169,7 +171,7 @@ class APIMiddleware:
"""
return self.api.config
def set_app(self, app: App) -> None:
def set_app(self, app: App[Request, Response]) -> None:
""" Set the Falcon application this middleware is connected to.
"""
self.app = app
@@ -193,7 +195,7 @@ class APIMiddleware:
def get_application(project_dir: Path,
environ: Optional[Mapping[str, str]] = None) -> App:
environ: Optional[Mapping[str, str]] = None) -> App[Request, Response]:
""" Create a Nominatim Falcon ASGI application.
"""
apimw = APIMiddleware(project_dir, environ)
@@ -215,7 +217,7 @@ def get_application(project_dir: Path,
return app
def run_wsgi() -> App:
def run_wsgi() -> App[Request, Response]:
""" Entry point for uvicorn.
Make sure uvicorn is run from the project directory.
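For reference, a minimal sketch of the generic annotation style this adopts (assuming Falcon 4.x generic app types as described in the linked documentation; the `__future__` import keeps the subscripted annotation unevaluated at runtime):

```python
from __future__ import annotations

from falcon.asgi import App, Request, Response


def make_app() -> App[Request, Response]:
    # The type parameters only matter to static checkers; at runtime
    # this is a plain ASGI application.
    return App()
```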

View File

@@ -23,6 +23,7 @@ from ..tokenizer.base import AbstractTokenizer
from ..version import NOMINATIM_VERSION
from .args import NominatimArgs
import time
LOG = logging.getLogger()
@@ -86,6 +87,8 @@ class SetupAll:
from ..tools import database_import, postcodes, freeze
from ..indexer.indexer import Indexer
start_time = time.time()
num_threads = args.threads or psutil.cpu_count() or 1
country_info.setup_country_config(args.config)
@@ -138,6 +141,10 @@ class SetupAll:
LOG.warning('Recompute word counts')
tokenizer.update_statistics(args.config, threads=num_threads)
end_time = time.time()
elapsed = end_time - start_time
LOG.warning(f'Import completed successfully in {elapsed:.2f} seconds.')
self._finalize_database(args.config.get_libpq_dsn(), args.offline)
return 0

View File

@@ -29,6 +29,9 @@ class CountryPostcodeMatcher:
self.norm_pattern = re.compile(f'\\s*(?:{country_code.upper()}[ -]?)?({pc_pattern})\\s*')
self.pattern = re.compile(pc_pattern)
# We want to exclude 0000, 00-000, 000 00 etc
self.zero_pattern = re.compile(r'^[0\- ]+$')
self.output = config.get('output', r'\g<0>')
def match(self, postcode: str) -> Optional[Match[str]]:
@@ -40,7 +43,10 @@ class CountryPostcodeMatcher:
normalized = self.norm_pattern.fullmatch(postcode.upper())
if normalized:
return self.pattern.fullmatch(normalized.group(1))
match = self.pattern.fullmatch(normalized.group(1))
if match and self.zero_pattern.match(match.string):
return None
return match
return None
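The effect of the new `zero_pattern` can be checked in isolation (standalone sketch; the sanitizer applies it to the already pattern-matched postcode):

```python
import re

zero_pattern = re.compile(r'^[0\- ]+$')

# All-zero postcodes match the exclusion pattern and are dropped;
# anything with a non-zero digit survives.
for pc in ('00000', '00-000', '000 00', '01234'):
    print(pc, bool(zero_pattern.match(pc)))
# -> True, True, True, False
```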

View File

@@ -475,20 +475,23 @@ class ICUNameAnalyzer(AbstractAnalyzer):
assert self.conn is not None
word_tokens = set()
for name in names:
norm_name = self._search_normalized(name.name)
if norm_name:
word_tokens.add(norm_name)
norm_name = self._normalized(name.name)
token_name = self._search_normalized(name.name)
if norm_name and token_name:
word_tokens.add((token_name, norm_name))
with self.conn.cursor() as cur:
# Get existing names
cur.execute("""SELECT word_token, coalesce(info ? 'internal', false) as is_internal
cur.execute("""SELECT word_token,
word as lookup,
coalesce(info ? 'internal', false) as is_internal
FROM word
WHERE type = 'C' and word = %s""",
WHERE type = 'C' and info->>'cc' = %s""",
(country_code, ))
# internal/external names
existing_tokens: Dict[bool, Set[str]] = {True: set(), False: set()}
existing_tokens: Dict[bool, Set[Tuple[str, str]]] = {True: set(), False: set()}
for word in cur:
existing_tokens[word[1]].add(word[0])
existing_tokens[word[2]].add((word[0], word[1]))
# Delete names that no longer exist.
gone_tokens = existing_tokens[internal] - word_tokens
@@ -496,10 +499,10 @@ class ICUNameAnalyzer(AbstractAnalyzer):
gone_tokens.update(existing_tokens[False] & word_tokens)
if gone_tokens:
cur.execute("""DELETE FROM word
USING unnest(%s::text[]) as token
WHERE type = 'C' and word = %s
and word_token = token""",
(list(gone_tokens), country_code))
USING jsonb_array_elements(%s) as data
WHERE type = 'C' and info->>'cc' = %s
and word_token = data->>0 and word = data->>1""",
(Jsonb(list(gone_tokens)), country_code))
# Only add those names that are not yet in the list.
new_tokens = word_tokens - existing_tokens[True]
@@ -508,15 +511,17 @@ class ICUNameAnalyzer(AbstractAnalyzer):
if new_tokens:
if internal:
sql = """INSERT INTO word (word_token, type, word, info)
(SELECT token, 'C', %s, '{"internal": "yes"}'
FROM unnest(%s::text[]) as token)
(SELECT data->>0, 'C', data->>1,
jsonb_build_object('internal', 'yes', 'cc', %s::text)
FROM jsonb_array_elements(%s) as data)
"""
else:
sql = """INSERT INTO word (word_token, type, word)
(SELECT token, 'C', %s
FROM unnest(%s::text[]) as token)
sql = """INSERT INTO word (word_token, type, word, info)
(SELECT data->>0, 'C', data->>1,
jsonb_build_object('cc', %s::text)
FROM jsonb_array_elements(%s) as data)
"""
cur.execute(sql, (country_code, list(new_tokens)))
cur.execute(sql, (country_code, Jsonb(list(new_tokens))))
def process_place(self, place: PlaceInfo) -> Mapping[str, Any]:
""" Determine tokenizer information about the given place.

View File

@@ -2,7 +2,7 @@
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# Copyright (C) 2025 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Functions for database migration to newer software versions.
@@ -18,6 +18,7 @@ from ..db.connection import connect, Connection, \
from ..db.sql_preprocessor import SQLPreprocessor
from ..version import NominatimVersion, NOMINATIM_VERSION, parse_version
from ..tokenizer import factory as tokenizer_factory
from ..data.country_info import create_country_names, setup_country_config
from . import refresh
LOG = logging.getLogger()
@@ -156,3 +157,25 @@ def create_place_entrance_table(conn: Connection, config: Configuration, **_: An
CREATE UNIQUE INDEX place_entrance_osm_id_idx ON place_entrance
USING BTREE (osm_id);
""")
@_migration(5, 2, 99, 1)
def convert_country_tokens(conn: Connection, config: Configuration, **_: Any) -> None:
""" Convert country word tokens
Country tokens now save the country in the info field instead of the
word. This migration removes all country tokens from the word table
and reimports the default country names. This means that custom names
are lost. If you need them back, invalidate the OSM objects containing
the names by setting indexed_status to 2 and then reindex the database.
"""
tokenizer = tokenizer_factory.get_tokenizer_for_db(config)
# There is only one tokenizer at the time of migration, so we make
# some assumptions here about the structure of the database. This will
# fail if somebody has written a custom tokenizer.
with conn.cursor() as cur:
cur.execute("DELETE FROM word WHERE type = 'C'")
conn.commit()
setup_country_config(config)
create_country_names(conn, tokenizer, config.get_str_list('LANGUAGES'))
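If custom country names need restoring after this migration, the docstring's hint amounts to something like the following sketch (the WHERE clause is a hypothetical selection of country-level objects, not taken from the code; reindex afterwards with `nominatim index`):

```python
import psycopg

# Hypothetical sketch: flag country-level objects for reindexing so
# their name tokens are recreated. The selection is illustrative only.
with psycopg.connect("dbname=nominatim") as conn:
    with conn.cursor() as cur:
        cur.execute("""UPDATE placex SET indexed_status = 2
                       WHERE class = 'boundary' and type = 'administrative'
                             and admin_level = 2""")
    conn.commit()
```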

View File

@@ -55,7 +55,7 @@ def parse_version(version: str) -> NominatimVersion:
return NominatimVersion(*[int(x) for x in parts[:2] + parts[2].split('-')])
NOMINATIM_VERSION = parse_version('5.2.0-0')
NOMINATIM_VERSION = parse_version('5.2.99-0')
POSTGRESQL_REQUIRED_VERSION = (12, 0)
POSTGIS_REQUIRED_VERSION = (3, 0)

View File

@@ -9,6 +9,7 @@ Fixtures for BDD test steps
"""
import sys
import json
import re
from pathlib import Path
import psycopg
@@ -20,7 +21,8 @@ sys.path.insert(0, str(SRC_DIR / 'src'))
import pytest
from pytest_bdd.parsers import re as step_parse
from pytest_bdd import given, when, then
from pytest_bdd import given, when, then, scenario
from pytest_bdd.feature import get_features
pytest.register_assert_rewrite('utils')
@@ -373,3 +375,57 @@ def check_place_missing_lines(db_conn, table, osm_type, osm_id, osm_class):
with db_conn.cursor() as cur:
assert cur.execute(sql, params).fetchone()[0] == 0
if pytest.version_tuple >= (8, 0, 0):
def pytest_pycollect_makemodule(module_path, parent):
return BddTestCollector.from_parent(parent, path=module_path)
class BddTestCollector(pytest.Module):
def __init__(self, **kwargs):
super().__init__(**kwargs)
def collect(self):
for item in super().collect():
yield item
if hasattr(self.obj, 'PYTEST_BDD_SCENARIOS'):
for path in self.obj.PYTEST_BDD_SCENARIOS:
for feature in get_features([str(Path(self.path.parent, path).resolve())]):
yield FeatureFile.from_parent(self,
name=str(Path(path, feature.rel_filename)),
path=Path(feature.filename),
feature=feature)
# borrowed from pytest-bdd: src/pytest_bdd/scenario.py
def make_python_name(string: str) -> str:
"""Make python attribute name out of a given string."""
string = re.sub(r"\W", "", string.replace(" ", "_"))
return re.sub(r"^\d+_*", "", string).lower()
class FeatureFile(pytest.File):
class obj:
pass
def __init__(self, feature, **kwargs):
self.feature = feature
super().__init__(**kwargs)
def collect(self):
for sname, sobject in self.feature.scenarios.items():
class_name = f"L{sobject.line_number}"
test_name = "test_" + make_python_name(sname)
@scenario(self.feature.filename, sname)
def _test():
pass
tclass = type(class_name, (),
{test_name: staticmethod(_test)})
setattr(self.obj, class_name, tclass)
yield pytest.Class.from_parent(self, name=class_name, obj=tclass)
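Naming each generated class `L<line number>` is what makes the line-based selection from the documentation hunk above work: `pytest test/bdd -k 'osm2pgsql/import/entrances.feature and L4'` matches the class created for the scenario starting at line 4 of that feature file.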

View File

@@ -15,7 +15,7 @@ import xml.etree.ElementTree as ET
import pytest
from pytest_bdd.parsers import re as step_parse
from pytest_bdd import scenarios, when, given, then
from pytest_bdd import when, given, then
from nominatim_db import cli
from nominatim_db.config import Configuration
@@ -150,4 +150,8 @@ def parse_api_json_response(api_response, fmt, num):
return result
scenarios('features/api')
if pytest.version_tuple >= (8, 0, 0):
PYTEST_BDD_SCENARIOS = ['features/api']
else:
from pytest_bdd import scenarios
scenarios('features/api')

View File

@@ -15,7 +15,7 @@ import re
import psycopg
import pytest
from pytest_bdd import scenarios, when, then, given
from pytest_bdd import when, then, given
from pytest_bdd.parsers import re as step_parse
from utils.place_inserter import PlaceColumn
@@ -276,4 +276,8 @@ def then_check_interpolation_table_negative(db_conn, oid):
assert cur.fetchone()[0] == 0
scenarios('features/db')
if pytest.version_tuple >= (8, 0, 0):
PYTEST_BDD_SCENARIOS = ['features/db']
else:
from pytest_bdd import scenarios
scenarios('features/db')

View File

@@ -11,7 +11,7 @@ import asyncio
import random
import pytest
from pytest_bdd import scenarios, when, then, given
from pytest_bdd import when, then, given
from pytest_bdd.parsers import re as step_parse
from nominatim_db import cli
@@ -106,4 +106,8 @@ def check_place_content(db_conn, datatable, node_grid, table, exact):
check_table_content(db_conn, table, datatable, grid=node_grid, exact=bool(exact))
scenarios('features/osm2pgsql')
if pytest.version_tuple >= (8, 0, 0):
PYTEST_BDD_SCENARIOS = ['features/osm2pgsql']
else:
from pytest_bdd import scenarios
scenarios('features/osm2pgsql')

View File

@@ -17,6 +17,9 @@ class MyToken(query.Token):
def get_category(self):
return 'this', 'that'
def get_country(self):
return 'cc'
def mktoken(tid: int):
return MyToken(penalty=3.0, token=tid, count=1, addr_count=1,

View File

@@ -21,6 +21,9 @@ class MyToken(Token):
def get_category(self):
return 'this', 'that'
def get_country(self):
return self.lookup_word
def make_query(*args):
q = QueryStruct([Phrase(qmod.PHRASE_ANY, '')])

View File

@@ -20,6 +20,9 @@ class MyToken(Token):
def get_category(self):
return 'this', 'that'
def get_country(self):
return 'cc'
def make_query(*args):
q = QueryStruct([Phrase(args[0][1], '')])

View File

@@ -99,7 +99,7 @@ def test_address_simple_places(apiobj, frontend, atype, address, search):
def test_address_country(apiobj, frontend):
apiobj.add_word_table([(None, 'ro', 'C', 'ro', None)])
apiobj.add_word_table([(None, 'ro', 'C', 'ro', {'cc': 'ro'})])
apiobj.add_country('ro', 'POLYGON((0 0, 0 1, 1 1, 1 0, 0 0))')
apiobj.add_country_name('ro', {'name': 'România'})

View File

@@ -10,6 +10,8 @@ of the table.
"""
from nominatim_db.db.connection import execute_scalar
from psycopg.types.json import Jsonb
class MockIcuWordTable:
""" A word table for testing using legacy word table structure.
@@ -42,11 +44,11 @@ class MockIcuWordTable:
""", (word_token, word, cls, typ, oper))
self.conn.commit()
def add_country(self, country_code, word_token):
def add_country(self, country_code, word_token, lookup):
with self.conn.cursor() as cur:
cur.execute("""INSERT INTO word (word_token, type, word)
VALUES(%s, 'C', %s)""",
(word_token, country_code))
cur.execute("""INSERT INTO word (word_token, type, word, info)
VALUES(%s, 'C', %s, %s)""",
(word_token, lookup, Jsonb({'cc': country_code})))
self.conn.commit()
def add_postcode(self, word_token, postcode):
@@ -93,7 +95,7 @@ class MockIcuWordTable:
def get_country(self):
with self.conn.cursor() as cur:
cur.execute("SELECT word, word_token FROM word WHERE type = 'C'")
cur.execute("SELECT info->>'cc', word_token, word FROM word WHERE type = 'C'")
result = set((tuple(row) for row in cur))
assert len(result) == cur.rowcount, "Word table has duplicates."
return result

View File

@@ -237,3 +237,9 @@ def test_postcode_default_pattern_pass(sanitize, postcode):
@pytest.mark.sanitizer_params(convert_to_address=False, default_pattern='[A-Z0-9- ]{3,12}')
def test_postcode_default_pattern_fail(sanitize, postcode):
assert sanitize(country='an', postcode=postcode) == []
@pytest.mark.parametrize("postcode", ('00000', '00-000', 'PL-00000', 'PL 00-000'))
@pytest.mark.sanitizer_params(convert_to_address=False)
def test_postcode_zeros(sanitize, postcode):
assert sanitize(country='pl', postcode=postcode) == []

View File

@@ -343,16 +343,18 @@ def test_add_country_names_new(analyzer, word_table):
with analyzer() as anl:
anl.add_country_names('es', {'name': 'Espagña', 'name:en': 'Spain'})
assert word_table.get_country() == {('es', 'ESPAGÑA'), ('es', 'SPAIN')}
assert word_table.get_country() == {('es', 'ESPAGÑA', 'Espagña'),
('es', 'SPAIN', 'Spain')}
def test_add_country_names_extend(analyzer, word_table):
word_table.add_country('ch', 'SCHWEIZ')
word_table.add_country('ch', 'SCHWEIZ', 'Schweiz')
with analyzer() as anl:
anl.add_country_names('ch', {'name': 'Schweiz', 'name:fr': 'Suisse'})
assert word_table.get_country() == {('ch', 'SCHWEIZ'), ('ch', 'SUISSE')}
assert word_table.get_country() == {('ch', 'SCHWEIZ', 'Schweiz'),
('ch', 'SUISSE', 'Suisse')}
class TestPlaceNames:
@@ -403,7 +405,7 @@ class TestPlaceNames:
info = self.analyzer.process_place(place)
self.expect_name_terms(info, '#norge', 'norge')
assert word_table.get_country() == {('no', 'NORGE')}
assert word_table.get_country() == {('no', 'NORGE', 'Norge')}
class TestPlaceAddress: