Merge pull request #3980 from lonvia/security-smells

Improve SQL query assembly
ignore tables with odd names in SQLPreprocessor
2026-02-14 01:47:57 +00:00 · 2026-02-10 15:26:34 +01:00 · 2026-02-10 11:40:52 +01:00 · 2026-02-10 11:39:19 +01:00 · 2026-02-10 11:39:19 +01:00 · 2026-02-10 11:39:17 +01:00
29 changed files with 126 additions and 101 deletions
--- a/docs/mk_install_instructions.py
+++ b/docs/mk_install_instructions.py
@@ -13,7 +13,8 @@ for infile in VAGRANT_PATH.glob('Install-on-*.sh'):
    outfile = f"admin/{infile.stem}.md"
    title = infile.stem.replace('-', ' ')

-    with mkdocs_gen_files.open(outfile, "w") as outfd, infile.open() as infd:
+    with mkdocs_gen_files.open(outfile, "w", encoding='utf-8') as outfd, \
+            infile.open(encoding='utf-8') as infd:
        print("#", title, file=outfd)
        has_empty = False
        for line in infd:
--- a/lib-sql/functions/placex_triggers.sql
+++ b/lib-sql/functions/placex_triggers.sql
@@ -672,7 +672,7 @@ CREATE OR REPLACE FUNCTION placex_insert()
  AS $$
 DECLARE
  postcode TEXT;
-  result BOOLEAN;
+  result INT;
  is_area BOOLEAN;
  country_code VARCHAR(2);
  diameter FLOAT;
@@ -777,11 +777,12 @@ BEGIN


   -- add to tables for special search
-   -- Note: won't work on initial import because the classtype tables
-   -- do not yet exist. It won't hurt either.
  classtable := 'place_classtype_' || NEW.class || '_' || NEW.type;
-  SELECT count(*)>0 FROM pg_tables WHERE tablename = classtable and schemaname = current_schema() INTO result;
-  IF result THEN
+  SELECT count(*) INTO result
+    FROM pg_tables
+    WHERE classtable NOT SIMILAR TO '%\W%'
+          AND tablename = classtable and schemaname = current_schema();
+  IF result > 0 THEN
    EXECUTE 'INSERT INTO ' || classtable::regclass || ' (place_id, centroid) VALUES ($1,$2)' 
    USING NEW.place_id, NEW.centroid;
  END IF;
@@ -1337,6 +1338,7 @@ CREATE OR REPLACE FUNCTION placex_delete()
  AS $$
 DECLARE
  b BOOLEAN;
+  result INT;
  classtable TEXT;
 BEGIN
  -- RAISE WARNING 'placex_delete % %',OLD.osm_type,OLD.osm_id;
@@ -1395,8 +1397,12 @@ BEGIN

  -- remove from tables for special search
  classtable := 'place_classtype_' || OLD.class || '_' || OLD.type;
-  SELECT count(*)>0 FROM pg_tables WHERE tablename = classtable and schemaname = current_schema() INTO b;
-  IF b THEN
+  SELECT count(*) INTO result
+    FROM pg_tables
+    WHERE classtable NOT SIMILAR TO '%\W%'
+          AND tablename = classtable and schemaname = current_schema();
+
+  IF result > 0 THEN
    EXECUTE 'DELETE FROM ' || classtable::regclass || ' WHERE place_id = $1' USING OLD.place_id;
  END IF;

--- a/src/nominatim_db/config.py
+++ b/src/nominatim_db/config.py
@@ -2,7 +2,7 @@
 #
 # This file is part of Nominatim. (https://nominatim.org)
 #
-# Copyright (C) 2025 by the Nominatim developer community.
+# Copyright (C) 2026 by the Nominatim developer community.
 # For a full list of authors see the git log.
 """
 Nominatim configuration accessor.
@@ -12,6 +12,7 @@ import importlib.util
 import logging
 import os
 import sys
+import re
 from pathlib import Path
 import json
 import yaml
@@ -80,6 +81,12 @@ class Configuration:
        self.lib_dir = _LibDirs()
        self._private_plugins: Dict[str, object] = {}

+        # Fail early on names with anything but word characters and '-'.
+        # NOTE(review): presumably the name ends up in SQL - confirm.
+        if re.fullmatch(r'[\w-]+', self.DATABASE_WEBUSER) is None:
+            raise UsageError("Misconfigured DATABASE_WEBUSER. "
+                             "Only alphanumeric characters, - and _ are allowed.")
+
    def set_libdirs(self, **kwargs: StrPath) -> None:
        """ Set paths to library functions and data.
        """
--- a/src/nominatim_db/db/sql_preprocessor.py
+++ b/src/nominatim_db/db/sql_preprocessor.py
@@ -2,12 +2,13 @@
 #
 # This file is part of Nominatim. (https://nominatim.org)
 #
-# Copyright (C) 2024 by the Nominatim developer community.
+# Copyright (C) 2026 by the Nominatim developer community.
 # For a full list of authors see the git log.
 """
 Preprocessing of SQL files.
 """
 from typing import Set, Dict, Any, cast
+import re

 import jinja2

@@ -34,7 +35,9 @@ def _get_tables(conn: Connection) -> Set[str]:
    with conn.cursor() as cur:
        cur.execute("SELECT tablename FROM pg_tables WHERE schemaname = 'public'")

-        return set((row[0] for row in list(cur)))
+        # paranoia check: make sure we don't get table names that cause
+        # an SQL injection later
+        return {row[0] for row in cur if re.fullmatch(r'\w+', row[0])}


 def _get_middle_db_format(conn: Connection, tables: Set[str]) -> str:
--- a/src/nominatim_db/indexer/indexer.py
+++ b/src/nominatim_db/indexer/indexer.py
@@ -59,7 +59,7 @@ class Indexer:
                if await self.index_by_rank(0, 4) > 0:
                    _analyze()

-                if await self.index_boundaries(0, 30) > 100:
+                if await self.index_boundaries() > 100:
                    _analyze()

                if await self.index_by_rank(5, 25) > 100:
@@ -74,7 +74,7 @@ class Indexer:
                if not self.has_pending():
                    break

-    async def index_boundaries(self, minrank: int, maxrank: int) -> int:
+    async def index_boundaries(self, minrank: int = 0, maxrank: int = 30) -> int:
        """ Index only administrative boundaries within the given rank range.
        """
        total = 0
--- a/src/nominatim_db/tokenizer/icu_tokenizer.py
+++ b/src/nominatim_db/tokenizer/icu_tokenizer.py
@@ -2,7 +2,7 @@
 #
 # This file is part of Nominatim. (https://nominatim.org)
 #
-# Copyright (C) 2025 by the Nominatim developer community.
+# Copyright (C) 2026 by the Nominatim developer community.
 # For a full list of authors see the git log.
 """
 Tokenizer implementing normalisation as used before Nominatim 4 but using
@@ -294,13 +294,12 @@ class ICUTokenizer(AbstractTokenizer):
        with connect(self.dsn) as conn:
            drop_tables(conn, 'word')
            with conn.cursor() as cur:
-                cur.execute(f"ALTER TABLE {old} RENAME TO word")
-                for idx in ('word_token', 'word_id'):
-                    cur.execute(f"""ALTER INDEX idx_{old}_{idx}
-                                      RENAME TO idx_word_{idx}""")
-                for name, _ in WORD_TYPES:
-                    cur.execute(f"""ALTER INDEX idx_{old}_{name}
-                                    RENAME TO idx_word_{name}""")
+                cur.execute(pysql.SQL("ALTER TABLE {} RENAME TO word")
+                                 .format(pysql.Identifier(old)))
+                for idx in ['word_token', 'word_id'] + [n[0] for n in WORD_TYPES]:
+                    cur.execute(pysql.SQL("ALTER INDEX {} RENAME TO {}")
+                                     .format(pysql.Identifier(f"idx_{old}_{idx}"),
+                                             pysql.Identifier(f"idx_word_{idx}")))
            conn.commit()


--- a/src/nominatim_db/tools/postcodes.py
+++ b/src/nominatim_db/tools/postcodes.py
@@ -159,7 +159,7 @@ class _PostcodeCollector:

        if fname.is_file():
            LOG.info("Using external postcode file '%s'.", fname)
-            return gzip.open(fname, 'rt')
+            return gzip.open(fname, 'rt', encoding='utf-8')

        return None

--- a/src/nominatim_db/tools/refresh.py
+++ b/src/nominatim_db/tools/refresh.py
@@ -141,7 +141,9 @@ def import_importance_csv(dsn: str, data_file: Path) -> int:

            copy_cmd = """COPY wikimedia_importance(language, title, importance, wikidata)
                          FROM STDIN"""
-            with gzip.open(str(data_file), 'rt') as fd, cur.copy(copy_cmd) as copy:
+            with gzip.open(
+                    str(data_file), 'rt', encoding='utf-8') as fd, \
+                    cur.copy(copy_cmd) as copy:
                for row in csv.DictReader(fd, delimiter='\t', quotechar='|'):
                    wd_id = int(row['wikidata_id'][1:])
                    copy.write_row((row['language'],
--- a/test/bdd/test_osm2pgsql.py
+++ b/test/bdd/test_osm2pgsql.py
@@ -43,7 +43,7 @@ def opl_writer(tmp_path, node_grid):
    def _write(data):
        fname = tmp_path / f"test_osm_{nr[0]}.opl"
        nr[0] += 1
-        with fname.open('wt') as fd:
+        with fname.open('wt', encoding='utf-8') as fd:
            for line in data.split('\n'):
                if line.startswith('n') and ' x' not in line:
                    coord = node_grid.get(line[1:].split(' ')[0]) \
@@ -59,7 +59,7 @@ def opl_writer(tmp_path, node_grid):
@given('the lua style file', target_fixture='osm2pgsql_options')
 def set_lua_style_file(osm2pgsql_options, docstring, tmp_path):
    style = tmp_path / 'custom.lua'
-    style.write_text(docstring)
+    style.write_text(docstring, encoding='utf-8')
    osm2pgsql_options['osm2pgsql_style'] = str(style)

    return osm2pgsql_options
--- a/test/python/api/search/test_postcode_parser.py
+++ b/test/python/api/search/test_postcode_parser.py
@@ -58,7 +58,7 @@ gb:
    pattern: "(l?ld[A-Z0-9]?) ?(dll)"
    output: \1 \2

-    """)
+    """, encoding='utf-8')

    return project_env

--- a/test/python/config/test_config.py
+++ b/test/python/config/test_config.py
@@ -2,7 +2,7 @@
 #
 # This file is part of Nominatim. (https://nominatim.org)
 #
-# Copyright (C) 2025 by the Nominatim developer community.
+# Copyright (C) 2026 by the Nominatim developer community.
 # For a full list of authors see the git log.
 """
 Test for loading dotenv configuration.
@@ -48,7 +48,7 @@ def test_no_project_dir(make_config):
@pytest.mark.parametrize("val", ('apache', '"apache"'))
 def test_prefer_project_setting_over_default(make_config, val, tmp_path):
    envfile = tmp_path / '.env'
-    envfile.write_text('NOMINATIM_DATABASE_WEBUSER={}\n'.format(val))
+    envfile.write_text('NOMINATIM_DATABASE_WEBUSER={}\n'.format(val), encoding='utf-8')

    config = make_config(tmp_path)

@@ -57,7 +57,7 @@ def test_prefer_project_setting_over_default(make_config, val, tmp_path):

 def test_prefer_os_environ_over_project_setting(make_config, monkeypatch, tmp_path):
    envfile = tmp_path / '.env'
-    envfile.write_text('NOMINATIM_DATABASE_WEBUSER=apache\n')
+    envfile.write_text('NOMINATIM_DATABASE_WEBUSER=apache\n', encoding='utf-8')

    monkeypatch.setenv('NOMINATIM_DATABASE_WEBUSER', 'nobody')

@@ -68,13 +68,13 @@ def test_prefer_os_environ_over_project_setting(make_config, monkeypatch, tmp_pa

 def test_prefer_os_environ_can_unset_project_setting(make_config, monkeypatch, tmp_path):
    envfile = tmp_path / '.env'
-    envfile.write_text('NOMINATIM_DATABASE_WEBUSER=apache\n')
+    envfile.write_text('NOMINATIM_OSM2PGSQL_BINARY=osm2pgsql\n', encoding='utf-8')

-    monkeypatch.setenv('NOMINATIM_DATABASE_WEBUSER', '')
+    monkeypatch.setenv('NOMINATIM_OSM2PGSQL_BINARY', '')

    config = make_config(tmp_path)

-    assert config.DATABASE_WEBUSER == ''
+    assert config.OSM2PGSQL_BINARY == ''


 def test_get_os_env_add_defaults(make_config, monkeypatch):
@@ -232,7 +232,7 @@ def test_get_import_style_intern(make_config, src_dir, monkeypatch):

 def test_get_import_style_extern_relative(make_config_path, monkeypatch):
    config = make_config_path()
-    (config.project_dir / 'custom.style').write_text('x')
+    (config.project_dir / 'custom.style').write_text('x', encoding='utf-8')

    monkeypatch.setenv('NOMINATIM_IMPORT_STYLE', 'custom.style')

@@ -243,7 +243,7 @@ def test_get_import_style_extern_absolute(make_config, tmp_path, monkeypatch):
    config = make_config()
    cfgfile = tmp_path / 'test.style'

-    cfgfile.write_text('x')
+    cfgfile.write_text('x', encoding='utf-8')

    monkeypatch.setenv('NOMINATIM_IMPORT_STYLE', str(cfgfile))

@@ -254,10 +254,10 @@ def test_load_subconf_from_project_dir(make_config_path):
    config = make_config_path()

    testfile = config.project_dir / 'test.yaml'
-    testfile.write_text('cow: muh\ncat: miau\n')
+    testfile.write_text('cow: muh\ncat: miau\n', encoding='utf-8')

    testfile = config.config_dir / 'test.yaml'
-    testfile.write_text('cow: miau\ncat: muh\n')
+    testfile.write_text('cow: miau\ncat: muh\n', encoding='utf-8')

    rules = config.load_sub_configuration('test.yaml')

@@ -268,7 +268,7 @@ def test_load_subconf_from_settings_dir(make_config_path):
    config = make_config_path()

    testfile = config.config_dir / 'test.yaml'
-    testfile.write_text('cow: muh\ncat: miau\n')
+    testfile.write_text('cow: muh\ncat: miau\n', encoding='utf-8')

    rules = config.load_sub_configuration('test.yaml')

@@ -280,7 +280,7 @@ def test_load_subconf_empty_env_conf(make_config_path, monkeypatch):
    config = make_config_path()

    testfile = config.config_dir / 'test.yaml'
-    testfile.write_text('cow: muh\ncat: miau\n')
+    testfile.write_text('cow: muh\ncat: miau\n', encoding='utf-8')

    rules = config.load_sub_configuration('test.yaml', config='MY_CONFIG')

@@ -291,8 +291,8 @@ def test_load_subconf_env_absolute_found(make_config_path, monkeypatch, tmp_path
    monkeypatch.setenv('NOMINATIM_MY_CONFIG', str(tmp_path / 'other.yaml'))
    config = make_config_path()

-    (config.config_dir / 'test.yaml').write_text('cow: muh\ncat: miau\n')
-    (tmp_path / 'other.yaml').write_text('dog: muh\nfrog: miau\n')
+    (config.config_dir / 'test.yaml').write_text('cow: muh\ncat: miau\n', encoding='utf-8')
+    (tmp_path / 'other.yaml').write_text('dog: muh\nfrog: miau\n', encoding='utf-8')

    rules = config.load_sub_configuration('test.yaml', config='MY_CONFIG')

@@ -303,7 +303,7 @@ def test_load_subconf_env_absolute_not_found(make_config_path, monkeypatch, tmp_
    monkeypatch.setenv('NOMINATIM_MY_CONFIG', str(tmp_path / 'other.yaml'))
    config = make_config_path()

-    (config.config_dir / 'test.yaml').write_text('cow: muh\ncat: miau\n')
+    (config.config_dir / 'test.yaml').write_text('cow: muh\ncat: miau\n', encoding='utf-8')

    with pytest.raises(UsageError, match='Config file not found.'):
        config.load_sub_configuration('test.yaml', config='MY_CONFIG')
@@ -314,8 +314,8 @@ def test_load_subconf_env_relative_found(make_config_path, monkeypatch, location
    monkeypatch.setenv('NOMINATIM_MY_CONFIG', 'other.yaml')
    config = make_config_path()

-    (config.config_dir / 'test.yaml').write_text('cow: muh\ncat: miau\n')
-    (getattr(config, location) / 'other.yaml').write_text('dog: bark\n')
+    (config.config_dir / 'test.yaml').write_text('cow: muh\ncat: miau\n', encoding='utf-8')
+    (getattr(config, location) / 'other.yaml').write_text('dog: bark\n', encoding='utf-8')

    rules = config.load_sub_configuration('test.yaml', config='MY_CONFIG')

@@ -326,7 +326,7 @@ def test_load_subconf_env_relative_not_found(make_config_path, monkeypatch):
    monkeypatch.setenv('NOMINATIM_MY_CONFIG', 'other.yaml')
    config = make_config_path()

-    (config.config_dir / 'test.yaml').write_text('cow: muh\ncat: miau\n')
+    (config.config_dir / 'test.yaml').write_text('cow: muh\ncat: miau\n', encoding='utf-8')

    with pytest.raises(UsageError, match='Config file not found.'):
        config.load_sub_configuration('test.yaml', config='MY_CONFIG')
@@ -335,7 +335,7 @@ def test_load_subconf_env_relative_not_found(make_config_path, monkeypatch):
 def test_load_subconf_json(make_config_path):
    config = make_config_path()

-    (config.project_dir / 'test.json').write_text('{"cow": "muh", "cat": "miau"}')
+    (config.project_dir / 'test.json').write_text('{"cow": "muh", "cat": "miau"}', encoding='utf-8')

    rules = config.load_sub_configuration('test.json')

@@ -352,7 +352,7 @@ def test_load_subconf_not_found(make_config_path):
 def test_load_subconf_env_unknown_format(make_config_path):
    config = make_config_path()

-    (config.project_dir / 'test.xml').write_text('<html></html>')
+    (config.project_dir / 'test.xml').write_text('<html></html>', encoding='utf-8')

    with pytest.raises(UsageError, match='unknown format'):
        config.load_sub_configuration('test.xml')
@@ -362,8 +362,8 @@ def test_load_subconf_include_absolute(make_config_path, tmp_path):
    config = make_config_path()

    testfile = config.config_dir / 'test.yaml'
-    testfile.write_text(f'base: !include {tmp_path}/inc.yaml\n')
-    (tmp_path / 'inc.yaml').write_text('first: 1\nsecond: 2\n')
+    testfile.write_text(f'base: !include {tmp_path}/inc.yaml\n', encoding='utf-8')
+    (tmp_path / 'inc.yaml').write_text('first: 1\nsecond: 2\n', encoding='utf-8')

    rules = config.load_sub_configuration('test.yaml')

@@ -375,8 +375,8 @@ def test_load_subconf_include_relative(make_config_path, tmp_path, location):
    config = make_config_path()

    testfile = config.config_dir / 'test.yaml'
-    testfile.write_text('base: !include inc.yaml\n')
-    (getattr(config, location) / 'inc.yaml').write_text('first: 1\nsecond: 2\n')
+    testfile.write_text('base: !include inc.yaml\n', encoding='utf-8')
+    (getattr(config, location) / 'inc.yaml').write_text('first: 1\nsecond: 2\n', encoding='utf-8')

    rules = config.load_sub_configuration('test.yaml')

@@ -387,8 +387,8 @@ def test_load_subconf_include_bad_format(make_config_path):
    config = make_config_path()

    testfile = config.config_dir / 'test.yaml'
-    testfile.write_text('base: !include inc.txt\n')
-    (config.config_dir / 'inc.txt').write_text('first: 1\nsecond: 2\n')
+    testfile.write_text('base: !include inc.txt\n', encoding='utf-8')
+    (config.config_dir / 'inc.txt').write_text('first: 1\nsecond: 2\n', encoding='utf-8')

    with pytest.raises(UsageError, match='Cannot handle config file format.'):
        config.load_sub_configuration('test.yaml')
@@ -398,7 +398,7 @@ def test_load_subconf_include_not_found(make_config_path):
    config = make_config_path()

    testfile = config.config_dir / 'test.yaml'
-    testfile.write_text('base: !include inc.txt\n')
+    testfile.write_text('base: !include inc.txt\n', encoding='utf-8')

    with pytest.raises(UsageError, match='Config file not found.'):
        config.load_sub_configuration('test.yaml')
@@ -408,9 +408,9 @@ def test_load_subconf_include_recursive(make_config_path):
    config = make_config_path()

    testfile = config.config_dir / 'test.yaml'
-    testfile.write_text('base: !include inc.yaml\n')
-    (config.config_dir / 'inc.yaml').write_text('- !include more.yaml\n- upper\n')
-    (config.config_dir / 'more.yaml').write_text('- the end\n')
+    testfile.write_text('base: !include inc.yaml\n', encoding='utf-8')
+    (config.config_dir / 'inc.yaml').write_text('- !include more.yaml\n- upper\n', encoding='utf-8')
+    (config.config_dir / 'more.yaml').write_text('- the end\n', encoding='utf-8')

    rules = config.load_sub_configuration('test.yaml')

--- a/test/python/config/test_config_load_module.py
+++ b/test/python/config/test_config_load_module.py
@@ -41,7 +41,7 @@ def test_load_default_module_with_hyphen(test_config):
 def test_load_plugin_module(test_config, tmp_path):
    (tmp_path / 'project' / 'testpath').mkdir()
    (tmp_path / 'project' / 'testpath' / 'mymod.py')\
-        .write_text("def my_test_function():\n  return 'gjwitlsSG42TG%'")
+        .write_text("def my_test_function():\n  return 'gjwitlsSG42TG%'", encoding='utf-8')

    module = test_config.load_plugin_module('testpath/mymod.py', 'private.something')

@@ -49,7 +49,7 @@ def test_load_plugin_module(test_config, tmp_path):

    # also test reloading module
    (tmp_path / 'project' / 'testpath' / 'mymod.py')\
-        .write_text("def my_test_function():\n  return 'hjothjorhj'")
+        .write_text("def my_test_function():\n  return 'hjothjorhj'", encoding='utf-8')

    module = test_config.load_plugin_module('testpath/mymod.py', 'private.something')

@@ -61,9 +61,9 @@ def test_load_external_library_module(test_config, tmp_path, monkeypatch):
    pythonpath = tmp_path / 'priv-python'
    pythonpath.mkdir()
    (pythonpath / MODULE_NAME).mkdir()
-    (pythonpath / MODULE_NAME / '__init__.py').write_text('')
+    (pythonpath / MODULE_NAME / '__init__.py').write_text('', encoding='utf-8')
    (pythonpath / MODULE_NAME / 'tester.py')\
-        .write_text("def my_test_function():\n  return 'gjwitlsSG42TG%'")
+        .write_text("def my_test_function():\n  return 'gjwitlsSG42TG%'", encoding='utf-8')

    monkeypatch.syspath_prepend(pythonpath)

@@ -73,7 +73,7 @@ def test_load_external_library_module(test_config, tmp_path, monkeypatch):

    # also test reloading module
    (pythonpath / MODULE_NAME / 'tester.py')\
-        .write_text("def my_test_function():\n  return 'dfigjreigj'")
+        .write_text("def my_test_function():\n  return 'dfigjreigj'", encoding='utf-8')

    module = test_config.load_plugin_module(f'{MODULE_NAME}.tester', 'private.something')

--- a/test/python/conftest.py
+++ b/test/python/conftest.py
@@ -60,7 +60,7 @@ def temp_db(monkeypatch):

    with psycopg.connect(dbname='postgres', autocommit=True) as conn:
        with conn.cursor() as cur:
-            cur.execute('DROP DATABASE IF EXISTS {}'.format(name))
+            cur.execute(pysql.SQL('DROP DATABASE IF EXISTS ') + pysql.Identifier(name))


@pytest.fixture
@@ -104,7 +104,9 @@ def table_factory(temp_db_conn):
    """
    def mk_table(name, definition='id INT', content=None):
        with psycopg.ClientCursor(temp_db_conn) as cur:
-            cur.execute('CREATE TABLE {} ({})'.format(name, definition))
+            cur.execute(pysql.SQL("CREATE TABLE {} ({})")
+                             .format(pysql.Identifier(name),
+                                     pysql.SQL(definition)))
            if content:
                sql = pysql.SQL("INSERT INTO {} VALUES ({})")\
                           .format(pysql.Identifier(name),
--- a/test/python/data/test_country_info.py
+++ b/test/python/data/test_country_info.py
@@ -22,7 +22,8 @@ def loaded_country(def_config):
 def env_with_country_config(project_env):

    def _mk_config(cfg):
-        (project_env.project_dir / 'country_settings.yaml').write_text(dedent(cfg))
+        (project_env.project_dir / 'country_settings.yaml').write_text(
+            dedent(cfg), encoding='utf-8')

        return project_env

--- a/test/python/db/test_sql_preprocessor.py
+++ b/test/python/db/test_sql_preprocessor.py
@@ -22,7 +22,7 @@ def sql_factory(tmp_path):
          BEGIN
            {}
          END;
-          $$ LANGUAGE plpgsql IMMUTABLE;""".format(sql_body))
+          $$ LANGUAGE plpgsql IMMUTABLE;""".format(sql_body), encoding='utf-8')
        return 'test.sql'

    return _mk_sql
@@ -63,7 +63,7 @@ def test_load_file_with_params(sql_preprocessor, sql_factory, temp_db_conn, temp
 async def test_load_parallel_file(dsn, sql_preprocessor, tmp_path, temp_db_cursor):
    (tmp_path / 'test.sql').write_text("""
        CREATE TABLE foo (a TEXT);
-        CREATE TABLE foo2(a TEXT);""" + "\n---\nCREATE TABLE bar (b INT);")
+        CREATE TABLE foo2(a TEXT);""" + "\n---\nCREATE TABLE bar (b INT);", encoding='utf-8')

    await sql_preprocessor.run_parallel_sql_file(dsn, 'test.sql', num_threads=4)

--- a/test/python/db/test_utils.py
+++ b/test/python/db/test_utils.py
@@ -15,7 +15,8 @@ from nominatim_db.errors import UsageError

 def test_execute_file_success(dsn, temp_db_cursor, tmp_path):
    tmpfile = tmp_path / 'test.sql'
-    tmpfile.write_text('CREATE TABLE test (id INT);\nINSERT INTO test VALUES(56);')
+    tmpfile.write_text(
+        'CREATE TABLE test (id INT);\nINSERT INTO test VALUES(56);', encoding='utf-8')

    db_utils.execute_file(dsn, tmpfile)

@@ -29,7 +30,7 @@ def test_execute_file_bad_file(dsn, tmp_path):

 def test_execute_file_bad_sql(dsn, tmp_path):
    tmpfile = tmp_path / 'test.sql'
-    tmpfile.write_text('CREATE STABLE test (id INT)')
+    tmpfile.write_text('CREATE STABLE test (id INT)', encoding='utf-8')

    with pytest.raises(UsageError):
        db_utils.execute_file(dsn, tmpfile)
@@ -37,14 +38,14 @@ def test_execute_file_bad_sql(dsn, tmp_path):

 def test_execute_file_bad_sql_ignore_errors(dsn, tmp_path):
    tmpfile = tmp_path / 'test.sql'
-    tmpfile.write_text('CREATE STABLE test (id INT)')
+    tmpfile.write_text('CREATE STABLE test (id INT)', encoding='utf-8')

    db_utils.execute_file(dsn, tmpfile, ignore_errors=True)


 def test_execute_file_with_pre_code(dsn, tmp_path, temp_db_cursor):
    tmpfile = tmp_path / 'test.sql'
-    tmpfile.write_text('INSERT INTO test VALUES(4)')
+    tmpfile.write_text('INSERT INTO test VALUES(4)', encoding='utf-8')

    db_utils.execute_file(dsn, tmpfile, pre_code='CREATE TABLE test (id INT)')

@@ -53,7 +54,7 @@ def test_execute_file_with_pre_code(dsn, tmp_path, temp_db_cursor):

 def test_execute_file_with_post_code(dsn, tmp_path, temp_db_cursor):
    tmpfile = tmp_path / 'test.sql'
-    tmpfile.write_text('CREATE TABLE test (id INT)')
+    tmpfile.write_text('CREATE TABLE test (id INT)', encoding='utf-8')

    db_utils.execute_file(dsn, tmpfile, post_code='INSERT INTO test VALUES(23)')

--- a/test/python/indexer/test_indexing.py
+++ b/test/python/indexer/test_indexing.py
@@ -247,7 +247,7 @@ async def test_index_boundaries(test_db, threads, test_tokenizer):
    assert test_db.osmline_unindexed() == 1

    idx = indexer.Indexer('dbname=test_nominatim_python_unittest', test_tokenizer, threads)
-    await idx.index_boundaries(0, 30)
+    await idx.index_boundaries()

    assert test_db.placex_unindexed() == 31
    assert test_db.osmline_unindexed() == 1
--- a/test/python/tokenizer/test_icu.py
+++ b/test/python/tokenizer/test_icu.py
@@ -31,7 +31,7 @@ def test_config(project_env, tmp_path):
    sqldir = tmp_path / 'sql'
    sqldir.mkdir()
    (sqldir / 'tokenizer').mkdir()
-    (sqldir / 'tokenizer' / 'icu_tokenizer.sql').write_text("SELECT 'a'")
+    (sqldir / 'tokenizer' / 'icu_tokenizer.sql').write_text("SELECT 'a'", encoding='utf-8')

    project_env.lib_dir.sql = sqldir

@@ -58,7 +58,7 @@ def db_prop(temp_db_conn):
 def analyzer(tokenizer_factory, test_config, monkeypatch,
             temp_db_with_extensions, tmp_path):
    sql = tmp_path / 'sql' / 'tokenizer' / 'icu_tokenizer.sql'
-    sql.write_text("SELECT 'a';")
+    sql.write_text("SELECT 'a';", encoding='utf-8')

    monkeypatch.setenv('NOMINATIM_TERM_NORMALIZATION', ':: lower();')
    tok = tokenizer_factory()
@@ -80,7 +80,8 @@ def analyzer(tokenizer_factory, test_config, monkeypatch,
        if with_postcode:
            cfgstr['token-analysis'].append({'id': '@postcode',
                                             'analyzer': 'postcodes'})
-        (test_config.project_dir / 'icu_tokenizer.yaml').write_text(yaml.dump(cfgstr))
+        (test_config.project_dir / 'icu_tokenizer.yaml').write_text(
+            yaml.dump(cfgstr), encoding='utf-8')
        tok.loader = nominatim_db.tokenizer.icu_rule_loader.ICURuleLoader(test_config)

        return tok.name_analyzer()
@@ -190,7 +191,7 @@ def test_update_sql_functions(db_prop, temp_db_cursor,
    table_factory('test', 'txt TEXT')

    func_file = test_config.lib_dir.sql / 'tokenizer' / 'icu_tokenizer.sql'
-    func_file.write_text("""INSERT INTO test VALUES (1133)""")
+    func_file.write_text("""INSERT INTO test VALUES (1133)""", encoding='utf-8')

    tok.update_sql_functions(test_config)

--- a/test/python/tokenizer/test_icu_rule_loader.py
+++ b/test/python/tokenizer/test_icu_rule_loader.py
@@ -27,7 +27,8 @@ class TestIcuRuleLoader:
        self.project_env = project_env

    def write_config(self, content):
-        (self.project_env.project_dir / 'icu_tokenizer.yaml').write_text(dedent(content))
+        (self.project_env.project_dir / 'icu_tokenizer.yaml').write_text(
+            dedent(content), encoding='utf-8')

    def config_rules(self, *variants):
        content = dedent("""\
@@ -119,7 +120,7 @@ class TestIcuRuleLoader:
                  variants:
            """)
        transpath = self.project_env.project_dir / ('transliteration.yaml')
-        transpath.write_text('- "x > y"')
+        transpath.write_text('- "x > y"', encoding='utf-8')

        loader = ICURuleLoader(self.project_env)
        rules = loader.get_transliteration_rules()
--- a/test/python/tools/conftest.py
+++ b/test/python/tools/conftest.py
@@ -21,7 +21,7 @@ if [ "$*" = "--version" ]; then
 else
  echo "$@"
 fi
-    """)
+    """, encoding='utf-8')
    osm2pgsql_exec.chmod(0o777)

    return dict(osm2pgsql=str(osm2pgsql_exec),
--- a/test/python/tools/test_database_import.py
+++ b/test/python/tools/test_database_import.py
@@ -2,7 +2,7 @@
 #
 # This file is part of Nominatim. (https://nominatim.org)
 #
-# Copyright (C) 2025 by the Nominatim developer community.
+# Copyright (C) 2026 by the Nominatim developer community.
 # For a full list of authors see the git log.
 """
 Tests for functions to import a new database.
@@ -25,12 +25,14 @@ class TestDatabaseSetup:
    def setup_nonexistant_db(self):
        with psycopg.connect(dbname='postgres', autocommit=True) as conn:
            with conn.cursor() as cur:
-                cur.execute(f'DROP DATABASE IF EXISTS {self.DBNAME}')
+                cur.execute(pysql.SQL('DROP DATABASE IF EXISTS ')
+                            + pysql.Identifier(self.DBNAME))

            yield True

            with conn.cursor() as cur:
-                cur.execute(f'DROP DATABASE IF EXISTS {self.DBNAME}')
+                cur.execute(pysql.SQL('DROP DATABASE IF EXISTS ')
+                            + pysql.Identifier(self.DBNAME))

    @pytest.fixture
    def cursor(self):
@@ -62,7 +64,7 @@ class TestDatabaseSetup:
    def test_create_db_missing_ro_user(self):
        with pytest.raises(UsageError, match='Missing read-only user.'):
            database_import.setup_database_skeleton(f'dbname={self.DBNAME}',
-                                                    rouser='sdfwkjkjgdugu2;jgsafkljas;')
+                                                    rouser='sdfwkjkjgdugu2jgsafkljas')

    def test_setup_extensions_old_postgis(self, monkeypatch):
        monkeypatch.setattr(database_import, 'POSTGIS_REQUIRED_VERSION', (50, 50))
@@ -96,7 +98,7 @@ def test_import_osm_data_multifile(table_factory, tmp_path, osm2pgsql_options, c

    files = [tmp_path / 'file1.osm', tmp_path / 'file2.osm']
    for f in files:
-        f.write_text('test')
+        f.write_text('test', encoding='utf-8')

    database_import.import_osm_data(files, osm2pgsql_options)
    captured = capfd.readouterr()
@@ -124,7 +126,7 @@ def test_import_osm_data_drop(table_factory, temp_db_cursor, tmp_path, osm2pgsql
    table_factory('planet_osm_nodes')

    flatfile = tmp_path / 'flatfile'
-    flatfile.write_text('touch')
+    flatfile.write_text('touch', encoding='utf-8')

    osm2pgsql_options['flatnode_file'] = str(flatfile.resolve())

@@ -193,7 +195,7 @@ class TestSetupSQL:
        self.config = def_config

    def write_sql(self, fname, content):
-        (self.config.lib_dir.sql / fname).write_text(content)
+        (self.config.lib_dir.sql / fname).write_text(content, encoding='utf-8')

    @pytest.mark.parametrize("reverse", [True, False])
    def test_create_tables(self, temp_db_conn, temp_db_cursor, reverse):
--- a/test/python/tools/test_freeze.py
+++ b/test/python/tools/test_freeze.py
@@ -54,7 +54,7 @@ def test_drop_flatnode_file_file_already_gone(tmp_path):

 def test_drop_flatnode_file_delete(tmp_path):
    flatfile = tmp_path / 'flatnode.store'
-    flatfile.write_text('Some content')
+    flatfile.write_text('Some content', encoding='utf-8')

    freeze.drop_flatnode_file(flatfile)

--- a/test/python/tools/test_import_special_phrases.py
+++ b/test/python/tools/test_import_special_phrases.py
@@ -30,7 +30,7 @@ def xml_wiki_content(src_dir):
        return the content of the static xml test file.
    """
    xml_test_content = src_dir / 'test' / 'testdata' / 'special_phrases_test_content.txt'
-    return xml_test_content.read_text()
+    return xml_test_content.read_text(encoding='utf-8')


@pytest.fixture
--- a/test/python/tools/test_postcodes.py
+++ b/test/python/tools/test_postcodes.py
@@ -245,7 +245,7 @@ def test_postcodes_extern(postcode_update, postcode_table, tmp_path,
    insert_implicit_postcode(1, 'xx', 'POINT(10 12)', 'AB 4511')

    extfile = tmp_path / 'xx_postcodes.csv'
-    extfile.write_text("postcode,lat,lon\nAB 4511,-4,-1\nCD 4511,-5, -10")
+    extfile.write_text("postcode,lat,lon\nAB 4511,-4,-1\nCD 4511,-5, -10", encoding='utf-8')

    if gzipped:
        subprocess.run(['gzip', str(extfile)])
@@ -262,7 +262,7 @@ def test_postcodes_extern_bad_column(postcode_update, postcode_table, tmp_path,
    insert_implicit_postcode(1, 'xx', 'POINT(10 12)', 'AB 4511')

    extfile = tmp_path / 'xx_postcodes.csv'
-    extfile.write_text("postode,lat,lon\nAB 4511,-4,-1\nCD 4511,-5, -10")
+    extfile.write_text("postode,lat,lon\nAB 4511,-4,-1\nCD 4511,-5, -10", encoding='utf-8')

    postcode_update(tmp_path)

@@ -274,7 +274,8 @@ def test_postcodes_extern_bad_number(postcode_update, insert_implicit_postcode,
    insert_implicit_postcode(1, 'xx', 'POINT(10 12)', 'AB 4511')

    extfile = tmp_path / 'xx_postcodes.csv'
-    extfile.write_text("postcode,lat,lon\nXX 4511,-4,NaN\nCD 4511,-5, -10\n34,200,0")
+    extfile.write_text(
+        "postcode,lat,lon\nXX 4511,-4,NaN\nCD 4511,-5, -10\n34,200,0", encoding='utf-8')

    postcode_update(tmp_path)

--- a/test/python/tools/test_refresh_address_levels.py
+++ b/test/python/tools/test_refresh_address_levels.py
@@ -22,7 +22,7 @@ def test_load_ranks_def_config(temp_db_conn, temp_db_cursor, def_config):

 def test_load_ranks_from_project_dir(project_env, temp_db_conn, temp_db_cursor):
    test_file = project_env.project_dir / 'address-levels.json'
-    test_file.write_text('[{"tags":{"place":{"sea":2}}}]')
+    test_file.write_text('[{"tags":{"place":{"sea":2}}}]', encoding='utf-8')

    load_address_levels_from_config(temp_db_conn, project_env)

@@ -31,7 +31,7 @@ def test_load_ranks_from_project_dir(project_env, temp_db_conn, temp_db_cursor):

 def test_load_ranks_from_broken_file(project_env, temp_db_conn):
    test_file = project_env.project_dir / 'address-levels.json'
-    test_file.write_text('[{"tags":"place":{"sea":2}}}]')
+    test_file.write_text('[{"tags":"place":{"sea":2}}}]', encoding='utf-8')

    with pytest.raises(json.decoder.JSONDecodeError):
        load_address_levels_from_config(temp_db_conn, project_env)
--- a/test/python/tools/test_refresh_create_functions.py
+++ b/test/python/tools/test_refresh_create_functions.py
@@ -21,7 +21,7 @@ class TestCreateFunctions:

    def write_functions(self, content):
        sqlfile = self.config.lib_dir.sql / 'functions.sql'
-        sqlfile.write_text(content)
+        sqlfile.write_text(content, encoding='utf-8')

    def test_create_functions(self, temp_db_cursor):
        self.write_functions("""CREATE OR REPLACE FUNCTION test() RETURNS INTEGER
--- a/test/python/tools/test_refresh_wiki_data.py
+++ b/test/python/tools/test_refresh_wiki_data.py
@@ -20,7 +20,7 @@ from nominatim_db.tools.refresh import (import_wikipedia_articles,
@pytest.fixture
 def wiki_csv(tmp_path, sql_preprocessor):
    def _import(data):
-        with gzip.open(tmp_path / 'wikimedia-importance.csv.gz', mode='wt') as fd:
+        with gzip.open(tmp_path / 'wikimedia-importance.csv.gz', mode='wt', encoding='utf-8') as fd:
            writer = csv.DictWriter(fd, fieldnames=['language', 'type', 'title',
                                                    'importance', 'wikidata_id'],
                                    delimiter='\t', quotechar='|')
--- a/test/python/tools/test_sp_wiki_loader.py
+++ b/test/python/tools/test_sp_wiki_loader.py
@@ -21,7 +21,7 @@ def sp_wiki_loader(src_dir, monkeypatch, def_config):

    def _mock_wiki_content(lang):
        xml_test_content = src_dir / 'test' / 'testdata' / 'special_phrases_test_content.txt'
-        return xml_test_content.read_text()
+        return xml_test_content.read_text(encoding='utf-8')

    monkeypatch.setattr('nominatim_db.tools.special_phrases.sp_wiki_loader._get_wiki_content',
                        _mock_wiki_content)
--- a/test/python/tools/test_tiger_data.py
+++ b/test/python/tools/test_tiger_data.py
@@ -57,11 +57,11 @@ def tiger_table(def_config, temp_db_conn, sql_preprocessor,
           RETURNS INTEGER AS $$
            INSERT INTO tiger VALUES(linegeo, start, stop, interpol, token_info, postcode)
            RETURNING 1
-           $$ LANGUAGE SQL;""")
+           $$ LANGUAGE SQL;""", encoding='utf-8')
    (def_config.lib_dir.sql / 'tiger_import_finish.sql').write_text(
        """DROP FUNCTION tiger_line_import (linegeo GEOMETRY, in_startnumber INTEGER,
                                 in_endnumber INTEGER, interpolationtype TEXT,
-                                 token_info JSONB, in_postcode TEXT);""")
+                                 token_info JSONB, in_postcode TEXT);""", encoding='utf-8')

    return MockTigerTable(temp_db_conn)

@@ -75,7 +75,7 @@ def csv_factory(tmp_path):
        from;to;interpolation;street;city;state;postcode;geometry
        {};{};{};{};{};{};{};{}
        """.format(hnr_from, hnr_to, interpol, street, city, state,
-                   postcode, geometry)))
+                   postcode, geometry)), encoding='utf-8')

    return _mk_file

@@ -129,7 +129,7 @@ async def test_add_tiger_data_no_files(def_config, tiger_table, tokenizer_mock,
 async def test_add_tiger_data_bad_file(def_config, tiger_table, tokenizer_mock,
                                       tmp_path):
    sqlfile = tmp_path / '1010.csv'
-    sqlfile.write_text("""Random text""")
+    sqlfile.write_text("""Random text""", encoding='utf-8')

    await tiger_data.add_tiger_data(str(tmp_path), def_config, 1, tokenizer_mock())

@@ -167,7 +167,7 @@ async def test_add_tiger_data_tarfile(def_config, tiger_table, tokenizer_mock,
 async def test_add_tiger_data_bad_tarfile(def_config, tiger_table, tokenizer_mock,
                                          tmp_path):
    tarfile = tmp_path / 'sample.tar.gz'
-    tarfile.write_text("""Random text""")
+    tarfile.write_text("""Random text""", encoding='utf-8')

    with pytest.raises(UsageError):
        await tiger_data.add_tiger_data(str(tarfile), def_config, 1, tokenizer_mock())
Author	SHA1	Message	Date
Sarah Hoffmann	986d303c95	Merge pull request #3980 from lonvia/security-smells Improve SQL query assembly	2026-02-10 15:26:34 +01:00
Sarah Hoffmann	7a3ea55f3d	ignore tables with odd names in SQLPreprocessor	2026-02-10 11:40:52 +01:00
Sarah Hoffmann	d10d70944d	avoid f-strings in SQL creation in tests	2026-02-10 11:39:19 +01:00
Sarah Hoffmann	73590baf15	use psycopg.sql for SQL building in tokenizer	2026-02-10 11:39:19 +01:00
Sarah Hoffmann	e17d0cb5cf	only allow alphanumeric and dash in DATABASE_WEBUSER This variable is used a lot in raw SQL. Avoid injection issues.	2026-02-10 11:39:17 +01:00
Sarah Hoffmann	7a62c7d812	sanity check class names before inserting into classtype tables The subsequent INSERT is done on an unquoted table name, making in theory an SQL injection through an OSM value possible. In practice this cannot happen because we check for the existence of the table. During the creation of the classtype tables there is a sanity check in place to disallow any table names that consist of anything other than alphanumeric characters.	2026-02-10 11:38:26 +01:00
Sarah Hoffmann	615804b1b3	Merge pull request #3978 from jayaddison/issue-2714-prep/index-boundaries-method-signature-nitpick Refactor: add default params to Indexer.index_boundaries	2026-02-10 09:45:29 +01:00
Sarah Hoffmann	79bbdfd55c	Merge pull request #3975 from kad-link/fix/utf8-encoding-clean Fix: Enforce explicit UTF-8 encoding in file I/O	2026-02-10 09:32:06 +01:00
James Addison	509f59b193	Refactor: add default params to `index_boundaries`	2026-02-09 21:36:30 +00:00
Sri CHaRan	f84b279540	fix: add utf-8 encoding in read-write files	2026-02-10 00:38:40 +05:30