fix: specify utf-8 encoding explicitly when reading and writing files

2026-02-16 05:18:00 +00:00 · 2026-02-08 19:14:57 +05:30
parent 67ecf5f6a0
commit f84b279540
22 changed files with 79 additions and 71 deletions
--- a/test/python/tokenizer/test_icu.py
+++ b/test/python/tokenizer/test_icu.py
@@ -31,7 +31,7 @@ def test_config(project_env, tmp_path):
    sqldir = tmp_path / 'sql'
    sqldir.mkdir()
    (sqldir / 'tokenizer').mkdir()
-    (sqldir / 'tokenizer' / 'icu_tokenizer.sql').write_text("SELECT 'a'")
+    (sqldir / 'tokenizer' / 'icu_tokenizer.sql').write_text("SELECT 'a'", encoding='utf-8')

    project_env.lib_dir.sql = sqldir

@@ -58,7 +58,7 @@ def db_prop(temp_db_conn):
 def analyzer(tokenizer_factory, test_config, monkeypatch,
             temp_db_with_extensions, tmp_path):
    sql = tmp_path / 'sql' / 'tokenizer' / 'icu_tokenizer.sql'
-    sql.write_text("SELECT 'a';")
+    sql.write_text("SELECT 'a';", encoding='utf-8')

    monkeypatch.setenv('NOMINATIM_TERM_NORMALIZATION', ':: lower();')
    tok = tokenizer_factory()
@@ -80,7 +80,8 @@ def analyzer(tokenizer_factory, test_config, monkeypatch,
        if with_postcode:
            cfgstr['token-analysis'].append({'id': '@postcode',
                                             'analyzer': 'postcodes'})
-        (test_config.project_dir / 'icu_tokenizer.yaml').write_text(yaml.dump(cfgstr))
+        (test_config.project_dir / 'icu_tokenizer.yaml').write_text(
+            yaml.dump(cfgstr), encoding='utf-8')
        tok.loader = nominatim_db.tokenizer.icu_rule_loader.ICURuleLoader(test_config)

        return tok.name_analyzer()
@@ -190,7 +191,7 @@ def test_update_sql_functions(db_prop, temp_db_cursor,
    table_factory('test', 'txt TEXT')

    func_file = test_config.lib_dir.sql / 'tokenizer' / 'icu_tokenizer.sql'
-    func_file.write_text("""INSERT INTO test VALUES (1133)""")
+    func_file.write_text("""INSERT INTO test VALUES (1133)""", encoding='utf-8')

    tok.update_sql_functions(test_config)