From 1722fc537f3f6c8f192b22d323a6136262663854 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Tue, 26 Oct 2021 17:29:03 +0200 Subject: [PATCH 1/2] bdd: add tests for non-latin scripts --- test/bdd/db/query/normalization.feature | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/test/bdd/db/query/normalization.feature b/test/bdd/db/query/normalization.feature index deaa635e..304496e2 100644 --- a/test/bdd/db/query/normalization.feature +++ b/test/bdd/db/query/normalization.feature @@ -3,6 +3,31 @@ Feature: Import and search of names Tests all naming related issues: normalisation, abbreviations, internationalisation, etc. + Scenario: non-latin scripts can be found + Given the places + | osm | class | type | name | + | N1 | place | locality | Речицкий район | + | N2 | place | locality | Refugio de montaña | + | N3 | place | locality | 高槻市| + | N4 | place | locality | الدوحة | + When importing + When sending search query "Речицкий район" + Then results contain + | ID | osm | + | 0 | N1 | + When sending search query "Refugio de montaña" + Then results contain + | ID | osm | + | 0 | N2 | + When sending search query "高槻市" + Then results contain + | ID | osm | + | 0 | N3 | + When sending search query "الدوحة" + Then results contain + | ID | osm | + | 0 | N4 | + Scenario: Case-insensitivity of search Given the places | osm | class | type | name | From 37eeccbf4cd7c25239b78d6c3747fccb1bca519c Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Wed, 27 Oct 2021 10:07:19 +0200 Subject: [PATCH 2/2] ICU: use normalization from config in PHP The TERM_NORMALIZATION config option is no longer applicable. That was already documented but not yet implemented. --- nominatim/tokenizer/icu_tokenizer.py | 14 ++------------ test/python/test_tokenizer_icu.py | 16 ++++++---------- 2 files changed, 8 insertions(+), 22 deletions(-) diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py index 3331a321..ea6e5d3c 100644 --- a/nominatim/tokenizer/icu_tokenizer.py +++ b/nominatim/tokenizer/icu_tokenizer.py @@ -9,7 +9,6 @@ import re from textwrap import dedent from nominatim.db.connection import connect -from nominatim.db.properties import set_property, get_property from nominatim.db.utils import CopyBuffer from nominatim.db.sql_preprocessor import SQLPreprocessor from nominatim.indexer.place_info import PlaceInfo @@ -36,7 +35,6 @@ class LegacyICUTokenizer(AbstractTokenizer): self.dsn = dsn self.data_dir = data_dir self.loader = None - self.term_normalization = None def init_new_db(self, config, init_db=True): @@ -47,8 +45,6 @@ class LegacyICUTokenizer(AbstractTokenizer): """ self.loader = ICURuleLoader(config) - self.term_normalization = config.TERM_NORMALIZATION - self._install_php(config.lib_dir.php) self._save_config() @@ -64,7 +60,6 @@ class LegacyICUTokenizer(AbstractTokenizer): with connect(self.dsn) as conn: self.loader.load_config_from_db(conn) - self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION) def finalize_import(self, config): @@ -87,13 +82,9 @@ class LegacyICUTokenizer(AbstractTokenizer): def check_database(self, config): """ Check that the tokenizer is set up correctly. """ + # Will throw an error if there is an issue. self.init_from_project(config) - if self.term_normalization is None: - return "Configuration for tokenizer 'icu' are missing." - - return None - def update_statistics(self): """ Recompute frequencies for all name words. @@ -141,7 +132,7 @@ class LegacyICUTokenizer(AbstractTokenizer): php_file.write_text(dedent(f"""\