From 40d5b78eb80c1e7479a51d62969a35e8c31ca98a Mon Sep 17 00:00:00 2001 From: anqixxx Date: Thu, 29 May 2025 09:25:08 -0700 Subject: [PATCH 1/4] Added command line (default 0) min argument for minimum filtering, updated args.py to reflect this --- src/nominatim_db/clicmd/args.py | 1 + src/nominatim_db/clicmd/special_phrases.py | 6 +++++- src/nominatim_db/tools/special_phrases/sp_importer.py | 9 +++++---- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/src/nominatim_db/clicmd/args.py b/src/nominatim_db/clicmd/args.py index 45df9b7c..5c6a806a 100644 --- a/src/nominatim_db/clicmd/args.py +++ b/src/nominatim_db/clicmd/args.py @@ -136,6 +136,7 @@ class NominatimArgs: import_from_wiki: bool import_from_csv: Optional[str] no_replace: bool + min: int # Arguments to all query functions format: str diff --git a/src/nominatim_db/clicmd/special_phrases.py b/src/nominatim_db/clicmd/special_phrases.py index 9ba751a0..90560fb7 100644 --- a/src/nominatim_db/clicmd/special_phrases.py +++ b/src/nominatim_db/clicmd/special_phrases.py @@ -58,6 +58,8 @@ class ImportSpecialPhrases: help='Import special phrases from a CSV file') group.add_argument('--no-replace', action='store_true', help='Keep the old phrases and only add the new ones') + group.add_argument('--min', type=int, default=0, + help='Restrict special phrases by minimum occurance') def run(self, args: NominatimArgs) -> int: @@ -82,7 +84,9 @@ class ImportSpecialPhrases: tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config) should_replace = not args.no_replace + min = args.min + with connect(args.config.get_libpq_dsn()) as db_connection: SPImporter( args.config, db_connection, loader - ).import_phrases(tokenizer, should_replace) + ).import_phrases(tokenizer, should_replace, min) diff --git a/src/nominatim_db/tools/special_phrases/sp_importer.py b/src/nominatim_db/tools/special_phrases/sp_importer.py index ac50377f..6bd3c287 100644 --- a/src/nominatim_db/tools/special_phrases/sp_importer.py +++ 
b/src/nominatim_db/tools/special_phrases/sp_importer.py @@ -87,7 +87,7 @@ class SPImporter(): return db_combinations - def import_phrases(self, tokenizer: AbstractTokenizer, should_replace: bool) -> None: + def import_phrases(self, tokenizer: AbstractTokenizer, should_replace: bool, min: int) -> None: """ Iterate through all SpecialPhrases extracted from the loader and import them into the database. @@ -107,7 +107,7 @@ class SPImporter(): if result: class_type_pairs.add(result) - self._create_classtype_table_and_indexes(class_type_pairs) + self._create_classtype_table_and_indexes(class_type_pairs, min) if should_replace: self._remove_non_existent_tables_from_db() @@ -186,7 +186,8 @@ class SPImporter(): return (phrase.p_class, phrase.p_type) def _create_classtype_table_and_indexes(self, - class_type_pairs: Iterable[Tuple[str, str]]) -> None: + class_type_pairs: Iterable[Tuple[str, str]], + min: int) -> None: """ Create table place_classtype for each given pair. Also create indexes on place_id and centroid. 
@@ -200,7 +201,7 @@ class SPImporter(): with self.db_connection.cursor() as db_cursor: db_cursor.execute("CREATE INDEX idx_placex_classtype ON placex (class, type)") - allowed_special_phrases = self.get_classtype_pairs() + allowed_special_phrases = self.get_classtype_pairs(min) for pair in class_type_pairs: phrase_class = pair[0] From 20cf4b56b9c0fd8a6c8f91d341143e8d1c815e4e Mon Sep 17 00:00:00 2001 From: anqixxx Date: Sat, 31 May 2025 09:41:36 -0700 Subject: [PATCH 2/4] Refactored min and associated tests to follow greater than or equal to logic, so that min=0 accounts for no filtering --- src/nominatim_db/tools/special_phrases/sp_importer.py | 10 ++++++---- test/python/tools/test_sp_importer.py | 8 ++++---- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/src/nominatim_db/tools/special_phrases/sp_importer.py b/src/nominatim_db/tools/special_phrases/sp_importer.py index 6bd3c287..890cf2fc 100644 --- a/src/nominatim_db/tools/special_phrases/sp_importer.py +++ b/src/nominatim_db/tools/special_phrases/sp_importer.py @@ -68,16 +68,17 @@ class SPImporter(): """ Returns list of allowed special phrases from the database, restricting to a list of combinations of classes and types - which occur more than a specified amount of times. + which occur equal to or more than a specified amount of times. - Default value for this, if not specified, is at least once. + Default value for this is 0, which allows everything in database. 
""" db_combinations = set() + query = f""" SELECT class AS CLS, type AS typ FROM placex GROUP BY class, type - HAVING COUNT(*) > {min} + HAVING COUNT(*) >= {min} """ with self.db_connection.cursor() as db_cursor: @@ -207,7 +208,8 @@ class SPImporter(): phrase_class = pair[0] phrase_type = pair[1] - if (phrase_class, phrase_type) not in allowed_special_phrases: + # Will only filter if min is not 0 + if min and (phrase_class, phrase_type) not in allowed_special_phrases: LOG.warning("Skipping phrase %s=%s: not in allowed special phrases", phrase_class, phrase_type) continue diff --git a/test/python/tools/test_sp_importer.py b/test/python/tools/test_sp_importer.py index dda02f11..c64c2b7d 100644 --- a/test/python/tools/test_sp_importer.py +++ b/test/python/tools/test_sp_importer.py @@ -3,8 +3,8 @@ from nominatim_db.tools.special_phrases.sp_importer import SPImporter # Testing Database Class Pair Retrival using Conftest.py and placex def test_get_classtype_pair_data(placex_table, def_config, temp_db_conn): - for _ in range(101): - placex_table.add(cls='highway', typ='motorway') # edge case 101 + for _ in range(100): + placex_table.add(cls='highway', typ='motorway') # edge case 100 for _ in range(99): placex_table.add(cls='amenity', typ='prison') # edge case 99 @@ -25,8 +25,8 @@ def test_get_classtype_pair_data(placex_table, def_config, temp_db_conn): def test_get_classtype_pair_data_more(placex_table, def_config, temp_db_conn): - for _ in range(100): - placex_table.add(cls='emergency', typ='firehydrant') # edge case 100, not included + for _ in range(99): + placex_table.add(cls='emergency', typ='firehydrant') # edge case 99, not included for _ in range(199): placex_table.add(cls='amenity', typ='prison') From 7dc3924a3c640ff3e7e2a3c91c7436576fb57b0c Mon Sep 17 00:00:00 2001 From: anqixxx Date: Wed, 4 Jun 2025 01:10:14 -0700 Subject: [PATCH 3/4] Added default min = 0 argument for private functions empty --- src/nominatim_db/tools/special_phrases/sp_importer.py | 5 +++-- 1 
file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/nominatim_db/tools/special_phrases/sp_importer.py b/src/nominatim_db/tools/special_phrases/sp_importer.py index 890cf2fc..4989ef73 100644 --- a/src/nominatim_db/tools/special_phrases/sp_importer.py +++ b/src/nominatim_db/tools/special_phrases/sp_importer.py @@ -88,7 +88,8 @@ class SPImporter(): return db_combinations - def import_phrases(self, tokenizer: AbstractTokenizer, should_replace: bool, min: int) -> None: + def import_phrases(self, tokenizer: AbstractTokenizer, should_replace: bool, + min: int = 0) -> None: """ Iterate through all SpecialPhrases extracted from the loader and import them into the database. @@ -188,7 +189,7 @@ class SPImporter(): def _create_classtype_table_and_indexes(self, class_type_pairs: Iterable[Tuple[str, str]], - min: int) -> None: + min: int = 0) -> None: """ Create table place_classtype for each given pair. Also create indexes on place_id and centroid. From cf9b946eba5067790d49d5dc62290f0d9517dd07 Mon Sep 17 00:00:00 2001 From: anqixxx Date: Thu, 5 Jun 2025 09:25:14 +0800 Subject: [PATCH 4/4] Added skip for when min = 0 --- src/nominatim_db/tools/special_phrases/sp_importer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/nominatim_db/tools/special_phrases/sp_importer.py b/src/nominatim_db/tools/special_phrases/sp_importer.py index 4989ef73..12e695b6 100644 --- a/src/nominatim_db/tools/special_phrases/sp_importer.py +++ b/src/nominatim_db/tools/special_phrases/sp_importer.py @@ -203,7 +203,8 @@ class SPImporter(): with self.db_connection.cursor() as db_cursor: db_cursor.execute("CREATE INDEX idx_placex_classtype ON placex (class, type)") - allowed_special_phrases = self.get_classtype_pairs(min) + if min: + allowed_special_phrases = self.get_classtype_pairs(min) for pair in class_type_pairs: phrase_class = pair[0]