From 1a323165f93d26d8d519d7b9c6a160663a031e7f Mon Sep 17 00:00:00 2001 From: anqixxx Date: Mon, 7 Apr 2025 21:40:42 -0700 Subject: [PATCH 1/6] Filter special phrases by style and frequency to fix #235 --- .../tools/special_phrases/sp_importer.py | 64 +++++- test/python/tools/test_sp_importer.py | 193 ++++++++++++++++++ 2 files changed, 253 insertions(+), 4 deletions(-) create mode 100644 test/python/tools/test_sp_importer.py diff --git a/src/nominatim_db/tools/special_phrases/sp_importer.py b/src/nominatim_db/tools/special_phrases/sp_importer.py index 40b089a7..d9bf3165 100644 --- a/src/nominatim_db/tools/special_phrases/sp_importer.py +++ b/src/nominatim_db/tools/special_phrases/sp_importer.py @@ -16,7 +16,7 @@ from typing import Iterable, Tuple, Mapping, Sequence, Optional, Set import logging import re - +import json from psycopg.sql import Identifier, SQL from ...typing import Protocol @@ -65,6 +65,52 @@ class SPImporter(): # special phrases class/type on the wiki. self.table_phrases_to_delete: Set[str] = set() + def get_classtype_pairs_style(self) -> Set[Tuple[str, str]]: + """ + Returns list of allowed special phrases from the the style file, + restricting to a list of combinations of classes and types + which have a 'main' property + + Note: This requirement was from 2021 and I am a bit unsure if it is still relevant + """ + style_file = self.config.get_import_style_file() # this gives the path, so i will import it as a json + with open(style_file, 'r') as file: + style_data = json.loads(f'[{file.read()}]') + + style_combinations = set() + for _map in style_data: # following ../settings/import-extratags.style + classes = _map.get("keys", []) + values = _map.get("values", {}) + + for _type, properties in values.items(): + if "main" in properties and _type: # make sure the tag is not an empty string. since type is the value of the main tag + for _class in classes: + style_combinations.add((_class, _type)) + + return style_combinations + + def get_classtype_pairs(self) -> Set[Tuple[str, str]]: + """ + Returns list of allowed special phrases from the database, + restricting to a list of combinations of classes and types + whic occur more than 100 times + """ + db_combinations = set() + query = """ + SELECT class AS CLS, type AS typ + FROM placex + GROUP BY class, type + HAVING COUNT(*) > 100 + """ + + with self.db_connection.cursor() as db_cursor: + db_cursor.execute(SQL(query)) + for row in db_cursor.fetchall(): + db_combinations.add((row[0], row[1])) + + return db_combinations + + def import_phrases(self, tokenizer: AbstractTokenizer, should_replace: bool) -> None: """ Iterate through all SpecialPhrases extracted from the @@ -85,9 +131,11 @@ class SPImporter(): if result: class_type_pairs.add(result) - self._create_classtype_table_and_indexes(class_type_pairs) + self._create_classtype_table_and_indexes(class_type_pairs) if should_replace: self._remove_non_existent_tables_from_db() + + self.db_connection.commit() with tokenizer.name_analyzer() as analyzer: @@ -177,10 +225,17 @@ class SPImporter(): with self.db_connection.cursor() as db_cursor: db_cursor.execute("CREATE INDEX idx_placex_classtype ON placex (class, type)") + allowed_special_phrases = self.get_classtype_pairs() + for pair in class_type_pairs: phrase_class = pair[0] phrase_type = pair[1] + if (phrase_class, phrase_type) not in allowed_special_phrases: + LOG.warning("Skipping phrase %s=%s: not in allowed special phrases", + phrase_class, phrase_type) + continue + table_name = _classtype_table(phrase_class, phrase_type) if table_name in self.table_phrases_to_delete: @@ -212,8 +267,8 @@ class SPImporter(): if doesn't exit. """ table_name = _classtype_table(phrase_class, phrase_type) - with self.db_connection.cursor() as cur: - cur.execute(SQL("""CREATE TABLE IF NOT EXISTS {} {} AS + with self.db_connection.cursor() as db_cursor: + db_cursor.execute(SQL("""CREATE TABLE IF NOT EXISTS {} {} AS SELECT place_id AS place_id, st_centroid(geometry) AS centroid FROM placex @@ -266,3 +321,4 @@ class SPImporter(): drop_tables(self.db_connection, *self.table_phrases_to_delete) for _ in self.table_phrases_to_delete: self.statistics_handler.notify_one_table_deleted() + diff --git a/test/python/tools/test_sp_importer.py b/test/python/tools/test_sp_importer.py new file mode 100644 index 00000000..b49d2ea1 --- /dev/null +++ b/test/python/tools/test_sp_importer.py @@ -0,0 +1,193 @@ +import pytest +import tempfile +import json +import os +from unittest.mock import MagicMock + +from nominatim_db.errors import UsageError +from nominatim_db.tools.special_phrases.sp_csv_loader import SPCsvLoader +from nominatim_db.tools.special_phrases.special_phrase import SpecialPhrase +from nominatim_db.tools.special_phrases.sp_importer import SPImporter + +@pytest.fixture +def sample_style_file(): + sample_data = [ + { + "keys" : ["emergency"], + "values" : { + "fire_hydrant" : "skip", + "yes" : "skip", + "no" : "skip", + "" : "main" + } + }, + { + "keys" : ["historic", "military"], + "values" : { + "no" : "skip", + "yes" : "skip", + "" : "main" + } + }, + { + "keys" : ["name:prefix", "name:suffix", "name:prefix:*", "name:suffix:*", + "name:botanical", "wikidata", "*:wikidata"], + "values" : { + "" : "extra" + } + }, + { + "keys" : ["addr:housename"], + "values" : { + "" : "name,house" + } + }, + { + "keys": ["highway"], + "values": { + "motorway": "main", + "": "skip" + } + } + ] + content = ",".join(json.dumps(entry) for entry in sample_data) + + with tempfile.NamedTemporaryFile(mode='w+', delete=False) as tmp: + tmp.write(content) + tmp_path = tmp.name + + yield tmp_path + os.remove(tmp_path) + + +def test_get_sp_style(sample_style_file): + mock_config = MagicMock() + mock_config.get_import_style_file.return_value = sample_style_file + + importer = SPImporter(config=mock_config, conn=None, sp_loader=None) + result = importer.get_sp_style() + + expected = { + ("highway", "motorway"), + } + + assert result == expected + +@pytest.fixture +def mock_phrase(): + return SpecialPhrase( + p_label="test", + p_class="highway", + p_type="motorway", + p_operator="eq" + ) + +def test_create_classtype_table_and_indexes(): + mock_config = MagicMock() + mock_config.TABLESPACE_AUX_DATA = '' + mock_config.DATABASE_WEBUSER = 'www-data' + + mock_cursor = MagicMock() + mock_conn = MagicMock() + mock_conn.cursor.return_value.__enter__.return_value = mock_cursor + + importer = SPImporter(config=mock_config, conn=mock_conn, sp_loader=None) + + importer._create_place_classtype_table = MagicMock() + importer._create_place_classtype_indexes = MagicMock() + importer._grant_access_to_webuser = MagicMock() + importer.statistics_handler.notify_one_table_created = lambda: print("✓ Created table") + importer.statistics_handler.notify_one_table_ignored = lambda: print("⨉ Ignored table") + + importer.table_phrases_to_delete = {"place_classtype_highway_motorway"} + + test_pairs = [("highway", "motorway"), ("natural", "peak")] + importer._create_classtype_table_and_indexes(test_pairs) + + print("create_place_classtype_table calls:") + for call in importer._create_place_classtype_table.call_args_list: + print(call) + + print("\ncreate_place_classtype_indexes calls:") + for call in importer._create_place_classtype_indexes.call_args_list: + print(call) + + print("\ngrant_access_to_webuser calls:") + for call in importer._grant_access_to_webuser.call_args_list: + print(call) + +@pytest.fixture +def mock_config(): + config = MagicMock() + config.TABLESPACE_AUX_DATA = '' + config.DATABASE_WEBUSER = 'www-data' + config.load_sub_configuration.return_value = {'blackList': {}, 'whiteList': {}} + return config + + +def test_import_phrases_original(mock_config): + phrase = SpecialPhrase("roundabout", "highway", "motorway", "eq") + + mock_conn = MagicMock() + mock_cursor = MagicMock() + mock_conn.cursor.return_value.__enter__.return_value = mock_cursor + mock_loader = MagicMock() + mock_loader.generate_phrases.return_value = [phrase] + + mock_analyzer = MagicMock() + mock_tokenizer = MagicMock() + mock_tokenizer.name_analyzer.return_value.__enter__.return_value = mock_analyzer + + importer = SPImporter(config=mock_config, conn=mock_conn, sp_loader=mock_loader) + importer._fetch_existing_place_classtype_tables = MagicMock() + importer._create_classtype_table_and_indexes = MagicMock() + importer._remove_non_existent_tables_from_db = MagicMock() + + importer.import_phrases(tokenizer=mock_tokenizer, should_replace=True) + + assert importer.word_phrases == {("roundabout", "highway", "motorway", "-")} + + mock_analyzer.update_special_phrases.assert_called_once_with( + importer.word_phrases, True + ) + + +def test_get_sp_filters_correctly(sample_style_file): + mock_config = MagicMock() + mock_config.get_import_style_file.return_value = sample_style_file + mock_config.load_sub_configuration.return_value = {"blackList": {}, "whiteList": {}} + + importer = SPImporter(config=mock_config, conn=MagicMock(), sp_loader=None) + + allowed_from_db = {("highway", "motorway"), ("historic", "castle")} + importer.get_sp_db = lambda: allowed_from_db + + result = importer.get_sp() + + expected = {("highway", "motorway")} + + assert result == expected, f"Expected {expected}, got {result}" + +def test_get_sp_db_filters_by_count_threshold(mock_config): + mock_cursor = MagicMock() + + # Simulate only results above the threshold being returned (as SQL would) + # These tuples simulate real SELECT class, type FROM placex GROUP BY ... HAVING COUNT(*) > 100 + mock_cursor.fetchall.return_value = [ + ("highway", "motorway"), + ("historic", "castle") + ] + + mock_conn = MagicMock() + mock_conn.cursor.return_value.__enter__.return_value = mock_cursor + importer = SPImporter(config=mock_config, conn=mock_conn, sp_loader=None) + + result = importer.get_sp_db() + + expected = { + ("highway", "motorway"), + ("historic", "castle") + } + + assert result == expected + mock_cursor.execute.assert_called_once() From 1952290359cd1cffccc1af503ec2edcad1f401d2 Mon Sep 17 00:00:00 2001 From: anqixxx Date: Fri, 11 Apr 2025 12:03:57 -0700 Subject: [PATCH 2/6] Removed magic mocking, using monkeypatch instead, and using a placex table to simulate a 'real database' --- .../tools/special_phrases/sp_importer.py | 53 +++-- test/python/tools/test_sp_importer.py | 194 ++++++++---------- 2 files changed, 106 insertions(+), 141 deletions(-) diff --git a/src/nominatim_db/tools/special_phrases/sp_importer.py b/src/nominatim_db/tools/special_phrases/sp_importer.py index d9bf3165..a4d0eaf6 100644 --- a/src/nominatim_db/tools/special_phrases/sp_importer.py +++ b/src/nominatim_db/tools/special_phrases/sp_importer.py @@ -16,7 +16,7 @@ from typing import Iterable, Tuple, Mapping, Sequence, Optional, Set import logging import re -import json +import json from psycopg.sql import Identifier, SQL from ...typing import Protocol @@ -65,37 +65,37 @@ class SPImporter(): # special phrases class/type on the wiki. self.table_phrases_to_delete: Set[str] = set() - def get_classtype_pairs_style(self) -> Set[Tuple[str, str]]: + def get_classtype_pairs_style(self) -> Set[Tuple[str, str]]: """ - Returns list of allowed special phrases from the the style file, - restricting to a list of combinations of classes and types + Returns list of allowed special phrases from the the style file, + restricting to a list of combinations of classes and types which have a 'main' property Note: This requirement was from 2021 and I am a bit unsure if it is still relevant """ - style_file = self.config.get_import_style_file() # this gives the path, so i will import it as a json + style_file = self.config.get_import_style_file() # import style file as json with open(style_file, 'r') as file: style_data = json.loads(f'[{file.read()}]') style_combinations = set() - for _map in style_data: # following ../settings/import-extratags.style + for _map in style_data: # following ../settings/import-extratags.style classes = _map.get("keys", []) values = _map.get("values", {}) for _type, properties in values.items(): - if "main" in properties and _type: # make sure the tag is not an empty string. since type is the value of the main tag + if "main" in properties and _type: # make sure the tag is a non-empty string for _class in classes: - style_combinations.add((_class, _type)) + style_combinations.add((_class, _type)) # type is the value of the main tag return style_combinations - - def get_classtype_pairs(self) -> Set[Tuple[str, str]]: + + def get_classtype_pairs(self) -> Set[Tuple[str, str]]: """ - Returns list of allowed special phrases from the database, - restricting to a list of combinations of classes and types - whic occur more than 100 times + Returns list of allowed special phrases from the database, + restricting to a list of combinations of classes and types + which occur more than 100 times """ - db_combinations = set() + db_combinations = set() query = """ SELECT class AS CLS, type AS typ FROM placex @@ -104,13 +104,12 @@ class SPImporter(): """ with self.db_connection.cursor() as db_cursor: - db_cursor.execute(SQL(query)) + db_cursor.execute(SQL(query)) for row in db_cursor.fetchall(): db_combinations.add((row[0], row[1])) - return db_combinations + return db_combinations - def import_phrases(self, tokenizer: AbstractTokenizer, should_replace: bool) -> None: """ Iterate through all SpecialPhrases extracted from the @@ -131,11 +130,10 @@ class SPImporter(): if result: class_type_pairs.add(result) - self._create_classtype_table_and_indexes(class_type_pairs) + self._create_classtype_table_and_indexes(class_type_pairs) if should_replace: self._remove_non_existent_tables_from_db() - self.db_connection.commit() with tokenizer.name_analyzer() as analyzer: @@ -235,7 +233,7 @@ class SPImporter(): LOG.warning("Skipping phrase %s=%s: not in allowed special phrases", phrase_class, phrase_type) continue - + table_name = _classtype_table(phrase_class, phrase_type) if table_name in self.table_phrases_to_delete: @@ -268,13 +266,13 @@ class SPImporter(): """ table_name = _classtype_table(phrase_class, phrase_type) with self.db_connection.cursor() as db_cursor: - db_cursor.execute(SQL("""CREATE TABLE IF NOT EXISTS {} {} AS - SELECT place_id AS place_id, - st_centroid(geometry) AS centroid - FROM placex - WHERE class = %s AND type = %s - """).format(Identifier(table_name), SQL(sql_tablespace)), - (phrase_class, phrase_type)) + db_cursor.execute(SQL( + """CREATE TABLE IF NOT EXISTS {} {} AS + SELECT place_id AS place_id, + st_centroid(geometry) AS centroid + FROM placex WHERE class = %s AND type = %s + """).format(Identifier(table_name), SQL(sql_tablespace)), + (phrase_class, phrase_type)) def _create_place_classtype_indexes(self, sql_tablespace: str, phrase_class: str, phrase_type: str) -> None: @@ -321,4 +319,3 @@ class SPImporter(): drop_tables(self.db_connection, *self.table_phrases_to_delete) for _ in self.table_phrases_to_delete: self.statistics_handler.notify_one_table_deleted() - diff --git a/test/python/tools/test_sp_importer.py b/test/python/tools/test_sp_importer.py index b49d2ea1..4d2dd8d4 100644 --- a/test/python/tools/test_sp_importer.py +++ b/test/python/tools/test_sp_importer.py @@ -2,13 +2,10 @@ import pytest import tempfile import json import os -from unittest.mock import MagicMock -from nominatim_db.errors import UsageError -from nominatim_db.tools.special_phrases.sp_csv_loader import SPCsvLoader -from nominatim_db.tools.special_phrases.special_phrase import SpecialPhrase from nominatim_db.tools.special_phrases.sp_importer import SPImporter +# Testing Style Class Pair Retrival @pytest.fixture def sample_style_file(): sample_data = [ @@ -59,135 +56,106 @@ def sample_style_file(): yield tmp_path os.remove(tmp_path) +def test_get_classtype_style(sample_style_file): + class Config: + def get_import_style_file(self): + return sample_style_file + + def load_sub_configuration(self, name): + return {'blackList': {}, 'whiteList': {}} -def test_get_sp_style(sample_style_file): - mock_config = MagicMock() - mock_config.get_import_style_file.return_value = sample_style_file + config = Config() + importer = SPImporter(config=config, conn=None, sp_loader=None) - importer = SPImporter(config=mock_config, conn=None, sp_loader=None) - result = importer.get_sp_style() + result = importer.get_classtype_pairs_style() expected = { ("highway", "motorway"), } + assert expected.issubset(result) + +# Testing Database Class Pair Retrival using Mock Database +def test_get_classtype_pairs(monkeypatch): + class Config: + def load_sub_configuration(self, path, section=None): + return {"blackList": {}, "whiteList": {}} + + class Cursor: + def execute(self, query): pass + def fetchall(self): + return [ + ("highway", "motorway"), + ("historic", "castle") + ] + def __enter__(self): return self + def __exit__(self, exc_type, exc_val, exc_tb): pass + + class Connection: + def cursor(self): return Cursor() + + config = Config() + conn = Connection() + importer = SPImporter(config=config, conn=conn, sp_loader=None) + + result = importer.get_classtype_pairs() + + expected = { + ("highway", "motorway"), + ("historic", "castle") + } + assert result == expected -@pytest.fixture -def mock_phrase(): - return SpecialPhrase( - p_label="test", - p_class="highway", - p_type="motorway", - p_operator="eq" - ) +# Testing Database Class Pair Retrival using Conftest.py and placex +def test_get_classtype_pair_data(placex_table, temp_db_conn): + class Config: + def load_sub_configuration(self, *_): + return {'blackList': {}, 'whiteList': {}} + + for _ in range(101): + placex_table.add(cls='highway', typ='motorway') # edge case 101 -def test_create_classtype_table_and_indexes(): - mock_config = MagicMock() - mock_config.TABLESPACE_AUX_DATA = '' - mock_config.DATABASE_WEBUSER = 'www-data' + for _ in range(99): + placex_table.add(cls='amenity', typ='prison') # edge case 99 - mock_cursor = MagicMock() - mock_conn = MagicMock() - mock_conn.cursor.return_value.__enter__.return_value = mock_cursor + for _ in range(150): + placex_table.add(cls='tourism', typ='hotel') - importer = SPImporter(config=mock_config, conn=mock_conn, sp_loader=None) + config = Config() + importer = SPImporter(config=config, conn=temp_db_conn, sp_loader=None) - importer._create_place_classtype_table = MagicMock() - importer._create_place_classtype_indexes = MagicMock() - importer._grant_access_to_webuser = MagicMock() - importer.statistics_handler.notify_one_table_created = lambda: print("✓ Created table") - importer.statistics_handler.notify_one_table_ignored = lambda: print("⨉ Ignored table") + result = importer.get_classtype_pairs() - importer.table_phrases_to_delete = {"place_classtype_highway_motorway"} - - test_pairs = [("highway", "motorway"), ("natural", "peak")] - importer._create_classtype_table_and_indexes(test_pairs) - - print("create_place_classtype_table calls:") - for call in importer._create_place_classtype_table.call_args_list: - print(call) - - print("\ncreate_place_classtype_indexes calls:") - for call in importer._create_place_classtype_indexes.call_args_list: - print(call) - - print("\ngrant_access_to_webuser calls:") - for call in importer._grant_access_to_webuser.call_args_list: - print(call) - -@pytest.fixture -def mock_config(): - config = MagicMock() - config.TABLESPACE_AUX_DATA = '' - config.DATABASE_WEBUSER = 'www-data' - config.load_sub_configuration.return_value = {'blackList': {}, 'whiteList': {}} - return config - - -def test_import_phrases_original(mock_config): - phrase = SpecialPhrase("roundabout", "highway", "motorway", "eq") - - mock_conn = MagicMock() - mock_cursor = MagicMock() - mock_conn.cursor.return_value.__enter__.return_value = mock_cursor - mock_loader = MagicMock() - mock_loader.generate_phrases.return_value = [phrase] - - mock_analyzer = MagicMock() - mock_tokenizer = MagicMock() - mock_tokenizer.name_analyzer.return_value.__enter__.return_value = mock_analyzer - - importer = SPImporter(config=mock_config, conn=mock_conn, sp_loader=mock_loader) - importer._fetch_existing_place_classtype_tables = MagicMock() - importer._create_classtype_table_and_indexes = MagicMock() - importer._remove_non_existent_tables_from_db = MagicMock() - - importer.import_phrases(tokenizer=mock_tokenizer, should_replace=True) - - assert importer.word_phrases == {("roundabout", "highway", "motorway", "-")} - - mock_analyzer.update_special_phrases.assert_called_once_with( - importer.word_phrases, True - ) - - -def test_get_sp_filters_correctly(sample_style_file): - mock_config = MagicMock() - mock_config.get_import_style_file.return_value = sample_style_file - mock_config.load_sub_configuration.return_value = {"blackList": {}, "whiteList": {}} - - importer = SPImporter(config=mock_config, conn=MagicMock(), sp_loader=None) - - allowed_from_db = {("highway", "motorway"), ("historic", "castle")} - importer.get_sp_db = lambda: allowed_from_db - - result = importer.get_sp() - - expected = {("highway", "motorway")} + expected = { + ("highway", "motorway"), + ("tourism", "hotel") + } assert result == expected, f"Expected {expected}, got {result}" -def test_get_sp_db_filters_by_count_threshold(mock_config): - mock_cursor = MagicMock() - - # Simulate only results above the threshold being returned (as SQL would) - # These tuples simulate real SELECT class, type FROM placex GROUP BY ... HAVING COUNT(*) > 100 - mock_cursor.fetchall.return_value = [ - ("highway", "motorway"), - ("historic", "castle") - ] +def test_get_classtype_pair_data_more(placex_table, temp_db_conn): + class Config: + def load_sub_configuration(self, *_): + return {'blackList': {}, 'whiteList': {}} + + for _ in range(100): + placex_table.add(cls='emergency', typ='firehydrant') # edge case 100, not included - mock_conn = MagicMock() - mock_conn.cursor.return_value.__enter__.return_value = mock_cursor - importer = SPImporter(config=mock_config, conn=mock_conn, sp_loader=None) + for _ in range(199): + placex_table.add(cls='amenity', typ='prison') - result = importer.get_sp_db() + for _ in range(3478): + placex_table.add(cls='tourism', typ='hotel') + + config = Config() + importer = SPImporter(config=config, conn=temp_db_conn, sp_loader=None) + + result = importer.get_classtype_pairs() expected = { - ("highway", "motorway"), - ("historic", "castle") + ("amenity", "prison"), + ("tourism", "hotel") } - assert result == expected - mock_cursor.execute.assert_called_once() + assert result == expected, f"Expected {expected}, got {result}" From 59a947c5f5ae8a3e588158aab1730ae8377ab77e Mon Sep 17 00:00:00 2001 From: anqixxx Date: Mon, 14 Apr 2025 10:14:57 -0700 Subject: [PATCH 3/6] Removed class type pair getter that used style sheets from both spi_importer and the associated testing function --- .../tools/special_phrases/sp_importer.py | 25 ------- test/python/tools/test_sp_importer.py | 71 ------------------- 2 files changed, 96 deletions(-) diff --git a/src/nominatim_db/tools/special_phrases/sp_importer.py b/src/nominatim_db/tools/special_phrases/sp_importer.py index a4d0eaf6..89ac6dac 100644 --- a/src/nominatim_db/tools/special_phrases/sp_importer.py +++ b/src/nominatim_db/tools/special_phrases/sp_importer.py @@ -16,7 +16,6 @@ from typing import Iterable, Tuple, Mapping, Sequence, Optional, Set import logging import re -import json from psycopg.sql import Identifier, SQL from ...typing import Protocol @@ -65,30 +64,6 @@ class SPImporter(): # special phrases class/type on the wiki. self.table_phrases_to_delete: Set[str] = set() - def get_classtype_pairs_style(self) -> Set[Tuple[str, str]]: - """ - Returns list of allowed special phrases from the the style file, - restricting to a list of combinations of classes and types - which have a 'main' property - - Note: This requirement was from 2021 and I am a bit unsure if it is still relevant - """ - style_file = self.config.get_import_style_file() # import style file as json - with open(style_file, 'r') as file: - style_data = json.loads(f'[{file.read()}]') - - style_combinations = set() - for _map in style_data: # following ../settings/import-extratags.style - classes = _map.get("keys", []) - values = _map.get("values", {}) - - for _type, properties in values.items(): - if "main" in properties and _type: # make sure the tag is a non-empty string - for _class in classes: - style_combinations.add((_class, _type)) # type is the value of the main tag - - return style_combinations - def get_classtype_pairs(self) -> Set[Tuple[str, str]]: """ Returns list of allowed special phrases from the database, diff --git a/test/python/tools/test_sp_importer.py b/test/python/tools/test_sp_importer.py index 4d2dd8d4..b27172c8 100644 --- a/test/python/tools/test_sp_importer.py +++ b/test/python/tools/test_sp_importer.py @@ -1,80 +1,9 @@ import pytest import tempfile -import json import os from nominatim_db.tools.special_phrases.sp_importer import SPImporter -# Testing Style Class Pair Retrival -@pytest.fixture -def sample_style_file(): - sample_data = [ - { - "keys" : ["emergency"], - "values" : { - "fire_hydrant" : "skip", - "yes" : "skip", - "no" : "skip", - "" : "main" - } - }, - { - "keys" : ["historic", "military"], - "values" : { - "no" : "skip", - "yes" : "skip", - "" : "main" - } - }, - { - "keys" : ["name:prefix", "name:suffix", "name:prefix:*", "name:suffix:*", - "name:botanical", "wikidata", "*:wikidata"], - "values" : { - "" : "extra" - } - }, - { - "keys" : ["addr:housename"], - "values" : { - "" : "name,house" - } - }, - { - "keys": ["highway"], - "values": { - "motorway": "main", - "": "skip" - } - } - ] - content = ",".join(json.dumps(entry) for entry in sample_data) - - with tempfile.NamedTemporaryFile(mode='w+', delete=False) as tmp: - tmp.write(content) - tmp_path = tmp.name - - yield tmp_path - os.remove(tmp_path) - -def test_get_classtype_style(sample_style_file): - class Config: - def get_import_style_file(self): - return sample_style_file - - def load_sub_configuration(self, name): - return {'blackList': {}, 'whiteList': {}} - - config = Config() - importer = SPImporter(config=config, conn=None, sp_loader=None) - - result = importer.get_classtype_pairs_style() - - expected = { - ("highway", "motorway"), - } - - assert expected.issubset(result) - # Testing Database Class Pair Retrival using Mock Database def test_get_classtype_pairs(monkeypatch): class Config: From 3f51cb3fd1e5ebf0f3cf646d0142aebe0d33716a Mon Sep 17 00:00:00 2001 From: anqixxx Date: Mon, 14 Apr 2025 10:21:07 -0700 Subject: [PATCH 4/6] Made the limit configurable with an optional argument, updating the testing as well to reflect this. default is now 0, meaning that it will return everything that occurs more than once. Removed mock database test, and got rid of fetch all. Rebased all tests to monkeypatch --- .../tools/special_phrases/sp_importer.py | 12 +-- test/python/tools/test_sp_importer.py | 89 +++++++------------ 2 files changed, 41 insertions(+), 60 deletions(-) diff --git a/src/nominatim_db/tools/special_phrases/sp_importer.py b/src/nominatim_db/tools/special_phrases/sp_importer.py index 89ac6dac..323decf9 100644 --- a/src/nominatim_db/tools/special_phrases/sp_importer.py +++ b/src/nominatim_db/tools/special_phrases/sp_importer.py @@ -64,23 +64,25 @@ class SPImporter(): # special phrases class/type on the wiki. self.table_phrases_to_delete: Set[str] = set() - def get_classtype_pairs(self) -> Set[Tuple[str, str]]: + def get_classtype_pairs(self, min: int = 0) -> Set[Tuple[str, str]]: """ Returns list of allowed special phrases from the database, restricting to a list of combinations of classes and types - which occur more than 100 times + which occur more than a specified amount of times. + + Default value for this, if not specified, is at least once. """ db_combinations = set() - query = """ + query = f""" SELECT class AS CLS, type AS typ FROM placex GROUP BY class, type - HAVING COUNT(*) > 100 + HAVING COUNT(*) > {min} """ with self.db_connection.cursor() as db_cursor: db_cursor.execute(SQL(query)) - for row in db_cursor.fetchall(): + for row in db_cursor: db_combinations.add((row[0], row[1])) return db_combinations diff --git a/test/python/tools/test_sp_importer.py b/test/python/tools/test_sp_importer.py index b27172c8..dda02f11 100644 --- a/test/python/tools/test_sp_importer.py +++ b/test/python/tools/test_sp_importer.py @@ -1,60 +1,20 @@ -import pytest -import tempfile -import os - from nominatim_db.tools.special_phrases.sp_importer import SPImporter -# Testing Database Class Pair Retrival using Mock Database -def test_get_classtype_pairs(monkeypatch): - class Config: - def load_sub_configuration(self, path, section=None): - return {"blackList": {}, "whiteList": {}} - - class Cursor: - def execute(self, query): pass - def fetchall(self): - return [ - ("highway", "motorway"), - ("historic", "castle") - ] - def __enter__(self): return self - def __exit__(self, exc_type, exc_val, exc_tb): pass - - class Connection: - def cursor(self): return Cursor() - - config = Config() - conn = Connection() - importer = SPImporter(config=config, conn=conn, sp_loader=None) - - result = importer.get_classtype_pairs() - - expected = { - ("highway", "motorway"), - ("historic", "castle") - } - - assert result == expected # Testing Database Class Pair Retrival using Conftest.py and placex -def test_get_classtype_pair_data(placex_table, temp_db_conn): - class Config: - def load_sub_configuration(self, *_): - return {'blackList': {}, 'whiteList': {}} - +def test_get_classtype_pair_data(placex_table, def_config, temp_db_conn): for _ in range(101): - placex_table.add(cls='highway', typ='motorway') # edge case 101 + placex_table.add(cls='highway', typ='motorway') # edge case 101 for _ in range(99): - placex_table.add(cls='amenity', typ='prison') # edge case 99 + placex_table.add(cls='amenity', typ='prison') # edge case 99 for _ in range(150): placex_table.add(cls='tourism', typ='hotel') - config = Config() - importer = SPImporter(config=config, conn=temp_db_conn, sp_loader=None) + importer = SPImporter(config=def_config, conn=temp_db_conn, sp_loader=None) - result = importer.get_classtype_pairs() + result = importer.get_classtype_pairs(min=100) expected = { ("highway", "motorway"), @@ -63,24 +23,20 @@ def test_get_classtype_pair_data(placex_table, temp_db_conn): assert result == expected, f"Expected {expected}, got {result}" -def test_get_classtype_pair_data_more(placex_table, temp_db_conn): - class Config: - def load_sub_configuration(self, *_): - return {'blackList': {}, 'whiteList': {}} - + +def test_get_classtype_pair_data_more(placex_table, def_config, temp_db_conn): for _ in range(100): - placex_table.add(cls='emergency', typ='firehydrant') # edge case 100, not included + placex_table.add(cls='emergency', typ='firehydrant') # edge case 100, not included for _ in range(199): - placex_table.add(cls='amenity', typ='prison') + placex_table.add(cls='amenity', typ='prison') for _ in range(3478): placex_table.add(cls='tourism', typ='hotel') - config = Config() - importer = SPImporter(config=config, conn=temp_db_conn, sp_loader=None) + importer = SPImporter(config=def_config, conn=temp_db_conn, sp_loader=None) - result = importer.get_classtype_pairs() + result = importer.get_classtype_pairs(min=100) expected = { ("amenity", "prison"), @@ -88,3 +44,26 @@ def test_get_classtype_pair_data_more(placex_table, temp_db_conn): } assert result == expected, f"Expected {expected}, got {result}" + + +def test_get_classtype_pair_data_default(placex_table, def_config, temp_db_conn): + for _ in range(1): + placex_table.add(cls='emergency', typ='firehydrant') + + for _ in range(199): + placex_table.add(cls='amenity', typ='prison') + + for _ in range(3478): + placex_table.add(cls='tourism', typ='hotel') + + importer = SPImporter(config=def_config, conn=temp_db_conn, sp_loader=None) + + result = importer.get_classtype_pairs() + + expected = { + ("amenity", "prison"), + ("tourism", "hotel"), + ("emergency", "firehydrant") + } + + assert result == expected, f"Expected {expected}, got {result}" From 618fbc63d7144b250ac50698119b51d1195167ca Mon Sep 17 00:00:00 2001 From: anqixxx Date: Thu, 1 May 2025 13:43:53 -0700 Subject: [PATCH 5/6] Added testing to test get classtype pairs in import special phrases --- .../tools/special_phrases/sp_importer.py | 16 +++---- .../tools/test_import_special_phrases.py | 48 +++++++++++++++++-- 2 files changed, 51 insertions(+), 13 deletions(-) diff --git a/src/nominatim_db/tools/special_phrases/sp_importer.py b/src/nominatim_db/tools/special_phrases/sp_importer.py index 323decf9..ac50377f 100644 --- a/src/nominatim_db/tools/special_phrases/sp_importer.py +++ b/src/nominatim_db/tools/special_phrases/sp_importer.py @@ -242,14 +242,14 @@ class SPImporter(): if doesn't exit. """ table_name = _classtype_table(phrase_class, phrase_type) - with self.db_connection.cursor() as db_cursor: - db_cursor.execute(SQL( - """CREATE TABLE IF NOT EXISTS {} {} AS - SELECT place_id AS place_id, - st_centroid(geometry) AS centroid - FROM placex WHERE class = %s AND type = %s - """).format(Identifier(table_name), SQL(sql_tablespace)), - (phrase_class, phrase_type)) + with self.db_connection.cursor() as cur: + cur.execute(SQL("""CREATE TABLE IF NOT EXISTS {} {} AS + SELECT place_id AS place_id, + st_centroid(geometry) AS centroid + FROM placex + WHERE class = %s AND type = %s + """).format(Identifier(table_name), SQL(sql_tablespace)), + (phrase_class, phrase_type)) def _create_place_classtype_indexes(self, sql_tablespace: str, phrase_class: str, phrase_type: str) -> None: diff --git a/test/python/tools/test_import_special_phrases.py b/test/python/tools/test_import_special_phrases.py index d8fe8946..c676c40a 100644 --- a/test/python/tools/test_import_special_phrases.py +++ b/test/python/tools/test_import_special_phrases.py @@ -127,7 +127,7 @@ def test_grant_access_to_web_user(temp_db_conn, temp_db_cursor, table_factory, def test_create_place_classtype_table_and_indexes( temp_db_cursor, def_config, placex_table, - sp_importer, temp_db_conn): + sp_importer, temp_db_conn, monkeypatch): """ Test that _create_place_classtype_table_and_indexes() create the right place_classtype tables and place_id indexes @@ -135,7 +135,8 @@ def test_create_place_classtype_table_and_indexes( for the given set of pairs. """ pairs = set([('class1', 'type1'), ('class2', 'type2')]) - + for pair in pairs: + placex_table.add(cls=pair[0], typ=pair[1]) # adding to db sp_importer._create_classtype_table_and_indexes(pairs) temp_db_conn.commit() @@ -194,14 +195,16 @@ def test_import_phrases(monkeypatch, temp_db_cursor, def_config, sp_importer, monkeypatch.setattr('nominatim_db.tools.special_phrases.sp_wiki_loader._get_wiki_content', lambda lang: xml_wiki_content) + class_test = 'aerialway' + type_test = 'zip_line' + tokenizer = tokenizer_mock() + placex_table.add(cls=class_test, typ=type_test) # in db for special phrase filtering + placex_table.add(cls='amenity', typ='animal_shelter') # in db for special phrase filtering sp_importer.import_phrases(tokenizer, should_replace) assert len(tokenizer.analyser_cache['special_phrases']) == 18 - class_test = 'aerialway' - type_test = 'zip_line' - assert check_table_exist(temp_db_cursor, class_test, type_test) assert check_placeid_and_centroid_indexes(temp_db_cursor, class_test, type_test) assert check_grant_access(temp_db_cursor, def_config.DATABASE_WEBUSER, class_test, type_test) @@ -250,3 +253,38 @@ def check_placeid_and_centroid_indexes(temp_db_cursor, phrase_class, phrase_type and temp_db_cursor.index_exists(table_name, index_prefix + 'place_id') ) + + +@pytest.mark.parametrize("should_replace", [(True), (False)]) +def test_import_phrases_special_phrase_filtering(monkeypatch, temp_db_cursor, def_config, + sp_importer, placex_table, tokenizer_mock, + xml_wiki_content, should_replace): + + monkeypatch.setattr('nominatim_db.tools.special_phrases.sp_wiki_loader._get_wiki_content', + lambda lang: xml_wiki_content) + + class_test = 'aerialway' + type_test = 'zip_line' + + placex_table.add(cls=class_test, typ=type_test) # add to the database to make valid + tokenizer = tokenizer_mock() + sp_importer.import_phrases(tokenizer, should_replace) + + assert ('Zip Line', 'aerialway', 'zip_line', '-') in sp_importer.word_phrases + assert check_table_exist(temp_db_cursor, class_test, type_test) + assert check_placeid_and_centroid_indexes(temp_db_cursor, class_test, type_test) + assert check_grant_access(temp_db_cursor, def_config.DATABASE_WEBUSER, class_test, type_test) + + +def test_get_classtype_pairs_directly(placex_table, temp_db_conn, sp_importer): + for _ in range(101): + placex_table.add(cls='highway', typ='residential') + for _ in range(99): + placex_table.add(cls='amenity', typ='toilet') + + temp_db_conn.commit() + + result = sp_importer.get_classtype_pairs(100) + print("RESULT:", result) + assert ('highway', 'residential') in result + assert ('amenity', 'toilet') not in result From 6220bde2d61e2c18f0aaf46132ef1adffb1cf37e Mon Sep 17 00:00:00 2001 From: anqixxx Date: Wed, 21 May 2025 10:47:01 -0700 Subject: [PATCH 6/6] Added mypy ignore fix for logging.py (library change), as well as quick mac fix on mem.cached --- src/nominatim_api/logging.py | 3 ++- src/nominatim_db/tools/database_import.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/nominatim_api/logging.py b/src/nominatim_api/logging.py index 1a6aef9b..64d43fdc 100644 --- a/src/nominatim_api/logging.py +++ b/src/nominatim_api/logging.py @@ -342,7 +342,8 @@ HTML_HEADER: str = """ Nominatim - Debug