Merge pull request #3710 from anqixxx/fix-special-phrases-filtering

Fix special phrases filtering
This commit is contained in:
Sarah Hoffmann
2025-05-21 21:34:28 +02:00
committed by GitHub
5 changed files with 146 additions and 8 deletions

View File

@@ -342,7 +342,8 @@ HTML_HEADER: str = """<!DOCTYPE html>
<title>Nominatim - Debug</title> <title>Nominatim - Debug</title>
<style> <style>
""" + \ """ + \
(HtmlFormatter(nobackground=True).get_style_defs('.highlight') if CODE_HIGHLIGHT else '') + \ (HtmlFormatter(nobackground=True).get_style_defs('.highlight') # type: ignore[no-untyped-call]
if CODE_HIGHLIGHT else '') + \
""" """
h2 { font-size: x-large } h2 { font-size: x-large }

View File

@@ -127,7 +127,7 @@ def import_osm_data(osm_files: Union[Path, Sequence[Path]],
fsize += os.stat(str(fname)).st_size fsize += os.stat(str(fname)).st_size
else: else:
fsize = os.stat(str(osm_files)).st_size fsize = os.stat(str(osm_files)).st_size
options['osm2pgsql_cache'] = int(min((mem.available + mem.cached) * 0.75, options['osm2pgsql_cache'] = int(min((mem.available + getattr(mem, 'cached', 0)) * 0.75,
fsize * 2) / 1024 / 1024) + 1 fsize * 2) / 1024 / 1024) + 1
run_osm2pgsql(options) run_osm2pgsql(options)

View File

@@ -16,7 +16,6 @@
from typing import Iterable, Tuple, Mapping, Sequence, Optional, Set from typing import Iterable, Tuple, Mapping, Sequence, Optional, Set
import logging import logging
import re import re
from psycopg.sql import Identifier, SQL from psycopg.sql import Identifier, SQL
from ...typing import Protocol from ...typing import Protocol
@@ -65,6 +64,29 @@ class SPImporter():
# special phrases class/type on the wiki. # special phrases class/type on the wiki.
self.table_phrases_to_delete: Set[str] = set() self.table_phrases_to_delete: Set[str] = set()
def get_classtype_pairs(self, min: int = 0) -> Set[Tuple[str, str]]:
    """ Return the set of class/type pairs present in the placex table.

        Only pairs that occur strictly more than `min` times are
        included. With the default of 0 this means every pair that
        occurs at least once.

        (The parameter name `min` shadows the builtin of the same name
        but is kept for backward compatibility with existing callers.)
    """
    # Use a bind parameter instead of f-string interpolation so the
    # threshold value is never spliced into the SQL text directly.
    query = SQL("""SELECT class, type
                   FROM placex
                   GROUP BY class, type
                   HAVING COUNT(*) > %s""")
    with self.db_connection.cursor() as db_cursor:
        db_cursor.execute(query, (min,))
        db_combinations = {(row[0], row[1]) for row in db_cursor}
    return db_combinations
def import_phrases(self, tokenizer: AbstractTokenizer, should_replace: bool) -> None: def import_phrases(self, tokenizer: AbstractTokenizer, should_replace: bool) -> None:
""" """
Iterate through all SpecialPhrases extracted from the Iterate through all SpecialPhrases extracted from the
@@ -88,6 +110,7 @@ class SPImporter():
self._create_classtype_table_and_indexes(class_type_pairs) self._create_classtype_table_and_indexes(class_type_pairs)
if should_replace: if should_replace:
self._remove_non_existent_tables_from_db() self._remove_non_existent_tables_from_db()
self.db_connection.commit() self.db_connection.commit()
with tokenizer.name_analyzer() as analyzer: with tokenizer.name_analyzer() as analyzer:
@@ -177,10 +200,17 @@ class SPImporter():
with self.db_connection.cursor() as db_cursor: with self.db_connection.cursor() as db_cursor:
db_cursor.execute("CREATE INDEX idx_placex_classtype ON placex (class, type)") db_cursor.execute("CREATE INDEX idx_placex_classtype ON placex (class, type)")
allowed_special_phrases = self.get_classtype_pairs()
for pair in class_type_pairs: for pair in class_type_pairs:
phrase_class = pair[0] phrase_class = pair[0]
phrase_type = pair[1] phrase_type = pair[1]
if (phrase_class, phrase_type) not in allowed_special_phrases:
LOG.warning("Skipping phrase %s=%s: not in allowed special phrases",
phrase_class, phrase_type)
continue
table_name = _classtype_table(phrase_class, phrase_type) table_name = _classtype_table(phrase_class, phrase_type)
if table_name in self.table_phrases_to_delete: if table_name in self.table_phrases_to_delete:

View File

@@ -127,7 +127,7 @@ def test_grant_access_to_web_user(temp_db_conn, temp_db_cursor, table_factory,
def test_create_place_classtype_table_and_indexes( def test_create_place_classtype_table_and_indexes(
temp_db_cursor, def_config, placex_table, temp_db_cursor, def_config, placex_table,
sp_importer, temp_db_conn): sp_importer, temp_db_conn, monkeypatch):
""" """
Test that _create_place_classtype_table_and_indexes() Test that _create_place_classtype_table_and_indexes()
create the right place_classtype tables and place_id indexes create the right place_classtype tables and place_id indexes
@@ -135,7 +135,8 @@ def test_create_place_classtype_table_and_indexes(
for the given set of pairs. for the given set of pairs.
""" """
pairs = set([('class1', 'type1'), ('class2', 'type2')]) pairs = set([('class1', 'type1'), ('class2', 'type2')])
for pair in pairs:
placex_table.add(cls=pair[0], typ=pair[1]) # adding to db
sp_importer._create_classtype_table_and_indexes(pairs) sp_importer._create_classtype_table_and_indexes(pairs)
temp_db_conn.commit() temp_db_conn.commit()
@@ -194,14 +195,16 @@ def test_import_phrases(monkeypatch, temp_db_cursor, def_config, sp_importer,
monkeypatch.setattr('nominatim_db.tools.special_phrases.sp_wiki_loader._get_wiki_content', monkeypatch.setattr('nominatim_db.tools.special_phrases.sp_wiki_loader._get_wiki_content',
lambda lang: xml_wiki_content) lambda lang: xml_wiki_content)
class_test = 'aerialway'
type_test = 'zip_line'
tokenizer = tokenizer_mock() tokenizer = tokenizer_mock()
placex_table.add(cls=class_test, typ=type_test) # in db for special phrase filtering
placex_table.add(cls='amenity', typ='animal_shelter') # in db for special phrase filtering
sp_importer.import_phrases(tokenizer, should_replace) sp_importer.import_phrases(tokenizer, should_replace)
assert len(tokenizer.analyser_cache['special_phrases']) == 18 assert len(tokenizer.analyser_cache['special_phrases']) == 18
class_test = 'aerialway'
type_test = 'zip_line'
assert check_table_exist(temp_db_cursor, class_test, type_test) assert check_table_exist(temp_db_cursor, class_test, type_test)
assert check_placeid_and_centroid_indexes(temp_db_cursor, class_test, type_test) assert check_placeid_and_centroid_indexes(temp_db_cursor, class_test, type_test)
assert check_grant_access(temp_db_cursor, def_config.DATABASE_WEBUSER, class_test, type_test) assert check_grant_access(temp_db_cursor, def_config.DATABASE_WEBUSER, class_test, type_test)
@@ -250,3 +253,38 @@ def check_placeid_and_centroid_indexes(temp_db_cursor, phrase_class, phrase_type
and and
temp_db_cursor.index_exists(table_name, index_prefix + 'place_id') temp_db_cursor.index_exists(table_name, index_prefix + 'place_id')
) )
@pytest.mark.parametrize("should_replace", [True, False])
def test_import_phrases_special_phrase_filtering(monkeypatch, temp_db_cursor, def_config,
                                                 sp_importer, placex_table, tokenizer_mock,
                                                 xml_wiki_content, should_replace):
    """ A phrase whose class/type pair exists in placex must be imported
        and get its place_classtype table, indexes and web-user grant.
    """
    monkeypatch.setattr('nominatim_db.tools.special_phrases.sp_wiki_loader._get_wiki_content',
                        lambda lang: xml_wiki_content)

    class_test = 'aerialway'
    type_test = 'zip_line'
    # The pair must be present in placex, otherwise the filter drops it.
    placex_table.add(cls=class_test, typ=type_test)

    tokenizer = tokenizer_mock()
    sp_importer.import_phrases(tokenizer, should_replace)

    assert ('Zip Line', 'aerialway', 'zip_line', '-') in sp_importer.word_phrases
    assert check_table_exist(temp_db_cursor, class_test, type_test)
    assert check_placeid_and_centroid_indexes(temp_db_cursor, class_test, type_test)
    assert check_grant_access(temp_db_cursor, def_config.DATABASE_WEBUSER, class_test, type_test)
def test_get_classtype_pairs_directly(placex_table, temp_db_conn, sp_importer):
    """ get_classtype_pairs() applies a strict 'more than' threshold:
        101 occurrences pass a threshold of 100, 99 occurrences do not.
    """
    for _ in range(101):
        placex_table.add(cls='highway', typ='residential')
    for _ in range(99):
        placex_table.add(cls='amenity', typ='toilet')
    temp_db_conn.commit()

    result = sp_importer.get_classtype_pairs(100)

    assert ('highway', 'residential') in result
    assert ('amenity', 'toilet') not in result

View File

@@ -0,0 +1,69 @@
from nominatim_db.tools.special_phrases.sp_importer import SPImporter


# Tests for retrieving class/type pairs from the database via the
# conftest.py placex fixture.
def test_get_classtype_pair_data(placex_table, def_config, temp_db_conn):
    """ Pairs occurring strictly more than 100 times are returned;
        a pair with only 99 rows is filtered out.
    """
    for _ in range(101):
        placex_table.add(cls='highway', typ='motorway')   # just above threshold
    for _ in range(99):
        placex_table.add(cls='amenity', typ='prison')     # just below threshold
    for _ in range(150):
        placex_table.add(cls='tourism', typ='hotel')

    importer = SPImporter(config=def_config, conn=temp_db_conn, sp_loader=None)
    expected = {('highway', 'motorway'), ('tourism', 'hotel')}

    result = importer.get_classtype_pairs(min=100)
    assert result == expected, f"Expected {expected}, got {result}"
def test_get_classtype_pair_data_more(placex_table, def_config, temp_db_conn):
    """ A pair occurring exactly 100 times must NOT pass a threshold of
        100 (the comparison is strictly greater than).
    """
    for _ in range(100):
        placex_table.add(cls='emergency', typ='firehydrant')  # exactly at threshold: excluded
    for _ in range(199):
        placex_table.add(cls='amenity', typ='prison')
    for _ in range(3478):
        placex_table.add(cls='tourism', typ='hotel')

    importer = SPImporter(config=def_config, conn=temp_db_conn, sp_loader=None)
    expected = {('amenity', 'prison'), ('tourism', 'hotel')}

    result = importer.get_classtype_pairs(min=100)
    assert result == expected, f"Expected {expected}, got {result}"
def test_get_classtype_pair_data_default(placex_table, def_config, temp_db_conn):
    """ With the default threshold (0) every pair that occurs at least
        once is returned, including a pair with a single row.
    """
    # A single row is enough with the default threshold.
    placex_table.add(cls='emergency', typ='firehydrant')
    for _ in range(199):
        placex_table.add(cls='amenity', typ='prison')
    for _ in range(3478):
        placex_table.add(cls='tourism', typ='hotel')

    importer = SPImporter(config=def_config, conn=temp_db_conn, sp_loader=None)

    result = importer.get_classtype_pairs()

    expected = {
        ("amenity", "prison"),
        ("tourism", "hotel"),
        ("emergency", "firehydrant")
    }
    assert result == expected, f"Expected {expected}, got {result}"