split code into submodules

2024-05-16 11:55:17 +02:00
parent 0fb4fe8e4d
commit 6e89310a92
137 changed files with 757 additions and 716 deletions
--- a/src/nominatim_db/tools/special_phrases/init.py
+++ b/src/nominatim_db/tools/special_phrases/init.py
--- a/src/nominatim_db/tools/special_phrases/importer_statistics.py
+++ b/src/nominatim_db/tools/special_phrases/importer_statistics.py
@@ -0,0 +1,78 @@
+# SPDX-License-Identifier: GPL-3.0-or-later
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2024 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+    Contains the class which handles statistics for the
+    import of special phrases.
+"""
+import logging
+# Logger used by this module; getLogger() without a name returns the root logger.
+LOG = logging.getLogger()
+
+class SpecialPhrasesImporterStatistics():
+    """
+        Bookkeeping for one import run of special phrases.
+
+        Counts created, deleted and ignored tables as well as
+        invalid phrases and reports them to the log at the end.
+    """
+    def __init__(self) -> None:
+        self._intialize_values()
+
+    def _intialize_values(self) -> None:
+        """
+            Put every counter of the import back to 0.
+        """
+        self.tables_created = self.tables_deleted = self.tables_ignored = self.invalids = 0
+
+    def notify_one_phrase_invalid(self) -> None:
+        """
+            Record one more invalid entry
+            among the loaded phrases.
+        """
+        self.invalids = self.invalids + 1
+
+    def notify_one_table_created(self) -> None:
+        """
+            Record one more created table.
+        """
+        self.tables_created = self.tables_created + 1
+
+    def notify_one_table_deleted(self) -> None:
+        """
+            Record one more deleted table.
+        """
+        self.tables_deleted = self.tables_deleted + 1
+
+    def notify_one_table_ignored(self) -> None:
+        """
+            Record one more ignored table.
+        """
+        self.tables_ignored = self.tables_ignored + 1
+
+    def notify_import_done(self) -> None:
+        """
+            Write the summary of the whole import to the log
+            and start over with all counters at 0.
+        """
+        has_invalids = self.invalids > 0
+        has_deleted = self.tables_deleted > 0
+
+        LOG.info('====================================================================')
+        LOG.info('Final statistics of the import:')
+        LOG.info('- %s phrases were invalid.', self.invalids)
+        if has_invalids:
+            LOG.info('  Those invalid phrases have been skipped.')
+        LOG.info('- %s tables were ignored as they already exist on the database',
+                 self.tables_ignored)
+        LOG.info('- %s tables were created', self.tables_created)
+        LOG.info('- %s tables were deleted from the database', self.tables_deleted)
+        if has_deleted:
+            LOG.info('  They were deleted as they are not valid anymore.')
+
+        # Invalid phrases are reported a second time at warning level.
+        if has_invalids:
+            LOG.warning('%s phrases were invalid and have been skipped during the whole process.',
+                        self.invalids)
+
+        self._intialize_values()
--- a/src/nominatim_db/tools/special_phrases/sp_csv_loader.py
+++ b/src/nominatim_db/tools/special_phrases/sp_csv_loader.py
@@ -0,0 +1,46 @@
+# SPDX-License-Identifier: GPL-3.0-or-later
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2024 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+    Module containing the SPCsvLoader class.
+
+    The class allows loading phrases from a csv file.
+"""
+from typing import Iterable
+import csv
+import os
+
+from nominatim_core.errors import UsageError
+from .special_phrase import SpecialPhrase
+
+class SPCsvLoader:
+    """
+        Handles loading of special phrases from an external csv file.
+
+        The file name must end in '.csv' and the file must provide the
+        columns 'phrase', 'class', 'type' and 'operator'.
+    """
+    def __init__(self, csv_path: str) -> None:
+        self.csv_path = csv_path
+
+
+    def generate_phrases(self) -> Iterable[SpecialPhrase]:
+        """ Open and parse the given csv file.
+            Yield one SpecialPhrase per data row.
+
+            This is a generator: the extension check and the opening of
+            the file only happen once iteration starts. A UsageError is
+            raised when the file name has no '.csv' extension, a KeyError
+            when a row lacks one of the expected columns.
+        """
+        self._check_csv_validity()
+
+        # The csv module does its own newline handling and expects the file
+        # to be opened with newline=''. Without it, line breaks embedded in
+        # quoted fields are not interpreted correctly.
+        with open(self.csv_path, encoding='utf-8', newline='') as fd:
+            reader = csv.DictReader(fd, delimiter=',')
+            for row in reader:
+                yield SpecialPhrase(row['phrase'], row['class'], row['type'], row['operator'])
+
+
+    def _check_csv_validity(self) -> None:
+        """
+            Check that the csv file has the right extension.
+            Only the file name is inspected, not the content.
+
+            Raises a UsageError when the extension is not '.csv'.
+        """
+        _, extension = os.path.splitext(self.csv_path)
+
+        if extension != '.csv':
+            raise UsageError(f'The file {self.csv_path} is not a csv file.')
--- a/src/nominatim_db/tools/special_phrases/sp_importer.py
+++ b/src/nominatim_db/tools/special_phrases/sp_importer.py
@@ -0,0 +1,274 @@
+# SPDX-License-Identifier: GPL-3.0-or-later
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2024 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+    Module containing the class handling the import
+    of the special phrases.
+
+    Phrases are analyzed and imported into the database.
+
+    Phrases already present in the database which are no
+    longer valid are removed.
+"""
+from typing import Iterable, Tuple, Mapping, Sequence, Optional, Set
+import logging
+import re
+
+from psycopg2.sql import Identifier, SQL
+
+from nominatim_core.typing import Protocol
+from nominatim_core.config import Configuration
+from nominatim_core.db.connection import Connection
+from .importer_statistics import SpecialPhrasesImporterStatistics
+from .special_phrase import SpecialPhrase
+from ...tokenizer.base import AbstractTokenizer
+
+# Logger used by this module; getLogger() without a name returns the root logger.
+LOG = logging.getLogger()
+
+def _classtype_table(phrase_class: str, phrase_type: str) -> str:
+    """ Build the name of the place_classtype table that belongs
+        to the given class/type pair.
+    """
+    return 'place_classtype_{}_{}'.format(phrase_class, phrase_type)
+
+
+class SpecialPhraseLoader(Protocol):
+    """ Structural interface of a source of special phrases.
+        Any object with a matching generate_phrases() method qualifies.
+    """
+
+    def generate_phrases(self) -> Iterable[SpecialPhrase]:
+        """ Produce every special phrase the loader has to offer.
+        """
+
+
+class SPImporter():
+    # pylint: disable=too-many-instance-attributes
+    """
+        Class handling the process of special phrases importation into the database.
+
+        Takes a sp loader which loads the phrases from an external source.
+    """
+    def __init__(self, config: Configuration, conn: Connection,
+                 sp_loader: SpecialPhraseLoader) -> None:
+        self.config = config
+        self.db_connection = conn
+        self.sp_loader = sp_loader
+        self.statistics_handler = SpecialPhrasesImporterStatistics()
+        self.black_list, self.white_list = self._load_white_and_black_lists()
+        # Class and type end up in table and index names, so only
+        # plain word characters are accepted.
+        self.sanity_check_pattern = re.compile(r'^\w+$')
+        # This set will contain all existing phrases to be added.
+        # It contains tuples with the following format: (label, class, type, operator)
+        self.word_phrases: Set[Tuple[str, str, str, str]] = set()
+        # This set will contain all existing place_classtype tables which don't match any
+        # class/type of the loaded special phrases.
+        self.table_phrases_to_delete: Set[str] = set()
+
+    def import_phrases(self, tokenizer: AbstractTokenizer, should_replace: bool) -> None:
+        """
+            Iterate through all SpecialPhrases extracted from the
+            loader and import them into the database.
+
+            For every accepted class/type pair a place_classtype table is
+            created unless it is among the tables found in the database.
+            The table changes are committed before the phrases are handed
+            to the name analyzer of the tokenizer.
+
+            If should_replace is set to True, place_classtype tables which
+            match none of the loaded phrases are dropped and the flag is
+            passed on to the analyzer's update_special_phrases().
+        """
+        LOG.warning('Special phrases importation starting')
+        self._fetch_existing_place_classtype_tables()
+
+        # Store pairs of class/type for further processing
+        class_type_pairs = set()
+
+        for phrase in self.sp_loader.generate_phrases():
+            result = self._process_phrase(phrase)
+            if result:
+                class_type_pairs.add(result)
+
+        self._create_classtype_table_and_indexes(class_type_pairs)
+        if should_replace:
+            self._remove_non_existent_tables_from_db()
+        self.db_connection.commit()
+
+        with tokenizer.name_analyzer() as analyzer:
+            analyzer.update_special_phrases(self.word_phrases, should_replace)
+
+        LOG.warning('Import done.')
+        self.statistics_handler.notify_import_done()
+
+
+    def _fetch_existing_place_classtype_tables(self) -> None:
+        """
+            Fetch existing place_classtype tables.
+            Fill the table_phrases_to_delete set of the class.
+
+            Every table found is a deletion candidate at first; tables that
+            match a loaded class/type pair are taken out of the set again
+            in _create_classtype_table_and_indexes().
+        """
+        query = """
+            SELECT table_name
+            FROM information_schema.tables
+            WHERE table_schema='public'
+            AND table_name like 'place_classtype_%';
+        """
+        with self.db_connection.cursor() as db_cursor:
+            db_cursor.execute(SQL(query))
+            for row in db_cursor:
+                self.table_phrases_to_delete.add(row[0])
+
+    def _load_white_and_black_lists(self) \
+          -> Tuple[Mapping[str, Sequence[str]], Mapping[str, Sequence[str]]]:
+        """
+            Load white and black lists from phrase-settings.json.
+
+            Returns the tuple (black list, white list), taken from the
+            keys 'blackList' and 'whiteList' of the settings.
+        """
+        settings = self.config.load_sub_configuration('phrase-settings.json')
+
+        return settings['blackList'], settings['whiteList']
+
+    def _check_sanity(self, phrase: SpecialPhrase) -> bool:
+        """
+            Check sanity of given inputs in case somebody added garbage in the wiki.
+
+            Return True when class and type of the phrase consist of word
+            characters only. Otherwise a warning is logged and False is
+            returned; the import itself carries on.
+        """
+        # The pattern is anchored at both ends, so match() succeeds exactly
+        # when the whole value passes the check.
+        class_ok = self.sanity_check_pattern.match(phrase.p_class) is not None
+        type_ok = self.sanity_check_pattern.match(phrase.p_type) is not None
+
+        if not class_ok or not type_ok:
+            LOG.warning("Bad class/type: %s=%s. It will not be imported",
+                        phrase.p_class, phrase.p_type)
+            return False
+        return True
+
+    def _process_phrase(self, phrase: SpecialPhrase) -> Optional[Tuple[str, str]]:
+        """
+            Processes the given phrase by checking black and white list
+            and sanity.
+
+            An accepted phrase is added to word_phrases and its class/type
+            pair is returned. For a rejected phrase None is returned. Only
+            a failed sanity check is counted as invalid in the statistics.
+        """
+
+        # blacklisting: disallow certain class/type combinations
+        if phrase.p_class in self.black_list \
+           and phrase.p_type in self.black_list[phrase.p_class]:
+            return None
+
+        # whitelisting: if class is in whitelist, allow only tags in the list
+        if phrase.p_class in self.white_list \
+           and phrase.p_type not in self.white_list[phrase.p_class]:
+            return None
+
+        # sanity check, in case somebody added garbage in the wiki
+        if not self._check_sanity(phrase):
+            self.statistics_handler.notify_one_phrase_invalid()
+            return None
+
+        self.word_phrases.add((phrase.p_label, phrase.p_class,
+                               phrase.p_type, phrase.p_operator))
+
+        return (phrase.p_class, phrase.p_type)
+
+
+    def _create_classtype_table_and_indexes(self,
+                                            class_type_pairs: Iterable[Tuple[str, str]]) -> None:
+        """
+            Create table place_classtype for each given pair.
+            Also create indexes on place_id and centroid.
+
+            Pairs whose table is listed in table_phrases_to_delete are
+            counted as ignored and removed from that set instead.
+        """
+        LOG.warning('Create tables and indexes...')
+
+        sql_tablespace = self.config.TABLESPACE_AUX_DATA
+        if sql_tablespace:
+            sql_tablespace = ' TABLESPACE ' + sql_tablespace
+
+        # Temporary index on placex which only lives for the duration of
+        # the table creation. It is dropped again at the end of the function.
+        with self.db_connection.cursor() as db_cursor:
+            db_cursor.execute("CREATE INDEX idx_placex_classtype ON placex (class, type)")
+
+        for phrase_class, phrase_type in class_type_pairs:
+            table_name = _classtype_table(phrase_class, phrase_type)
+
+            if table_name in self.table_phrases_to_delete:
+                self.statistics_handler.notify_one_table_ignored()
+                # Remove this table from the ones to delete as it matches a
+                # class/type still present in the loaded special phrases.
+                self.table_phrases_to_delete.remove(table_name)
+                # So don't need to create the table and indexes.
+                continue
+
+            # Table creation
+            self._create_place_classtype_table(sql_tablespace, phrase_class, phrase_type)
+
+            # Indexes creation
+            self._create_place_classtype_indexes(sql_tablespace, phrase_class, phrase_type)
+
+            # Grant access on read to the web user.
+            self._grant_access_to_webuser(phrase_class, phrase_type)
+
+            self.statistics_handler.notify_one_table_created()
+
+        with self.db_connection.cursor() as db_cursor:
+            db_cursor.execute("DROP INDEX idx_placex_classtype")
+
+
+    def _create_place_classtype_table(self, sql_tablespace: str,
+                                      phrase_class: str, phrase_type: str) -> None:
+        """
+            Create table place_classtype of the given phrase_class/phrase_type
+            if it doesn't exist.
+
+            The table holds place_id and centroid of all placex rows with
+            the given class and type. sql_tablespace is inserted verbatim
+            into the statement and may be empty.
+        """
+        table_name = _classtype_table(phrase_class, phrase_type)
+        with self.db_connection.cursor() as cur:
+            cur.execute(SQL("""CREATE TABLE IF NOT EXISTS {} {} AS
+                                 SELECT place_id AS place_id,
+                                        st_centroid(geometry) AS centroid
+                                 FROM placex
+                                 WHERE class = %s AND type = %s
+                             """).format(Identifier(table_name), SQL(sql_tablespace)),
+                        (phrase_class, phrase_type))
+
+
+    def _create_place_classtype_indexes(self, sql_tablespace: str,
+                                        phrase_class: str, phrase_type: str) -> None:
+        """
+            Create indexes on centroid and place_id for the place_classtype table.
+            An index which already exists is left alone.
+        """
+        index_prefix = f'idx_place_classtype_{phrase_class}_{phrase_type}_'
+        base_table = _classtype_table(phrase_class, phrase_type)
+        # Index on centroid
+        if not self.db_connection.index_exists(index_prefix + 'centroid'):
+            with self.db_connection.cursor() as db_cursor:
+                db_cursor.execute(SQL("CREATE INDEX {} ON {} USING GIST (centroid) {}")
+                                  .format(Identifier(index_prefix + 'centroid'),
+                                          Identifier(base_table),
+                                          SQL(sql_tablespace)))
+
+        # Index on place_id
+        if not self.db_connection.index_exists(index_prefix + 'place_id'):
+            with self.db_connection.cursor() as db_cursor:
+                db_cursor.execute(SQL("CREATE INDEX {} ON {} USING btree(place_id) {}")
+                                  .format(Identifier(index_prefix + 'place_id'),
+                                          Identifier(base_table),
+                                          SQL(sql_tablespace)))
+
+
+    def _grant_access_to_webuser(self, phrase_class: str, phrase_type: str) -> None:
+        """
+            Grant read access on the place_classtype table to the web user
+            configured in DATABASE_WEBUSER.
+        """
+        table_name = _classtype_table(phrase_class, phrase_type)
+        with self.db_connection.cursor() as db_cursor:
+            db_cursor.execute(SQL("""GRANT SELECT ON {} TO {}""")
+                              .format(Identifier(table_name),
+                                      Identifier(self.config.DATABASE_WEBUSER)))
+
+    def _remove_non_existent_tables_from_db(self) -> None:
+        """
+            Remove special phrases which don't exist in the source anymore.
+            Drops every table still listed in table_phrases_to_delete and
+            counts each one as deleted in the statistics.
+        """
+        LOG.warning('Cleaning database...')
+
+        # Delete place_classtype tables corresponding to class/type which
+        # are not among the loaded phrases anymore.
+        with self.db_connection.cursor() as db_cursor:
+            for table in self.table_phrases_to_delete:
+                self.statistics_handler.notify_one_table_deleted()
+                db_cursor.drop_table(table)
--- a/src/nominatim_db/tools/special_phrases/sp_wiki_loader.py
+++ b/src/nominatim_db/tools/special_phrases/sp_wiki_loader.py
@@ -0,0 +1,68 @@
+# SPDX-License-Identifier: GPL-3.0-or-later
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2024 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+    Module containing the SPWikiLoader class.
+"""
+from typing import Iterable
+import re
+import logging
+
+from nominatim_core.config import Configuration
+from nominatim_core.utils.url_utils import get_url
+from .special_phrase import SpecialPhrase
+
+# Logger used by this module; getLogger() without a name returns the root logger.
+LOG = logging.getLogger()
+
+def _get_wiki_content(lang: str) -> str:
+    """
+        Fetch the wiki export page with the special phrases of the
+        given language and return what get_url() delivers for it.
+        The language code is upper-cased for the URL, so 'en' requests
+            https://wiki.openstreetmap.org/wiki/Special:Export/Nominatim/Special_Phrases/EN
+    """
+    base_url = 'https://wiki.openstreetmap.org/wiki/Special:Export/Nominatim/Special_Phrases/'
+    return get_url(base_url + lang.upper())
+
+
+class SPWikiLoader:
+    """
+        Loader which takes the special phrases from the wiki pages.
+    """
+    def __init__(self, config: Configuration) -> None:
+        self.config = config
+        # Compiled once up front, the pattern is applied to every downloaded page.
+        # It captures five columns of a wiki table row.
+        self.occurence_pattern = re.compile(
+            r'\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([\-YN])'
+        )
+        # Works around a bug where building=yes was imported with quotes into the wiki
+        self.type_fix_pattern = re.compile(r'\"|&quot;')
+
+        # Fall back to the built-in list when no languages are configured.
+        configured = self.config.get_str_list('LANGUAGES')
+        self.languages = configured if configured else \
+                         ['af', 'ar', 'br', 'ca', 'cs', 'de', 'en', 'es',
+                          'et', 'eu', 'fa', 'fi', 'fr', 'gl', 'hr', 'hu',
+                          'ia', 'is', 'it', 'ja', 'mk', 'nl', 'no', 'pl',
+                          'ps', 'pt', 'ru', 'sk', 'sl', 'sv', 'uk', 'vi',
+                          'lv', 'tr']
+
+
+    def generate_phrases(self) -> Iterable[SpecialPhrase]:
+        """ Fetch the wiki page of each configured language in turn
+            and yield a SpecialPhrase for every table row found on it.
+        """
+        for lang in self.languages:
+            LOG.warning('Importing phrases for lang: %s...', lang)
+            page = _get_wiki_content(lang)
+
+            # Each row comes as (label, class, type, operator, plural);
+            # the plural column is not used.
+            rows = self.occurence_pattern.findall(page)
+            for label, p_class, p_type, operator, _plural in rows:
+                cleaned_type = self.type_fix_pattern.sub('', p_type)
+                yield SpecialPhrase(label, p_class, cleaned_type, operator)
--- a/src/nominatim_db/tools/special_phrases/special_phrase.py
+++ b/src/nominatim_db/tools/special_phrases/special_phrase.py
@@ -0,0 +1,37 @@
+# SPDX-License-Identifier: GPL-3.0-or-later
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2024 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+    Module containing the class SpecialPhrase.
+
+    This class is a model used to transfer a special phrase through
+    the process of load and importation.
+"""
+from typing import Any
+
+class SpecialPhrase:
+    """
+        Model representing a special phrase.
+
+        Label, class and type are stored without surrounding whitespace.
+        The operator is normalized to 'near', 'in' or '-'.
+        Instances compare equal and hash alike when all four fields match.
+    """
+    def __init__(self, p_label: str, p_class: str, p_type: str, p_operator: str) -> None:
+        self.p_label = p_label.strip()
+        self.p_class = p_class.strip()
+        self.p_type = p_type.strip()
+        # Needed if some operators in the wiki are not written in English:
+        # anything but 'near' and 'in' (in any letter case) becomes '-'.
+        p_operator = p_operator.strip().lower()
+        self.p_operator = '-' if p_operator not in ('near', 'in') else p_operator
+
+    def __eq__(self, other: Any) -> bool:
+        # NotImplemented lets Python try the reflected comparison of the
+        # other operand; `==` still evaluates to False when that fails too.
+        if not isinstance(other, SpecialPhrase):
+            return NotImplemented
+
+        return self.p_label == other.p_label \
+               and self.p_class == other.p_class \
+               and self.p_type == other.p_type \
+               and self.p_operator == other.p_operator
+
+    def __hash__(self) -> int:
+        return hash((self.p_label, self.p_class, self.p_type, self.p_operator))