split code into submodules

This commit is contained in:
Sarah Hoffmann
2024-05-16 11:55:17 +02:00
parent 0fb4fe8e4d
commit 6e89310a92
137 changed files with 757 additions and 716 deletions

View File

@@ -0,0 +1,78 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Contains the class which handles statistics for the
import of special phrases.
"""
import logging
# Logger used by this module. Calling getLogger() without a name
# returns the root logger.
LOG = logging.getLogger()
class SpecialPhrasesImporterStatistics():
    """
        Bookkeeping for one import run of special phrases.

        Four counters are maintained: tables created, tables deleted,
        tables ignored and invalid phrases. The `notify_one_*` methods
        increment them one by one; `notify_import_done()` logs a summary
        and resets all of them.
    """

    def __init__(self) -> None:
        self._intialize_values()

    def _intialize_values(self) -> None:
        """
            Reset every counter of the import run to zero.
        """
        self.tables_created = self.tables_deleted = self.tables_ignored = self.invalids = 0

    def notify_one_phrase_invalid(self) -> None:
        """
            Record one more invalid entry fetched from the wiki.
        """
        self.invalids = self.invalids + 1

    def notify_one_table_created(self) -> None:
        """
            Record one more created table.
        """
        self.tables_created = self.tables_created + 1

    def notify_one_table_deleted(self) -> None:
        """
            Record one more deleted table.
        """
        self.tables_deleted = self.tables_deleted + 1

    def notify_one_table_ignored(self) -> None:
        """
            Record one more ignored table.
        """
        self.tables_ignored = self.tables_ignored + 1

    def notify_import_done(self) -> None:
        """
            Log the summary of the finished import and then set all
            counters back to zero.
        """
        invalid_count = self.invalids
        deleted_count = self.tables_deleted
        has_invalids = invalid_count > 0

        LOG.info('====================================================================')
        LOG.info('Final statistics of the import:')
        LOG.info('- %s phrases were invalid.', invalid_count)
        if has_invalids:
            LOG.info(' Those invalid phrases have been skipped.')
        LOG.info('- %s tables were ignored as they already exist on the database',
                 self.tables_ignored)
        LOG.info('- %s tables were created', self.tables_created)
        LOG.info('- %s tables were deleted from the database', deleted_count)
        if deleted_count > 0:
            LOG.info(' They were deleted as they are not valid anymore.')

        # Invalid phrases are additionally reported on warning level.
        if has_invalids:
            LOG.warning('%s phrases were invalid and have been skipped during the whole process.',
                        invalid_count)

        self._intialize_values()

View File

@@ -0,0 +1,46 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Module containing the SPCsvLoader class.
The class loads phrases from a csv file.
"""
from typing import Iterable
import csv
import os
from nominatim_core.errors import UsageError
from .special_phrase import SpecialPhrase
class SPCsvLoader:
    """
        Handles loading of special phrases from an external csv file.

        The file must have a header line naming at least the columns
        `phrase`, `class`, `type` and `operator`.
    """

    def __init__(self, csv_path: str) -> None:
        self.csv_path = csv_path

    def generate_phrases(self) -> Iterable[SpecialPhrase]:
        """ Open and parse the given csv file.
            Create the corresponding SpecialPhrases.

            This is a generator: the extension check and the opening of the
            file only happen once iteration starts.

            Raises a UsageError when the file does not have a csv extension
            and a KeyError when one of the required columns is missing.
        """
        self._check_csv_validity()

        # newline='' is what the csv module asks for so that line breaks
        # inside quoted fields are handled by the parser itself.
        # 'utf-8-sig' reads plain UTF-8 unchanged but drops a leading BOM,
        # which would otherwise become part of the first column name.
        with open(self.csv_path, encoding='utf-8-sig', newline='') as fd:
            reader = csv.DictReader(fd, delimiter=',')
            for row in reader:
                yield SpecialPhrase(row['phrase'], row['class'], row['type'], row['operator'])

    def _check_csv_validity(self) -> None:
        """
            Check that the csv file has the right extension.

            The comparison ignores case, so `.CSV` is accepted as well.
            Raises a UsageError for any other extension.
        """
        _, extension = os.path.splitext(self.csv_path)

        if extension.lower() != '.csv':
            raise UsageError(f'The file {self.csv_path} is not a csv file.')

View File

@@ -0,0 +1,274 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Module containing the class handling the import
of the special phrases.
Phrases are analyzed and imported into the database.
Phrases already present in the database which are no
longer valid are removed.
"""
from typing import Iterable, Tuple, Mapping, Sequence, Optional, Set
import logging
import re
from psycopg2.sql import Identifier, SQL
from nominatim_core.typing import Protocol
from nominatim_core.config import Configuration
from nominatim_core.db.connection import Connection
from .importer_statistics import SpecialPhrasesImporterStatistics
from .special_phrase import SpecialPhrase
from ...tokenizer.base import AbstractTokenizer
# Logger used by this module. Calling getLogger() without a name
# returns the root logger.
LOG = logging.getLogger()
def _classtype_table(phrase_class: str, phrase_type: str) -> str:
""" Return the name of the table for the given class and type.
"""
return f'place_classtype_{phrase_class}_{phrase_type}'
class SpecialPhraseLoader(Protocol):
    """ Protocol for classes implementing a loader for special phrases.

        SPImporter accepts an object of this type as its `sp_loader`
        and only ever calls `generate_phrases()` on it.
    """

    def generate_phrases(self) -> Iterable[SpecialPhrase]:
        """ Generates all special phrase terms this loader can produce.

            SPImporter.import_phrases() iterates over the result once
            and processes each SpecialPhrase in turn.
        """
class SPImporter():
    # pylint: disable=too-many-instance-attributes
    """
        Class handling the process of special phrases importation into the database.

        Takes a special-phrase loader which loads the phrases from an
        external source. The phrases are filtered against the black and
        white lists of the configuration, sanity-checked and then used to
        create the place_classtype tables and to update the tokenizer.
    """

    def __init__(self, config: Configuration, conn: Connection,
                 sp_loader: SpecialPhraseLoader) -> None:
        self.config = config
        self.db_connection = conn
        self.sp_loader = sp_loader
        self.statistics_handler = SpecialPhrasesImporterStatistics()
        self.black_list, self.white_list = self._load_white_and_black_lists()
        self.sanity_check_pattern = re.compile(r'^\w+$')
        # This set will contain all phrases to be handed to the tokenizer.
        # It contains tuples with the following format: (label, class, type, operator)
        self.word_phrases: Set[Tuple[str, str, str, str]] = set()
        # This set will contain all existing place_classtype tables which don't
        # match any class/type of the loaded special phrases.
        self.table_phrases_to_delete: Set[str] = set()

    def import_phrases(self, tokenizer: AbstractTokenizer, should_replace: bool) -> None:
        """
            Iterate through all SpecialPhrases extracted from the
            loader and import them into the database.

            If should_replace is set to True only the loaded phrases
            will be kept in the database. All other phrases already
            in the database will be removed.

            The table changes are committed before the tokenizer is
            updated. At the end the import statistics are logged and reset.
        """
        LOG.warning('Special phrases importation starting')

        self._fetch_existing_place_classtype_tables()

        # Store pairs of class/type for further processing
        class_type_pairs = set()

        for phrase in self.sp_loader.generate_phrases():
            result = self._process_phrase(phrase)
            if result is not None:
                class_type_pairs.add(result)

        self._create_classtype_table_and_indexes(class_type_pairs)
        if should_replace:
            self._remove_non_existent_tables_from_db()
        self.db_connection.commit()

        with tokenizer.name_analyzer() as analyzer:
            analyzer.update_special_phrases(self.word_phrases, should_replace)

        LOG.warning('Import done.')
        self.statistics_handler.notify_import_done()

    def _fetch_existing_place_classtype_tables(self) -> None:
        """
            Fetch existing place_classtype tables.
            Fill the table_phrases_to_delete set of the class.

            Tables that turn out to be still in use are removed from the
            set again in _create_classtype_table_and_indexes().
        """
        query = """
            SELECT table_name
            FROM information_schema.tables
            WHERE table_schema='public'
            AND table_name like 'place_classtype_%';
        """
        with self.db_connection.cursor() as db_cursor:
            db_cursor.execute(SQL(query))
            for row in db_cursor:
                self.table_phrases_to_delete.add(row[0])

    def _load_white_and_black_lists(self) \
            -> Tuple[Mapping[str, Sequence[str]], Mapping[str, Sequence[str]]]:
        """
            Load white and black lists from phrase-settings.json.

            Returns a tuple (black list, white list) taken from the keys
            'blackList' and 'whiteList' of the sub-configuration.
        """
        settings = self.config.load_sub_configuration('phrase-settings.json')

        return settings['blackList'], settings['whiteList']

    def _check_sanity(self, phrase: SpecialPhrase) -> bool:
        """
            Check sanity of given inputs in case somebody added garbage in the wiki.

            Class and type must consist of word characters only. When a bad
            class/type is detected, a warning is logged and False is
            returned so that the caller can skip the phrase.
        """
        # fullmatch() is used on purpose: with findall()/match() the '$' of
        # the pattern would also accept a string with a trailing newline.
        class_ok = self.sanity_check_pattern.fullmatch(phrase.p_class)
        type_ok = self.sanity_check_pattern.fullmatch(phrase.p_type)

        if not class_ok or not type_ok:
            LOG.warning("Bad class/type: %s=%s. It will not be imported",
                        phrase.p_class, phrase.p_type)
            return False
        return True

    def _process_phrase(self, phrase: SpecialPhrase) -> Optional[Tuple[str, str]]:
        """
            Processes the given phrase by checking black and white list
            and sanity.

            Return the class/type pair corresponding to the phrase or
            None when the phrase must not be imported. Accepted phrases
            are also added to the word_phrases set. Only phrases failing
            the sanity check are counted as invalid in the statistics.
        """
        # blacklisting: disallow certain class/type combinations
        if phrase.p_class in self.black_list \
           and phrase.p_type in self.black_list[phrase.p_class]:
            return None

        # whitelisting: if class is in whitelist, allow only tags in the list
        if phrase.p_class in self.white_list \
           and phrase.p_type not in self.white_list[phrase.p_class]:
            return None

        # sanity check, in case somebody added garbage in the wiki
        if not self._check_sanity(phrase):
            self.statistics_handler.notify_one_phrase_invalid()
            return None

        self.word_phrases.add((phrase.p_label, phrase.p_class,
                               phrase.p_type, phrase.p_operator))

        return (phrase.p_class, phrase.p_type)

    def _create_classtype_table_and_indexes(self,
                                            class_type_pairs: Iterable[Tuple[str, str]]) -> None:
        """
            Create table place_classtype for each given pair.
            Also create indexes on place_id and centroid.

            Pairs whose table already exists are only counted as ignored
            and taken off the list of tables to delete.
        """
        LOG.warning('Create tables and indexes...')

        sql_tablespace = self.config.TABLESPACE_AUX_DATA
        if sql_tablespace:
            sql_tablespace = ' TABLESPACE ' + sql_tablespace

        # Temporary index on placex, dropped again at the end of this
        # function (presumably to speed up the per-pair selects below).
        with self.db_connection.cursor() as db_cursor:
            db_cursor.execute("CREATE INDEX idx_placex_classtype ON placex (class, type)")

        for phrase_class, phrase_type in class_type_pairs:
            table_name = _classtype_table(phrase_class, phrase_type)

            if table_name in self.table_phrases_to_delete:
                self.statistics_handler.notify_one_table_ignored()
                # Remove this table from the ones to delete as it matches a
                # class/type still existing in the loaded special phrases.
                self.table_phrases_to_delete.remove(table_name)
                # So there is no need to create the table and indexes.
                continue

            # Table creation
            self._create_place_classtype_table(sql_tablespace, phrase_class, phrase_type)

            # Indexes creation
            self._create_place_classtype_indexes(sql_tablespace, phrase_class, phrase_type)

            # Grant access on read to the web user.
            self._grant_access_to_webuser(phrase_class, phrase_type)

            self.statistics_handler.notify_one_table_created()

        with self.db_connection.cursor() as db_cursor:
            db_cursor.execute("DROP INDEX idx_placex_classtype")

    def _create_place_classtype_table(self, sql_tablespace: str,
                                      phrase_class: str, phrase_type: str) -> None:
        """
            Create table place_classtype of the given phrase_class/phrase_type
            if it doesn't exist.

            The table is filled with place_id and centroid of all rows of
            placex with the given class and type. sql_tablespace is either
            empty or a complete ' TABLESPACE <name>' clause.
        """
        table_name = _classtype_table(phrase_class, phrase_type)
        with self.db_connection.cursor() as cur:
            cur.execute(SQL("""CREATE TABLE IF NOT EXISTS {} {} AS
                                 SELECT place_id AS place_id,
                                        st_centroid(geometry) AS centroid
                                 FROM placex
                                 WHERE class = %s AND type = %s
                             """).format(Identifier(table_name), SQL(sql_tablespace)),
                        (phrase_class, phrase_type))

    def _create_place_classtype_indexes(self, sql_tablespace: str,
                                        phrase_class: str, phrase_type: str) -> None:
        """
            Create indexes on centroid and place_id for the place_classtype table.

            Each index is only created when no index of that name exists yet.
        """
        index_prefix = f'idx_place_classtype_{phrase_class}_{phrase_type}_'
        base_table = _classtype_table(phrase_class, phrase_type)

        # Index on centroid
        if not self.db_connection.index_exists(index_prefix + 'centroid'):
            with self.db_connection.cursor() as db_cursor:
                db_cursor.execute(SQL("CREATE INDEX {} ON {} USING GIST (centroid) {}")
                                  .format(Identifier(index_prefix + 'centroid'),
                                          Identifier(base_table),
                                          SQL(sql_tablespace)))

        # Index on place_id
        if not self.db_connection.index_exists(index_prefix + 'place_id'):
            with self.db_connection.cursor() as db_cursor:
                db_cursor.execute(SQL("CREATE INDEX {} ON {} USING btree(place_id) {}")
                                  .format(Identifier(index_prefix + 'place_id'),
                                          Identifier(base_table),
                                          SQL(sql_tablespace)))

    def _grant_access_to_webuser(self, phrase_class: str, phrase_type: str) -> None:
        """
            Grant read access on the place_classtype table to the
            web user named in the configuration (DATABASE_WEBUSER).
        """
        table_name = _classtype_table(phrase_class, phrase_type)
        with self.db_connection.cursor() as db_cursor:
            db_cursor.execute(SQL("""GRANT SELECT ON {} TO {}""")
                              .format(Identifier(table_name),
                                      Identifier(self.config.DATABASE_WEBUSER)))

    def _remove_non_existent_tables_from_db(self) -> None:
        """
            Remove special phrases which don't exist on the wiki anymore.
            Delete the place_classtype tables.

            Drops every table still listed in table_phrases_to_delete and
            counts each of them as deleted in the statistics.
        """
        LOG.warning('Cleaning database...')

        # Delete place_classtype tables corresponding to class/type which
        # are not on the wiki anymore.
        with self.db_connection.cursor() as db_cursor:
            for table in self.table_phrases_to_delete:
                self.statistics_handler.notify_one_table_deleted()
                db_cursor.drop_table(table)

View File

@@ -0,0 +1,68 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Module containing the SPWikiLoader class.
"""
from typing import Iterable
import re
import logging
from nominatim_core.config import Configuration
from nominatim_core.utils.url_utils import get_url
from .special_phrase import SpecialPhrase
# Logger used by this module. Calling getLogger() without a name
# returns the root logger.
LOG = logging.getLogger()
def _get_wiki_content(lang: str) -> str:
    """
        Download the export of the special-phrases wiki page for the
        given language and return what get_url() delivers for it.

        The language code is upper-cased and appended to the base URL,
        for example:
        https://wiki.openstreetmap.org/wiki/Special:Export/Nominatim/Special_Phrases/EN
    """
    base_url = 'https://wiki.openstreetmap.org/wiki/Special:Export/Nominatim/Special_Phrases/'
    return get_url(base_url + lang.upper())
class SPWikiLoader:
    """
        Handles loading of special phrases from the wiki.

        The languages to load are taken from the LANGUAGES setting of the
        configuration. When the setting is empty, a built-in list of
        languages is used.
    """

    def __init__(self, config: Configuration) -> None:
        self.config = config
        # Compile the regex here to increase performance.
        # It matches one row of the wiki table with five columns:
        # label, class, type, operator and the plural marker (-, Y or N).
        self.occurence_pattern = re.compile(
            r'\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([\-YN])'
        )
        # Hack around a bug where building=yes was imported with quotes into the wiki.
        # The page is parsed from its XML export, where a quote shows up as
        # the entity &quot;, so both the plain character and the entity
        # have to be stripped.
        self.type_fix_pattern = re.compile(r'\"|&quot;')

        self.languages = self.config.get_str_list('LANGUAGES') or \
            ['af', 'ar', 'br', 'ca', 'cs', 'de', 'en', 'es',
             'et', 'eu', 'fa', 'fi', 'fr', 'gl', 'hr', 'hu',
             'ia', 'is', 'it', 'ja', 'mk', 'nl', 'no', 'pl',
             'ps', 'pt', 'ru', 'sk', 'sl', 'sv', 'uk', 'vi',
             'lv', 'tr']

    def generate_phrases(self) -> Iterable[SpecialPhrase]:
        """ Download the wiki pages for the configured languages
            and extract the phrases from the page.

            This is a generator: the page of a language is only
            downloaded once iteration reaches that language.
        """
        for lang in self.languages:
            LOG.warning('Importing phrases for lang: %s...', lang)
            loaded_xml = _get_wiki_content(lang)

            # One match will be of format [label, class, type, operator, plural]
            # The plural marker is not used.
            matches = self.occurence_pattern.findall(loaded_xml)

            for match in matches:
                yield SpecialPhrase(match[0],
                                    match[1],
                                    self.type_fix_pattern.sub('', match[2]),
                                    match[3])

View File

@@ -0,0 +1,37 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Module containing the class SpecialPhrase.
This class is a model used to transfer a special phrase through
the process of load and importation.
"""
from typing import Any
class SpecialPhrase:
    """
        Value object describing a single special phrase by its label,
        class, type and operator.

        All four values are stored with surrounding whitespace removed.
        Two phrases are equal (and hash alike) when all four values match.
    """

    def __init__(self, p_label: str, p_class: str, p_type: str, p_operator: str) -> None:
        self.p_label = p_label.strip()
        self.p_class = p_class.strip()
        self.p_type = p_type.strip()
        # Operators may be written in other languages than English in the
        # wiki. Everything except 'near' and 'in' is mapped to '-'.
        operator = p_operator.strip().lower()
        self.p_operator = operator if operator in ('near', 'in') else '-'

    def __eq__(self, other: Any) -> bool:
        if isinstance(other, SpecialPhrase):
            return (self.p_label, self.p_class, self.p_type, self.p_operator) \
                   == (other.p_label, other.p_class, other.p_type, other.p_operator)

        return False

    def __hash__(self) -> int:
        return hash((self.p_label, self.p_class, self.p_type, self.p_operator))