mirror of
https://github.com/osm-search/Nominatim.git
synced 2026-03-13 06:14:07 +00:00
split code into submodules
This commit is contained in:
0
src/nominatim_db/tools/special_phrases/__init__.py
Normal file
0
src/nominatim_db/tools/special_phrases/__init__.py
Normal file
@@ -0,0 +1,78 @@
|
||||
# SPDX-License-Identifier: GPL-3.0-or-later
|
||||
#
|
||||
# This file is part of Nominatim. (https://nominatim.org)
|
||||
#
|
||||
# Copyright (C) 2024 by the Nominatim developer community.
|
||||
# For a full list of authors see the git log.
|
||||
"""
|
||||
Contains the class which handles statistics for the
|
||||
import of special phrases.
|
||||
"""
|
||||
import logging
|
||||
LOG = logging.getLogger()
|
||||
|
||||
class SpecialPhrasesImporterStatistics:
    """
    Class handling statistics of the import
    process of special phrases.

    Counters are plain integers updated through the notify_* methods and
    are reset to zero both on construction and after the final report.
    """

    def __init__(self) -> None:
        self._initialize_values()

    def _initialize_values(self) -> None:
        """
        Set all counts for the global
        import to 0.
        """
        # Number of place_classtype tables created during this run.
        self.tables_created = 0
        # Number of stale place_classtype tables dropped during this run.
        self.tables_deleted = 0
        # Number of tables skipped because they already existed.
        self.tables_ignored = 0
        # Number of phrases rejected as invalid.
        self.invalids = 0

    def notify_one_phrase_invalid(self) -> None:
        """
        Add +1 to the count of invalid entries
        fetched from the wiki.
        """
        self.invalids += 1

    def notify_one_table_created(self) -> None:
        """
        Add +1 to the count of created tables.
        """
        self.tables_created += 1

    def notify_one_table_deleted(self) -> None:
        """
        Add +1 to the count of deleted tables.
        """
        self.tables_deleted += 1

    def notify_one_table_ignored(self) -> None:
        """
        Add +1 to the count of ignored tables.
        """
        self.tables_ignored += 1

    def notify_import_done(self) -> None:
        """
        Print stats for the whole import process
        and reset all values.
        """
        LOG.info('====================================================================')
        LOG.info('Final statistics of the import:')
        LOG.info('- %s phrases were invalid.', self.invalids)
        if self.invalids > 0:
            LOG.info(' Those invalid phrases have been skipped.')
        LOG.info('- %s tables were ignored as they already exist on the database',
                 self.tables_ignored)
        LOG.info('- %s tables were created', self.tables_created)
        LOG.info('- %s tables were deleted from the database', self.tables_deleted)
        if self.tables_deleted > 0:
            LOG.info(' They were deleted as they are not valid anymore.')

        if self.invalids > 0:
            LOG.warning('%s phrases were invalid and have been skipped during the whole process.',
                        self.invalids)

        # Reset so the same instance can track a subsequent import run.
        self._initialize_values()
|
||||
46
src/nominatim_db/tools/special_phrases/sp_csv_loader.py
Normal file
46
src/nominatim_db/tools/special_phrases/sp_csv_loader.py
Normal file
@@ -0,0 +1,46 @@
|
||||
# SPDX-License-Identifier: GPL-3.0-or-later
|
||||
#
|
||||
# This file is part of Nominatim. (https://nominatim.org)
|
||||
#
|
||||
# Copyright (C) 2024 by the Nominatim developer community.
|
||||
# For a full list of authors see the git log.
|
||||
"""
|
||||
Module containing the SPCsvLoader class.
|
||||
|
||||
The class allows to load phrases from a csv file.
|
||||
"""
|
||||
from typing import Iterable
|
||||
import csv
|
||||
import os
|
||||
|
||||
from nominatim_core.errors import UsageError
|
||||
from .special_phrase import SpecialPhrase
|
||||
|
||||
class SPCsvLoader:
|
||||
"""
|
||||
Handles loading of special phrases from external csv file.
|
||||
"""
|
||||
def __init__(self, csv_path: str) -> None:
|
||||
self.csv_path = csv_path
|
||||
|
||||
|
||||
def generate_phrases(self) -> Iterable[SpecialPhrase]:
|
||||
""" Open and parse the given csv file.
|
||||
Create the corresponding SpecialPhrases.
|
||||
"""
|
||||
self._check_csv_validity()
|
||||
|
||||
with open(self.csv_path, encoding='utf-8') as fd:
|
||||
reader = csv.DictReader(fd, delimiter=',')
|
||||
for row in reader:
|
||||
yield SpecialPhrase(row['phrase'], row['class'], row['type'], row['operator'])
|
||||
|
||||
|
||||
def _check_csv_validity(self) -> None:
|
||||
"""
|
||||
Check that the csv file has the right extension.
|
||||
"""
|
||||
_, extension = os.path.splitext(self.csv_path)
|
||||
|
||||
if extension != '.csv':
|
||||
raise UsageError(f'The file {self.csv_path} is not a csv file.')
|
||||
274
src/nominatim_db/tools/special_phrases/sp_importer.py
Normal file
274
src/nominatim_db/tools/special_phrases/sp_importer.py
Normal file
@@ -0,0 +1,274 @@
|
||||
# SPDX-License-Identifier: GPL-3.0-or-later
|
||||
#
|
||||
# This file is part of Nominatim. (https://nominatim.org)
|
||||
#
|
||||
# Copyright (C) 2024 by the Nominatim developer community.
|
||||
# For a full list of authors see the git log.
|
||||
"""
|
||||
Module containing the class handling the import
|
||||
of the special phrases.
|
||||
|
||||
Phrases are analyzed and imported into the database.
|
||||
|
||||
The phrases already present in the database which are not
|
||||
valids anymore are removed.
|
||||
"""
|
||||
from typing import Iterable, Tuple, Mapping, Sequence, Optional, Set
|
||||
import logging
|
||||
import re
|
||||
|
||||
from psycopg2.sql import Identifier, SQL
|
||||
|
||||
from nominatim_core.typing import Protocol
|
||||
from nominatim_core.config import Configuration
|
||||
from nominatim_core.db.connection import Connection
|
||||
from .importer_statistics import SpecialPhrasesImporterStatistics
|
||||
from .special_phrase import SpecialPhrase
|
||||
from ...tokenizer.base import AbstractTokenizer
|
||||
|
||||
LOG = logging.getLogger()
|
||||
|
||||
def _classtype_table(phrase_class: str, phrase_type: str) -> str:
|
||||
""" Return the name of the table for the given class and type.
|
||||
"""
|
||||
return f'place_classtype_{phrase_class}_{phrase_type}'
|
||||
|
||||
|
||||
class SpecialPhraseLoader(Protocol):
    """ Protocol for classes implementing a loader for special phrases.

        Structural interface: any object providing generate_phrases()
        (e.g. a CSV- or wiki-based loader) satisfies it.
    """

    def generate_phrases(self) -> Iterable[SpecialPhrase]:
        """ Generates all special phrase terms this loader can produce.
        """
|
||||
|
||||
|
||||
class SPImporter():
    # pylint: disable-msg=too-many-instance-attributes
    """
    Class handling the process of special phrases importation into the database.

    Take a sp loader which load the phrases from an external source.
    Each phrase is filtered against black/white lists and a sanity regex,
    then one place_classtype table per class/type pair is maintained.
    """
    def __init__(self, config: Configuration, conn: Connection,
                 sp_loader: SpecialPhraseLoader) -> None:
        self.config = config
        self.db_connection = conn
        self.sp_loader = sp_loader
        self.statistics_handler = SpecialPhrasesImporterStatistics()
        self.black_list, self.white_list = self._load_white_and_black_lists()
        # Compiled once here as it is run against every phrase's class and type.
        self.sanity_check_pattern = re.compile(r'^\w+$')
        # This set will contain all existing phrases to be added.
        # It contains tuples with the following format: (label, class, type, operator)
        self.word_phrases: Set[Tuple[str, str, str, str]] = set()
        # This set will contain all existing place_classtype tables which doesn't match any
        # special phrases class/type on the wiki.
        self.table_phrases_to_delete: Set[str] = set()

    def import_phrases(self, tokenizer: AbstractTokenizer, should_replace: bool) -> None:
        """
        Iterate through all SpecialPhrases extracted from the
        loader and import them into the database.

        If should_replace is set to True only the loaded phrases
        will be kept into the database. All other phrases already
        in the database will be removed.
        """
        LOG.warning('Special phrases importation starting')
        self._fetch_existing_place_classtype_tables()

        # Store pairs of class/type for further processing
        class_type_pairs = set()

        for phrase in self.sp_loader.generate_phrases():
            result = self._process_phrase(phrase)
            if result:
                class_type_pairs.add(result)

        self._create_classtype_table_and_indexes(class_type_pairs)
        if should_replace:
            self._remove_non_existent_tables_from_db()
        # Commit the table changes before handing the phrases to the tokenizer.
        self.db_connection.commit()

        with tokenizer.name_analyzer() as analyzer:
            analyzer.update_special_phrases(self.word_phrases, should_replace)

        LOG.warning('Import done.')
        self.statistics_handler.notify_import_done()

    def _fetch_existing_place_classtype_tables(self) -> None:
        """
        Fetch existing place_classtype tables.
        Fill the table_phrases_to_delete set of the class.

        Tables matching a freshly imported phrase are removed from that
        set again in _create_classtype_table_and_indexes(); whatever
        remains is stale and may be dropped.
        """
        query = """
            SELECT table_name
            FROM information_schema.tables
            WHERE table_schema='public'
            AND table_name like 'place_classtype_%';
        """
        with self.db_connection.cursor() as db_cursor:
            db_cursor.execute(SQL(query))
            for row in db_cursor:
                self.table_phrases_to_delete.add(row[0])

    def _load_white_and_black_lists(self) \
            -> Tuple[Mapping[str, Sequence[str]], Mapping[str, Sequence[str]]]:
        """
        Load white and black lists from phrases-settings.json.

        Returns the pair (blackList, whiteList), each mapping a phrase
        class to the list of types it black-/whitelists.
        """
        settings = self.config.load_sub_configuration('phrase-settings.json')

        return settings['blackList'], settings['whiteList']

    def _check_sanity(self, phrase: SpecialPhrase) -> bool:
        """
        Check sanity of given inputs in case somebody added garbage in the wiki.
        If a bad class/type is detected, a warning is logged and False is
        returned so the phrase is skipped.
        """
        # Both class and type must consist of word characters only (\w+).
        class_matchs = self.sanity_check_pattern.findall(phrase.p_class)
        type_matchs = self.sanity_check_pattern.findall(phrase.p_type)

        if not class_matchs or not type_matchs:
            LOG.warning("Bad class/type: %s=%s. It will not be imported",
                        phrase.p_class, phrase.p_type)
            return False
        return True

    def _process_phrase(self, phrase: SpecialPhrase) -> Optional[Tuple[str, str]]:
        """
        Processes the given phrase by checking black and white list
        and sanity.
        Return the class/type pair corresponding to the phrase,
        or None when the phrase is filtered out.
        """

        # blacklisting: disallow certain class/type combinations
        if phrase.p_class in self.black_list.keys() \
           and phrase.p_type in self.black_list[phrase.p_class]:
            return None

        # whitelisting: if class is in whitelist, allow only tags in the list
        if phrase.p_class in self.white_list.keys() \
           and phrase.p_type not in self.white_list[phrase.p_class]:
            return None

        # sanity check, in case somebody added garbage in the wiki
        if not self._check_sanity(phrase):
            self.statistics_handler.notify_one_phrase_invalid()
            return None

        # Accepted: remember the full phrase for the tokenizer stage.
        self.word_phrases.add((phrase.p_label, phrase.p_class,
                               phrase.p_type, phrase.p_operator))

        return (phrase.p_class, phrase.p_type)

    def _create_classtype_table_and_indexes(self,
                                            class_type_pairs: Iterable[Tuple[str, str]]) -> None:
        """
        Create table place_classtype for each given pair.
        Also create indexes on place_id and centroid.
        """
        LOG.warning('Create tables and indexes...')

        sql_tablespace = self.config.TABLESPACE_AUX_DATA
        if sql_tablespace:
            sql_tablespace = ' TABLESPACE ' + sql_tablespace

        # Temporary index on placex(class, type) while the per-pair
        # SELECTs run; it is dropped again at the end of this function.
        with self.db_connection.cursor() as db_cursor:
            db_cursor.execute("CREATE INDEX idx_placex_classtype ON placex (class, type)")

        for pair in class_type_pairs:
            phrase_class = pair[0]
            phrase_type = pair[1]

            table_name = _classtype_table(phrase_class, phrase_type)

            if table_name in self.table_phrases_to_delete:
                self.statistics_handler.notify_one_table_ignored()
                # Remove this table from the ones to delete as it match a
                # class/type still existing on the special phrases of the wiki.
                self.table_phrases_to_delete.remove(table_name)
                # So don't need to create the table and indexes.
                continue

            # Table creation
            self._create_place_classtype_table(sql_tablespace, phrase_class, phrase_type)

            # Indexes creation
            self._create_place_classtype_indexes(sql_tablespace, phrase_class, phrase_type)

            # Grant access on read to the web user.
            self._grant_access_to_webuser(phrase_class, phrase_type)

            self.statistics_handler.notify_one_table_created()

        with self.db_connection.cursor() as db_cursor:
            db_cursor.execute("DROP INDEX idx_placex_classtype")

    def _create_place_classtype_table(self, sql_tablespace: str,
                                      phrase_class: str, phrase_type: str) -> None:
        """
        Create table place_classtype of the given phrase_class/phrase_type
        if it doesn't exist yet. The table holds the place_id and centroid
        of every placex row matching the class/type pair.
        """
        table_name = _classtype_table(phrase_class, phrase_type)
        # Table name and tablespace clause are composed via psycopg2.sql;
        # class/type values are passed as bound parameters.
        with self.db_connection.cursor() as cur:
            cur.execute(SQL("""CREATE TABLE IF NOT EXISTS {} {} AS
                                 SELECT place_id AS place_id,
                                        st_centroid(geometry) AS centroid
                                 FROM placex
                                 WHERE class = %s AND type = %s
                            """).format(Identifier(table_name), SQL(sql_tablespace)),
                        (phrase_class, phrase_type))

    def _create_place_classtype_indexes(self, sql_tablespace: str,
                                        phrase_class: str, phrase_type: str) -> None:
        """
        Create indexes on centroid and place_id for the place_classtype table.
        Each index is only created when it does not exist yet.
        """
        index_prefix = f'idx_place_classtype_{phrase_class}_{phrase_type}_'
        base_table = _classtype_table(phrase_class, phrase_type)
        # Index on centroid
        if not self.db_connection.index_exists(index_prefix + 'centroid'):
            with self.db_connection.cursor() as db_cursor:
                db_cursor.execute(SQL("CREATE INDEX {} ON {} USING GIST (centroid) {}")
                                  .format(Identifier(index_prefix + 'centroid'),
                                          Identifier(base_table),
                                          SQL(sql_tablespace)))

        # Index on place_id
        if not self.db_connection.index_exists(index_prefix + 'place_id'):
            with self.db_connection.cursor() as db_cursor:
                db_cursor.execute(SQL("CREATE INDEX {} ON {} USING btree(place_id) {}")
                                  .format(Identifier(index_prefix + 'place_id'),
                                          Identifier(base_table),
                                          SQL(sql_tablespace)))

    def _grant_access_to_webuser(self, phrase_class: str, phrase_type: str) -> None:
        """
        Grant access on read to the table place_classtype for the webuser.
        """
        table_name = _classtype_table(phrase_class, phrase_type)
        with self.db_connection.cursor() as db_cursor:
            db_cursor.execute(SQL("""GRANT SELECT ON {} TO {}""")
                              .format(Identifier(table_name),
                                      Identifier(self.config.DATABASE_WEBUSER)))

    def _remove_non_existent_tables_from_db(self) -> None:
        """
        Remove special phrases which doesn't exist on the wiki anymore.
        Delete the place_classtype tables.
        """
        LOG.warning('Cleaning database...')

        # Delete place_classtype tables corresponding to class/type which
        # are not on the wiki anymore.
        with self.db_connection.cursor() as db_cursor:
            for table in self.table_phrases_to_delete:
                self.statistics_handler.notify_one_table_deleted()
                db_cursor.drop_table(table)
|
||||
68
src/nominatim_db/tools/special_phrases/sp_wiki_loader.py
Normal file
68
src/nominatim_db/tools/special_phrases/sp_wiki_loader.py
Normal file
@@ -0,0 +1,68 @@
|
||||
# SPDX-License-Identifier: GPL-3.0-or-later
|
||||
#
|
||||
# This file is part of Nominatim. (https://nominatim.org)
|
||||
#
|
||||
# Copyright (C) 2024 by the Nominatim developer community.
|
||||
# For a full list of authors see the git log.
|
||||
"""
|
||||
Module containing the SPWikiLoader class.
|
||||
"""
|
||||
from typing import Iterable
|
||||
import re
|
||||
import logging
|
||||
|
||||
from nominatim_core.config import Configuration
|
||||
from nominatim_core.utils.url_utils import get_url
|
||||
from .special_phrase import SpecialPhrase
|
||||
|
||||
LOG = logging.getLogger()
|
||||
|
||||
def _get_wiki_content(lang: str) -> str:
    """ Request and return the wiki page's content
        corresponding to special phrases for a given lang.

        Requested URL Example :
        https://wiki.openstreetmap.org/wiki/Special:Export/Nominatim/Special_Phrases/EN
    """
    base_url = 'https://wiki.openstreetmap.org/wiki/Special:Export/Nominatim/Special_Phrases/'
    # The wiki pages are named after the upper-cased language code.
    return get_url(base_url + lang.upper())
|
||||
|
||||
|
||||
class SPWikiLoader:
    """
    Handles loading of special phrases from the wiki.
    """

    def __init__(self, config: Configuration) -> None:
        self.config = config
        # Compile the regex here to increase performances.
        self.occurence_pattern = re.compile(
            r'\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([\-YN])'
        )
        # Hack around a bug where building=yes was imported with quotes into the wiki
        self.type_fix_pattern = re.compile(r'\"|"')

        configured_languages = self.config.get_str_list('LANGUAGES')
        if configured_languages:
            self.languages = configured_languages
        else:
            # Fall back to the full default language set.
            self.languages = ['af', 'ar', 'br', 'ca', 'cs', 'de', 'en', 'es',
                              'et', 'eu', 'fa', 'fi', 'fr', 'gl', 'hr', 'hu',
                              'ia', 'is', 'it', 'ja', 'mk', 'nl', 'no', 'pl',
                              'ps', 'pt', 'ru', 'sk', 'sl', 'sv', 'uk', 'vi',
                              'lv', 'tr']

    def generate_phrases(self) -> Iterable[SpecialPhrase]:
        """ Download the wiki pages for the configured languages
            and extract the phrases from the page.
        """
        for lang in self.languages:
            LOG.warning('Importing phrases for lang: %s...', lang)
            page_content = _get_wiki_content(lang)

            # One match will be of format [label, class, type, operator, plural]
            for label, p_class, p_type, operator, _ in \
                    self.occurence_pattern.findall(page_content):
                yield SpecialPhrase(label,
                                    p_class,
                                    self.type_fix_pattern.sub('', p_type),
                                    operator)
|
||||
37
src/nominatim_db/tools/special_phrases/special_phrase.py
Normal file
37
src/nominatim_db/tools/special_phrases/special_phrase.py
Normal file
@@ -0,0 +1,37 @@
|
||||
# SPDX-License-Identifier: GPL-3.0-or-later
|
||||
#
|
||||
# This file is part of Nominatim. (https://nominatim.org)
|
||||
#
|
||||
# Copyright (C) 2024 by the Nominatim developer community.
|
||||
# For a full list of authors see the git log.
|
||||
"""
|
||||
Module containing the class SpecialPhrase.
|
||||
|
||||
This class is a model used to transfer a special phrase through
|
||||
the process of load and importation.
|
||||
"""
|
||||
from typing import Any
|
||||
|
||||
class SpecialPhrase:
    """
    Model representing a special phrase.

    On construction, label, class and type are stripped of surrounding
    whitespace and the operator is normalized to 'near', 'in' or '-'
    (the "no operator" marker). Instances are hashable and compare by
    value, so they can be collected in sets.
    """
    def __init__(self, p_label: str, p_class: str, p_type: str, p_operator: str) -> None:
        self.p_label = p_label.strip()
        self.p_class = p_class.strip()
        self.p_type = p_type.strip()
        # Needed if some operator in the wiki are not written in english
        p_operator = p_operator.strip().lower()
        self.p_operator = '-' if p_operator not in ('near', 'in') else p_operator

    def __eq__(self, other: Any) -> bool:
        # Return NotImplemented (not False) for foreign types so Python can
        # try the reflected comparison before falling back to identity.
        if not isinstance(other, SpecialPhrase):
            return NotImplemented

        return self.p_label == other.p_label \
               and self.p_class == other.p_class \
               and self.p_type == other.p_type \
               and self.p_operator == other.p_operator

    def __hash__(self) -> int:
        # Must stay consistent with __eq__: hash over the same four fields.
        return hash((self.p_label, self.p_class, self.p_type, self.p_operator))

    def __repr__(self) -> str:
        # Debug aid only; not used by the import pipeline.
        return (f'SpecialPhrase({self.p_label!r}, {self.p_class!r}, '
                f'{self.p_type!r}, {self.p_operator!r})')
|
||||
Reference in New Issue
Block a user