Refactor the loading of external special phrases and the import process by introducing SPLoader and SPWikiLoader

This commit is contained in:
AntoJvlt
2021-05-10 21:48:11 +02:00
parent 40cb17d299
commit 00959fac57
9 changed files with 226 additions and 190 deletions

View File

@@ -2,7 +2,10 @@
Implementation of the 'special-phrases' command.
"""
import logging
from nominatim.tools import SpecialPhrasesImporter
from nominatim.errors import UsageError
from pathlib import Path
from nominatim.tools import SPWikiLoader
from nominatim.tools import SPImporter
from nominatim.db.connection import connect
LOG = logging.getLogger()
@@ -21,16 +24,23 @@ class ImportSpecialPhrases:
group = parser.add_argument_group('Input arguments')
group.add_argument('--import-from-wiki', action='store_true',
help='Import special phrases from the OSM wiki to the database.')
group.add_argument('--csv-file', metavar='FILE',
help='CSV file containing phrases to import.')
@staticmethod
def run(args):
from ..tokenizer import factory as tokenizer_factory
if args.import_from_wiki:
LOG.warning('Special phrases importation starting')
tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)
with connect(args.config.get_libpq_dsn()) as db_connection:
SpecialPhrasesImporter(
args.config, args.phplib_dir, db_connection
).import_from_wiki(tokenizer)
SPImporter(
args.config, args.phplib_dir, db_connection, SPWikiLoader(args.config)
).import_phrases(tokenizer)
if args.csv_file:
if not Path(args.csv_file).is_file():
LOG.fatal("CSV file '%s' does not exist.", args.csv_file)
raise UsageError('Cannot access file.')
return 0

View File

@@ -3,4 +3,6 @@ Module with functions for importing, updating Nominatim databases
as well as general maintenance helpers.
"""
from nominatim.tools.special_phrases.special_phrases_importer import SpecialPhrasesImporter
from nominatim.tools.special_phrases.sp_importer import SPImporter
from nominatim.tools.special_phrases.sp_wiki_loader import SPWikiLoader
from nominatim.tools.special_phrases.special_phrase import SpecialPhrase

View File

@@ -12,10 +12,9 @@ class SpecialPhrasesImporterStatistics():
process of special phrases.
"""
def __init__(self):
self._set_lang_values_to_0()
self._set_global_values_to_0()
self._intialize_values()
def _set_global_values_to_0(self):
def _intialize_values(self):
"""
Set all counts for the global
import to 0.
@@ -23,22 +22,14 @@ class SpecialPhrasesImporterStatistics():
self.tables_created = 0
self.tables_deleted = 0
self.tables_ignored = 0
self.global_phrases_invalid = 0
def _set_lang_values_to_0(self):
"""
Set all counts for the current
lang to 0.
"""
self.lang_phrases_invalid = 0
self.invalids = 0
def notify_one_phrase_invalid(self):
"""
Add +1 to the count of invalid entries
fetched from the wiki.
"""
self.lang_phrases_invalid += 1
self.global_phrases_invalid += 1
self.invalids += 1
def notify_one_table_created(self):
"""
@@ -58,7 +49,6 @@ class SpecialPhrasesImporterStatistics():
"""
self.tables_ignored += 1
def notify_import_done(self):
"""
Print stats for the whole import process
@@ -66,8 +56,8 @@ class SpecialPhrasesImporterStatistics():
"""
LOG.info('====================================================================')
LOG.info('Final statistics of the import:')
LOG.info('- %s phrases were invalid.', self.global_phrases_invalid)
if self.global_phrases_invalid > 0:
LOG.info('- %s phrases were invalid.', self.invalids)
if self.invalids > 0:
LOG.info(' Those invalid phrases have been skipped.')
LOG.info('- %s tables were ignored as they already exist on the database',
self.tables_ignored)
@@ -76,26 +66,8 @@ class SpecialPhrasesImporterStatistics():
if self.tables_deleted > 0:
LOG.info(' They were deleted as they are not valid anymore.')
if self.global_phrases_invalid > 0:
if self.invalids > 0:
LOG.warning('%s phrases were invalid and have been skipped during the whole process.',
self.global_phrases_invalid)
self.invalids)
self._set_global_values_to_0()
def notify_current_lang_done(self, lang):
"""
Print stats for the current lang
and then reset lang values.
"""
LOG.info('====================================================================')
LOG.info('Statistics for the import of %s:', lang)
LOG.info('- %s phrases were invalid.', self.lang_phrases_invalid)
if self.lang_phrases_invalid > 0:
LOG.info(' Those invalid phrases have been skipped.')
LOG.info('====================================================================')
if self.lang_phrases_invalid > 0:
LOG.warning('%s phrases were invalid and have been skipped for the import of lang %s.',
self.lang_phrases_invalid, lang)
self._set_lang_values_to_0()
self._intialize_values()

View File

@@ -1,5 +1,11 @@
"""
Functions to import special phrases into the database.
Module containing the class handling the import
of the special phrases.
Phrases are analyzed and imported into the database.
The phrases already present in the database which are not
valids anymore are removed.
"""
import logging
import os
@@ -10,27 +16,24 @@ import subprocess
import json
from psycopg2.sql import Identifier, Literal, SQL
from nominatim.tools.exec_utils import get_url
from nominatim.errors import UsageError
from nominatim.tools.special_phrases.importer_statistics import SpecialPhrasesImporterStatistics
LOG = logging.getLogger()
class SpecialPhrasesImporter():
class SPImporter():
# pylint: disable-msg=too-many-instance-attributes
"""
Class handling the process of special phrases importations.
Class handling the process of special phrases importations into the database.
Take a SPLoader which load the phrases from an external source.
"""
def __init__(self, config, phplib_dir, db_connection) -> None:
self.statistics_handler = SpecialPhrasesImporterStatistics()
self.db_connection = db_connection
def __init__(self, config, phplib_dir, db_connection, sp_loader) -> None:
self.config = config
self.phplib_dir = phplib_dir
self.db_connection = db_connection
self.sp_loader = sp_loader
self.statistics_handler = SpecialPhrasesImporterStatistics()
self.black_list, self.white_list = self._load_white_and_black_lists()
#Compile the regex here to increase performances.
self.occurence_pattern = re.compile(
r'\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([\-YN])'
)
self.sanity_check_pattern = re.compile(r'^\w+$')
# This set will contain all existing phrases to be added.
# It contains tuples with the following format: (lable, class, type, operator)
@@ -39,27 +42,22 @@ class SpecialPhrasesImporter():
#special phrases class/type on the wiki.
self.table_phrases_to_delete = set()
def import_from_wiki(self, tokenizer, languages=None):
def import_phrases(self, tokenizer):
"""
Iterate through all specified languages and
extract corresponding special phrases from the wiki.
"""
if languages is not None and not isinstance(languages, list):
raise TypeError('The \'languages\' argument should be of type list.')
LOG.warning('Special phrases importation starting')
self._fetch_existing_place_classtype_tables()
#Get all languages to process.
languages = self._load_languages() if not languages else languages
#Store pairs of class/type for further processing
class_type_pairs = set()
for lang in languages:
LOG.warning('Importing phrases for lang: %s...', lang)
wiki_page_xml_content = SpecialPhrasesImporter._get_wiki_content(lang)
class_type_pairs.update(self._process_xml_content(wiki_page_xml_content, lang))
self.statistics_handler.notify_current_lang_done(lang)
for loaded_phrases in self.sp_loader:
for phrase in loaded_phrases:
result = self._process_phrase(phrase)
if result:
class_type_pairs.update(result)
self._create_place_classtype_table_and_indexes(class_type_pairs)
self._remove_non_existent_tables_from_db()
@@ -101,89 +99,48 @@ class SpecialPhrasesImporter():
settings = json.load(json_settings)
return settings['blackList'], settings['whiteList']
def _load_languages(self):
"""
Get list of all languages from env config file
or default if there is no languages configured.
The system will extract special phrases only from all specified languages.
"""
default_languages = [
'af', 'ar', 'br', 'ca', 'cs', 'de', 'en', 'es',
'et', 'eu', 'fa', 'fi', 'fr', 'gl', 'hr', 'hu',
'ia', 'is', 'it', 'ja', 'mk', 'nl', 'no', 'pl',
'ps', 'pt', 'ru', 'sk', 'sl', 'sv', 'uk', 'vi']
return self.config.LANGUAGES.split(',') if self.config.LANGUAGES else default_languages
@staticmethod
def _get_wiki_content(lang):
"""
Request and return the wiki page's content
corresponding to special phrases for a given lang.
Requested URL Example :
https://wiki.openstreetmap.org/wiki/Special:Export/Nominatim/Special_Phrases/EN
"""
url = 'https://wiki.openstreetmap.org/wiki/Special:Export/Nominatim/Special_Phrases/' + lang.upper() # pylint: disable=line-too-long
return get_url(url)
def _check_sanity(self, lang, phrase_class, phrase_type):
def _check_sanity(self, phrase):
"""
Check sanity of given inputs in case somebody added garbage in the wiki.
If a bad class/type is detected the system will exit with an error.
"""
type_matchs = self.sanity_check_pattern.findall(phrase_type)
class_matchs = self.sanity_check_pattern.findall(phrase_class)
class_matchs = self.sanity_check_pattern.findall(phrase.p_class)
type_matchs = self.sanity_check_pattern.findall(phrase.p_type)
if not class_matchs or not type_matchs:
LOG.warning("Bad class/type for language %s: %s=%s. It will not be imported",
lang, phrase_class, phrase_type)
LOG.warning("Bad class/type: %s=%s. It will not be imported",
phrase.p_class, phrase.p_type)
return False
return True
def _process_xml_content(self, xml_content, lang):
def _process_phrase(self, phrase):
"""
Process given xml content by extracting matching patterns.
Matching patterns are processed there and returned in a
set of class/type pairs.
Processes the given phrase by checking black and white list
and sanity.
Return the class/type pair corresponding to the phrase.
"""
#One match will be of format [label, class, type, operator, plural]
matches = self.occurence_pattern.findall(xml_content)
#Store pairs of class/type for further processing
class_type_pairs = set()
for match in matches:
phrase_label = match[0].strip()
phrase_class = match[1].strip()
phrase_type = match[2].strip()
phrase_operator = match[3].strip()
#Needed if some operator in the wiki are not written in english
phrase_operator = '-' if phrase_operator not in ('near', 'in') else phrase_operator
#hack around a bug where building=yes was imported with quotes into the wiki
phrase_type = re.sub(r'\"|"', '', phrase_type)
#blacklisting: disallow certain class/type combinations
if (
phrase.p_class in self.black_list.keys() and
phrase.p_type in self.black_list[phrase.p_class]
): return None
#blacklisting: disallow certain class/type combinations
if (
phrase_class in self.black_list.keys() and
phrase_type in self.black_list[phrase_class]
):
continue
#whitelisting: if class is in whitelist, allow only tags in the list
if (
phrase_class in self.white_list.keys() and
phrase_type not in self.white_list[phrase_class]
):
continue
#whitelisting: if class is in whitelist, allow only tags in the list
if (
phrase.p_class in self.white_list.keys() and
phrase.p_type not in self.white_list[phrase.p_class]
): return None
#sanity check, in case somebody added garbage in the wiki
if not self._check_sanity(lang, phrase_class, phrase_type):
self.statistics_handler.notify_one_phrase_invalid()
continue
#sanity check, in case somebody added garbage in the wiki
if not self._check_sanity(phrase):
self.statistics_handler.notify_one_phrase_invalid()
return None
class_type_pairs.add((phrase_class, phrase_type))
self.word_phrases.add((phrase.p_label, phrase.p_class,
phrase.p_type, phrase.p_operator))
self.word_phrases.add((phrase_label, phrase_class,
phrase_type, phrase_operator))
return class_type_pairs
return set({(phrase.p_class, phrase.p_type)})
def _create_place_classtype_table_and_indexes(self, class_type_pairs):

View File

@@ -0,0 +1,16 @@
"""
Module containing the SPLoader class.
"""
from abc import ABC, abstractmethod
class SPLoader(ABC):
"""
Base class for special phrases loaders.
Handle the loading of special phrases from external sources.
"""
def __iter__(self):
return self
@abstractmethod
def __next__(self):
pass

View File

@@ -0,0 +1,71 @@
"""
Module containing the SPWikiLoader class.
"""
import logging
import re
from nominatim.tools.special_phrases.special_phrase import SpecialPhrase
from nominatim.tools.special_phrases.sp_loader import SPLoader
from nominatim.tools.exec_utils import get_url
LOG = logging.getLogger()
class SPWikiLoader(SPLoader):
"""
Handles loading of special phrases from the wiki.
"""
def __init__(self, config, languages=None):
if languages is not None and not isinstance(languages, list):
raise TypeError('The \'languages\' parameter should be of type list.')
super().__init__()
self.config = config
#Compile the regex here to increase performances.
self.occurence_pattern = re.compile(
r'\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([\-YN])'
)
self.languages = self._load_languages() if not languages else languages
def __next__(self):
if not self.languages:
raise StopIteration
lang = self.languages.pop(0)
loaded_xml = SPWikiLoader._get_wiki_content(lang)
LOG.warning('Importing phrases for lang: %s...', lang)
return self.parse_xml(loaded_xml)
def parse_xml(self, xml):
"""
Parses XML content and extracts special phrases from it.
Return a list of SpecialPhrase.
"""
#One match will be of format [label, class, type, operator, plural]
matches = self.occurence_pattern.findall(xml)
returned_phrases = set()
for match in matches:
returned_phrases.add(
SpecialPhrase(match[0], match[1], match[2], match[3])
)
return returned_phrases
def _load_languages(self):
"""
Get list of all languages from env config file
or default if there is no languages configured.
The system will extract special phrases only from all specified languages.
"""
default_languages = [
'af', 'ar', 'br', 'ca', 'cs', 'de', 'en', 'es',
'et', 'eu', 'fa', 'fi', 'fr', 'gl', 'hr', 'hu',
'ia', 'is', 'it', 'ja', 'mk', 'nl', 'no', 'pl',
'ps', 'pt', 'ru', 'sk', 'sl', 'sv', 'uk', 'vi']
return self.config.LANGUAGES.split(',') if self.config.LANGUAGES else default_languages
@staticmethod
def _get_wiki_content(lang):
"""
Request and return the wiki page's content
corresponding to special phrases for a given lang.
Requested URL Example :
https://wiki.openstreetmap.org/wiki/Special:Export/Nominatim/Special_Phrases/EN
"""
url = 'https://wiki.openstreetmap.org/wiki/Special:Export/Nominatim/Special_Phrases/' + lang.upper() # pylint: disable=line-too-long
return get_url(url)

View File

@@ -0,0 +1,19 @@
"""
Module containing the class SpecialPhrase.
This class is a model used to transfer a special phrase through
the process of load and importation.
"""
import re
class SpecialPhrase():
"""
Model representing a special phrase.
"""
def __init__(self, p_label, p_class, p_type, p_operator):
self.p_label = p_label.strip()
self.p_class = p_class.strip()
#Hack around a bug where building=yes was imported with quotes into the wiki
self.p_type = re.sub(r'\"|"', '', p_type.strip())
#Needed if some operator in the wiki are not written in english
self.p_operator = '-' if p_operator not in ('near', 'in') else p_operator