add type annotations to special phrase importer

This commit is contained in:
Sarah Hoffmann
2022-07-17 10:46:59 +02:00
parent 459ab3bbdc
commit 9963261d8d
9 changed files with 77 additions and 60 deletions

View File

@@ -9,7 +9,7 @@ Abstract class definitions for tokenizers. These base classes are here
mainly for documentation purposes. mainly for documentation purposes.
""" """
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from typing import List, Tuple, Dict, Any, Optional from typing import List, Tuple, Dict, Any, Optional, Iterable
from pathlib import Path from pathlib import Path
from typing_extensions import Protocol from typing_extensions import Protocol
@@ -81,7 +81,8 @@ class AbstractAnalyzer(ABC):
@abstractmethod @abstractmethod
def update_special_phrases(self, phrases: List[Tuple[str, str, str, str]], def update_special_phrases(self,
phrases: Iterable[Tuple[str, str, str, str]],
should_replace: bool) -> None: should_replace: bool) -> None:
""" Update the tokenizer's special phrase tokens from the given """ Update the tokenizer's special phrase tokens from the given
list of special phrases. list of special phrases.

View File

@@ -8,7 +8,8 @@
Tokenizer implementing normalisation as used before Nominatim 4 but using Tokenizer implementing normalisation as used before Nominatim 4 but using
libICU instead of the PostgreSQL module. libICU instead of the PostgreSQL module.
""" """
from typing import Optional, Sequence, List, Tuple, Mapping, Any, cast, Dict, Set, Iterable from typing import Optional, Sequence, List, Tuple, Mapping, Any, cast, \
Dict, Set, Iterable
import itertools import itertools
import json import json
import logging import logging
@@ -374,7 +375,7 @@ class ICUNameAnalyzer(AbstractAnalyzer):
def update_special_phrases(self, phrases: Sequence[Tuple[str, str, str, str]], def update_special_phrases(self, phrases: Iterable[Tuple[str, str, str, str]],
should_replace: bool) -> None: should_replace: bool) -> None:
""" Replace the search index for special phrases with the new phrases. """ Replace the search index for special phrases with the new phrases.
If `should_replace` is True, then the previous set of phrases will be If `should_replace` is True, then the previous set of phrases will be

View File

@@ -7,7 +7,8 @@
""" """
Tokenizer implementing normalisation as used before Nominatim 4. Tokenizer implementing normalisation as used before Nominatim 4.
""" """
from typing import Optional, Sequence, List, Tuple, Mapping, Any, Callable, cast, Dict, Set from typing import Optional, Sequence, List, Tuple, Mapping, Any, Callable, \
cast, Dict, Set, Iterable
from collections import OrderedDict from collections import OrderedDict
import logging import logging
from pathlib import Path from pathlib import Path
@@ -392,7 +393,7 @@ class LegacyNameAnalyzer(AbstractAnalyzer):
def update_special_phrases(self, phrases: Sequence[Tuple[str, str, str, str]], def update_special_phrases(self, phrases: Iterable[Tuple[str, str, str, str]],
should_replace: bool) -> None: should_replace: bool) -> None:
""" Replace the search index for special phrases with the new phrases. """ Replace the search index for special phrases with the new phrases.
""" """

View File

@@ -12,15 +12,14 @@ import logging
LOG = logging.getLogger() LOG = logging.getLogger()
class SpecialPhrasesImporterStatistics(): class SpecialPhrasesImporterStatistics():
# pylint: disable-msg=too-many-instance-attributes
""" """
Class handling statistics of the import Class handling statistics of the import
process of special phrases. process of special phrases.
""" """
def __init__(self): def __init__(self) -> None:
self._intialize_values() self._intialize_values()
def _intialize_values(self): def _intialize_values(self) -> None:
""" """
Set all counts for the global Set all counts for the global
import to 0. import to 0.
@@ -30,32 +29,32 @@ class SpecialPhrasesImporterStatistics():
self.tables_ignored = 0 self.tables_ignored = 0
self.invalids = 0 self.invalids = 0
def notify_one_phrase_invalid(self): def notify_one_phrase_invalid(self) -> None:
""" """
Add +1 to the count of invalid entries Add +1 to the count of invalid entries
fetched from the wiki. fetched from the wiki.
""" """
self.invalids += 1 self.invalids += 1
def notify_one_table_created(self): def notify_one_table_created(self) -> None:
""" """
Add +1 to the count of created tables. Add +1 to the count of created tables.
""" """
self.tables_created += 1 self.tables_created += 1
def notify_one_table_deleted(self): def notify_one_table_deleted(self) -> None:
""" """
Add +1 to the count of deleted tables. Add +1 to the count of deleted tables.
""" """
self.tables_deleted += 1 self.tables_deleted += 1
def notify_one_table_ignored(self): def notify_one_table_ignored(self) -> None:
""" """
Add +1 to the count of ignored tables. Add +1 to the count of ignored tables.
""" """
self.tables_ignored += 1 self.tables_ignored += 1
def notify_import_done(self): def notify_import_done(self) -> None:
""" """
Print stats for the whole import process Print stats for the whole import process
and reset all values. and reset all values.

View File

@@ -9,6 +9,7 @@
The class allows loading phrases from a csv file. The class allows loading phrases from a csv file.
""" """
from typing import Iterable
import csv import csv
import os import os
from nominatim.tools.special_phrases.special_phrase import SpecialPhrase from nominatim.tools.special_phrases.special_phrase import SpecialPhrase
@@ -18,12 +19,11 @@ class SPCsvLoader:
""" """
Handles loading of special phrases from external csv file. Handles loading of special phrases from external csv file.
""" """
def __init__(self, csv_path): def __init__(self, csv_path: str) -> None:
super().__init__()
self.csv_path = csv_path self.csv_path = csv_path
def generate_phrases(self): def generate_phrases(self) -> Iterable[SpecialPhrase]:
""" Open and parse the given csv file. """ Open and parse the given csv file.
Create the corresponding SpecialPhrases. Create the corresponding SpecialPhrases.
""" """
@@ -35,7 +35,7 @@ class SPCsvLoader:
yield SpecialPhrase(row['phrase'], row['class'], row['type'], row['operator']) yield SpecialPhrase(row['phrase'], row['class'], row['type'], row['operator'])
def _check_csv_validity(self): def _check_csv_validity(self) -> None:
""" """
Check that the csv file has the right extension. Check that the csv file has the right extension.
""" """

View File

@@ -13,19 +13,37 @@
The phrases already present in the database which are not The phrases already present in the database which are not
valid anymore are removed. valid anymore are removed.
""" """
from typing import Iterable, Tuple, Mapping, Sequence, Optional, Set
import logging import logging
import re import re
from typing_extensions import Protocol
from psycopg2.sql import Identifier, SQL from psycopg2.sql import Identifier, SQL
from nominatim.config import Configuration
from nominatim.db.connection import Connection
from nominatim.tools.special_phrases.importer_statistics import SpecialPhrasesImporterStatistics from nominatim.tools.special_phrases.importer_statistics import SpecialPhrasesImporterStatistics
from nominatim.tools.special_phrases.special_phrase import SpecialPhrase
from nominatim.tokenizer.base import AbstractTokenizer
LOG = logging.getLogger() LOG = logging.getLogger()
def _classtype_table(phrase_class, phrase_type): def _classtype_table(phrase_class: str, phrase_type: str) -> str:
""" Return the name of the table for the given class and type. """ Return the name of the table for the given class and type.
""" """
return f'place_classtype_{phrase_class}_{phrase_type}' return f'place_classtype_{phrase_class}_{phrase_type}'
class SpecialPhraseLoader(Protocol):
""" Protocol for classes implementing a loader for special phrases.
"""
def generate_phrases(self) -> Iterable[SpecialPhrase]:
""" Generates all special phrase terms this loader can produce.
"""
class SPImporter(): class SPImporter():
# pylint: disable-msg=too-many-instance-attributes # pylint: disable-msg=too-many-instance-attributes
""" """
@@ -33,21 +51,22 @@ class SPImporter():
Take a sp loader which loads the phrases from an external source. Take a sp loader which loads the phrases from an external source.
""" """
def __init__(self, config, db_connection, sp_loader): def __init__(self, config: Configuration, conn: Connection,
sp_loader: SpecialPhraseLoader) -> None:
self.config = config self.config = config
self.db_connection = db_connection self.db_connection = conn
self.sp_loader = sp_loader self.sp_loader = sp_loader
self.statistics_handler = SpecialPhrasesImporterStatistics() self.statistics_handler = SpecialPhrasesImporterStatistics()
self.black_list, self.white_list = self._load_white_and_black_lists() self.black_list, self.white_list = self._load_white_and_black_lists()
self.sanity_check_pattern = re.compile(r'^\w+$') self.sanity_check_pattern = re.compile(r'^\w+$')
# This set will contain all existing phrases to be added. # This set will contain all existing phrases to be added.
# It contains tuples with the following format: (label, class, type, operator) # It contains tuples with the following format: (label, class, type, operator)
self.word_phrases = set() self.word_phrases: Set[Tuple[str, str, str, str]] = set()
# This set will contain all existing place_classtype tables which don't match any # This set will contain all existing place_classtype tables which don't match any
# special phrases class/type on the wiki. # special phrases class/type on the wiki.
self.table_phrases_to_delete = set() self.table_phrases_to_delete: Set[str] = set()
def import_phrases(self, tokenizer, should_replace): def import_phrases(self, tokenizer: AbstractTokenizer, should_replace: bool) -> None:
""" """
Iterate through all SpecialPhrases extracted from the Iterate through all SpecialPhrases extracted from the
loader and import them into the database. loader and import them into the database.
@@ -67,7 +86,7 @@ class SPImporter():
if result: if result:
class_type_pairs.add(result) class_type_pairs.add(result)
self._create_place_classtype_table_and_indexes(class_type_pairs) self._create_classtype_table_and_indexes(class_type_pairs)
if should_replace: if should_replace:
self._remove_non_existent_tables_from_db() self._remove_non_existent_tables_from_db()
self.db_connection.commit() self.db_connection.commit()
@@ -79,7 +98,7 @@ class SPImporter():
self.statistics_handler.notify_import_done() self.statistics_handler.notify_import_done()
def _fetch_existing_place_classtype_tables(self): def _fetch_existing_place_classtype_tables(self) -> None:
""" """
Fetch existing place_classtype tables. Fetch existing place_classtype tables.
Fill the table_phrases_to_delete set of the class. Fill the table_phrases_to_delete set of the class.
@@ -95,7 +114,8 @@ class SPImporter():
for row in db_cursor: for row in db_cursor:
self.table_phrases_to_delete.add(row[0]) self.table_phrases_to_delete.add(row[0])
def _load_white_and_black_lists(self): def _load_white_and_black_lists(self) \
-> Tuple[Mapping[str, Sequence[str]], Mapping[str, Sequence[str]]]:
""" """
Load white and black lists from phrases-settings.json. Load white and black lists from phrases-settings.json.
""" """
@@ -103,7 +123,7 @@ class SPImporter():
return settings['blackList'], settings['whiteList'] return settings['blackList'], settings['whiteList']
def _check_sanity(self, phrase): def _check_sanity(self, phrase: SpecialPhrase) -> bool:
""" """
Check sanity of given inputs in case somebody added garbage in the wiki. Check sanity of given inputs in case somebody added garbage in the wiki.
If a bad class/type is detected the system will exit with an error. If a bad class/type is detected the system will exit with an error.
@@ -117,7 +137,7 @@ class SPImporter():
return False return False
return True return True
def _process_phrase(self, phrase): def _process_phrase(self, phrase: SpecialPhrase) -> Optional[Tuple[str, str]]:
""" """
Processes the given phrase by checking black and white list Processes the given phrase by checking black and white list
and sanity. and sanity.
@@ -145,7 +165,8 @@ class SPImporter():
return (phrase.p_class, phrase.p_type) return (phrase.p_class, phrase.p_type)
def _create_place_classtype_table_and_indexes(self, class_type_pairs): def _create_classtype_table_and_indexes(self,
class_type_pairs: Iterable[Tuple[str, str]]) -> None:
""" """
Create table place_classtype for each given pair. Create table place_classtype for each given pair.
Also create indexes on place_id and centroid. Also create indexes on place_id and centroid.
@@ -188,7 +209,8 @@ class SPImporter():
db_cursor.execute("DROP INDEX idx_placex_classtype") db_cursor.execute("DROP INDEX idx_placex_classtype")
def _create_place_classtype_table(self, sql_tablespace, phrase_class, phrase_type): def _create_place_classtype_table(self, sql_tablespace: str,
phrase_class: str, phrase_type: str) -> None:
""" """
Create table place_classtype of the given phrase_class/phrase_type Create table place_classtype of the given phrase_class/phrase_type
if it doesn't exist. if it doesn't exist.
@@ -204,7 +226,8 @@ class SPImporter():
(phrase_class, phrase_type)) (phrase_class, phrase_type))
def _create_place_classtype_indexes(self, sql_tablespace, phrase_class, phrase_type): def _create_place_classtype_indexes(self, sql_tablespace: str,
phrase_class: str, phrase_type: str) -> None:
""" """
Create indexes on centroid and place_id for the place_classtype table. Create indexes on centroid and place_id for the place_classtype table.
""" """
@@ -227,7 +250,7 @@ class SPImporter():
SQL(sql_tablespace))) SQL(sql_tablespace)))
def _grant_access_to_webuser(self, phrase_class, phrase_type): def _grant_access_to_webuser(self, phrase_class: str, phrase_type: str) -> None:
""" """
Grant access on read to the table place_classtype for the webuser. Grant access on read to the table place_classtype for the webuser.
""" """
@@ -237,7 +260,7 @@ class SPImporter():
.format(Identifier(table_name), .format(Identifier(table_name),
Identifier(self.config.DATABASE_WEBUSER))) Identifier(self.config.DATABASE_WEBUSER)))
def _remove_non_existent_tables_from_db(self): def _remove_non_existent_tables_from_db(self) -> None:
""" """
Remove special phrases which don't exist on the wiki anymore. Remove special phrases which don't exist on the wiki anymore.
Delete the place_classtype tables. Delete the place_classtype tables.

View File

@@ -7,14 +7,17 @@
""" """
Module containing the SPWikiLoader class. Module containing the SPWikiLoader class.
""" """
from typing import Iterable
import re import re
import logging import logging
from nominatim.config import Configuration
from nominatim.tools.special_phrases.special_phrase import SpecialPhrase from nominatim.tools.special_phrases.special_phrase import SpecialPhrase
from nominatim.tools.exec_utils import get_url from nominatim.tools.exec_utils import get_url
LOG = logging.getLogger() LOG = logging.getLogger()
def _get_wiki_content(lang): def _get_wiki_content(lang: str) -> str:
""" """
Request and return the wiki page's content Request and return the wiki page's content
corresponding to special phrases for a given lang. corresponding to special phrases for a given lang.
@@ -30,8 +33,7 @@ class SPWikiLoader:
""" """
Handles loading of special phrases from the wiki. Handles loading of special phrases from the wiki.
""" """
def __init__(self, config): def __init__(self, config: Configuration) -> None:
super().__init__()
self.config = config self.config = config
# Compile the regex here to increase performance. # Compile the regex here to increase performance.
self.occurence_pattern = re.compile( self.occurence_pattern = re.compile(
@@ -39,10 +41,15 @@ class SPWikiLoader:
) )
# Hack around a bug where building=yes was imported with quotes into the wiki # Hack around a bug where building=yes was imported with quotes into the wiki
self.type_fix_pattern = re.compile(r'\"|"') self.type_fix_pattern = re.compile(r'\"|"')
self._load_languages()
self.languages = self.config.get_str_list('LANGUAGES') or \
['af', 'ar', 'br', 'ca', 'cs', 'de', 'en', 'es',
'et', 'eu', 'fa', 'fi', 'fr', 'gl', 'hr', 'hu',
'ia', 'is', 'it', 'ja', 'mk', 'nl', 'no', 'pl',
'ps', 'pt', 'ru', 'sk', 'sl', 'sv', 'uk', 'vi']
def generate_phrases(self): def generate_phrases(self) -> Iterable[SpecialPhrase]:
""" Download the wiki pages for the configured languages """ Download the wiki pages for the configured languages
and extract the phrases from the page. and extract the phrases from the page.
""" """
@@ -58,19 +65,3 @@ class SPWikiLoader:
match[1], match[1],
self.type_fix_pattern.sub('', match[2]), self.type_fix_pattern.sub('', match[2]),
match[3]) match[3])
def _load_languages(self):
"""
Get list of all languages from env config file
or default if there is no languages configured.
The system will extract special phrases only from all specified languages.
"""
if self.config.LANGUAGES:
self.languages = self.config.get_str_list('LANGUAGES')
else:
self.languages = [
'af', 'ar', 'br', 'ca', 'cs', 'de', 'en', 'es',
'et', 'eu', 'fa', 'fi', 'fr', 'gl', 'hr', 'hu',
'ia', 'is', 'it', 'ja', 'mk', 'nl', 'no', 'pl',
'ps', 'pt', 'ru', 'sk', 'sl', 'sv', 'uk', 'vi']

View File

@@ -10,20 +10,21 @@
This class is a model used to transfer a special phrase through This class is a model used to transfer a special phrase through
the process of load and importation. the process of load and importation.
""" """
from typing import Any
class SpecialPhrase: class SpecialPhrase:
""" """
Model representing a special phrase. Model representing a special phrase.
""" """
def __init__(self, p_label, p_class, p_type, p_operator): def __init__(self, p_label: str, p_class: str, p_type: str, p_operator: str) -> None:
self.p_label = p_label.strip() self.p_label = p_label.strip()
self.p_class = p_class.strip() self.p_class = p_class.strip()
# Hack around a bug where building=yes was imported with quotes into the wiki
self.p_type = p_type.strip() self.p_type = p_type.strip()
# Needed if some operators in the wiki are not written in English # Needed if some operators in the wiki are not written in English
p_operator = p_operator.strip().lower() p_operator = p_operator.strip().lower()
self.p_operator = '-' if p_operator not in ('near', 'in') else p_operator self.p_operator = '-' if p_operator not in ('near', 'in') else p_operator
def __eq__(self, other): def __eq__(self, other: Any) -> bool:
if not isinstance(other, SpecialPhrase): if not isinstance(other, SpecialPhrase):
return False return False
@@ -32,5 +33,5 @@ class SpecialPhrase:
and self.p_type == other.p_type \ and self.p_type == other.p_type \
and self.p_operator == other.p_operator and self.p_operator == other.p_operator
def __hash__(self): def __hash__(self) -> int:
return hash((self.p_label, self.p_class, self.p_type, self.p_operator)) return hash((self.p_label, self.p_class, self.p_type, self.p_operator))

View File

@@ -128,7 +128,7 @@ def test_create_place_classtype_table_and_indexes(
""" """
pairs = set([('class1', 'type1'), ('class2', 'type2')]) pairs = set([('class1', 'type1'), ('class2', 'type2')])
sp_importer._create_place_classtype_table_and_indexes(pairs) sp_importer._create_classtype_table_and_indexes(pairs)
for pair in pairs: for pair in pairs:
assert check_table_exist(temp_db_conn, pair[0], pair[1]) assert check_table_exist(temp_db_conn, pair[0], pair[1])