Merge pull request #2263 from AntoJvlt/special-phrases-autoupdate

Implemented auto update of special phrases while importing them
This commit is contained in:
Sarah Hoffmann
2021-04-15 10:13:25 +02:00
committed by GitHub
2 changed files with 256 additions and 6 deletions

View File

@@ -32,6 +32,17 @@ class SpecialPhrasesImporter():
self.sanity_check_pattern = re.compile(r'^\w+$') self.sanity_check_pattern = re.compile(r'^\w+$')
self.transliterator = Transliterator.createFromRules("special-phrases normalizer", self.transliterator = Transliterator.createFromRules("special-phrases normalizer",
self.config.TERM_NORMALIZATION) self.config.TERM_NORMALIZATION)
#This set will contain all existing phrases from the word table which
#no longer exist on the wiki.
#It contains tuples with the following format: (normalized_word, class, type, operator)
self.words_phrases_to_delete = set()
#This set will contain the phrases which still exist on the wiki.
#It is used to handle duplicates on the wiki: these phrases are removed
#from words_phrases_to_delete only at the end.
self.words_phrases_still_exist = set()
#This set will contain all existing place_classtype tables which don't match any
#special phrases class/type on the wiki.
self.table_phrases_to_delete = set()
def import_from_wiki(self, languages=None): def import_from_wiki(self, languages=None):
""" """
@@ -41,6 +52,9 @@ class SpecialPhrasesImporter():
if languages is not None and not isinstance(languages, list): if languages is not None and not isinstance(languages, list):
raise TypeError('The \'languages\' argument should be of type list.') raise TypeError('The \'languages\' argument should be of type list.')
self._fetch_existing_words_phrases()
self._fetch_existing_place_classtype_tables()
#Get all languages to process. #Get all languages to process.
languages = self._load_languages() if not languages else languages languages = self._load_languages() if not languages else languages
@@ -53,9 +67,46 @@ class SpecialPhrasesImporter():
class_type_pairs.update(self._process_xml_content(wiki_page_xml_content, lang)) class_type_pairs.update(self._process_xml_content(wiki_page_xml_content, lang))
self._create_place_classtype_table_and_indexes(class_type_pairs) self._create_place_classtype_table_and_indexes(class_type_pairs)
self._remove_non_existent_phrases_from_db()
self.db_connection.commit() self.db_connection.commit()
LOG.warning('Import done.') LOG.warning('Import done.')
def _fetch_existing_words_phrases(self):
    """
        Fetch existing special phrases from the word table.
        Fill the words_phrases_to_delete set of the class with tuples
        of the format (word, class, type, operator). A NULL operator
        is stored as '-'.
    """
    #Only extract special phrases terms:
    #If class=place and type=house then it is a housenumber term.
    #If class=place and type=postcode then it is a postcode term.
    word_query = """
        SELECT word, class, type, operator FROM word
        WHERE class != 'place' OR (type != 'house' AND type != 'postcode')
    """
    with self.db_connection.cursor() as db_cursor:
        db_cursor.execute(SQL(word_query))
        for row in db_cursor:
            #Do not assign into the row itself: rows may be immutable
            #(plain tuples) depending on the cursor in use.
            operator = '-' if row[3] is None else row[3]
            self.words_phrases_to_delete.add((row[0], row[1], row[2], operator))
def _fetch_existing_place_classtype_tables(self):
    """
        Fetch existing place_classtype tables.
        Fill the table_phrases_to_delete set of the class with the names
        of all tables of the public schema starting with 'place_classtype_'.
    """
    #'_' is a single-character wildcard in LIKE. It is escaped here so that
    #only names starting with the literal prefix 'place_classtype_' match;
    #the collected tables are candidates for being dropped later on.
    query = """
        SELECT table_name
        FROM information_schema.tables
        WHERE table_schema='public'
        AND table_name LIKE 'place!_classtype!_%' ESCAPE '!';
    """
    with self.db_connection.cursor() as db_cursor:
        db_cursor.execute(SQL(query))
        for row in db_cursor:
            self.table_phrases_to_delete.add(row[0])
def _load_white_and_black_lists(self): def _load_white_and_black_lists(self):
""" """
Load white and black lists from phrases-settings.json. Load white and black lists from phrases-settings.json.
@@ -122,12 +173,11 @@ class SpecialPhrasesImporter():
phrase_class = match[1].strip() phrase_class = match[1].strip()
phrase_type = match[2].strip() phrase_type = match[2].strip()
phrase_operator = match[3].strip() phrase_operator = match[3].strip()
#Needed if some operator in the wiki are not written in english
phrase_operator = '-' if phrase_operator not in ('near', 'in') else phrase_operator
#hack around a bug where building=yes was imported with quotes into the wiki #hack around a bug where building=yes was imported with quotes into the wiki
phrase_type = re.sub(r'\"|"', '', phrase_type) phrase_type = re.sub(r'\"|"', '', phrase_type)
#sanity check, in case somebody added garbage in the wiki
self._check_sanity(lang, phrase_class, phrase_type)
#blacklisting: disallow certain class/type combinations #blacklisting: disallow certain class/type combinations
if ( if (
phrase_class in self.black_list.keys() and phrase_class in self.black_list.keys() and
@@ -141,7 +191,22 @@ class SpecialPhrasesImporter():
): ):
continue continue
#add class/type to the pairs dict #Check if the phrase already exists in the database.
if (
(normalized_label, phrase_class, phrase_type, phrase_operator)
in self.words_phrases_to_delete
):
#Remove this phrase from the ones to delete as it still exist on the wiki.
self.words_phrases_still_exist.add(
(normalized_label, phrase_class, phrase_type, phrase_operator)
)
class_type_pairs.add((phrase_class, phrase_type))
#Dont need to add this phrase as it already exists in the word table.
continue
#sanity check, in case somebody added garbage in the wiki
self._check_sanity(lang, phrase_class, phrase_type)
class_type_pairs.add((phrase_class, phrase_type)) class_type_pairs.add((phrase_class, phrase_type))
self._process_amenity( self._process_amenity(
@@ -191,6 +256,15 @@ class SpecialPhrasesImporter():
phrase_class = pair[0] phrase_class = pair[0]
phrase_type = pair[1] phrase_type = pair[1]
table_name = 'place_classtype_{}_{}'.format(phrase_class, phrase_type)
if table_name in self.table_phrases_to_delete:
#Remove this table from the ones to delete as it match a class/type
#still existing on the special phrases of the wiki.
self.table_phrases_to_delete.remove(table_name)
#So dont need to create the table and indexes.
continue
#Table creation #Table creation
self._create_place_classtype_table(sql_tablespace, phrase_class, phrase_type) self._create_place_classtype_table(sql_tablespace, phrase_class, phrase_type)
@@ -251,6 +325,41 @@ class SpecialPhrasesImporter():
.format(Identifier(table_name), .format(Identifier(table_name),
Identifier(self.config.DATABASE_WEBUSER))) Identifier(self.config.DATABASE_WEBUSER)))
def _remove_non_existent_phrases_from_db(self):
    """
        Remove special phrases which no longer exist on the wiki.
        Entries of words_phrases_to_delete which are not part of
        words_phrases_still_exist are deleted from the word table and
        the tables listed in table_phrases_to_delete are dropped.
    """
    LOG.warning('Cleaning database...')
    #Phrases found again on the wiki during this import must be kept.
    self.words_phrases_to_delete = self.words_phrases_to_delete.difference(
        self.words_phrases_still_exist
    )

    #Statements to run, as tuples of format (query, parameters).
    pending_statements = []

    #Word table entries for phrases which are not on the wiki anymore.
    for word, phrase_class, phrase_type, operator in self.words_phrases_to_delete:
        if operator == '-':
            #'-' stands for a phrase without operator (NULL in the table).
            statement = """
                DELETE FROM word WHERE word = %s AND class = %s AND type = %s AND operator IS null
            """
            arguments = (word, phrase_class, phrase_type)
        else:
            statement = """
                DELETE FROM word WHERE word = %s AND class = %s AND type = %s AND operator = %s
            """
            arguments = (word, phrase_class, phrase_type, operator)
        pending_statements.append((statement, arguments))

    #place_classtype tables for class/type pairs which are not on the wiki anymore.
    pending_statements.extend(
        (SQL('DROP TABLE IF EXISTS {}').format(Identifier(table_name)), ())
        for table_name in self.table_phrases_to_delete
    )

    with self.db_connection.cursor() as db_cursor:
        for statement, arguments in pending_statements:
            db_cursor.execute(statement, arguments)
def _convert_php_settings_if_needed(self, file_path): def _convert_php_settings_if_needed(self, file_path):
""" """
Convert php settings file of special phrases to json file if it is still in php format. Convert php settings file of special phrases to json file if it is still in php format.

View File

@@ -2,6 +2,7 @@
Tests for import special phrases methods Tests for import special phrases methods
of the class SpecialPhrasesImporter. of the class SpecialPhrasesImporter.
""" """
from mocks import MockParamCapture
from nominatim.errors import UsageError from nominatim.errors import UsageError
from pathlib import Path from pathlib import Path
import tempfile import tempfile
@@ -11,6 +12,72 @@ from nominatim.tools.special_phrases import SpecialPhrasesImporter
TEST_BASE_DIR = Path(__file__) / '..' / '..' TEST_BASE_DIR = Path(__file__) / '..' / '..'
def test_fetch_existing_words_phrases_basic(special_phrases_importer, word_table,
                                            temp_db_conn):
    """
        Check for the fetch_existing_words_phrases() method.
        It should collect exactly the special phrase term which
        was added to the word table.
    """
    with temp_db_conn.cursor() as temp_db_cursor:
        query ="""
            INSERT INTO word VALUES(99999, 'lookup_token', 'normalized_word',
            'class', 'type', null, 0, 'near');
        """
        temp_db_cursor.execute(query)

    assert not special_phrases_importer.words_phrases_to_delete
    special_phrases_importer._fetch_existing_words_phrases()
    #Compare the whole set: set.pop() would return an arbitrary element
    #and would hide unexpected additional entries.
    assert special_phrases_importer.words_phrases_to_delete == {
        ('normalized_word', 'class', 'type', 'near')
    }
def test_fetch_existing_words_phrases_housenumber(special_phrases_importer, word_table,
                                                  temp_db_conn):
    """
        Check for the fetch_existing_words_phrases() method.
        A word table entry with class 'place' and type 'house' is a
        housenumber term, so nothing should be collected.
    """
    insert_housenumber_term = """
        INSERT INTO word VALUES(99999, 'lookup_token', 'normalized_word',
        'place', 'house', null, 0, 'near');
    """
    with temp_db_conn.cursor() as temp_db_cursor:
        temp_db_cursor.execute(insert_housenumber_term)

    special_phrases_importer._fetch_existing_words_phrases()
    assert not special_phrases_importer.words_phrases_to_delete
def test_fetch_existing_words_phrases_postcode(special_phrases_importer, word_table,
                                               temp_db_conn):
    """
        Check for the fetch_existing_words_phrases() method.
        A word table entry with class 'place' and type 'postcode' is a
        postcode term, so nothing should be collected.
    """
    insert_postcode_term = """
        INSERT INTO word VALUES(99999, 'lookup_token', 'normalized_word',
        'place', 'postcode', null, 0, 'near');
    """
    with temp_db_conn.cursor() as temp_db_cursor:
        temp_db_cursor.execute(insert_postcode_term)

    special_phrases_importer._fetch_existing_words_phrases()
    assert not special_phrases_importer.words_phrases_to_delete
def test_fetch_existing_place_classtype_tables(special_phrases_importer, temp_db_conn):
    """
        Check for the fetch_existing_place_classtype_tables() method.
        It should collect exactly the table just created.
    """
    with temp_db_conn.cursor() as temp_db_cursor:
        query = 'CREATE TABLE place_classtype_testclasstypetable()'
        temp_db_cursor.execute(query)

    special_phrases_importer._fetch_existing_place_classtype_tables()
    #Compare the whole set: set.pop() would return an arbitrary element
    #and would hide unexpected additional entries.
    assert special_phrases_importer.table_phrases_to_delete == {
        'place_classtype_testclasstypetable'
    }
def test_check_sanity_class(special_phrases_importer): def test_check_sanity_class(special_phrases_importer):
""" """
Check for _check_sanity() method. Check for _check_sanity() method.
@@ -80,7 +147,7 @@ def test_convert_settings_giving_json(special_phrases_importer):
assert returned == json_file assert returned == json_file
def test_process_amenity_with_operator(special_phrases_importer, getorcreate_amenityoperator_funcs, def test_process_amenity_with_operator(special_phrases_importer, getorcreate_amenityoperator_funcs,
word_table, temp_db_conn): temp_db_conn):
""" """
Test that _process_amenity() execute well the Test that _process_amenity() execute well the
getorcreate_amenityoperator() SQL function and that getorcreate_amenityoperator() SQL function and that
@@ -188,13 +255,72 @@ def test_process_xml_content(temp_db_conn, def_config, special_phrases_importer,
assert check_amenities_without_op(temp_db_conn) assert check_amenities_without_op(temp_db_conn)
assert results[class_test] and type_test in results.values() assert results[class_test] and type_test in results.values()
def test_remove_non_existent_phrases_from_db(special_phrases_importer, default_phrases,
                                             temp_db_conn):
    """
        Check for the remove_non_existent_phrases_from_db() method.
        Entries of the word table listed in words_phrases_to_delete
        should be removed, unless they are listed in
        words_phrases_still_exist as well.
        place_classtype tables listed in table_phrases_to_delete should
        be dropped.
    """
    phrase_gone_from_wiki = ('normalized_word', 'class', 'type', 'near')
    phrase_still_on_wiki = (
        'normalized_word_exists', 'class_exists', 'type_exists', 'near'
    )
    special_phrases_importer.words_phrases_to_delete = {
        phrase_gone_from_wiki,
        phrase_still_on_wiki
    }
    special_phrases_importer.words_phrases_still_exist = {phrase_still_on_wiki}
    special_phrases_importer.table_phrases_to_delete = {
        'place_classtype_testclasstypetable_to_delete'
    }

    query_words = 'SELECT word, class, type, operator FROM word;'
    query_tables = """
        SELECT table_name
        FROM information_schema.tables
        WHERE table_schema='public'
        AND table_name like 'place_classtype_%';
    """

    with temp_db_conn.cursor() as temp_db_cursor:
        special_phrases_importer._remove_non_existent_phrases_from_db()

        temp_db_cursor.execute(query_words)
        remaining_words = temp_db_cursor.fetchall()
        temp_db_cursor.execute(query_tables)
        remaining_tables = temp_db_cursor.fetchall()

        #Only the phrase still present on the wiki must be left.
        assert len(remaining_words) == 1
        assert remaining_words[0] == [
            'normalized_word_exists', 'class_exists', 'type_exists', 'near'
        ]
        #Only the table which was not scheduled for deletion must be left.
        assert len(remaining_tables) == 1
        assert remaining_tables[0][0] == 'place_classtype_testclasstypetable_to_keep'
def test_import_from_wiki(monkeypatch, temp_db_conn, def_config, special_phrases_importer, placex_table, def test_import_from_wiki(monkeypatch, temp_db_conn, def_config, special_phrases_importer, placex_table,
getorcreate_amenity_funcs, getorcreate_amenityoperator_funcs): getorcreate_amenity_funcs, getorcreate_amenityoperator_funcs, word_table):
""" """
Check that the main import_from_wiki() method is well executed. Check that the main import_from_wiki() method is well executed.
It should create the place_classtype table, the place_id and centroid indexes, It should create the place_classtype table, the place_id and centroid indexes,
grand access to the web user and executing the SQL functions for amenities. grand access to the web user and executing the SQL functions for amenities.
""" """
mock_fetch_existing_words_phrases = MockParamCapture()
mock_fetch_existing_place_classtype_tables = MockParamCapture()
mock_remove_non_existent_phrases_from_db = MockParamCapture()
monkeypatch.setattr('nominatim.tools.special_phrases.SpecialPhrasesImporter._fetch_existing_words_phrases',
mock_fetch_existing_words_phrases)
monkeypatch.setattr('nominatim.tools.special_phrases.SpecialPhrasesImporter._fetch_existing_place_classtype_tables',
mock_fetch_existing_place_classtype_tables)
monkeypatch.setattr('nominatim.tools.special_phrases.SpecialPhrasesImporter._remove_non_existent_phrases_from_db',
mock_remove_non_existent_phrases_from_db)
monkeypatch.setattr('nominatim.tools.special_phrases.SpecialPhrasesImporter._get_wiki_content', mock_get_wiki_content) monkeypatch.setattr('nominatim.tools.special_phrases.SpecialPhrasesImporter._get_wiki_content', mock_get_wiki_content)
special_phrases_importer.import_from_wiki(['en']) special_phrases_importer.import_from_wiki(['en'])
@@ -206,6 +332,9 @@ def test_import_from_wiki(monkeypatch, temp_db_conn, def_config, special_phrases
assert check_grant_access(temp_db_conn, def_config.DATABASE_WEBUSER, class_test, type_test) assert check_grant_access(temp_db_conn, def_config.DATABASE_WEBUSER, class_test, type_test)
assert check_amenities_with_op(temp_db_conn) assert check_amenities_with_op(temp_db_conn)
assert check_amenities_without_op(temp_db_conn) assert check_amenities_without_op(temp_db_conn)
assert mock_fetch_existing_words_phrases.called == 1
assert mock_fetch_existing_place_classtype_tables.called == 1
assert mock_remove_non_existent_phrases_from_db.called == 1
def mock_get_wiki_content(lang): def mock_get_wiki_content(lang):
""" """
@@ -305,6 +434,18 @@ def temp_phplib_dir_with_migration():
yield Path(phpdir) yield Path(phpdir)
@pytest.fixture
def default_phrases(word_table, temp_db_cursor):
    """
        Insert two special phrases into the word table and create two
        empty place_classtype tables ('..._to_delete' and '..._to_keep').
    """
    setup_statements = """
        INSERT INTO word VALUES(99999, 'lookup_token', 'normalized_word',
        'class', 'type', null, 0, 'near');

        INSERT INTO word VALUES(99999, 'lookup_token', 'normalized_word_exists',
        'class_exists', 'type_exists', null, 0, 'near');

        CREATE TABLE place_classtype_testclasstypetable_to_delete();
        CREATE TABLE place_classtype_testclasstypetable_to_keep();"""
    temp_db_cursor.execute(setup_statements)
@pytest.fixture @pytest.fixture
def make_strandard_name_func(temp_db_cursor): def make_strandard_name_func(temp_db_cursor):
temp_db_cursor.execute(""" temp_db_cursor.execute("""