mirror of
https://github.com/osm-search/Nominatim.git
synced 2026-03-12 05:44:06 +00:00
Merge pull request #2263 from AntoJvlt/special-phrases-autoupdate
Implemented auto update of special phrases while importing them
This commit is contained in:
@@ -32,6 +32,17 @@ class SpecialPhrasesImporter():
|
|||||||
self.sanity_check_pattern = re.compile(r'^\w+$')
|
self.sanity_check_pattern = re.compile(r'^\w+$')
|
||||||
self.transliterator = Transliterator.createFromRules("special-phrases normalizer",
|
self.transliterator = Transliterator.createFromRules("special-phrases normalizer",
|
||||||
self.config.TERM_NORMALIZATION)
|
self.config.TERM_NORMALIZATION)
|
||||||
|
#This set will contain all existing phrases from the word table which
|
||||||
|
#no longer exist on the wiki.
|
||||||
|
#It contains tuples with the following format: (normalized_word, class, type, operator)
|
||||||
|
self.words_phrases_to_delete = set()
|
||||||
|
#This set will contain the phrases which still exist from the wiki.
|
||||||
|
#It is used to prevent duplicates on the wiki by removing them from
|
||||||
|
#the words_phrases_to_delete set only at the end.
|
||||||
|
self.words_phrases_still_exist = set()
|
||||||
|
#This set will contain all existing place_classtype tables which don't match any
|
||||||
|
#special phrases class/type on the wiki.
|
||||||
|
self.table_phrases_to_delete = set()
|
||||||
|
|
||||||
def import_from_wiki(self, languages=None):
|
def import_from_wiki(self, languages=None):
|
||||||
"""
|
"""
|
||||||
@@ -41,6 +52,9 @@ class SpecialPhrasesImporter():
|
|||||||
if languages is not None and not isinstance(languages, list):
|
if languages is not None and not isinstance(languages, list):
|
||||||
raise TypeError('The \'languages\' argument should be of type list.')
|
raise TypeError('The \'languages\' argument should be of type list.')
|
||||||
|
|
||||||
|
self._fetch_existing_words_phrases()
|
||||||
|
self._fetch_existing_place_classtype_tables()
|
||||||
|
|
||||||
#Get all languages to process.
|
#Get all languages to process.
|
||||||
languages = self._load_languages() if not languages else languages
|
languages = self._load_languages() if not languages else languages
|
||||||
|
|
||||||
@@ -53,9 +67,46 @@ class SpecialPhrasesImporter():
|
|||||||
class_type_pairs.update(self._process_xml_content(wiki_page_xml_content, lang))
|
class_type_pairs.update(self._process_xml_content(wiki_page_xml_content, lang))
|
||||||
|
|
||||||
self._create_place_classtype_table_and_indexes(class_type_pairs)
|
self._create_place_classtype_table_and_indexes(class_type_pairs)
|
||||||
|
self._remove_non_existent_phrases_from_db()
|
||||||
self.db_connection.commit()
|
self.db_connection.commit()
|
||||||
LOG.warning('Import done.')
|
LOG.warning('Import done.')
|
||||||
|
|
||||||
|
def _fetch_existing_words_phrases(self):
    """
        Fetch existing special phrases from the word table.
        Fill the words_phrases_to_delete set of the class.

        Every entry added is a tuple (word, class, type, operator).
        A NULL operator is stored as '-'.
    """
    #Only extract special phrases terms:
    #If class=place and type=house then it is a housenumber term.
    #If class=place and type=postcode then it is a postcode term.
    word_query = """
        SELECT word, class, type, operator FROM word
        WHERE class != 'place' OR (type != 'house' AND type != 'postcode')
    """
    with self.db_connection.cursor() as db_cursor:
        db_cursor.execute(SQL(word_query))
        for row in db_cursor:
            #Do not assign into the row itself: item assignment only works
            #with list-like row types and raises TypeError on plain tuples.
            operator = '-' if row[3] is None else row[3]
            self.words_phrases_to_delete.add(
                (row[0], row[1], row[2], operator)
            )
|
||||||
|
|
||||||
|
def _fetch_existing_place_classtype_tables(self):
    """
        Look up the place_classtype tables of the public schema and
        record their names in the table_phrases_to_delete set of the class.
    """
    tables_query = """
        SELECT table_name
        FROM information_schema.tables
        WHERE table_schema='public'
        AND table_name like 'place_classtype_%';
    """
    with self.db_connection.cursor() as cursor:
        cursor.execute(SQL(tables_query))
        #The first column of each result row is the table name.
        self.table_phrases_to_delete.update(found[0] for found in cursor)
|
||||||
|
|
||||||
def _load_white_and_black_lists(self):
|
def _load_white_and_black_lists(self):
|
||||||
"""
|
"""
|
||||||
Load white and black lists from phrases-settings.json.
|
Load white and black lists from phrases-settings.json.
|
||||||
@@ -122,12 +173,11 @@ class SpecialPhrasesImporter():
|
|||||||
phrase_class = match[1].strip()
|
phrase_class = match[1].strip()
|
||||||
phrase_type = match[2].strip()
|
phrase_type = match[2].strip()
|
||||||
phrase_operator = match[3].strip()
|
phrase_operator = match[3].strip()
|
||||||
|
#Needed if some operators in the wiki are not written in English
|
||||||
|
phrase_operator = '-' if phrase_operator not in ('near', 'in') else phrase_operator
|
||||||
#hack around a bug where building=yes was imported with quotes into the wiki
|
#hack around a bug where building=yes was imported with quotes into the wiki
|
||||||
phrase_type = re.sub(r'\"|"', '', phrase_type)
|
phrase_type = re.sub(r'\"|"', '', phrase_type)
|
||||||
|
|
||||||
#sanity check, in case somebody added garbage in the wiki
|
|
||||||
self._check_sanity(lang, phrase_class, phrase_type)
|
|
||||||
|
|
||||||
#blacklisting: disallow certain class/type combinations
|
#blacklisting: disallow certain class/type combinations
|
||||||
if (
|
if (
|
||||||
phrase_class in self.black_list.keys() and
|
phrase_class in self.black_list.keys() and
|
||||||
@@ -141,7 +191,22 @@ class SpecialPhrasesImporter():
|
|||||||
):
|
):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
#add class/type to the pairs dict
|
#Check if the phrase already exists in the database.
|
||||||
|
if (
|
||||||
|
(normalized_label, phrase_class, phrase_type, phrase_operator)
|
||||||
|
in self.words_phrases_to_delete
|
||||||
|
):
|
||||||
|
#Remove this phrase from the ones to delete as it still exist on the wiki.
|
||||||
|
self.words_phrases_still_exist.add(
|
||||||
|
(normalized_label, phrase_class, phrase_type, phrase_operator)
|
||||||
|
)
|
||||||
|
class_type_pairs.add((phrase_class, phrase_type))
|
||||||
|
#Dont need to add this phrase as it already exists in the word table.
|
||||||
|
continue
|
||||||
|
|
||||||
|
#sanity check, in case somebody added garbage in the wiki
|
||||||
|
self._check_sanity(lang, phrase_class, phrase_type)
|
||||||
|
|
||||||
class_type_pairs.add((phrase_class, phrase_type))
|
class_type_pairs.add((phrase_class, phrase_type))
|
||||||
|
|
||||||
self._process_amenity(
|
self._process_amenity(
|
||||||
@@ -191,6 +256,15 @@ class SpecialPhrasesImporter():
|
|||||||
phrase_class = pair[0]
|
phrase_class = pair[0]
|
||||||
phrase_type = pair[1]
|
phrase_type = pair[1]
|
||||||
|
|
||||||
|
table_name = 'place_classtype_{}_{}'.format(phrase_class, phrase_type)
|
||||||
|
|
||||||
|
if table_name in self.table_phrases_to_delete:
|
||||||
|
#Remove this table from the ones to delete as it match a class/type
|
||||||
|
#still existing on the special phrases of the wiki.
|
||||||
|
self.table_phrases_to_delete.remove(table_name)
|
||||||
|
#So dont need to create the table and indexes.
|
||||||
|
continue
|
||||||
|
|
||||||
#Table creation
|
#Table creation
|
||||||
self._create_place_classtype_table(sql_tablespace, phrase_class, phrase_type)
|
self._create_place_classtype_table(sql_tablespace, phrase_class, phrase_type)
|
||||||
|
|
||||||
@@ -251,6 +325,41 @@ class SpecialPhrasesImporter():
|
|||||||
.format(Identifier(table_name),
|
.format(Identifier(table_name),
|
||||||
Identifier(self.config.DATABASE_WEBUSER)))
|
Identifier(self.config.DATABASE_WEBUSER)))
|
||||||
|
|
||||||
|
def _remove_non_existent_phrases_from_db(self):
    """
        Clean up what is no longer present on the wiki.
        Rows of the word table listed in words_phrases_to_delete (without
        the entries of words_phrases_still_exist) are deleted and the
        tables listed in table_phrases_to_delete are dropped.
    """
    LOG.warning('Cleaning database...')
    still_on_wiki = self.words_phrases_still_exist
    self.words_phrases_to_delete = self.words_phrases_to_delete - still_on_wiki

    #Statements to run, kept as (query, parameters) tuples.
    pending = []

    #Word table entries whose phrase is gone from the wiki.
    for word, phrase_class, phrase_type, operator in self.words_phrases_to_delete:
        if operator != '-':
            statement = """
                DELETE FROM word WHERE word = %s AND class = %s AND type = %s AND operator = %s
            """
            arguments = (word, phrase_class, phrase_type, operator)
        else:
            #'-' stands for a NULL operator, which needs IS null instead of '='.
            statement = """
                DELETE FROM word WHERE word = %s AND class = %s AND type = %s AND operator IS null
            """
            arguments = (word, phrase_class, phrase_type)
        pending.append((statement, arguments))

    #place_classtype tables whose class/type is gone from the wiki.
    pending.extend(
        (SQL('DROP TABLE IF EXISTS {}').format(Identifier(table_name)), ())
        for table_name in self.table_phrases_to_delete
    )

    with self.db_connection.cursor() as cursor:
        for statement, arguments in pending:
            cursor.execute(statement, arguments)
|
||||||
|
|
||||||
def _convert_php_settings_if_needed(self, file_path):
|
def _convert_php_settings_if_needed(self, file_path):
|
||||||
"""
|
"""
|
||||||
Convert php settings file of special phrases to json file if it is still in php format.
|
Convert php settings file of special phrases to json file if it is still in php format.
|
||||||
|
|||||||
@@ -2,6 +2,7 @@
|
|||||||
Tests for import special phrases methods
|
Tests for import special phrases methods
|
||||||
of the class SpecialPhrasesImporter.
|
of the class SpecialPhrasesImporter.
|
||||||
"""
|
"""
|
||||||
|
from mocks import MockParamCapture
|
||||||
from nominatim.errors import UsageError
|
from nominatim.errors import UsageError
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import tempfile
|
import tempfile
|
||||||
@@ -11,6 +12,72 @@ from nominatim.tools.special_phrases import SpecialPhrasesImporter
|
|||||||
|
|
||||||
TEST_BASE_DIR = Path(__file__) / '..' / '..'
|
TEST_BASE_DIR = Path(__file__) / '..' / '..'
|
||||||
|
|
||||||
|
def test_fetch_existing_words_phrases_basic(special_phrases_importer, word_table,
                                            temp_db_conn):
    """
        Check the _fetch_existing_words_phrases() method.
        A special phrase term inserted into the word table must end up
        in the words_phrases_to_delete set.
    """
    insert_phrase = """
        INSERT INTO word VALUES(99999, 'lookup_token', 'normalized_word',
        'class', 'type', null, 0, 'near');
    """
    with temp_db_conn.cursor() as cursor:
        cursor.execute(insert_phrase)

    #The set must start empty so that the popped entry is the inserted one.
    assert not special_phrases_importer.words_phrases_to_delete
    special_phrases_importer._fetch_existing_words_phrases()
    fetched_phrase = special_phrases_importer.words_phrases_to_delete.pop()
    assert fetched_phrase == ('normalized_word', 'class', 'type', 'near')
|
||||||
|
|
||||||
|
def test_fetch_existing_words_phrases_housenumber(special_phrases_importer, word_table,
                                                  temp_db_conn):
    """
        Check the _fetch_existing_words_phrases() method.
        A housenumber term (class 'place', type 'house') must not be
        collected, so the set stays empty.
    """
    insert_housenumber = """
        INSERT INTO word VALUES(99999, 'lookup_token', 'normalized_word',
        'place', 'house', null, 0, 'near');
    """
    with temp_db_conn.cursor() as cursor:
        cursor.execute(insert_housenumber)

    special_phrases_importer._fetch_existing_words_phrases()
    assert not special_phrases_importer.words_phrases_to_delete
|
||||||
|
|
||||||
|
def test_fetch_existing_words_phrases_postcode(special_phrases_importer, word_table,
                                               temp_db_conn):
    """
        Check the _fetch_existing_words_phrases() method.
        A postcode term (class 'place', type 'postcode') must not be
        collected, so the set stays empty.
    """
    insert_postcode = """
        INSERT INTO word VALUES(99999, 'lookup_token', 'normalized_word',
        'place', 'postcode', null, 0, 'near');
    """
    with temp_db_conn.cursor() as cursor:
        cursor.execute(insert_postcode)

    special_phrases_importer._fetch_existing_words_phrases()
    assert not special_phrases_importer.words_phrases_to_delete
|
||||||
|
|
||||||
|
def test_fetch_existing_place_classtype_tables(special_phrases_importer, temp_db_conn):
    """
        Check the _fetch_existing_place_classtype_tables() method.
        The table created here must be found in table_phrases_to_delete.
    """
    create_table = 'CREATE TABLE place_classtype_testclasstypetable()'
    with temp_db_conn.cursor() as cursor:
        cursor.execute(create_table)

    special_phrases_importer._fetch_existing_place_classtype_tables()
    found_table = special_phrases_importer.table_phrases_to_delete.pop()
    assert found_table == 'place_classtype_testclasstypetable'
|
||||||
|
|
||||||
def test_check_sanity_class(special_phrases_importer):
|
def test_check_sanity_class(special_phrases_importer):
|
||||||
"""
|
"""
|
||||||
Check for _check_sanity() method.
|
Check for _check_sanity() method.
|
||||||
@@ -80,7 +147,7 @@ def test_convert_settings_giving_json(special_phrases_importer):
|
|||||||
assert returned == json_file
|
assert returned == json_file
|
||||||
|
|
||||||
def test_process_amenity_with_operator(special_phrases_importer, getorcreate_amenityoperator_funcs,
|
def test_process_amenity_with_operator(special_phrases_importer, getorcreate_amenityoperator_funcs,
|
||||||
word_table, temp_db_conn):
|
temp_db_conn):
|
||||||
"""
|
"""
|
||||||
Test that _process_amenity() execute well the
|
Test that _process_amenity() execute well the
|
||||||
getorcreate_amenityoperator() SQL function and that
|
getorcreate_amenityoperator() SQL function and that
|
||||||
@@ -188,13 +255,72 @@ def test_process_xml_content(temp_db_conn, def_config, special_phrases_importer,
|
|||||||
assert check_amenities_without_op(temp_db_conn)
|
assert check_amenities_without_op(temp_db_conn)
|
||||||
assert results[class_test] and type_test in results.values()
|
assert results[class_test] and type_test in results.values()
|
||||||
|
|
||||||
|
def test_remove_non_existent_phrases_from_db(special_phrases_importer, default_phrases,
                                             temp_db_conn):
    """
        Check the _remove_non_existent_phrases_from_db() method.

        Entries of the word table listed in words_phrases_to_delete must be
        removed, except those also listed in words_phrases_still_exist.

        place_classtype tables listed in table_phrases_to_delete must be
        dropped.
    """
    with temp_db_conn.cursor() as temp_db_cursor:
        obsolete_phrase = ('normalized_word', 'class', 'type', 'near')
        surviving_phrase = (
            'normalized_word_exists', 'class_exists', 'type_exists', 'near'
        )
        #The surviving phrase is in both sets, so it must stay in the database.
        special_phrases_importer.words_phrases_to_delete = {
            obsolete_phrase,
            surviving_phrase
        }
        special_phrases_importer.words_phrases_still_exist = {
            surviving_phrase
        }
        special_phrases_importer.table_phrases_to_delete = {
            'place_classtype_testclasstypetable_to_delete'
        }

        words_sql = 'SELECT word, class, type, operator FROM word;'
        tables_sql = """
            SELECT table_name
            FROM information_schema.tables
            WHERE table_schema='public'
            AND table_name like 'place_classtype_%';
        """

        special_phrases_importer._remove_non_existent_phrases_from_db()

        temp_db_cursor.execute(words_sql)
        remaining_words = temp_db_cursor.fetchall()
        temp_db_cursor.execute(tables_sql)
        remaining_tables = temp_db_cursor.fetchall()

        assert len(remaining_words) == 1
        assert remaining_words[0] == [
            'normalized_word_exists', 'class_exists', 'type_exists', 'near'
        ]
        assert len(remaining_tables) == 1
        assert remaining_tables[0][0] == 'place_classtype_testclasstypetable_to_keep'
|
||||||
|
|
||||||
def test_import_from_wiki(monkeypatch, temp_db_conn, def_config, special_phrases_importer, placex_table,
|
def test_import_from_wiki(monkeypatch, temp_db_conn, def_config, special_phrases_importer, placex_table,
|
||||||
getorcreate_amenity_funcs, getorcreate_amenityoperator_funcs):
|
getorcreate_amenity_funcs, getorcreate_amenityoperator_funcs, word_table):
|
||||||
"""
|
"""
|
||||||
Check that the main import_from_wiki() method is well executed.
|
Check that the main import_from_wiki() method is well executed.
|
||||||
It should create the place_classtype table, the place_id and centroid indexes,
|
It should create the place_classtype table, the place_id and centroid indexes,
|
||||||
grand access to the web user and executing the SQL functions for amenities.
|
grand access to the web user and executing the SQL functions for amenities.
|
||||||
"""
|
"""
|
||||||
|
mock_fetch_existing_words_phrases = MockParamCapture()
|
||||||
|
mock_fetch_existing_place_classtype_tables = MockParamCapture()
|
||||||
|
mock_remove_non_existent_phrases_from_db = MockParamCapture()
|
||||||
|
|
||||||
|
monkeypatch.setattr('nominatim.tools.special_phrases.SpecialPhrasesImporter._fetch_existing_words_phrases',
|
||||||
|
mock_fetch_existing_words_phrases)
|
||||||
|
monkeypatch.setattr('nominatim.tools.special_phrases.SpecialPhrasesImporter._fetch_existing_place_classtype_tables',
|
||||||
|
mock_fetch_existing_place_classtype_tables)
|
||||||
|
monkeypatch.setattr('nominatim.tools.special_phrases.SpecialPhrasesImporter._remove_non_existent_phrases_from_db',
|
||||||
|
mock_remove_non_existent_phrases_from_db)
|
||||||
monkeypatch.setattr('nominatim.tools.special_phrases.SpecialPhrasesImporter._get_wiki_content', mock_get_wiki_content)
|
monkeypatch.setattr('nominatim.tools.special_phrases.SpecialPhrasesImporter._get_wiki_content', mock_get_wiki_content)
|
||||||
special_phrases_importer.import_from_wiki(['en'])
|
special_phrases_importer.import_from_wiki(['en'])
|
||||||
|
|
||||||
@@ -206,6 +332,9 @@ def test_import_from_wiki(monkeypatch, temp_db_conn, def_config, special_phrases
|
|||||||
assert check_grant_access(temp_db_conn, def_config.DATABASE_WEBUSER, class_test, type_test)
|
assert check_grant_access(temp_db_conn, def_config.DATABASE_WEBUSER, class_test, type_test)
|
||||||
assert check_amenities_with_op(temp_db_conn)
|
assert check_amenities_with_op(temp_db_conn)
|
||||||
assert check_amenities_without_op(temp_db_conn)
|
assert check_amenities_without_op(temp_db_conn)
|
||||||
|
assert mock_fetch_existing_words_phrases.called == 1
|
||||||
|
assert mock_fetch_existing_place_classtype_tables.called == 1
|
||||||
|
assert mock_remove_non_existent_phrases_from_db.called == 1
|
||||||
|
|
||||||
def mock_get_wiki_content(lang):
|
def mock_get_wiki_content(lang):
|
||||||
"""
|
"""
|
||||||
@@ -305,6 +434,18 @@ def temp_phplib_dir_with_migration():
|
|||||||
|
|
||||||
yield Path(phpdir)
|
yield Path(phpdir)
|
||||||
|
|
||||||
|
@pytest.fixture
def default_phrases(word_table, temp_db_cursor):
    """
        Insert two special phrases into the word table and create two
        place_classtype tables, one named *_to_delete and one *_to_keep.
    """
    setup_sql = """
        INSERT INTO word VALUES(99999, 'lookup_token', 'normalized_word',
        'class', 'type', null, 0, 'near');

        INSERT INTO word VALUES(99999, 'lookup_token', 'normalized_word_exists',
        'class_exists', 'type_exists', null, 0, 'near');

        CREATE TABLE place_classtype_testclasstypetable_to_delete();
        CREATE TABLE place_classtype_testclasstypetable_to_keep();"""
    temp_db_cursor.execute(setup_sql)
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def make_strandard_name_func(temp_db_cursor):
|
def make_strandard_name_func(temp_db_cursor):
|
||||||
temp_db_cursor.execute("""
|
temp_db_cursor.execute("""
|
||||||
|
|||||||
Reference in New Issue
Block a user