Merge pull request #2731 from lonvia/cleanup-special-phrases

Minor code reorganisation around special phrase parsing
This commit is contained in:
Sarah Hoffmann
2022-05-31 17:13:56 +02:00
committed by GitHub
16 changed files with 145 additions and 157 deletions

View File

@@ -81,13 +81,16 @@ jobs:
ubuntu: ${{ matrix.ubuntu }} ubuntu: ${{ matrix.ubuntu }}
- name: Install test prerequsites - name: Install test prerequsites
run: sudo apt-get install -y -qq pylint python3-pytest python3-behave run: sudo apt-get install -y -qq python3-pytest python3-behave
if: matrix.ubuntu == 20 if: matrix.ubuntu == 20
- name: Install test prerequsites - name: Install test prerequsites
run: pip3 install pylint==2.6.0 pytest behave==1.2.6 run: pip3 install pytest behave==1.2.6
if: matrix.ubuntu == 18 if: matrix.ubuntu == 18
- name: Install latest pylint
run: pip3 install pylint
- name: PHP linting - name: PHP linting
run: phpcs --report-width=120 . run: phpcs --report-width=120 .
working-directory: Nominatim working-directory: Nominatim

View File

@@ -32,7 +32,7 @@ It has the following additional requirements:
* [behave test framework](https://behave.readthedocs.io) >= 1.2.6 * [behave test framework](https://behave.readthedocs.io) >= 1.2.6
* [phpunit](https://phpunit.de) (9.5 is known to work) * [phpunit](https://phpunit.de) (9.5 is known to work)
* [PHP CodeSniffer](https://github.com/squizlabs/PHP_CodeSniffer) * [PHP CodeSniffer](https://github.com/squizlabs/PHP_CodeSniffer)
* [Pylint](https://pylint.org/) (2.6.0 is used for the CI) * [Pylint](https://pylint.org/) (CI always runs the latest version from pip)
* [pytest](https://pytest.org) * [pytest](https://pytest.org)
The documentation is built with mkdocs: The documentation is built with mkdocs:

View File

@@ -128,7 +128,7 @@ class SetupAll:
drop=args.no_updates) drop=args.no_updates)
LOG.warning('Create search index for default country names.') LOG.warning('Create search index for default country names.')
country_info.create_country_names(conn, tokenizer, country_info.create_country_names(conn, tokenizer,
args.config.LANGUAGES) args.config.get_str_list('LANGUAGES'))
if args.no_updates: if args.no_updates:
freeze.drop_update_tables(conn) freeze.drop_update_tables(conn)
tokenizer.finalize_import(args.config) tokenizer.finalize_import(args.config)

View File

@@ -99,6 +99,17 @@ class Configuration:
raise UsageError("Configuration error.") from exp raise UsageError("Configuration error.") from exp
def get_str_list(self, name):
""" Return the given configuration parameter as a list of strings.
The values are assumed to be given as a comma-sparated list and
will be stripped before returning them. On empty values None
is returned.
"""
raw = self.__getattr__(name)
return [v.strip() for v in raw.split(',')] if raw else None
def get_path(self, name): def get_path(self, name):
""" Return the given configuration parameter as a Path. """ Return the given configuration parameter as a Path.
If a relative path is configured, then the function converts this If a relative path is configured, then the function converts this

View File

@@ -131,9 +131,6 @@ def create_country_names(conn, tokenizer, languages=None):
empty then only name translations for the given languages are added empty then only name translations for the given languages are added
to the index. to the index.
""" """
if languages:
languages = languages.split(',')
def _include_key(key): def _include_key(key):
return ':' not in key or not languages or \ return ':' not in key or not languages or \
key[key.index(':') + 1:] in languages key[key.index(':') + 1:] in languages

View File

@@ -11,43 +11,31 @@
""" """
import csv import csv
import os import os
from collections.abc import Iterator
from nominatim.tools.special_phrases.special_phrase import SpecialPhrase from nominatim.tools.special_phrases.special_phrase import SpecialPhrase
from nominatim.errors import UsageError from nominatim.errors import UsageError
class SPCsvLoader(Iterator): class SPCsvLoader:
""" """
Handles loading of special phrases from external csv file. Handles loading of special phrases from external csv file.
""" """
def __init__(self, csv_path): def __init__(self, csv_path):
super().__init__() super().__init__()
self.csv_path = csv_path self.csv_path = csv_path
self.has_been_read = False
def __next__(self):
if self.has_been_read:
raise StopIteration()
self.has_been_read = True def generate_phrases(self):
self.check_csv_validity() """ Open and parse the given csv file.
return self.parse_csv()
def parse_csv(self):
"""
Open and parse the given csv file.
Create the corresponding SpecialPhrases. Create the corresponding SpecialPhrases.
""" """
phrases = set() self._check_csv_validity()
with open(self.csv_path, encoding='utf-8') as fd: with open(self.csv_path, encoding='utf-8') as fd:
reader = csv.DictReader(fd, delimiter=',') reader = csv.DictReader(fd, delimiter=',')
for row in reader: for row in reader:
phrases.add( yield SpecialPhrase(row['phrase'], row['class'], row['type'], row['operator'])
SpecialPhrase(row['phrase'], row['class'], row['type'], row['operator'])
)
return phrases
def check_csv_validity(self):
def _check_csv_validity(self):
""" """
Check that the csv file has the right extension. Check that the csv file has the right extension.
""" """

View File

@@ -62,11 +62,10 @@ class SPImporter():
# Store pairs of class/type for further processing # Store pairs of class/type for further processing
class_type_pairs = set() class_type_pairs = set()
for loaded_phrases in self.sp_loader: for phrase in self.sp_loader.generate_phrases():
for phrase in loaded_phrases: result = self._process_phrase(phrase)
result = self._process_phrase(phrase) if result:
if result: class_type_pairs.add(result)
class_type_pairs.add(result)
self._create_place_classtype_table_and_indexes(class_type_pairs) self._create_place_classtype_table_and_indexes(class_type_pairs)
if should_replace: if should_replace:

View File

@@ -9,46 +9,56 @@
""" """
import re import re
import logging import logging
from collections.abc import Iterator
from nominatim.tools.special_phrases.special_phrase import SpecialPhrase from nominatim.tools.special_phrases.special_phrase import SpecialPhrase
from nominatim.tools.exec_utils import get_url from nominatim.tools.exec_utils import get_url
LOG = logging.getLogger() LOG = logging.getLogger()
class SPWikiLoader(Iterator):
def _get_wiki_content(lang):
"""
Request and return the wiki page's content
corresponding to special phrases for a given lang.
Requested URL Example :
https://wiki.openstreetmap.org/wiki/Special:Export/Nominatim/Special_Phrases/EN
"""
url = 'https://wiki.openstreetmap.org/wiki/Special:Export/Nominatim/Special_Phrases/' \
+ lang.upper()
return get_url(url)
class SPWikiLoader:
""" """
Handles loading of special phrases from the wiki. Handles loading of special phrases from the wiki.
""" """
def __init__(self, config, languages=None): def __init__(self, config):
super().__init__() super().__init__()
self.config = config self.config = config
# Compile the regex here to increase performances. # Compile the regex here to increase performances.
self.occurence_pattern = re.compile( self.occurence_pattern = re.compile(
r'\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([\-YN])' r'\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([\-YN])'
) )
self.languages = self._load_languages() if not languages else list(languages) # Hack around a bug where building=yes was imported with quotes into the wiki
self.type_fix_pattern = re.compile(r'\"|"')
self._load_languages()
def __next__(self):
if not self.languages:
raise StopIteration
lang = self.languages.pop(0) def generate_phrases(self):
loaded_xml = self._get_wiki_content(lang) """ Download the wiki pages for the configured languages
LOG.warning('Importing phrases for lang: %s...', lang) and extract the phrases from the page.
return self.parse_xml(loaded_xml)
def parse_xml(self, xml):
""" """
Parses XML content and extracts special phrases from it. for lang in self.languages:
Return a list of SpecialPhrase. LOG.warning('Importing phrases for lang: %s...', lang)
""" loaded_xml = _get_wiki_content(lang)
# One match will be of format [label, class, type, operator, plural]
matches = self.occurence_pattern.findall(xml) # One match will be of format [label, class, type, operator, plural]
returned_phrases = set() matches = self.occurence_pattern.findall(loaded_xml)
for match in matches:
returned_phrases.add( for match in matches:
SpecialPhrase(match[0], match[1], match[2], match[3]) yield SpecialPhrase(match[0],
) match[1],
return returned_phrases self.type_fix_pattern.sub('', match[2]),
match[3])
def _load_languages(self): def _load_languages(self):
""" """
@@ -56,21 +66,11 @@ class SPWikiLoader(Iterator):
or default if there is no languages configured. or default if there is no languages configured.
The system will extract special phrases only from all specified languages. The system will extract special phrases only from all specified languages.
""" """
default_languages = [ if self.config.LANGUAGES:
self.languages = self.config.get_str_list('LANGUAGES')
else:
self.languages = [
'af', 'ar', 'br', 'ca', 'cs', 'de', 'en', 'es', 'af', 'ar', 'br', 'ca', 'cs', 'de', 'en', 'es',
'et', 'eu', 'fa', 'fi', 'fr', 'gl', 'hr', 'hu', 'et', 'eu', 'fa', 'fi', 'fr', 'gl', 'hr', 'hu',
'ia', 'is', 'it', 'ja', 'mk', 'nl', 'no', 'pl', 'ia', 'is', 'it', 'ja', 'mk', 'nl', 'no', 'pl',
'ps', 'pt', 'ru', 'sk', 'sl', 'sv', 'uk', 'vi'] 'ps', 'pt', 'ru', 'sk', 'sl', 'sv', 'uk', 'vi']
return self.config.LANGUAGES.split(',') if self.config.LANGUAGES else default_languages
@staticmethod
def _get_wiki_content(lang):
"""
Request and return the wiki page's content
corresponding to special phrases for a given lang.
Requested URL Example :
https://wiki.openstreetmap.org/wiki/Special:Export/Nominatim/Special_Phrases/EN
"""
url = 'https://wiki.openstreetmap.org/wiki/Special:Export/Nominatim/Special_Phrases/' \
+ lang.upper()
return get_url(url)

View File

@@ -10,9 +10,7 @@
This class is a model used to transfer a special phrase through This class is a model used to transfer a special phrase through
the process of load and importation. the process of load and importation.
""" """
import re class SpecialPhrase:
class SpecialPhrase():
""" """
Model representing a special phrase. Model representing a special phrase.
""" """
@@ -20,7 +18,19 @@ class SpecialPhrase():
self.p_label = p_label.strip() self.p_label = p_label.strip()
self.p_class = p_class.strip() self.p_class = p_class.strip()
# Hack around a bug where building=yes was imported with quotes into the wiki # Hack around a bug where building=yes was imported with quotes into the wiki
self.p_type = re.sub(r'\"|"', '', p_type.strip()) self.p_type = p_type.strip()
# Needed if some operator in the wiki are not written in english # Needed if some operator in the wiki are not written in english
p_operator = p_operator.strip().lower() p_operator = p_operator.strip().lower()
self.p_operator = '-' if p_operator not in ('near', 'in') else p_operator self.p_operator = '-' if p_operator not in ('near', 'in') else p_operator
def __eq__(self, other):
if not isinstance(other, SpecialPhrase):
return False
return self.p_label == other.p_label \
and self.p_class == other.p_class \
and self.p_type == other.p_type \
and self.p_operator == other.p_operator
def __hash__(self):
return hash((self.p_label, self.p_class, self.p_type, self.p_operator))

View File

@@ -173,6 +173,23 @@ def test_get_int_empty(make_config):
config.get_int('DATABASE_MODULE_PATH') config.get_int('DATABASE_MODULE_PATH')
@pytest.mark.parametrize("value,outlist", [('sd', ['sd']),
('dd,rr', ['dd', 'rr']),
(' a , b ', ['a', 'b'])])
def test_get_str_list_success(make_config, monkeypatch, value, outlist):
config = make_config()
monkeypatch.setenv('NOMINATIM_MYLIST', value)
assert config.get_str_list('MYLIST') == outlist
def test_get_str_list_empty(make_config):
config = make_config()
assert config.get_str_list('LANGUAGES') is None
def test_get_path_empty(make_config): def test_get_path_empty(make_config):
config = make_config() config = make_config()

View File

@@ -49,7 +49,7 @@ def test_setup_country_tables(src_dir, temp_db_with_extensions, dsn, temp_db_cur
assert temp_db_cursor.table_rows('country_osm_grid') > 100 assert temp_db_cursor.table_rows('country_osm_grid') > 100
@pytest.mark.parametrize("languages", (None, ' fr,en')) @pytest.mark.parametrize("languages", (None, ['fr', 'en']))
def test_create_country_names(temp_db_with_extensions, temp_db_conn, temp_db_cursor, def test_create_country_names(temp_db_with_extensions, temp_db_conn, temp_db_cursor,
table_factory, tokenizer_mock, languages, loaded_country): table_factory, tokenizer_mock, languages, loaded_country):

View File

@@ -18,16 +18,12 @@ from nominatim.errors import UsageError
from cursor import CursorForTesting from cursor import CursorForTesting
@pytest.fixture @pytest.fixture
def testfile_dir(src_dir): def sp_importer(temp_db_conn, def_config, monkeypatch):
return src_dir / 'test' / 'testfiles'
@pytest.fixture
def sp_importer(temp_db_conn, def_config):
""" """
Return an instance of SPImporter. Return an instance of SPImporter.
""" """
loader = SPWikiLoader(def_config, ['en']) monkeypatch.setenv('NOMINATIM_LANGUAGES', 'en')
loader = SPWikiLoader(def_config)
return SPImporter(def_config, temp_db_conn, loader) return SPImporter(def_config, temp_db_conn, loader)
@@ -186,8 +182,8 @@ def test_import_phrases(monkeypatch, temp_db_conn, def_config, sp_importer,
table_factory('place_classtype_amenity_animal_shelter') table_factory('place_classtype_amenity_animal_shelter')
table_factory('place_classtype_wrongclass_wrongtype') table_factory('place_classtype_wrongclass_wrongtype')
monkeypatch.setattr('nominatim.tools.special_phrases.sp_wiki_loader.SPWikiLoader._get_wiki_content', monkeypatch.setattr('nominatim.tools.special_phrases.sp_wiki_loader._get_wiki_content',
lambda self, lang: xml_wiki_content) lambda lang: xml_wiki_content)
tokenizer = tokenizer_mock() tokenizer = tokenizer_mock()
sp_importer.import_phrases(tokenizer, should_replace) sp_importer.import_phrases(tokenizer, should_replace)

View File

@@ -11,50 +11,7 @@ import pytest
from nominatim.errors import UsageError from nominatim.errors import UsageError
from nominatim.tools.special_phrases.sp_csv_loader import SPCsvLoader from nominatim.tools.special_phrases.sp_csv_loader import SPCsvLoader
from nominatim.tools.special_phrases.special_phrase import SpecialPhrase
def test_parse_csv(sp_csv_loader):
"""
Test method parse_csv()
Should return the right SpecialPhrase objects.
"""
phrases = sp_csv_loader.parse_csv()
assert check_phrases_content(phrases)
def test_next(sp_csv_loader):
"""
Test objects returned from the next() method.
It should return all SpecialPhrases objects of
the sp_csv_test.csv special phrases.
"""
phrases = next(sp_csv_loader)
assert check_phrases_content(phrases)
def test_check_csv_validity(sp_csv_loader):
"""
Test method check_csv_validity()
It should raise an exception when file with a
different exception than .csv is given.
"""
sp_csv_loader.csv_path = 'test.csv'
sp_csv_loader.check_csv_validity()
sp_csv_loader.csv_path = 'test.wrong'
with pytest.raises(UsageError):
assert sp_csv_loader.check_csv_validity()
def check_phrases_content(phrases):
"""
Asserts that the given phrases list contains
the right phrases of the sp_csv_test.csv special phrases.
"""
return len(phrases) > 1 \
and any(p.p_label == 'Billboard'
and p.p_class == 'advertising'
and p.p_type == 'billboard'
and p.p_operator == '-' for p in phrases) \
and any(p.p_label == 'Zip Lines'
and p.p_class == 'aerialway'
and p.p_type == 'zip_line'
and p.p_operator == '-' for p in phrases)
@pytest.fixture @pytest.fixture
def sp_csv_loader(src_dir): def sp_csv_loader(src_dir):
@@ -64,3 +21,29 @@ def sp_csv_loader(src_dir):
csv_path = (src_dir / 'test' / 'testdata' / 'sp_csv_test.csv').resolve() csv_path = (src_dir / 'test' / 'testdata' / 'sp_csv_test.csv').resolve()
loader = SPCsvLoader(csv_path) loader = SPCsvLoader(csv_path)
return loader return loader
def test_generate_phrases(sp_csv_loader):
"""
Test method parse_csv()
Should return the right SpecialPhrase objects.
"""
phrases = list(sp_csv_loader.generate_phrases())
assert len(phrases) == 42
assert len(set(phrases)) == 41
assert SpecialPhrase('Billboard', 'advertising', 'billboard', '-') in phrases
assert SpecialPhrase('Zip Lines', 'aerialway', 'zip_line', '-') in phrases
def test_invalid_cvs_file():
"""
Test method check_csv_validity()
It should raise an exception when file with a
different exception than .csv is given.
"""
loader = SPCsvLoader('test.wrong')
with pytest.raises(UsageError, match='not a csv file'):
next(loader.generate_phrases())

View File

@@ -10,49 +10,32 @@
import pytest import pytest
from nominatim.tools.special_phrases.sp_wiki_loader import SPWikiLoader from nominatim.tools.special_phrases.sp_wiki_loader import SPWikiLoader
@pytest.fixture
def xml_wiki_content(src_dir):
"""
return the content of the static xml test file.
"""
xml_test_content = src_dir / 'test' / 'testdata' / 'special_phrases_test_content.txt'
return xml_test_content.read_text()
@pytest.fixture @pytest.fixture
def sp_wiki_loader(monkeypatch, def_config, xml_wiki_content): def sp_wiki_loader(src_dir, monkeypatch, def_config):
""" """
Return an instance of SPWikiLoader. Return an instance of SPWikiLoader.
""" """
loader = SPWikiLoader(def_config, ['en']) monkeypatch.setenv('NOMINATIM_LANGUAGES', 'en')
monkeypatch.setattr('nominatim.tools.special_phrases.sp_wiki_loader.SPWikiLoader._get_wiki_content', loader = SPWikiLoader(def_config)
lambda self, lang: xml_wiki_content)
def _mock_wiki_content(lang):
xml_test_content = src_dir / 'test' / 'testdata' / 'special_phrases_test_content.txt'
return xml_test_content.read_text()
monkeypatch.setattr('nominatim.tools.special_phrases.sp_wiki_loader._get_wiki_content',
_mock_wiki_content)
return loader return loader
def test_parse_xml(sp_wiki_loader, xml_wiki_content): def test_generate_phrases(sp_wiki_loader):
"""
Test method parse_xml()
Should return the right SpecialPhrase objects.
"""
phrases = sp_wiki_loader.parse_xml(xml_wiki_content)
check_phrases_content(phrases)
def test_next(sp_wiki_loader):
""" """
Test objects returned from the next() method. Test objects returned from the next() method.
It should return all SpecialPhrases objects of It should return all SpecialPhrases objects of
the 'en' special phrases. the 'en' special phrases.
""" """
phrases = next(sp_wiki_loader) phrases = list(sp_wiki_loader.generate_phrases())
check_phrases_content(phrases)
def check_phrases_content(phrases):
"""
Asserts that the given phrases list contains
the right phrases of the 'en' special phrases.
"""
assert set((p.p_label, p.p_class, p.p_type, p.p_operator) for p in phrases) ==\ assert set((p.p_label, p.p_class, p.p_type, p.p_operator) for p in phrases) ==\
{('Zip Line', 'aerialway', 'zip_line', '-'), {('Zip Line', 'aerialway', 'zip_line', '-'),
('Zip Lines', 'aerialway', 'zip_line', '-'), ('Zip Lines', 'aerialway', 'zip_line', '-'),

View File

@@ -18,6 +18,7 @@ Zipline near,aerialway,zip_line,near,N
Ziplines near,aerialway,zip_line,near,Y Ziplines near,aerialway,zip_line,near,Y
Zipwire,aerialway,zip_line,-,N Zipwire,aerialway,zip_line,-,N
Zipwires,aerialway,zip_line,-,Y Zipwires,aerialway,zip_line,-,Y
Zipwires,aerialway,zip_line,name,Y
Zipwire in,aerialway,zip_line,in,N Zipwire in,aerialway,zip_line,in,N
Zipwires in,aerialway,zip_line,in,Y Zipwires in,aerialway,zip_line,in,Y
Zipwire near,aerialway,zip_line,near,N Zipwire near,aerialway,zip_line,near,N
1 phrase class type operator plural
18 Ziplines near aerialway zip_line near Y
19 Zipwire aerialway zip_line - N
20 Zipwires aerialway zip_line - Y
21 Zipwires aerialway zip_line name Y
22 Zipwire in aerialway zip_line in N
23 Zipwires in aerialway zip_line in Y
24 Zipwire near aerialway zip_line near N

View File

@@ -70,7 +70,7 @@
<model>wikitext</model> <model>wikitext</model>
<format>text/x-wiki</format> <format>text/x-wiki</format>
<text bytes="158218" sha1="cst5x7tt58izti1pxzgljf27tx8qjcj" xml:space="preserve"> <text bytes="158218" sha1="cst5x7tt58izti1pxzgljf27tx8qjcj" xml:space="preserve">
== en == {| class="wikitable sortable" |- ! Word / Phrase !! Key !! Value !! Operator !! Plural |- | Zip Line || aerialway || zip_line || - || N |- | Zip Lines || aerialway || zip_line || - || Y |- | Zip Line in || aerialway || zip_line || in || N |- | Zip Lines in || aerialway || zip_line || in || Y |- | Zip Line near || aerialway || zip_line || near || N |- | Animal shelter || amenity || animal_shelter || - || N |- | Animal shelters || amenity || animal_shelter || - || Y |- | Animal shelter in || amenity || animal_shelter || in || N |- | Animal shelters in || amenity || animal_shelter || in || Y |- | Animal shelter near || amenity || animal_shelter || near|| N |- | Animal shelters near || amenity || animal_shelter || NEAR|| Y |- | Drinking Water near || amenity || drinking_water || near || N |- | Water || amenity || drinking_water || - || N |- | Water in || amenity || drinking_water || In || N |- | Water near || amenity || drinking_water || near || N |- | Embassy || amenity || embassy || - || N |- | Embassys || amenity || embassy || - || Y |- | Embassies || amenity || embassy || - || Y |- |Coworkings near |amenity |coworking_space |near |Y |} [[Category:Word list]] == en == {| class="wikitable sortable" |- ! Word / Phrase !! Key !! Value !! Operator !! Plural |- | Zip Line || aerialway || zip_line || - || N |- | Zip Lines || aerialway || zip_line || - || Y |- | Zip Line in || aerialway || zip_line || in || N |- | Zip Lines in || aerialway || zip_line || in || Y |- | Zip Line near || aerialway || zip_line || near || N |- | Animal shelter || amenity || animal_shelter || - || N |- | Animal shelters || amenity || animal_shelter || - || Y |- | Animal shelter in || amenity || animal_shelter || in || N |- | Animal shelters in || amenity || animal_shelter || in || Y |- | Animal shelter near || amenity || animal_shelter || near|| N |- | Animal shelters near || amenity || animal_shelter || NEAR|| Y |- | Drinking Water near || amenity || drinking_water || near || N |- | Water || amenity || drinking_water || - || N |- | Water in || amenity || drinking_water || In || N |- | Water near || amenity || drinking_water || near || N |- | Embassy || amenity || embassy || - || N |- | Embassys || amenity || "embassy" || - || Y |- | Embassies || amenity || embassy || - || Y |- |Coworkings near |amenity |coworking_space |near |Y |} [[Category:Word list]]
</text> </text>
<sha1>cst5x7tt58izti1pxzgljf27tx8qjcj</sha1> <sha1>cst5x7tt58izti1pxzgljf27tx8qjcj</sha1>
</revision> </revision>