Merge pull request #2731 from lonvia/cleanup-special-phrases

Minor code reorganisation around special phrase parsing
This commit is contained in:
Sarah Hoffmann
2022-05-31 17:13:56 +02:00
committed by GitHub
16 changed files with 145 additions and 157 deletions

View File

@@ -81,13 +81,16 @@ jobs:
ubuntu: ${{ matrix.ubuntu }}
- name: Install test prerequsites
run: sudo apt-get install -y -qq pylint python3-pytest python3-behave
run: sudo apt-get install -y -qq python3-pytest python3-behave
if: matrix.ubuntu == 20
- name: Install test prerequsites
run: pip3 install pylint==2.6.0 pytest behave==1.2.6
run: pip3 install pytest behave==1.2.6
if: matrix.ubuntu == 18
- name: Install latest pylint
run: pip3 install pylint
- name: PHP linting
run: phpcs --report-width=120 .
working-directory: Nominatim

View File

@@ -32,7 +32,7 @@ It has the following additional requirements:
* [behave test framework](https://behave.readthedocs.io) >= 1.2.6
* [phpunit](https://phpunit.de) (9.5 is known to work)
* [PHP CodeSniffer](https://github.com/squizlabs/PHP_CodeSniffer)
* [Pylint](https://pylint.org/) (2.6.0 is used for the CI)
* [Pylint](https://pylint.org/) (CI always runs the latest version from pip)
* [pytest](https://pytest.org)
The documentation is built with mkdocs:

View File

@@ -128,7 +128,7 @@ class SetupAll:
drop=args.no_updates)
LOG.warning('Create search index for default country names.')
country_info.create_country_names(conn, tokenizer,
args.config.LANGUAGES)
args.config.get_str_list('LANGUAGES'))
if args.no_updates:
freeze.drop_update_tables(conn)
tokenizer.finalize_import(args.config)

View File

@@ -99,6 +99,17 @@ class Configuration:
raise UsageError("Configuration error.") from exp
def get_str_list(self, name):
""" Return the given configuration parameter as a list of strings.
The values are assumed to be given as a comma-sparated list and
will be stripped before returning them. On empty values None
is returned.
"""
raw = self.__getattr__(name)
return [v.strip() for v in raw.split(',')] if raw else None
def get_path(self, name):
""" Return the given configuration parameter as a Path.
If a relative path is configured, then the function converts this

View File

@@ -131,9 +131,6 @@ def create_country_names(conn, tokenizer, languages=None):
empty then only name translations for the given languages are added
to the index.
"""
if languages:
languages = languages.split(',')
def _include_key(key):
return ':' not in key or not languages or \
key[key.index(':') + 1:] in languages

View File

@@ -11,43 +11,31 @@
"""
import csv
import os
from collections.abc import Iterator
from nominatim.tools.special_phrases.special_phrase import SpecialPhrase
from nominatim.errors import UsageError
class SPCsvLoader(Iterator):
class SPCsvLoader:
"""
Handles loading of special phrases from external csv file.
"""
def __init__(self, csv_path):
super().__init__()
self.csv_path = csv_path
self.has_been_read = False
def __next__(self):
if self.has_been_read:
raise StopIteration()
self.has_been_read = True
self.check_csv_validity()
return self.parse_csv()
def parse_csv(self):
"""
Open and parse the given csv file.
def generate_phrases(self):
""" Open and parse the given csv file.
Create the corresponding SpecialPhrases.
"""
phrases = set()
self._check_csv_validity()
with open(self.csv_path, encoding='utf-8') as fd:
reader = csv.DictReader(fd, delimiter=',')
for row in reader:
phrases.add(
SpecialPhrase(row['phrase'], row['class'], row['type'], row['operator'])
)
return phrases
yield SpecialPhrase(row['phrase'], row['class'], row['type'], row['operator'])
def check_csv_validity(self):
def _check_csv_validity(self):
"""
Check that the csv file has the right extension.
"""

View File

@@ -62,8 +62,7 @@ class SPImporter():
# Store pairs of class/type for further processing
class_type_pairs = set()
for loaded_phrases in self.sp_loader:
for phrase in loaded_phrases:
for phrase in self.sp_loader.generate_phrases():
result = self._process_phrase(phrase)
if result:
class_type_pairs.add(result)

View File

@@ -9,61 +9,11 @@
"""
import re
import logging
from collections.abc import Iterator
from nominatim.tools.special_phrases.special_phrase import SpecialPhrase
from nominatim.tools.exec_utils import get_url
LOG = logging.getLogger()
class SPWikiLoader(Iterator):
"""
Handles loading of special phrases from the wiki.
"""
def __init__(self, config, languages=None):
super().__init__()
self.config = config
# Compile the regex here to increase performances.
self.occurence_pattern = re.compile(
r'\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([\-YN])'
)
self.languages = self._load_languages() if not languages else list(languages)
def __next__(self):
if not self.languages:
raise StopIteration
lang = self.languages.pop(0)
loaded_xml = self._get_wiki_content(lang)
LOG.warning('Importing phrases for lang: %s...', lang)
return self.parse_xml(loaded_xml)
def parse_xml(self, xml):
"""
Parses XML content and extracts special phrases from it.
Return a list of SpecialPhrase.
"""
# One match will be of format [label, class, type, operator, plural]
matches = self.occurence_pattern.findall(xml)
returned_phrases = set()
for match in matches:
returned_phrases.add(
SpecialPhrase(match[0], match[1], match[2], match[3])
)
return returned_phrases
def _load_languages(self):
"""
Get list of all languages from env config file
or default if there is no languages configured.
The system will extract special phrases only from all specified languages.
"""
default_languages = [
'af', 'ar', 'br', 'ca', 'cs', 'de', 'en', 'es',
'et', 'eu', 'fa', 'fi', 'fr', 'gl', 'hr', 'hu',
'ia', 'is', 'it', 'ja', 'mk', 'nl', 'no', 'pl',
'ps', 'pt', 'ru', 'sk', 'sl', 'sv', 'uk', 'vi']
return self.config.LANGUAGES.split(',') if self.config.LANGUAGES else default_languages
@staticmethod
def _get_wiki_content(lang):
"""
Request and return the wiki page's content
@@ -74,3 +24,53 @@ class SPWikiLoader(Iterator):
url = 'https://wiki.openstreetmap.org/wiki/Special:Export/Nominatim/Special_Phrases/' \
+ lang.upper()
return get_url(url)
class SPWikiLoader:
"""
Handles loading of special phrases from the wiki.
"""
def __init__(self, config):
super().__init__()
self.config = config
# Compile the regex here to increase performances.
self.occurence_pattern = re.compile(
r'\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([\-YN])'
)
# Hack around a bug where building=yes was imported with quotes into the wiki
self.type_fix_pattern = re.compile(r'\"|"')
self._load_languages()
def generate_phrases(self):
""" Download the wiki pages for the configured languages
and extract the phrases from the page.
"""
for lang in self.languages:
LOG.warning('Importing phrases for lang: %s...', lang)
loaded_xml = _get_wiki_content(lang)
# One match will be of format [label, class, type, operator, plural]
matches = self.occurence_pattern.findall(loaded_xml)
for match in matches:
yield SpecialPhrase(match[0],
match[1],
self.type_fix_pattern.sub('', match[2]),
match[3])
def _load_languages(self):
"""
Get list of all languages from env config file
or default if there is no languages configured.
The system will extract special phrases only from all specified languages.
"""
if self.config.LANGUAGES:
self.languages = self.config.get_str_list('LANGUAGES')
else:
self.languages = [
'af', 'ar', 'br', 'ca', 'cs', 'de', 'en', 'es',
'et', 'eu', 'fa', 'fi', 'fr', 'gl', 'hr', 'hu',
'ia', 'is', 'it', 'ja', 'mk', 'nl', 'no', 'pl',
'ps', 'pt', 'ru', 'sk', 'sl', 'sv', 'uk', 'vi']

View File

@@ -10,9 +10,7 @@
This class is a model used to transfer a special phrase through
the process of load and importation.
"""
import re
class SpecialPhrase():
class SpecialPhrase:
"""
Model representing a special phrase.
"""
@@ -20,7 +18,19 @@ class SpecialPhrase():
self.p_label = p_label.strip()
self.p_class = p_class.strip()
# Hack around a bug where building=yes was imported with quotes into the wiki
self.p_type = re.sub(r'\"|"', '', p_type.strip())
self.p_type = p_type.strip()
# Needed if some operator in the wiki are not written in english
p_operator = p_operator.strip().lower()
self.p_operator = '-' if p_operator not in ('near', 'in') else p_operator
def __eq__(self, other):
if not isinstance(other, SpecialPhrase):
return False
return self.p_label == other.p_label \
and self.p_class == other.p_class \
and self.p_type == other.p_type \
and self.p_operator == other.p_operator
def __hash__(self):
return hash((self.p_label, self.p_class, self.p_type, self.p_operator))

View File

@@ -173,6 +173,23 @@ def test_get_int_empty(make_config):
config.get_int('DATABASE_MODULE_PATH')
@pytest.mark.parametrize("value,outlist", [('sd', ['sd']),
('dd,rr', ['dd', 'rr']),
(' a , b ', ['a', 'b'])])
def test_get_str_list_success(make_config, monkeypatch, value, outlist):
config = make_config()
monkeypatch.setenv('NOMINATIM_MYLIST', value)
assert config.get_str_list('MYLIST') == outlist
def test_get_str_list_empty(make_config):
config = make_config()
assert config.get_str_list('LANGUAGES') is None
def test_get_path_empty(make_config):
config = make_config()

View File

@@ -49,7 +49,7 @@ def test_setup_country_tables(src_dir, temp_db_with_extensions, dsn, temp_db_cur
assert temp_db_cursor.table_rows('country_osm_grid') > 100
@pytest.mark.parametrize("languages", (None, ' fr,en'))
@pytest.mark.parametrize("languages", (None, ['fr', 'en']))
def test_create_country_names(temp_db_with_extensions, temp_db_conn, temp_db_cursor,
table_factory, tokenizer_mock, languages, loaded_country):

View File

@@ -18,16 +18,12 @@ from nominatim.errors import UsageError
from cursor import CursorForTesting
@pytest.fixture
def testfile_dir(src_dir):
return src_dir / 'test' / 'testfiles'
@pytest.fixture
def sp_importer(temp_db_conn, def_config):
def sp_importer(temp_db_conn, def_config, monkeypatch):
"""
Return an instance of SPImporter.
"""
loader = SPWikiLoader(def_config, ['en'])
monkeypatch.setenv('NOMINATIM_LANGUAGES', 'en')
loader = SPWikiLoader(def_config)
return SPImporter(def_config, temp_db_conn, loader)
@@ -186,8 +182,8 @@ def test_import_phrases(monkeypatch, temp_db_conn, def_config, sp_importer,
table_factory('place_classtype_amenity_animal_shelter')
table_factory('place_classtype_wrongclass_wrongtype')
monkeypatch.setattr('nominatim.tools.special_phrases.sp_wiki_loader.SPWikiLoader._get_wiki_content',
lambda self, lang: xml_wiki_content)
monkeypatch.setattr('nominatim.tools.special_phrases.sp_wiki_loader._get_wiki_content',
lambda lang: xml_wiki_content)
tokenizer = tokenizer_mock()
sp_importer.import_phrases(tokenizer, should_replace)

View File

@@ -11,50 +11,7 @@ import pytest
from nominatim.errors import UsageError
from nominatim.tools.special_phrases.sp_csv_loader import SPCsvLoader
def test_parse_csv(sp_csv_loader):
"""
Test method parse_csv()
Should return the right SpecialPhrase objects.
"""
phrases = sp_csv_loader.parse_csv()
assert check_phrases_content(phrases)
def test_next(sp_csv_loader):
"""
Test objects returned from the next() method.
It should return all SpecialPhrases objects of
the sp_csv_test.csv special phrases.
"""
phrases = next(sp_csv_loader)
assert check_phrases_content(phrases)
def test_check_csv_validity(sp_csv_loader):
"""
Test method check_csv_validity()
It should raise an exception when file with a
different exception than .csv is given.
"""
sp_csv_loader.csv_path = 'test.csv'
sp_csv_loader.check_csv_validity()
sp_csv_loader.csv_path = 'test.wrong'
with pytest.raises(UsageError):
assert sp_csv_loader.check_csv_validity()
def check_phrases_content(phrases):
"""
Asserts that the given phrases list contains
the right phrases of the sp_csv_test.csv special phrases.
"""
return len(phrases) > 1 \
and any(p.p_label == 'Billboard'
and p.p_class == 'advertising'
and p.p_type == 'billboard'
and p.p_operator == '-' for p in phrases) \
and any(p.p_label == 'Zip Lines'
and p.p_class == 'aerialway'
and p.p_type == 'zip_line'
and p.p_operator == '-' for p in phrases)
from nominatim.tools.special_phrases.special_phrase import SpecialPhrase
@pytest.fixture
def sp_csv_loader(src_dir):
@@ -64,3 +21,29 @@ def sp_csv_loader(src_dir):
csv_path = (src_dir / 'test' / 'testdata' / 'sp_csv_test.csv').resolve()
loader = SPCsvLoader(csv_path)
return loader
def test_generate_phrases(sp_csv_loader):
"""
Test method parse_csv()
Should return the right SpecialPhrase objects.
"""
phrases = list(sp_csv_loader.generate_phrases())
assert len(phrases) == 42
assert len(set(phrases)) == 41
assert SpecialPhrase('Billboard', 'advertising', 'billboard', '-') in phrases
assert SpecialPhrase('Zip Lines', 'aerialway', 'zip_line', '-') in phrases
def test_invalid_cvs_file():
"""
Test method check_csv_validity()
It should raise an exception when file with a
different exception than .csv is given.
"""
loader = SPCsvLoader('test.wrong')
with pytest.raises(UsageError, match='not a csv file'):
next(loader.generate_phrases())

View File

@@ -10,49 +10,32 @@
import pytest
from nominatim.tools.special_phrases.sp_wiki_loader import SPWikiLoader
@pytest.fixture
def xml_wiki_content(src_dir):
"""
return the content of the static xml test file.
"""
xml_test_content = src_dir / 'test' / 'testdata' / 'special_phrases_test_content.txt'
return xml_test_content.read_text()
@pytest.fixture
def sp_wiki_loader(monkeypatch, def_config, xml_wiki_content):
def sp_wiki_loader(src_dir, monkeypatch, def_config):
"""
Return an instance of SPWikiLoader.
"""
loader = SPWikiLoader(def_config, ['en'])
monkeypatch.setattr('nominatim.tools.special_phrases.sp_wiki_loader.SPWikiLoader._get_wiki_content',
lambda self, lang: xml_wiki_content)
monkeypatch.setenv('NOMINATIM_LANGUAGES', 'en')
loader = SPWikiLoader(def_config)
def _mock_wiki_content(lang):
xml_test_content = src_dir / 'test' / 'testdata' / 'special_phrases_test_content.txt'
return xml_test_content.read_text()
monkeypatch.setattr('nominatim.tools.special_phrases.sp_wiki_loader._get_wiki_content',
_mock_wiki_content)
return loader
def test_parse_xml(sp_wiki_loader, xml_wiki_content):
"""
Test method parse_xml()
Should return the right SpecialPhrase objects.
"""
phrases = sp_wiki_loader.parse_xml(xml_wiki_content)
check_phrases_content(phrases)
def test_next(sp_wiki_loader):
def test_generate_phrases(sp_wiki_loader):
"""
Test objects returned from the next() method.
It should return all SpecialPhrases objects of
the 'en' special phrases.
"""
phrases = next(sp_wiki_loader)
check_phrases_content(phrases)
phrases = list(sp_wiki_loader.generate_phrases())
def check_phrases_content(phrases):
"""
Asserts that the given phrases list contains
the right phrases of the 'en' special phrases.
"""
assert set((p.p_label, p.p_class, p.p_type, p.p_operator) for p in phrases) ==\
{('Zip Line', 'aerialway', 'zip_line', '-'),
('Zip Lines', 'aerialway', 'zip_line', '-'),

View File

@@ -18,6 +18,7 @@ Zipline near,aerialway,zip_line,near,N
Ziplines near,aerialway,zip_line,near,Y
Zipwire,aerialway,zip_line,-,N
Zipwires,aerialway,zip_line,-,Y
Zipwires,aerialway,zip_line,name,Y
Zipwire in,aerialway,zip_line,in,N
Zipwires in,aerialway,zip_line,in,Y
Zipwire near,aerialway,zip_line,near,N
1 phrase class type operator plural
18 Ziplines near aerialway zip_line near Y
19 Zipwire aerialway zip_line - N
20 Zipwires aerialway zip_line - Y
21 Zipwires aerialway zip_line name Y
22 Zipwire in aerialway zip_line in N
23 Zipwires in aerialway zip_line in Y
24 Zipwire near aerialway zip_line near N

View File

@@ -70,7 +70,7 @@
<model>wikitext</model>
<format>text/x-wiki</format>
<text bytes="158218" sha1="cst5x7tt58izti1pxzgljf27tx8qjcj" xml:space="preserve">
== en == {| class="wikitable sortable" |- ! Word / Phrase !! Key !! Value !! Operator !! Plural |- | Zip Line || aerialway || zip_line || - || N |- | Zip Lines || aerialway || zip_line || - || Y |- | Zip Line in || aerialway || zip_line || in || N |- | Zip Lines in || aerialway || zip_line || in || Y |- | Zip Line near || aerialway || zip_line || near || N |- | Animal shelter || amenity || animal_shelter || - || N |- | Animal shelters || amenity || animal_shelter || - || Y |- | Animal shelter in || amenity || animal_shelter || in || N |- | Animal shelters in || amenity || animal_shelter || in || Y |- | Animal shelter near || amenity || animal_shelter || near|| N |- | Animal shelters near || amenity || animal_shelter || NEAR|| Y |- | Drinking Water near || amenity || drinking_water || near || N |- | Water || amenity || drinking_water || - || N |- | Water in || amenity || drinking_water || In || N |- | Water near || amenity || drinking_water || near || N |- | Embassy || amenity || embassy || - || N |- | Embassys || amenity || embassy || - || Y |- | Embassies || amenity || embassy || - || Y |- |Coworkings near |amenity |coworking_space |near |Y |} [[Category:Word list]]
== en == {| class="wikitable sortable" |- ! Word / Phrase !! Key !! Value !! Operator !! Plural |- | Zip Line || aerialway || zip_line || - || N |- | Zip Lines || aerialway || zip_line || - || Y |- | Zip Line in || aerialway || zip_line || in || N |- | Zip Lines in || aerialway || zip_line || in || Y |- | Zip Line near || aerialway || zip_line || near || N |- | Animal shelter || amenity || animal_shelter || - || N |- | Animal shelters || amenity || animal_shelter || - || Y |- | Animal shelter in || amenity || animal_shelter || in || N |- | Animal shelters in || amenity || animal_shelter || in || Y |- | Animal shelter near || amenity || animal_shelter || near|| N |- | Animal shelters near || amenity || animal_shelter || NEAR|| Y |- | Drinking Water near || amenity || drinking_water || near || N |- | Water || amenity || drinking_water || - || N |- | Water in || amenity || drinking_water || In || N |- | Water near || amenity || drinking_water || near || N |- | Embassy || amenity || embassy || - || N |- | Embassys || amenity || "embassy" || - || Y |- | Embassies || amenity || embassy || - || Y |- |Coworkings near |amenity |coworking_space |near |Y |} [[Category:Word list]]
</text>
<sha1>cst5x7tt58izti1pxzgljf27tx8qjcj</sha1>
</revision>