mirror of
https://github.com/osm-search/Nominatim.git
synced 2026-02-26 11:08:13 +00:00
Merge pull request #2731 from lonvia/cleanup-special-phrases
Minor code reorganisation around special phrase parsing
This commit is contained in:
@@ -128,7 +128,7 @@ class SetupAll:
|
||||
drop=args.no_updates)
|
||||
LOG.warning('Create search index for default country names.')
|
||||
country_info.create_country_names(conn, tokenizer,
|
||||
args.config.LANGUAGES)
|
||||
args.config.get_str_list('LANGUAGES'))
|
||||
if args.no_updates:
|
||||
freeze.drop_update_tables(conn)
|
||||
tokenizer.finalize_import(args.config)
|
||||
|
||||
@@ -99,6 +99,17 @@ class Configuration:
|
||||
raise UsageError("Configuration error.") from exp
|
||||
|
||||
|
||||
def get_str_list(self, name):
    """ Return the given configuration parameter as a list of strings.

        The value is assumed to be given as a comma-separated list. Each
        item is stripped of surrounding whitespace before it is returned.
        Items are not filtered: an empty item between two commas is
        returned as an empty string.

        When the parameter itself is empty, None is returned instead of
        a list.
    """
    # NOTE(review): __getattr__ is invoked explicitly instead of going
    # through getattr(); presumably to force the configuration lookup
    # even when a regular attribute of that name exists - confirm.
    raw = self.__getattr__(name)

    if not raw:
        return None

    return [v.strip() for v in raw.split(',')]
|
||||
|
||||
|
||||
def get_path(self, name):
|
||||
""" Return the given configuration parameter as a Path.
|
||||
If a relative path is configured, then the function converts this
|
||||
|
||||
@@ -131,9 +131,6 @@ def create_country_names(conn, tokenizer, languages=None):
|
||||
empty then only name translations for the given languages are added
|
||||
to the index.
|
||||
"""
|
||||
if languages:
|
||||
languages = languages.split(',')
|
||||
|
||||
def _include_key(key):
|
||||
return ':' not in key or not languages or \
|
||||
key[key.index(':') + 1:] in languages
|
||||
|
||||
@@ -11,43 +11,31 @@
|
||||
"""
|
||||
import csv
|
||||
import os
|
||||
from collections.abc import Iterator
|
||||
from nominatim.tools.special_phrases.special_phrase import SpecialPhrase
|
||||
from nominatim.errors import UsageError
|
||||
|
||||
class SPCsvLoader(Iterator):
|
||||
class SPCsvLoader:
|
||||
"""
|
||||
Handles loading of special phrases from external csv file.
|
||||
"""
|
||||
def __init__(self, csv_path):
|
||||
super().__init__()
|
||||
self.csv_path = csv_path
|
||||
self.has_been_read = False
|
||||
|
||||
def __next__(self):
|
||||
if self.has_been_read:
|
||||
raise StopIteration()
|
||||
|
||||
self.has_been_read = True
|
||||
self.check_csv_validity()
|
||||
return self.parse_csv()
|
||||
|
||||
def parse_csv(self):
|
||||
"""
|
||||
Open and parse the given csv file.
|
||||
def generate_phrases(self):
|
||||
""" Open and parse the given csv file.
|
||||
Create the corresponding SpecialPhrases.
|
||||
"""
|
||||
phrases = set()
|
||||
self._check_csv_validity()
|
||||
|
||||
with open(self.csv_path, encoding='utf-8') as fd:
|
||||
reader = csv.DictReader(fd, delimiter=',')
|
||||
for row in reader:
|
||||
phrases.add(
|
||||
SpecialPhrase(row['phrase'], row['class'], row['type'], row['operator'])
|
||||
)
|
||||
return phrases
|
||||
yield SpecialPhrase(row['phrase'], row['class'], row['type'], row['operator'])
|
||||
|
||||
def check_csv_validity(self):
|
||||
|
||||
def _check_csv_validity(self):
|
||||
"""
|
||||
Check that the csv file has the right extension.
|
||||
"""
|
||||
|
||||
@@ -62,11 +62,10 @@ class SPImporter():
|
||||
# Store pairs of class/type for further processing
|
||||
class_type_pairs = set()
|
||||
|
||||
for loaded_phrases in self.sp_loader:
|
||||
for phrase in loaded_phrases:
|
||||
result = self._process_phrase(phrase)
|
||||
if result:
|
||||
class_type_pairs.add(result)
|
||||
for phrase in self.sp_loader.generate_phrases():
|
||||
result = self._process_phrase(phrase)
|
||||
if result:
|
||||
class_type_pairs.add(result)
|
||||
|
||||
self._create_place_classtype_table_and_indexes(class_type_pairs)
|
||||
if should_replace:
|
||||
|
||||
@@ -9,46 +9,56 @@
|
||||
"""
|
||||
import re
|
||||
import logging
|
||||
from collections.abc import Iterator
|
||||
from nominatim.tools.special_phrases.special_phrase import SpecialPhrase
|
||||
from nominatim.tools.exec_utils import get_url
|
||||
|
||||
LOG = logging.getLogger()
|
||||
class SPWikiLoader(Iterator):
|
||||
|
||||
def _get_wiki_content(lang):
    """ Request and return the content of the wiki page that holds the
        special phrases for the given language.

        The language code is upper-cased and appended to the export URL
        of the OSM wiki, for example:
        https://wiki.openstreetmap.org/wiki/Special:Export/Nominatim/Special_Phrases/EN
    """
    base_url = 'https://wiki.openstreetmap.org/wiki/Special:Export/Nominatim/Special_Phrases/'

    return get_url(base_url + lang.upper())
|
||||
|
||||
|
||||
class SPWikiLoader:
|
||||
"""
|
||||
Handles loading of special phrases from the wiki.
|
||||
"""
|
||||
def __init__(self, config, languages=None):
|
||||
def __init__(self, config):
|
||||
super().__init__()
|
||||
self.config = config
|
||||
# Compile the regex here to improve performance.
|
||||
self.occurence_pattern = re.compile(
|
||||
r'\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([\-YN])'
|
||||
)
|
||||
self.languages = self._load_languages() if not languages else list(languages)
|
||||
# Hack around a bug where building=yes was imported with quotes into the wiki
|
||||
self.type_fix_pattern = re.compile(r'\"|"')
|
||||
self._load_languages()
|
||||
|
||||
def __next__(self):
|
||||
if not self.languages:
|
||||
raise StopIteration
|
||||
|
||||
lang = self.languages.pop(0)
|
||||
loaded_xml = self._get_wiki_content(lang)
|
||||
LOG.warning('Importing phrases for lang: %s...', lang)
|
||||
return self.parse_xml(loaded_xml)
|
||||
|
||||
def parse_xml(self, xml):
|
||||
def generate_phrases(self):
|
||||
""" Download the wiki pages for the configured languages
|
||||
and extract the phrases from the page.
|
||||
"""
|
||||
Parses XML content and extracts special phrases from it.
|
||||
Return a list of SpecialPhrase.
|
||||
"""
|
||||
# One match will be of format [label, class, type, operator, plural]
|
||||
matches = self.occurence_pattern.findall(xml)
|
||||
returned_phrases = set()
|
||||
for match in matches:
|
||||
returned_phrases.add(
|
||||
SpecialPhrase(match[0], match[1], match[2], match[3])
|
||||
)
|
||||
return returned_phrases
|
||||
for lang in self.languages:
|
||||
LOG.warning('Importing phrases for lang: %s...', lang)
|
||||
loaded_xml = _get_wiki_content(lang)
|
||||
|
||||
# One match will be of format [label, class, type, operator, plural]
|
||||
matches = self.occurence_pattern.findall(loaded_xml)
|
||||
|
||||
for match in matches:
|
||||
yield SpecialPhrase(match[0],
|
||||
match[1],
|
||||
self.type_fix_pattern.sub('', match[2]),
|
||||
match[3])
|
||||
|
||||
|
||||
def _load_languages(self):
|
||||
"""
|
||||
@@ -56,21 +66,11 @@ class SPWikiLoader(Iterator):
|
||||
or default if no languages are configured.
|
||||
The system will extract special phrases only from the specified languages.
|
||||
"""
|
||||
default_languages = [
|
||||
if self.config.LANGUAGES:
|
||||
self.languages = self.config.get_str_list('LANGUAGES')
|
||||
else:
|
||||
self.languages = [
|
||||
'af', 'ar', 'br', 'ca', 'cs', 'de', 'en', 'es',
|
||||
'et', 'eu', 'fa', 'fi', 'fr', 'gl', 'hr', 'hu',
|
||||
'ia', 'is', 'it', 'ja', 'mk', 'nl', 'no', 'pl',
|
||||
'ps', 'pt', 'ru', 'sk', 'sl', 'sv', 'uk', 'vi']
|
||||
return self.config.LANGUAGES.split(',') if self.config.LANGUAGES else default_languages
|
||||
|
||||
@staticmethod
|
||||
def _get_wiki_content(lang):
|
||||
"""
|
||||
Request and return the wiki page's content
|
||||
corresponding to special phrases for a given lang.
|
||||
Requested URL Example :
|
||||
https://wiki.openstreetmap.org/wiki/Special:Export/Nominatim/Special_Phrases/EN
|
||||
"""
|
||||
url = 'https://wiki.openstreetmap.org/wiki/Special:Export/Nominatim/Special_Phrases/' \
|
||||
+ lang.upper()
|
||||
return get_url(url)
|
||||
|
||||
@@ -10,9 +10,7 @@
|
||||
This class is a model used to transfer a special phrase through
|
||||
the process of load and importation.
|
||||
"""
|
||||
import re
|
||||
|
||||
class SpecialPhrase():
|
||||
class SpecialPhrase:
|
||||
"""
|
||||
Model representing a special phrase.
|
||||
"""
|
||||
@@ -20,7 +18,19 @@ class SpecialPhrase():
|
||||
self.p_label = p_label.strip()
|
||||
self.p_class = p_class.strip()
|
||||
# Hack around a bug where building=yes was imported with quotes into the wiki
|
||||
self.p_type = re.sub(r'\"|"', '', p_type.strip())
|
||||
self.p_type = p_type.strip()
|
||||
# Needed if some operators in the wiki are not written in English
|
||||
p_operator = p_operator.strip().lower()
|
||||
self.p_operator = '-' if p_operator not in ('near', 'in') else p_operator
|
||||
|
||||
def __eq__(self, other):
    """ Two special phrases are equal when label, class, type and
        operator all match. Anything that is not a SpecialPhrase
        compares as unequal.
    """
    if isinstance(other, SpecialPhrase):
        own_fields = (self.p_label, self.p_class, self.p_type, self.p_operator)
        other_fields = (other.p_label, other.p_class, other.p_type, other.p_operator)
        return own_fields == other_fields

    return False
|
||||
|
||||
def __hash__(self):
    """ Hash the phrase over its label, class, type and operator, the
        same four fields that __eq__ compares.
    """
    identity = (self.p_label, self.p_class, self.p_type, self.p_operator)

    return hash(identity)
|
||||
|
||||
Reference in New Issue
Block a user