mirror of
https://github.com/osm-search/Nominatim.git
synced 2026-02-26 11:08:13 +00:00
77 lines
2.7 KiB
Python
77 lines
2.7 KiB
Python
# SPDX-License-Identifier: GPL-3.0-or-later
|
|
#
|
|
# This file is part of Nominatim. (https://nominatim.org)
|
|
#
|
|
# Copyright (C) 2024 by the Nominatim developer community.
|
|
# For a full list of authors see the git log.
|
|
"""
|
|
Module containing the SPWikiLoader class.
|
|
"""
|
|
from typing import Iterable
|
|
import re
|
|
import logging
|
|
|
|
import mwparserfromhell
|
|
|
|
from ...config import Configuration
|
|
from ...utils.url_utils import get_url
|
|
from .special_phrase import SpecialPhrase
|
|
|
|
# Logger shared by this module. getLogger() without a name returns the root logger.
LOG = logging.getLogger()
|
|
|
|
|
|
def _get_wiki_content(lang: str) -> str:
    """
        Fetch the wiki export page holding the special phrases
        for the language `lang` and return what was downloaded.

        The language code is upper-cased and appended to the export
        URL, so 'en' results in a request to:
            https://wiki.openstreetmap.org/wiki/Special:Export/Nominatim/Special_Phrases/EN
    """
    base_url = 'https://wiki.openstreetmap.org/wiki/Special:Export/Nominatim/Special_Phrases/'
    return get_url(base_url + lang.upper())
|
|
|
|
|
|
class SPWikiLoader:
    """
        Loader that obtains special phrases from the wiki pages.
    """
    # Languages whose wiki pages are read when the 'LANGUAGES' setting
    # does not yield any entries.
    _DEFAULT_LANGUAGES = (
        'af', 'ar', 'br', 'ca', 'cs', 'de', 'en', 'es',
        'et', 'eu', 'fa', 'fi', 'fr', 'gl', 'hr', 'hu',
        'ia', 'is', 'it', 'ja', 'mk', 'nl', 'no', 'pl',
        'ps', 'pt', 'ru', 'sk', 'sl', 'sv', 'uk', 'vi',
        'lv', 'tr',
    )

    def __init__(self, config: Configuration) -> None:
        self.config = config
        # Hack around a bug where building=yes was imported with quotes into the wiki
        # NOTE(review): both alternatives of this pattern match a plain double
        # quote; the second one may have been an HTML entity originally - confirm.
        self.type_fix_pattern = re.compile(r'\"|"')

        configured = self.config.get_str_list('LANGUAGES')
        self.languages = configured or list(self._DEFAULT_LANGUAGES)

    def generate_phrases(self) -> Iterable[SpecialPhrase]:
        """ Yield the special phrases found on the wiki pages of all
            configured languages.

            This is a generator: the page of a language is only
            downloaded and parsed once the iteration reaches it.
        """
        for lang in self.languages:
            LOG.warning('Importing phrases for lang: %s...', lang)
            page_source = _get_wiki_content(lang)
            parsed_page = mwparserfromhell.parse(page_source)

            tables = parsed_page.filter_tags(matches=lambda node: node.tag == 'table')
            for table in tables:
                rows = table.contents.filter_tags(matches=lambda node: node.tag == 'tr')
                for row in rows:
                    cells = list(
                        row.contents.filter_tags(matches=lambda node: node.tag == 'td'))

                    # Only rows with at least five data cells are treated as
                    # phrase entries; the first four cells supply the values.
                    if len(cells) >= 5:
                        label, cls, typ, operator = (
                            cell.contents.strip_code().strip() for cell in cells[:4])

                        yield SpecialPhrase(label,
                                            cls,
                                            self.type_fix_pattern.sub('', typ),
                                            operator)
|