Files
Nominatim/src/nominatim_db/tools/special_phrases/sp_wiki_loader.py
2025-12-08 11:01:14 +05:30

77 lines
2.7 KiB
Python

# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Module containing the SPWikiLoader class.
"""
from typing import Iterable
import re
import logging
import mwparserfromhell
from ...config import Configuration
from ...utils.url_utils import get_url
from .special_phrase import SpecialPhrase
LOG = logging.getLogger()
def _get_wiki_content(lang: str) -> str:
"""
Request and return the wiki page's content
corresponding to special phrases for a given lang.
Requested URL Example :
https://wiki.openstreetmap.org/wiki/Special:Export/Nominatim/Special_Phrases/EN
"""
url = 'https://wiki.openstreetmap.org/wiki/Special:Export/Nominatim/Special_Phrases/' \
+ lang.upper()
return get_url(url)
class SPWikiLoader:
"""
Handles loading of special phrases from the wiki.
"""
def __init__(self, config: Configuration) -> None:
self.config = config
# Hack around a bug where building=yes was imported with quotes into the wiki
self.type_fix_pattern = re.compile(r'\"|"')
self.languages = self.config.get_str_list('LANGUAGES') or \
['af', 'ar', 'br', 'ca', 'cs', 'de', 'en', 'es',
'et', 'eu', 'fa', 'fi', 'fr', 'gl', 'hr', 'hu',
'ia', 'is', 'it', 'ja', 'mk', 'nl', 'no', 'pl',
'ps', 'pt', 'ru', 'sk', 'sl', 'sv', 'uk', 'vi',
'lv', 'tr']
def generate_phrases(self) -> Iterable[SpecialPhrase]:
""" Download the wiki pages for the configured languages
and extract the phrases from the page.
"""
for lang in self.languages:
LOG.warning('Importing phrases for lang: %s...', lang)
loaded_xml = _get_wiki_content(lang)
wikicode = mwparserfromhell.parse(loaded_xml)
for table in wikicode.filter_tags(matches=lambda t: t.tag == 'table'):
for row in table.contents.filter_tags(matches=lambda t: t.tag == 'tr'):
cells = list(row.contents.filter_tags(matches=lambda t: t.tag == 'td'))
if len(cells) < 5:
continue
label = cells[0].contents.strip_code().strip()
cls = cells[1].contents.strip_code().strip()
typ = cells[2].contents.strip_code().strip()
operator = cells[3].contents.strip_code().strip()
yield SpecialPhrase(label,
cls,
self.type_fix_pattern.sub('', typ),
operator)