use mwparserfromhell to parse SP wiki page reliably

This commit is contained in:
Ayush Dhar Dubey
2025-12-07 22:37:31 +05:30
parent 4919240377
commit 49454048c4
2 changed files with 21 additions and 12 deletions

View File

@@ -20,7 +20,8 @@ dependencies = [
"jinja2",
"pyYAML>=5.1",
"psutil",
"PyICU"
"PyICU",
"mwparserfromhell"
]
dynamic = ["version"]

View File

@@ -11,6 +11,8 @@ from typing import Iterable
import re
import logging
import mwparserfromhell
from ...config import Configuration
from ...utils.url_utils import get_url
from .special_phrase import SpecialPhrase
@@ -36,10 +38,6 @@ class SPWikiLoader:
"""
def __init__(self, config: Configuration) -> None:
self.config = config
# Compile the regex here to increase performances.
self.occurence_pattern = re.compile(
r'\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([\-YN])'
)
# Hack around a bug where building=yes was imported with quotes into the wiki
self.type_fix_pattern = re.compile(r'\"|"')
@@ -58,11 +56,21 @@ class SPWikiLoader:
LOG.warning('Importing phrases for lang: %s...', lang)
loaded_xml = _get_wiki_content(lang)
# One match will be of format [label, class, type, operator, plural]
matches = self.occurence_pattern.findall(loaded_xml)
wikicode = mwparserfromhell.parse(loaded_xml)
for match in matches:
yield SpecialPhrase(match[0],
match[1],
self.type_fix_pattern.sub('', match[2]),
match[3])
for table in wikicode.filter_tags(matches=lambda t: t.tag == 'table'):
for row in table.contents.filter_tags(matches=lambda t: t.tag == 'tr'):
cells = list(row.contents.filter_tags(matches=lambda t: t.tag == 'td'))
if len(cells) < 5:
continue
label = cells[0].contents.strip_code().strip()
cls = cells[1].contents.strip_code().strip()
typ = cells[2].contents.strip_code().strip()
operator = cells[3].contents.strip_code().strip()
yield SpecialPhrase(label,
cls,
self.type_fix_pattern.sub('', typ),
operator)