forked from hans/Nominatim
use mwparserfromhell to parse SP wiki page reliably
This commit is contained in:
@@ -20,7 +20,8 @@ dependencies = [
|
|||||||
"jinja2",
|
"jinja2",
|
||||||
"pyYAML>=5.1",
|
"pyYAML>=5.1",
|
||||||
"psutil",
|
"psutil",
|
||||||
"PyICU"
|
"PyICU",
|
||||||
|
"mwparserfromhell"
|
||||||
]
|
]
|
||||||
dynamic = ["version"]
|
dynamic = ["version"]
|
||||||
|
|
||||||
|
|||||||
@@ -11,6 +11,8 @@ from typing import Iterable
|
|||||||
import re
|
import re
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
|
import mwparserfromhell
|
||||||
|
|
||||||
from ...config import Configuration
|
from ...config import Configuration
|
||||||
from ...utils.url_utils import get_url
|
from ...utils.url_utils import get_url
|
||||||
from .special_phrase import SpecialPhrase
|
from .special_phrase import SpecialPhrase
|
||||||
@@ -36,10 +38,6 @@ class SPWikiLoader:
|
|||||||
"""
|
"""
|
||||||
def __init__(self, config: Configuration) -> None:
|
def __init__(self, config: Configuration) -> None:
|
||||||
self.config = config
|
self.config = config
|
||||||
# Compile the regex here to increase performances.
|
|
||||||
self.occurence_pattern = re.compile(
|
|
||||||
r'\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([\-YN])'
|
|
||||||
)
|
|
||||||
# Hack around a bug where building=yes was imported with quotes into the wiki
|
# Hack around a bug where building=yes was imported with quotes into the wiki
|
||||||
self.type_fix_pattern = re.compile(r'\"|"')
|
self.type_fix_pattern = re.compile(r'\"|"')
|
||||||
|
|
||||||
@@ -58,11 +56,21 @@ class SPWikiLoader:
|
|||||||
LOG.warning('Importing phrases for lang: %s...', lang)
|
LOG.warning('Importing phrases for lang: %s...', lang)
|
||||||
loaded_xml = _get_wiki_content(lang)
|
loaded_xml = _get_wiki_content(lang)
|
||||||
|
|
||||||
# One match will be of format [label, class, type, operator, plural]
|
wikicode = mwparserfromhell.parse(loaded_xml)
|
||||||
matches = self.occurence_pattern.findall(loaded_xml)
|
|
||||||
|
|
||||||
for match in matches:
|
for table in wikicode.filter_tags(matches=lambda t: t.tag == 'table'):
|
||||||
yield SpecialPhrase(match[0],
|
for row in table.contents.filter_tags(matches=lambda t: t.tag == 'tr'):
|
||||||
match[1],
|
cells = list(row.contents.filter_tags(matches=lambda t: t.tag == 'td'))
|
||||||
self.type_fix_pattern.sub('', match[2]),
|
|
||||||
match[3])
|
if len(cells) < 5:
|
||||||
|
continue
|
||||||
|
|
||||||
|
label = cells[0].contents.strip_code().strip()
|
||||||
|
cls = cells[1].contents.strip_code().strip()
|
||||||
|
typ = cells[2].contents.strip_code().strip()
|
||||||
|
operator = cells[3].contents.strip_code().strip()
|
||||||
|
|
||||||
|
yield SpecialPhrase(label,
|
||||||
|
cls,
|
||||||
|
self.type_fix_pattern.sub('', typ),
|
||||||
|
operator)
|
||||||
|
|||||||
Reference in New Issue
Block a user