Merge pull request #3901 from AyushDharDubey/fix/issue_3829-use-mwparserfromhell-to-parse-sp-wiki-page

Replace regex with `mwparserfromhell` based MW WikiCode Parsing for Special Phrases
2026-02-16 15:47:58 +00:00 · 2025-12-08 11:51:50 +01:00
parent 681daeea29 fe170c9286
commit 58e56ec53d
8 changed files with 147 additions and 93 deletions
--- a/src/nominatim_db/tools/special_phrases/sp_wiki_loader.py
+++ b/src/nominatim_db/tools/special_phrases/sp_wiki_loader.py
@@ -11,6 +11,8 @@ from typing import Iterable
 import re
 import logging

+import mwparserfromhell
+
 from ...config import Configuration
 from ...utils.url_utils import get_url
 from .special_phrase import SpecialPhrase
@@ -36,10 +38,6 @@ class SPWikiLoader:
    """
    def __init__(self, config: Configuration) -> None:
        self.config = config
-        # Compile the regex here to increase performances.
-        self.occurence_pattern = re.compile(
-            r'\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([\-YN])'
-        )
        # Hack around a bug where building=yes was imported with quotes into the wiki
        self.type_fix_pattern = re.compile(r'\"|&quot;')

@@ -58,11 +56,21 @@ class SPWikiLoader:
            LOG.warning('Importing phrases for lang: %s...', lang)
            loaded_xml = _get_wiki_content(lang)

-            # One match will be of format [label, class, type, operator, plural]
-            matches = self.occurence_pattern.findall(loaded_xml)
+            wikicode = mwparserfromhell.parse(loaded_xml)

-            for match in matches:
-                yield SpecialPhrase(match[0],
-                                    match[1],
-                                    self.type_fix_pattern.sub('', match[2]),
-                                    match[3])
+            for table in wikicode.filter_tags(matches=lambda t: t.tag == 'table'):
+                for row in table.contents.filter_tags(matches=lambda t: t.tag == 'tr'):
+                    cells = list(row.contents.filter_tags(matches=lambda t: t.tag == 'td'))
+
+                    if len(cells) < 5:
+                        continue
+
+                    label = cells[0].contents.strip_code().strip()
+                    cls = cells[1].contents.strip_code().strip()
+                    typ = cells[2].contents.strip_code().strip()
+                    operator = cells[3].contents.strip_code().strip()
+
+                    yield SpecialPhrase(label,
+                                        cls,
+                                        self.type_fix_pattern.sub('', typ),
+                                        operator)