Merge pull request #3901 from AyushDharDubey/fix/issue_3829-use-mwparserfromhell-to-parse-sp-wiki-page

Replace regex with `mwparserfromhell`-based MediaWiki wikicode parsing for Special Phrases
This commit is contained in:
Sarah Hoffmann
2025-12-08 11:51:50 +01:00
committed by GitHub
8 changed files with 147 additions and 93 deletions

View File

@@ -22,7 +22,7 @@ runs:
- name: Install prerequisites from apt
run: |
sudo apt-get install -y -qq python3-icu python3-datrie python3-jinja2 python3-psutil python3-dotenv python3-yaml python3-sqlalchemy python3-psycopg python3-asyncpg
sudo apt-get install -y -qq python3-icu python3-datrie python3-jinja2 python3-psutil python3-dotenv python3-yaml python3-sqlalchemy python3-psycopg python3-asyncpg python3-mwparserfromhell
shell: bash
if: inputs.dependencies == 'apt'

View File

@@ -37,6 +37,7 @@ Furthermore the following Python libraries are required:
* [Jinja2](https://palletsprojects.com/p/jinja/)
* [PyICU](https://pypi.org/project/PyICU/)
* [PyYaml](https://pyyaml.org/) (5.1+)
* [mwparserfromhell](https://github.com/earwig/mwparserfromhell/)
These will be installed automatically when installing with pip.

View File

@@ -73,7 +73,7 @@ virtualenv ~/nominatim-dev-venv
types-jinja2 types-markupsafe types-psutil types-psycopg2 \
types-pygments types-pyyaml types-requests types-ujson \
types-urllib3 typing-extensions unicorn falcon starlette \
uvicorn mypy osmium aiosqlite
uvicorn mypy osmium aiosqlite mwparserfromhell
```
Now enter the virtual environment whenever you want to develop:

View File

@@ -20,7 +20,8 @@ dependencies = [
"jinja2",
"pyYAML>=5.1",
"psutil",
"PyICU"
"PyICU",
"mwparserfromhell"
]
dynamic = ["version"]

View File

@@ -11,6 +11,8 @@ from typing import Iterable
import re
import logging
import mwparserfromhell
from ...config import Configuration
from ...utils.url_utils import get_url
from .special_phrase import SpecialPhrase
@@ -36,10 +38,6 @@ class SPWikiLoader:
"""
def __init__(self, config: Configuration) -> None:
self.config = config
# Compile the regex here to increase performances.
self.occurence_pattern = re.compile(
r'\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([\-YN])'
)
# Hack around a bug where building=yes was imported with quotes into the wiki
self.type_fix_pattern = re.compile(r'\"|"')
@@ -58,11 +56,21 @@ class SPWikiLoader:
LOG.warning('Importing phrases for lang: %s...', lang)
loaded_xml = _get_wiki_content(lang)
# One match will be of format [label, class, type, operator, plural]
matches = self.occurence_pattern.findall(loaded_xml)
wikicode = mwparserfromhell.parse(loaded_xml)
for match in matches:
yield SpecialPhrase(match[0],
match[1],
self.type_fix_pattern.sub('', match[2]),
match[3])
for table in wikicode.filter_tags(matches=lambda t: t.tag == 'table'):
for row in table.contents.filter_tags(matches=lambda t: t.tag == 'tr'):
cells = list(row.contents.filter_tags(matches=lambda t: t.tag == 'td'))
if len(cells) < 5:
continue
label = cells[0].contents.strip_code().strip()
cls = cells[1].contents.strip_code().strip()
typ = cells[2].contents.strip_code().strip()
operator = cells[3].contents.strip_code().strip()
yield SpecialPhrase(label,
cls,
self.type_fix_pattern.sub('', typ),
operator)

View File

@@ -203,7 +203,7 @@ def test_import_phrases(monkeypatch, temp_db_cursor, def_config, sp_importer,
placex_table.add(cls='amenity', typ='animal_shelter') # in db for special phrase filtering
sp_importer.import_phrases(tokenizer, should_replace)
assert len(tokenizer.analyser_cache['special_phrases']) == 18
assert len(tokenizer.analyser_cache['special_phrases']) == 19
assert check_table_exist(temp_db_cursor, class_test, type_test)
assert check_placeid_and_centroid_indexes(temp_db_cursor, class_test, type_test)

View File

@@ -54,4 +54,6 @@ def test_generate_phrases(sp_wiki_loader):
('Water near', 'amenity', 'drinking_water', 'near'),
('Embassy', 'amenity', 'embassy', '-'),
('Embassys', 'amenity', 'embassy', '-'),
('Embassies', 'amenity', 'embassy', '-')}
('Embassies', 'amenity', 'embassy', '-'),
# test for one-cell-per-line format
('Coworkings near', 'amenity', 'coworking_space', 'near')}

View File

@@ -1,78 +1,120 @@
<mediawiki xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.11/ http://www.mediawiki.org/xml/export-0.11.xsd" version="0.11" xml:lang="en">
<siteinfo>
<sitename>OpenStreetMap Wiki</sitename>
<dbname>wiki</dbname>
<base>https://wiki.openstreetmap.org/wiki/Main_Page</base>
<generator>MediaWiki 1.35.2</generator>
<case>first-letter</case>
<namespaces>
<namespace key="-2" case="first-letter">Media</namespace>
<namespace key="-1" case="first-letter">Special</namespace>
<namespace key="0" case="first-letter"/>
<namespace key="1" case="first-letter">Talk</namespace>
<namespace key="2" case="first-letter">User</namespace>
<namespace key="3" case="first-letter">User talk</namespace>
<namespace key="4" case="first-letter">Wiki</namespace>
<namespace key="5" case="first-letter">Wiki talk</namespace>
<namespace key="6" case="first-letter">File</namespace>
<namespace key="7" case="first-letter">File talk</namespace>
<namespace key="8" case="first-letter">MediaWiki</namespace>
<namespace key="9" case="first-letter">MediaWiki talk</namespace>
<namespace key="10" case="first-letter">Template</namespace>
<namespace key="11" case="first-letter">Template talk</namespace>
<namespace key="12" case="first-letter">Help</namespace>
<namespace key="13" case="first-letter">Help talk</namespace>
<namespace key="14" case="first-letter">Category</namespace>
<namespace key="15" case="first-letter">Category talk</namespace>
<namespace key="120" case="first-letter">Item</namespace>
<namespace key="121" case="first-letter">Item talk</namespace>
<namespace key="122" case="first-letter">Property</namespace>
<namespace key="123" case="first-letter">Property talk</namespace>
<namespace key="200" case="first-letter">DE</namespace>
<namespace key="201" case="first-letter">DE talk</namespace>
<namespace key="202" case="first-letter">FR</namespace>
<namespace key="203" case="first-letter">FR talk</namespace>
<namespace key="204" case="first-letter">ES</namespace>
<namespace key="205" case="first-letter">ES talk</namespace>
<namespace key="206" case="first-letter">IT</namespace>
<namespace key="207" case="first-letter">IT talk</namespace>
<namespace key="208" case="first-letter">NL</namespace>
<namespace key="209" case="first-letter">NL talk</namespace>
<namespace key="210" case="first-letter">RU</namespace>
<namespace key="211" case="first-letter">RU talk</namespace>
<namespace key="212" case="first-letter">JA</namespace>
<namespace key="213" case="first-letter">JA talk</namespace>
<namespace key="710" case="first-letter">TimedText</namespace>
<namespace key="711" case="first-letter">TimedText talk</namespace>
<namespace key="828" case="first-letter">Module</namespace>
<namespace key="829" case="first-letter">Module talk</namespace>
<namespace key="2300" case="first-letter">Gadget</namespace>
<namespace key="2301" case="first-letter">Gadget talk</namespace>
<namespace key="2302" case="case-sensitive">Gadget definition</namespace>
<namespace key="2303" case="case-sensitive">Gadget definition talk</namespace>
</namespaces>
</siteinfo>
<page>
<title>Nominatim/Special Phrases/EN</title>
<ns>0</ns>
<id>67365</id>
<revision>
<id>2100424</id>
<parentid>2100422</parentid>
<timestamp>2021-01-27T20:29:53Z</timestamp>
<contributor>
<username>Violaine Do</username>
<id>88152</id>
</contributor>
<minor/>
<comment>/* en */ add coworking amenity</comment>
<origin>2100424</origin>
<model>wikitext</model>
<format>text/x-wiki</format>
<text bytes="158218" sha1="cst5x7tt58izti1pxzgljf27tx8qjcj" xml:space="preserve">
== en == {| class="wikitable sortable" |- ! Word / Phrase !! Key !! Value !! Operator !! Plural |- | Zip Line || aerialway || zip_line || - || N |- | Zip Lines || aerialway || zip_line || - || Y |- | Zip Line in || aerialway || zip_line || in || N |- | Zip Lines in || aerialway || zip_line || in || Y |- | Zip Line near || aerialway || zip_line || near || N |- | Animal shelter || amenity || animal_shelter || - || N |- | Animal shelters || amenity || animal_shelter || - || Y |- | Animal shelter in || amenity || animal_shelter || in || N |- | Animal shelters in || amenity || animal_shelter || in || Y |- | Animal shelter near || amenity || animal_shelter || near|| N |- | Animal shelters near || amenity || animal_shelter || NEAR|| Y |- | Drinking Water near || amenity || drinking_water || near || N |- | Water || amenity || drinking_water || - || N |- | Water in || amenity || drinking_water || In || N |- | Water near || amenity || drinking_water || near || N |- | Embassy || amenity || embassy || - || N |- | Embassys || amenity || "embassy" || - || Y |- | Embassies || amenity || embassy || - || Y |- |Coworkings near |amenity |coworking_space |near |Y |} [[Category:Word list]]
</text>
<sha1>cst5x7tt58izti1pxzgljf27tx8qjcj</sha1>
</revision>
</page>
<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.11/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.11/ http://www.mediawiki.org/xml/export-0.11.xsd" version="0.11" xml:lang="en">
<siteinfo>
<sitename>OpenStreetMap Wiki</sitename>
<dbname>wiki</dbname>
<base>https://wiki.openstreetmap.org/wiki/Main_Page</base>
<generator>MediaWiki 1.43.5</generator>
<case>first-letter</case>
<namespaces>
<namespace key="-2" case="first-letter">Media</namespace>
<namespace key="-1" case="first-letter">Special</namespace>
<namespace key="0" case="first-letter"/>
<namespace key="1" case="first-letter">Talk</namespace>
<namespace key="2" case="first-letter">User</namespace>
<namespace key="3" case="first-letter">User talk</namespace>
<namespace key="4" case="first-letter">Wiki</namespace>
<namespace key="5" case="first-letter">Wiki talk</namespace>
<namespace key="6" case="first-letter">File</namespace>
<namespace key="7" case="first-letter">File talk</namespace>
<namespace key="8" case="first-letter">MediaWiki</namespace>
<namespace key="9" case="first-letter">MediaWiki talk</namespace>
<namespace key="10" case="first-letter">Template</namespace>
<namespace key="11" case="first-letter">Template talk</namespace>
<namespace key="12" case="first-letter">Help</namespace>
<namespace key="13" case="first-letter">Help talk</namespace>
<namespace key="14" case="first-letter">Category</namespace>
<namespace key="15" case="first-letter">Category talk</namespace>
<namespace key="120" case="first-letter">Item</namespace>
<namespace key="121" case="first-letter">Item talk</namespace>
<namespace key="122" case="first-letter">Property</namespace>
<namespace key="123" case="first-letter">Property talk</namespace>
<namespace key="200" case="first-letter">DE</namespace>
<namespace key="201" case="first-letter">DE talk</namespace>
<namespace key="202" case="first-letter">FR</namespace>
<namespace key="203" case="first-letter">FR talk</namespace>
<namespace key="204" case="first-letter">ES</namespace>
<namespace key="205" case="first-letter">ES talk</namespace>
<namespace key="206" case="first-letter">IT</namespace>
<namespace key="207" case="first-letter">IT talk</namespace>
<namespace key="208" case="first-letter">NL</namespace>
<namespace key="209" case="first-letter">NL talk</namespace>
<namespace key="210" case="first-letter">RU</namespace>
<namespace key="211" case="first-letter">RU talk</namespace>
<namespace key="212" case="first-letter">JA</namespace>
<namespace key="213" case="first-letter">JA talk</namespace>
<namespace key="710" case="first-letter">TimedText</namespace>
<namespace key="711" case="first-letter">TimedText talk</namespace>
<namespace key="828" case="first-letter">Module</namespace>
<namespace key="829" case="first-letter">Module talk</namespace>
<namespace key="3000" case="first-letter">Proposal</namespace>
<namespace key="3001" case="first-letter">Proposal talk</namespace>
</namespaces>
</siteinfo>
<page>
<title>Nominatim/Special Phrases/EN</title>
<ns>0</ns>
<id>67365</id>
<revision>
<id>2861977</id>
<parentid>2634159</parentid>
<timestamp>2025-06-02T14:00:52Z</timestamp>
<contributor>
<username>Lonvia</username>
<id>17191</id>
</contributor>
<comment>overgeneralized entry removed, phrases need to chosen so that all results with the given tag can be described with that phrase</comment>
<origin>2861977</origin>
<model>wikitext</model>
<format>text/x-wiki</format>
<text bytes="160765" sha1="0zlpuvnjs4io9e006rntbxm5b84kgst" xml:space="preserve">== en ==
{| class="wikitable sortable"
|-
! Word / Phrase !! Key !! Value !! Operator !! Plural
|-
| Zip Line || aerialway || zip_line || - || N
|-
| Zip Lines || aerialway || zip_line || - || Y
|-
| Zip Line in || aerialway || zip_line || in || N
|-
| Zip Lines in || aerialway || zip_line || in || Y
|-
| Zip Line near || aerialway || zip_line || near || N
|-
| Animal shelter || amenity || animal_shelter || - || N
|-
| Animal shelters || amenity || animal_shelter || - || Y
|-
| Animal shelter in || amenity || animal_shelter || in || N
|-
| Animal shelters in || amenity || animal_shelter || in || Y
|-
| Animal shelter near || amenity || animal_shelter || near|| N
|-
| Animal shelters near || amenity || animal_shelter || NEAR|| Y
|-
| Drinking Water near || amenity || drinking_water || near || N
|-
| Water || amenity || drinking_water || - || N
|-
| Water in || amenity || drinking_water || In || N
|-
| Water near || amenity || drinking_water || near || N
|-
| Embassy || amenity || embassy || - || N
|-
| Embassys || amenity || "embassy" || - || Y
|-
| Embassies || amenity || embassy || - || Y
|-
| Coworkings near
| amenity
| coworking_space
| near
| Y
|}
[[Category:Word list]]</text>
<sha1>0zlpuvnjs4io9e006rntbxm5b84kgst</sha1>
</revision>
</page>
</mediawiki>