mirror of
https://github.com/osm-search/Nominatim.git
synced 2026-03-10 03:54:06 +00:00
Merge pull request #3901 from AyushDharDubey/fix/issue_3829-use-mwparserfromhell-to-parse-sp-wiki-page
Replace regex with `mwparserfromhell` based MW WikiCode Parsing for Special Phrases
This commit is contained in:
2
.github/actions/build-nominatim/action.yml
vendored
2
.github/actions/build-nominatim/action.yml
vendored
@@ -22,7 +22,7 @@ runs:
|
|||||||
|
|
||||||
- name: Install prerequisites from apt
|
- name: Install prerequisites from apt
|
||||||
run: |
|
run: |
|
||||||
sudo apt-get install -y -qq python3-icu python3-datrie python3-jinja2 python3-psutil python3-dotenv python3-yaml python3-sqlalchemy python3-psycopg python3-asyncpg
|
sudo apt-get install -y -qq python3-icu python3-datrie python3-jinja2 python3-psutil python3-dotenv python3-yaml python3-sqlalchemy python3-psycopg python3-asyncpg python3-mwparserfromhell
|
||||||
shell: bash
|
shell: bash
|
||||||
if: inputs.dependencies == 'apt'
|
if: inputs.dependencies == 'apt'
|
||||||
|
|
||||||
|
|||||||
@@ -37,6 +37,7 @@ Furthermore the following Python libraries are required:
|
|||||||
* [Jinja2](https://palletsprojects.com/p/jinja/)
|
* [Jinja2](https://palletsprojects.com/p/jinja/)
|
||||||
* [PyICU](https://pypi.org/project/PyICU/)
|
* [PyICU](https://pypi.org/project/PyICU/)
|
||||||
* [PyYaml](https://pyyaml.org/) (5.1+)
|
* [PyYaml](https://pyyaml.org/) (5.1+)
|
||||||
|
* [mwparserfromhell](https://github.com/earwig/mwparserfromhell/)
|
||||||
|
|
||||||
These will be installed automatically when using pip installation.
|
These will be installed automatically when using pip installation.
|
||||||
|
|
||||||
|
|||||||
@@ -73,7 +73,7 @@ virtualenv ~/nominatim-dev-venv
|
|||||||
types-jinja2 types-markupsafe types-psutil types-psycopg2 \
|
types-jinja2 types-markupsafe types-psutil types-psycopg2 \
|
||||||
types-pygments types-pyyaml types-requests types-ujson \
|
types-pygments types-pyyaml types-requests types-ujson \
|
||||||
types-urllib3 typing-extensions unicorn falcon starlette \
|
types-urllib3 typing-extensions unicorn falcon starlette \
|
||||||
uvicorn mypy osmium aiosqlite
|
uvicorn mypy osmium aiosqlite mwparserfromhell
|
||||||
```
|
```
|
||||||
|
|
||||||
Now enter the virtual environment whenever you want to develop:
|
Now enter the virtual environment whenever you want to develop:
|
||||||
|
|||||||
@@ -20,7 +20,8 @@ dependencies = [
|
|||||||
"jinja2",
|
"jinja2",
|
||||||
"pyYAML>=5.1",
|
"pyYAML>=5.1",
|
||||||
"psutil",
|
"psutil",
|
||||||
"PyICU"
|
"PyICU",
|
||||||
|
"mwparserfromhell"
|
||||||
]
|
]
|
||||||
dynamic = ["version"]
|
dynamic = ["version"]
|
||||||
|
|
||||||
|
|||||||
@@ -11,6 +11,8 @@ from typing import Iterable
|
|||||||
import re
|
import re
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
|
import mwparserfromhell
|
||||||
|
|
||||||
from ...config import Configuration
|
from ...config import Configuration
|
||||||
from ...utils.url_utils import get_url
|
from ...utils.url_utils import get_url
|
||||||
from .special_phrase import SpecialPhrase
|
from .special_phrase import SpecialPhrase
|
||||||
@@ -36,10 +38,6 @@ class SPWikiLoader:
|
|||||||
"""
|
"""
|
||||||
def __init__(self, config: Configuration) -> None:
|
def __init__(self, config: Configuration) -> None:
|
||||||
self.config = config
|
self.config = config
|
||||||
# Compile the regex here to increase performances.
|
|
||||||
self.occurence_pattern = re.compile(
|
|
||||||
r'\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([\-YN])'
|
|
||||||
)
|
|
||||||
# Hack around a bug where building=yes was imported with quotes into the wiki
|
# Hack around a bug where building=yes was imported with quotes into the wiki
|
||||||
self.type_fix_pattern = re.compile(r'\"|"')
|
self.type_fix_pattern = re.compile(r'\"|"')
|
||||||
|
|
||||||
@@ -58,11 +56,21 @@ class SPWikiLoader:
|
|||||||
LOG.warning('Importing phrases for lang: %s...', lang)
|
LOG.warning('Importing phrases for lang: %s...', lang)
|
||||||
loaded_xml = _get_wiki_content(lang)
|
loaded_xml = _get_wiki_content(lang)
|
||||||
|
|
||||||
# One match will be of format [label, class, type, operator, plural]
|
wikicode = mwparserfromhell.parse(loaded_xml)
|
||||||
matches = self.occurence_pattern.findall(loaded_xml)
|
|
||||||
|
|
||||||
for match in matches:
|
for table in wikicode.filter_tags(matches=lambda t: t.tag == 'table'):
|
||||||
yield SpecialPhrase(match[0],
|
for row in table.contents.filter_tags(matches=lambda t: t.tag == 'tr'):
|
||||||
match[1],
|
cells = list(row.contents.filter_tags(matches=lambda t: t.tag == 'td'))
|
||||||
self.type_fix_pattern.sub('', match[2]),
|
|
||||||
match[3])
|
if len(cells) < 5:
|
||||||
|
continue
|
||||||
|
|
||||||
|
label = cells[0].contents.strip_code().strip()
|
||||||
|
cls = cells[1].contents.strip_code().strip()
|
||||||
|
typ = cells[2].contents.strip_code().strip()
|
||||||
|
operator = cells[3].contents.strip_code().strip()
|
||||||
|
|
||||||
|
yield SpecialPhrase(label,
|
||||||
|
cls,
|
||||||
|
self.type_fix_pattern.sub('', typ),
|
||||||
|
operator)
|
||||||
|
|||||||
@@ -203,7 +203,7 @@ def test_import_phrases(monkeypatch, temp_db_cursor, def_config, sp_importer,
|
|||||||
placex_table.add(cls='amenity', typ='animal_shelter') # in db for special phrase filtering
|
placex_table.add(cls='amenity', typ='animal_shelter') # in db for special phrase filtering
|
||||||
sp_importer.import_phrases(tokenizer, should_replace)
|
sp_importer.import_phrases(tokenizer, should_replace)
|
||||||
|
|
||||||
assert len(tokenizer.analyser_cache['special_phrases']) == 18
|
assert len(tokenizer.analyser_cache['special_phrases']) == 19
|
||||||
|
|
||||||
assert check_table_exist(temp_db_cursor, class_test, type_test)
|
assert check_table_exist(temp_db_cursor, class_test, type_test)
|
||||||
assert check_placeid_and_centroid_indexes(temp_db_cursor, class_test, type_test)
|
assert check_placeid_and_centroid_indexes(temp_db_cursor, class_test, type_test)
|
||||||
|
|||||||
@@ -54,4 +54,6 @@ def test_generate_phrases(sp_wiki_loader):
|
|||||||
('Water near', 'amenity', 'drinking_water', 'near'),
|
('Water near', 'amenity', 'drinking_water', 'near'),
|
||||||
('Embassy', 'amenity', 'embassy', '-'),
|
('Embassy', 'amenity', 'embassy', '-'),
|
||||||
('Embassys', 'amenity', 'embassy', '-'),
|
('Embassys', 'amenity', 'embassy', '-'),
|
||||||
('Embassies', 'amenity', 'embassy', '-')}
|
('Embassies', 'amenity', 'embassy', '-'),
|
||||||
|
# test for one-cell-per-line format
|
||||||
|
('Coworkings near', 'amenity', 'coworking_space', 'near')}
|
||||||
|
|||||||
196
test/testdata/special_phrases_test_content.txt
vendored
196
test/testdata/special_phrases_test_content.txt
vendored
@@ -1,78 +1,120 @@
|
|||||||
<mediawiki xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.11/ http://www.mediawiki.org/xml/export-0.11.xsd" version="0.11" xml:lang="en">
|
<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.11/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.11/ http://www.mediawiki.org/xml/export-0.11.xsd" version="0.11" xml:lang="en">
|
||||||
<siteinfo>
|
<siteinfo>
|
||||||
<sitename>OpenStreetMap Wiki</sitename>
|
<sitename>OpenStreetMap Wiki</sitename>
|
||||||
<dbname>wiki</dbname>
|
<dbname>wiki</dbname>
|
||||||
<base>https://wiki.openstreetmap.org/wiki/Main_Page</base>
|
<base>https://wiki.openstreetmap.org/wiki/Main_Page</base>
|
||||||
<generator>MediaWiki 1.35.2</generator>
|
<generator>MediaWiki 1.43.5</generator>
|
||||||
<case>first-letter</case>
|
<case>first-letter</case>
|
||||||
<namespaces>
|
<namespaces>
|
||||||
<namespace key="-2" case="first-letter">Media</namespace>
|
<namespace key="-2" case="first-letter">Media</namespace>
|
||||||
<namespace key="-1" case="first-letter">Special</namespace>
|
<namespace key="-1" case="first-letter">Special</namespace>
|
||||||
<namespace key="0" case="first-letter"/>
|
<namespace key="0" case="first-letter"/>
|
||||||
<namespace key="1" case="first-letter">Talk</namespace>
|
<namespace key="1" case="first-letter">Talk</namespace>
|
||||||
<namespace key="2" case="first-letter">User</namespace>
|
<namespace key="2" case="first-letter">User</namespace>
|
||||||
<namespace key="3" case="first-letter">User talk</namespace>
|
<namespace key="3" case="first-letter">User talk</namespace>
|
||||||
<namespace key="4" case="first-letter">Wiki</namespace>
|
<namespace key="4" case="first-letter">Wiki</namespace>
|
||||||
<namespace key="5" case="first-letter">Wiki talk</namespace>
|
<namespace key="5" case="first-letter">Wiki talk</namespace>
|
||||||
<namespace key="6" case="first-letter">File</namespace>
|
<namespace key="6" case="first-letter">File</namespace>
|
||||||
<namespace key="7" case="first-letter">File talk</namespace>
|
<namespace key="7" case="first-letter">File talk</namespace>
|
||||||
<namespace key="8" case="first-letter">MediaWiki</namespace>
|
<namespace key="8" case="first-letter">MediaWiki</namespace>
|
||||||
<namespace key="9" case="first-letter">MediaWiki talk</namespace>
|
<namespace key="9" case="first-letter">MediaWiki talk</namespace>
|
||||||
<namespace key="10" case="first-letter">Template</namespace>
|
<namespace key="10" case="first-letter">Template</namespace>
|
||||||
<namespace key="11" case="first-letter">Template talk</namespace>
|
<namespace key="11" case="first-letter">Template talk</namespace>
|
||||||
<namespace key="12" case="first-letter">Help</namespace>
|
<namespace key="12" case="first-letter">Help</namespace>
|
||||||
<namespace key="13" case="first-letter">Help talk</namespace>
|
<namespace key="13" case="first-letter">Help talk</namespace>
|
||||||
<namespace key="14" case="first-letter">Category</namespace>
|
<namespace key="14" case="first-letter">Category</namespace>
|
||||||
<namespace key="15" case="first-letter">Category talk</namespace>
|
<namespace key="15" case="first-letter">Category talk</namespace>
|
||||||
<namespace key="120" case="first-letter">Item</namespace>
|
<namespace key="120" case="first-letter">Item</namespace>
|
||||||
<namespace key="121" case="first-letter">Item talk</namespace>
|
<namespace key="121" case="first-letter">Item talk</namespace>
|
||||||
<namespace key="122" case="first-letter">Property</namespace>
|
<namespace key="122" case="first-letter">Property</namespace>
|
||||||
<namespace key="123" case="first-letter">Property talk</namespace>
|
<namespace key="123" case="first-letter">Property talk</namespace>
|
||||||
<namespace key="200" case="first-letter">DE</namespace>
|
<namespace key="200" case="first-letter">DE</namespace>
|
||||||
<namespace key="201" case="first-letter">DE talk</namespace>
|
<namespace key="201" case="first-letter">DE talk</namespace>
|
||||||
<namespace key="202" case="first-letter">FR</namespace>
|
<namespace key="202" case="first-letter">FR</namespace>
|
||||||
<namespace key="203" case="first-letter">FR talk</namespace>
|
<namespace key="203" case="first-letter">FR talk</namespace>
|
||||||
<namespace key="204" case="first-letter">ES</namespace>
|
<namespace key="204" case="first-letter">ES</namespace>
|
||||||
<namespace key="205" case="first-letter">ES talk</namespace>
|
<namespace key="205" case="first-letter">ES talk</namespace>
|
||||||
<namespace key="206" case="first-letter">IT</namespace>
|
<namespace key="206" case="first-letter">IT</namespace>
|
||||||
<namespace key="207" case="first-letter">IT talk</namespace>
|
<namespace key="207" case="first-letter">IT talk</namespace>
|
||||||
<namespace key="208" case="first-letter">NL</namespace>
|
<namespace key="208" case="first-letter">NL</namespace>
|
||||||
<namespace key="209" case="first-letter">NL talk</namespace>
|
<namespace key="209" case="first-letter">NL talk</namespace>
|
||||||
<namespace key="210" case="first-letter">RU</namespace>
|
<namespace key="210" case="first-letter">RU</namespace>
|
||||||
<namespace key="211" case="first-letter">RU talk</namespace>
|
<namespace key="211" case="first-letter">RU talk</namespace>
|
||||||
<namespace key="212" case="first-letter">JA</namespace>
|
<namespace key="212" case="first-letter">JA</namespace>
|
||||||
<namespace key="213" case="first-letter">JA talk</namespace>
|
<namespace key="213" case="first-letter">JA talk</namespace>
|
||||||
<namespace key="710" case="first-letter">TimedText</namespace>
|
<namespace key="710" case="first-letter">TimedText</namespace>
|
||||||
<namespace key="711" case="first-letter">TimedText talk</namespace>
|
<namespace key="711" case="first-letter">TimedText talk</namespace>
|
||||||
<namespace key="828" case="first-letter">Module</namespace>
|
<namespace key="828" case="first-letter">Module</namespace>
|
||||||
<namespace key="829" case="first-letter">Module talk</namespace>
|
<namespace key="829" case="first-letter">Module talk</namespace>
|
||||||
<namespace key="2300" case="first-letter">Gadget</namespace>
|
<namespace key="3000" case="first-letter">Proposal</namespace>
|
||||||
<namespace key="2301" case="first-letter">Gadget talk</namespace>
|
<namespace key="3001" case="first-letter">Proposal talk</namespace>
|
||||||
<namespace key="2302" case="case-sensitive">Gadget definition</namespace>
|
</namespaces>
|
||||||
<namespace key="2303" case="case-sensitive">Gadget definition talk</namespace>
|
</siteinfo>
|
||||||
</namespaces>
|
<page>
|
||||||
</siteinfo>
|
<title>Nominatim/Special Phrases/EN</title>
|
||||||
<page>
|
<ns>0</ns>
|
||||||
<title>Nominatim/Special Phrases/EN</title>
|
<id>67365</id>
|
||||||
<ns>0</ns>
|
<revision>
|
||||||
<id>67365</id>
|
<id>2861977</id>
|
||||||
<revision>
|
<parentid>2634159</parentid>
|
||||||
<id>2100424</id>
|
<timestamp>2025-06-02T14:00:52Z</timestamp>
|
||||||
<parentid>2100422</parentid>
|
<contributor>
|
||||||
<timestamp>2021-01-27T20:29:53Z</timestamp>
|
<username>Lonvia</username>
|
||||||
<contributor>
|
<id>17191</id>
|
||||||
<username>Violaine Do</username>
|
</contributor>
|
||||||
<id>88152</id>
|
<comment>overgeneralized entry removed, phrases need to chosen so that all results with the given tag can be described with that phrase</comment>
|
||||||
</contributor>
|
<origin>2861977</origin>
|
||||||
<minor/>
|
<model>wikitext</model>
|
||||||
<comment>/* en */ add coworking amenity</comment>
|
<format>text/x-wiki</format>
|
||||||
<origin>2100424</origin>
|
<text bytes="160765" sha1="0zlpuvnjs4io9e006rntbxm5b84kgst" xml:space="preserve">== en ==
|
||||||
<model>wikitext</model>
|
{| class="wikitable sortable"
|
||||||
<format>text/x-wiki</format>
|
|-
|
||||||
<text bytes="158218" sha1="cst5x7tt58izti1pxzgljf27tx8qjcj" xml:space="preserve">
|
! Word / Phrase !! Key !! Value !! Operator !! Plural
|
||||||
== en == {| class="wikitable sortable" |- ! Word / Phrase !! Key !! Value !! Operator !! Plural |- | Zip Line || aerialway || zip_line || - || N |- | Zip Lines || aerialway || zip_line || - || Y |- | Zip Line in || aerialway || zip_line || in || N |- | Zip Lines in || aerialway || zip_line || in || Y |- | Zip Line near || aerialway || zip_line || near || N |- | Animal shelter || amenity || animal_shelter || - || N |- | Animal shelters || amenity || animal_shelter || - || Y |- | Animal shelter in || amenity || animal_shelter || in || N |- | Animal shelters in || amenity || animal_shelter || in || Y |- | Animal shelter near || amenity || animal_shelter || near|| N |- | Animal shelters near || amenity || animal_shelter || NEAR|| Y |- | Drinking Water near || amenity || drinking_water || near || N |- | Water || amenity || drinking_water || - || N |- | Water in || amenity || drinking_water || In || N |- | Water near || amenity || drinking_water || near || N |- | Embassy || amenity || embassy || - || N |- | Embassys || amenity || "embassy" || - || Y |- | Embassies || amenity || embassy || - || Y |- |Coworkings near |amenity |coworking_space |near |Y |} [[Category:Word list]]
|
|-
|
||||||
</text>
|
| Zip Line || aerialway || zip_line || - || N
|
||||||
<sha1>cst5x7tt58izti1pxzgljf27tx8qjcj</sha1>
|
|-
|
||||||
</revision>
|
| Zip Lines || aerialway || zip_line || - || Y
|
||||||
</page>
|
|-
|
||||||
|
| Zip Line in || aerialway || zip_line || in || N
|
||||||
|
|-
|
||||||
|
| Zip Lines in || aerialway || zip_line || in || Y
|
||||||
|
|-
|
||||||
|
| Zip Line near || aerialway || zip_line || near || N
|
||||||
|
|-
|
||||||
|
| Animal shelter || amenity || animal_shelter || - || N
|
||||||
|
|-
|
||||||
|
| Animal shelters || amenity || animal_shelter || - || Y
|
||||||
|
|-
|
||||||
|
| Animal shelter in || amenity || animal_shelter || in || N
|
||||||
|
|-
|
||||||
|
| Animal shelters in || amenity || animal_shelter || in || Y
|
||||||
|
|-
|
||||||
|
| Animal shelter near || amenity || animal_shelter || near|| N
|
||||||
|
|-
|
||||||
|
| Animal shelters near || amenity || animal_shelter || NEAR|| Y
|
||||||
|
|-
|
||||||
|
| Drinking Water near || amenity || drinking_water || near || N
|
||||||
|
|-
|
||||||
|
| Water || amenity || drinking_water || - || N
|
||||||
|
|-
|
||||||
|
| Water in || amenity || drinking_water || In || N
|
||||||
|
|-
|
||||||
|
| Water near || amenity || drinking_water || near || N
|
||||||
|
|-
|
||||||
|
| Embassy || amenity || embassy || - || N
|
||||||
|
|-
|
||||||
|
| Embassys || amenity || "embassy" || - || Y
|
||||||
|
|-
|
||||||
|
| Embassies || amenity || embassy || - || Y
|
||||||
|
|-
|
||||||
|
| Coworkings near
|
||||||
|
| amenity
|
||||||
|
| coworking_space
|
||||||
|
| near
|
||||||
|
| Y
|
||||||
|
|}
|
||||||
|
[[Category:Word list]]</text>
|
||||||
|
<sha1>0zlpuvnjs4io9e006rntbxm5b84kgst</sha1>
|
||||||
|
</revision>
|
||||||
|
</page>
|
||||||
</mediawiki>
|
</mediawiki>
|
||||||
|
|||||||
Reference in New Issue
Block a user