diff --git a/.github/actions/build-nominatim/action.yml b/.github/actions/build-nominatim/action.yml index 85eef53f..73f40747 100644 --- a/.github/actions/build-nominatim/action.yml +++ b/.github/actions/build-nominatim/action.yml @@ -22,7 +22,7 @@ runs: - name: Install prerequisites from apt run: | - sudo apt-get install -y -qq python3-icu python3-datrie python3-jinja2 python3-psutil python3-dotenv python3-yaml python3-sqlalchemy python3-psycopg python3-asyncpg + sudo apt-get install -y -qq python3-icu python3-datrie python3-jinja2 python3-psutil python3-dotenv python3-yaml python3-sqlalchemy python3-psycopg python3-asyncpg python3-mwparserfromhell shell: bash if: inputs.dependencies == 'apt' diff --git a/docs/admin/Installation.md b/docs/admin/Installation.md index 99387ed6..7b67edd7 100644 --- a/docs/admin/Installation.md +++ b/docs/admin/Installation.md @@ -37,6 +37,7 @@ Furthermore the following Python libraries are required: * [Jinja2](https://palletsprojects.com/p/jinja/) * [PyICU](https://pypi.org/project/PyICU/) * [PyYaml](https://pyyaml.org/) (5.1+) + * [mwparserfromhell](https://github.com/earwig/mwparserfromhell/) These will be installed automatically when using pip installation. diff --git a/docs/develop/Development-Environment.md b/docs/develop/Development-Environment.md index 577ccbd0..29d376a7 100644 --- a/docs/develop/Development-Environment.md +++ b/docs/develop/Development-Environment.md @@ -73,7 +73,7 @@ virtualenv ~/nominatim-dev-venv types-jinja2 types-markupsafe types-psutil types-psycopg2 \ types-pygments types-pyyaml types-requests types-ujson \ types-urllib3 typing-extensions unicorn falcon starlette \ - uvicorn mypy osmium aiosqlite + uvicorn mypy osmium aiosqlite mwparserfromhell ``` Now enter the virtual environment whenever you want to develop: diff --git a/packaging/nominatim-db/pyproject.toml b/packaging/nominatim-db/pyproject.toml index c2020f13..19a37512 100644 --- a/packaging/nominatim-db/pyproject.toml +++ b/packaging/nominatim-db/pyproject.toml @@ -20,7 +20,8 @@ dependencies = [ "jinja2", "pyYAML>=5.1", "psutil", - "PyICU" + "PyICU", + "mwparserfromhell" ] dynamic = ["version"] diff --git a/src/nominatim_db/tools/special_phrases/sp_wiki_loader.py b/src/nominatim_db/tools/special_phrases/sp_wiki_loader.py index 9908f753..06853cdf 100644 --- a/src/nominatim_db/tools/special_phrases/sp_wiki_loader.py +++ b/src/nominatim_db/tools/special_phrases/sp_wiki_loader.py @@ -11,6 +11,8 @@ from typing import Iterable import re import logging +import mwparserfromhell + from ...config import Configuration from ...utils.url_utils import get_url from .special_phrase import SpecialPhrase @@ -36,10 +38,6 @@ class SPWikiLoader: """ def __init__(self, config: Configuration) -> None: self.config = config - # Compile the regex here to increase performances. - self.occurence_pattern = re.compile( - r'\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([\-YN])' - ) # Hack around a bug where building=yes was imported with quotes into the wiki self.type_fix_pattern = re.compile(r'\"|"') @@ -58,11 +56,21 @@ class SPWikiLoader: LOG.warning('Importing phrases for lang: %s...', lang) loaded_xml = _get_wiki_content(lang) - # One match will be of format [label, class, type, operator, plural] - matches = self.occurence_pattern.findall(loaded_xml) + wikicode = mwparserfromhell.parse(loaded_xml) - for match in matches: - yield SpecialPhrase(match[0], - match[1], - self.type_fix_pattern.sub('', match[2]), - match[3]) + for table in wikicode.filter_tags(matches=lambda t: t.tag == 'table'): + for row in table.contents.filter_tags(matches=lambda t: t.tag == 'tr'): + cells = list(row.contents.filter_tags(matches=lambda t: t.tag == 'td')) + + if len(cells) < 5: + continue + + label = cells[0].contents.strip_code().strip() + cls = cells[1].contents.strip_code().strip() + typ = cells[2].contents.strip_code().strip() + operator = cells[3].contents.strip_code().strip() + + yield SpecialPhrase(label, + cls, + self.type_fix_pattern.sub('', typ), + operator) diff --git a/test/python/tools/test_import_special_phrases.py b/test/python/tools/test_import_special_phrases.py index c676c40a..2df1c682 100644 --- a/test/python/tools/test_import_special_phrases.py +++ b/test/python/tools/test_import_special_phrases.py @@ -203,7 +203,7 @@ def test_import_phrases(monkeypatch, temp_db_cursor, def_config, sp_importer, placex_table.add(cls='amenity', typ='animal_shelter') # in db for special phrase filtering sp_importer.import_phrases(tokenizer, should_replace) - assert len(tokenizer.analyser_cache['special_phrases']) == 18 + assert len(tokenizer.analyser_cache['special_phrases']) == 19 assert check_table_exist(temp_db_cursor, class_test, type_test) assert check_placeid_and_centroid_indexes(temp_db_cursor, class_test, type_test) diff --git a/test/python/tools/test_sp_wiki_loader.py b/test/python/tools/test_sp_wiki_loader.py index b8e41cbe..9b937112 100644 --- a/test/python/tools/test_sp_wiki_loader.py +++ b/test/python/tools/test_sp_wiki_loader.py @@ -54,4 +54,6 @@ def test_generate_phrases(sp_wiki_loader): ('Water near', 'amenity', 'drinking_water', 'near'), ('Embassy', 'amenity', 'embassy', '-'), ('Embassys', 'amenity', 'embassy', '-'), - ('Embassies', 'amenity', 'embassy', '-')} + ('Embassies', 'amenity', 'embassy', '-'), + # test for one-cell-per-line format + ('Coworkings near', 'amenity', 'coworking_space', 'near')} diff --git a/test/testdata/special_phrases_test_content.txt b/test/testdata/special_phrases_test_content.txt index e5f340b9..edfcc322 100644 --- a/test/testdata/special_phrases_test_content.txt +++ b/test/testdata/special_phrases_test_content.txt @@ -1,78 +1,120 @@ - - -OpenStreetMap Wiki -wiki -https://wiki.openstreetmap.org/wiki/Main_Page -MediaWiki 1.35.2 -first-letter - -Media -Special - -Talk -User -User talk -Wiki -Wiki talk -File -File talk -MediaWiki -MediaWiki talk -Template -Template talk -Help -Help talk -Category -Category talk -Item -Item talk -Property -Property talk -DE -DE talk -FR -FR talk -ES -ES talk -IT -IT talk -NL -NL talk -RU -RU talk -JA -JA talk -TimedText -TimedText talk -Module -Module talk -Gadget -Gadget talk -Gadget definition -Gadget definition talk - - - -Nominatim/Special Phrases/EN -0 -67365 - -2100424 -2100422 -2021-01-27T20:29:53Z - -Violaine Do -88152 - - -/* en */ add coworking amenity -2100424 -wikitext -text/x-wiki - -== en == {| class="wikitable sortable" |- ! Word / Phrase !! Key !! Value !! Operator !! Plural |- | Zip Line || aerialway || zip_line || - || N |- | Zip Lines || aerialway || zip_line || - || Y |- | Zip Line in || aerialway || zip_line || in || N |- | Zip Lines in || aerialway || zip_line || in || Y |- | Zip Line near || aerialway || zip_line || near || N |- | Animal shelter || amenity || animal_shelter || - || N |- | Animal shelters || amenity || animal_shelter || - || Y |- | Animal shelter in || amenity || animal_shelter || in || N |- | Animal shelters in || amenity || animal_shelter || in || Y |- | Animal shelter near || amenity || animal_shelter || near|| N |- | Animal shelters near || amenity || animal_shelter || NEAR|| Y |- | Drinking Water near || amenity || drinking_water || near || N |- | Water || amenity || drinking_water || - || N |- | Water in || amenity || drinking_water || In || N |- | Water near || amenity || drinking_water || near || N |- | Embassy || amenity || embassy || - || N |- | Embassys || amenity || "embassy" || - || Y |- | Embassies || amenity || embassy || - || Y |- |Coworkings near |amenity |coworking_space |near |Y |} [[Category:Word list]] - -cst5x7tt58izti1pxzgljf27tx8qjcj - - + + + OpenStreetMap Wiki + wiki + https://wiki.openstreetmap.org/wiki/Main_Page + MediaWiki 1.43.5 + first-letter + + Media + Special + + Talk + User + User talk + Wiki + Wiki talk + File + File talk + MediaWiki + MediaWiki talk + Template + Template talk + Help + Help talk + Category + Category talk + Item + Item talk + Property + Property talk + DE + DE talk + FR + FR talk + ES + ES talk + IT + IT talk + NL + NL talk + RU + RU talk + JA + JA talk + TimedText + TimedText talk + Module + Module talk + Proposal + Proposal talk + + + + Nominatim/Special Phrases/EN + 0 + 67365 + + 2861977 + 2634159 + 2025-06-02T14:00:52Z + + Lonvia + 17191 + + overgeneralized entry removed, phrases need to chosen so that all results with the given tag can be described with that phrase + 2861977 + wikitext + text/x-wiki + == en == +{| class="wikitable sortable" +|- +! Word / Phrase !! Key !! Value !! Operator !! Plural +|- +| Zip Line || aerialway || zip_line || - || N +|- +| Zip Lines || aerialway || zip_line || - || Y +|- +| Zip Line in || aerialway || zip_line || in || N +|- +| Zip Lines in || aerialway || zip_line || in || Y +|- +| Zip Line near || aerialway || zip_line || near || N +|- +| Animal shelter || amenity || animal_shelter || - || N +|- +| Animal shelters || amenity || animal_shelter || - || Y +|- +| Animal shelter in || amenity || animal_shelter || in || N +|- +| Animal shelters in || amenity || animal_shelter || in || Y +|- +| Animal shelter near || amenity || animal_shelter || near|| N +|- +| Animal shelters near || amenity || animal_shelter || NEAR|| Y +|- +| Drinking Water near || amenity || drinking_water || near || N +|- +| Water || amenity || drinking_water || - || N +|- +| Water in || amenity || drinking_water || In || N +|- +| Water near || amenity || drinking_water || near || N +|- +| Embassy || amenity || embassy || - || N +|- +| Embassys || amenity || "embassy" || - || Y +|- +| Embassies || amenity || embassy || - || Y +|- +| Coworkings near +| amenity +| coworking_space +| near +| Y +|} +[[Category:Word list]] + 0zlpuvnjs4io9e006rntbxm5b84kgst + +