mirror of
https://github.com/osm-search/Nominatim.git
synced 2026-02-26 11:08:13 +00:00
Update documentation, optimise regex_replace, add tests
This commit is contained in:
@@ -67,7 +67,12 @@ Here is an example configuration file:
|
|||||||
|
|
||||||
``` yaml
|
``` yaml
|
||||||
query-preprocessing:
|
query-preprocessing:
|
||||||
- normalize
|
- step: regex_replace
|
||||||
|
replacements:
|
||||||
|
- pattern: https?://[^\s]* # Filter URLs starting with http or https
|
||||||
|
replace: ''
|
||||||
|
- step: normalize
|
||||||
|
|
||||||
normalization:
|
normalization:
|
||||||
- ":: lower ()"
|
- ":: lower ()"
|
||||||
- "ß > 'ss'" # German eszett (ß) is unambiguously equal to double s
|
- "ß > 'ss'" # German eszett (ß) is unambiguously equal to double s
|
||||||
@@ -88,8 +93,8 @@ token-analysis:
|
|||||||
replacements: ['ä', 'ae']
|
replacements: ['ä', 'ae']
|
||||||
```
|
```
|
||||||
|
|
||||||
The configuration file contains four sections:
|
The configuration file contains five sections:
|
||||||
`normalization`, `transliteration`, `sanitizers` and `token-analysis`.
|
`query-preprocessing`, `normalization`, `transliteration`, `sanitizers` and `token-analysis`.
|
||||||
|
|
||||||
#### Query preprocessing
|
#### Query preprocessing
|
||||||
|
|
||||||
@@ -106,6 +111,17 @@ The following is a list of preprocessors that are shipped with Nominatim.
|
|||||||
heading_level: 6
|
heading_level: 6
|
||||||
docstring_section_style: spacy
|
docstring_section_style: spacy
|
||||||
|
|
||||||
|
::: nominatim_api.query_preprocessing.regex_replace
|
||||||
|
options:
|
||||||
|
members: False
|
||||||
|
heading_level: 6
|
||||||
|
docstring_section_style: spacy
|
||||||
|
description:
|
||||||
|
This preprocessor applies each configured regex pattern to the input phrases and replaces every match with the corresponding replacement string.
|
||||||
|
replacements:
|
||||||
|
- pattern: regex pattern
|
||||||
|
replace: string to replace with
|
||||||
|
|
||||||
|
|
||||||
#### Normalization and Transliteration
|
#### Normalization and Transliteration
|
||||||
|
|
||||||
|
|||||||
@@ -1,13 +1,5 @@
|
|||||||
query-preprocessing:
|
query-preprocessing:
|
||||||
- step: split_japanese_phrases
|
- step: split_japanese_phrases
|
||||||
- step: regex_replace
|
|
||||||
replacements:
|
|
||||||
- pattern: \b(?:25[0-5]|2[0-4][0-9]|1?[0-9]{1,2})(?:\.(?:25[0-5]|2[0-4][0-9]|1?[0-9]{1,2})){3}\b # Filter for IPv4 addresses
|
|
||||||
replace: ''
|
|
||||||
- pattern: \b(?:(?:[A-Fa-f0-9]{1,4}:){1,7}|:)(?:[A-Fa-f0-9]{1,4})?\b # Filter for IPv6 addresses
|
|
||||||
replace: ''
|
|
||||||
- pattern: https?://[^\s]* # Filter URLs starting with http or https
|
|
||||||
replace: ''
|
|
||||||
- step: normalize
|
- step: normalize
|
||||||
normalization:
|
normalization:
|
||||||
- ":: lower ()"
|
- ":: lower ()"
|
||||||
|
|||||||
@@ -20,24 +20,25 @@ class _GenericPreprocessing:
|
|||||||
def __init__(self, config: QueryConfig) -> None:
|
def __init__(self, config: QueryConfig) -> None:
|
||||||
self.config = config
|
self.config = config
|
||||||
|
|
||||||
|
match_patterns = self.config.get('replacements', 'Key not found')
|
||||||
|
self.compiled_patterns = [
|
||||||
|
(re.compile(item['pattern']), item['replace']) for item in match_patterns
|
||||||
|
]
|
||||||
|
|
||||||
def split_phrase(self, phrase: Phrase) -> Phrase:
|
def split_phrase(self, phrase: Phrase) -> Phrase:
|
||||||
"""
|
"""
|
||||||
This function performs replacements on the given text using regex patterns.
|
This function performs replacements on the given text using regex patterns.
|
||||||
"""
|
"""
|
||||||
|
for item in self.compiled_patterns:
|
||||||
if phrase.text is None:
|
phrase.text = item[0].sub(item[1], phrase.text)
|
||||||
return phrase
|
|
||||||
|
|
||||||
match_patterns = self.config.get('replacements', 'Key not found')
|
|
||||||
for item in match_patterns:
|
|
||||||
phrase.text = re.sub(item['pattern'], item['replace'], phrase.text)
|
|
||||||
|
|
||||||
return phrase
|
return phrase
|
||||||
|
|
||||||
def __call__(self, phrases: List[Phrase]) -> List[Phrase]:
|
def __call__(self, phrases: List[Phrase]) -> List[Phrase]:
|
||||||
"""Apply regex replacements to the given addresses.
|
"""Apply regex replacements to the given addresses.
|
||||||
"""
|
"""
|
||||||
return [self.split_phrase(p) for p in phrases]
|
result = [p for p in map(self.split_phrase, phrases) if p.text.strip()]
|
||||||
|
return result if result else []
|
||||||
|
|
||||||
|
|
||||||
def create(config: QueryConfig) -> QueryProcessingFunc:
|
def create(config: QueryConfig) -> QueryProcessingFunc:
|
||||||
|
|||||||
51
test/python/api/query_processing/test_regex_replace.py
Normal file
51
test/python/api/query_processing/test_regex_replace.py
Normal file
@@ -0,0 +1,51 @@
|
|||||||
|
# SPDX-License-Identifier: GPL-3.0-or-later
|
||||||
|
#
|
||||||
|
# This file is part of Nominatim. (https://nominatim.org)
|
||||||
|
#
|
||||||
|
# Copyright (C) 2025 by the Nominatim developer community.
|
||||||
|
# For a full list of authors see the git log.
|
||||||
|
'''
|
||||||
|
Tests for replacing values in an input using custom regex.
|
||||||
|
'''
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
import nominatim_api.search.query as qmod
|
||||||
|
from nominatim_api.query_preprocessing.config import QueryConfig
|
||||||
|
from nominatim_api.query_preprocessing import regex_replace
|
||||||
|
|
||||||
|
|
||||||
|
def run_preprocessor_on(query):
|
||||||
|
config = QueryConfig()
|
||||||
|
config.set_normalizer(None)
|
||||||
|
|
||||||
|
config['replacements'] = [
|
||||||
|
{'pattern': r'\b(?:\d{1,3}\.){3}\d{1,3}\b', 'replace': ''}, # IPv4
|
||||||
|
{'pattern': r'https?://\S+', 'replace': ''} # HTTP/HTTPS URLs
|
||||||
|
]
|
||||||
|
|
||||||
|
proc = regex_replace.create(config)
|
||||||
|
return proc(query)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('inp,outp', [
|
||||||
|
(['45.67.89.101'], []),
|
||||||
|
(['198.51.100.23'], []),
|
||||||
|
(['203.0.113.255'], []),
|
||||||
|
(['http://www.openstreetmap.org'], []),
|
||||||
|
(['https://www.openstreetmap.org/edit'], []),
|
||||||
|
(['http://osm.org'], []),
|
||||||
|
(['https://www.openstreetmap.org/user/abc'], []),
|
||||||
|
(['https://tile.openstreetmap.org/12/2048/2048.png'], []),
|
||||||
|
(['Check the map at https://www.openstreetmap.org'], ['Check the map at ']),
|
||||||
|
(['Use 203.0.113.255 for routing'], ['Use for routing']),
|
||||||
|
(['Find maps at https://osm.org and http://openstreetmap.org'], ['Find maps at and ']),
|
||||||
|
(['203.0.113.255', 'Some Address'], ['Some Address']),
|
||||||
|
(['https://osm.org', 'Another Place'], ['Another Place']),
|
||||||
|
])
|
||||||
|
def test_split_phrases(inp, outp):
|
||||||
|
query = [qmod.Phrase(qmod.PHRASE_ANY, text) for text in inp]
|
||||||
|
|
||||||
|
out = run_preprocessor_on(query)
|
||||||
|
expected_out = [qmod.Phrase(qmod.PHRASE_ANY, text) for text in outp]
|
||||||
|
|
||||||
|
assert out == expected_out, f"Expected {expected_out}, but got {out}"
|
||||||
Reference in New Issue
Block a user