mirror of
https://github.com/osm-search/Nominatim.git
synced 2026-02-26 11:08:13 +00:00
Add generic preprocessor
This commit is contained in:
@@ -1,5 +1,13 @@
|
|||||||
query-preprocessing:
|
query-preprocessing:
|
||||||
- step: split_japanese_phrases
|
- step: split_japanese_phrases
|
||||||
|
- step: regex_replace
|
||||||
|
replacements:
|
||||||
|
- pattern: \b(?:25[0-5]|2[0-4][0-9]|1?[0-9]{1,2})(?:\.(?:25[0-5]|2[0-4][0-9]|1?[0-9]{1,2})){3}\b # Filter for IPv4 addresses
|
||||||
|
replace: ''
|
||||||
|
- pattern: \b(?:(?:[A-Fa-f0-9]{1,4}:){1,7}|:)(?:[A-Fa-f0-9]{1,4})?\b # Filter for IPv6 addresses
|
||||||
|
replace: ''
|
||||||
|
- pattern: https?://[^\s]* # Filter URLs starting with http or https
|
||||||
|
replace: ''
|
||||||
- step: normalize
|
- step: normalize
|
||||||
normalization:
|
normalization:
|
||||||
- ":: lower ()"
|
- ":: lower ()"
|
||||||
|
|||||||
46
src/nominatim_api/query_preprocessing/regex_replace.py
Normal file
46
src/nominatim_api/query_preprocessing/regex_replace.py
Normal file
@@ -0,0 +1,46 @@
|
|||||||
|
# SPDX-License-Identifier: GPL-3.0-or-later
|
||||||
|
#
|
||||||
|
# This file is part of Nominatim. (https://nominatim.org)
|
||||||
|
#
|
||||||
|
# Copyright (C) 2025 by the Nominatim developer community.
|
||||||
|
# For a full list of authors see the git log.
|
||||||
|
"""
|
||||||
|
Query preprocessor that applies the regex replacement rules from the configuration to the text of each query phrase.
|
||||||
|
"""
|
||||||
|
from typing import List
|
||||||
|
import re
|
||||||
|
|
||||||
|
from .config import QueryConfig
|
||||||
|
from .base import QueryProcessingFunc
|
||||||
|
from ..search.query import Phrase
|
||||||
|
|
||||||
|
|
||||||
|
class _GenericPreprocessing:
    """ Query preprocessor that rewrites phrase texts with regex rules.

        The rules are read from the 'replacements' entry of the
        configuration. Each rule is a mapping with the keys 'pattern'
        (a regular expression) and 'replace' (the substitution string).
        Rules are applied in the order in which they are listed.
    """

    def __init__(self, config: QueryConfig) -> None:
        """ Store the configuration and compile the replacement rules.

            A missing or empty 'replacements' entry yields a preprocessor
            that leaves every phrase untouched.

            Raises re.error when a configured pattern is not a valid
            regular expression and KeyError when a rule lacks the
            'pattern' or 'replace' key.
        """
        self.config = config

        # Fall back to an empty rule list. A string default would be
        # iterated character by character and fail on item['pattern'].
        # 'or' also covers an entry that is present but empty (None).
        rules = config.get('replacements') or []

        # Compile once here instead of resolving the patterns again
        # for every phrase of every query.
        self.compiled_patterns = [
            (re.compile(rule['pattern']), rule['replace']) for rule in rules
        ]

    def split_phrase(self, phrase: Phrase) -> Phrase:
        """ Apply all replacement rules to the text of a single phrase.

            The phrase is modified in place and returned. A phrase
            whose text is None is returned unchanged.
        """
        if phrase.text is None:
            return phrase

        for pattern, replacement in self.compiled_patterns:
            phrase.text = pattern.sub(replacement, phrase.text)

        return phrase

    def __call__(self, phrases: List[Phrase]) -> List[Phrase]:
        """ Apply the regex replacements to every phrase in the list.

            Returns a new list containing the same (modified) phrase
            objects in their original order.
        """
        return [self.split_phrase(p) for p in phrases]
|
||||||
|
|
||||||
|
|
||||||
|
def create(config: QueryConfig) -> QueryProcessingFunc:
    """ Build the regex-replacement preprocessing function for
        the given configuration.
    """
    preprocessor = _GenericPreprocessing(config)
    return preprocessor
|
||||||
Reference in New Issue
Block a user