Add generic preprocessor

This commit is contained in:
TuringVerified
2025-03-13 20:01:21 +05:30
parent 32728d6c89
commit 4665ea3e77
2 changed files with 54 additions and 0 deletions

View File

@@ -1,5 +1,13 @@
query-preprocessing: query-preprocessing:
- step: split_japanese_phrases - step: split_japanese_phrases
- step: regex_replace
replacements:
- pattern: \b(?:25[0-5]|2[0-4][0-9]|1?[0-9]{1,2})(?:\.(?:25[0-5]|2[0-4][0-9]|1?[0-9]{1,2})){3}\b # Filter for IPv4 addresses
replace: ''
- pattern: \b(?:(?:[A-Fa-f0-9]{1,4}:){1,7}|:)(?:[A-Fa-f0-9]{1,4})?\b # Filter for IPv6 addresses
replace: ''
- pattern: https?://[^\s]* # Filter URLs starting with http or https
replace: ''
- step: normalize - step: normalize
normalization: normalization:
- ":: lower ()" - ":: lower ()"

View File

@@ -0,0 +1,46 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2025 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Query preprocessor that replaces parts of the query text using regex rules from the configuration.
"""
from typing import List
import re
from .config import QueryConfig
from .base import QueryProcessingFunc
from ..search.query import Phrase
class _GenericPreprocessing:
    """ Query preprocessor that rewrites phrase texts with regex rules.

        The rules are read from the `replacements` entry of the step
        configuration. Each rule is a mapping with a `pattern` key (the
        regular expression) and a `replace` key (the substitution text).
        Rules are applied in the order in which they are configured.
    """

    def __init__(self, config: QueryConfig) -> None:
        """ Store the configuration and precompile the replacement rules.

            A configuration without a `replacements` entry (or with an
            empty one) yields no rules, so all phrases are left untouched.
            A rule with a missing key or a malformed regular expression
            raises here, at set-up time, instead of on the first query.
        """
        self.config = config
        # Compile once: the same rules are applied to every phrase of
        # every query.
        self.compiled_patterns = [
            (re.compile(item['pattern']), item['replace'])
            for item in (self.config.get('replacements') or ())
        ]

    def split_phrase(self, phrase: Phrase) -> Phrase:
        """ Apply all replacement rules to the text of the given phrase.

            The phrase is modified in place and returned. A phrase whose
            text is None is returned unchanged.
        """
        if phrase.text is None:
            return phrase

        text = phrase.text
        for regex, replacement in self.compiled_patterns:
            text = regex.sub(replacement, text)
        phrase.text = text

        return phrase

    def __call__(self, phrases: List[Phrase]) -> List[Phrase]:
        """ Apply the regex replacements to every phrase in the list.

            Returns a new list containing the same (modified) phrase
            objects in their original order.
        """
        return [self.split_phrase(p) for p in phrases]
def create(config: QueryConfig) -> QueryProcessingFunc:
    """ Build the regex-replacement preprocessing step.

        Returns a callable, set up with the given step configuration,
        that takes a list of phrases and returns the processed list.
    """
    preprocessor = _GenericPreprocessing(config)
    return preprocessor