Merge pull request #3675 from TuringVerified/generic-preprocessors

Add generic preprocessors
This commit is contained in:
Sarah Hoffmann
2025-04-01 20:14:43 +02:00
committed by GitHub
3 changed files with 123 additions and 3 deletions

View File

@@ -67,7 +67,13 @@ Here is an example configuration file:
``` yaml ``` yaml
query-preprocessing: query-preprocessing:
- normalize - step: split_japanese_phrases
- step: regex_replace
replacements:
- pattern: https?://[^\s]* # Filter URLs starting with http or https
replace: ''
- step: normalize
normalization: normalization:
- ":: lower ()" - ":: lower ()"
- "ß > 'ss'" # German szet is unambiguously equal to double ss - "ß > 'ss'" # German szet is unambiguously equal to double ss
@@ -88,8 +94,8 @@ token-analysis:
replacements: ['ä', 'ae'] replacements: ['ä', 'ae']
``` ```
The configuration file contains four sections: The configuration file contains five sections:
`normalization`, `transliteration`, `sanitizers` and `token-analysis`. `query-preprocessing`, `normalization`, `transliteration`, `sanitizers` and `token-analysis`.
#### Query preprocessing #### Query preprocessing
@@ -106,6 +112,19 @@ The following is a list of preprocessors that are shipped with Nominatim.
heading_level: 6 heading_level: 6
docstring_section_style: spacy docstring_section_style: spacy
##### regex-replace
::: nominatim_api.query_preprocessing.regex_replace
options:
members: False
heading_level: 6
docstring_section_style: spacy
description:
This option runs any given regex pattern on the input and replaces values accordingly
replacements:
- pattern: regex pattern
replace: string to replace with
#### Normalization and Transliteration #### Normalization and Transliteration

View File

@@ -0,0 +1,52 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2025 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
This preprocessor replaces values in a given input based on pre-defined regex rules.
Arguments:
pattern: Regex pattern to be applied on the input
replace: The string that each match of the pattern is replaced with
"""
from typing import List
import re
from .config import QueryConfig
from .base import QueryProcessingFunc
from ..search.query import Phrase
class _GenericPreprocessing:
    """Query preprocessor that rewrites phrase text with configured regex rules.

    The rules are read from the ``replacements`` entry of the configuration
    handed to the constructor. Each rule is a mapping with a ``pattern`` key
    (a regular expression) and a ``replace`` key (the substitution string).
    Rules are applied in the order in which they are listed.
    """

    def __init__(self, config: QueryConfig) -> None:
        """Compile the replacement rules found in the given configuration.

        A missing or empty ``replacements`` entry yields a preprocessor
        without rules, which leaves the phrase text untouched.

        Raises:
            KeyError: a rule lacks the ``pattern`` or ``replace`` key.
            re.error: a ``pattern`` is not a valid regular expression.
        """
        self.config = config

        # Fall back to an empty rule list. A string default would be iterated
        # character by character and fail with an opaque TypeError, and an
        # empty YAML entry arrives here as None.
        rules = self.config.get('replacements', None) or []
        self.compiled_patterns = [
            (re.compile(rule['pattern']), rule['replace']) for rule in rules
        ]

    def split_phrase(self, phrase: Phrase) -> Phrase:
        """Apply every replacement rule to the text of a single phrase.

        The phrase is modified in place and the same object is returned.
        """
        for pattern, replacement in self.compiled_patterns:
            phrase.text = pattern.sub(replacement, phrase.text)
        return phrase

    def __call__(self, phrases: List[Phrase]) -> List[Phrase]:
        """Run the replacements over all phrases and return the survivors.

        Phrases whose text is empty or whitespace-only after the replacements
        are dropped, so the result may be an empty list.
        """
        return [p for p in map(self.split_phrase, phrases) if p.text.strip()]


def create(config: QueryConfig) -> QueryProcessingFunc:
    """Create the regex replacement preprocessor for the given configuration.

    Returns a callable that takes a list of phrases and returns the list of
    phrases that still have text after the configured replacements.
    """
    return _GenericPreprocessing(config)

View File

@@ -0,0 +1,49 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2025 by the Nominatim developer community.
# For a full list of authors see the git log.
'''
Tests for replacing values in an input using custom regex.
'''
import pytest
import nominatim_api.search.query as qmod
from nominatim_api.query_preprocessing.config import QueryConfig
from nominatim_api.query_preprocessing import regex_replace
def run_preprocessor_on(query):
    """Feed `query` through a regex_replace preprocessor and return its output.

    The preprocessor is set up with two rules that blank out IPv4 addresses
    and HTTP/HTTPS URLs.
    """
    rules = [
        {'pattern': r'\b(?:\d{1,3}\.){3}\d{1,3}\b', 'replace': ''},  # IPv4
        {'pattern': r'https?://\S+', 'replace': ''},  # HTTP/HTTPS URLs
    ]

    cfg = QueryConfig()
    cfg.set_normalizer(None)
    cfg['replacements'] = rules

    preprocessor = regex_replace.create(cfg)
    return preprocessor(query)
@pytest.mark.parametrize('inp,outp', [
    # Phrases consisting only of an IPv4 address or URL are dropped entirely.
    (['45.67.89.101'], []),
    (['198.51.100.23'], []),
    (['203.0.113.255'], []),
    (['http://www.openstreetmap.org'], []),
    (['https://www.openstreetmap.org/edit'], []),
    (['http://osm.org'], []),
    (['https://www.openstreetmap.org/user/abc'], []),
    (['https://tile.openstreetmap.org/12/2048/2048.png'], []),
    # Only the match itself is removed: the whitespace on both sides of it
    # stays, so a match in the middle of the text leaves two spaces behind.
    (['Check the map at https://www.openstreetmap.org'], ['Check the map at ']),
    (['Use 203.0.113.255 for routing'], ['Use  for routing']),
    (['Find maps at https://osm.org and http://openstreetmap.org'], ['Find maps at  and ']),
    # Emptied phrases are removed, the remaining ones are passed through.
    (['203.0.113.255', 'Some Address'], ['Some Address']),
    (['https://osm.org', 'Another Place'], ['Another Place']),
])
def test_split_phrases(inp, outp):
    """Check that the preprocessor turns the phrases `inp` into `outp`."""
    query = [qmod.Phrase(qmod.PHRASE_ANY, text) for text in inp]

    out = run_preprocessor_on(query)

    assert out == [qmod.Phrase(qmod.PHRASE_ANY, text) for text in outp]