diff --git a/docs/customize/Tokenizers.md b/docs/customize/Tokenizers.md index d290c148..23db34c9 100644 --- a/docs/customize/Tokenizers.md +++ b/docs/customize/Tokenizers.md @@ -67,7 +67,13 @@ Here is an example configuration file: ``` yaml query-preprocessing: - - normalize + - step: split_japanese_phrases + - step: regex_replace + replacements: + - pattern: https?://[^\s]* # Filter URLs starting with http or https + replace: '' + - step: normalize + normalization: - ":: lower ()" - "ß > 'ss'" # German szet is unambiguously equal to double ss @@ -88,8 +94,8 @@ token-analysis: replacements: ['ä', 'ae'] ``` -The configuration file contains four sections: -`normalization`, `transliteration`, `sanitizers` and `token-analysis`. +The configuration file contains five sections: +`query-preprocessing`, `normalization`, `transliteration`, `sanitizers` and `token-analysis`. #### Query preprocessing @@ -106,6 +112,19 @@ The following is a list of preprocessors that are shipped with Nominatim. heading_level: 6 docstring_section_style: spacy +##### regex-replace + +::: nominatim_api.query_preprocessing.regex_replace + options: + members: False + heading_level: 6 + docstring_section_style: spacy + description: + This option runs any given regex pattern on the input and replaces values accordingly + replacements: + - pattern: regex pattern + replace: string to replace with + #### Normalization and Transliteration diff --git a/src/nominatim_api/query_preprocessing/regex_replace.py b/src/nominatim_api/query_preprocessing/regex_replace.py new file mode 100644 index 00000000..b3a02495 --- /dev/null +++ b/src/nominatim_api/query_preprocessing/regex_replace.py @@ -0,0 +1,52 @@ +# SPDX-License-Identifier: GPL-3.0-or-later +# +# This file is part of Nominatim. (https://nominatim.org) +# +# Copyright (C) 2025 by the Nominatim developer community. +# For a full list of authors see the git log. +""" +This preprocessor replaces values in a given input based on pre-defined regex rules. 
class _GenericPreprocessing:
    """Apply configured regex replacements to the text of query phrases.

    The rules come from the ``replacements`` entry of the step
    configuration. Each rule is a mapping with the keys ``pattern``
    (a regular expression) and ``replace`` (the substitution string).
    Rules are applied in the order in which they are configured.
    """

    def __init__(self, config: QueryConfig) -> None:
        """Compile the replacement rules found in the configuration.

        A missing or empty ``replacements`` entry results in a
        preprocessor without rules, which leaves the phrase text unchanged.

        Raises:
            KeyError: a rule lacks the ``pattern`` or ``replace`` key.
            re.error: a ``pattern`` is not a valid regular expression.
        """
        self.config = config

        # Fall back to an empty rule list. A string default would be
        # iterated character by character and fail with an obscure
        # "string indices must be integers" TypeError.
        rules = self.config.get('replacements') or []
        self.compiled_patterns = [
            (re.compile(rule['pattern']), rule['replace']) for rule in rules
        ]

    def split_phrase(self, phrase: Phrase) -> Phrase:
        """Run all replacement rules over the text of a single phrase.

        The phrase is modified in place and returned for convenience.
        """
        for pattern, replacement in self.compiled_patterns:
            phrase.text = pattern.sub(replacement, phrase.text)

        return phrase

    def __call__(self, phrases: List[Phrase]) -> List[Phrase]:
        """Apply the replacements to every phrase of the query.

        Phrases whose text is empty or whitespace-only after the
        replacements are dropped, so the result may be an empty list.
        """
        return [p for p in map(self.split_phrase, phrases) if p.text.strip()]


def create(config: QueryConfig) -> QueryProcessingFunc:
    """Create a preprocessing function that applies the regex replacements
    defined in the given configuration.
    """
    return _GenericPreprocessing(config)
def run_preprocessor_on(query):
    """Run the regex_replace preprocessor over the given list of phrases.

    The preprocessor is configured with two rules that blank out
    IPv4 addresses and HTTP/HTTPS URLs.
    """
    config = QueryConfig()
    config.set_normalizer(None)

    config['replacements'] = [
        {'pattern': r'\b(?:\d{1,3}\.){3}\d{1,3}\b', 'replace': ''},  # IPv4
        {'pattern': r'https?://\S+', 'replace': ''}  # HTTP/HTTPS URLs
    ]

    proc = regex_replace.create(config)
    return proc(query)


@pytest.mark.parametrize('inp,outp', [
    # Phrases that consist only of a match disappear completely.
    (['45.67.89.101'], []),
    (['198.51.100.23'], []),
    (['203.0.113.255'], []),
    (['http://www.openstreetmap.org'], []),
    (['https://www.openstreetmap.org/edit'], []),
    (['http://osm.org'], []),
    (['https://www.openstreetmap.org/user/abc'], []),
    (['https://tile.openstreetmap.org/12/2048/2048.png'], []),
    # Only the match itself is removed; the whitespace around it stays,
    # so a match in the middle of a phrase leaves two adjacent spaces.
    (['Check the map at https://www.openstreetmap.org'], ['Check the map at ']),
    (['Use 203.0.113.255 for routing'], ['Use  for routing']),
    (['Find maps at https://osm.org and http://openstreetmap.org'],
     ['Find maps at  and ']),
    # Emptied phrases are dropped while the remaining ones are kept.
    (['203.0.113.255', 'Some Address'], ['Some Address']),
    (['https://osm.org', 'Another Place'], ['Another Place']),
])
def test_split_phrases(inp, outp):
    """Check that matches are removed and emptied phrases are dropped."""
    query = [qmod.Phrase(qmod.PHRASE_ANY, text) for text in inp]

    out = run_preprocessor_on(query)
    assert out == [qmod.Phrase(qmod.PHRASE_ANY, text) for text in outp]