Merge pull request #3675 from TuringVerified/generic-preprocessors

Add generic preprocessors
2026-03-06 18:14:16 +00:00 · 2025-04-01 20:14:43 +02:00
parent 9cf5eee5d4 2eeec46040
commit a49e8b9cf7
3 changed files with 123 additions and 3 deletions
--- a/docs/customize/Tokenizers.md
+++ b/docs/customize/Tokenizers.md
@@ -67,7 +67,13 @@ Here is an example configuration file:
 ``` yaml
 query-preprocessing:
-    - normalize
+    - step: split_japanese_phrases
    - step: regex_replace
      replacements:
        - pattern: https?://[^\s]* # Filter URLs starting with http or https
          replace: ''
    - step: normalize
 normalization:
    - ":: lower ()"
    - "ß > 'ss'" # German eszett is unambiguously equal to double ss
@@ -88,8 +94,8 @@ token-analysis:
            replacements: ['ä', 'ae']
 ```
-The configuration file contains four sections:
+The configuration file contains five sections:
-`normalization`, `transliteration`, `sanitizers` and `token-analysis`.
+`query-preprocessing`, `normalization`, `transliteration`, `sanitizers` and `token-analysis`.
 #### Query preprocessing
@@ -106,6 +112,19 @@ The following is a list of preprocessors that are shipped with Nominatim.
        heading_level: 6
        docstring_section_style: spacy
 ##### regex-replace
 ::: nominatim_api.query_preprocessing.regex_replace
    options:
        members: False
        heading_level: 6
        docstring_section_style: spacy
    description: 
        Applies each configured regex pattern to the input and replaces every match with the given replacement string
    replacements:
        - pattern: regex pattern
          replace: string to replace with
 #### Normalization and Transliteration
--- a/src/nominatim_api/query_preprocessing/regex_replace.py
+++ b/src/nominatim_api/query_preprocessing/regex_replace.py
@@ -0,0 +1,52 @@
 # SPDX-License-Identifier: GPL-3.0-or-later
 #
 # This file is part of Nominatim. (https://nominatim.org)
 #
 # Copyright (C) 2025 by the Nominatim developer community.
 # For a full list of authors see the git log.
 """
 This preprocessor replaces values in a given input based on pre-defined regex rules.
 Arguments:
    pattern: Regex pattern to be applied on the input
    replace: The string that each match of the pattern is replaced with
 """
 from typing import List
 import re
 from .config import QueryConfig
 from .base import QueryProcessingFunc
 from ..search.query import Phrase
 class _GenericPreprocessing:
     """Perform replacements to input phrases using custom regex patterns.

     The rules are read from the ``replacements`` entry of the step
     configuration and compiled once on construction. Each rule is a
     mapping with a ``pattern`` entry (the regular expression) and a
     ``replace`` entry (the substitution string). Rules are applied in
     the order in which they are configured.
     """

     def __init__(self, config: QueryConfig) -> None:
         """Compile the replacement rules found in the step configuration.

         A missing or empty ``replacements`` entry results in a
         preprocessor without rules, which leaves phrase texts unchanged.

         Raises:
             KeyError: when a rule lacks the ``pattern`` or ``replace`` entry.
             re.error: when a pattern is not a valid regular expression.
         """
         self.config = config

         # Fall back to an empty rule list so that the comprehension below
         # always iterates over a sequence of rule mappings.
         match_patterns = self.config.get('replacements') or []
         self.compiled_patterns = [
             (re.compile(item['pattern']), item['replace']) for item in match_patterns
         ]

     def split_phrase(self, phrase: Phrase) -> Phrase:
         """Apply all replacement rules to the text of the given phrase.

         The phrase is modified in place and the same object is returned.
         """
         for pattern, replacement in self.compiled_patterns:
             phrase.text = pattern.sub(replacement, phrase.text)
         return phrase

     def __call__(self, phrases: List[Phrase]) -> List[Phrase]:
         """Run the replacements over all phrases and return the result.

         Phrases whose text is empty or consists only of whitespace after
         the replacements are dropped, so the returned list may be empty.
         """
         return [p for p in map(self.split_phrase, phrases) if p.text.strip()]
 def create(config: QueryConfig) -> QueryProcessingFunc:
     """ Build the regex replacement preprocessor for the given
         step configuration.

         The returned object is callable with a list of phrases and
         returns the list of processed phrases.
     """
     preprocessor = _GenericPreprocessing(config)
     return preprocessor
--- a/test/python/api/query_processing/test_regex_replace.py
+++ b/test/python/api/query_processing/test_regex_replace.py
@@ -0,0 +1,49 @@
 # SPDX-License-Identifier: GPL-3.0-or-later
 #
 # This file is part of Nominatim. (https://nominatim.org)
 #
 # Copyright (C) 2025 by the Nominatim developer community.
 # For a full list of authors see the git log.
 '''
 Tests for replacing values in an input using custom regex.
 '''
 import pytest
 import nominatim_api.search.query as qmod
 from nominatim_api.query_preprocessing.config import QueryConfig
 from nominatim_api.query_preprocessing import regex_replace
 def run_preprocessor_on(query):
     """ Run the regex replacement preprocessor over the given phrases,
         using rules that remove IPv4 addresses and HTTP(S) URLs.
     """
     rules = [
         {'pattern': r'\b(?:\d{1,3}\.){3}\d{1,3}\b', 'replace': ''},  # IPv4
         {'pattern': r'https?://\S+', 'replace': ''}  # HTTP/HTTPS URLs
     ]

     config = QueryConfig()
     config.set_normalizer(None)
     config['replacements'] = rules

     return regex_replace.create(config)(query)
@pytest.mark.parametrize('inp,outp', [
    (['45.67.89.101'], []),
    (['198.51.100.23'], []),
    (['203.0.113.255'], []),
    (['http://www.openstreetmap.org'], []),
    (['https://www.openstreetmap.org/edit'], []),
    (['http://osm.org'], []),
    (['https://www.openstreetmap.org/user/abc'], []),
    (['https://tile.openstreetmap.org/12/2048/2048.png'], []),
    (['Check the map at https://www.openstreetmap.org'], ['Check the map at ']),
    (['Use 203.0.113.255 for routing'], ['Use  for routing']),
    (['Find maps at https://osm.org and http://openstreetmap.org'], ['Find maps at  and ']),
    (['203.0.113.255', 'Some Address'], ['Some Address']),
    (['https://osm.org', 'Another Place'], ['Another Place']),
])
def test_split_phrases(inp, outp):
    """ Check that IP addresses and URLs are removed from the phrase
        texts and that phrases left without content are dropped.
    """
    def as_phrases(texts):
        # Wrap plain strings into phrases of unspecified type.
        return [qmod.Phrase(qmod.PHRASE_ANY, text) for text in texts]

    assert run_preprocessor_on(as_phrases(inp)) == as_phrases(outp)