Add Japanese phrase preprocessing

Code adapted from GSOC code by @miku.
2026-02-16 15:47:58 +00:00 · 2025-01-08 19:43:25 +01:00
parent 86ad9efa8a
commit efc09a5cfc
3 changed files with 96 additions and 0 deletions
--- a/settings/icu_tokenizer.yaml
+++ b/settings/icu_tokenizer.yaml
@@ -1,4 +1,5 @@
 query-preprocessing:
    - step: split_japanese_phrases
    - step: normalize
 normalization:
    - ":: lower ()"
--- a/src/nominatim_api/query_preprocessing/split_japanese_phrases.py
+++ b/src/nominatim_api/query_preprocessing/split_japanese_phrases.py
@@ -0,0 +1,61 @@
 # SPDX-License-Identifier: GPL-3.0-or-later
 #
 # This file is part of Nominatim. (https://nominatim.org)
 #
 # Copyright (C) 2025 by the Nominatim developer community.
 # For a full list of authors see the git log.
 """
 This module splits Japanese addresses into up to three parts:
 prefecture, municipality, and the remainder.
 The split is not strict; it is a simple heuristic based on the keyword characters used in the patterns below.
 """
 from typing import List
 import re
 from .config import QueryConfig
 from .base import QueryProcessingFunc
 from ..search.query import Phrase
 # Regular expressions used to split a phrase, listed from most to least
 # specific. split_phrase() tries them in this order and uses the first match.
 # They are applied with re.VERBOSE, so the whitespace and '#' comments
 # inside the literals are ignored by the regex engine.
 #   1. prefecture + municipality + remainder
 #   2. prefecture + remainder
 #   3. municipality + remainder
 # The prefecture group is two or three arbitrary characters followed by one
 # of the prefecture suffix characters; the municipality group is the
 # shortest run of characters ending in one of the municipality suffix
 # characters; the remainder must contain at least one character.
 # NOTE: the "[groupN]" labels inside the patterns name the address part,
 # not the capture-group index within that particular pattern.
 MATCH_PATTERNS = [
    r'''
                (...??[都都道府県縣])            # [group1] prefecture
                (.+?[市区區町村])              # [group2] municipalities (city/wards/towns/villages)
                (.+)                         # [group3] other words
                ''',
    r'''
                (...??[都都道府県縣])            # [group1] prefecture
                (.+)                         # [group3] other words
                ''',
    r'''
                (.+?[市区區町村])              # [group2] municipalities (city/wards/towns/villages)
                (.+)                         # [group3] other words
                '''
 ]
 class _JapanesePreprocessing:
    """ Query preprocessor that marks the boundaries between the parts
        of a Japanese address recognised by MATCH_PATTERNS with ':'.
    """

    def __init__(self, config: QueryConfig) -> None:
        # The configuration is only stored here; nothing in this class
        # reads it back.
        self.config = config

    def split_phrase(self, phrase: Phrase) -> Phrase:
        """ Split a single phrase with the first pattern that matches.

            Returns a new phrase of the same type whose text is the
            captured parts joined by ':'. When no pattern matches, the
            phrase passed in is returned unchanged.
        """
        text = phrase.text
        # Lazy generator: patterns after the first hit are never evaluated.
        attempts = (re.match(regex, text, re.VERBOSE) for regex in MATCH_PATTERNS)
        hit = next((m for m in attempts if m is not None), None)
        if hit is None:
            return phrase
        return Phrase(phrase.ptype, ':'.join(hit.groups()))

    def __call__(self, phrases: List[Phrase]) -> List[Phrase]:
        """ Apply split_phrase() to every phrase and return the results
            as a new list in the same order.
        """
        return list(map(self.split_phrase, phrases))
 def create(config: QueryConfig) -> QueryProcessingFunc:
    """ Build the Japanese phrase-splitting preprocessor for the
        given query configuration.
    """
    preprocessor = _JapanesePreprocessing(config)
    return preprocessor
--- a/test/python/api/query_processing/test_split_japanese_phrases.py
+++ b/test/python/api/query_processing/test_split_japanese_phrases.py
@@ -0,0 +1,34 @@
 # SPDX-License-Identifier: GPL-3.0-or-later
 #
 # This file is part of Nominatim. (https://nominatim.org)
 #
 # Copyright (C) 2025 by the Nominatim developer community.
 # For a full list of authors see the git log.
 """
 Tests for japanese phrase splitting.
 """
 from pathlib import Path
 import pytest
 from icu import Transliterator
 import nominatim_api.search.query as qmod
 from nominatim_api.query_preprocessing.config import QueryConfig
 from nominatim_api.query_preprocessing import split_japanese_phrases
 def run_preprocessor_on(query):
    """ Create the preprocessor from a config whose normalizer is set
        to None and return its output for `query`.
    """
    config = QueryConfig().set_normalizer(None)
    return split_japanese_phrases.create(config)(query)
@pytest.mark.parametrize('inp,outp', [
    ('大阪府大阪市大阪', '大阪府:大阪市:大阪'),
    ('大阪府大阪', '大阪府:大阪'),
    ('大阪市大阪', '大阪市:大阪'),
])
def test_split_phrases(inp, outp):
    """ A single phrase of type NONE is split at the expected places. """
    expected = [qmod.Phrase(qmod.PhraseType.NONE, outp)]
    actual = run_preprocessor_on([qmod.Phrase(qmod.PhraseType.NONE, inp)])
    assert actual == expected