forked from hans/Nominatim
add japanese phrase preprocessing
Code adapted from GSOC code by @miku.
This commit is contained in:
@@ -1,4 +1,5 @@
|
||||
query-preprocessing:
|
||||
- step: split_japanese_phrases
|
||||
- step: normalize
|
||||
normalization:
|
||||
- ":: lower ()"
|
||||
|
||||
@@ -0,0 +1,61 @@
|
||||
# SPDX-License-Identifier: GPL-3.0-or-later
|
||||
#
|
||||
# This file is part of Nominatim. (https://nominatim.org)
|
||||
#
|
||||
# Copyright (C) 2025 by the Nominatim developer community.
|
||||
# For a full list of authors see the git log.
|
||||
"""
|
||||
This file divides Japanese addresses into three categories:
|
||||
prefecture, municipality, and other.
|
||||
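The division is not strict: it is a simple heuristic keyed on the
administrative suffix characters (e.g. 都/道/府/県 for prefectures and
市/区/町/村 for municipalities).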
|
||||
"""
|
||||
from typing import List
|
||||
import re
|
||||
|
||||
from .config import QueryConfig
|
||||
from .base import QueryProcessingFunc
|
||||
from ..search.query import Phrase
|
||||
|
||||
# Verbose-mode regular expressions (they must be used with re.VERBOSE),
# listed from the most specific to the least specific address layout:
#   1. prefecture + municipality + remaining words
#   2. prefecture + remaining words
#   3. municipality + remaining words
# NOTE: the "[groupN]" labels inside the patterns name the address category;
# they are not the index of the capture group within the individual pattern.
MATCH_PATTERNS = [
    r'''
(...??[都都道府県縣]) # [group1] prefecture
(.+?[市区區町村]) # [group2] municipalities (city/wards/towns/villages)
(.+) # [group3] other words
''',
    r'''
(...??[都都道府県縣]) # [group1] prefecture
(.+) # [group3] other words
''',
    r'''
(.+?[市区區町村]) # [group2] municipalities (city/wards/towns/villages)
(.+) # [group3] other words
''',
]
|
||||
|
||||
|
||||
class _JapanesePreprocessing:
    """ Query preprocessor that marks the parts of a Japanese address.

        The text of every phrase is matched against MATCH_PATTERNS. On a
        match, the captured parts are joined again with ':' in between.
    """

    def __init__(self, config: QueryConfig) -> None:
        # The configuration is only stored; nothing in this class reads it.
        self.config = config

    def split_phrase(self, phrase: Phrase) -> Phrase:
        """ Return the phrase with its address parts separated by ':'.

            The expressions in MATCH_PATTERNS are tried in order and the
            first one that matches at the start of the phrase text wins.
            When none of them matches, the given phrase object is handed
            back unchanged.
        """
        # Lazy generator: later patterns are only tried when the
        # earlier ones did not match.
        attempts = (re.match(regex, phrase.text, re.VERBOSE)
                    for regex in MATCH_PATTERNS)
        found = next((m for m in attempts if m is not None), None)

        if found is None:
            return phrase

        return Phrase(phrase.ptype, ':'.join(found.groups()))

    def __call__(self, phrases: List[Phrase]) -> List[Phrase]:
        """ Apply split_phrase() to each phrase and return the results
            as a new list in the same order.
        """
        return list(map(self.split_phrase, phrases))
|
||||
|
||||
|
||||
def create(config: QueryConfig) -> QueryProcessingFunc:
    """ Build the preprocessing callable for Japanese phrase splitting.

        The given configuration is passed on to the new
        _JapanesePreprocessing instance, which is then returned.
    """
    preprocessor = _JapanesePreprocessing(config)

    return preprocessor
|
||||
@@ -0,0 +1,34 @@
|
||||
# SPDX-License-Identifier: GPL-3.0-or-later
|
||||
#
|
||||
# This file is part of Nominatim. (https://nominatim.org)
|
||||
#
|
||||
# Copyright (C) 2025 by the Nominatim developer community.
|
||||
# For a full list of authors see the git log.
|
||||
"""
|
||||
Tests for Japanese phrase splitting.
|
||||
"""
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from icu import Transliterator
|
||||
|
||||
import nominatim_api.search.query as qmod
|
||||
from nominatim_api.query_preprocessing.config import QueryConfig
|
||||
from nominatim_api.query_preprocessing import split_japanese_phrases
|
||||
|
||||
def run_preprocessor_on(query):
    """ Run a freshly created split_japanese_phrases preprocessor on
        the given list of phrases and return its result.

        The preprocessor is built from a QueryConfig whose normalizer
        has been set to None.
    """
    config = QueryConfig().set_normalizer(None)
    preprocessor = split_japanese_phrases.create(config)

    return preprocessor(query)
|
||||
|
||||
|
||||
@pytest.mark.parametrize('inp,outp', [
    ('大阪府大阪市大阪', '大阪府:大阪市:大阪'),
    ('大阪府大阪', '大阪府:大阪'),
    ('大阪市大阪', '大阪市:大阪'),
])
def test_split_phrases(inp, outp):
    """ Check that a single untyped phrase is split as expected.

        The cases cover, in order: prefecture + municipality + rest,
        prefecture + rest and municipality + rest.
    """
    expected = [qmod.Phrase(qmod.PhraseType.NONE, outp)]

    result = run_preprocessor_on([qmod.Phrase(qmod.PhraseType.NONE, inp)])

    assert result == expected
|
||||
Reference in New Issue
Block a user