mirror of
https://github.com/osm-search/Nominatim.git
synced 2026-02-16 15:47:58 +00:00
add japanese phrase preprocessing
Code adapted from GSOC code by @miku.
This commit is contained in:
@@ -1,4 +1,5 @@
|
|||||||
query-preprocessing:
|
query-preprocessing:
|
||||||
|
- step: split_japanese_phrases
|
||||||
- step: normalize
|
- step: normalize
|
||||||
normalization:
|
normalization:
|
||||||
- ":: lower ()"
|
- ":: lower ()"
|
||||||
|
|||||||
@@ -0,0 +1,61 @@
|
|||||||
|
# SPDX-License-Identifier: GPL-3.0-or-later
|
||||||
|
#
|
||||||
|
# This file is part of Nominatim. (https://nominatim.org)
|
||||||
|
#
|
||||||
|
# Copyright (C) 2025 by the Nominatim developer community.
|
||||||
|
# For a full list of authors see the git log.
|
||||||
|
"""
|
||||||
|
This file divides Japanese addresses into three categories:
|
||||||
|
prefecture, municipality, and other.
|
||||||
|
The division is deliberately simple rather than strict: it keys on the characters that typically end prefecture and municipality names.
|
||||||
|
"""
|
||||||
|
from typing import List
|
||||||
|
import re
|
||||||
|
|
||||||
|
from .config import QueryConfig
|
||||||
|
from .base import QueryProcessingFunc
|
||||||
|
from ..search.query import Phrase
|
||||||
|
|
||||||
|
# Verbose-mode regular expressions (compile/match them with re.VERBOSE) used
# to cut a Japanese address into its parts. They are meant to be tried in
# list order, most specific first:
#   1. prefecture + municipality + rest
#   2. prefecture + rest
#   3. municipality + rest
#
# The "[groupN]" labels inside the patterns name the address category; in the
# two-group patterns they do not correspond to the capture-group index.
#
# NOTE(review): 都 appears twice in the prefecture character class as shown
# here. A duplicate is harmless, but one of them may originally have been a
# variant form (compare 区/區 and 県/縣) - confirm against upstream.
MATCH_PATTERNS = [
    r'''
        (...??[都都道府県縣])   # [group1] prefecture
        (.+?[市区區町村])      # [group2] municipalities (city/wards/towns/villages)
        (.+)                  # [group3] other words
    ''',
    r'''
        (...??[都都道府県縣])   # [group1] prefecture
        (.+)                  # [group3] other words
    ''',
    r'''
        (.+?[市区區町村])      # [group2] municipalities (city/wards/towns/villages)
        (.+)                  # [group3] other words
    '''
]
|
||||||
|
|
||||||
|
|
||||||
|
class _JapanesePreprocessing:
    """ Query preprocessor that marks the parts of a Japanese address.

        Each phrase is matched against MATCH_PATTERNS in list order. The
        capture groups of the first pattern that matches are joined with
        ':' to form the new phrase text. Phrases that match no pattern
        are passed through unchanged.
    """

    def __init__(self, config: QueryConfig) -> None:
        self.config = config
        # Compile the verbose patterns once per preprocessor instead of
        # resolving them again for every phrase.
        self._patterns = [re.compile(pattern, re.VERBOSE)
                          for pattern in MATCH_PATTERNS]

    def split_phrase(self, phrase: Phrase) -> Phrase:
        """ Return a phrase whose text has ':' inserted between the parts
            found by the first matching pattern.

            The phrase type is carried over. When no pattern matches,
            the given phrase object itself is returned.
        """
        for pattern in self._patterns:
            # match() anchors at the start of the text only; the trailing
            # greedy group of each pattern consumes the remainder.
            result = pattern.match(phrase.text)
            if result is not None:
                return Phrase(phrase.ptype, ':'.join(result.groups()))

        return phrase

    def __call__(self, phrases: List[Phrase]) -> List[Phrase]:
        """ Apply split_phrase() to every phrase and return the results
            as a new list in the same order.
        """
        return [self.split_phrase(p) for p in phrases]
|
||||||
|
|
||||||
|
|
||||||
|
def create(config: QueryConfig) -> QueryProcessingFunc:
    """ Build the Japanese phrase-splitting preprocessor for the given
        query configuration and return it as a callable.
    """
    preprocessor = _JapanesePreprocessing(config)
    return preprocessor
|
||||||
@@ -0,0 +1,34 @@
|
|||||||
|
# SPDX-License-Identifier: GPL-3.0-or-later
|
||||||
|
#
|
||||||
|
# This file is part of Nominatim. (https://nominatim.org)
|
||||||
|
#
|
||||||
|
# Copyright (C) 2025 by the Nominatim developer community.
|
||||||
|
# For a full list of authors see the git log.
|
||||||
|
"""
|
||||||
|
Tests for japanese phrase splitting.
|
||||||
|
"""
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from icu import Transliterator
|
||||||
|
|
||||||
|
import nominatim_api.search.query as qmod
|
||||||
|
from nominatim_api.query_preprocessing.config import QueryConfig
|
||||||
|
from nominatim_api.query_preprocessing import split_japanese_phrases
|
||||||
|
|
||||||
|
def run_preprocessor_on(query):
    """ Create a fresh Japanese phrase splitter, configured with no
        normalizer, and return what it produces for `query`.
    """
    config = QueryConfig().set_normalizer(None)
    splitter = split_japanese_phrases.create(config)

    return splitter(query)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('inp,outp', [
    # prefecture + municipality + rest
    ('大阪府大阪市大阪', '大阪府:大阪市:大阪'),
    # prefecture + rest
    ('大阪府大阪', '大阪府:大阪'),
    # municipality + rest
    ('大阪市大阪', '大阪市:大阪'),
    # 都 prefecture with a 区 ward
    ('東京都千代田区丸の内', '東京都:千代田区:丸の内'),
    # no pattern matches: the phrase text must come back unchanged
    ('大阪', '大阪'),
])
def test_split_phrases(inp, outp):
    """ The splitter inserts ':' between the recognised address parts and
        keeps the phrase type.
    """
    query = [qmod.Phrase(qmod.PhraseType.NONE, inp)]

    out = run_preprocessor_on(query)

    assert out == [qmod.Phrase(qmod.PhraseType.NONE, outp)]
|
||||||
Reference in New Issue
Block a user