Files
Nominatim/test/python/test_tokenizer_icu_rule_loader.py
2021-07-04 10:28:20 +02:00

175 lines
5.5 KiB
Python

"""
Tests for converting a config file to ICU rules.
"""
import pytest
from textwrap import dedent
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
from nominatim.errors import UsageError
from icu import Transliterator
@pytest.fixture
def cfgfile(tmp_path, suffix='.yaml'):
    """Provide a factory that writes an ICU rule configuration file.

    The returned callable takes the list of compound suffixes and the list
    of abbreviation rules, appends them to a fixed set of normalization and
    transliteration rules, writes the result to a file named
    ``test_config`` + ``suffix`` inside ``tmp_path`` and returns that path.
    """
    # Rules shared by every generated configuration.
    base_rules = dedent("""\
        normalization:
        - ":: NFD ()"
        - "[[:Nonspacing Mark:] [:Cf:]] >"
        - ":: lower ()"
        - "[[:Punctuation:][:Space:]]+ > ' '"
        - ":: NFC ()"
        transliteration:
        - ":: Latin ()"
        - "[[:Punctuation:][:Space:]]+ > ' '"
        """)

    def _section(header, entries):
        # One list item per entry; the trailing newline is always emitted,
        # even when there are no entries.
        return header + '\n'.join(" - " + entry for entry in entries) + '\n'

    def _create_config(suffixes, abbr):
        text = (base_rules
                + _section("compound_suffixes:\n", suffixes)
                + _section("abbreviations:\n", abbr))
        target = tmp_path / ('test_config' + suffix)
        target.write_text(dedent(text))
        return target

    return _create_config
def test_empty_rule_file(tmp_path):
    """A configuration whose four sections are all empty yields empty rules.

    Every rule getter is expected to return an empty string and the list of
    replacement pairs is expected to be empty.
    """
    config_path = tmp_path / 'test_config.yaml'
    config_path.write_text(dedent("""\
        normalization:
        transliteration:
        compound_suffixes:
        abbreviations:
        """))

    loader = ICURuleLoader(config_path)

    assert loader.get_search_rules() == ''
    assert loader.get_normalization_rules() == ''
    assert loader.get_transliteration_rules() == ''
    assert loader.get_replacement_pairs() == []
# Top-level section names used to build the configuration files for the
# missing-section test; the test leaves out one of them per run.
CONFIG_SECTIONS = (
    'normalization',
    'transliteration',
    'compound_suffixes',
    'abbreviations',
)
@pytest.mark.parametrize("section", CONFIG_SECTIONS)
def test_missing_normalization(tmp_path, section):
    """Leaving out any single section must make the loader raise UsageError.

    The test runs once per entry of CONFIG_SECTIONS; each run writes a file
    that contains every section header except ``section``.
    """
    config_path = tmp_path / 'test_config.yaml'
    remaining = [name for name in CONFIG_SECTIONS if name != section]

    with config_path.open('w') as stream:
        stream.writelines(name + ':\n' for name in remaining)

    with pytest.raises(UsageError):
        ICURuleLoader(config_path)
@pytest.mark.parametrize("abbr", ["simple",
                                  "double => arrow => bad",
                                  "bad = > arrow"])
def test_bad_abbreviation_syntax(tmp_path, abbr):
    """A malformed abbreviation rule must make the loader raise UsageError.

    The parametrized cases cover a rule without an arrow, a rule with two
    arrows and a rule whose arrow is split by a space.
    """
    fpath = tmp_path / 'test_config.yaml'
    fpath.write_text(dedent("""\
        normalization:
        transliteration:
        compound_suffixes:
        abbreviations:
        - {}
        """.format(abbr)))

    # Only the construction is under test; the loader object itself is not
    # needed, so it is not bound to a name.
    with pytest.raises(UsageError):
        ICURuleLoader(fpath)
def test_get_search_rules(cfgfile):
    """Check the search rules by running them through an ICU transliterator.

    The rules returned by ``get_search_rules()`` are compiled with
    ``Transliterator.createFromRules`` and applied to a set of sample
    strings; each result must match the listed expectation exactly.
    """
    config_path = cfgfile(['strasse', 'straße', 'weg'],
                          ['strasse,straße => str',
                           'prospekt => pr'])

    search_rules = ICURuleLoader(config_path).get_search_rules()
    transliterator = Transliterator.createFromRules("test", search_rules)

    # (input, expected output) pairs, checked in order.
    expectations = (
        (" Baum straße ", " baum straße "),
        (" Baumstraße ", " baum straße "),
        (" Baumstrasse ", " baum strasse "),
        (" Baumstr ", " baum str "),
        (" Baumwegstr ", " baumweg str "),
        (" Αθήνα ", " athēna "),
        (" проспект ", " prospekt "),
    )
    for text, expected in expectations:
        assert transliterator.transliterate(text) == expected
def test_get_normalization_rules(cfgfile):
    """Check the normalization rules by applying them to a mixed-script string.

    The expected value has the Latin part lower-cased and the hyphen
    replaced by a space, while the Cyrillic part stays in its script.
    """
    config_path = cfgfile(['strasse', 'straße', 'weg'],
                          ['strasse,straße => str'])

    normalization_rules = ICURuleLoader(config_path).get_normalization_rules()
    transliterator = Transliterator.createFromRules("test", normalization_rules)

    result = transliterator.transliterate(" проспект-Prospekt ")
    assert result == " проспект prospekt "
def test_get_transliteration_rules(cfgfile):
    """Check the transliteration rules by applying them to a mixed-script string.

    The expected value has the Cyrillic part converted to Latin script and
    the hyphen replaced by a space, while the case of the Latin part is kept.
    """
    config_path = cfgfile(['strasse', 'straße', 'weg'],
                          ['strasse,straße => str'])

    translit_rules = ICURuleLoader(config_path).get_transliteration_rules()
    transliterator = Transliterator.createFromRules("test", translit_rules)

    result = transliterator.transliterate(" проспект-Prospekt ")
    assert result == " prospekt Prospekt "
def test_get_replacement_pairs_multi_to(cfgfile):
    """Check the replacement pairs for one term with several abbreviations.

    The replacements of each pair are sorted before comparison, so only the
    order of the pairs themselves is checked, not the order within a pair.
    """
    config_path = cfgfile(['Pfad', 'Strasse'],
                          ['Strasse => str,st'])

    pairs = ICURuleLoader(config_path).get_replacement_pairs()
    actual = [(source, sorted(targets)) for source, targets in pairs]

    expected = [
        (' strasse ', [' st ', ' str ', ' strasse ']),
        ('strasse ', [' st ', ' str ', ' strasse ']),
        ('pfad ', [' pfad ']),
        ('str ', [' str ']),
        ('st ', [' st ']),
    ]
    assert actual == expected
def test_get_replacement_pairs_multi_from(cfgfile):
    """Check the replacement pairs for several terms sharing one abbreviation.

    The replacements of each pair are sorted before comparison, so only the
    order of the pairs themselves is checked, not the order within a pair.
    """
    config_path = cfgfile([], ['saint,Sainte => st'])

    pairs = ICURuleLoader(config_path).get_replacement_pairs()
    actual = [(source, sorted(targets)) for source, targets in pairs]

    expected = [
        (' sainte ', [' sainte ', ' st ']),
        (' saint ', [' saint ', ' st ']),
    ]
    assert actual == expected
def test_get_replacement_pairs_cross_abbreviations(cfgfile):
    """Check the replacement pairs when a term appears in two abbreviation rules.

    'sainte' is listed in both rules and is expected to collect the
    abbreviations of both. The replacements of each pair are sorted before
    comparison, so their internal order is not checked.
    """
    config_path = cfgfile([], ['saint,Sainte => st',
                               'sainte => ste'])

    pairs = ICURuleLoader(config_path).get_replacement_pairs()
    actual = [(source, sorted(targets)) for source, targets in pairs]

    expected = [
        (' sainte ', [' sainte ', ' st ', ' ste ']),
        (' saint ', [' saint ', ' st ']),
    ]
    assert actual == expected
@pytest.mark.parametrize("abbr", ["missing to =>",
                                  " => missing from",
                                  "=>"])
def test_incomplete_abbreviation_is_ignored(tmp_path, abbr):
    """An abbreviation rule with an empty side must yield no replacement pairs.

    The parametrized cases cover a rule with nothing after the arrow, a rule
    with nothing before it and a bare arrow. Loading is expected to succeed
    and ``get_replacement_pairs()`` to return an empty list.

    This test used to be called ``test_bad_abbreviation_syntax``, the same
    name as an earlier test in this module. The second definition replaced
    the first at import time, so the earlier test was never collected. The
    distinct name lets both run.
    """
    fpath = tmp_path / 'test_config.yaml'
    fpath.write_text(dedent("""\
        normalization:
        transliteration:
        compound_suffixes:
        abbreviations:
        - {}
        """.format(abbr)))

    repl = ICURuleLoader(fpath).get_replacement_pairs()

    assert repl == []