move abbreviation computation into import phase

This adds precomputation of abbreviated terms for names and removes
abbreviation of terms in the query. Basic import works but still
needs some thorough testing as well as speed improvements during
import.

This adds a new dependency on the Python library datrie.
This commit is contained in:
Sarah Hoffmann
2021-05-28 22:06:13 +02:00
parent 6ba00e6aee
commit 8413075249
10 changed files with 665 additions and 206 deletions

View File

@@ -0,0 +1,60 @@
"""
Tests for import name normalisation and variant generation.
"""
from textwrap import dedent
import pytest
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules
from nominatim.errors import UsageError
@pytest.fixture
def cfgfile(tmp_path, suffix='.yaml'):
    """Provide a factory that writes an ICU rule configuration file.

    The returned callable takes a list of compound suffixes and a list of
    abbreviation rules, writes them together with a fixed set of
    normalization and transliteration rules into
    ``tmp_path / ('test_config' + suffix)`` and returns the path of the
    written file.
    """
    # Fixed part of the configuration, shared by every generated file.
    base_rules = dedent("""\
        normalization:
        - ":: NFD ()"
        - "[[:Nonspacing Mark:] [:Cf:]] >"
        - ":: lower ()"
        - "[[:Punctuation:][:Space:]]+ > ' '"
        - ":: NFC ()"
        transliteration:
        - ":: Latin ()"
        """)

    def _create_config(suffixes, abbr):
        sections = [
            base_rules,
            "compound_suffixes:\n",
            '\n'.join(" - " + entry for entry in suffixes) + '\n',
            "abbreviations:\n",
            '\n'.join(" - " + entry for entry in abbr) + '\n',
        ]
        target = tmp_path / ('test_config' + suffix)
        target.write_text(dedent(''.join(sections)))
        return target

    return _create_config
def test_simple_variants(cfgfile):
    """Check the normalized variants produced for single names.

    Uses a configuration with the compound suffixes 'strasse', 'straße'
    and 'weg' plus abbreviation rules for 'strasse'/'straße' and
    'prospekt'.
    """
    config_path = cfgfile(['strasse', 'straße', 'weg'],
                          ['strasse,straße => str',
                           'prospekt => pr'])
    processor = ICUNameProcessor(
        ICUNameProcessorRules(loader=ICURuleLoader(config_path)))

    # Compared as a set: the order of multiple variants is not asserted.
    street_variants = set(processor.get_normalized_variants("Bauwegstraße"))
    assert street_variants == {'bauweg straße', 'bauweg str'}

    assert processor.get_normalized_variants("Bauwegstr") == ['bauweg str']
    assert processor.get_normalized_variants("holzweg") == ['holz weg']
    assert processor.get_normalized_variants("hallo") == ['hallo']
def test_multiple_replacements(cfgfile):
    """Check that several abbreviation rules are combined in one name.

    'saint' has two replacements and 'street' has one; together with the
    unabbreviated forms this must yield every combination of the two.
    """
    config_path = cfgfile([], ['saint => s,st', 'street => st'])
    processor = ICUNameProcessor(
        ICUNameProcessorRules(loader=ICURuleLoader(config_path)))

    expected_variants = {
        'saint johns street', 's johns street', 'st johns street',
        'saint johns st', 's johns st', 'st johns st',
    }
    found = set(processor.get_normalized_variants("Saint Johns Street"))
    assert found == expected_variants

View File

@@ -0,0 +1,75 @@
"""
Tests for converting a config file to ICU rules.
"""
import pytest
from textwrap import dedent
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
from nominatim.errors import UsageError
from icu import Transliterator
@pytest.fixture
def cfgfile(tmp_path, suffix='.yaml'):
    """Provide a factory that writes an ICU rule configuration file.

    The returned callable takes a list of compound suffixes and a list of
    abbreviation rules, writes them together with a fixed set of
    normalization and transliteration rules into
    ``tmp_path / ('test_config' + suffix)`` and returns the path of the
    written file.
    """
    # Fixed part of the configuration, shared by every generated file.
    fixed_rules = dedent("""\
        normalization:
        - ":: NFD ()"
        - "[[:Nonspacing Mark:] [:Cf:]] >"
        - ":: lower ()"
        - "[[:Punctuation:][:Space:]]+ > ' '"
        - ":: NFC ()"
        transliteration:
        - ":: Latin ()"
        """)

    def _create_config(suffixes, abbr):
        suffix_lines = '\n'.join(" - " + item for item in suffixes) + '\n'
        abbr_lines = '\n'.join(" - " + item for item in abbr) + '\n'
        yaml_text = ''.join((fixed_rules,
                             "compound_suffixes:\n", suffix_lines,
                             "abbreviations:\n", abbr_lines))

        out_file = tmp_path / ('test_config' + suffix)
        out_file.write_text(dedent(yaml_text))
        return out_file

    return _create_config
def test_missing_normalization(tmp_path):
    """Loading a config without a 'normalization' section raises UsageError.

    The only key in the file is spelled 'normalizatio', so the expected
    section is absent.
    """
    broken_config = tmp_path / 'test_config.yaml'
    broken_config.write_text(dedent("""\
        normalizatio:
        - ":: NFD ()"
        """))

    with pytest.raises(UsageError):
        ICURuleLoader(broken_config)
def test_get_search_rules(cfgfile):
    """Check the ICU rules returned by ``get_search_rules()``.

    The rules are compiled into an ICU transliterator, which is then
    applied to a set of sample terms.
    """
    config_path = cfgfile(['strasse', 'straße', 'weg'],
                          ['strasse,straße => str',
                           'prospekt => pr'])
    search_rules = ICURuleLoader(config_path).get_search_rules()
    translit = Transliterator.createFromRules("test", search_rules)

    # (input term, expected transliteration)
    cases = (
        (" Baumstraße ", " baum straße "),
        (" Baumstrasse ", " baum strasse "),
        (" Baumstr ", " baum str "),
        (" Baumwegstr ", " baumweg str "),
        (" Αθήνα ", " athēna "),
        (" проспект ", " prospekt "),
    )
    for term, expected in cases:
        assert translit.transliterate(term) == expected
def test_get_synonym_pairs(cfgfile):
    """Check the replacement pairs returned by ``get_replacement_pairs()``.

    The configuration uses capitalised suffixes and one abbreviation rule
    with two targets; the expected pairs are all lower-case.
    """
    config_path = cfgfile(['Weg', 'Strasse'],
                          ['Strasse => str,st'])
    pairs = ICURuleLoader(config_path).get_replacement_pairs()

    # Both 'strasse' entries are expected to map to the same variant set.
    strasse_variants = {' strasse ', ' str ', ' st '}
    expected_pairs = [
        (' strasse ', strasse_variants),
        ('strasse ', strasse_variants),
        ('weg ', {' weg '}),
    ]
    assert pairs == expected_pairs