make compound decomposition pure import feature

Compound decomposition now creates a full name variant on
import just like abbreviations. This simplifies query time
normalization and opens a path for changing abbreviation
and compound decomposition lists for an existing database.
This commit is contained in:
Sarah Hoffmann
2021-06-11 10:03:31 +02:00
parent 9ff4f66f55
commit f70930b1a0
5 changed files with 35 additions and 38 deletions

View File

@@ -31,23 +31,13 @@ class ICURuleLoader:
def get_search_rules(self): def get_search_rules(self):
""" Return the ICU rules to be used during search. """ Return the ICU rules to be used during search.
The rules combine normalization, compound decomposition (including The rules combine normalization and transliteration.
abbreviated compounds) and transliteration.
""" """
# First apply the normalization rules. # First apply the normalization rules.
rules = io.StringIO() rules = io.StringIO()
rules.write(self.normalization_rules) rules.write(self.normalization_rules)
# For all compound suffixes: add them in their full and any abbreviated form. # Then add transliteration.
suffixes = set()
for suffix in self.compound_suffixes:
suffixes.add(suffix)
suffixes.update(self.abbreviations.get(suffix, []))
for suffix in sorted(suffixes, key=len, reverse=True):
rules.write("'{0} ' > ' {0} ';".format(suffix))
# Finally add transliteration.
rules.write(self.transliteration_rules) rules.write(self.transliteration_rules)
return rules.getvalue() return rules.getvalue()
@@ -69,6 +59,12 @@ class ICURuleLoader:
""" """
synonyms = defaultdict(set) synonyms = defaultdict(set)
# First add entries for compound decomposition.
for suffix in self.compound_suffixes:
variants = (suffix + ' ', ' ' + suffix + ' ')
for key in variants:
synonyms[key].update(variants)
for full, abbr in self.abbreviations.items(): for full, abbr in self.abbreviations.items():
key = ' ' + full + ' ' key = ' ' + full + ' '
# Entries in the abbreviation list always apply to full words: # Entries in the abbreviation list always apply to full words:
@@ -76,15 +72,15 @@ class ICURuleLoader:
# Replacements are optional, so add a noop # Replacements are optional, so add a noop
synonyms[key].add(key) synonyms[key].add(key)
# Entries in the compound list expand to themselves and to if full in self.compound_suffixes:
# abbreviations. # Full word abbreviating to compounded version.
for suffix in self.compound_suffixes: synonyms[key].update((a + ' ' for a in abbr))
keyset = synonyms[suffix + ' ']
keyset.add(' ' + suffix + ' ') key = full + ' '
keyset.update((' ' + a + ' ' for a in self.abbreviations.get(suffix, []))) # Uncompounded suffix abbreviating to decompounded version.
# The terms the entries are shortened to, need to be decompounded as well. synonyms[key].update((' ' + a + ' ' for a in abbr))
for abbr in self.abbreviations.get(suffix, []): # Uncompounded suffix abbreviating to compounded version.
synonyms[abbr + ' '].add(' ' + abbr + ' ') synonyms[key].update((a + ' ' for a in abbr))
# sort the resulting list by descending length (longer matches are preferred). # sort the resulting list by descending length (longer matches are preferred).
sorted_keys = sorted(synonyms.keys(), key=len, reverse=True) sorted_keys = sorted(synonyms.keys(), key=len, reverse=True)

View File

@@ -53,7 +53,7 @@ Feature: Import and search of names
Scenario: Special characters in name Scenario: Special characters in name
Given the places Given the places
| osm | class | type | name | | osm | class | type | name |
| N1 | place | locality | Jim-Knopf-Str | | N1 | place | locality | Jim-Knopf-Straße |
| N2 | place | locality | Smith/Weston | | N2 | place | locality | Smith/Weston |
| N3 | place | locality | space mountain | | N3 | place | locality | space mountain |
| N4 | place | locality | space | | N4 | place | locality | space |

View File

@@ -48,9 +48,10 @@ def test_simple_variants(cfgfile):
proc = ICUNameProcessor(rules) proc = ICUNameProcessor(rules)
assert set(get_normalized_variants(proc, "Bauwegstraße")) \ assert set(get_normalized_variants(proc, "Bauwegstraße")) \
== {'bauweg straße', 'bauweg str'} == {'bauweg straße', 'bauweg str', 'bauwegstraße', 'bauwegstr'}
assert get_normalized_variants(proc, "Bauwegstr") == ['bauweg str'] assert get_normalized_variants(proc, "Bauwegstr") == ['bauwegstr']
assert get_normalized_variants(proc, "holzweg") == ['holz weg'] assert set(get_normalized_variants(proc, "holzweg")) \
== {'holz weg', 'holzweg'}
assert get_normalized_variants(proc, "hallo") == ['hallo'] assert get_normalized_variants(proc, "hallo") == ['hallo']
@@ -82,6 +83,6 @@ def test_search_normalized(cfgfile):
rules = ICUNameProcessorRules(loader=ICURuleLoader(fpath)) rules = ICUNameProcessorRules(loader=ICURuleLoader(fpath))
proc = ICUNameProcessor(rules) proc = ICUNameProcessor(rules)
assert proc.get_search_normalized('Master Street') == 'master street' assert proc.get_search_normalized('Master Street') == 'master street'
assert proc.get_search_normalized('Earnes St') == 'earne s st' assert proc.get_search_normalized('Earnes St') == 'earnes st'
assert proc.get_search_normalized('Nostreet') == 'no street' assert proc.get_search_normalized('Nostreet') == 'nostreet'

View File

@@ -91,10 +91,10 @@ def test_get_search_rules(cfgfile):
trans = Transliterator.createFromRules("test", rules) trans = Transliterator.createFromRules("test", rules)
assert trans.transliterate(" Baum straße ") == " baum straße " assert trans.transliterate(" Baum straße ") == " baum straße "
assert trans.transliterate(" Baumstraße ") == " baum straße " assert trans.transliterate(" Baumstraße ") == " baumstraße "
assert trans.transliterate(" Baumstrasse ") == " baum strasse " assert trans.transliterate(" Baumstrasse ") == " baumstrasse "
assert trans.transliterate(" Baumstr ") == " baum str " assert trans.transliterate(" Baumstr ") == " baumstr "
assert trans.transliterate(" Baumwegstr ") == " baumweg str " assert trans.transliterate(" Baumwegstr ") == " baumwegstr "
assert trans.transliterate(" Αθήνα ") == " athēna " assert trans.transliterate(" Αθήνα ") == " athēna "
assert trans.transliterate(" проспект ") == " prospekt " assert trans.transliterate(" проспект ") == " prospekt "
@@ -128,11 +128,10 @@ def test_get_replacement_pairs_multi_to(cfgfile):
repl = ICURuleLoader(fpath).get_replacement_pairs() repl = ICURuleLoader(fpath).get_replacement_pairs()
assert [(a, sorted(b)) for a, b in repl] == \ assert [(a, sorted(b)) for a, b in repl] == \
[(' strasse ', [' st ', ' str ', ' strasse ']), [(' strasse ', [' st ', ' str ', ' strasse ', 'st ', 'str ', 'strasse ']),
('strasse ', [' st ', ' str ', ' strasse ']), ('strasse ', [' st ', ' str ', ' strasse ', 'st ', 'str ', 'strasse ']),
('pfad ', [' pfad ']), (' pfad ', [' pfad ', 'pfad ']),
('str ' , [' str ']), ('pfad ', [' pfad ', 'pfad '])]
('st ' , [' st '])]
def test_get_replacement_pairs_multi_from(cfgfile): def test_get_replacement_pairs_multi_from(cfgfile):

View File

@@ -151,8 +151,9 @@ def test_init_word_table(tokenizer_factory, test_config, place_row, word_table):
tok = tokenizer_factory() tok = tokenizer_factory()
tok.init_new_db(test_config) tok.init_new_db(test_config)
assert word_table.get_partial_words() == {('te', 1), ('st', 1), ('52', 1), assert word_table.get_partial_words() == {('test', 1), ('52', 1),
('no', 1), ('area', 2), ('no', 1), ('area', 2),
('holzstrasse', 1), ('holzstr', 1),
('holz', 1), ('strasse', 1), ('holz', 1), ('strasse', 1),
('str', 1)} ('str', 1)}