mirror of
https://github.com/osm-search/Nominatim.git
synced 2026-03-11 05:14:07 +00:00
make compound decomposition a pure import feature
Compound decomposition now creates a full name variant on import just like abbreviations. This simplifies query-time normalization and opens a path for changing abbreviation and compound decomposition lists for an existing database.
This commit is contained in:
@@ -31,23 +31,13 @@ class ICURuleLoader:
|
|||||||
|
|
||||||
def get_search_rules(self):
|
def get_search_rules(self):
|
||||||
""" Return the ICU rules to be used during search.
|
""" Return the ICU rules to be used during search.
|
||||||
The rules combine normalization, compound decomposition (including
|
The rules combine normalization and transliteration.
|
||||||
abbreviated compounds) and transliteration.
|
|
||||||
"""
|
"""
|
||||||
# First apply the normalization rules.
|
# First apply the normalization rules.
|
||||||
rules = io.StringIO()
|
rules = io.StringIO()
|
||||||
rules.write(self.normalization_rules)
|
rules.write(self.normalization_rules)
|
||||||
|
|
||||||
# For all compound suffixes: add them in their full and any abbreviated form.
|
# Then add transliteration.
|
||||||
suffixes = set()
|
|
||||||
for suffix in self.compound_suffixes:
|
|
||||||
suffixes.add(suffix)
|
|
||||||
suffixes.update(self.abbreviations.get(suffix, []))
|
|
||||||
|
|
||||||
for suffix in sorted(suffixes, key=len, reverse=True):
|
|
||||||
rules.write("'{0} ' > ' {0} ';".format(suffix))
|
|
||||||
|
|
||||||
# Finally add transliteration.
|
|
||||||
rules.write(self.transliteration_rules)
|
rules.write(self.transliteration_rules)
|
||||||
return rules.getvalue()
|
return rules.getvalue()
|
||||||
|
|
||||||
@@ -69,6 +59,12 @@ class ICURuleLoader:
|
|||||||
"""
|
"""
|
||||||
synonyms = defaultdict(set)
|
synonyms = defaultdict(set)
|
||||||
|
|
||||||
|
# First add entries for compound decomposition.
|
||||||
|
for suffix in self.compound_suffixes:
|
||||||
|
variants = (suffix + ' ', ' ' + suffix + ' ')
|
||||||
|
for key in variants:
|
||||||
|
synonyms[key].update(variants)
|
||||||
|
|
||||||
for full, abbr in self.abbreviations.items():
|
for full, abbr in self.abbreviations.items():
|
||||||
key = ' ' + full + ' '
|
key = ' ' + full + ' '
|
||||||
# Entries in the abbreviation list always apply to full words:
|
# Entries in the abbreviation list always apply to full words:
|
||||||
@@ -76,15 +72,15 @@ class ICURuleLoader:
|
|||||||
# Replacements are optional, so add a noop
|
# Replacements are optional, so add a noop
|
||||||
synonyms[key].add(key)
|
synonyms[key].add(key)
|
||||||
|
|
||||||
# Entries in the compound list expand to themselves and to
|
if full in self.compound_suffixes:
|
||||||
# abbreviations.
|
# Full word abbreviating to compunded version.
|
||||||
for suffix in self.compound_suffixes:
|
synonyms[key].update((a + ' ' for a in abbr))
|
||||||
keyset = synonyms[suffix + ' ']
|
|
||||||
keyset.add(' ' + suffix + ' ')
|
key = full + ' '
|
||||||
keyset.update((' ' + a + ' ' for a in self.abbreviations.get(suffix, [])))
|
# Uncompunded suffix abbrevitating to decompounded version.
|
||||||
# The terms the entries are shortended to, need to be decompunded as well.
|
synonyms[key].update((' ' + a + ' ' for a in abbr))
|
||||||
for abbr in self.abbreviations.get(suffix, []):
|
# Uncompunded suffix abbrevitating to compunded version.
|
||||||
synonyms[abbr + ' '].add(' ' + abbr + ' ')
|
synonyms[key].update((a + ' ' for a in abbr))
|
||||||
|
|
||||||
# sort the resulting list by descending length (longer matches are prefered).
|
# sort the resulting list by descending length (longer matches are prefered).
|
||||||
sorted_keys = sorted(synonyms.keys(), key=len, reverse=True)
|
sorted_keys = sorted(synonyms.keys(), key=len, reverse=True)
|
||||||
|
|||||||
@@ -53,7 +53,7 @@ Feature: Import and search of names
|
|||||||
Scenario: Special characters in name
|
Scenario: Special characters in name
|
||||||
Given the places
|
Given the places
|
||||||
| osm | class | type | name |
|
| osm | class | type | name |
|
||||||
| N1 | place | locality | Jim-Knopf-Str |
|
| N1 | place | locality | Jim-Knopf-Straße |
|
||||||
| N2 | place | locality | Smith/Weston |
|
| N2 | place | locality | Smith/Weston |
|
||||||
| N3 | place | locality | space mountain |
|
| N3 | place | locality | space mountain |
|
||||||
| N4 | place | locality | space |
|
| N4 | place | locality | space |
|
||||||
|
|||||||
@@ -48,9 +48,10 @@ def test_simple_variants(cfgfile):
|
|||||||
proc = ICUNameProcessor(rules)
|
proc = ICUNameProcessor(rules)
|
||||||
|
|
||||||
assert set(get_normalized_variants(proc, "Bauwegstraße")) \
|
assert set(get_normalized_variants(proc, "Bauwegstraße")) \
|
||||||
== {'bauweg straße', 'bauweg str'}
|
== {'bauweg straße', 'bauweg str', 'bauwegstraße', 'bauwegstr'}
|
||||||
assert get_normalized_variants(proc, "Bauwegstr") == ['bauweg str']
|
assert get_normalized_variants(proc, "Bauwegstr") == ['bauwegstr']
|
||||||
assert get_normalized_variants(proc, "holzweg") == ['holz weg']
|
assert set(get_normalized_variants(proc, "holzweg")) \
|
||||||
|
== {'holz weg', 'holzweg'}
|
||||||
assert get_normalized_variants(proc, "hallo") == ['hallo']
|
assert get_normalized_variants(proc, "hallo") == ['hallo']
|
||||||
|
|
||||||
|
|
||||||
@@ -82,6 +83,6 @@ def test_search_normalized(cfgfile):
|
|||||||
rules = ICUNameProcessorRules(loader=ICURuleLoader(fpath))
|
rules = ICUNameProcessorRules(loader=ICURuleLoader(fpath))
|
||||||
proc = ICUNameProcessor(rules)
|
proc = ICUNameProcessor(rules)
|
||||||
|
|
||||||
assert proc.get_search_normalized('Master Street') == 'master street'
|
assert proc.get_search_normalized('Master Street') == 'master street'
|
||||||
assert proc.get_search_normalized('Earnes St') == 'earne s st'
|
assert proc.get_search_normalized('Earnes St') == 'earnes st'
|
||||||
assert proc.get_search_normalized('Nostreet') == 'no street'
|
assert proc.get_search_normalized('Nostreet') == 'nostreet'
|
||||||
|
|||||||
@@ -91,10 +91,10 @@ def test_get_search_rules(cfgfile):
|
|||||||
trans = Transliterator.createFromRules("test", rules)
|
trans = Transliterator.createFromRules("test", rules)
|
||||||
|
|
||||||
assert trans.transliterate(" Baum straße ") == " baum straße "
|
assert trans.transliterate(" Baum straße ") == " baum straße "
|
||||||
assert trans.transliterate(" Baumstraße ") == " baum straße "
|
assert trans.transliterate(" Baumstraße ") == " baumstraße "
|
||||||
assert trans.transliterate(" Baumstrasse ") == " baum strasse "
|
assert trans.transliterate(" Baumstrasse ") == " baumstrasse "
|
||||||
assert trans.transliterate(" Baumstr ") == " baum str "
|
assert trans.transliterate(" Baumstr ") == " baumstr "
|
||||||
assert trans.transliterate(" Baumwegstr ") == " baumweg str "
|
assert trans.transliterate(" Baumwegstr ") == " baumwegstr "
|
||||||
assert trans.transliterate(" Αθήνα ") == " athēna "
|
assert trans.transliterate(" Αθήνα ") == " athēna "
|
||||||
assert trans.transliterate(" проспект ") == " prospekt "
|
assert trans.transliterate(" проспект ") == " prospekt "
|
||||||
|
|
||||||
@@ -128,11 +128,10 @@ def test_get_replacement_pairs_multi_to(cfgfile):
|
|||||||
repl = ICURuleLoader(fpath).get_replacement_pairs()
|
repl = ICURuleLoader(fpath).get_replacement_pairs()
|
||||||
|
|
||||||
assert [(a, sorted(b)) for a, b in repl] == \
|
assert [(a, sorted(b)) for a, b in repl] == \
|
||||||
[(' strasse ', [' st ', ' str ', ' strasse ']),
|
[(' strasse ', [' st ', ' str ', ' strasse ', 'st ', 'str ', 'strasse ']),
|
||||||
('strasse ', [' st ', ' str ', ' strasse ']),
|
('strasse ', [' st ', ' str ', ' strasse ', 'st ', 'str ', 'strasse ']),
|
||||||
('pfad ', [' pfad ']),
|
(' pfad ', [' pfad ', 'pfad ']),
|
||||||
('str ' , [' str ']),
|
('pfad ', [' pfad ', 'pfad '])]
|
||||||
('st ' , [' st '])]
|
|
||||||
|
|
||||||
|
|
||||||
def test_get_replacement_pairs_multi_from(cfgfile):
|
def test_get_replacement_pairs_multi_from(cfgfile):
|
||||||
|
|||||||
@@ -151,8 +151,9 @@ def test_init_word_table(tokenizer_factory, test_config, place_row, word_table):
|
|||||||
tok = tokenizer_factory()
|
tok = tokenizer_factory()
|
||||||
tok.init_new_db(test_config)
|
tok.init_new_db(test_config)
|
||||||
|
|
||||||
assert word_table.get_partial_words() == {('te', 1), ('st', 1), ('52', 1),
|
assert word_table.get_partial_words() == {('test', 1), ('52', 1),
|
||||||
('no', 1), ('area', 2),
|
('no', 1), ('area', 2),
|
||||||
|
('holzstrasse', 1), ('holzstr', 1),
|
||||||
('holz', 1), ('strasse', 1),
|
('holz', 1), ('strasse', 1),
|
||||||
('str', 1)}
|
('str', 1)}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user