only consider partials in multi-words for initial count

This makes it less likely that we exclude meaningful
words like 'hauptstrasse' just because they are frequent.
This commit is contained in:
Sarah Hoffmann
2021-06-26 11:57:09 +02:00
parent 5dd24b3ef0
commit b9fbfeff67
2 changed files with 3 additions and 3 deletions

View File

@@ -168,7 +168,8 @@ class LegacyICUTokenizer:
for name, cnt in cur:
terms = set()
for word in name_proc.get_variants_ascii(name_proc.get_normalized(name)):
terms.update(word.split())
if ' ' in word:
terms.update(word.split())
for term in terms:
words[term] += cnt