# Patch summary (commentary placed before the `diff --git` header).
#
# Target: settings/icu_tokenizer.yaml, one hunk at line 7 in the
# `normalization:` list.
#
# Change: the rule that rewrites punctuation and symbols to a single space
#   "[[:Punctuation:][:Symbol:]] > ' '"
# gains one extra member in its character set:
#   "[[:Punctuation:][:Symbol:]\u02bc] > ' '"
# The rule is a double-quoted YAML scalar, so `\u02bc` is a YAML escape and
# is decoded to U+02BC (MODIFIER LETTER APOSTROPHE) when the file is loaded.
#
# NOTE(review): U+02BC has Unicode general category Lm, and the list contains
# a later "[:Lm:] >" rule (visible in the trailing context). Presumably the
# point of this change is to turn the character into a separator before that
# rule can drop it -- confirm against the consumer's rule-ordering semantics.
#
# NOTE(review): the context line for the "ß > 'ss'" rule carries a comment
# with the misspelling "unimbigiously" (for "unambiguously"). It is hunk
# context and has to match the target file, so it is left untouched here;
# correct it in the target file in a separate change.
#
# NOTE(review): this copy of the patch has its line breaks collapsed into
# spaces. The original indentation of the YAML list items is not recoverable
# from this text; regenerate the patch from the repository rather than
# re-wrapping it by hand.
diff --git a/settings/icu_tokenizer.yaml b/settings/icu_tokenizer.yaml index e5cbeb6f..a3c62e67 100644 --- a/settings/icu_tokenizer.yaml +++ b/settings/icu_tokenizer.yaml @@ -7,7 +7,7 @@ normalization: - "'nº' > 'no'" - "ª > a" - "º > o" - - "[[:Punctuation:][:Symbol:]] > ' '" + - "[[:Punctuation:][:Symbol:]\u02bc] > ' '" - "ß > 'ss'" # German szet is unimbigiously equal to double ss - "[^[:Letter:] [:Number:] [:Space:]] >" - "[:Lm:] >"