improve normalization

Make sure all special symbols are removed during normalization already.
Those won't be interpreted in any way because they are unlikely to be
searched for.
This commit is contained in:
Sarah Hoffmann
2021-06-26 19:38:08 +02:00
parent b9fbfeff67
commit 4fd2e961b6
3 changed files with 41 additions and 8 deletions

View File

@@ -1,20 +1,29 @@
normalization:
- ":: NFD ()"
- "[[:Nonspacing Mark:] [:Cf:]] >"
- ":: lower ()"
- !include icu-rules/unicode-digits-to-decimal.yaml
- "'№' > 'no'"
- "'n°' > 'no'"
- "'nº' > 'no'"
- "ª > a"
- "º > o"
- "[[:Punctuation:][:Symbol:]] > ' '"
- "ß > 'ss'" # German szet is unimbigiously equal to double ss
- "[[:Punctuation:][:Space:]]+ > ' '"
- ":: NFC ()"
- "[^[:Letter:] [:Number:] [:Space:]] >"
- "[:Lm:] >"
- ":: [[:Number:]] Latin ()"
- ":: [[:Number:]] Ascii ();"
- ":: [[:Number:]] NFD ();"
- "[[:Nonspacing Mark:] [:Cf:]] >;"
- "[:Space:]+ > ' '"
transliteration:
- ":: Latin ()"
- !include icu-rules/extended-unicode-to-asccii.yaml
- ":: Ascii ()"
- ":: NFD ()"
- "'' >"
- "[[:Nonspacing Mark:] [:Cf:]] >"
- "[^[:Ascii:]] >"
- ":: lower ()"
- "[[:Punctuation:][:Space:]]+ > ' '"
- ":: NFC ()"
- "[:Space:]+ > ' '"
variants:
- words:
- ~hal => hal