From 4fd2e961b6daaabba02f6f720f01b918364e5500 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Sat, 26 Jun 2021 19:38:08 +0200 Subject: [PATCH] improve normalization Make sure all special symbols are removed during normalization already. Those won't be interpreted in any way because they are unlikely to be searched for. --- .../icu-rules/extended-unicode-to-asccii.yaml | 2 +- .../icu-rules/unicode-digits-to-decimal.yaml | 24 +++++++++++++++++++ settings/legacy_icu_tokenizer.yaml | 23 ++++++++++++------ 3 files changed, 41 insertions(+), 8 deletions(-) create mode 100644 settings/icu-rules/unicode-digits-to-decimal.yaml diff --git a/settings/icu-rules/extended-unicode-to-asccii.yaml b/settings/icu-rules/extended-unicode-to-asccii.yaml index 921874f5..959774d2 100644 --- a/settings/icu-rules/extended-unicode-to-asccii.yaml +++ b/settings/icu-rules/extended-unicode-to-asccii.yaml @@ -1,4 +1,4 @@ -- ":: Latin ()" +- "'ล‚' > 'l'" - "'ยช' > 'a'" - "'ยต' > 'u'" - "'ยบ' > 'o'" diff --git a/settings/icu-rules/unicode-digits-to-decimal.yaml b/settings/icu-rules/unicode-digits-to-decimal.yaml new file mode 100644 index 00000000..55b3274a --- /dev/null +++ b/settings/icu-rules/unicode-digits-to-decimal.yaml @@ -0,0 +1,24 @@ +- "[๐žฅ๐’ ฿€๐–ญ๊ค€๐–ฉ ๐‘“๐‘‘๐‘‹ฐ๐‘„ถ๊ฉ๊˜ แฑ€แญแฎฐแ แŸ แฅ†เผ เป๊งฐแ‚แชแช€แง๐‘ต๊ฏฐแฑ๐‘ฑ๐‘œฐ๐‘›€๐‘™๐‘‡๊ง๊ฃเทฆ๐‘ฆ๏ผ๐Ÿถ๐Ÿ˜๐Ÿฌ๐ŸŽ๐Ÿขโ‚€โ“ฟโ“ชโฐ] > 0" +- "[๐žฅ‘๐’ก฿๐–ญ‘๊ค๐–ฉก๐‘“‘๐‘‘‘๐‘‹ฑ๐‘„ท๊ฉ‘๊˜กแฑแญ‘แฎฑแ ‘แŸกแฅ‡เผกเป‘๊งฑแ‚‘แช‘แชแง‘๐‘ต‘๊ฏฑแฑ‘๐‘ฑ‘๐‘œฑ๐‘›๐‘™‘๐‘‡‘๊ง‘๊ฃ‘เทง๐‘ง๏ผ‘๐Ÿท๐Ÿ™๐Ÿญ๐Ÿ๐Ÿฃโ‚ยนโ‘ โ‘ดโ’ˆโถโž€โžŠโ“ต] > 1" +- "[๐žฅ’๐’ข฿‚๐–ญ’๊ค‚๐–ฉข๐‘“’๐‘‘’๐‘‹ฒ๐‘„ธ๊ฉ’๊˜ขแฑ‚แญ’แฎฒแ ’แŸขแฅˆเผขเป’๊งฒแ‚’แช’แช‚แง’๐‘ต’๊ฏฒแฑ’๐‘ฑ’๐‘œฒ๐‘›‚๐‘™’๐‘‡’๊ง’๊ฃ’เทจ๐‘จ๏ผ’๐Ÿธ๐Ÿš๐Ÿฎ๐Ÿ๐Ÿคโ‚‚ยฒโ‘กโ‘ตโ’‰โทโžโž‹โ“ถ] > 2" +- "[๐žฅ“๐’ฃ฿ƒ๐–ญ“๊คƒ๐–ฉฃ๐‘““๐‘‘“๐‘‹ณ๐‘„น๊ฉ“๊˜ฃแฑƒแญ“แฎณแ “แŸฃแฅ‰เผฃเป“๊งณแ‚“แช“แชƒแง“๐‘ต“๊ฏณแฑ“๐‘ฑ“๐‘œณ๐‘›ƒ๐‘™“๐‘‡“๊ง“๊ฃ“เทฉ๐‘ฉ๏ผ“๐Ÿน๐Ÿ›๐Ÿฏ๐Ÿ‘๐Ÿฅโ‚ƒยณโ‘ขโ‘ถโ’Šโธโž‚โžŒโ“ท] > 3" +- 
"[๐žฅ”๐’ค฿„๐–ญ”๊ค„๐–ฉค๐‘“”๐‘‘”๐‘‹ด๐‘„บ๊ฉ”๊˜คแฑ„แญ”แฎดแ ”แŸคแฅŠเผคเป”๊งดแ‚”แช”แช„แง”๐‘ต”๊ฏดแฑ”๐‘ฑ”๐‘œด๐‘›„๐‘™”๐‘‡”๊ง”๊ฃ”เทช๐‘ช๏ผ”๐Ÿบ๐Ÿœ๐Ÿฐ๐Ÿ’๐Ÿฆโ‚„โดโ‘ฃโ‘ทโ’‹โนโžƒโžโ“ธ] > 4" +- "[๐žฅ•๐’ฅ฿…๐–ญ•๊ค…๐–ฉฅ๐‘“•๐‘‘•๐‘‹ต๐‘„ป๊ฉ•๊˜ฅแฑ…แญ•แฎตแ •แŸฅแฅ‹เผฅเป•๊งตแ‚•แช•แช…แง•๐‘ต•๊ฏตแฑ•๐‘ฑ•๐‘œต๐‘›…๐‘™•๐‘‡•๊ง•๊ฃ•เทซ๐‘ซ๏ผ•๐Ÿป๐Ÿ๐Ÿฑ๐Ÿ“๐Ÿงโ‚…โตโ‘คโ‘ธโ’Œโบโž„โžŽโ“น] > 5" +- "[๐žฅ–๐’ฆ฿†๐–ญ–๊ค†๐–ฉฆ๐‘“–๐‘‘–๐‘‹ถ๐‘„ผ๊ฉ–๊˜ฆแฑ†แญ–แฎถแ –แŸฆแฅŒเผฆเป–๊งถแ‚–แช–แช†แง–๐‘ต–๊ฏถแฑ–๐‘ฑ–๐‘œถ๐‘›†๐‘™–๐‘‡–๊ง–๊ฃ–เทฌ๐‘ฌ๏ผ–๐Ÿผ๐Ÿž๐Ÿฒ๐Ÿ”๐Ÿจโ‚†โถโ‘ฅโ‘นโ’โปโž…โžโ“บ] > 6" +- "[๐žฅ—๐’ง฿‡๐–ญ—๊ค‡๐–ฉง๐‘“—๐‘‘—๐‘‹ท๐‘„ฝ๊ฉ—๊˜งแฑ‡แญ—แฎทแ —แŸงแฅเผงเป—๊งทแ‚—แช—แช‡แง—๐‘ต—๊ฏทแฑ—๐‘ฑ—๐‘œท๐‘›‡๐‘™—๐‘‡—๊ง—๊ฃ—เทญ๐‘ญ๏ผ—๐Ÿฝ๐ŸŸ๐Ÿณ๐Ÿ•๐Ÿฉโ‚‡โทโ‘ฆโ‘บโ’Žโผโž†โžโ“ป] > 7" +- "[๐žฅ˜๐’จ฿ˆ๐–ญ˜๊คˆ๐–ฉจ๐‘“˜๐‘‘˜๐‘‹ธ๐‘„พ๊ฉ˜๊˜จแฑˆแญ˜แฎธแ ˜แŸจแฅŽเผจเป˜๊งธแ‚˜แช˜แชˆแง˜๐‘ต˜๊ฏธแฑ˜๐‘ฑ˜๐‘œธ๐‘›ˆ๐‘™˜๐‘‡˜๊ง˜๊ฃ˜เทฎ๐‘ฎ๏ผ˜๐Ÿพ๐Ÿ ๐Ÿด๐Ÿ–๐Ÿชโ‚ˆโธโ‘งโ‘ปโ’โฝโž‡โž‘โ“ผ] > 8" +- "[๐žฅ™๐’ฉ฿‰๐–ญ™๊ค‰๐–ฉฉ๐‘“™๐‘‘™๐‘‹น๐‘„ฟ๊ฉ™๊˜ฉแฑ‰แญ™แฎนแ ™แŸฉแฅเผฉเป™๊งนแ‚™แช™แช‰แง™๐‘ต™๊ฏนแฑ™๐‘ฑ™๐‘œน๐‘›‰๐‘™™๐‘‡™๊ง™๊ฃ™เทฏ๐‘ฏ๏ผ™๐Ÿฟ๐Ÿก๐Ÿต๐Ÿ—๐Ÿซโ‚‰โนโ‘จโ‘ผโ’โพโžˆโž’โ“ฝ] > 9" +- "[๐‘œบโ‘ฉโ‘ฝโ’‘โฟโž‰โž“โ“พ] > '10'" +- "[โ‘ชโ‘พโ’’โ“ซ] > '11'" +- "[โ‘ซโ‘ฟโ’“โ“ฌ] > '12'" +- "[โ‘ฌโ’€โ’”โ“ญ] > '13'" +- "[โ‘ญโ’โ’•โ“ฎ] > '14'" +- "[โ‘ฎโ’‚โ’–โ“ฏ] > '15'" +- "[โ‘ฏโ’ƒโ’—โ“ฐ] > '16'" +- "[โ‘ฐโ’„โ’˜โ“ฑ] > '17'" +- "[โ‘ฑโ’…โ’™โ“ฒ] > '18'" +- "[โ‘ฒโ’†โ’šโ“ณ] > '19'" +- "[๐‘œปโ‘ณโ’‡โ’›โ“ด] > '20'" +- "โ… > ' 1/7'" +- "โ…‘ > ' 1/9'" +- "โ…’ > ' 1/10'" diff --git a/settings/legacy_icu_tokenizer.yaml b/settings/legacy_icu_tokenizer.yaml index a3f1c027..7972b156 100644 --- a/settings/legacy_icu_tokenizer.yaml +++ b/settings/legacy_icu_tokenizer.yaml @@ -1,20 +1,29 @@ normalization: - - ":: NFD ()" - - "[[:Nonspacing Mark:] [:Cf:]] >" - ":: lower ()" + - !include icu-rules/unicode-digits-to-decimal.yaml + - "'โ„–' > 'no'" + - "'nยฐ' > 'no'" + - "'nยบ' > 'no'" + - "ยช > a" + - "ยบ > o" + - "[[:Punctuation:][:Symbol:]] > ' '" - "รŸ > 'ss'" # German szet is 
unambiguously equal to double ss - "[[:Punctuation:][:Space:]]+ > ' '" - ":: NFC ()" + - "[^[:Letter:] [:Number:] [:Space:]] >" + - "[:Lm:] >" + - ":: [[:Number:]] Latin ()" + - ":: [[:Number:]] Ascii ();" + - ":: [[:Number:]] NFD ();" + - "[[:Nonspacing Mark:] [:Cf:]] >;" + - "[:Space:]+ > ' '" transliteration: + - ":: Latin ()" - !include icu-rules/extended-unicode-to-asccii.yaml - ":: Ascii ()" - ":: NFD ()" - "'' >" - "[[:Nonspacing Mark:] [:Cf:]] >" - "[^[:Ascii:]] >" - ":: lower ()" - "[[:Punctuation:][:Space:]]+ > ' '" - ":: NFC ()" + - "[:Space:]+ > ' '" variants: - words: - ~hal => hal