Files
Nominatim/settings/icu_tokenizer.yaml
Alexander Sapozhnikov 7f909dbbd8 Add replacement for Russian
2022-11-01 02:54:07 +05:00

211 lines
5.4 KiB
YAML
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

normalization:
- ":: lower ()"
- ":: Hans-Hant"
- !include icu-rules/unicode-digits-to-decimal.yaml
- "'№' > 'no'"
- "'n°' > 'no'"
- "'nº' > 'no'"
- "ª > a"
- "º > o"
- "[[:Punctuation:][:Symbol:]\u02bc] > ' '"
- "ß > 'ss'" # German szet is unambiguously equal to double ss
- "[^[:alnum:] [:Canonical_Combining_Class=Virama:] [:Space:]] >"
- "[:Lm:] >"
- ":: [[:Number:]] Latin ()"
- ":: [[:Number:]] Ascii ();"
- ":: [[:Number:]] NFD ();"
- "[[:Nonspacing Mark:] [:Cf:]] >;"
- "[:Space:]+ > ' '"
transliteration:
- ":: Latin ()"
- !include icu-rules/extended-unicode-to-asccii.yaml
- ":: Ascii ()"
- ":: NFD ()"
- ":: lower ()"
- "[^a-z0-9[:Space:]] >"
- ":: NFC ()"
sanitizers:
- step: clean-housenumbers
filter-kind:
- housenumber
- conscriptionnumber
- streetnumber
convert-to-name:
- (\A|.*,)[^\d,]{3,}(,.*|\Z)
- step: clean-postcodes
convert-to-address: yes
default-pattern: "[A-Z0-9- ]{3,12}"
- step: split-name-list
- step: strip-brace-terms
- step: tag-analyzer-by-language
filter-kind: [".*name.*"]
whitelist: [bg,ca,cs,da,de,el,en,es,et,eu,fi,fr,gl,hu,it,ja,mg,ms,nl,no,pl,pt,ro,ru,sk,sl,sv,tr,uk,vi]
use-defaults: all
mode: append
token-analysis:
- analyzer: generic
- id: "@housenumber"
analyzer: housenumbers
- id: "@postcode"
analyzer: postcodes
- id: bg
analyzer: generic
mode: variant-only
variants:
- !include icu-rules/variants-bg.yaml
- id: ca
analyzer: generic
mode: variant-only
variants:
- !include icu-rules/variants-ca.yaml
- id: cs
analyzer: generic
mode: variant-only
variants:
- !include icu-rules/variants-cs.yaml
- id: da
analyzer: generic
mode: variant-only
variants:
- !include icu-rules/variants-da.yaml
- id: de
analyzer: generic
mode: variant-only
variants:
- !include icu-rules/variants-de.yaml
mutations:
- pattern: ä
replacements: ["ä", "ae"]
- pattern: ö
replacements: ["ö", "oe"]
- pattern: ü
replacements: ["ü", "ue"]
- id: el
analyzer: generic
mode: variant-only
variants:
- !include icu-rules/variants-el.yaml
- id: en
analyzer: generic
mode: variant-only
variants:
- !include icu-rules/variants-en.yaml
- id: es
analyzer: generic
mode: variant-only
variants:
- !include icu-rules/variants-es.yaml
- id: et
analyzer: generic
mode: variant-only
variants:
- !include icu-rules/variants-et.yaml
- id: eu
analyzer: generic
mode: variant-only
variants:
- !include icu-rules/variants-eu.yaml
- id: fi
analyzer: generic
mode: variant-only
variants:
- !include icu-rules/variants-fi.yaml
- id: fr
analyzer: generic
mode: variant-only
variants:
- !include icu-rules/variants-fr.yaml
- id: gl
analyzer: generic
mode: variant-only
variants:
- !include icu-rules/variants-gl.yaml
- id: hu
analyzer: generic
mode: variant-only
variants:
- !include icu-rules/variants-hu.yaml
- id: it
analyzer: generic
mode: variant-only
variants:
- !include icu-rules/variants-it.yaml
- id: ja
analyzer: generic
mode: variant-only
variants:
- !include icu-rules/variants-ja.yaml
- id: mg
analyzer: generic
mode: variant-only
variants:
- !include icu-rules/variants-mg.yaml
- id: ms
analyzer: generic
mode: variant-only
variants:
- !include icu-rules/variants-ms.yaml
- id: nl
analyzer: generic
mode: variant-only
variants:
- !include icu-rules/variants-nl.yaml
- id: no
analyzer: generic
mode: variant-only
variants:
- !include icu-rules/variants-no.yaml
- id: pl
analyzer: generic
mode: variant-only
variants:
- !include icu-rules/variants-pl.yaml
- id: pt
analyzer: generic
mode: variant-only
variants:
- !include icu-rules/variants-pt.yaml
- id: ro
analyzer: generic
mode: variant-only
variants:
- !include icu-rules/variants-ro.yaml
- id: ru
analyzer: generic
mode: variant-only
variants:
- !include icu-rules/variants-ru.yaml
mutations:
- pattern: ё
replacements: ["ё", "е"]
- id: sk
analyzer: generic
mode: variant-only
variants:
- !include icu-rules/variants-sk.yaml
- id: sl
analyzer: generic
mode: variant-only
variants:
- !include icu-rules/variants-sl.yaml
- id: sv
analyzer: generic
mode: variant-only
variants:
- !include icu-rules/variants-sv.yaml
- id: tr
analyzer: generic
mode: variant-only
variants:
- !include icu-rules/variants-tr.yaml
- id: uk
analyzer: generic
mode: variant-only
variants:
- !include icu-rules/variants-uk.yaml
- id: vi
analyzer: generic
mode: variant-only
variants:
- !include icu-rules/variants-vi.yaml