use analyzer provided in the 'analyzer' property

Implements per-name choice of analyzer. If a non-default analyzer is chosen, then the 'word' identifier is extended with the name of the analyzer, so that we still have unique items.
2026-02-26 11:08:13 +00:00 · 2021-10-05 14:10:32 +02:00
parent 92f6ec2328
commit d35400a7d7
6 changed files with 50 additions and 51 deletions
--- a/nominatim/tokenizer/icu_token_analysis.py
+++ b/nominatim/tokenizer/icu_token_analysis.py
@@ -0,0 +1,23 @@
+"""
+Container class collecting all components required to transform an OSM name
+into a Nominatim token.
+"""
+
+from icu import Transliterator
+
+class ICUTokenAnalysis:
+    """ Container class collecting the transliterators and token analysis
+        modules for a single NameAnalyser instance.
+    """
+
+    def __init__(self, norm_rules, trans_rules, analysis_rules):
+        self.normalizer = Transliterator.createFromRules("icu_normalization",
+                                                         norm_rules)
+        trans_rules += ";[:Space:]+ > ' '"
+        self.to_ascii = Transliterator.createFromRules("icu_to_ascii",
+                                                       trans_rules)
+        self.search = Transliterator.createFromRules("icu_search",
+                                                     norm_rules + trans_rules)
+
+        self.analysis = {name: arules.create(self.to_ascii, arules.config)
+                         for name, arules in analysis_rules.items()}