reorganize keyword creation for legacy tokenizer

- only save partial words without internal spaces - consider comma and semicolon a separator of full words - consider parts before an opening bracket a full word (but not the part after the bracket) Fixes #244.
2026-02-26 11:08:13 +00:00 · 2021-05-23 23:58:58 +02:00
parent fa3e48c59f
commit 4f4d15c28a
4 changed files with 85 additions and 29 deletions
--- a/nominatim/tokenizer/legacy_icu_tokenizer.py
+++ b/nominatim/tokenizer/legacy_icu_tokenizer.py
@@ -423,8 +423,7 @@ class LegacyICUNameAnalyzer:
        names = place.get('name')

        if names:
-            full_names = set((self.make_standard_word(name) for name in names.values()))
-            full_names.discard('')
+            full_names = self._compute_full_names(names)

            token_info.add_names(self.conn, full_names)

@@ -461,6 +460,25 @@ class LegacyICUNameAnalyzer:
        return token_info.data


+    def _compute_full_names(self, names):
+        """ Return the set of all full name word ids to be used with the
+            given dictionary of names.
+        """
+        full_names = set()
+        for name in (n for ns in names.values() for n in re.split('[;,]', ns)):
+            word = self.make_standard_word(name)
+            if word:
+                full_names.add(word)
+
+                brace_split = name.split('(', 2)
+                if len(brace_split) > 1:
+                    word = self.make_standard_word(brace_split[0])
+                    if word:
+                        full_names.add(word)
+
+        return full_names
+
+
    def _add_postcode(self, postcode):
        """ Make sure the normalized postcode is present in the word table.
        """