sanitizer: move helpers into a configuration class

2022-02-07 10:48:00 +01:00
parent a79a3210e6
commit 610f2cc254
10 changed files with 124 additions and 80 deletions
--- a/nominatim/tokenizer/sanitizers/config.py
+++ b/nominatim/tokenizer/sanitizers/config.py
@@ -0,0 +1,82 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2022 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Configuration for Sanitizers.
+"""
+from collections import UserDict
+import re
+
+from nominatim.errors import UsageError
+
+class SanitizerConfig(UserDict):
+    """ Dictionary with configuration options for a sanitizer.
+
+        In addition to the usual dictionary functions, the class provides
+        accessors to standard sanitizer options that are used by many of the
+        sanitizers.
+    """
+
+    def get_string_list(self, param, default=tuple()):
+        """ Extract a configuration parameter as a string list.
+
+            If the parameter value is a simple string, it is returned as a
+            one-item list. If the parameter value does not exist (or is None),
+            a list copy of the given default is returned; a default of None
+            is handed back as None. If the parameter value is a list or
+            tuple, it is checked to contain only strings before being
+            returned as a list.
+
+            Raises a UsageError when the value is neither a string nor a
+            list/tuple that consists only of strings.
+        """
+        values = self.data.get(param, None)
+
+        if values is None:
+            return None if default is None else list(default)
+
+        if isinstance(values, str):
+            return [values]
+
+        if not isinstance(values, (list, tuple)):
+            raise UsageError(f"Parameter '{param}' must be string or list of strings.")
+
+        if any(not isinstance(value, str) for value in values):
+            raise UsageError(f"Parameter '{param}' must be string or list of strings.")
+
+        # Always hand out a fresh list: a tuple value is converted to the
+        # documented return type and the caller cannot modify the stored
+        # configuration by accident.
+        return list(values)
+
+
+    def get_delimiter(self, default=',;'):
+        """ Return the 'delimiters' parameter in the configuration as a
+            compiled regular expression that can be used to split the names on the
+            delimiters. The regular expression makes sure that the resulting names
+            are stripped and that repeated delimiters
+            are ignored but it will still create empty fields on occasion. The
+            code needs to filter those.
+
+            The 'default' parameter defines the delimiter set to be used when
+            not explicitly configured.
+
+            Raises a UsageError when the delimiter set is empty.
+        """
+        delimiter_set = set(self.data.get('delimiters', default))
+        if not delimiter_set:
+            raise UsageError("Empty 'delimiter' parameter not allowed for sanitizer.")
+
+        # Use re.escape() instead of blindly prefixing a backslash: a plain
+        # backslash turns letters and digits into escape sequences (a
+        # delimiter 'd' would become '\d' and match digits instead of 'd').
+        # Sorting makes the generated pattern independent of set ordering.
+        char_class = ''.join(re.escape(d) for d in sorted(delimiter_set))
+
+        return re.compile(f'\\s*[{char_class}]+\\s*')
+
+
+    def get_filter_kind(self, *default):
+        """ Return a filter function for the name kind from the 'filter-kind'
+            config parameter. The filter function takes a name item and returns
+            True when the item passes the filter.
+
+            If the parameter is empty, the filter lets all items pass. If the
+            parameter is a string, it is interpreted as a single regular expression
+            that must match the full kind string. If the parameter is a list then
+            any of the regular expressions in the list must match to pass.
+
+            The positional 'default' arguments are the expressions to use
+            when 'filter-kind' is not configured.
+        """
+        filters = self.get_string_list('filter-kind', default)
+
+        if not filters:
+            return lambda _: True
+
+        # Compile once here so that the returned filter only has to match.
+        regexes = [re.compile(regex) for regex in filters]
+
+        return lambda name: any(regex.fullmatch(name.kind) for regex in regexes)