define formal public Python interface for tokenizer

This introduces abstract base classes for the tokenizer and the analyzer (`AbstractTokenizer` and `AbstractAnalyzer`) for documentation purposes.
2026-03-12 05:44:06 +00:00 · 2021-08-10 14:51:35 +02:00
parent e25e268e2e
commit 90b40fc3e6
6 changed files with 311 additions and 20 deletions
--- a/nominatim/tokenizer/legacy_tokenizer.py
+++ b/nominatim/tokenizer/legacy_tokenizer.py
@@ -16,6 +16,7 @@ from nominatim.db import properties
 from nominatim.db import utils as db_utils
 from nominatim.db.sql_preprocessor import SQLPreprocessor
 from nominatim.errors import UsageError
+from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer

 DBCFG_NORMALIZATION = "tokenizer_normalization"
 DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
@@ -76,7 +77,7 @@ def _check_module(module_dir, conn):
            raise UsageError("Database module cannot be accessed.") from err


-class LegacyTokenizer:
+class LegacyTokenizer(AbstractTokenizer):
    """ The legacy tokenizer uses a special PostgreSQL module to normalize
        names and queries. The tokenizer thus implements normalization through
        calls to the database.
@@ -238,7 +239,7 @@ class LegacyTokenizer:
        properties.set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)


-class LegacyNameAnalyzer:
+class LegacyNameAnalyzer(AbstractAnalyzer):
    """ The legacy analyzer uses the special Postgresql module for
        splitting names.

@@ -255,14 +256,6 @@ class LegacyNameAnalyzer:
        self._cache = _TokenCache(self.conn)


-    def __enter__(self):
-        return self
-
-
-    def __exit__(self, exc_type, exc_value, traceback):
-        self.close()
-
-
    def close(self):
        """ Free all resources used by the analyzer.
        """