mirror of
https://github.com/osm-search/Nominatim.git
synced 2026-02-26 11:08:13 +00:00
define formal public Python interface for tokenizer
This introduces an abstract class for the Tokenizer/Analyzer for documentation purposes.
@@ -73,3 +73,67 @@ the saved tokens in the database. It then returns the list of possibly matching
tokens and the list of possible splits to the query parser. The parser uses
this information to compute all possible interpretations of the query and
rank them accordingly.

## Tokenizer API

The following section describes the functions that need to be implemented
for a custom tokenizer implementation.

!!! warning
    This API is currently in early alpha status. While it is meant to become
    a public API on which other tokenizers can be implemented, it is far from
    stable at the moment.

### Directory Structure

Nominatim expects two files for a tokenizer:

* `nominatim/tokenizer/<NAME>_tokenizer.py` containing the Python part of the
  implementation
* `lib-php/tokenizer/<NAME>_tokenizer.php` with the PHP part of the
  implementation

where `<NAME>` is a unique name for the tokenizer consisting only of lower-case
letters, digits and underscores. A tokenizer also needs to install some SQL
functions. By convention, these should be placed in `lib-sql/tokenizer`.

If the tokenizer has a default configuration file, it should be saved as
`settings/<NAME>_tokenizer.<SUFFIX>`.

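Following these conventions, the skeleton for a hypothetical tokenizer called `mytok` could be scaffolded like this (the name `mytok` and the YAML suffix are invented for illustration):

```shell
# Scaffold the files for a hypothetical tokenizer named "mytok".
# Paths follow the conventions above; run from the repository root.
mkdir -p nominatim/tokenizer lib-php/tokenizer lib-sql/tokenizer settings
touch nominatim/tokenizer/mytok_tokenizer.py   # Python part
touch lib-php/tokenizer/mytok_tokenizer.php    # PHP part
touch lib-sql/tokenizer/mytok_tokenizer.sql    # SQL functions, by convention
touch settings/mytok_tokenizer.yaml            # optional default configuration
```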
### Configuration and Persistence

Tokenizers may define custom settings for their configuration. All settings
must be prefixed with `NOMINATIM_TOKENIZER_`. Settings may be transient or
persistent. Transient settings are loaded from the configuration file when
Nominatim is started and may thus be changed at any time. Persistent settings
are tied to a database installation and must only be read at installation
time. If they are needed at runtime, they must be saved in the
`nominatim_properties` table and later loaded from there.

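The transient/persistent distinction can be sketched as follows. This is a schematic illustration only: the dictionary merely stands in for the `nominatim_properties` table, and the function names are invented, not part of the Nominatim API.

```python
# Sketch of the transient/persistent setting distinction described above.
# The dict stands in for the nominatim_properties table; save_persistent
# and load_persistent are invented names, not the Nominatim API.

nominatim_properties = {}  # stand-in for the database table


def save_persistent(name: str, value: str) -> None:
    """At installation time, freeze a setting into the 'table' so that
    later runs see the value the database was set up with."""
    nominatim_properties[name] = value


def load_persistent(name: str) -> str:
    """At runtime, read the setting back from the 'table', never from
    the (possibly changed) configuration file."""
    return nominatim_properties[name]


# Installation: the configured value is persisted with the database.
save_persistent("tokenizer_maxwordfreq", "50000")
# Runtime: the persisted value is used even if the config file changed since.
print(load_persistent("tokenizer_maxwordfreq"))
```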
### The Python module

The Python module is expected to export a single factory function:

```python
def create(dsn: str, data_dir: Path) -> AbstractTokenizer
```

The `dsn` parameter contains the DSN of the Nominatim database. The `data_dir`
is a directory in the project directory that the tokenizer may use to save
database-specific data. The function must return an instance of the tokenizer
class as defined below.

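A minimal sketch of such a module might look like this. `MyTokenizer` is a hypothetical placeholder; a real implementation must derive from `nominatim.tokenizer.base.AbstractTokenizer` and implement all of its abstract methods.

```python
# Hypothetical tokenizer module sketch: exports the create() factory.
from pathlib import Path


class MyTokenizer:
    """Stand-in for a real AbstractTokenizer subclass (illustration only)."""

    def __init__(self, dsn: str, data_dir: Path) -> None:
        self.dsn = dsn            # DSN of the Nominatim database
        self.data_dir = data_dir  # project subdirectory for tokenizer data


def create(dsn: str, data_dir: Path) -> 'MyTokenizer':
    """Factory function that Nominatim looks up in the tokenizer module."""
    return MyTokenizer(dsn, data_dir)


tokenizer = create("dbname=nominatim", Path("project") / "tokenizer")
```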
### Python Tokenizer Class

All tokenizers must inherit from `nominatim.tokenizer.base.AbstractTokenizer`
and implement the abstract functions defined there.

::: nominatim.tokenizer.base.AbstractTokenizer
    rendering:
        heading_level: 4

### Python Analyzer Class

::: nominatim.tokenizer.base.AbstractAnalyzer
    rendering:
        heading_level: 4

@@ -13,3 +13,11 @@ th, td {
 th {
     background-color: #eee;
 }
+
+/* Indentation for mkdocstrings.
+div.doc-contents:not(.first) {
+    padding-left: 25px;
+    border-left: 4px solid rgba(230, 230, 230);
+    margin-bottom: 60px;
+}*/
+
@@ -30,6 +30,7 @@ pages:
     - 'Architecture Overview' : 'develop/overview.md'
     - 'OSM Data Import' : 'develop/Import.md'
     - 'Place Ranking' : 'develop/Ranking.md'
+    - 'Tokenizers' : 'develop/Tokenizers.md'
     - 'Postcodes' : 'develop/Postcodes.md'
     - 'Testing' : 'develop/Testing.md'
     - 'External Data Sources': 'develop/data-sources.md'
@@ -44,3 +45,11 @@ markdown_extensions:
     - toc:
         permalink:
 extra_css: [extra.css, styles.css]
+
+plugins:
+    - search
+    - mkdocstrings:
+        handlers:
+            python:
+                rendering:
+                    show_source: false
+                    show_signature_annotations: false
nominatim/tokenizer/base.py (new file, 224 lines)
@@ -0,0 +1,224 @@
"""
Abstract class definitions for tokenizers. These base classes are here
mainly for documentation purposes.
"""
from abc import ABC, abstractmethod
from typing import List, Tuple, Dict, Any

from nominatim.config import Configuration

# pylint: disable=unnecessary-pass

class AbstractAnalyzer(ABC):
    """ The analyzer provides the functions for analysing names and building
        the token database.

        Analyzers are instantiated on a per-thread base. Access to global data
        structures must be synchronised accordingly.
    """

    def __enter__(self) -> 'AbstractAnalyzer':
        return self


    def __exit__(self, exc_type, exc_value, traceback) -> None:
        self.close()


    @abstractmethod
    def close(self) -> None:
        """ Free all resources used by the analyzer.
        """
        pass


    @abstractmethod
    def get_word_token_info(self, words: List[str]) -> List[Tuple[str, str, int]]:
        """ Return token information for the given list of words.

            The function is used for testing and debugging only
            and does not need to be particularly efficient.

            Arguments:
                words: A list of words to look up the tokens for.
                       If a word starts with '#' it is assumed to be a full
                       name, otherwise it is a partial term.

            Returns:
                The function returns the list of all tuples that could be
                found for the given words. Each list entry is a tuple of
                (original word, word token, word id).
        """
        pass


    @abstractmethod
    def normalize_postcode(self, postcode: str) -> str:
        """ Convert the postcode to its standardized form.

            This function must yield exactly the same result as the SQL
            function `token_normalized_postcode()`.

            Arguments:
                postcode: The postcode to be normalized.

            Returns:
                The given postcode after normalization.
        """
        pass


    @abstractmethod
    def update_postcodes_from_db(self) -> None:
        """ Update the tokenizer's postcode tokens from the current content
            of the `location_postcode` table.
        """
        pass


    @abstractmethod
    def update_special_phrases(self, phrases: List[Tuple[str, str, str, str]],
                               should_replace: bool) -> None:
        """ Update the tokenizer's special phrase tokens from the given
            list of special phrases.

            Arguments:
                phrases: The new list of special phrases. Each entry is
                         a tuple of (phrase, class, type, operator).
                should_replace: If true, replace the current list of phrases.
                                When false, just add the given phrases to the
                                ones that already exist.
        """
        pass


    @abstractmethod
    def add_country_names(self, country_code: str, names: Dict[str, str]):
        """ Add the given names to the tokenizer's list of country tokens.

            Arguments:
                country_code: two-letter country code for the country the
                              names refer to.
                names: Dictionary of name type to name.
        """
        pass


    @abstractmethod
    def process_place(self, place: Dict) -> Any:
        """ Extract tokens for the given place and compute the
            information to be handed to the PL/pgSQL processor for building
            the search index.

            Arguments:
                place: Dictionary with the information about the place.
                       Currently the following fields may be present:

                       - *name* is a dictionary of names for the place
                         together with the designation of the name.
                       - *address* is a dictionary of address terms.
                       - *country_feature* is set to a country code when the
                         place describes a country.

            Returns:
                A JSON-serialisable structure that will be handed into
                the database via the `token_info` field.
        """



class AbstractTokenizer(ABC):
    """ The tokenizer instance is the central instance of the tokenizer in
        the system. There will only be a single instance of the tokenizer
        active at any time.
    """

    @abstractmethod
    def init_new_db(self, config: Configuration, init_db: bool = True) -> None:
        """ Set up a new tokenizer for the database.

            The function should copy all necessary data into the project
            directory or save it in the property table to make sure that
            the tokenizer remains stable over updates.

            Arguments:
                config: Read-only object with configuration options.

                init_db: When set to False, then initialisation of database
                    tables should be skipped. This option is only required
                    for migration purposes and can be safely ignored by
                    custom tokenizers.

            TODO: can we move the init_db parameter somewhere else?
        """
        pass


    @abstractmethod
    def init_from_project(self) -> None:
        """ Initialise the tokenizer from an existing database setup.

            The function should load all previously saved configuration from
            the project directory and/or the property table.
        """
        pass


    @abstractmethod
    def finalize_import(self, config: Configuration) -> None:
        """ This function is called at the very end of an import when all
            data has been imported and indexed. The tokenizer may create
            at this point any additional indexes and data structures needed
            during query time.

            Arguments:
                config: Read-only object with configuration options.
        """
        pass


    @abstractmethod
    def update_sql_functions(self, config: Configuration) -> None:
        """ Update the SQL part of the tokenizer. This function is called
            automatically on migrations or may be called explicitly by the
            user through the `nominatim refresh --functions` command.

            The tokenizer must only update the code of the tokenizer. The
            data structures or data itself must not be changed by this
            function.

            Arguments:
                config: Read-only object with configuration options.
        """
        pass


    @abstractmethod
    def check_database(self) -> str:
        """ Check that the database is set up correctly and ready for being
            queried.

            Returns:
                If an issue was found, return an error message with the
                description of the issue as well as hints for the user on
                how to resolve the issue.

                Return `None`, if no issue was found.
        """
        pass


    @abstractmethod
    def name_analyzer(self) -> AbstractAnalyzer:
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

            ```
            with tokenizer.name_analyzer() as analyzer:
                analyzer.tokenize()
            ```

            When used outside the with construct, the caller must ensure to
            call the close() function before destructing the analyzer.
        """
        pass
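The context-manager behaviour that `AbstractAnalyzer` provides through `__enter__`/`__exit__` can be illustrated with a toy analyzer. The postcode normalization rule below is invented for the example and is not Nominatim's actual behaviour:

```python
class ToyAnalyzer:
    """Toy analyzer following the AbstractAnalyzer context-manager protocol."""

    def __init__(self) -> None:
        self.closed = False

    def __enter__(self) -> 'ToyAnalyzer':
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        self.close()

    def close(self) -> None:
        # Free all resources used by the analyzer.
        self.closed = True

    def normalize_postcode(self, postcode: str) -> str:
        # Invented rule for illustration: uppercase and drop spaces.
        return postcode.upper().replace(" ", "")


with ToyAnalyzer() as analyzer:
    result = analyzer.normalize_postcode("ab1 2cd")

print(result)           # AB12CD
print(analyzer.closed)  # True: close() ran when the with-block exited
```

Because the base class supplies `__enter__` and `__exit__`, concrete analyzers only need to implement `close()`, which is exactly why the duplicated methods are removed from the two legacy analyzers in the hunks below.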
@@ -16,6 +16,7 @@ from nominatim.db.utils import CopyBuffer
 from nominatim.db.sql_preprocessor import SQLPreprocessor
 from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
 from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules
+from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer

 DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
 DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"
@@ -28,7 +29,7 @@ def create(dsn, data_dir):
     return LegacyICUTokenizer(dsn, data_dir)


-class LegacyICUTokenizer:
+class LegacyICUTokenizer(AbstractTokenizer):
     """ This tokenizer uses libICU to convert names and queries to ASCII.
         Otherwise it uses the same algorithms and data structures as the
         normalization routines in Nominatim 3.
@@ -192,7 +193,7 @@ class LegacyICUTokenizer:
         return words


-class LegacyICUNameAnalyzer:
+class LegacyICUNameAnalyzer(AbstractAnalyzer):
     """ The legacy analyzer uses the ICU library for splitting names.

         Each instance opens a connection to the database to request the
@@ -207,14 +208,6 @@ class LegacyICUNameAnalyzer:
         self._cache = _TokenCache()


-    def __enter__(self):
-        return self
-
-
-    def __exit__(self, exc_type, exc_value, traceback):
-        self.close()
-
-
     def close(self):
         """ Free all resources used by the analyzer.
         """
@@ -16,6 +16,7 @@ from nominatim.db import properties
 from nominatim.db import utils as db_utils
 from nominatim.db.sql_preprocessor import SQLPreprocessor
 from nominatim.errors import UsageError
+from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer

 DBCFG_NORMALIZATION = "tokenizer_normalization"
 DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
@@ -76,7 +77,7 @@ def _check_module(module_dir, conn):
     raise UsageError("Database module cannot be accessed.") from err


-class LegacyTokenizer:
+class LegacyTokenizer(AbstractTokenizer):
     """ The legacy tokenizer uses a special PostgreSQL module to normalize
         names and queries. The tokenizer thus implements normalization
         through calls to the database.
@@ -238,7 +239,7 @@ class LegacyTokenizer:
     properties.set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)


-class LegacyNameAnalyzer:
+class LegacyNameAnalyzer(AbstractAnalyzer):
     """ The legacy analyzer uses the special PostgreSQL module for
         splitting names.

@@ -255,14 +256,6 @@ class LegacyNameAnalyzer:
         self._cache = _TokenCache(self.conn)


-    def __enter__(self):
-        return self
-
-
-    def __exit__(self, exc_type, exc_value, traceback):
-        self.close()
-
-
     def close(self):
         """ Free all resources used by the analyzer.
         """