define formal public Python interface for tokenizer

This introduces abstract base classes for the Tokenizer and Analyzer,
primarily for documentation purposes.
This commit is contained in:
Sarah Hoffmann
2021-08-10 14:51:35 +02:00
parent e25e268e2e
commit 90b40fc3e6
6 changed files with 311 additions and 20 deletions

View File

@@ -16,6 +16,7 @@ from nominatim.db.utils import CopyBuffer
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules
from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"
@@ -28,7 +29,7 @@ def create(dsn, data_dir):
return LegacyICUTokenizer(dsn, data_dir)
class LegacyICUTokenizer:
class LegacyICUTokenizer(AbstractTokenizer):
""" This tokenizer uses libICU to convert names and queries to ASCII.
Otherwise it uses the same algorithms and data structures as the
normalization routines in Nominatim 3.
@@ -192,7 +193,7 @@ class LegacyICUTokenizer:
return words
class LegacyICUNameAnalyzer:
class LegacyICUNameAnalyzer(AbstractAnalyzer):
""" The legacy analyzer uses the ICU library for splitting names.
Each instance opens a connection to the database to request the
@@ -207,14 +208,6 @@ class LegacyICUNameAnalyzer:
self._cache = _TokenCache()
def __enter__(self):
return self
def __exit__(self, exc_type, exc_value, traceback):
self.close()
def close(self):
""" Free all resources used by the analyzer.
"""