Merge pull request #3702 from lonvia/remove-tokenizer-dir

Remove automatic setup of tokenizer directory

Until now, the tokenizer factory created a directory for the tokenizer's private data and then handed the directory location over to the tokenizer.

The ICU tokenizer no longer needs any extra data, so it makes no sense to create a directory that then remains empty. If a tokenizer needs such a directory in the future, it must create it on its own and correctly handle the situation where no project directory is used at all. A sketch of what that could look like follows below.
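A minimal sketch of a future tokenizer that still keeps private on-disk data and manages the directory itself. The class and helper names (`MyTokenizer`, `_ensure_data_dir`) are hypothetical; only the reduced `create(dsn)` entry point and the fact that a project directory may be absent are taken from this change.

```python
from pathlib import Path
from typing import Optional


class MyTokenizer:
    """ Hypothetical tokenizer that still keeps private data on disk.
        (A real implementation would derive from AbstractTokenizer; the
        remaining methods are omitted here.)
    """

    def __init__(self, dsn: str) -> None:
        self.dsn = dsn
        self.data_dir: Optional[Path] = None

    def _ensure_data_dir(self, project_dir: Optional[Path]) -> None:
        # The factory no longer sets up a directory, so create it here
        # and cope with the case where no project directory is used.
        if project_dir is None:
            self.data_dir = None        # run without on-disk data
            return
        self.data_dir = project_dir / 'tokenizer'
        self.data_dir.mkdir(parents=True, exist_ok=True)


def create(dsn: str) -> MyTokenizer:
    """ Factory entry point matching the reduced create(dsn) signature. """
    return MyTokenizer(dsn)
```

Such a tokenizer would call `_ensure_data_dir(config.project_dir)` from its `init_new_db()` and `init_from_project()` hooks, where `config.project_dir` may legitimately be unset.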
Sarah Hoffmann authored on 2025-04-03 09:04:48 +02:00, committed by GitHub
8 changed files with 16 additions and 52 deletions

View File

@@ -2,7 +2,7 @@
 #
 # This file is part of Nominatim. (https://nominatim.org)
 #
-# Copyright (C) 2024 by the Nominatim developer community.
+# Copyright (C) 2025 by the Nominatim developer community.
 # For a full list of authors see the git log.
 """
 Abstract class definitions for tokenizers. These base classes are here
@@ -10,7 +10,6 @@ mainly for documentation purposes.
 """
 from abc import ABC, abstractmethod
 from typing import List, Tuple, Dict, Any, Optional, Iterable
-from pathlib import Path
 
 from ..typing import Protocol
 from ..config import Configuration
@@ -232,6 +231,6 @@ class TokenizerModule(Protocol):
         own tokenizer.
     """
 
-    def create(self, dsn: str, data_dir: Path) -> AbstractTokenizer:
+    def create(self, dsn: str) -> AbstractTokenizer:
         """ Factory for new tokenizers.
         """

View File

@@ -2,7 +2,7 @@
 #
 # This file is part of Nominatim. (https://nominatim.org)
 #
-# Copyright (C) 2024 by the Nominatim developer community.
+# Copyright (C) 2025 by the Nominatim developer community.
 # For a full list of authors see the git log.
 """
 Functions for creating a tokenizer or initialising the right one for an
@@ -52,19 +52,10 @@ def create_tokenizer(config: Configuration, init_db: bool = True,
     if module_name is None:
         module_name = config.TOKENIZER
 
-    # Create the directory for the tokenizer data
-    assert config.project_dir is not None
-    basedir = config.project_dir / 'tokenizer'
-    if not basedir.exists():
-        basedir.mkdir()
-    elif not basedir.is_dir():
-        LOG.fatal("Tokenizer directory '%s' cannot be created.", basedir)
-        raise UsageError("Tokenizer setup failed.")
-
     # Import and initialize the tokenizer.
     tokenizer_module = _import_tokenizer(module_name)
 
-    tokenizer = tokenizer_module.create(config.get_libpq_dsn(), basedir)
+    tokenizer = tokenizer_module.create(config.get_libpq_dsn())
     tokenizer.init_new_db(config, init_db=init_db)
 
     with connect(config.get_libpq_dsn()) as conn:
@@ -80,10 +71,6 @@ def get_tokenizer_for_db(config: Configuration) -> AbstractTokenizer:
         and initialises it.
     """
     assert config.project_dir is not None
-    basedir = config.project_dir / 'tokenizer'
-    if not basedir.is_dir():
-        # Directory will be repopulated by tokenizer below.
-        basedir.mkdir()
 
     with connect(config.get_libpq_dsn()) as conn:
         name = properties.get_property(conn, 'tokenizer')
@@ -94,7 +81,7 @@ def get_tokenizer_for_db(config: Configuration) -> AbstractTokenizer:
     tokenizer_module = _import_tokenizer(name)
 
-    tokenizer = tokenizer_module.create(config.get_libpq_dsn(), basedir)
+    tokenizer = tokenizer_module.create(config.get_libpq_dsn())
     tokenizer.init_from_project(config)
 
     return tokenizer

View File

@@ -2,7 +2,7 @@
 #
 # This file is part of Nominatim. (https://nominatim.org)
 #
-# Copyright (C) 2024 by the Nominatim developer community.
+# Copyright (C) 2025 by the Nominatim developer community.
 # For a full list of authors see the git log.
 """
 Tokenizer implementing normalisation as used before Nominatim 4 but using
@@ -12,7 +12,6 @@ from typing import Optional, Sequence, List, Tuple, Mapping, Any, cast, \
                    Dict, Set, Iterable
 import itertools
 import logging
-from pathlib import Path
 
 from psycopg.types.json import Jsonb
 from psycopg import sql as pysql
@@ -38,10 +37,10 @@ WORD_TYPES = (('country_names', 'C'),
               ('housenumbers', 'H'))
 
-def create(dsn: str, data_dir: Path) -> 'ICUTokenizer':
+def create(dsn: str) -> 'ICUTokenizer':
     """ Create a new instance of the tokenizer provided by this module.
    """
-    return ICUTokenizer(dsn, data_dir)
+    return ICUTokenizer(dsn)
 
 
 class ICUTokenizer(AbstractTokenizer):
@@ -50,9 +49,8 @@ class ICUTokenizer(AbstractTokenizer):
        normalization routines in Nominatim 3.
    """
 
-    def __init__(self, dsn: str, data_dir: Path) -> None:
+    def __init__(self, dsn: str) -> None:
         self.dsn = dsn
-        self.data_dir = data_dir
         self.loader: Optional[ICURuleLoader] = None
 
     def init_new_db(self, config: Configuration, init_db: bool = True) -> None:

View File

@@ -234,6 +234,6 @@ def tokenizer_mock(monkeypatch, property_table):
     property_table.set('tokenizer', 'dummy')
 
     def _create_tokenizer():
-        return dummy_tokenizer.DummyTokenizer(None, None)
+        return dummy_tokenizer.DummyTokenizer(None)
 
     return _create_tokenizer

View File

@@ -11,17 +11,16 @@ from nominatim_db.data.place_info import PlaceInfo
 from nominatim_db.config import Configuration
 
 
-def create(dsn, data_dir):
+def create(dsn):
     """ Create a new instance of the tokenizer provided by this module.
     """
-    return DummyTokenizer(dsn, data_dir)
+    return DummyTokenizer(dsn)
 
 
 class DummyTokenizer:
 
-    def __init__(self, dsn, data_dir):
+    def __init__(self, dsn):
         self.dsn = dsn
-        self.data_dir = data_dir
         self.init_state = None
         self.analyser_cache = {}

View File

@@ -32,24 +32,9 @@ class TestFactory:
         assert isinstance(tokenizer, DummyTokenizer)
         assert tokenizer.init_state == "new"
-        assert (self.config.project_dir / 'tokenizer').is_dir()
 
         assert properties.get_property(temp_db_conn, 'tokenizer') == 'dummy'
 
-    def test_setup_tokenizer_dir_exists(self):
-        (self.config.project_dir / 'tokenizer').mkdir()
-
-        tokenizer = factory.create_tokenizer(self.config)
-
-        assert isinstance(tokenizer, DummyTokenizer)
-        assert tokenizer.init_state == "new"
-
-    def test_setup_tokenizer_dir_failure(self):
-        (self.config.project_dir / 'tokenizer').write_text("foo")
-
-        with pytest.raises(UsageError):
-            factory.create_tokenizer(self.config)
-
     def test_load_tokenizer(self):
         factory.create_tokenizer(self.config)
@@ -64,7 +49,6 @@ class TestFactory:
         self.config.project_dir = self.config.project_dir
 
         factory.get_tokenizer_for_db(self.config)
-        assert (self.config.project_dir / 'tokenizer').exists()
 
     def test_load_missing_property(self, temp_db_cursor):
         factory.create_tokenizer(self.config)

View File

@@ -39,12 +39,9 @@ def test_config(project_env, tmp_path):
 @pytest.fixture
-def tokenizer_factory(dsn, tmp_path, property_table,
-                      sql_preprocessor, place_table, word_table):
-    (tmp_path / 'tokenizer').mkdir()
-
+def tokenizer_factory(dsn, property_table, sql_preprocessor, place_table, word_table):
     def _maker():
-        return icu_tokenizer.create(dsn, tmp_path / 'tokenizer')
+        return icu_tokenizer.create(dsn)
 
     return _maker

View File

@@ -63,7 +63,7 @@ class MockPostcodeTable:
 @pytest.fixture
 def tokenizer():
-    return dummy_tokenizer.DummyTokenizer(None, None)
+    return dummy_tokenizer.DummyTokenizer(None)
 
 
 @pytest.fixture