Mirror of https://github.com/osm-search/Nominatim.git, synced 2026-02-26 11:08:13 +00:00

Merge pull request #2458 from lonvia/add-tokenizer-preprocessing

Add a "sanitation" step for name and address tags before token processing
@@ -1,30 +1,33 @@
 -- Trigger functions for the placex table.
 
+-- Information returned by update preparation.
+DROP TYPE IF EXISTS prepare_update_info CASCADE;
+CREATE TYPE prepare_update_info AS (
+  name HSTORE,
+  address HSTORE,
+  rank_address SMALLINT,
+  country_code TEXT,
+  class TEXT,
+  type TEXT,
+  linked_place_id BIGINT
+);
 
 -- Retrieve the data needed by the indexer for updating the place.
---
--- Return parameters:
--- name            list of names
--- address         list of address tags, either from the object or a surrounding
---                 building
--- country_feature If the place is a country feature, this contains the
---                 country code, otherwise it is null.
-CREATE OR REPLACE FUNCTION placex_prepare_update(p placex,
-                                                 OUT name HSTORE,
-                                                 OUT address HSTORE,
-                                                 OUT country_feature VARCHAR,
-                                                 OUT linked_place_id BIGINT)
+CREATE OR REPLACE FUNCTION placex_indexing_prepare(p placex)
+  RETURNS prepare_update_info
   AS $$
 DECLARE
   location RECORD;
+  result prepare_update_info;
 BEGIN
   -- For POI nodes, check if the address should be derived from a surrounding
   -- building.
   IF p.rank_search < 30 OR p.osm_type != 'N' OR p.address is not null THEN
-    address := p.address;
+    result.address := p.address;
   ELSE
     -- The additional && condition works around the misguided query
     -- planner of postgis 3.0.
-    SELECT placex.address || hstore('_inherited', '') INTO address
+    SELECT placex.address || hstore('_inherited', '') INTO result.address
       FROM placex
      WHERE ST_Covers(geometry, p.centroid)
            and geometry && p.centroid
@@ -34,27 +37,26 @@ BEGIN
      LIMIT 1;
   END IF;
 
-  address := address - '_unlisted_place'::TEXT;
-  name := p.name;
+  result.address := result.address - '_unlisted_place'::TEXT;
+  result.name := p.name;
+  result.class := p.class;
+  result.type := p.type;
+  result.country_code := p.country_code;
+  result.rank_address := p.rank_address;
 
   -- Names of linked places need to be merged in, so search for a linkable
   -- place already here.
   SELECT * INTO location FROM find_linked_place(p);
 
   IF location.place_id is not NULL THEN
-    linked_place_id := location.place_id;
+    result.linked_place_id := location.place_id;
 
     IF NOT location.name IS NULL THEN
-      name := location.name || name;
+      result.name := location.name || result.name;
     END IF;
   END IF;
 
-  country_feature := CASE WHEN p.admin_level = 2
-                               and p.class = 'boundary' and p.type = 'administrative'
-                               and p.osm_type = 'R'
-                          THEN p.country_code
-                          ELSE null
-                     END;
+  RETURN result;
 END;
 $$
 LANGUAGE plpgsql STABLE;
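For orientation, this is how a caller now consumes the composite return value: the OUT parameters are gone, so the result row is expanded with (...).* instead. A minimal sketch, assuming an open psycopg2 connection to a Nominatim database with the function installed; the DSN and the place ids are placeholders:

    import psycopg2
    import psycopg2.extras

    # Sketch only: mirrors the query issued by AbstractPlacexRunner.get_place_details().
    conn = psycopg2.connect("dbname=nominatim")
    with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
        cur.execute("""SELECT place_id, (placex_indexing_prepare(placex)).*
                       FROM placex WHERE place_id IN %s""",
                    (tuple([1, 2, 3]), ))
        for row in cur:
            print(row['place_id'], row['rank_address'], row['name'])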
nominatim/indexer/place_info.py (new file, 68 lines)
@@ -0,0 +1,68 @@
+"""
+Wrapper around place information the indexer gets from the database and hands to
+the tokenizer.
+"""
+
+import psycopg2.extras
+
+class PlaceInfo:
+    """ Data class containing all information the tokenizer gets about a
+        place it should process the names for.
+    """
+
+    def __init__(self, info):
+        self._info = info
+
+
+    def analyze(self, analyzer):
+        """ Process this place with the given tokenizer and return the
+            result in psycopg2-compatible Json.
+        """
+        return psycopg2.extras.Json(analyzer.process_place(self))
+
+
+    @property
+    def name(self):
+        """ A dictionary with the names of the place or None if the place
+            has no names.
+        """
+        return self._info.get('name')
+
+
+    @property
+    def address(self):
+        """ A dictionary with the address elements of the place
+            or None if no address information is available.
+        """
+        return self._info.get('address')
+
+
+    @property
+    def country_code(self):
+        """ The country code of the country the place is in. Guaranteed
+            to be a two-letter lower-case string or None, if no country
+            could be found.
+        """
+        return self._info.get('country_code')
+
+
+    @property
+    def rank_address(self):
+        """ The computed rank address before rank correction.
+        """
+        return self._info.get('rank_address')
+
+
+    def is_a(self, key, value):
+        """ Check if the place's primary tag corresponds to the given
+            key and value.
+        """
+        return self._info.get('class') == key and self._info.get('type') == value
+
+
+    def is_country(self):
+        """ Check if the place is a valid country boundary.
+        """
+        return self.rank_address == 4 \
+               and self.is_a('boundary', 'administrative') \
+               and self.country_code is not None
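The wrapper is a thin read-only view over the dict produced by placex_indexing_prepare. A minimal usage sketch; the field values here are invented for illustration:

    from nominatim.indexer.place_info import PlaceInfo

    # Hypothetical row contents as the indexer would receive them.
    place = PlaceInfo({'name': {'name': 'Halle (Saale)'},
                       'address': {'city': 'Halle'},
                       'country_code': 'de',
                       'rank_address': 16,
                       'class': 'place', 'type': 'city'})

    assert place.country_code == 'de'
    assert place.is_a('place', 'city')
    assert not place.is_country()   # rank_address is 16, not a country boundary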
@@ -4,14 +4,16 @@ tasks.
 """
 import functools
 
-import psycopg2.extras
 from psycopg2 import sql as pysql
 
+from nominatim.indexer.place_info import PlaceInfo
+
 # pylint: disable=C0111
 
 def _mk_valuelist(template, num):
     return pysql.SQL(',').join([pysql.SQL(template)] * num)
 
 
 class AbstractPlacexRunner:
     """ Returns SQL commands for indexing of the placex table.
     """
@@ -37,7 +39,7 @@ class AbstractPlacexRunner:
 
     @staticmethod
     def get_place_details(worker, ids):
-        worker.perform("""SELECT place_id, (placex_prepare_update(placex)).*
+        worker.perform("""SELECT place_id, (placex_indexing_prepare(placex)).*
                           FROM placex WHERE place_id IN %s""",
                        (tuple((p[0] for p in ids)), ))
 
@@ -47,7 +49,7 @@ class AbstractPlacexRunner:
         for place in places:
             for field in ('place_id', 'name', 'address', 'linked_place_id'):
                 values.append(place[field])
-            values.append(psycopg2.extras.Json(self.analyzer.process_place(place)))
+            values.append(PlaceInfo(place).analyze(self.analyzer))
 
         worker.perform(self._index_sql(len(places)), values)
 
@@ -141,7 +143,7 @@ class InterpolationRunner:
         values = []
         for place in places:
             values.extend((place[x] for x in ('place_id', 'address')))
-            values.append(psycopg2.extras.Json(self.analyzer.process_place(place)))
+            values.append(PlaceInfo(place).analyze(self.analyzer))
 
         worker.perform(self._index_sql(len(places)), values)
@@ -6,6 +6,7 @@ from abc import ABC, abstractmethod
 from typing import List, Tuple, Dict, Any
 
 from nominatim.config import Configuration
+from nominatim.indexer.place_info import PlaceInfo
 
 # pylint: disable=unnecessary-pass
 
@@ -105,20 +106,13 @@ class AbstractAnalyzer(ABC):
 
 
     @abstractmethod
-    def process_place(self, place: Dict) -> Any:
+    def process_place(self, place: PlaceInfo) -> Any:
         """ Extract tokens for the given place and compute the
             information to be handed to the PL/pgSQL processor for building
             the search index.
 
             Arguments:
-                place: Dictionary with the information about the place. Currently
-                       the following fields may be present:
-
-                       - *name* is a dictionary of names for the place together
-                         with the designation of the name.
-                       - *address* is a dictionary of address terms.
-                       - *country_feature* is set to a country code when the
-                         place describes a country.
+                place: Place information retrived from the database.
 
             Returns:
                 A JSON-serialisable structure that will be handed into
@@ -142,7 +136,7 @@ class AbstractTokenizer(ABC):
             the tokenizer remains stable over updates.
 
             Arguments:
-                config: Read-only object with configuration obtions.
+                config: Read-only object with configuration options.
 
                 init_db: When set to False, then initialisation of database
                   tables should be skipped. This option is only required for
@@ -155,11 +149,14 @@ class AbstractTokenizer(ABC):
 
 
     @abstractmethod
-    def init_from_project(self) -> None:
+    def init_from_project(self, config: Configuration) -> None:
         """ Initialise the tokenizer from an existing database setup.
 
             The function should load all previously saved configuration from
             the project directory and/or the property table.
 
+            Arguments:
+                config: Read-only object with configuration options.
         """
         pass
 
@@ -172,7 +169,7 @@ class AbstractTokenizer(ABC):
             during query time.
 
             Arguments:
-                config: Read-only object with configuration obtions.
+                config: Read-only object with configuration options.
         """
         pass
 
@@ -187,13 +184,13 @@ class AbstractTokenizer(ABC):
             data structures or data itself must not be changed by this function.
 
             Arguments:
-                config: Read-only object with configuration obtions.
+                config: Read-only object with configuration options.
         """
         pass
 
 
     @abstractmethod
-    def check_database(self) -> str:
+    def check_database(self, config: Configuration) -> str:
         """ Check that the database is set up correctly and ready for being
             queried.
 
@@ -202,6 +199,9 @@ class AbstractTokenizer(ABC):
               description of the issue as well as hints for the user on
               how to resolve the issue.
 
+            Arguments:
+                config: Read-only object with configuration options.
+
             Return `None`, if no issue was found.
         """
         pass
@@ -85,6 +85,6 @@ def get_tokenizer_for_db(config):
     tokenizer_module = _import_tokenizer(name)
 
     tokenizer = tokenizer_module.create(config.get_libpq_dsn(), basedir)
-    tokenizer.init_from_project()
+    tokenizer.init_from_project(config)
 
     return tokenizer
@@ -8,67 +8,25 @@ import itertools
 from icu import Transliterator
 import datrie
 
-from nominatim.db.properties import set_property, get_property
-from nominatim.tokenizer import icu_variants as variants
-
-DBCFG_IMPORT_NORM_RULES = "tokenizer_import_normalisation"
-DBCFG_IMPORT_TRANS_RULES = "tokenizer_import_transliteration"
-DBCFG_IMPORT_REPLACEMENTS = "tokenizer_import_replacements"
-DBCFG_SEARCH_STD_RULES = "tokenizer_search_standardization"
-
-
-class ICUNameProcessorRules:
-    """ Data object that saves the rules needed for the name processor.
-
-        The rules can either be initialised through an ICURuleLoader or
-        be loaded from a database when a connection is given.
-    """
-    def __init__(self, loader=None, conn=None):
-        if loader is not None:
-            self.norm_rules = loader.get_normalization_rules()
-            self.trans_rules = loader.get_transliteration_rules()
-            self.replacements = loader.get_replacement_pairs()
-            self.search_rules = loader.get_search_rules()
-        elif conn is not None:
-            self.norm_rules = get_property(conn, DBCFG_IMPORT_NORM_RULES)
-            self.trans_rules = get_property(conn, DBCFG_IMPORT_TRANS_RULES)
-            self.replacements = \
-                variants.unpickle_variant_set(get_property(conn, DBCFG_IMPORT_REPLACEMENTS))
-            self.search_rules = get_property(conn, DBCFG_SEARCH_STD_RULES)
-        else:
-            assert False, "Parameter loader or conn required."
-
-
-    def save_rules(self, conn):
-        """ Save the rules in the property table of the given database.
-            the rules can be loaded again by handing in a connection into
-            the constructor of the class.
-        """
-        set_property(conn, DBCFG_IMPORT_NORM_RULES, self.norm_rules)
-        set_property(conn, DBCFG_IMPORT_TRANS_RULES, self.trans_rules)
-        set_property(conn, DBCFG_IMPORT_REPLACEMENTS,
-                     variants.pickle_variant_set(self.replacements))
-        set_property(conn, DBCFG_SEARCH_STD_RULES, self.search_rules)
-
 
 class ICUNameProcessor:
     """ Collects the different transformation rules for normalisation of names
-        and provides the functions to aply the transformations.
+        and provides the functions to apply the transformations.
     """
 
-    def __init__(self, rules):
+    def __init__(self, norm_rules, trans_rules, replacements):
         self.normalizer = Transliterator.createFromRules("icu_normalization",
-                                                         rules.norm_rules)
+                                                         norm_rules)
         self.to_ascii = Transliterator.createFromRules("icu_to_ascii",
-                                                       rules.trans_rules +
+                                                       trans_rules +
                                                        ";[:Space:]+ > ' '")
         self.search = Transliterator.createFromRules("icu_search",
-                                                     rules.search_rules)
+                                                     norm_rules + trans_rules)
 
         # Intermediate reorder by source. Also compute required character set.
         immediate = defaultdict(list)
         chars = set()
-        for variant in rules.replacements:
+        for variant in replacements:
             if variant.source[-1] == ' ' and variant.replacement[-1] == ' ':
                 replstr = variant.replacement[:-1]
             else:
@@ -2,17 +2,26 @@
 Helper class to create ICU rules from a configuration file.
 """
 import io
+import json
 import logging
 import itertools
 import re
 
 from icu import Transliterator
 
+from nominatim.db.properties import set_property, get_property
 from nominatim.errors import UsageError
+from nominatim.tokenizer.icu_name_processor import ICUNameProcessor
+from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
 import nominatim.tokenizer.icu_variants as variants
 
 LOG = logging.getLogger()
 
+DBCFG_IMPORT_NORM_RULES = "tokenizer_import_normalisation"
+DBCFG_IMPORT_TRANS_RULES = "tokenizer_import_transliteration"
+DBCFG_IMPORT_ANALYSIS_RULES = "tokenizer_import_analysis_rules"
+
+
 def _flatten_config_list(content):
     if not content:
         return []
@@ -46,12 +55,52 @@ class ICURuleLoader:
     """ Compiler for ICU rules from a tokenizer configuration file.
     """
 
-    def __init__(self, rules):
+    def __init__(self, config):
+        rules = config.load_sub_configuration('icu_tokenizer.yaml',
+                                              config='TOKENIZER_CONFIG')
+
         self.variants = set()
 
         self.normalization_rules = self._cfg_to_icu_rules(rules, 'normalization')
         self.transliteration_rules = self._cfg_to_icu_rules(rules, 'transliteration')
-        self._parse_variant_list(self._get_section(rules, 'variants'))
+        self.analysis_rules = self._get_section(rules, 'variants')
+        self._parse_variant_list()
+
+        # Load optional sanitizer rule set.
+        self.sanitizer_rules = rules.get('sanitizers', [])
+
+
+    def load_config_from_db(self, conn):
+        """ Get previously saved parts of the configuration from the
+            database.
+        """
+        self.normalization_rules = get_property(conn, DBCFG_IMPORT_NORM_RULES)
+        self.transliteration_rules = get_property(conn, DBCFG_IMPORT_TRANS_RULES)
+        self.analysis_rules = json.loads(get_property(conn, DBCFG_IMPORT_ANALYSIS_RULES))
+        self._parse_variant_list()
+
+
+    def save_config_to_db(self, conn):
+        """ Save the part of the configuration that cannot be changed into
+            the database.
+        """
+        set_property(conn, DBCFG_IMPORT_NORM_RULES, self.normalization_rules)
+        set_property(conn, DBCFG_IMPORT_TRANS_RULES, self.transliteration_rules)
+        set_property(conn, DBCFG_IMPORT_ANALYSIS_RULES, json.dumps(self.analysis_rules))
+
+
+    def make_sanitizer(self):
+        """ Create a place sanitizer from the configured rules.
+        """
+        return PlaceSanitizer(self.sanitizer_rules)
+
+
+    def make_token_analysis(self):
+        """ Create a token analyser from the reviouly loaded rules.
+        """
+        return ICUNameProcessor(self.normalization_rules,
+                                self.transliteration_rules,
+                                self.variants)
+
 
     def get_search_rules(self):
@@ -112,7 +161,9 @@ class ICURuleLoader:
         return ';'.join(_flatten_config_list(content)) + ';'
 
 
-    def _parse_variant_list(self, rules):
+    def _parse_variant_list(self):
+        rules = self.analysis_rules
+
         self.variants.clear()
 
         if not rules:
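With persistence moved into the loader, import and later runs share one code path. A sketch of the intended round-trip, assuming config is a Nominatim Configuration with a project directory and conn is an open database connection:

    from nominatim.tokenizer.icu_rule_loader import ICURuleLoader

    loader = ICURuleLoader(config)     # reads icu_tokenizer.yaml
    loader.save_config_to_db(conn)     # done once at import time

    loader = ICURuleLoader(config)
    loader.load_config_from_db(conn)   # done on subsequent runs
    sanitizer = loader.make_sanitizer()
    analysis = loader.make_token_analysis()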
@@ -13,8 +13,8 @@ from nominatim.db.connection import connect
 from nominatim.db.properties import set_property, get_property
 from nominatim.db.utils import CopyBuffer
 from nominatim.db.sql_preprocessor import SQLPreprocessor
+from nominatim.indexer.place_info import PlaceInfo
 from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
-from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules
 from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
 
 DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"
@@ -36,7 +36,7 @@ class LegacyICUTokenizer(AbstractTokenizer):
     def __init__(self, dsn, data_dir):
         self.dsn = dsn
         self.data_dir = data_dir
-        self.naming_rules = None
+        self.loader = None
         self.term_normalization = None
 
 
@@ -46,9 +46,8 @@ class LegacyICUTokenizer(AbstractTokenizer):
             This copies all necessary data in the project directory to make
             sure the tokenizer remains stable even over updates.
         """
-        loader = ICURuleLoader(config.load_sub_configuration('icu_tokenizer.yaml',
-                                                             config='TOKENIZER_CONFIG'))
-        self.naming_rules = ICUNameProcessorRules(loader=loader)
+        self.loader = ICURuleLoader(config)
+
         self.term_normalization = config.TERM_NORMALIZATION
 
         self._install_php(config.lib_dir.php)
@@ -59,11 +58,13 @@ class LegacyICUTokenizer(AbstractTokenizer):
         self._init_db_tables(config)
 
 
-    def init_from_project(self):
+    def init_from_project(self, config):
         """ Initialise the tokenizer from the project directory.
         """
+        self.loader = ICURuleLoader(config)
+
         with connect(self.dsn) as conn:
-            self.naming_rules = ICUNameProcessorRules(conn=conn)
+            self.loader.load_config_from_db(conn)
             self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION)
 
 
@@ -81,12 +82,12 @@ class LegacyICUTokenizer(AbstractTokenizer):
             sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')
 
 
-    def check_database(self):
+    def check_database(self, config):
         """ Check that the tokenizer is set up correctly.
         """
-        self.init_from_project()
+        self.init_from_project(config)
 
-        if self.naming_rules is None:
+        if self.term_normalization is None:
             return "Configuration for tokenizer 'icu' are missing."
 
         return None
@@ -107,7 +108,8 @@ class LegacyICUTokenizer(AbstractTokenizer):
 
             Analyzers are not thread-safe. You need to instantiate one per thread.
         """
-        return LegacyICUNameAnalyzer(self.dsn, ICUNameProcessor(self.naming_rules))
+        return LegacyICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
+                                     self.loader.make_token_analysis())
 
 
     def _install_php(self, phpdir):
@@ -118,7 +120,7 @@ class LegacyICUTokenizer(AbstractTokenizer):
             <?php
             @define('CONST_Max_Word_Frequency', 10000000);
             @define('CONST_Term_Normalization_Rules', "{self.term_normalization}");
-            @define('CONST_Transliteration', "{self.naming_rules.search_rules}");
+            @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
             require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))
 
 
@@ -127,8 +129,7 @@ class LegacyICUTokenizer(AbstractTokenizer):
             database as database properties.
         """
         with connect(self.dsn) as conn:
-            self.naming_rules.save_rules(conn)
-
+            self.loader.save_config_to_db(conn)
             set_property(conn, DBCFG_TERM_NORMALIZATION, self.term_normalization)
 
@@ -163,7 +164,7 @@ class LegacyICUTokenizer(AbstractTokenizer):
         """ Count the partial terms from the names in the place table.
         """
         words = Counter()
-        name_proc = ICUNameProcessor(self.naming_rules)
+        name_proc = self.loader.make_token_analysis()
 
         with conn.cursor(name="words") as cur:
             cur.execute(""" SELECT v, count(*) FROM
@@ -188,10 +189,11 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
         normalization.
     """
 
-    def __init__(self, dsn, name_proc):
+    def __init__(self, dsn, sanitizer, token_analysis):
         self.conn = connect(dsn).connection
         self.conn.autocommit = True
-        self.name_processor = name_proc
+        self.sanitizer = sanitizer
+        self.token_analysis = token_analysis
 
         self._cache = _TokenCache()
 
@@ -204,6 +206,19 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
         self.conn = None
 
 
+    def _search_normalized(self, name):
+        """ Return the search token transliteration of the given name.
+        """
+        return self.token_analysis.get_search_normalized(name)
+
+
+    def _normalized(self, name):
+        """ Return the normalized version of the given name with all
+            non-relevant information removed.
+        """
+        return self.token_analysis.get_normalized(name)
+
+
     def get_word_token_info(self, words):
         """ Return token information for the given list of words.
             If a word starts with # it is assumed to be a full name
@@ -219,9 +234,9 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
         partial_tokens = {}
         for word in words:
             if word.startswith('#'):
-                full_tokens[word] = self.name_processor.get_search_normalized(word[1:])
+                full_tokens[word] = self._search_normalized(word[1:])
             else:
-                partial_tokens[word] = self.name_processor.get_search_normalized(word)
+                partial_tokens[word] = self._search_normalized(word)
 
         with self.conn.cursor() as cur:
             cur.execute("""SELECT word_token, word_id
@@ -252,7 +267,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
 
             This function takes minor shortcuts on transliteration.
         """
-        return self.name_processor.get_search_normalized(hnr)
+        return self._search_normalized(hnr)
 
     def update_postcodes_from_db(self):
         """ Update postcode tokens in the word table from the location_postcode
@@ -275,7 +290,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
                     if postcode is None:
                         to_delete.append(word)
                     else:
-                        copystr.add(self.name_processor.get_search_normalized(postcode),
+                        copystr.add(self._search_normalized(postcode),
                                     'P', postcode)
 
             if to_delete:
@@ -293,7 +308,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
             completely replaced. Otherwise the phrases are added to the
             already existing ones.
         """
-        norm_phrases = set(((self.name_processor.get_normalized(p[0]), p[1], p[2], p[3])
+        norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
                             for p in phrases))
 
         with self.conn.cursor() as cur:
@@ -323,7 +338,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
         added = 0
         with CopyBuffer() as copystr:
             for word, cls, typ, oper in to_add:
-                term = self.name_processor.get_search_normalized(word)
+                term = self._search_normalized(word)
                 if term:
                     copystr.add(term, 'S', word,
                                 json.dumps({'class': cls, 'type': typ,
@@ -357,9 +372,21 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
     def add_country_names(self, country_code, names):
         """ Add names for the given country to the search index.
         """
+        # Make sure any name preprocessing for country names applies.
+        info = PlaceInfo({'name': names, 'country_code': country_code,
+                          'rank_address': 4, 'class': 'boundary',
+                          'type': 'administrative'})
+        self._add_country_full_names(country_code,
+                                     self.sanitizer.process_names(info)[0])
+
+
+    def _add_country_full_names(self, country_code, names):
+        """ Add names for the given country from an already sanitized
+            name list.
+        """
         word_tokens = set()
-        for name in self._compute_full_names(names):
-            norm_name = self.name_processor.get_search_normalized(name)
+        for name in names:
+            norm_name = self._search_normalized(name.name)
             if norm_name:
                 word_tokens.add(norm_name)
 
@@ -385,23 +412,21 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
     def process_place(self, place):
         """ Determine tokenizer information about the given place.
 
-            Returns a JSON-serialisable structure that will be handed into
+            Returns a JSON-serializable structure that will be handed into
             the database via the token_info field.
         """
         token_info = _TokenInfo(self._cache)
 
-        names = place.get('name')
+        names, address = self.sanitizer.process_names(place)
 
         if names:
             fulls, partials = self._compute_name_tokens(names)
 
             token_info.add_names(fulls, partials)
 
-        country_feature = place.get('country_feature')
-        if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
-            self.add_country_names(country_feature.lower(), names)
+            if place.is_country():
+                self._add_country_full_names(place.country_code, names)
 
-        address = place.get('address')
         if address:
             self._process_place_address(token_info, address)
 
@@ -411,18 +436,18 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
     def _process_place_address(self, token_info, address):
         hnrs = []
         addr_terms = []
-        for key, value in address.items():
-            if key == 'postcode':
-                self._add_postcode(value)
-            elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
-                hnrs.append(value)
-            elif key == 'street':
-                token_info.add_street(self._compute_partial_tokens(value))
-            elif key == 'place':
-                token_info.add_place(self._compute_partial_tokens(value))
-            elif not key.startswith('_') and \
-                 key not in ('country', 'full'):
-                addr_terms.append((key, self._compute_partial_tokens(value)))
+        for item in address:
+            if item.kind == 'postcode':
+                self._add_postcode(item.name)
+            elif item.kind in ('housenumber', 'streetnumber', 'conscriptionnumber'):
+                hnrs.append(item.name)
+            elif item.kind == 'street':
+                token_info.add_street(self._compute_partial_tokens(item.name))
+            elif item.kind == 'place':
+                token_info.add_place(self._compute_partial_tokens(item.name))
+            elif not item.kind.startswith('_') and \
+                 item.kind not in ('country', 'full'):
+                addr_terms.append((item.kind, self._compute_partial_tokens(item.name)))
 
         if hnrs:
             hnrs = self._split_housenumbers(hnrs)
@@ -435,7 +460,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
         """ Normalize the given term, split it into partial words and return
             then token list for them.
         """
-        norm_name = self.name_processor.get_search_normalized(name)
+        norm_name = self._search_normalized(name)
 
         tokens = []
         need_lookup = []
@@ -458,19 +483,19 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
 
         return tokens
 
 
     def _compute_name_tokens(self, names):
         """ Computes the full name and partial name tokens for the given
            dictionary of names.
         """
-        full_names = self._compute_full_names(names)
         full_tokens = set()
         partial_tokens = set()
 
-        for name in full_names:
-            norm_name = self.name_processor.get_normalized(name)
+        for name in names:
+            norm_name = self._normalized(name.name)
             full, part = self._cache.names.get(norm_name, (None, None))
             if full is None:
-                variants = self.name_processor.get_variants_ascii(norm_name)
+                variants = self.token_analysis.get_variants_ascii(norm_name)
                 if not variants:
                     continue
 
@@ -487,23 +512,6 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
         return full_tokens, partial_tokens
 
 
-    @staticmethod
-    def _compute_full_names(names):
-        """ Return the set of all full name word ids to be used with the
-            given dictionary of names.
-        """
-        full_names = set()
-        for name in (n.strip() for ns in names.values() for n in re.split('[;,]', ns)):
-            if name:
-                full_names.add(name)
-
-                brace_idx = name.find('(')
-                if brace_idx >= 0:
-                    full_names.add(name[:brace_idx].strip())
-
-        return full_names
-
-
     def _add_postcode(self, postcode):
         """ Make sure the normalized postcode is present in the word table.
         """
@@ -511,7 +519,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
             postcode = self.normalize_postcode(postcode)
 
             if postcode not in self._cache.postcodes:
-                term = self.name_processor.get_search_normalized(postcode)
+                term = self._search_normalized(postcode)
                 if not term:
                     return
 
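Taken together, the analyzer now runs every place through the sanitizer before any tokens are computed; the old dict-splitting helper _compute_full_names becomes redundant because the split-name-list and strip-brace-terms sanitizers do that work up front. A rough sketch of the new data flow, assuming analyzer is a LegacyICUNameAnalyzer obtained from the tokenizer's name_analyzer() (private members are touched here purely for illustration):

    from nominatim.indexer.place_info import PlaceInfo

    place = PlaceInfo({'name': {'name': 'Foo;Bar'},
                       'address': {'housenumber': '3'}})

    # Step 1 inside process_place(): sanitize names and address.
    names, address = analyzer.sanitizer.process_names(place)
    # Step 2: compute tokens from the sanitized PlaceName objects.
    fulls, partials = analyzer._compute_name_tokens(names)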
@@ -2,7 +2,6 @@
 Data structures for saving variant expansions for ICU tokenizer.
 """
 from collections import namedtuple
-import json
 
 _ICU_VARIANT_PORPERTY_FIELDS = ['lang']
 
@@ -24,34 +23,3 @@ class ICUVariantProperties(namedtuple('_ICUVariantProperties', _ICU_VARIANT_PORPERTY_FIELDS)):
 
 
 ICUVariant = namedtuple('ICUVariant', ['source', 'replacement', 'properties'])
-
-
-def pickle_variant_set(variants):
-    """ Serializes an iterable of variant rules to a string.
-    """
-    # Create a list of property sets. So they don't need to be duplicated
-    properties = {}
-    pid = 1
-    for variant in variants:
-        if variant.properties not in properties:
-            properties[variant.properties] = pid
-            pid += 1
-
-    # Convert the variants into a simple list.
-    variants = [(v.source, v.replacement, properties[v.properties]) for v in variants]
-
-    # Convert everythin to json.
-    return json.dumps({'properties': {v: k._asdict() for k, v in properties.items()},
-                       'variants': variants})
-
-
-def unpickle_variant_set(variant_string):
-    """ Deserializes a variant string that was previously created with
-        pickle_variant_set() into a set of ICUVariants.
-    """
-    data = json.loads(variant_string)
-
-    properties = {int(k): ICUVariantProperties.from_rules(v)
-                  for k, v in data['properties'].items()}
-
-    return set((ICUVariant(src, repl, properties[pid]) for src, repl, pid in data['variants']))
@@ -113,7 +113,7 @@ class LegacyTokenizer(AbstractTokenizer):
         self._init_db_tables(config)
 
 
-    def init_from_project(self):
+    def init_from_project(self, _):
         """ Initialise the tokenizer from the project directory.
         """
         with connect(self.dsn) as conn:
@@ -142,7 +142,7 @@ class LegacyTokenizer(AbstractTokenizer):
                               modulepath=modulepath)
 
 
-    def check_database(self):
+    def check_database(self, _):
         """ Check that the tokenizer is set up correctly.
         """
         hint = """\
@@ -405,16 +405,15 @@ class LegacyNameAnalyzer(AbstractAnalyzer):
         """
         token_info = _TokenInfo(self._cache)
 
-        names = place.get('name')
+        names = place.name
 
         if names:
             token_info.add_names(self.conn, names)
 
-        country_feature = place.get('country_feature')
-        if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
-            self.add_country_names(country_feature.lower(), names)
+            if place.is_country():
+                self.add_country_names(place.country_code, names)
 
-        address = place.get('address')
+        address = place.address
         if address:
             self._process_place_address(token_info, address)
 
nominatim/tokenizer/place_sanitizer.py (new file, 127 lines)
@@ -0,0 +1,127 @@
+"""
+Handler for cleaning name and address tags in place information before it
+is handed to the token analysis.
+"""
+import importlib
+
+from nominatim.errors import UsageError
+
+class PlaceName:
+    """ A searchable name for a place together with properties.
+        Every name object saves the name proper and two basic properties:
+        * 'kind' describes the name of the OSM key used without any suffixes
+          (i.e. the part after the colon removed)
+        * 'suffix' contains the suffix of the OSM tag, if any. The suffix
+          is the part of the key after the first colon.
+        In addition to that, the name may have arbitrary additional attributes.
+        Which attributes are used, depends on the token analyser.
+    """
+
+    def __init__(self, name, kind, suffix):
+        self.name = name
+        self.kind = kind
+        self.suffix = suffix
+        self.attr = {}
+
+
+    def __repr__(self):
+        return f"PlaceName(name='{self.name}',kind='{self.kind}',suffix='{self.suffix}')"
+
+
+    def clone(self, name=None, kind=None, suffix=None, attr=None):
+        """ Create a deep copy of the place name, optionally with the
+            given parameters replaced. In the attribute list only the given
+            keys are updated. The list is not replaced completely.
+            In particular, the function cannot to be used to remove an
+            attribute from a place name.
+        """
+        newobj = PlaceName(name or self.name,
+                           kind or self.kind,
+                           suffix or self.suffix)
+
+        newobj.attr.update(self.attr)
+        if attr:
+            newobj.attr.update(attr)
+
+        return newobj
+
+
+    def set_attr(self, key, value):
+        """ Add the given property to the name. If the property was already
+            set, then the value is overwritten.
+        """
+        self.attr[key] = value
+
+
+    def get_attr(self, key, default=None):
+        """ Return the given property or the value of 'default' if it
+            is not set.
+        """
+        return self.attr.get(key, default)
+
+
+    def has_attr(self, key):
+        """ Check if the given attribute is set.
+        """
+        return key in self.attr
+
+
+class _ProcessInfo:
+    """ Container class for information handed into to handler functions.
+        The 'names' and 'address' members are mutable. A handler must change
+        them by either modifying the lists place or replacing the old content
+        with a new list.
+    """
+
+    def __init__(self, place):
+        self.place = place
+        self.names = self._convert_name_dict(place.name)
+        self.address = self._convert_name_dict(place.address)
+
+
+    @staticmethod
+    def _convert_name_dict(names):
+        """ Convert a dictionary of names into a list of PlaceNames.
+            The dictionary key is split into the primary part of the key
+            and the suffix (the part after an optional colon).
+        """
+        out = []
+
+        if names:
+            for key, value in names.items():
+                parts = key.split(':', 1)
+                out.append(PlaceName(value.strip(),
+                                     parts[0].strip(),
+                                     parts[1].strip() if len(parts) > 1 else None))
+
+        return out
+
+
+class PlaceSanitizer:
+    """ Controller class which applies sanitizer functions on the place
+        names and address before they are used by the token analysers.
+    """
+
+    def __init__(self, rules):
+        self.handlers = []
+
+        if rules:
+            for func in rules:
+                if 'step' not in func:
+                    raise UsageError("Sanitizer rule is missing the 'step' attribute.")
+                module_name = 'nominatim.tokenizer.sanitizers.' + func['step'].replace('-', '_')
+                handler_module = importlib.import_module(module_name)
+                self.handlers.append(handler_module.create(func))
+
+
+    def process_names(self, place):
+        """ Extract a sanitized list of names and address parts from the
+            given place. The function returns a tuple
+            (list of names, list of address names)
+        """
+        obj = _ProcessInfo(place)
+
+        for func in self.handlers:
+            func(obj)
+
+        return obj.names, obj.address
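A short end-to-end sketch of this machinery; the rule list matches the default configuration shipped with this commit and the place object is hand-built:

    from nominatim.indexer.place_info import PlaceInfo
    from nominatim.tokenizer.place_sanitizer import PlaceSanitizer

    sanitizer = PlaceSanitizer([{'step': 'split-name-list'},
                                {'step': 'strip-brace-terms'}])

    place = PlaceInfo({'name': {'name': 'Halle (Saale);Halle'}})
    names, address = sanitizer.process_names(place)

    # Three PlaceName entries: 'Halle (Saale)', 'Halle' from the split,
    # plus the brace-stripped variant 'Halle' of the first entry.
    for name in names:
        print(name)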
nominatim/tokenizer/sanitizers/__init__.py (new file, empty)
nominatim/tokenizer/sanitizers/split_name_list.py (new file, 35 lines)
@@ -0,0 +1,35 @@
+"""
+Name processor that splits name values with multiple values into their components.
+"""
+import re
+
+from nominatim.errors import UsageError
+
+def create(func):
+    """ Create a name processing function that splits name values with
+        multiple values into their components. The optional parameter
+        'delimiters' can be used to define the characters that should be used
+        for splitting. The default is ',;'.
+    """
+    delimiter_set = set(func.get('delimiters', ',;'))
+    if not delimiter_set:
+        raise UsageError("Set of delimiters in split-name-list sanitizer is empty.")
+
+    regexp = re.compile('\\s*[{}]\\s*'.format(''.join('\\' + d for d in delimiter_set)))
+
+    def _process(obj):
+        if not obj.names:
+            return
+
+        new_names = []
+        for name in obj.names:
+            split_names = regexp.split(name.name)
+            print(split_names)
+            if len(split_names) == 1:
+                new_names.append(name)
+            else:
+                new_names.extend(name.clone(name=n) for n in split_names if n)
+
+        obj.names = new_names
+
+    return _process
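The handler can be exercised in isolation; SimpleNamespace stands in for the _ProcessInfo object that PlaceSanitizer normally supplies, and the station names are made up:

    from types import SimpleNamespace
    from nominatim.tokenizer.place_sanitizer import PlaceName
    from nominatim.tokenizer.sanitizers import split_name_list

    process = split_name_list.create({'step': 'split-name-list',
                                      'delimiters': ';,'})

    obj = SimpleNamespace(names=[PlaceName('Gare de Lyon;Paris-Gare-de-Lyon',
                                           'name', None)])
    process(obj)
    # Each part becomes its own PlaceName, keeping kind and suffix.
    assert [n.name for n in obj.names] == ['Gare de Lyon', 'Paris-Gare-de-Lyon']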
nominatim/tokenizer/sanitizers/strip_brace_terms.py (new file, 22 lines)
@@ -0,0 +1,22 @@
+"""
+Sanitizer handling names with addendums in braces.
+"""
+
+def create(_):
+    """ Create a name processing function that creates additional name variants
+        when a name has an addendum in brackets (e.g. "Halle (Saale)"). The
+        additional variant only contains the main name without the bracket part.
+    """
+    def _process(obj):
+        """ Add variants for names that have a bracket extension.
+        """
+        if obj.names:
+            new_names = []
+            for name in (n for n in obj.names if '(' in n.name):
+                new_name = name.name.split('(')[0].strip()
+                if new_name:
+                    new_names.append(name.clone(name=new_name))
+
+            obj.names.extend(new_names)
+
+    return _process
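Note that the original name is kept and the bracket-free form is appended as an extra variant. A minimal check, again with a hand-built stand-in for the process object:

    from types import SimpleNamespace
    from nominatim.tokenizer.place_sanitizer import PlaceName
    from nominatim.tokenizer.sanitizers import strip_brace_terms

    process = strip_brace_terms.create({'step': 'strip-brace-terms'})

    obj = SimpleNamespace(names=[PlaceName('Halle (Saale)', 'name', None)])
    process(obj)
    assert [n.name for n in obj.names] == ['Halle (Saale)', 'Halle']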
@@ -166,7 +166,7 @@ def check_tokenizer(_, config):
         return CheckState.FAIL, dict(msg="""\
 Cannot load tokenizer. Did the import finish sucessfully?""")
 
-    result = tokenizer.check_database()
+    result = tokenizer.check_database(config)
 
     if result is None:
         return CheckState.OK
@@ -7,12 +7,11 @@ import logging
 import os
 import tarfile
 
-import psycopg2.extras
-
 from nominatim.db.connection import connect
 from nominatim.db.async_connection import WorkerPool
 from nominatim.db.sql_preprocessor import SQLPreprocessor
 from nominatim.errors import UsageError
+from nominatim.indexer.place_info import PlaceInfo
 
 LOG = logging.getLogger()
 
@@ -58,7 +57,7 @@ def handle_threaded_sql_statements(pool, fd, analyzer):
             address = dict(street=row['street'], postcode=row['postcode'])
             args = ('SRID=4326;' + row['geometry'],
                     int(row['from']), int(row['to']), row['interpolation'],
-                    psycopg2.extras.Json(analyzer.process_place(dict(address=address))),
+                    PlaceInfo({'address': address}).analyze(analyzer),
                     analyzer.normalize_postcode(row['postcode']))
         except ValueError:
             continue
@@ -24,6 +24,9 @@ transliteration:
     - "[^[:Ascii:]] >"
     - ":: lower ()"
     - ":: NFC ()"
+sanitizers:
+    - step: split-name-list
+    - step: strip-brace-terms
 variants:
     - !include icu-rules/variants-bg.yaml
     - !include icu-rules/variants-ca.yaml
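The sanitizers section is an ordered list of steps; any extra keys in an entry travel along to the handler's create() function. After YAML parsing it is the plain rule list that PlaceSanitizer receives, so the equivalent can be written directly in Python ('delimiters' is the optional parameter understood by split-name-list, shown here with its default value):

    from nominatim.tokenizer.place_sanitizer import PlaceSanitizer

    rules = [{'step': 'split-name-list', 'delimiters': ',;'},
             {'step': 'strip-brace-terms'}]
    sanitizer = PlaceSanitizer(rules)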
@@ -1,6 +1,8 @@
 """
 Tokenizer for testing.
 """
+from nominatim.indexer.place_info import PlaceInfo
+from nominatim.config import Configuration
 
 def create(dsn, data_dir):
     """ Create a new instance of the tokenizer provided by this module.
@@ -21,7 +23,8 @@ class DummyTokenizer:
         self.init_state = "new"
 
 
-    def init_from_project(self):
+    def init_from_project(self, config):
+        assert isinstance(config, Configuration)
         assert self.init_state is None
         self.init_state = "loaded"
 
@@ -68,4 +71,5 @@ class DummyNameAnalyzer:
 
     @staticmethod
     def process_place(place):
+        assert isinstance(place, PlaceInfo)
         return {}
@@ -100,6 +100,6 @@ def test_get_pg_env_overwrite_variable(monkeypatch):
 
 
 def test_get_pg_env_ignore_unknown():
-    env = get_pg_env('tty=stuff', base_env={})
+    env = get_pg_env('client_encoding=stuff', base_env={})
 
     assert env == {}
@@ -29,6 +29,7 @@ class IndexerTestDB:
|
|||||||
indexed_date TIMESTAMP,
|
indexed_date TIMESTAMP,
|
||||||
partition SMALLINT,
|
partition SMALLINT,
|
||||||
admin_level SMALLINT,
|
admin_level SMALLINT,
|
||||||
|
country_code TEXT,
|
||||||
address HSTORE,
|
address HSTORE,
|
||||||
token_info JSONB,
|
token_info JSONB,
|
||||||
geometry_sector INTEGER)""")
|
geometry_sector INTEGER)""")
|
||||||
@@ -54,15 +55,26 @@ class IndexerTestDB:
                            END IF;
                            RETURN NEW;
                            END; $$ LANGUAGE plpgsql;""")
-            cur.execute("""CREATE OR REPLACE FUNCTION placex_prepare_update(p placex,
-                                                      OUT name HSTORE,
-                                                      OUT address HSTORE,
-                                                      OUT country_feature VARCHAR,
-                                                      OUT linked_place_id BIGINT)
+            cur.execute("DROP TYPE IF EXISTS prepare_update_info CASCADE")
+            cur.execute("""CREATE TYPE prepare_update_info AS (
+                             name HSTORE,
+                             address HSTORE,
+                             rank_address SMALLINT,
+                             country_code TEXT,
+                             class TEXT,
+                             type TEXT,
+                             linked_place_id BIGINT
+                           )""")
+            cur.execute("""CREATE OR REPLACE FUNCTION placex_indexing_prepare(p placex,
+                                                      OUT result prepare_update_info)
                            AS $$
                            BEGIN
-                             address := p.address;
-                             name := p.name;
+                             result.address := p.address;
+                             result.name := p.name;
+                             result.class := p.class;
+                             result.type := p.type;
+                             result.country_code := p.country_code;
+                             result.rank_address := p.rank_address;
                            END;
                            $$ LANGUAGE plpgsql STABLE;
                         """)
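With the OUT parameters folded into the composite prepare_update_info type, a caller can select the prepared record and receive every field as a named column. A sketch, not part of the patch, of what such a call could look like from Python with psycopg2; the DSN and place_id are placeholders:

import psycopg2
import psycopg2.extras

conn = psycopg2.connect('dbname=nominatim')    # placeholder DSN
psycopg2.extras.register_hstore(conn)          # name/address are HSTORE columns

with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
    # A composite-returning function expands into one column per field
    # when wrapped in (...).* .
    cur.execute("""SELECT (placex_indexing_prepare(px)).*
                     FROM placex px WHERE place_id = %s""", (1234,))
    row = cur.fetchone()
    # Available fields: row['name'], row['address'], row['rank_address'],
    # row['country_code'], row['class'], row['type'], row['linked_place_id']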
@@ -7,10 +7,10 @@ import yaml
 import pytest
 
 from nominatim.tokenizer import icu_tokenizer
-from nominatim.tokenizer.icu_name_processor import ICUNameProcessorRules
 from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
 from nominatim.db import properties
 from nominatim.db.sql_preprocessor import SQLPreprocessor
+from nominatim.indexer.place_info import PlaceInfo
 
 from mock_icu_word_table import MockIcuWordTable
 
@@ -67,11 +67,14 @@ def analyzer(tokenizer_factory, test_config, monkeypatch,
     monkeypatch.undo()
 
 def _mk_analyser(norm=("[[:Punctuation:][:Space:]]+ > ' '",), trans=(':: upper()',),
-                 variants=('~gasse -> gasse', 'street => st', )):
+                 variants=('~gasse -> gasse', 'street => st', ),
+                 sanitizers=[]):
     cfgstr = {'normalization' : list(norm),
-              'transliteration' : list(trans),
-              'variants' : [ {'words': list(variants)}]}
-    tok.naming_rules = ICUNameProcessorRules(loader=ICURuleLoader(cfgstr))
+              'sanitizers' : sanitizers,
+              'transliteration' : list(trans),
+              'variants' : [ {'words': list(variants)}]}
+    (test_config.project_dir / 'icu_tokenizer.yaml').write_text(yaml.dump(cfgstr))
+    tok.loader = ICURuleLoader(test_config)
 
     return tok.name_analyzer()
 
@@ -177,9 +180,9 @@ def test_init_from_project(monkeypatch, test_config, tokenizer_factory):
     monkeypatch.undo()
 
     tok = tokenizer_factory()
-    tok.init_from_project()
+    tok.init_from_project(test_config)
 
-    assert tok.naming_rules is not None
+    assert tok.loader is not None
     assert tok.term_normalization == ':: lower();'
 
 
@@ -308,44 +311,54 @@ class TestPlaceNames:
 
     @pytest.fixture(autouse=True)
     def setup(self, analyzer, sql_functions):
-        with analyzer() as anl:
+        sanitizers = [{'step': 'split-name-list'},
+                      {'step': 'strip-brace-terms'}]
+        with analyzer(sanitizers=sanitizers) as anl:
             self.analyzer = anl
             yield anl
 
 
     def expect_name_terms(self, info, *expected_terms):
         tokens = self.analyzer.get_word_token_info(expected_terms)
-        print (tokens)
         for token in tokens:
             assert token[2] is not None, "No token for {0}".format(token)
 
         assert eval(info['names']) == set((t[2] for t in tokens))
 
 
+    def process_named_place(self, names):
+        return self.analyzer.process_place(PlaceInfo({'name': names}))
 
 
     def test_simple_names(self):
-        info = self.analyzer.process_place({'name': {'name': 'Soft bAr', 'ref': '34'}})
+        info = self.process_named_place({'name': 'Soft bAr', 'ref': '34'})
 
         self.expect_name_terms(info, '#Soft bAr', '#34', 'Soft', 'bAr', '34')
 
 
     @pytest.mark.parametrize('sep', [',' , ';'])
     def test_names_with_separator(self, sep):
-        info = self.analyzer.process_place({'name': {'name': sep.join(('New York', 'Big Apple'))}})
+        info = self.process_named_place({'name': sep.join(('New York', 'Big Apple'))})
 
         self.expect_name_terms(info, '#New York', '#Big Apple',
                                'new', 'york', 'big', 'apple')
 
 
     def test_full_names_with_bracket(self):
-        info = self.analyzer.process_place({'name': {'name': 'Houseboat (left)'}})
+        info = self.process_named_place({'name': 'Houseboat (left)'})
 
         self.expect_name_terms(info, '#Houseboat (left)', '#Houseboat',
                                'houseboat', 'left')
 
 
     def test_country_name(self, word_table):
-        info = self.analyzer.process_place({'name': {'name': 'Norge'},
-                                            'country_feature': 'no'})
+        place = PlaceInfo({'name' : {'name': 'Norge'},
+                           'country_code': 'no',
+                           'rank_address': 4,
+                           'class': 'boundary',
+                           'type': 'administrative'})
+
+        info = self.analyzer.process_place(place)
 
         self.expect_name_terms(info, '#norge', 'norge')
         assert word_table.get_country() == {('no', 'NORGE')}
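Note the changed contract documented by test_country_name: country tokens are no longer requested through the synthetic 'country_feature' entry; the analyzer now decides from the ordinary place attributes that every PlaceInfo payload carries. Condensed before/after, restated from the test above:

# before: plain dict plus a synthetic 'country_feature' marker
info = analyzer.process_place({'name': {'name': 'Norge'},
                               'country_feature': 'no'})

# after: a PlaceInfo wrapper carrying the regular place attributes
info = analyzer.process_place(PlaceInfo({'name': {'name': 'Norge'},
                                         'country_code': 'no',
                                         'rank_address': 4,
                                         'class': 'boundary',
                                         'type': 'administrative'}))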
@@ -361,7 +374,7 @@ class TestPlaceAddress:
 
 
     def process_address(self, **kwargs):
-        return self.analyzer.process_place({'address': kwargs})
+        return self.analyzer.process_place(PlaceInfo({'address': kwargs}))
 
 
     def name_token_set(self, *expected_terms):
@@ -4,15 +4,17 @@ Tests for import name normalisation and variant generation.
 from textwrap import dedent
 
 import pytest
-import yaml
 
 from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
-from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules
 
 from nominatim.errors import UsageError
 
 @pytest.fixture
-def cfgfile():
+def cfgfile(def_config, tmp_path):
+    project_dir = tmp_path / 'project_dir'
+    project_dir.mkdir()
+    def_config.project_dir = project_dir
 
     def _create_config(*variants, **kwargs):
         content = dedent("""\
         normalization:
@@ -30,7 +32,9 @@ def cfgfile():
         content += '\n'.join((" - " + s for s in variants)) + '\n'
         for k, v in kwargs:
             content += " {}: {}\n".format(k, v)
-        return yaml.safe_load(content)
+        (project_dir / 'icu_tokenizer.yaml').write_text(content)
+
+        return def_config
 
     return _create_config
 
@@ -40,10 +44,9 @@ def get_normalized_variants(proc, name):
 
 
 def test_variants_empty(cfgfile):
-    fpath = cfgfile('saint -> 🜵', 'street -> st')
+    config = cfgfile('saint -> 🜵', 'street -> st')
 
-    rules = ICUNameProcessorRules(loader=ICURuleLoader(fpath))
-    proc = ICUNameProcessor(rules)
+    proc = ICURuleLoader(config).make_token_analysis()
 
     assert get_normalized_variants(proc, '🜵') == []
     assert get_normalized_variants(proc, '🜳') == []
@@ -83,8 +86,8 @@ VARIANT_TESTS = [
 
 @pytest.mark.parametrize("rules,name,variants", VARIANT_TESTS)
 def test_variants(cfgfile, rules, name, variants):
-    fpath = cfgfile(*rules)
-    proc = ICUNameProcessor(ICUNameProcessorRules(loader=ICURuleLoader(fpath)))
+    config = cfgfile(*rules)
+    proc = ICURuleLoader(config).make_token_analysis()
 
     result = get_normalized_variants(proc, name)
 
@@ -93,10 +96,8 @@ def test_variants(cfgfile, rules, name, variants):
 
 
 def test_search_normalized(cfgfile):
-    fpath = cfgfile('~street => s,st', 'master => mstr')
-
-    rules = ICUNameProcessorRules(loader=ICURuleLoader(fpath))
-    proc = ICUNameProcessor(rules)
+    config = cfgfile('~street => s,st', 'master => mstr')
+    proc = ICURuleLoader(config).make_token_analysis()
 
     assert proc.get_search_normalized('Master Street') == 'master street'
     assert proc.get_search_normalized('Earnes St') == 'earnes st'
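These tests pin down the API shift: the explicit two-step construction via ICUNameProcessorRules/ICUNameProcessor is gone, and the rule loader now reads icu_tokenizer.yaml from the project directory and builds the analysis object itself. Side by side:

# old construction (removed by this patch)
rules = ICUNameProcessorRules(loader=ICURuleLoader(cfg_dict))
proc = ICUNameProcessor(rules)

# new construction: the loader takes the project configuration
# and hands out the analysis object directly
proc = ICURuleLoader(config).make_token_analysis()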
@@ -12,7 +12,16 @@ from nominatim.errors import UsageError
 from icu import Transliterator
 
 @pytest.fixture
-def cfgrules():
+def test_config(def_config, tmp_path):
+    project_dir = tmp_path / 'project_dir'
+    project_dir.mkdir()
+    def_config.project_dir = project_dir
+
+    return def_config
+
+
+@pytest.fixture
+def cfgrules(test_config):
     def _create_config(*variants, **kwargs):
         content = dedent("""\
         normalization:
@@ -29,19 +38,21 @@ def cfgrules():
         content += '\n'.join((" - " + s for s in variants)) + '\n'
         for k, v in kwargs:
             content += " {}: {}\n".format(k, v)
-        return yaml.safe_load(content)
+        (test_config.project_dir / 'icu_tokenizer.yaml').write_text(content)
+
+        return test_config
 
     return _create_config
 
 
-def test_empty_rule_set():
-    rule_cfg = yaml.safe_load(dedent("""\
+def test_empty_rule_set(test_config):
+    (test_config.project_dir / 'icu_tokenizer.yaml').write_text(dedent("""\
         normalization:
         transliteration:
        variants:
         """))
 
-    rules = ICURuleLoader(rule_cfg)
+    rules = ICURuleLoader(test_config)
     assert rules.get_search_rules() == ''
     assert rules.get_normalization_rules() == ''
     assert rules.get_transliteration_rules() == ''
@@ -50,11 +61,12 @@ def test_empty_rule_set():
 CONFIG_SECTIONS = ('normalization', 'transliteration', 'variants')
 
 @pytest.mark.parametrize("section", CONFIG_SECTIONS)
-def test_missing_section(section):
+def test_missing_section(section, test_config):
     rule_cfg = { s: {} for s in CONFIG_SECTIONS if s != section}
+    (test_config.project_dir / 'icu_tokenizer.yaml').write_text(yaml.dump(rule_cfg))
 
     with pytest.raises(UsageError):
-        ICURuleLoader(rule_cfg)
+        ICURuleLoader(test_config)
 
 
 def test_get_search_rules(cfgrules):
@@ -88,9 +100,8 @@ def test_get_transliteration_rules(cfgrules):
     assert trans.transliterate(" проспект-Prospekt ") == " prospekt Prospekt "
 
 
-def test_transliteration_rules_from_file(def_config, tmp_path):
-    def_config.project_dir = tmp_path
-    cfgpath = tmp_path / ('test_config.yaml')
+def test_transliteration_rules_from_file(test_config):
+    cfgpath = test_config.project_dir / ('icu_tokenizer.yaml')
     cfgpath.write_text(dedent("""\
         normalization:
         transliteration:
@@ -98,10 +109,10 @@ def test_transliteration_rules_from_file(def_config, tmp_path):
         - !include transliteration.yaml
         variants:
         """))
-    transpath = tmp_path / ('transliteration.yaml')
+    transpath = test_config.project_dir / ('transliteration.yaml')
     transpath.write_text('- "x > y"')
 
-    loader = ICURuleLoader(def_config.load_sub_configuration('test_config.yaml'))
+    loader = ICURuleLoader(test_config)
     rules = loader.get_transliteration_rules()
     trans = Transliterator.createFromRules("test", rules)
 
@@ -5,6 +5,7 @@ import shutil
 
 import pytest
 
+from nominatim.indexer.place_info import PlaceInfo
 from nominatim.tokenizer import legacy_tokenizer
 from nominatim.db import properties
 from nominatim.errors import UsageError
@@ -131,10 +132,10 @@ def test_init_module_custom(tokenizer_factory, test_config,
     assert not (test_config.project_dir / 'module').exists()
 
 
-def test_init_from_project(tokenizer_setup, tokenizer_factory):
+def test_init_from_project(tokenizer_setup, tokenizer_factory, test_config):
     tok = tokenizer_factory()
 
-    tok.init_from_project()
+    tok.init_from_project(test_config)
 
     assert tok.normalization is not None
 
@@ -284,21 +285,21 @@ def test_add_more_country_names(analyzer, word_table, make_standard_name):
 
 
 def test_process_place_names(analyzer, make_keywords):
-    info = analyzer.process_place({'name' : {'name' : 'Soft bAr', 'ref': '34'}})
+    info = analyzer.process_place(PlaceInfo({'name' : {'name' : 'Soft bAr', 'ref': '34'}}))
 
     assert info['names'] == '{1,2,3}'
 
 
 @pytest.mark.parametrize('pcode', ['12345', 'AB 123', '34-345'])
 def test_process_place_postcode(analyzer, create_postcode_id, word_table, pcode):
-    analyzer.process_place({'address': {'postcode' : pcode}})
+    analyzer.process_place(PlaceInfo({'address': {'postcode' : pcode}}))
 
     assert word_table.get_postcodes() == {pcode, }
 
 
 @pytest.mark.parametrize('pcode', ['12:23', 'ab;cd;f', '123;836'])
 def test_process_place_bad_postcode(analyzer, create_postcode_id, word_table, pcode):
-    analyzer.process_place({'address': {'postcode' : pcode}})
+    analyzer.process_place(PlaceInfo({'address': {'postcode' : pcode}}))
 
     assert not word_table.get_postcodes()
 
@@ -319,7 +320,7 @@ class TestHousenumberName:
     @staticmethod
     @pytest.mark.parametrize('hnr', ['123a', '1', '101'])
     def test_process_place_housenumbers_simple(analyzer, hnr):
-        info = analyzer.process_place({'address': {'housenumber' : hnr}})
+        info = analyzer.process_place(PlaceInfo({'address': {'housenumber' : hnr}}))
 
         assert info['hnr'] == hnr
         assert info['hnr_tokens'].startswith("{")
@@ -327,15 +328,15 @@ class TestHousenumberName:
 
     @staticmethod
     def test_process_place_housenumbers_lists(analyzer):
-        info = analyzer.process_place({'address': {'conscriptionnumber' : '1; 2;3'}})
+        info = analyzer.process_place(PlaceInfo({'address': {'conscriptionnumber' : '1; 2;3'}}))
 
         assert set(info['hnr'].split(';')) == set(('1', '2', '3'))
 
 
     @staticmethod
     def test_process_place_housenumbers_duplicates(analyzer):
-        info = analyzer.process_place({'address': {'housenumber' : '134',
-                                                   'conscriptionnumber' : '134',
-                                                   'streetnumber' : '99a'}})
+        info = analyzer.process_place(PlaceInfo({'address': {'housenumber' : '134',
+                                                             'conscriptionnumber' : '134',
+                                                             'streetnumber' : '99a'}}))
 
         assert set(info['hnr'].split(';')) == set(('134', '99a'))
@@ -53,7 +53,7 @@ def test_check_tokenizer(temp_db_conn, def_config, monkeypatch,
                          check_result, state):
     class _TestTokenizer:
         @staticmethod
-        def check_database():
+        def check_database(_):
             return check_result
 
     monkeypatch.setattr(chkdb.tokenizer_factory, 'get_tokenizer_for_db',
test/python/tokenizer/sanitizers/test_split_name_list.py (new file, 65 lines)
@@ -0,0 +1,65 @@
+"""
+Tests for the sanitizer that splits multivalue lists.
+"""
+import pytest
+
+from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
+from nominatim.indexer.place_info import PlaceInfo
+
+from nominatim.errors import UsageError
+
+def run_sanitizer_on(**kwargs):
+    place = PlaceInfo({'name': kwargs})
+    name, _ = PlaceSanitizer([{'step': 'split-name-list'}]).process_names(place)
+
+    return sorted([(p.name, p.kind, p.suffix) for p in name])
+
+
+def sanitize_with_delimiter(delimiter, name):
+    place = PlaceInfo({'name': {'name': name}})
+    san = PlaceSanitizer([{'step': 'split-name-list', 'delimiters': delimiter}])
+    name, _ = san.process_names(place)
+
+    return sorted([p.name for p in name])
+
+
+def test_simple():
+    assert run_sanitizer_on(name='ABC') == [('ABC', 'name', None)]
+    assert run_sanitizer_on(name='') == [('', 'name', None)]
+
+
+def test_splits():
+    assert run_sanitizer_on(name='A;B;C') == [('A', 'name', None),
+                                              ('B', 'name', None),
+                                              ('C', 'name', None)]
+    assert run_sanitizer_on(short_name=' House, boat ') == [('House', 'short_name', None),
+                                                            ('boat', 'short_name', None)]
+
+
+def test_empty_fields():
+    assert run_sanitizer_on(name='A;;B') == [('A', 'name', None),
+                                             ('B', 'name', None)]
+    assert run_sanitizer_on(name='A; ,B') == [('A', 'name', None),
+                                              ('B', 'name', None)]
+    assert run_sanitizer_on(name=' ;B') == [('B', 'name', None)]
+    assert run_sanitizer_on(name='B,') == [('B', 'name', None)]
+
+
+def test_custom_delimiters():
+    assert sanitize_with_delimiter(':', '12:45,3') == ['12', '45,3']
+    assert sanitize_with_delimiter('\\', 'a;\\b!#@ \\') == ['a;', 'b!#@']
+    assert sanitize_with_delimiter('[]', 'foo[to]be') == ['be', 'foo', 'to']
+    assert sanitize_with_delimiter(' ', 'morning sun') == ['morning', 'sun']
+
+
+def test_empty_delimiter_set():
+    with pytest.raises(UsageError):
+        sanitize_with_delimiter('', 'abc')
+
+
+def test_no_name_list():
+    place = PlaceInfo({'address': {'housenumber': '3'}})
+    name, address = PlaceSanitizer([{'step': 'split-name-list'}]).process_names(place)
+
+    assert not name
+    assert len(address) == 1
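The behaviour pinned down by these tests, condensed into a sketch of the splitting rule; the function name and exact implementation are ours, not the module's. Values are split on the configured delimiters (the tests suggest ',' and ';' as defaults); parts are stripped of surrounding whitespace and empty parts dropped; a value without any delimiter passes through untouched, which is why the empty name in test_simple survives:

import re

def split_name_value(value, delimiters=',;'):
    # Hypothetical helper illustrating the split rule implied by the tests.
    regexp = re.compile('[{}]'.format(re.escape(delimiters)))
    if not regexp.search(value):
        return [value]          # no delimiter: keep as-is, even if empty
    return [p for p in (s.strip() for s in regexp.split(value)) if p]

assert split_name_value('A;B;C') == ['A', 'B', 'C']
assert split_name_value(' House, boat ') == ['House', 'boat']
assert split_name_value('A;;B') == ['A', 'B']
assert split_name_value('') == ['']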
test/python/tokenizer/sanitizers/test_strip_brace_terms.py (new file, 44 lines)
@@ -0,0 +1,44 @@
+"""
+Tests for the sanitizer that handles braced suffixes.
+"""
+import pytest
+
+from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
+from nominatim.indexer.place_info import PlaceInfo
+
+def run_sanitizer_on(**kwargs):
+    place = PlaceInfo({'name': kwargs})
+    name, _ = PlaceSanitizer([{'step': 'strip-brace-terms'}]).process_names(place)
+
+    return sorted([(p.name, p.kind, p.suffix) for p in name])
+
+
+def test_no_braces():
+    assert run_sanitizer_on(name='foo', ref='23') == [('23', 'ref', None),
+                                                      ('foo', 'name', None)]
+
+
+def test_simple_braces():
+    assert run_sanitizer_on(name='Halle (Saale)', ref='3')\
+           == [('3', 'ref', None), ('Halle', 'name', None), ('Halle (Saale)', 'name', None)]
+    assert run_sanitizer_on(name='ack ( bar')\
+           == [('ack', 'name', None), ('ack ( bar', 'name', None)]
+
+
+def test_only_braces():
+    assert run_sanitizer_on(name='(maybe)') == [('(maybe)', 'name', None)]
+
+
+def test_double_braces():
+    assert run_sanitizer_on(name='a((b))') == [('a', 'name', None),
+                                               ('a((b))', 'name', None)]
+    assert run_sanitizer_on(name='a (b) (c)') == [('a', 'name', None),
+                                                  ('a (b) (c)', 'name', None)]
+
+
+def test_no_names():
+    place = PlaceInfo({'address': {'housenumber': '3'}})
+    name, address = PlaceSanitizer([{'step': 'strip-brace-terms'}]).process_names(place)
+
+    assert not name
+    assert len(address) == 1
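A behavioural sketch of strip-brace-terms as implied by these tests; the function name and implementation are ours, not the module's. For each name containing an opening brace, an additional variant is added that cuts the name at the first '(' (trailing whitespace removed) while the original is kept; names that would become empty, like '(maybe)', yield no extra variant:

def brace_variants(name):
    # Hypothetical helper illustrating the rule implied by the tests.
    variants = [name]
    prefix = name.split('(')[0].rstrip()
    if prefix and prefix != name:
        variants.append(prefix)
    return variants

assert brace_variants('Halle (Saale)') == ['Halle (Saale)', 'Halle']
assert brace_variants('ack ( bar') == ['ack ( bar', 'ack']
assert brace_variants('(maybe)') == ['(maybe)']
assert brace_variants('a (b) (c)') == ['a (b) (c)', 'a']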
test/python/tokenizer/test_place_sanitizer.py (new file, 71 lines)
@@ -0,0 +1,71 @@
+"""
+Tests for execution of the sanitization step.
+"""
+import pytest
+
+from nominatim.errors import UsageError
+import nominatim.tokenizer.place_sanitizer as sanitizer
+from nominatim.indexer.place_info import PlaceInfo
+
+
+def test_placeinfo_clone_new_name():
+    place = sanitizer.PlaceName('foo', 'ki', 'su')
+
+    newplace = place.clone(name='bar')
+
+    assert place.name == 'foo'
+    assert newplace.name == 'bar'
+    assert newplace.kind == 'ki'
+    assert newplace.suffix == 'su'
+
+
+def test_placeinfo_clone_merge_attr():
+    place = sanitizer.PlaceName('foo', 'ki', 'su')
+    place.set_attr('a1', 'v1')
+    place.set_attr('a2', 'v2')
+
+    newplace = place.clone(attr={'a2': 'new', 'b2': 'foo'})
+
+    assert place.get_attr('a2') == 'v2'
+    assert place.get_attr('b2') is None
+    assert newplace.get_attr('a1') == 'v1'
+    assert newplace.get_attr('a2') == 'new'
+    assert newplace.get_attr('b2') == 'foo'
+
+
+def test_placeinfo_has_attr():
+    place = sanitizer.PlaceName('foo', 'ki', 'su')
+    place.set_attr('a1', 'v1')
+
+    assert place.has_attr('a1')
+    assert not place.has_attr('whatever')
+
+
+def test_sanitizer_default():
+    san = sanitizer.PlaceSanitizer([{'step': 'split-name-list'}])
+
+    name, address = san.process_names(PlaceInfo({'name': {'name:de:de': '1;2;3'},
+                                                 'address': {'street': 'Bald'}}))
+
+    assert len(name) == 3
+    assert all(isinstance(n, sanitizer.PlaceName) for n in name)
+    assert all(n.kind == 'name' for n in name)
+    assert all(n.suffix == 'de:de' for n in name)
+
+    assert len(address) == 1
+    assert all(isinstance(n, sanitizer.PlaceName) for n in address)
+
+
+@pytest.mark.parametrize('rules', [None, []])
+def test_sanitizer_empty_list(rules):
+    san = sanitizer.PlaceSanitizer(rules)
+
+    name, address = san.process_names(PlaceInfo({'name': {'name:de:de': '1;2;3'}}))
+
+    assert len(name) == 1
+    assert all(isinstance(n, sanitizer.PlaceName) for n in name)
+
+
+def test_sanitizer_missing_step_definition():
+    with pytest.raises(UsageError):
+        san = sanitizer.PlaceSanitizer([{'id': 'split-name-list'}])
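For reference, a minimal sketch of the PlaceName object these tests exercise; attribute and method names are taken from the tests, the implementation details are ours, and the real class lives in nominatim.tokenizer.place_sanitizer. For a key like name:de:de the kind is 'name' and the suffix 'de:de':

class PlaceName:
    """ Sketch of a sanitized tag value with kind/suffix split off the key. """

    def __init__(self, name, kind, suffix):
        self.name = name
        self.kind = kind
        self.suffix = suffix
        self.attr = {}

    def clone(self, name=None, attr=None):
        # Copy the entry; optionally replace the name and merge extra
        # attributes without touching the original.
        newobj = PlaceName(self.name if name is None else name,
                           self.kind, self.suffix)
        newobj.attr.update(self.attr)
        if attr:
            newobj.attr.update(attr)
        return newobj

    def set_attr(self, key, value):
        self.attr[key] = value

    def get_attr(self, key, default=None):
        return self.attr.get(key, default)

    def has_attr(self, key):
        return key in self.attr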