split code into submodules

Sarah Hoffmann
2024-05-16 11:55:17 +02:00
parent 0fb4fe8e4d
commit 6e89310a92
137 changed files with 757 additions and 716 deletions

src/nominatim_db/cli.py Normal file

@@ -0,0 +1,228 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Command-line interface to the Nominatim functions for import, update,
database administration and querying.
"""
from typing import Optional, Any
import importlib
import logging
import os
import sys
import argparse
from pathlib import Path
from nominatim_core.config import Configuration
from nominatim_core.errors import UsageError
from .tools.exec_utils import run_php_server
from . import clicmd
from . import version
from .clicmd.args import NominatimArgs, Subcommand
LOG = logging.getLogger()
class CommandlineParser:
""" Wraps some of the common functions for parsing the command line
and setting up subcommands.
"""
def __init__(self, prog: str, description: Optional[str]):
self.parser = argparse.ArgumentParser(
prog=prog,
description=description,
formatter_class=argparse.RawDescriptionHelpFormatter)
self.subs = self.parser.add_subparsers(title='available commands',
dest='subcommand')
# Global arguments that only work if no sub-command given
self.parser.add_argument('--version', action='store_true',
help='Print Nominatim version and exit')
# Arguments added to every sub-command
self.default_args = argparse.ArgumentParser(add_help=False)
group = self.default_args.add_argument_group('Default arguments')
group.add_argument('-h', '--help', action='help',
help='Show this help message and exit')
group.add_argument('-q', '--quiet', action='store_const', const=0,
dest='verbose', default=1,
help='Print only error messages')
group.add_argument('-v', '--verbose', action='count', default=1,
                           help='Increase verbosity of output')
group.add_argument('--project-dir', metavar='DIR', default='.',
                           help='Base directory of the Nominatim installation (default: .)')
group.add_argument('-j', '--threads', metavar='NUM', type=int,
help='Number of parallel threads to use')
def nominatim_version_text(self) -> str:
""" Program name and version number as string
"""
text = f'Nominatim version {version.NOMINATIM_VERSION!s}'
if version.GIT_COMMIT_HASH is not None:
text += f' ({version.GIT_COMMIT_HASH})'
return text
def add_subcommand(self, name: str, cmd: Subcommand) -> None:
""" Add a subcommand to the parser. The subcommand must be a class
with a function add_args() that adds the parameters for the
subcommand and a run() function that executes the command.
"""
assert cmd.__doc__ is not None
parser = self.subs.add_parser(name, parents=[self.default_args],
help=cmd.__doc__.split('\n', 1)[0],
description=cmd.__doc__,
formatter_class=argparse.RawDescriptionHelpFormatter,
add_help=False)
parser.set_defaults(command=cmd)
cmd.add_args(parser)
def run(self, **kwargs: Any) -> int:
""" Parse the command line arguments of the program and execute the
appropriate subcommand.
"""
args = NominatimArgs()
try:
self.parser.parse_args(args=kwargs.get('cli_args'), namespace=args)
except SystemExit:
return 1
if args.version:
print(self.nominatim_version_text())
return 0
if args.subcommand is None:
self.parser.print_help()
return 1
args.project_dir = Path(args.project_dir).resolve()
if 'cli_args' not in kwargs:
logging.basicConfig(stream=sys.stderr,
format='%(asctime)s: %(message)s',
datefmt='%Y-%m-%d %H:%M:%S',
level=max(4 - args.verbose, 1) * 10)
args.config = Configuration(args.project_dir,
environ=kwargs.get('environ', os.environ))
args.config.set_libdirs(module=kwargs['module_dir'],
osm2pgsql=kwargs['osm2pgsql_path'])
log = logging.getLogger()
log.warning('Using project directory: %s', str(args.project_dir))
try:
return args.command.run(args)
except UsageError as exception:
if log.isEnabledFor(logging.DEBUG):
raise # use Python's exception printing
log.fatal('FATAL: %s', exception)
# If we get here, then execution has failed in some way.
return 1
# Subcommand classes
#
# Each class needs to implement two functions: add_args() adds the CLI parameters
# for the subfunction, run() executes the subcommand.
#
# The class documentation doubles as the help text for the command. The
# first line is also used in the summary when calling the program without
# a subcommand.
#
# No need to document the functions each time.
# pylint: disable=C0111
class AdminServe:
"""\
Start a simple web server for serving the API.
This command starts a built-in webserver to serve the website
from the current project directory. This webserver is only suitable
for testing and development. Do not use it in production setups!
There are different webservers available. The default 'php' engine
runs the classic PHP frontend. The other engines are Python servers
which run the new Python frontend code. This is highly experimental
at the moment and may not include the full API.
    By default, the webserver can be accessed at: http://127.0.0.1:8088
"""
def add_args(self, parser: argparse.ArgumentParser) -> None:
group = parser.add_argument_group('Server arguments')
group.add_argument('--server', default='127.0.0.1:8088',
help='The address the server will listen to.')
group.add_argument('--engine', default='falcon',
choices=('php', 'falcon', 'starlette'),
help='Webserver framework to run. (default: falcon)')
def run(self, args: NominatimArgs) -> int:
if args.engine == 'php':
if args.config.lib_dir.php is None:
raise UsageError("PHP frontend not configured.")
run_php_server(args.server, args.project_dir / 'website')
else:
import uvicorn # pylint: disable=import-outside-toplevel
server_info = args.server.split(':', 1)
host = server_info[0]
if len(server_info) > 1:
if not server_info[1].isdigit():
raise UsageError('Invalid format for --server parameter. Use <host>:<port>')
port = int(server_info[1])
else:
port = 8088
            server_module = importlib.import_module(f'nominatim_api.server.{args.engine}.server')
app = server_module.get_application(args.project_dir)
uvicorn.run(app, host=host, port=port)
return 0
def get_set_parser() -> CommandlineParser:
"""\
Initializes the parser and adds various subcommands for
nominatim cli.
"""
parser = CommandlineParser('nominatim', nominatim.__doc__)
parser.add_subcommand('import', clicmd.SetupAll())
parser.add_subcommand('freeze', clicmd.SetupFreeze())
parser.add_subcommand('replication', clicmd.UpdateReplication())
parser.add_subcommand('special-phrases', clicmd.ImportSpecialPhrases())
parser.add_subcommand('add-data', clicmd.UpdateAddData())
parser.add_subcommand('index', clicmd.UpdateIndex())
parser.add_subcommand('refresh', clicmd.UpdateRefresh())
parser.add_subcommand('admin', clicmd.AdminFuncs())
parser.add_subcommand('export', clicmd.QueryExport())
parser.add_subcommand('convert', clicmd.ConvertDB())
parser.add_subcommand('serve', AdminServe())
parser.add_subcommand('search', clicmd.APISearch())
parser.add_subcommand('reverse', clicmd.APIReverse())
parser.add_subcommand('lookup', clicmd.APILookup())
parser.add_subcommand('details', clicmd.APIDetails())
parser.add_subcommand('status', clicmd.APIStatus())
return parser
def nominatim(**kwargs: Any) -> int:
"""\
Command-line tools for importing, updating, administrating and
querying the Nominatim database.
"""
return get_set_parser().run(**kwargs)
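
A usage sketch (not part of the commit): run() above reads cli_args, environ, module_dir and osm2pgsql_path from its keyword arguments, so the CLI can be driven from Python. All paths below are hypothetical placeholders.

# Hypothetical driver for the nominatim() entry point above; the library
# paths are placeholders for whatever the installation provides.
import os

from nominatim_db.cli import nominatim

exit_code = nominatim(cli_args=['--version'],
                      environ=os.environ,
                      module_dir='/usr/local/lib/nominatim/module',
                      osm2pgsql_path='/usr/local/bin/osm2pgsql')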

src/nominatim_db/clicmd/__init__.py Normal file

@@ -0,0 +1,28 @@
# SPDX-License-Identifier: GPL-2.0-only
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2023 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Subcommand definitions for the command-line tool.
"""
# mypy and pylint disagree about the style of explicit exports,
# see https://github.com/PyCQA/pylint/issues/6006.
# pylint: disable=useless-import-alias
from .setup import SetupAll as SetupAll
from .replication import UpdateReplication as UpdateReplication
from .api import (APISearch as APISearch,
APIReverse as APIReverse,
APILookup as APILookup,
APIDetails as APIDetails,
APIStatus as APIStatus)
from .index import UpdateIndex as UpdateIndex
from .refresh import UpdateRefresh as UpdateRefresh
from .add_data import UpdateAddData as UpdateAddData
from .admin import AdminFuncs as AdminFuncs
from .freeze import SetupFreeze as SetupFreeze
from .special_phrases import ImportSpecialPhrases as ImportSpecialPhrases
from .export import QueryExport as QueryExport
from .convert import ConvertDB as ConvertDB

src/nominatim_db/clicmd/add_data.py Normal file

@@ -0,0 +1,101 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Implementation of the 'add-data' subcommand.
"""
from typing import cast
import argparse
import logging
import psutil
from .args import NominatimArgs
# Do not repeat documentation of subcommand classes.
# pylint: disable=C0111
# Using non-top-level imports to avoid potentially unused imports.
# pylint: disable=E0012,C0415
LOG = logging.getLogger()
class UpdateAddData:
"""\
Add additional data from a file or an online source.
    This command allows you to add or update the search data in the database.
    The data can come either from an OSM file, or single OSM objects can be
    downloaded directly from the OSM API. This function only loads the data
    into the database. Afterwards it still needs to be integrated into the
    search index. Use the `nominatim index` command for that.
The command can also be used to add external non-OSM data to the
database. At the moment the only supported format is TIGER housenumber
data. See the online documentation at
https://nominatim.org/release-docs/latest/admin/Import/#installing-tiger-housenumber-data-for-the-us
for more information.
"""
def add_args(self, parser: argparse.ArgumentParser) -> None:
group_name = parser.add_argument_group('Source')
group1 = group_name.add_mutually_exclusive_group(required=True)
group1.add_argument('--file', metavar='FILE',
help='Import data from an OSM file or diff file')
group1.add_argument('--diff', metavar='FILE',
help='Import data from an OSM diff file (deprecated: use --file)')
group1.add_argument('--node', metavar='ID', type=int,
help='Import a single node from the API')
group1.add_argument('--way', metavar='ID', type=int,
help='Import a single way from the API')
group1.add_argument('--relation', metavar='ID', type=int,
help='Import a single relation from the API')
group1.add_argument('--tiger-data', metavar='DIR',
help='Add housenumbers from the US TIGER census database')
group2 = parser.add_argument_group('Extra arguments')
group2.add_argument('--use-main-api', action='store_true',
help='Use OSM API instead of Overpass to download objects')
group2.add_argument('--osm2pgsql-cache', metavar='SIZE', type=int,
help='Size of cache to be used by osm2pgsql (in MB)')
group2.add_argument('--socket-timeout', dest='socket_timeout', type=int, default=60,
help='Set timeout for file downloads')
def run(self, args: NominatimArgs) -> int:
from ..tokenizer import factory as tokenizer_factory
from ..tools import tiger_data, add_osm_data
if args.tiger_data:
tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)
return tiger_data.add_tiger_data(args.tiger_data,
args.config,
args.threads or psutil.cpu_count() or 1,
tokenizer)
osm2pgsql_params = args.osm2pgsql_options(default_cache=1000, default_threads=1)
if args.file or args.diff:
return add_osm_data.add_data_from_file(args.config.get_libpq_dsn(),
cast(str, args.file or args.diff),
osm2pgsql_params)
if args.node:
return add_osm_data.add_osm_object(args.config.get_libpq_dsn(),
'node', args.node,
args.use_main_api,
osm2pgsql_params)
if args.way:
return add_osm_data.add_osm_object(args.config.get_libpq_dsn(),
'way', args.way,
args.use_main_api,
osm2pgsql_params)
if args.relation:
return add_osm_data.add_osm_object(args.config.get_libpq_dsn(),
'relation', args.relation,
args.use_main_api,
osm2pgsql_params)
return 0
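
A sketch of the two-step workflow the docstring describes, reusing the nominatim() entry point from cli.py above; the object ID and library paths are made-up placeholders.

# Hypothetical sketch: load one OSM node, then integrate it into the
# search index with a second call, as the docstring above requires.
from nominatim_db.cli import nominatim

libs = dict(module_dir='/path/to/module', osm2pgsql_path='/path/to/osm2pgsql')

nominatim(cli_args=['add-data', '--node', '123456'], **libs)  # load only
nominatim(cli_args=['index'], **libs)                         # build search index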

src/nominatim_db/clicmd/admin.py Normal file

@@ -0,0 +1,123 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Implementation of the 'admin' subcommand.
"""
import logging
import argparse
import random
import nominatim_api as napi
from nominatim_core.db.connection import connect
from .args import NominatimArgs
# Do not repeat documentation of subcommand classes.
# pylint: disable=C0111
# Using non-top-level imports to avoid potentially unused imports.
# pylint: disable=E0012,C0415
LOG = logging.getLogger()
class AdminFuncs:
"""\
Analyse and maintain the database.
"""
def add_args(self, parser: argparse.ArgumentParser) -> None:
group = parser.add_argument_group('Admin tasks')
objs = group.add_mutually_exclusive_group(required=True)
objs.add_argument('--warm', action='store_true',
help='Warm database caches for search and reverse queries')
objs.add_argument('--check-database', action='store_true',
help='Check that the database is complete and operational')
objs.add_argument('--migrate', action='store_true',
help='Migrate the database to a new software version')
objs.add_argument('--analyse-indexing', action='store_true',
help='Print performance analysis of the indexing process')
objs.add_argument('--collect-os-info', action="store_true",
help="Generate a report about the host system information")
objs.add_argument('--clean-deleted', action='store', metavar='AGE',
help='Clean up deleted relations')
group = parser.add_argument_group('Arguments for cache warming')
group.add_argument('--search-only', action='store_const', dest='target',
const='search',
help="Only pre-warm tables for search queries")
group.add_argument('--reverse-only', action='store_const', dest='target',
const='reverse',
help="Only pre-warm tables for reverse queries")
        group = parser.add_argument_group('Arguments for index analysis')
mgroup = group.add_mutually_exclusive_group()
mgroup.add_argument('--osm-id', type=str,
help='Analyse indexing of the given OSM object')
mgroup.add_argument('--place-id', type=int,
help='Analyse indexing of the given Nominatim object')
def run(self, args: NominatimArgs) -> int:
# pylint: disable=too-many-return-statements
if args.warm:
return self._warm(args)
if args.check_database:
LOG.warning('Checking database')
from ..tools import check_database
return check_database.check_database(args.config)
if args.analyse_indexing:
LOG.warning('Analysing performance of indexing function')
from ..tools import admin
admin.analyse_indexing(args.config, osm_id=args.osm_id, place_id=args.place_id)
return 0
if args.migrate:
LOG.warning('Checking for necessary database migrations')
from ..tools import migration
return migration.migrate(args.config, args)
if args.collect_os_info:
LOG.warning("Reporting System Information")
from ..tools import collect_os_info
collect_os_info.report_system_information(args.config)
return 0
if args.clean_deleted:
LOG.warning('Cleaning up deleted relations')
from ..tools import admin
admin.clean_deleted_relations(args.config, age=args.clean_deleted)
return 0
return 1
def _warm(self, args: NominatimArgs) -> int:
LOG.warning('Warming database caches')
api = napi.NominatimAPI(args.project_dir)
try:
if args.target != 'search':
for _ in range(1000):
api.reverse((random.uniform(-90, 90), random.uniform(-180, 180)),
address_details=True)
if args.target != 'reverse':
from ..tokenizer import factory as tokenizer_factory
tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)
with connect(args.config.get_libpq_dsn()) as conn:
if conn.table_exists('search_name'):
words = tokenizer.most_frequent_words(conn, 1000)
else:
words = []
for word in words:
api.search(word)
finally:
api.close()
return 0
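
A standalone sketch of the warming idea used by _warm() above, assuming a project directory with an already imported database; the path is a placeholder.

# Minimal cache-warming loop modelled on _warm() above.
import random

import nominatim_api as napi

api = napi.NominatimAPI('/srv/nominatim-project')   # placeholder path
try:
    for _ in range(100):
        # Same call pattern as _warm(): random coordinate, full address.
        api.reverse((random.uniform(-90, 90), random.uniform(-180, 180)),
                    address_details=True)
finally:
    api.close()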

src/nominatim_db/clicmd/api.py Normal file

@@ -0,0 +1,374 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Subcommand definitions for API calls from the command line.
"""
from typing import Dict, Any
import argparse
import logging
import json
import sys
import nominatim_api as napi
import nominatim_api.v1 as api_output
from nominatim_api.v1.helpers import zoom_to_rank, deduplicate_results
from nominatim_api.v1.format import dispatch as formatting
import nominatim_api.logging as loglib
from .args import NominatimArgs
# Do not repeat documentation of subcommand classes.
# pylint: disable=C0111
LOG = logging.getLogger()
STRUCTURED_QUERY = (
('amenity', 'name and/or type of POI'),
('street', 'housenumber and street'),
('city', 'city, town or village'),
('county', 'county'),
('state', 'state'),
('country', 'country'),
('postalcode', 'postcode')
)
EXTRADATA_PARAMS = (
('addressdetails', 'Include a breakdown of the address into elements'),
('extratags', ("Include additional information if available "
"(e.g. wikipedia link, opening hours)")),
('namedetails', 'Include a list of alternative names')
)
def _add_api_output_arguments(parser: argparse.ArgumentParser) -> None:
group = parser.add_argument_group('Output arguments')
group.add_argument('--format', default='jsonv2',
choices=formatting.list_formats(napi.SearchResults) + ['debug'],
help='Format of result')
for name, desc in EXTRADATA_PARAMS:
group.add_argument('--' + name, action='store_true', help=desc)
group.add_argument('--lang', '--accept-language', metavar='LANGS',
help='Preferred language order for presenting search results')
group.add_argument('--polygon-output',
choices=['geojson', 'kml', 'svg', 'text'],
                       help='Output geometry of results as GeoJSON, KML, SVG or WKT')
group.add_argument('--polygon-threshold', type=float, default = 0.0,
metavar='TOLERANCE',
help=("Simplify output geometry."
"Parameter is difference tolerance in degrees."))
class APISearch:
"""\
Execute a search query.
This command works exactly the same as if calling the /search endpoint on
the web API. See the online documentation for more details on the
various parameters:
https://nominatim.org/release-docs/latest/api/Search/
"""
def add_args(self, parser: argparse.ArgumentParser) -> None:
group = parser.add_argument_group('Query arguments')
group.add_argument('--query',
help='Free-form query string')
for name, desc in STRUCTURED_QUERY:
group.add_argument('--' + name, help='Structured query: ' + desc)
_add_api_output_arguments(parser)
group = parser.add_argument_group('Result limitation')
group.add_argument('--countrycodes', metavar='CC,..',
help='Limit search results to one or more countries')
group.add_argument('--exclude_place_ids', metavar='ID,..',
                           help='List of search objects to be excluded')
group.add_argument('--limit', type=int, default=10,
help='Limit the number of returned results')
group.add_argument('--viewbox', metavar='X1,Y1,X2,Y2',
help='Preferred area to find search results')
group.add_argument('--bounded', action='store_true',
help='Strictly restrict results to viewbox area')
group = parser.add_argument_group('Other arguments')
group.add_argument('--no-dedupe', action='store_false', dest='dedupe',
help='Do not remove duplicates from the result list')
def run(self, args: NominatimArgs) -> int:
if args.format == 'debug':
loglib.set_log_output('text')
api = napi.NominatimAPI(args.project_dir)
params: Dict[str, Any] = {'max_results': args.limit + min(args.limit, 10),
'address_details': True, # needed for display name
'geometry_output': args.get_geometry_output(),
'geometry_simplification': args.polygon_threshold,
'countries': args.countrycodes,
'excluded': args.exclude_place_ids,
'viewbox': args.viewbox,
'bounded_viewbox': args.bounded,
'locales': args.get_locales(api.config.DEFAULT_LANGUAGE)
}
if args.query:
results = api.search(args.query, **params)
else:
results = api.search_address(amenity=args.amenity,
street=args.street,
city=args.city,
county=args.county,
state=args.state,
postalcode=args.postalcode,
country=args.country,
**params)
if args.dedupe and len(results) > 1:
results = deduplicate_results(results, args.limit)
if args.format == 'debug':
print(loglib.get_and_disable())
return 0
output = api_output.format_result(
results,
args.format,
{'extratags': args.extratags,
'namedetails': args.namedetails,
'addressdetails': args.addressdetails})
if args.format != 'xml':
# reformat the result, so it is pretty-printed
json.dump(json.loads(output), sys.stdout, indent=4, ensure_ascii=False)
else:
sys.stdout.write(output)
sys.stdout.write('\n')
return 0
class APIReverse:
"""\
Execute API reverse query.
This command works exactly the same as if calling the /reverse endpoint on
the web API. See the online documentation for more details on the
various parameters:
https://nominatim.org/release-docs/latest/api/Reverse/
"""
def add_args(self, parser: argparse.ArgumentParser) -> None:
group = parser.add_argument_group('Query arguments')
group.add_argument('--lat', type=float, required=True,
help='Latitude of coordinate to look up (in WGS84)')
group.add_argument('--lon', type=float, required=True,
help='Longitude of coordinate to look up (in WGS84)')
group.add_argument('--zoom', type=int,
help='Level of detail required for the address')
group.add_argument('--layer', metavar='LAYER',
choices=[n.name.lower() for n in napi.DataLayer if n.name],
action='append', required=False, dest='layers',
                           help='Limit results to the given layer(s) (may be repeated)')
_add_api_output_arguments(parser)
def run(self, args: NominatimArgs) -> int:
if args.format == 'debug':
loglib.set_log_output('text')
api = napi.NominatimAPI(args.project_dir)
result = api.reverse(napi.Point(args.lon, args.lat),
max_rank=zoom_to_rank(args.zoom or 18),
layers=args.get_layers(napi.DataLayer.ADDRESS | napi.DataLayer.POI),
address_details=True, # needed for display name
geometry_output=args.get_geometry_output(),
geometry_simplification=args.polygon_threshold,
locales=args.get_locales(api.config.DEFAULT_LANGUAGE))
if args.format == 'debug':
print(loglib.get_and_disable())
return 0
if result:
output = api_output.format_result(
napi.ReverseResults([result]),
args.format,
{'extratags': args.extratags,
'namedetails': args.namedetails,
'addressdetails': args.addressdetails})
if args.format != 'xml':
# reformat the result, so it is pretty-printed
json.dump(json.loads(output), sys.stdout, indent=4, ensure_ascii=False)
else:
sys.stdout.write(output)
sys.stdout.write('\n')
return 0
LOG.error("Unable to geocode.")
return 42
class APILookup:
"""\
Execute API lookup query.
This command works exactly the same as if calling the /lookup endpoint on
the web API. See the online documentation for more details on the
various parameters:
https://nominatim.org/release-docs/latest/api/Lookup/
"""
def add_args(self, parser: argparse.ArgumentParser) -> None:
group = parser.add_argument_group('Query arguments')
group.add_argument('--id', metavar='OSMID',
action='append', required=True, dest='ids',
help='OSM id to lookup in format <NRW><id> (may be repeated)')
_add_api_output_arguments(parser)
    def run(self, args: NominatimArgs) -> int:
        if args.format == 'debug':
            loglib.set_log_output('text')
        api = napi.NominatimAPI(args.project_dir)
        places = [napi.OsmID(o[0], int(o[1:])) for o in args.ids]
        results = api.lookup(places,
                             address_details=True, # needed for display name
                             geometry_output=args.get_geometry_output(),
                             geometry_simplification=args.polygon_threshold or 0.0,
                             locales=args.get_locales(api.config.DEFAULT_LANGUAGE))
        if args.format == 'debug':
            print(loglib.get_and_disable())
            return 0
output = api_output.format_result(
results,
args.format,
{'extratags': args.extratags,
'namedetails': args.namedetails,
'addressdetails': args.addressdetails})
if args.format != 'xml':
# reformat the result, so it is pretty-printed
json.dump(json.loads(output), sys.stdout, indent=4, ensure_ascii=False)
else:
sys.stdout.write(output)
sys.stdout.write('\n')
return 0
class APIDetails:
"""\
Execute API details query.
This command works exactly the same as if calling the /details endpoint on
the web API. See the online documentation for more details on the
various parameters:
https://nominatim.org/release-docs/latest/api/Details/
"""
def add_args(self, parser: argparse.ArgumentParser) -> None:
group = parser.add_argument_group('Query arguments')
objs = group.add_mutually_exclusive_group(required=True)
objs.add_argument('--node', '-n', type=int,
help="Look up the OSM node with the given ID.")
objs.add_argument('--way', '-w', type=int,
help="Look up the OSM way with the given ID.")
objs.add_argument('--relation', '-r', type=int,
help="Look up the OSM relation with the given ID.")
objs.add_argument('--place_id', '-p', type=int,
help='Database internal identifier of the OSM object to look up')
group.add_argument('--class', dest='object_class',
help=("Class type to disambiguated multiple entries "
"of the same object."))
group = parser.add_argument_group('Output arguments')
group.add_argument('--addressdetails', action='store_true',
help='Include a breakdown of the address into elements')
group.add_argument('--keywords', action='store_true',
help='Include a list of name keywords and address keywords')
group.add_argument('--linkedplaces', action='store_true',
                           help='Include details of places that are linked with this one')
group.add_argument('--hierarchy', action='store_true',
help='Include details of places lower in the address hierarchy')
group.add_argument('--group_hierarchy', action='store_true',
help='Group the places by type')
group.add_argument('--polygon_geojson', action='store_true',
help='Include geometry of result')
group.add_argument('--lang', '--accept-language', metavar='LANGS',
help='Preferred language order for presenting search results')
def run(self, args: NominatimArgs) -> int:
place: napi.PlaceRef
if args.node:
place = napi.OsmID('N', args.node, args.object_class)
elif args.way:
place = napi.OsmID('W', args.way, args.object_class)
elif args.relation:
place = napi.OsmID('R', args.relation, args.object_class)
else:
assert args.place_id is not None
place = napi.PlaceID(args.place_id)
api = napi.NominatimAPI(args.project_dir)
locales = args.get_locales(api.config.DEFAULT_LANGUAGE)
result = api.details(place,
address_details=args.addressdetails,
linked_places=args.linkedplaces,
parented_places=args.hierarchy,
keywords=args.keywords,
geometry_output=napi.GeometryFormat.GEOJSON
if args.polygon_geojson
else napi.GeometryFormat.NONE,
locales=locales)
if result:
output = api_output.format_result(
result,
'json',
{'locales': locales,
'group_hierarchy': args.group_hierarchy})
# reformat the result, so it is pretty-printed
json.dump(json.loads(output), sys.stdout, indent=4, ensure_ascii=False)
sys.stdout.write('\n')
return 0
LOG.error("Object not found in database.")
return 42
class APIStatus:
"""
Execute API status query.
This command works exactly the same as if calling the /status endpoint on
the web API. See the online documentation for more details on the
various parameters:
https://nominatim.org/release-docs/latest/api/Status/
"""
def add_args(self, parser: argparse.ArgumentParser) -> None:
formats = api_output.list_formats(napi.StatusResult)
group = parser.add_argument_group('API parameters')
group.add_argument('--format', default=formats[0], choices=formats,
help='Format of result')
def run(self, args: NominatimArgs) -> int:
status = napi.NominatimAPI(args.project_dir).status()
print(api_output.format_result(status, args.format, {}))
return 0
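
The dispatch in APISearch.run() above boils down to: a free-form --query goes through search(), the structured fields through search_address(). A minimal sketch; the project path and query values are placeholders.

# Sketch of the free-form vs. structured dispatch from APISearch.run().
import nominatim_api as napi

api = napi.NominatimAPI('/srv/nominatim-project')   # placeholder path
try:
    free_form = api.search('Birkenweg, Potsdam', address_details=True)
    structured = api.search_address(street='Birkenweg', city='Potsdam',
                                    address_details=True)
finally:
    api.close()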

src/nominatim_db/clicmd/args.py Normal file

@@ -0,0 +1,260 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Provides custom functions over command-line arguments.
"""
from typing import Optional, List, Dict, Any, Sequence, Tuple
import argparse
import logging
from functools import reduce
from pathlib import Path
from nominatim_core.errors import UsageError
from nominatim_core.config import Configuration
from nominatim_core.typing import Protocol
import nominatim_api as napi
LOG = logging.getLogger()
class Subcommand(Protocol):
"""
Interface to be implemented by classes implementing a CLI subcommand.
"""
def add_args(self, parser: argparse.ArgumentParser) -> None:
"""
Fill the given parser for the subcommand with the appropriate
parameters.
"""
def run(self, args: 'NominatimArgs') -> int:
"""
Run the subcommand with the given parsed arguments.
"""
class NominatimArgs:
""" Customized namespace class for the nominatim command line tool
to receive the command-line arguments.
"""
# Basic environment set by root program.
config: Configuration
project_dir: Path
# Global switches
version: bool
subcommand: Optional[str]
command: Subcommand
# Shared parameters
osm2pgsql_cache: Optional[int]
socket_timeout: int
# Arguments added to all subcommands.
verbose: int
threads: Optional[int]
# Arguments to 'add-data'
file: Optional[str]
diff: Optional[str]
node: Optional[int]
way: Optional[int]
relation: Optional[int]
tiger_data: Optional[str]
use_main_api: bool
# Arguments to 'admin'
warm: bool
check_database: bool
migrate: bool
collect_os_info: bool
clean_deleted: str
analyse_indexing: bool
target: Optional[str]
osm_id: Optional[str]
place_id: Optional[int]
# Arguments to 'import'
osm_file: List[str]
continue_at: Optional[str]
reverse_only: bool
no_partitions: bool
no_updates: bool
offline: bool
ignore_errors: bool
index_noanalyse: bool
prepare_database: bool
# Arguments to 'index'
boundaries_only: bool
no_boundaries: bool
minrank: int
maxrank: int
# Arguments to 'export'
output_type: str
output_format: str
output_all_postcodes: bool
language: Optional[str]
restrict_to_country: Optional[str]
# Arguments to 'convert'
output: Path
# Arguments to 'refresh'
postcodes: bool
word_tokens: bool
word_counts: bool
address_levels: bool
functions: bool
wiki_data: bool
secondary_importance: bool
importance: bool
website: bool
diffs: bool
enable_debug_statements: bool
data_object: Sequence[Tuple[str, int]]
data_area: Sequence[Tuple[str, int]]
# Arguments to 'replication'
init: bool
update_functions: bool
check_for_updates: bool
once: bool
catch_up: bool
do_index: bool
# Arguments to 'serve'
server: str
engine: str
    # Arguments to 'special-phrases'
import_from_wiki: bool
import_from_csv: Optional[str]
no_replace: bool
# Arguments to all query functions
format: str
addressdetails: bool
extratags: bool
namedetails: bool
lang: Optional[str]
polygon_output: Optional[str]
polygon_threshold: Optional[float]
# Arguments to 'search'
query: Optional[str]
amenity: Optional[str]
street: Optional[str]
city: Optional[str]
county: Optional[str]
state: Optional[str]
country: Optional[str]
postalcode: Optional[str]
countrycodes: Optional[str]
exclude_place_ids: Optional[str]
limit: int
viewbox: Optional[str]
bounded: bool
dedupe: bool
# Arguments to 'reverse'
lat: float
lon: float
zoom: Optional[int]
layers: Optional[Sequence[str]]
# Arguments to 'lookup'
ids: Sequence[str]
# Arguments to 'details'
object_class: Optional[str]
linkedplaces: bool
hierarchy: bool
keywords: bool
polygon_geojson: bool
group_hierarchy: bool
def osm2pgsql_options(self, default_cache: int,
default_threads: int) -> Dict[str, Any]:
""" Return the standard osm2pgsql options that can be derived
from the command line arguments. The resulting dict can be
further customized and then used in `run_osm2pgsql()`.
"""
return dict(osm2pgsql=self.config.OSM2PGSQL_BINARY or self.config.lib_dir.osm2pgsql,
osm2pgsql_cache=self.osm2pgsql_cache or default_cache,
osm2pgsql_style=self.config.get_import_style_file(),
osm2pgsql_style_path=self.config.config_dir,
threads=self.threads or default_threads,
dsn=self.config.get_libpq_dsn(),
flatnode_file=str(self.config.get_path('FLATNODE_FILE') or ''),
tablespaces=dict(slim_data=self.config.TABLESPACE_OSM_DATA,
slim_index=self.config.TABLESPACE_OSM_INDEX,
main_data=self.config.TABLESPACE_PLACE_DATA,
main_index=self.config.TABLESPACE_PLACE_INDEX
)
)
def get_osm_file_list(self) -> Optional[List[Path]]:
""" Return the --osm-file argument as a list of Paths or None
if no argument was given. The function also checks if the files
exist and raises a UsageError if one cannot be found.
"""
if not self.osm_file:
return None
files = [Path(f) for f in self.osm_file]
for fname in files:
if not fname.is_file():
LOG.fatal("OSM file '%s' does not exist.", fname)
raise UsageError('Cannot access file.')
return files
def get_geometry_output(self) -> napi.GeometryFormat:
""" Get the requested geometry output format in a API-compatible
format.
"""
if not self.polygon_output:
return napi.GeometryFormat.NONE
if self.polygon_output == 'geojson':
return napi.GeometryFormat.GEOJSON
if self.polygon_output == 'kml':
return napi.GeometryFormat.KML
if self.polygon_output == 'svg':
return napi.GeometryFormat.SVG
if self.polygon_output == 'text':
return napi.GeometryFormat.TEXT
try:
return napi.GeometryFormat[self.polygon_output.upper()]
except KeyError as exp:
raise UsageError(f"Unknown polygon output format '{self.polygon_output}'.") from exp
def get_locales(self, default: Optional[str]) -> napi.Locales:
""" Get the locales from the language parameter.
"""
if self.lang:
return napi.Locales.from_accept_languages(self.lang)
if default:
return napi.Locales.from_accept_languages(default)
return napi.Locales()
def get_layers(self, default: napi.DataLayer) -> Optional[napi.DataLayer]:
""" Get the list of selected layers as a DataLayer enum.
"""
if not self.layers:
return default
return reduce(napi.DataLayer.__or__,
(napi.DataLayer[s.upper()] for s in self.layers))
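
get_layers() above folds repeated --layer values into a single bitmask; a small sketch of that reduction, with the input list standing in for values collected by argparse.

# Sketch of the DataLayer flag folding performed by get_layers() above.
from functools import reduce

import nominatim_api as napi

selected = ['address', 'poi']   # as collected from repeated --layer options
layers = reduce(napi.DataLayer.__or__,
                (napi.DataLayer[s.upper()] for s in selected))
assert layers == napi.DataLayer.ADDRESS | napi.DataLayer.POI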

src/nominatim_db/clicmd/convert.py Normal file

@@ -0,0 +1,95 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Implementation of the 'convert' subcommand.
"""
from typing import Set, Any, Union, Optional, Sequence
import argparse
import asyncio
from pathlib import Path
from nominatim_core.errors import UsageError
from .args import NominatimArgs
# Do not repeat documentation of subcommand classes.
# pylint: disable=C0111
# Using non-top-level imports to avoid potentially unused imports.
# pylint: disable=E0012,C0415
class WithAction(argparse.Action):
""" Special action that saves a list of flags, given on the command-line
as `--with-foo` or `--without-foo`.
"""
def __init__(self, option_strings: Sequence[str], dest: Any,
default: bool = True, **kwargs: Any) -> None:
if 'nargs' in kwargs:
raise ValueError("nargs not allowed.")
if option_strings is None:
raise ValueError("Positional parameter not allowed.")
self.dest_set = kwargs.pop('dest_set')
full_option_strings = []
for opt in option_strings:
if not opt.startswith('--'):
raise ValueError("short-form options not allowed")
if default:
self.dest_set.add(opt[2:])
full_option_strings.append(f"--with-{opt[2:]}")
full_option_strings.append(f"--without-{opt[2:]}")
super().__init__(full_option_strings, argparse.SUPPRESS, nargs=0, **kwargs)
def __call__(self, parser: argparse.ArgumentParser, namespace: argparse.Namespace,
values: Union[str, Sequence[Any], None],
option_string: Optional[str] = None) -> None:
assert option_string
if option_string.startswith('--with-'):
self.dest_set.add(option_string[7:])
if option_string.startswith('--without-'):
self.dest_set.discard(option_string[10:])
class ConvertDB:
""" Convert an existing database into a different format. (EXPERIMENTAL)
Dump a read-only version of the database in a different format.
At the moment only a SQLite database suitable for reverse lookup
can be created.
"""
def __init__(self) -> None:
self.options: Set[str] = set()
def add_args(self, parser: argparse.ArgumentParser) -> None:
parser.add_argument('--format', default='sqlite',
choices=('sqlite', ),
help='Format of the output database (must be sqlite currently)')
parser.add_argument('--output', '-o', required=True, type=Path,
help='File to write the database to.')
        group = parser.add_argument_group('Switches to define database layout '
                                          '(currently no effect)')
group.add_argument('--reverse', action=WithAction, dest_set=self.options, default=True,
help='Enable/disable support for reverse and lookup API'
' (default: enabled)')
        group.add_argument('--search', action=WithAction, dest_set=self.options, default=False,
                           help='Enable/disable support for search API (default: disabled)')
group.add_argument('--details', action=WithAction, dest_set=self.options, default=True,
help='Enable/disable support for details API (default: enabled)')
def run(self, args: NominatimArgs) -> int:
if args.output.exists():
raise UsageError(f"File '{args.output}' already exists. Refusing to overwrite.")
if args.format == 'sqlite':
from ..tools import convert_sqlite
asyncio.run(convert_sqlite.convert(args.project_dir, args.output, self.options))
return 0
return 1
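
WithAction above turns each logical option into a --with-X/--without-X pair that adds or removes X from a shared set; a behaviour sketch reusing the WithAction class defined above.

# Behaviour sketch for WithAction: the default seeds the set when the
# argument is registered, --without-* removes the entry again.
import argparse

options: set = set()
parser = argparse.ArgumentParser()
parser.add_argument('--reverse', action=WithAction, dest_set=options,
                    default=True, help='toggle reverse support')

parser.parse_args([])                     # options == {'reverse'}
parser.parse_args(['--without-reverse'])  # options == set()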

src/nominatim_db/clicmd/export.py Normal file

@@ -0,0 +1,200 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Implementation of the 'export' subcommand.
"""
from typing import Optional, List, cast
import logging
import argparse
import asyncio
import csv
import sys
import sqlalchemy as sa
import nominatim_api as napi
from nominatim_api.results import create_from_placex_row, ReverseResult, add_result_details
from nominatim_api.types import LookupDetails
from nominatim_core.errors import UsageError
from .args import NominatimArgs
# Do not repeat documentation of subcommand classes.
# pylint: disable=C0111
# Using non-top-level imports to avoid potentially unused imports.
# pylint: disable=E0012,C0415
# Needed for SQLAlchemy
# pylint: disable=singleton-comparison
LOG = logging.getLogger()
RANK_RANGE_MAP = {
'country': (4, 4),
'state': (5, 9),
'county': (10, 12),
'city': (13, 16),
'suburb': (17, 21),
'street': (26, 26),
'path': (27, 27)
}
RANK_TO_OUTPUT_MAP = {
4: 'country',
5: 'state', 6: 'state', 7: 'state', 8: 'state', 9: 'state',
10: 'county', 11: 'county', 12: 'county',
13: 'city', 14: 'city', 15: 'city', 16: 'city',
17: 'suburb', 18: 'suburb', 19: 'suburb', 20: 'suburb', 21: 'suburb',
26: 'street', 27: 'path'}
class QueryExport:
"""\
    Export places from the database as a CSV file.
"""
def add_args(self, parser: argparse.ArgumentParser) -> None:
group = parser.add_argument_group('Output arguments')
group.add_argument('--output-type', default='street',
choices=('country', 'state', 'county',
'city', 'suburb', 'street', 'path'),
help='Type of places to output (default: street)')
group.add_argument('--output-format',
default='street;suburb;city;county;state;country',
help=("Semicolon-separated list of address types "
"(see --output-type). Additionally accepts:"
"placeid,postcode"))
group.add_argument('--language',
help=("Preferred language for output "
"(use local name, if omitted)"))
group = parser.add_argument_group('Filter arguments')
group.add_argument('--restrict-to-country', metavar='COUNTRY_CODE',
help='Export only objects within country')
group.add_argument('--restrict-to-osm-node', metavar='ID', type=int,
dest='node',
help='Export only children of this OSM node')
group.add_argument('--restrict-to-osm-way', metavar='ID', type=int,
dest='way',
help='Export only children of this OSM way')
group.add_argument('--restrict-to-osm-relation', metavar='ID', type=int,
dest='relation',
help='Export only children of this OSM relation')
def run(self, args: NominatimArgs) -> int:
return asyncio.run(export(args))
async def export(args: NominatimArgs) -> int:
""" The actual export as a asynchronous function.
"""
api = napi.NominatimAPIAsync(args.project_dir)
try:
output_range = RANK_RANGE_MAP[args.output_type]
writer = init_csv_writer(args.output_format)
async with api.begin() as conn, api.begin() as detail_conn:
t = conn.t.placex
sql = sa.select(t.c.place_id, t.c.parent_place_id,
t.c.osm_type, t.c.osm_id, t.c.name,
t.c.class_, t.c.type, t.c.admin_level,
t.c.address, t.c.extratags,
t.c.housenumber, t.c.postcode, t.c.country_code,
t.c.importance, t.c.wikipedia, t.c.indexed_date,
t.c.rank_address, t.c.rank_search,
t.c.centroid)\
.where(t.c.linked_place_id == None)\
.where(t.c.rank_address.between(*output_range))
parent_place_id = await get_parent_id(conn, args.node, args.way, args.relation)
if parent_place_id:
taddr = conn.t.addressline
sql = sql.join(taddr, taddr.c.place_id == t.c.place_id)\
.where(taddr.c.address_place_id == parent_place_id)\
.where(taddr.c.isaddress)
if args.restrict_to_country:
sql = sql.where(t.c.country_code == args.restrict_to_country.lower())
results = []
for row in await conn.execute(sql):
result = create_from_placex_row(row, ReverseResult)
if result is not None:
results.append(result)
if len(results) == 1000:
await dump_results(detail_conn, results, writer, args.language)
results = []
if results:
await dump_results(detail_conn, results, writer, args.language)
finally:
await api.close()
return 0
def init_csv_writer(output_format: str) -> 'csv.DictWriter[str]':
fields = output_format.split(';')
writer = csv.DictWriter(sys.stdout, fieldnames=fields, extrasaction='ignore')
writer.writeheader()
return writer
async def dump_results(conn: napi.SearchConnection,
results: List[ReverseResult],
writer: 'csv.DictWriter[str]',
lang: Optional[str]) -> None:
locale = napi.Locales([lang] if lang else None)
await add_result_details(conn, results,
LookupDetails(address_details=True, locales=locale))
for result in results:
data = {'placeid': result.place_id,
'postcode': result.postcode}
for line in (result.address_rows or []):
if line.isaddress and line.local_name:
if line.category[1] == 'postcode':
data['postcode'] = line.local_name
elif line.rank_address in RANK_TO_OUTPUT_MAP:
data[RANK_TO_OUTPUT_MAP[line.rank_address]] = line.local_name
writer.writerow(data)
async def get_parent_id(conn: napi.SearchConnection, node_id: Optional[int],
way_id: Optional[int],
relation_id: Optional[int]) -> Optional[int]:
""" Get the place ID for the given OSM object.
"""
if node_id is not None:
osm_type, osm_id = 'N', node_id
elif way_id is not None:
osm_type, osm_id = 'W', way_id
elif relation_id is not None:
osm_type, osm_id = 'R', relation_id
else:
return None
t = conn.t.placex
sql = sa.select(t.c.place_id).limit(1)\
.where(t.c.osm_type == osm_type)\
.where(t.c.osm_id == osm_id)\
.where(t.c.rank_address > 0)\
.order_by(t.c.rank_address)
for result in await conn.execute(sql):
return cast(int, result[0])
raise UsageError(f'Cannot find a place {osm_type}{osm_id}.')
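
The two rank maps above are complementary: RANK_RANGE_MAP turns --output-type into the rank window for the main query, and RANK_TO_OUTPUT_MAP folds each address row back into a CSV column. A quick consistency sketch using the maps defined above:

# Sketch showing the two rank maps above agree for the 'city' output type.
low, high = RANK_RANGE_MAP['city']     # (13, 16)
assert all(RANK_TO_OUTPUT_MAP[rank] == 'city'
           for rank in range(low, high + 1))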

src/nominatim_db/clicmd/freeze.py Normal file

@@ -0,0 +1,43 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Implementation of the 'freeze' subcommand.
"""
import argparse
from nominatim_core.db.connection import connect
from .args import NominatimArgs
# Do not repeat documentation of subcommand classes.
# pylint: disable=C0111
# Using non-top-level imports to avoid potentially unused imports.
# pylint: disable=E0012,C0415
class SetupFreeze:
"""\
Make database read-only.
    About half of the data in the Nominatim database is kept only to be able to
keep the data up-to-date with new changes made in OpenStreetMap. This
command drops all this data and only keeps the part needed for geocoding
itself.
This command has the same effect as the `--no-updates` option for imports.
"""
def add_args(self, parser: argparse.ArgumentParser) -> None:
pass # No options
def run(self, args: NominatimArgs) -> int:
from ..tools import freeze
with connect(args.config.get_libpq_dsn()) as conn:
freeze.drop_update_tables(conn)
freeze.drop_flatnode_file(args.config.get_path('FLATNODE_FILE'))
return 0

src/nominatim_db/clicmd/index.py Normal file

@@ -0,0 +1,66 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Implementation of the 'index' subcommand.
"""
import argparse
import psutil
from nominatim_core.db import status
from nominatim_core.db.connection import connect
from .args import NominatimArgs
# Do not repeat documentation of subcommand classes.
# pylint: disable=C0111
# Using non-top-level imports to avoid potentially unused imports.
# pylint: disable=E0012,C0415
class UpdateIndex:
"""\
Reindex all new and modified data.
Indexing is the process of computing the address and search terms for
the places in the database. Every time data is added or changed, indexing
needs to be run. Imports and replication updates automatically take care
    of indexing. For other cases, this command allows running indexing manually.
"""
def add_args(self, parser: argparse.ArgumentParser) -> None:
group = parser.add_argument_group('Filter arguments')
group.add_argument('--boundaries-only', action='store_true',
help="""Index only administrative boundaries.""")
group.add_argument('--no-boundaries', action='store_true',
help="""Index everything except administrative boundaries.""")
group.add_argument('--minrank', '-r', type=int, metavar='RANK', default=0,
help='Minimum/starting rank')
group.add_argument('--maxrank', '-R', type=int, metavar='RANK', default=30,
help='Maximum/finishing rank')
def run(self, args: NominatimArgs) -> int:
from ..indexer.indexer import Indexer
from ..tokenizer import factory as tokenizer_factory
tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)
indexer = Indexer(args.config.get_libpq_dsn(), tokenizer,
args.threads or psutil.cpu_count() or 1)
if not args.no_boundaries:
indexer.index_boundaries(args.minrank, args.maxrank)
if not args.boundaries_only:
indexer.index_by_rank(args.minrank, args.maxrank)
indexer.index_postcodes()
if not args.no_boundaries and not args.boundaries_only \
and args.minrank == 0 and args.maxrank == 30:
with connect(args.config.get_libpq_dsn()) as conn:
status.set_indexed(conn, True)
return 0
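
Note that status.set_indexed() is only called for a full default run (ranks 0 to 30, no boundary filter); a filtered run such as the hypothetical sketch below leaves the fully-indexed flag untouched. Library paths are placeholders.

# Hypothetical filtered indexing run via the CLI entry point from cli.py.
from nominatim_db.cli import nominatim

nominatim(cli_args=['index', '--boundaries-only', '--minrank', '4', '--maxrank', '25'],
          module_dir='/path/to/module', osm2pgsql_path='/path/to/osm2pgsql')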

src/nominatim_db/clicmd/refresh.py Normal file

@@ -0,0 +1,187 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Implementation of 'refresh' subcommand.
"""
from typing import Tuple, Optional
import argparse
import logging
from pathlib import Path
from nominatim_core.config import Configuration
from nominatim_core.db.connection import connect
from ..tokenizer.base import AbstractTokenizer
from .args import NominatimArgs
# Do not repeat documentation of subcommand classes.
# pylint: disable=C0111
# Using non-top-level imports to avoid potentially unused imports.
# pylint: disable=E0012,C0415
LOG = logging.getLogger()
def _parse_osm_object(obj: str) -> Tuple[str, int]:
""" Parse the given argument into a tuple of OSM type and ID.
Raises an ArgumentError if the format is not recognized.
"""
if len(obj) < 2 or obj[0].lower() not in 'nrw' or not obj[1:].isdigit():
raise argparse.ArgumentTypeError("Cannot parse OSM ID. Expect format: [N|W|R]<id>.")
return (obj[0].upper(), int(obj[1:]))
class UpdateRefresh:
"""\
Recompute auxiliary data used by the indexing process.
    This sub-command updates various static data and functions in the database.
    It usually needs to be run after changing various aspects of the
    configuration. The configuration documentation will mention the exact
    command to use in such cases.
    Warning: the 'refresh' command must not be run in parallel with other update
    commands like 'replication' or 'add-data'.
"""
def __init__(self) -> None:
self.tokenizer: Optional[AbstractTokenizer] = None
def add_args(self, parser: argparse.ArgumentParser) -> None:
group = parser.add_argument_group('Data arguments')
group.add_argument('--postcodes', action='store_true',
help='Update postcode centroid table')
group.add_argument('--word-tokens', action='store_true',
help='Clean up search terms')
group.add_argument('--word-counts', action='store_true',
help='Compute frequency of full-word search terms')
group.add_argument('--address-levels', action='store_true',
help='Reimport address level configuration')
group.add_argument('--functions', action='store_true',
help='Update the PL/pgSQL functions in the database')
group.add_argument('--wiki-data', action='store_true',
help='Update Wikipedia/data importance numbers')
group.add_argument('--secondary-importance', action='store_true',
help='Update secondary importance raster data')
group.add_argument('--importance', action='store_true',
help='Recompute place importances (expensive!)')
group.add_argument('--website', action='store_true',
help='Refresh the directory that serves the scripts for the web API')
group.add_argument('--data-object', action='append',
type=_parse_osm_object, metavar='OBJECT',
help='Mark the given OSM object as requiring an update'
' (format: [NWR]<id>)')
group.add_argument('--data-area', action='append',
type=_parse_osm_object, metavar='OBJECT',
help='Mark the area around the given OSM object as requiring an update'
' (format: [NWR]<id>)')
group = parser.add_argument_group('Arguments for function refresh')
group.add_argument('--no-diff-updates', action='store_false', dest='diffs',
help='Do not enable code for propagating updates')
group.add_argument('--enable-debug-statements', action='store_true',
help='Enable debug warning statements in functions')
def run(self, args: NominatimArgs) -> int: #pylint: disable=too-many-branches, too-many-statements
from ..tools import refresh, postcodes
from ..indexer.indexer import Indexer
need_function_refresh = args.functions
if args.postcodes:
if postcodes.can_compute(args.config.get_libpq_dsn()):
LOG.warning("Update postcodes centroid")
tokenizer = self._get_tokenizer(args.config)
postcodes.update_postcodes(args.config.get_libpq_dsn(),
args.project_dir, tokenizer)
indexer = Indexer(args.config.get_libpq_dsn(), tokenizer,
args.threads or 1)
indexer.index_postcodes()
else:
LOG.error("The place table doesn't exist. "
"Postcode updates on a frozen database is not possible.")
if args.word_tokens:
LOG.warning('Updating word tokens')
tokenizer = self._get_tokenizer(args.config)
tokenizer.update_word_tokens()
if args.word_counts:
LOG.warning('Recompute word statistics')
self._get_tokenizer(args.config).update_statistics(args.config,
threads=args.threads or 1)
if args.address_levels:
LOG.warning('Updating address levels')
with connect(args.config.get_libpq_dsn()) as conn:
refresh.load_address_levels_from_config(conn, args.config)
# Attention: must come BEFORE functions
if args.secondary_importance:
with connect(args.config.get_libpq_dsn()) as conn:
# If the table did not exist before, then the importance code
# needs to be enabled.
if not conn.table_exists('secondary_importance'):
args.functions = True
LOG.warning('Import secondary importance raster data from %s', args.project_dir)
if refresh.import_secondary_importance(args.config.get_libpq_dsn(),
args.project_dir) > 0:
LOG.fatal('FATAL: Cannot update secondary importance raster data')
return 1
need_function_refresh = True
if args.wiki_data:
data_path = Path(args.config.WIKIPEDIA_DATA_PATH
or args.project_dir)
LOG.warning('Import wikipedia article importance from %s', data_path)
if refresh.import_wikipedia_articles(args.config.get_libpq_dsn(),
data_path) > 0:
LOG.fatal('FATAL: Wikipedia importance file not found in %s', data_path)
return 1
need_function_refresh = True
if need_function_refresh:
LOG.warning('Create functions')
with connect(args.config.get_libpq_dsn()) as conn:
refresh.create_functions(conn, args.config,
args.diffs, args.enable_debug_statements)
self._get_tokenizer(args.config).update_sql_functions(args.config)
# Attention: importance MUST come after wiki data import and after functions.
if args.importance:
LOG.warning('Update importance values for database')
with connect(args.config.get_libpq_dsn()) as conn:
refresh.recompute_importance(conn)
if args.website:
webdir = args.project_dir / 'website'
LOG.warning('Setting up website directory at %s', webdir)
# This is a little bit hacky: call the tokenizer setup, so that
# the tokenizer directory gets repopulated as well, in case it
# wasn't there yet.
self._get_tokenizer(args.config)
with connect(args.config.get_libpq_dsn()) as conn:
refresh.setup_website(webdir, args.config, conn)
if args.data_object or args.data_area:
with connect(args.config.get_libpq_dsn()) as conn:
for obj in args.data_object or []:
refresh.invalidate_osm_object(*obj, conn, recursive=False)
for obj in args.data_area or []:
refresh.invalidate_osm_object(*obj, conn, recursive=True)
conn.commit()
return 0
def _get_tokenizer(self, config: Configuration) -> AbstractTokenizer:
if self.tokenizer is None:
from ..tokenizer import factory as tokenizer_factory
self.tokenizer = tokenizer_factory.get_tokenizer_for_db(config)
return self.tokenizer
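
A behaviour sketch of _parse_osm_object() above, which backs the --data-object/--data-area argparse type hooks:

# _parse_osm_object() accepts a case-insensitive N/R/W prefix plus an ID.
import argparse

assert _parse_osm_object('N123') == ('N', 123)
assert _parse_osm_object('w42') == ('W', 42)

try:
    _parse_osm_object('X99')          # unknown type prefix
except argparse.ArgumentTypeError:
    pass                              # rejected with a parse error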

src/nominatim_db/clicmd/replication.py Normal file

@@ -0,0 +1,200 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Implementation of the 'replication' sub-command.
"""
from typing import Optional
import argparse
import datetime as dt
import logging
import socket
import time
from nominatim_core.db import status
from nominatim_core.db.connection import connect
from nominatim_core.errors import UsageError
from .args import NominatimArgs
LOG = logging.getLogger()
# Do not repeat documentation of subcommand classes.
# pylint: disable=C0111
# Using non-top-level imports to make pyosmium optional for replication only.
# pylint: disable=C0415
class UpdateReplication:
"""\
Update the database using an online replication service.
    An OSM replication service is an online service that provides regular
    updates (OSM diff files) for the planet or an extract of it. The OSMF
    provides the primary replication service for the full planet at
    https://planet.osm.org/replication/ but there are other providers of
    extracts of OSM data who provide such a service as well.
    This sub-command allows you to set up such a replication service and
    download and import updates at regular intervals. You need to call '--init' once to
set up the process or whenever you change the replication configuration
parameters. Without any arguments, the sub-command will go into a loop and
continuously apply updates as they become available. Giving `--once` just
downloads and imports the next batch of updates.
"""
def add_args(self, parser: argparse.ArgumentParser) -> None:
group = parser.add_argument_group('Arguments for initialisation')
group.add_argument('--init', action='store_true',
help='Initialise the update process')
group.add_argument('--no-update-functions', dest='update_functions',
action='store_false',
help="Do not update the trigger function to "
"support differential updates (EXPERT)")
group = parser.add_argument_group('Arguments for updates')
group.add_argument('--check-for-updates', action='store_true',
help='Check if new updates are available and exit')
group.add_argument('--once', action='store_true',
help="Download and apply updates only once. When "
"not set, updates are continuously applied")
group.add_argument('--catch-up', action='store_true',
help="Download and apply updates until no new "
"data is available on the server")
group.add_argument('--no-index', action='store_false', dest='do_index',
help=("Do not index the new data. Only usable "
"together with --once"))
group.add_argument('--osm2pgsql-cache', metavar='SIZE', type=int,
help='Size of cache to be used by osm2pgsql (in MB)')
group = parser.add_argument_group('Download parameters')
group.add_argument('--socket-timeout', dest='socket_timeout', type=int, default=60,
help='Set timeout for file downloads')
def _init_replication(self, args: NominatimArgs) -> int:
from ..tools import replication, refresh
LOG.warning("Initialising replication updates")
with connect(args.config.get_libpq_dsn()) as conn:
replication.init_replication(conn, base_url=args.config.REPLICATION_URL,
socket_timeout=args.socket_timeout)
if args.update_functions:
LOG.warning("Create functions")
refresh.create_functions(conn, args.config, True, False)
return 0
def _check_for_updates(self, args: NominatimArgs) -> int:
from ..tools import replication
with connect(args.config.get_libpq_dsn()) as conn:
return replication.check_for_updates(conn, base_url=args.config.REPLICATION_URL,
socket_timeout=args.socket_timeout)
def _report_update(self, batchdate: dt.datetime,
start_import: dt.datetime,
start_index: Optional[dt.datetime]) -> None:
def round_time(delta: dt.timedelta) -> dt.timedelta:
return dt.timedelta(seconds=int(delta.total_seconds()))
end = dt.datetime.now(dt.timezone.utc)
LOG.warning("Update completed. Import: %s. %sTotal: %s. Remaining backlog: %s.",
round_time((start_index or end) - start_import),
f"Indexing: {round_time(end - start_index)} " if start_index else '',
round_time(end - start_import),
round_time(end - batchdate))
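# Illustrative output (values invented): "Update completed. Import: 0:00:12.
# Indexing: 0:01:03 Total: 0:01:15. Remaining backlog: 0:35:40."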
def _compute_update_interval(self, args: NominatimArgs) -> int:
if args.catch_up:
return 0
update_interval = args.config.get_int('REPLICATION_UPDATE_INTERVAL')
# Sanity check to not overwhelm the Geofabrik servers.
if 'download.geofabrik.de' in args.config.REPLICATION_URL\
and update_interval < 86400:
LOG.fatal("Update interval too low for download.geofabrik.de.\n"
"Please check install documentation "
"(https://nominatim.org/release-docs/latest/admin/Import-and-Update#"
"setting-up-the-update-process).")
raise UsageError("Invalid replication update interval setting.")
return update_interval
def _update(self, args: NominatimArgs) -> None:
# pylint: disable=too-many-locals
from ..tools import replication
from ..indexer.indexer import Indexer
from ..tokenizer import factory as tokenizer_factory
update_interval = self._compute_update_interval(args)
params = args.osm2pgsql_options(default_cache=2000, default_threads=1)
params.update(base_url=args.config.REPLICATION_URL,
update_interval=update_interval,
import_file=args.project_dir / 'osmosischange.osc',
max_diff_size=args.config.get_int('REPLICATION_MAX_DIFF'),
indexed_only=not args.once)
if not args.once:
if not args.do_index:
LOG.fatal("Indexing cannot be disabled when running updates continuously.")
raise UsageError("Bad argument '--no-index'.")
recheck_interval = args.config.get_int('REPLICATION_RECHECK_INTERVAL')
tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)
indexer = Indexer(args.config.get_libpq_dsn(), tokenizer, args.threads or 1)
dsn = args.config.get_libpq_dsn()
while True:
start = dt.datetime.now(dt.timezone.utc)
state = replication.update(dsn, params, socket_timeout=args.socket_timeout)
with connect(dsn) as conn:
if state is not replication.UpdateState.NO_CHANGES:
status.log_status(conn, start, 'import')
batchdate, _, _ = status.get_status(conn)
conn.commit()
if state is not replication.UpdateState.NO_CHANGES and args.do_index:
index_start = dt.datetime.now(dt.timezone.utc)
indexer.index_full(analyse=False)
with connect(dsn) as conn:
status.set_indexed(conn, True)
status.log_status(conn, index_start, 'index')
conn.commit()
else:
index_start = None
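# Index any remaining backlog when a catch-up run has drained all diffs or
# when updates are infrequent (more than 40 minutes apart), so indexing
# always happens right away.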
if (state is replication.UpdateState.NO_CHANGES and args.catch_up) \
or update_interval > 40*60:
while indexer.has_pending():
indexer.index_full(analyse=False)
if LOG.isEnabledFor(logging.WARNING):
assert batchdate is not None
self._report_update(batchdate, start, index_start)
if args.once or (args.catch_up and state is replication.UpdateState.NO_CHANGES):
break
if state is replication.UpdateState.NO_CHANGES:
LOG.warning("No new changes. Sleeping for %d sec.", recheck_interval)
time.sleep(recheck_interval)
def run(self, args: NominatimArgs) -> int:
socket.setdefaulttimeout(args.socket_timeout)
if args.init:
return self._init_replication(args)
if args.check_for_updates:
return self._check_for_updates(args)
self._update(args)
return 0

View File

@@ -0,0 +1,229 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Implementation of the 'import' subcommand.
"""
from typing import Optional
import argparse
import logging
from pathlib import Path
import psutil
from nominatim_core.errors import UsageError
from nominatim_core.config import Configuration
from nominatim_core.db.connection import connect
from nominatim_core.db import status, properties
from ..tokenizer.base import AbstractTokenizer
from ..version import NOMINATIM_VERSION
from .args import NominatimArgs
# Do not repeat documentation of subcommand classes.
# pylint: disable=C0111
# Using non-top-level imports to avoid loading modules that may go unused.
# pylint: disable=C0415
LOG = logging.getLogger()
class SetupAll:
"""\
Create a new Nominatim database from an OSM file.
This sub-command sets up a new Nominatim database from scratch starting
with creating a new database in PostgreSQL. The user running this command
needs superuser rights on the database.
"""
def add_args(self, parser: argparse.ArgumentParser) -> None:
group1 = parser.add_argument_group('Required arguments')
group1.add_argument('--osm-file', metavar='FILE', action='append',
help='OSM file to be imported'
' (repeat for importing multiple files)',
default=None)
group1.add_argument('--continue', dest='continue_at',
choices=['import-from-file', 'load-data', 'indexing', 'db-postprocess'],
help='Continue an import that was interrupted',
default=None)
group2 = parser.add_argument_group('Optional arguments')
group2.add_argument('--osm2pgsql-cache', metavar='SIZE', type=int,
help='Size of cache to be used by osm2pgsql (in MB)')
group2.add_argument('--reverse-only', action='store_true',
help='Do not create tables and indexes for searching')
group2.add_argument('--no-partitions', action='store_true',
help=("Do not partition search indices "
"(speeds up import of single country extracts)"))
group2.add_argument('--no-updates', action='store_true',
help="Do not keep tables that are only needed for "
"updating the database later")
group2.add_argument('--offline', action='store_true',
help="Do not attempt to load any additional data from the internet")
group3 = parser.add_argument_group('Expert options')
group3.add_argument('--ignore-errors', action='store_true',
help='Continue import even when errors in SQL are present')
group3.add_argument('--index-noanalyse', action='store_true',
help='Do not perform analyse operations during index (expert only)')
group3.add_argument('--prepare-database', action='store_true',
help='Create the database but do not import any data')
def run(self, args: NominatimArgs) -> int: # pylint: disable=too-many-statements, too-many-branches
from ..data import country_info
from ..tools import database_import, refresh, postcodes, freeze
from ..indexer.indexer import Indexer
num_threads = args.threads or psutil.cpu_count() or 1
country_info.setup_country_config(args.config)
if args.osm_file is None and args.continue_at is None and not args.prepare_database:
raise UsageError("No input files (use --osm-file).")
if args.osm_file is not None and args.continue_at not in ('import-from-file', None):
raise UsageError(f"Cannot use --continue {args.continue_at} and --osm-file together.")
if args.continue_at is not None and args.prepare_database:
raise UsageError(
"Cannot use --continue and --prepare-database together."
)
if args.prepare_database or args.continue_at is None:
LOG.warning('Creating database')
database_import.setup_database_skeleton(args.config.get_libpq_dsn(),
rouser=args.config.DATABASE_WEBUSER)
if args.prepare_database:
return 0
if args.continue_at in (None, 'import-from-file'):
files = args.get_osm_file_list()
if not files:
raise UsageError("No input files (use --osm-file).")
if args.continue_at in ('import-from-file', None):
# Check if the correct plugins are installed
database_import.check_existing_database_plugins(args.config.get_libpq_dsn())
LOG.warning('Setting up country tables')
country_info.setup_country_tables(args.config.get_libpq_dsn(),
args.config.lib_dir.data,
args.no_partitions)
LOG.warning('Importing OSM data file')
database_import.import_osm_data(files,
args.osm2pgsql_options(0, 1),
drop=args.no_updates,
ignore_errors=args.ignore_errors)
LOG.warning('Importing wikipedia importance data')
data_path = Path(args.config.WIKIPEDIA_DATA_PATH or args.project_dir)
if refresh.import_wikipedia_articles(args.config.get_libpq_dsn(),
data_path) > 0:
LOG.error('Wikipedia importance dump file not found. '
'Calculating importance values of locations will not '
'use Wikipedia importance data.')
LOG.warning('Importing secondary importance raster data')
if refresh.import_secondary_importance(args.config.get_libpq_dsn(),
args.project_dir) != 0:
LOG.error('Secondary importance file not imported. '
'Falling back to default ranking.')
self._setup_tables(args.config, args.reverse_only)
if args.continue_at in ('import-from-file', 'load-data', None):
LOG.warning('Initialise tables')
with connect(args.config.get_libpq_dsn()) as conn:
database_import.truncate_data_tables(conn)
LOG.warning('Load data into placex table')
database_import.load_data(args.config.get_libpq_dsn(), num_threads)
LOG.warning("Setting up tokenizer")
tokenizer = self._get_tokenizer(args.continue_at, args.config)
if args.continue_at in ('import-from-file', 'load-data', None):
LOG.warning('Calculate postcodes')
postcodes.update_postcodes(args.config.get_libpq_dsn(),
args.project_dir, tokenizer)
if args.continue_at in \
('import-from-file', 'load-data', 'indexing', None):
LOG.warning('Indexing places')
indexer = Indexer(args.config.get_libpq_dsn(), tokenizer, num_threads)
indexer.index_full(analyse=not args.index_noanalyse)
LOG.warning('Post-process tables')
with connect(args.config.get_libpq_dsn()) as conn:
database_import.create_search_indices(conn, args.config,
drop=args.no_updates,
threads=num_threads)
LOG.warning('Create search index for default country names.')
country_info.create_country_names(conn, tokenizer,
args.config.get_str_list('LANGUAGES'))
if args.no_updates:
freeze.drop_update_tables(conn)
tokenizer.finalize_import(args.config)
LOG.warning('Recompute word counts')
tokenizer.update_statistics(args.config, threads=num_threads)
webdir = args.project_dir / 'website'
LOG.warning('Setup website at %s', webdir)
with connect(args.config.get_libpq_dsn()) as conn:
refresh.setup_website(webdir, args.config, conn)
self._finalize_database(args.config.get_libpq_dsn(), args.offline)
return 0
def _setup_tables(self, config: Configuration, reverse_only: bool) -> None:
""" Set up the basic database layout: tables, indexes and functions.
"""
from ..tools import database_import, refresh
with connect(config.get_libpq_dsn()) as conn:
LOG.warning('Create functions (1st pass)')
refresh.create_functions(conn, config, False, False)
LOG.warning('Create tables')
database_import.create_tables(conn, config, reverse_only=reverse_only)
refresh.load_address_levels_from_config(conn, config)
LOG.warning('Create functions (2nd pass)')
refresh.create_functions(conn, config, False, False)
LOG.warning('Create table triggers')
database_import.create_table_triggers(conn, config)
LOG.warning('Create partition tables')
database_import.create_partition_tables(conn, config)
LOG.warning('Create functions (3rd pass)')
refresh.create_functions(conn, config, False, False)
def _get_tokenizer(self, continue_at: Optional[str],
config: Configuration) -> AbstractTokenizer:
""" Set up a new tokenizer or load an already initialised one.
"""
from ..tokenizer import factory as tokenizer_factory
if continue_at in ('import-from-file', 'load-data', None):
# (re)initialise the tokenizer data
return tokenizer_factory.create_tokenizer(config)
# just load the tokenizer
return tokenizer_factory.get_tokenizer_for_db(config)
def _finalize_database(self, dsn: str, offline: bool) -> None:
""" Determine the database date and set the status accordingly.
"""
with connect(dsn) as conn:
properties.set_property(conn, 'database_version', str(NOMINATIM_VERSION))
try:
dbdate = status.compute_database_date(conn, offline)
status.set_status(conn, dbdate)
LOG.info('Database is at %s.', dbdate)
except Exception as exc: # pylint: disable=broad-except
LOG.error('Cannot determine date of database: %s', exc)
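As a hedged aid to tracing `run()` above (not part of the module), the mapping below summarizes which phases still execute for each `--continue` entry point, derived from the gating tuples in the code:
```
# Hedged summary of the resume logic in SetupAll.run(); 'create-db' also
# runs for --prepare-database. Illustration only, names are invented.
RESUME_PHASES = {
    None:               ['create-db', 'import-osm', 'load-data', 'indexing', 'post-process'],
    'import-from-file': ['import-osm', 'load-data', 'indexing', 'post-process'],
    'load-data':        ['load-data', 'indexing', 'post-process'],
    'indexing':         ['indexing', 'post-process'],
    'db-postprocess':   ['post-process'],
}
```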

View File

@@ -0,0 +1,93 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Implementation of the 'special-phrases' command.
"""
import argparse
import logging
from pathlib import Path
from nominatim_core.errors import UsageError
from nominatim_core.db.connection import connect
from ..tools.special_phrases.sp_importer import SPImporter, SpecialPhraseLoader
from ..tools.special_phrases.sp_wiki_loader import SPWikiLoader
from ..tools.special_phrases.sp_csv_loader import SPCsvLoader
from .args import NominatimArgs
LOG = logging.getLogger()
# Do not repeat documentation of subcommand classes.
# pylint: disable=C0111
# Using non-top-level imports to avoid loading modules that may go unused.
# pylint: disable=E0012,C0415
class ImportSpecialPhrases:
"""\
Import special phrases.
Special phrases are search terms that narrow down the type of object
that should be searched. For example, you might want to search for
'Hotels in Barcelona'. The OSM wiki has a selection of special phrases
in many languages, which can be imported with this command.
You can also provide your own phrases in a CSV file. The file needs to have
the following five columns:
* phrase - the term expected for searching
* class - the OSM tag key of the object type
* type - the OSM tag value of the object type
* operator - the kind of search to be done (one of: in, near, name, -)
* plural - whether the term is a plural or not (Y/N)
An example file can be found in the Nominatim sources at
'test/testdb/full_en_phrases_test.csv'.
The import can be further configured to ignore specific key/value pairs.
This is particularly useful when importing phrases from the wiki. The
default configuration excludes some very common tags like building=yes.
The configuration can be customized by putting a file `phrase-settings.json`
with custom rules into the project directory or by using the `--config`
option to point to another configuration file.
"""
def add_args(self, parser: argparse.ArgumentParser) -> None:
group = parser.add_argument_group('Input arguments')
group.add_argument('--import-from-wiki', action='store_true',
help='Import special phrases from the OSM wiki to the database')
group.add_argument('--import-from-csv', metavar='FILE',
help='Import special phrases from a CSV file')
group.add_argument('--no-replace', action='store_true',
help='Keep the old phrases and only add the new ones')
def run(self, args: NominatimArgs) -> int:
if args.import_from_wiki:
self.start_import(args, SPWikiLoader(args.config))
if args.import_from_csv:
if not Path(args.import_from_csv).is_file():
LOG.fatal("CSV file '%s' does not exist.", args.import_from_csv)
raise UsageError('Cannot access file.')
self.start_import(args, SPCsvLoader(args.import_from_csv))
return 0
def start_import(self, args: NominatimArgs, loader: SpecialPhraseLoader) -> None:
"""
Create the SPImporter object containing the right
sp loader and then start the import of special phrases.
"""
from ..tokenizer import factory as tokenizer_factory
tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)
should_replace = not args.no_replace
with connect(args.config.get_libpq_dsn()) as db_connection:
SPImporter(
args.config, db_connection, loader
).import_phrases(tokenizer, should_replace)
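As a hedged illustration of the CSV format described in the docstring above (file name and phrases invented; a header row is assumed):
```
import csv

rows = [
    ('Hotel', 'tourism', 'hotel', 'in', 'N'),
    ('Hotels', 'tourism', 'hotel', 'in', 'Y'),
    ('Restaurant near', 'amenity', 'restaurant', 'near', 'N'),
]
with open('my_phrases.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(('phrase', 'class', 'type', 'operator', 'plural'))
    writer.writerows(rows)
```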

View File

@@ -0,0 +1,175 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Functions for importing and managing static country information.
"""
from typing import Dict, Any, Iterable, Tuple, Optional, Container, overload
from pathlib import Path
import psycopg2.extras
from nominatim_core.db import utils as db_utils
from nominatim_core.db.connection import connect, Connection
from nominatim_core.errors import UsageError
from nominatim_core.config import Configuration
from ..tokenizer.base import AbstractTokenizer
def _flatten_name_list(names: Any) -> Dict[str, str]:
if names is None:
return {}
if not isinstance(names, dict):
raise UsageError("Expected key-value list for names in country_settings.py")
flat = {}
for prefix, remain in names.items():
if isinstance(remain, str):
flat[prefix] = remain
elif not isinstance(remain, dict):
raise UsageError("Entries in names must be key-value lists.")
else:
for suffix, name in remain.items():
if suffix == 'default':
flat[prefix] = name
else:
flat[f'{prefix}:{suffix}'] = name
return flat
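# Illustration (made-up input): {'name': {'default': 'España', 'en': 'Spain'},
# 'int_name': 'Spain'} flattens to
# {'name': 'España', 'name:en': 'Spain', 'int_name': 'Spain'}.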
class _CountryInfo:
""" Caches country-specific properties from the configuration file.
"""
def __init__(self) -> None:
self._info: Dict[str, Dict[str, Any]] = {}
def load(self, config: Configuration) -> None:
""" Load the country properties from the configuration files,
if they are not loaded yet.
"""
if not self._info:
self._info = config.load_sub_configuration('country_settings.yaml')
for prop in self._info.values():
# Convert languages into a list for simpler handling.
if 'languages' not in prop:
prop['languages'] = []
elif not isinstance(prop['languages'], list):
prop['languages'] = [x.strip()
for x in prop['languages'].split(',')]
prop['names'] = _flatten_name_list(prop.get('names'))
def items(self) -> Iterable[Tuple[str, Dict[str, Any]]]:
""" Return tuples of (country_code, property dict) as iterable.
"""
return self._info.items()
def get(self, country_code: str) -> Dict[str, Any]:
""" Get country information for the country with the given country code.
"""
return self._info.get(country_code, {})
_COUNTRY_INFO = _CountryInfo()
def setup_country_config(config: Configuration) -> None:
""" Load country properties from the configuration file.
Needs to be called before using any other functions in this
file.
"""
_COUNTRY_INFO.load(config)
@overload
def iterate() -> Iterable[Tuple[str, Dict[str, Any]]]:
...
@overload
def iterate(prop: str) -> Iterable[Tuple[str, Any]]:
...
def iterate(prop: Optional[str] = None) -> Iterable[Tuple[str, Dict[str, Any]]]:
""" Iterate over country code and properties.
When `prop` is None, all countries are returned with their complete
set of properties.
If `prop` is given, then only countries are returned where the
given property is set. The second item of the tuple contains only
the content of the given property.
"""
if prop is None:
return _COUNTRY_INFO.items()
return ((c, p[prop]) for c, p in _COUNTRY_INFO.items() if prop in p)
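# Example: iterate('postcode') yields only countries that define a 'postcode'
# property, paired with just that property's value (see PostcodeFormatter).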
def setup_country_tables(dsn: str, sql_dir: Path, ignore_partitions: bool = False) -> None:
""" Create and populate the tables with basic static data that provides
the background for geocoding. Data is assumed to not yet exist.
"""
db_utils.execute_file(dsn, sql_dir / 'country_osm_grid.sql.gz')
params = []
for ccode, props in _COUNTRY_INFO.items():
if ccode is not None and props is not None:
if ignore_partitions:
partition = 0
else:
partition = props.get('partition', 0)
lang = props['languages'][0] if len(
props['languages']) == 1 else None
params.append((ccode, props['names'], lang, partition))
with connect(dsn) as conn:
with conn.cursor() as cur:
psycopg2.extras.register_hstore(cur)
cur.execute(
""" CREATE TABLE public.country_name (
country_code character varying(2),
name public.hstore,
derived_name public.hstore,
country_default_language_code text,
partition integer
); """)
cur.execute_values(
""" INSERT INTO public.country_name
(country_code, name, country_default_language_code, partition) VALUES %s
""", params)
conn.commit()
def create_country_names(conn: Connection, tokenizer: AbstractTokenizer,
languages: Optional[Container[str]] = None) -> None:
""" Add default country names to search index. `languages` is a comma-
separated list of language codes as used in OSM. If `languages` is not
empty then only name translations for the given languages are added
to the index.
"""
def _include_key(key: str) -> bool:
return ':' not in key or not languages or \
key[key.index(':') + 1:] in languages
with conn.cursor() as cur:
psycopg2.extras.register_hstore(cur)
cur.execute("""SELECT country_code, name FROM country_name
WHERE country_code is not null""")
with tokenizer.name_analyzer() as analyzer:
for code, name in cur:
names = {'countrycode': code}
# country names (only in languages as provided)
if name:
names.update({k : v for k, v in name.items() if _include_key(k)})
analyzer.add_country_names(code, names)
conn.commit()

View File

@@ -0,0 +1,86 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Wrapper around place information the indexer gets from the database and hands to
the tokenizer.
"""
from typing import Optional, Mapping, Any, Tuple
class PlaceInfo:
""" This data class contains all information the tokenizer can access
about a place.
"""
def __init__(self, info: Mapping[str, Any]) -> None:
self._info = info
@property
def name(self) -> Optional[Mapping[str, str]]:
""" A dictionary with the names of the place. Keys and values represent
the full key and value of the corresponding OSM tag. Which tags
are saved as names is determined by the import style.
The property may be None if the place has no names.
"""
return self._info.get('name')
@property
def address(self) -> Optional[Mapping[str, str]]:
""" A dictionary with the address elements of the place. They key
usually corresponds to the suffix part of the key of an OSM
'addr:*' or 'isin:*' tag. There are also some special keys like
`country` or `country_code` which merge OSM keys that contain
the same information. See [Import Styles][1] for details.
The property may be None if the place has no address information.
[1]: ../customize/Import-Styles.md
"""
return self._info.get('address')
@property
def country_code(self) -> Optional[str]:
""" The country code of the country the place is in. Guaranteed
to be a two-letter lower-case string. If the place is not inside
any country, the property is set to None.
"""
return self._info.get('country_code')
@property
def rank_address(self) -> int:
""" The [rank address][1] before any rank correction is applied.
[1]: ../customize/Ranking.md#address-rank
"""
return self._info.get('rank_address', 0)
@property
def centroid(self) -> Optional[Tuple[float, float]]:
""" A center point of the place in WGS84. May be None when the
geometry of the place is unknown.
"""
x, y = self._info.get('centroid_x'), self._info.get('centroid_y')
return None if x is None or y is None else (x, y)
def is_a(self, key: str, value: str) -> bool:
""" Set to True when the place's primary tag corresponds to the given
key and value.
"""
return self._info.get('class') == key and self._info.get('type') == value
def is_country(self) -> bool:
""" Set to True when the place is a valid country boundary.
"""
return self.rank_address == 4 \
and self.is_a('boundary', 'administrative') \
and self.country_code is not None
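A hedged usage sketch (field names from the properties above, values invented), assuming the class is importable:
```
info = PlaceInfo({'name': {'name': 'Berlin'},
                  'class': 'boundary', 'type': 'administrative',
                  'rank_address': 4, 'country_code': 'de',
                  'centroid_x': 13.4, 'centroid_y': 52.5})
assert info.is_a('boundary', 'administrative')
assert info.is_country()
assert info.centroid == (13.4, 52.5)
```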

View File

@@ -0,0 +1,78 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Data class for a single name of a place.
"""
from typing import Optional, Dict, Mapping
class PlaceName:
""" Each name and address part of a place is encapsulated in an object of
this class. It saves not only the name proper but also describes the
kind of name with two properties:
* `kind` describes the name of the OSM key used without any suffixes
(i.e. the part after the colon removed)
* `suffix` contains the suffix of the OSM tag, if any. The suffix
is the part of the key after the first colon.
In addition to that, a name may have arbitrary additional attributes.
How attributes are used depends on the sanitizers and token analysers.
The exception is the 'analyzer' attribute. This attribute determines
which token analysis module will be used to finalize the treatment of
names.
"""
def __init__(self, name: str, kind: str, suffix: Optional[str]):
self.name = name
self.kind = kind
self.suffix = suffix
self.attr: Dict[str, str] = {}
def __repr__(self) -> str:
return f"PlaceName(name={self.name!r},kind={self.kind!r},suffix={self.suffix!r})"
def clone(self, name: Optional[str] = None,
kind: Optional[str] = None,
suffix: Optional[str] = None,
attr: Optional[Mapping[str, str]] = None) -> 'PlaceName':
""" Create a deep copy of the place name, optionally with the
given parameters replaced. In the attribute list only the given
keys are updated. The list is not replaced completely.
In particular, the function cannot be used to remove an
attribute from a place name.
"""
newobj = PlaceName(name or self.name,
kind or self.kind,
suffix or self.suffix)
newobj.attr.update(self.attr)
if attr:
newobj.attr.update(attr)
return newobj
def set_attr(self, key: str, value: str) -> None:
""" Add the given property to the name. If the property was already
set, then the value is overwritten.
"""
self.attr[key] = value
def get_attr(self, key: str, default: Optional[str] = None) -> Optional[str]:
""" Return the given property or the value of 'default' if it
is not set.
"""
return self.attr.get(key, default)
def has_attr(self, key: str) -> bool:
""" Check if the given attribute is set.
"""
return key in self.attr
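A brief hedged sketch (names invented) showing how clone() merges rather than replaces attributes:
```
name = PlaceName('Hauptstraße', 'name', None)
name.set_attr('analyzer', 'street')
copy = name.clone(suffix='de', attr={'variant': 'ascii'})
assert copy.get_attr('analyzer') == 'street'   # inherited from the original
assert copy.get_attr('variant') == 'ascii'     # newly added
assert not name.has_attr('variant')            # the original is untouched
```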

View File

@@ -0,0 +1,114 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Functions for formatting postcodes according to their country-specific
format.
"""
from typing import Any, Mapping, Optional, Set, Match
import re
from nominatim_core.errors import UsageError
from . import country_info
class CountryPostcodeMatcher:
""" Matches and formats a postcode according to a format definition
of the given country.
"""
def __init__(self, country_code: str, config: Mapping[str, Any]) -> None:
if 'pattern' not in config:
raise UsageError("Field 'pattern' required for 'postcode' "
f"for country '{country_code}'")
pc_pattern = config['pattern'].replace('d', '[0-9]').replace('l', '[A-Z]')
self.norm_pattern = re.compile(f'\\s*(?:{country_code.upper()}[ -]?)?({pc_pattern})\\s*')
self.pattern = re.compile(pc_pattern)
self.output = config.get('output', r'\g<0>')
def match(self, postcode: str) -> Optional[Match[str]]:
""" Match the given postcode against the postcode pattern for this
matcher. Returns a `re.Match` object if the match was successful
and None otherwise.
"""
# Upper-case, strip spaces and leading country code.
normalized = self.norm_pattern.fullmatch(postcode.upper())
if normalized:
return self.pattern.fullmatch(normalized.group(1))
return None
def normalize(self, match: Match[str]) -> str:
""" Return the default format of the postcode for the given match.
`match` must be a `re.Match` object previously returned by
`match()`
"""
return match.expand(self.output)
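# Illustration (assumed configuration): for country 'de' with
# {'pattern': 'ddddd'}, the pattern expands to '[0-9][0-9][0-9][0-9][0-9]' and
# match('DE-12345') strips the country prefix, matching the plain '12345'.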
class PostcodeFormatter:
""" Container for different postcode formats of the world and
access functions.
"""
def __init__(self) -> None:
# Objects without a country code can't have a postcode by definition.
self.country_without_postcode: Set[Optional[str]] = {None}
self.country_matcher = {}
self.default_matcher = CountryPostcodeMatcher('', {'pattern': '.*'})
for ccode, prop in country_info.iterate('postcode'):
if prop is False:
self.country_without_postcode.add(ccode)
elif isinstance(prop, dict):
self.country_matcher[ccode] = CountryPostcodeMatcher(ccode, prop)
else:
raise UsageError(f"Invalid entry 'postcode' for country '{ccode}'")
def set_default_pattern(self, pattern: str) -> None:
""" Set the postcode match pattern to use, when a country does not
have a specific pattern.
"""
self.default_matcher = CountryPostcodeMatcher('', {'pattern': pattern})
def get_matcher(self, country_code: Optional[str]) -> Optional[CountryPostcodeMatcher]:
""" Return the CountryPostcodeMatcher for the given country.
Returns None if the country doesn't have a postcode and the
default matcher if there is no specific matcher configured for
the country.
"""
if country_code in self.country_without_postcode:
return None
assert country_code is not None
return self.country_matcher.get(country_code, self.default_matcher)
def match(self, country_code: Optional[str], postcode: str) -> Optional[Match[str]]:
""" Match the given postcode against the postcode pattern for this
matcher. Returns a `re.Match` object if the country has a pattern
and the match was successful or None if the match failed.
"""
if country_code in self.country_without_postcode:
return None
assert country_code is not None
return self.country_matcher.get(country_code, self.default_matcher).match(postcode)
def normalize(self, country_code: str, match: Match[str]) -> str:
""" Return the default format of the postcode for the given match.
`match` must be a `re.Match` object previously returned by
`match()`
"""
return self.country_matcher.get(country_code, self.default_matcher).normalize(match)

View File

@@ -0,0 +1,242 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Main work horse for indexing (computing addresses) the database.
"""
from typing import Optional, Any, cast
import logging
import time
import psycopg2.extras
from nominatim_core.typing import DictCursorResults
from nominatim_core.db.async_connection import DBConnection, WorkerPool
from nominatim_core.db.connection import connect, Connection, Cursor
from ..tokenizer.base import AbstractTokenizer
from .progress import ProgressLogger
from . import runners
LOG = logging.getLogger()
class PlaceFetcher:
""" Asynchronous connection that fetches place details for processing.
"""
def __init__(self, dsn: str, setup_conn: Connection) -> None:
self.wait_time = 0.0
self.current_ids: Optional[DictCursorResults] = None
self.conn: Optional[DBConnection] = DBConnection(dsn,
cursor_factory=psycopg2.extras.DictCursor)
with setup_conn.cursor() as cur:
# The hstore OIDs need to be fetched manually because register_hstore
# cannot fetch them on the asynchronous connection used below.
hstore_oid = cur.scalar("SELECT 'hstore'::regtype::oid")
hstore_array_oid = cur.scalar("SELECT 'hstore[]'::regtype::oid")
psycopg2.extras.register_hstore(self.conn.conn, oid=hstore_oid,
array_oid=hstore_array_oid)
def close(self) -> None:
""" Close the underlying asynchronous connection.
"""
if self.conn:
self.conn.close()
self.conn = None
def fetch_next_batch(self, cur: Cursor, runner: runners.Runner) -> bool:
""" Send a request for the next batch of places.
If details for the places are required, they will be fetched
asynchronously.
Returns true if there is still data available.
"""
ids = cast(Optional[DictCursorResults], cur.fetchmany(100))
if not ids:
self.current_ids = None
return False
assert self.conn is not None
self.current_ids = runner.get_place_details(self.conn, ids)
return True
def get_batch(self) -> DictCursorResults:
""" Get the next batch of data, previously requested with
`fetch_next_batch`.
"""
assert self.conn is not None
assert self.conn.cursor is not None
if self.current_ids is not None and not self.current_ids:
tstart = time.time()
self.conn.wait()
self.wait_time += time.time() - tstart
self.current_ids = cast(Optional[DictCursorResults],
self.conn.cursor.fetchall())
return self.current_ids if self.current_ids is not None else []
def __enter__(self) -> 'PlaceFetcher':
return self
def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None:
assert self.conn is not None
self.conn.wait()
self.close()
class Indexer:
""" Main indexing routine.
"""
def __init__(self, dsn: str, tokenizer: AbstractTokenizer, num_threads: int):
self.dsn = dsn
self.tokenizer = tokenizer
self.num_threads = num_threads
def has_pending(self) -> bool:
""" Check if any data still needs indexing.
This function must only be used after the import has finished.
Otherwise it will be very expensive.
"""
with connect(self.dsn) as conn:
with conn.cursor() as cur:
cur.execute("SELECT 'a' FROM placex WHERE indexed_status > 0 LIMIT 1")
return cur.rowcount > 0
def index_full(self, analyse: bool = True) -> None:
""" Index the complete database. This will first index boundaries
followed by all other objects. When `analyse` is True, then the
database will be analysed at the appropriate places to
ensure that database statistics are updated.
"""
with connect(self.dsn) as conn:
conn.autocommit = True
def _analyze() -> None:
if analyse:
with conn.cursor() as cur:
cur.execute('ANALYZE')
if self.index_by_rank(0, 4) > 0:
_analyze()
if self.index_boundaries(0, 30) > 100:
_analyze()
if self.index_by_rank(5, 25) > 100:
_analyze()
if self.index_by_rank(26, 30) > 1000:
_analyze()
if self.index_postcodes() > 100:
_analyze()
def index_boundaries(self, minrank: int, maxrank: int) -> int:
""" Index only administrative boundaries within the given rank range.
"""
total = 0
LOG.warning("Starting indexing boundaries using %s threads",
self.num_threads)
with self.tokenizer.name_analyzer() as analyzer:
for rank in range(max(minrank, 4), min(maxrank, 26)):
total += self._index(runners.BoundaryRunner(rank, analyzer))
return total
def index_by_rank(self, minrank: int, maxrank: int) -> int:
""" Index all entries of placex in the given rank range (inclusive)
in order of their address rank.
When rank 30 is requested then also interpolations and
places with address rank 0 will be indexed.
"""
total = 0
maxrank = min(maxrank, 30)
LOG.warning("Starting indexing rank (%i to %i) using %i threads",
minrank, maxrank, self.num_threads)
with self.tokenizer.name_analyzer() as analyzer:
for rank in range(max(1, minrank), maxrank + 1):
total += self._index(runners.RankRunner(rank, analyzer), 20 if rank == 30 else 1)
if maxrank == 30:
total += self._index(runners.RankRunner(0, analyzer))
total += self._index(runners.InterpolationRunner(analyzer), 20)
return total
def index_postcodes(self) -> int:
"""Index the entries of the location_postcode table.
"""
LOG.warning("Starting indexing postcodes using %s threads", self.num_threads)
return self._index(runners.PostcodeRunner(), 20)
def update_status_table(self) -> None:
""" Update the status in the status table to 'indexed'.
"""
with connect(self.dsn) as conn:
with conn.cursor() as cur:
cur.execute('UPDATE import_status SET indexed = true')
conn.commit()
def _index(self, runner: runners.Runner, batch: int = 1) -> int:
""" Index a single rank or table. `runner` describes the SQL to use
for indexing. `batch` describes the number of objects that
should be processed with a single SQL statement
"""
LOG.warning("Starting %s (using batch size %s)", runner.name(), batch)
with connect(self.dsn) as conn:
psycopg2.extras.register_hstore(conn)
with conn.cursor() as cur:
total_tuples = cur.scalar(runner.sql_count_objects())
LOG.debug("Total number of rows: %i", total_tuples)
conn.commit()
progress = ProgressLogger(runner.name(), total_tuples)
if total_tuples > 0:
with conn.cursor(name='places') as cur:
cur.execute(runner.sql_get_objects())
with PlaceFetcher(self.dsn, conn) as fetcher:
with WorkerPool(self.dsn, self.num_threads) as pool:
has_more = fetcher.fetch_next_batch(cur, runner)
while has_more:
places = fetcher.get_batch()
# asynchronously get the next batch
has_more = fetcher.fetch_next_batch(cur, runner)
# And insert the current batch
for idx in range(0, len(places), batch):
part = places[idx:idx + batch]
LOG.debug("Processing places: %s", str(part))
runner.index_places(pool.next_free_worker(), part)
progress.add(len(part))
LOG.info("Wait time: fetcher: %.2fs, pool: %.2fs",
fetcher.wait_time, pool.wait_time)
conn.commit()
return progress.done()
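A hedged usage sketch of the Indexer API above; `dsn` and `tokenizer` are assumed to come from the configuration and the tokenizer factory, and the thread count is illustrative:
```
indexer = Indexer(dsn, tokenizer, num_threads=4)
indexer.index_full(analyse=True)
# After applying updates, only the remaining backlog needs processing.
if indexer.has_pending():
    indexer.index_full(analyse=False)
```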

View File

@@ -0,0 +1,74 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Helpers for progress logging.
"""
import logging
from datetime import datetime
LOG = logging.getLogger()
INITIAL_PROGRESS = 10
class ProgressLogger:
""" Tracks and prints progress for the indexing process.
`name` is the name of the indexing step being tracked.
`total` sets up the total number of items that need processing.
`log_interval` denotes the interval in seconds at which progress
should be reported.
"""
def __init__(self, name: str, total: int, log_interval: int = 1) -> None:
self.name = name
self.total_places = total
self.done_places = 0
self.rank_start_time = datetime.now()
self.log_interval = log_interval
self.next_info = INITIAL_PROGRESS if LOG.isEnabledFor(logging.WARNING) else total + 1
def add(self, num: int = 1) -> None:
""" Mark `num` places as processed. Print a log message if the
logging is at least info and the log interval has passed.
"""
self.done_places += num
if self.done_places < self.next_info:
return
now = datetime.now()
done_time = (now - self.rank_start_time).total_seconds()
if done_time < 2:
self.next_info = self.done_places + INITIAL_PROGRESS
return
places_per_sec = self.done_places / done_time
eta = (self.total_places - self.done_places) / places_per_sec
LOG.warning("Done %d in %d @ %.3f per second - %s ETA (seconds): %.2f",
self.done_places, int(done_time),
places_per_sec, self.name, eta)
self.next_info += int(places_per_sec) * self.log_interval
def done(self) -> int:
""" Print final statistics about the progress.
"""
rank_end_time = datetime.now()
if rank_end_time == self.rank_start_time:
diff_seconds = 0.0
places_per_sec = float(self.done_places)
else:
diff_seconds = (rank_end_time - self.rank_start_time).total_seconds()
places_per_sec = self.done_places / diff_seconds
LOG.warning("Done %d/%d in %d @ %.3f per second - FINISHED %s\n",
self.done_places, self.total_places, int(diff_seconds),
places_per_sec, self.name)
return self.done_places
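A minimal hedged sketch of the intended call pattern; the work items and processing step are invented:
```
progress = ProgressLogger('rank 26', total=50000)
for batch in work_batches:        # hypothetical iterable of place batches
    process(batch)                # hypothetical indexing step
    progress.add(len(batch))
processed = progress.done()       # prints the final statistics
```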

View File

@@ -0,0 +1,196 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Mix-ins that provide the actual commands for the indexer for various indexing
tasks.
"""
from typing import Any, List
import functools
from psycopg2 import sql as pysql
import psycopg2.extras
from nominatim_core.typing import Query, DictCursorResult, DictCursorResults, Protocol
from nominatim_core.db.async_connection import DBConnection
from ..data.place_info import PlaceInfo
from ..tokenizer.base import AbstractAnalyzer
# pylint: disable=C0111
def _mk_valuelist(template: str, num: int) -> pysql.Composed:
return pysql.SQL(',').join([pysql.SQL(template)] * num)
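# Example: _mk_valuelist("(%s, %s)", 3) composes the SQL fragment
# "(%s, %s),(%s, %s),(%s, %s)" for a multi-row VALUES clause.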
def _analyze_place(place: DictCursorResult, analyzer: AbstractAnalyzer) -> psycopg2.extras.Json:
return psycopg2.extras.Json(analyzer.process_place(PlaceInfo(place)))
class Runner(Protocol):
def name(self) -> str: ...
def sql_count_objects(self) -> Query: ...
def sql_get_objects(self) -> Query: ...
def get_place_details(self, worker: DBConnection,
ids: DictCursorResults) -> DictCursorResults: ...
def index_places(self, worker: DBConnection, places: DictCursorResults) -> None: ...
class AbstractPlacexRunner:
""" Returns SQL commands for indexing of the placex table.
"""
SELECT_SQL = pysql.SQL('SELECT place_id FROM placex ')
UPDATE_LINE = "(%s, %s::hstore, %s::hstore, %s::int, %s::jsonb)"
def __init__(self, rank: int, analyzer: AbstractAnalyzer) -> None:
self.rank = rank
self.analyzer = analyzer
@functools.lru_cache(maxsize=1)
def _index_sql(self, num_places: int) -> pysql.Composed:
return pysql.SQL(
""" UPDATE placex
SET indexed_status = 0, address = v.addr, token_info = v.ti,
name = v.name, linked_place_id = v.linked_place_id
FROM (VALUES {}) as v(id, name, addr, linked_place_id, ti)
WHERE place_id = v.id
""").format(_mk_valuelist(AbstractPlacexRunner.UPDATE_LINE, num_places))
def get_place_details(self, worker: DBConnection, ids: DictCursorResults) -> DictCursorResults:
worker.perform("""SELECT place_id, extra.*
FROM placex, LATERAL placex_indexing_prepare(placex) as extra
WHERE place_id IN %s""",
(tuple((p[0] for p in ids)), ))
return []
def index_places(self, worker: DBConnection, places: DictCursorResults) -> None:
values: List[Any] = []
for place in places:
for field in ('place_id', 'name', 'address', 'linked_place_id'):
values.append(place[field])
values.append(_analyze_place(place, self.analyzer))
worker.perform(self._index_sql(len(places)), values)
class RankRunner(AbstractPlacexRunner):
""" Returns SQL commands for indexing one rank within the placex table.
"""
def name(self) -> str:
return f"rank {self.rank}"
def sql_count_objects(self) -> pysql.Composed:
return pysql.SQL("""SELECT count(*) FROM placex
WHERE rank_address = {} and indexed_status > 0
""").format(pysql.Literal(self.rank))
def sql_get_objects(self) -> pysql.Composed:
return self.SELECT_SQL + pysql.SQL(
"""WHERE indexed_status > 0 and rank_address = {}
ORDER BY geometry_sector
""").format(pysql.Literal(self.rank))
class BoundaryRunner(AbstractPlacexRunner):
""" Returns SQL commands for indexing the administrative boundaries
of a certain rank.
"""
def name(self) -> str:
return f"boundaries rank {self.rank}"
def sql_count_objects(self) -> pysql.Composed:
return pysql.SQL("""SELECT count(*) FROM placex
WHERE indexed_status > 0
AND rank_search = {}
AND class = 'boundary' and type = 'administrative'
""").format(pysql.Literal(self.rank))
def sql_get_objects(self) -> pysql.Composed:
return self.SELECT_SQL + pysql.SQL(
"""WHERE indexed_status > 0 and rank_search = {}
and class = 'boundary' and type = 'administrative'
ORDER BY partition, admin_level
""").format(pysql.Literal(self.rank))
class InterpolationRunner:
""" Returns SQL commands for indexing the address interpolation table
location_property_osmline.
"""
def __init__(self, analyzer: AbstractAnalyzer) -> None:
self.analyzer = analyzer
def name(self) -> str:
return "interpolation lines (location_property_osmline)"
def sql_count_objects(self) -> str:
return """SELECT count(*) FROM location_property_osmline
WHERE indexed_status > 0"""
def sql_get_objects(self) -> str:
return """SELECT place_id
FROM location_property_osmline
WHERE indexed_status > 0
ORDER BY geometry_sector"""
def get_place_details(self, worker: DBConnection, ids: DictCursorResults) -> DictCursorResults:
worker.perform("""SELECT place_id, get_interpolation_address(address, osm_id) as address
FROM location_property_osmline WHERE place_id IN %s""",
(tuple((p[0] for p in ids)), ))
return []
@functools.lru_cache(maxsize=1)
def _index_sql(self, num_places: int) -> pysql.Composed:
return pysql.SQL("""UPDATE location_property_osmline
SET indexed_status = 0, address = v.addr, token_info = v.ti
FROM (VALUES {}) as v(id, addr, ti)
WHERE place_id = v.id
""").format(_mk_valuelist("(%s, %s::hstore, %s::jsonb)", num_places))
def index_places(self, worker: DBConnection, places: DictCursorResults) -> None:
values: List[Any] = []
for place in places:
values.extend((place[x] for x in ('place_id', 'address')))
values.append(_analyze_place(place, self.analyzer))
worker.perform(self._index_sql(len(places)), values)
class PostcodeRunner(Runner):
""" Provides the SQL commands for indexing the location_postcode table.
"""
def name(self) -> str:
return "postcodes (location_postcode)"
def sql_count_objects(self) -> str:
return 'SELECT count(*) FROM location_postcode WHERE indexed_status > 0'
def sql_get_objects(self) -> str:
return """SELECT place_id FROM location_postcode
WHERE indexed_status > 0
ORDER BY country_code, postcode"""
def get_place_details(self, worker: DBConnection, ids: DictCursorResults) -> DictCursorResults:
return ids
def index_places(self, worker: DBConnection, places: DictCursorResults) -> None:
worker.perform(pysql.SQL("""UPDATE location_postcode SET indexed_status = 0
WHERE place_id IN ({})""")
.format(pysql.SQL(',').join((pysql.Literal(i[0]) for i in places))))

View File

@@ -0,0 +1,253 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Abstract class definitions for tokenizers. These base classes are here
mainly for documentation purposes.
"""
from abc import ABC, abstractmethod
from typing import List, Tuple, Dict, Any, Optional, Iterable
from pathlib import Path
from nominatim_core.typing import Protocol
from nominatim_core.config import Configuration
from nominatim_core.db.connection import Connection
from ..data.place_info import PlaceInfo
class AbstractAnalyzer(ABC):
""" The analyzer provides the functions for analysing names and building
the token database.
Analyzers are instantiated on a per-thread basis. Access to global data
structures must be synchronised accordingly.
"""
def __enter__(self) -> 'AbstractAnalyzer':
return self
def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None:
self.close()
@abstractmethod
def close(self) -> None:
""" Free all resources used by the analyzer.
"""
@abstractmethod
def get_word_token_info(self, words: List[str]) -> List[Tuple[str, str, int]]:
""" Return token information for the given list of words.
The function is used for testing and debugging only
and does not need to be particularly efficient.
Arguments:
words: A list of words to look up the tokens for.
If a word starts with # it is assumed to be a full name,
otherwise it is a partial term.
Returns:
The function returns the list of all tuples that could be
found for the given words. Each list entry is a tuple of
(original word, word token, word id).
"""
@abstractmethod
def normalize_postcode(self, postcode: str) -> str:
""" Convert the postcode to its standardized form.
This function must yield exactly the same result as the SQL function
`token_normalized_postcode()`.
Arguments:
postcode: The postcode to be normalized.
Returns:
The given postcode after normalization.
"""
@abstractmethod
def update_postcodes_from_db(self) -> None:
""" Update the tokenizer's postcode tokens from the current content
of the `location_postcode` table.
"""
@abstractmethod
def update_special_phrases(self,
phrases: Iterable[Tuple[str, str, str, str]],
should_replace: bool) -> None:
""" Update the tokenizer's special phrase tokens from the given
list of special phrases.
Arguments:
phrases: The new list of special phrases. Each entry is
a tuple of (phrase, class, type, operator).
should_replace: If true, replace the current list of phrases.
When false, just add the given phrases to the
ones that already exist.
"""
@abstractmethod
def add_country_names(self, country_code: str, names: Dict[str, str]) -> None:
""" Add the given names to the tokenizer's list of country tokens.
Arguments:
country_code: two-letter country code for the country the names
refer to.
names: Dictionary of name type to name.
"""
@abstractmethod
def process_place(self, place: PlaceInfo) -> Any:
""" Extract tokens for the given place and compute the
information to be handed to the PL/pgSQL processor for building
the search index.
Arguments:
place: Place information retrieved from the database.
Returns:
A JSON-serialisable structure that will be handed into
the database via the `token_info` field.
"""
class AbstractTokenizer(ABC):
""" The tokenizer instance is the central instance of the tokenizer in
the system. There will only be a single instance of the tokenizer
active at any time.
"""
@abstractmethod
def init_new_db(self, config: Configuration, init_db: bool = True) -> None:
""" Set up a new tokenizer for the database.
The function should copy all necessary data into the project
directory or save it in the property table to make sure that
the tokenizer remains stable over updates.
Arguments:
config: Read-only object with configuration options.
init_db: When set to False, then initialisation of database
tables should be skipped. This option is only required for
migration purposes and can be safely ignored by custom
tokenizers.
"""
@abstractmethod
def init_from_project(self, config: Configuration) -> None:
""" Initialise the tokenizer from an existing database setup.
The function should load all previously saved configuration from
the project directory and/or the property table.
Arguments:
config: Read-only object with configuration options.
"""
@abstractmethod
def finalize_import(self, config: Configuration) -> None:
""" This function is called at the very end of an import when all
data has been imported and indexed. The tokenizer may create
at this point any additional indexes and data structures needed
during query time.
Arguments:
config: Read-only object with configuration options.
"""
@abstractmethod
def update_sql_functions(self, config: Configuration) -> None:
""" Update the SQL part of the tokenizer. This function is called
automatically on migrations or may be called explicitly by the
user through the `nominatim refresh --functions` command.
The tokenizer must only update the code of the tokenizer. The
data structures or data itself must not be changed by this function.
Arguments:
config: Read-only object with configuration options.
"""
@abstractmethod
def check_database(self, config: Configuration) -> Optional[str]:
""" Check that the database is set up correctly and ready for being
queried.
Arguments:
config: Read-only object with configuration options.
Returns:
If an issue was found, return an error message with the
description of the issue as well as hints for the user on
how to resolve the issue. If everything is okay, return `None`.
"""
@abstractmethod
def update_statistics(self, config: Configuration, threads: int = 1) -> None:
""" Recompute any tokenizer statistics necessary for efficient lookup.
This function is meant to be called from time to time by the user
to improve performance. However, the tokenizer must not depend on
it to be called in order to work.
"""
@abstractmethod
def update_word_tokens(self) -> None:
""" Do house-keeping on the tokenizers internal data structures.
Remove unused word tokens, resort data etc.
"""
@abstractmethod
def name_analyzer(self) -> AbstractAnalyzer:
""" Create a new analyzer for tokenizing names and queries
using this tokenizer. Analyzers are context managers and should
be used accordingly:
```
with tokenizer.name_analyzer() as analyzer:
analyzer.tokenize()
```
When used outside the with construct, the caller must ensure that
close() is called before the analyzer is discarded.
"""
@abstractmethod
def most_frequent_words(self, conn: Connection, num: int) -> List[str]:
""" Return a list of the most frequent full words in the database.
Arguments:
conn: Open connection to the database which may be used to
retrieve the words.
num: Maximum number of words to return.
"""
class TokenizerModule(Protocol):
""" Interface that must be exported by modules that implement their
own tokenizer.
"""
def create(self, dsn: str, data_dir: Path) -> AbstractTokenizer:
""" Factory for new tokenizers.
"""

View File

@@ -0,0 +1,102 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Functions for creating a tokenizer or initialising the right one for an
existing database.
A tokenizer is something that is bound to the lifetime of a database. It
can be chosen and configured before the initial import but then needs to
be used consistently when querying and updating the database.
This module provides the functions to create and configure a new tokenizer
as well as instantiating the appropriate tokenizer for updating an existing
database.
A tokenizer usually also includes PHP code for querying. The appropriate PHP
normalizer module is installed when the tokenizer is created.
"""
from typing import Optional
import logging
import importlib
from pathlib import Path
from nominatim_core.errors import UsageError
from nominatim_core.db import properties
from nominatim_core.db.connection import connect
from nominatim_core.config import Configuration
from ..tokenizer.base import AbstractTokenizer, TokenizerModule
LOG = logging.getLogger()
def _import_tokenizer(name: str) -> TokenizerModule:
""" Load the tokenizer.py module from project directory.
"""
src_file = Path(__file__).parent / (name + '_tokenizer.py')
if not src_file.is_file():
LOG.fatal("No tokenizer named '%s' available. "
"Check the setting of NOMINATIM_TOKENIZER.", name)
raise UsageError('Tokenizer not found')
return importlib.import_module('nominatim_db.tokenizer.' + name + '_tokenizer')
def create_tokenizer(config: Configuration, init_db: bool = True,
module_name: Optional[str] = None) -> AbstractTokenizer:
""" Create a new tokenizer as defined by the given configuration.
The tokenizer data and code are copied into the 'tokenizer' directory
of the project directory and the tokenizer is loaded from its new location.
"""
if module_name is None:
module_name = config.TOKENIZER
# Create the directory for the tokenizer data
assert config.project_dir is not None
basedir = config.project_dir / 'tokenizer'
if not basedir.exists():
basedir.mkdir()
elif not basedir.is_dir():
LOG.fatal("Tokenizer directory '%s' cannot be created.", basedir)
raise UsageError("Tokenizer setup failed.")
# Import and initialize the tokenizer.
tokenizer_module = _import_tokenizer(module_name)
tokenizer = tokenizer_module.create(config.get_libpq_dsn(), basedir)
tokenizer.init_new_db(config, init_db=init_db)
with connect(config.get_libpq_dsn()) as conn:
properties.set_property(conn, 'tokenizer', module_name)
return tokenizer
def get_tokenizer_for_db(config: Configuration) -> AbstractTokenizer:
""" Instantiate a tokenizer for an existing database.
The function looks up the appropriate tokenizer in the database
and initialises it.
"""
assert config.project_dir is not None
basedir = config.project_dir / 'tokenizer'
if not basedir.is_dir():
# Directory will be repopulated by tokenizer below.
basedir.mkdir()
with connect(config.get_libpq_dsn()) as conn:
name = properties.get_property(conn, 'tokenizer')
if name is None:
LOG.fatal("Tokenizer was not set up properly. Database property missing.")
raise UsageError('Cannot initialize tokenizer.')
tokenizer_module = _import_tokenizer(name)
tokenizer = tokenizer_module.create(config.get_libpq_dsn(), basedir)
tokenizer.init_from_project(config)
return tokenizer
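A hedged usage sketch of the two entry points, assuming a fully initialised Configuration object `config` and the package layout introduced by this commit:
```
from nominatim_db.tokenizer import factory as tokenizer_factory

tokenizer = tokenizer_factory.create_tokenizer(config)      # during import
# Later invocations read the stored tokenizer name from the database:
tokenizer = tokenizer_factory.get_tokenizer_for_db(config)
```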

View File

@@ -0,0 +1,196 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Helper class to create ICU rules from a configuration file.
"""
from typing import Mapping, Any, Dict, Optional
import io
import json
import logging
from icu import Transliterator
from nominatim_core.config import flatten_config_list, Configuration
from nominatim_core.db.properties import set_property, get_property
from nominatim_core.db.connection import Connection
from nominatim_core.errors import UsageError
from .place_sanitizer import PlaceSanitizer
from .icu_token_analysis import ICUTokenAnalysis
from .token_analysis.base import AnalysisModule, Analyzer
from ..data import country_info
LOG = logging.getLogger()
DBCFG_IMPORT_NORM_RULES = "tokenizer_import_normalisation"
DBCFG_IMPORT_TRANS_RULES = "tokenizer_import_transliteration"
DBCFG_IMPORT_ANALYSIS_RULES = "tokenizer_import_analysis_rules"
def _get_section(rules: Mapping[str, Any], section: str) -> Any:
""" Get the section named 'section' from the rules. If the section does
not exist, raise a usage error with a meaningful message.
"""
if section not in rules:
LOG.fatal("Section '%s' not found in tokenizer config.", section)
raise UsageError("Syntax error in tokenizer configuration file.")
return rules[section]
class ICURuleLoader:
""" Compiler for ICU rules from a tokenizer configuration file.
"""
def __init__(self, config: Configuration) -> None:
self.config = config
rules = config.load_sub_configuration('icu_tokenizer.yaml',
config='TOKENIZER_CONFIG')
# Make sure country information is available to analyzers and sanitizers.
country_info.setup_country_config(config)
self.normalization_rules = self._cfg_to_icu_rules(rules, 'normalization')
self.transliteration_rules = self._cfg_to_icu_rules(rules, 'transliteration')
self.analysis_rules = _get_section(rules, 'token-analysis')
self._setup_analysis()
# Load optional sanitizer rule set.
self.sanitizer_rules = rules.get('sanitizers', [])
def load_config_from_db(self, conn: Connection) -> None:
""" Get previously saved parts of the configuration from the
database.
"""
rules = get_property(conn, DBCFG_IMPORT_NORM_RULES)
if rules is not None:
self.normalization_rules = rules
rules = get_property(conn, DBCFG_IMPORT_TRANS_RULES)
if rules is not None:
self.transliteration_rules = rules
rules = get_property(conn, DBCFG_IMPORT_ANALYSIS_RULES)
if rules:
self.analysis_rules = json.loads(rules)
else:
self.analysis_rules = []
self._setup_analysis()
def save_config_to_db(self, conn: Connection) -> None:
""" Save the part of the configuration that cannot be changed into
the database.
"""
set_property(conn, DBCFG_IMPORT_NORM_RULES, self.normalization_rules)
set_property(conn, DBCFG_IMPORT_TRANS_RULES, self.transliteration_rules)
set_property(conn, DBCFG_IMPORT_ANALYSIS_RULES, json.dumps(self.analysis_rules))
def make_sanitizer(self) -> PlaceSanitizer:
""" Create a place sanitizer from the configured rules.
"""
return PlaceSanitizer(self.sanitizer_rules, self.config)
def make_token_analysis(self) -> ICUTokenAnalysis:
""" Create a token analyser from the reviouly loaded rules.
"""
return ICUTokenAnalysis(self.normalization_rules,
self.transliteration_rules, self.analysis)
def get_search_rules(self) -> str:
""" Return the ICU rules to be used during search.
The rules combine normalization and transliteration.
"""
# First apply the normalization rules.
rules = io.StringIO()
rules.write(self.normalization_rules)
# Then add transliteration.
rules.write(self.transliteration_rules)
return rules.getvalue()
def get_normalization_rules(self) -> str:
""" Return rules for normalisation of a term.
"""
return self.normalization_rules
def get_transliteration_rules(self) -> str:
""" Return the rules for converting a string into its asciii representation.
"""
return self.transliteration_rules
def _setup_analysis(self) -> None:
""" Process the rules used for creating the various token analyzers.
"""
self.analysis: Dict[Optional[str], TokenAnalyzerRule] = {}
if not isinstance(self.analysis_rules, list):
raise UsageError("Configuration section 'token-analysis' must be a list.")
norm = Transliterator.createFromRules("rule_loader_normalization",
self.normalization_rules)
trans = Transliterator.createFromRules("rule_loader_transliteration",
self.transliteration_rules)
for section in self.analysis_rules:
name = section.get('id', None)
if name in self.analysis:
if name is None:
LOG.fatal("ICU tokenizer configuration has two default token analyzers.")
else:
LOG.fatal("ICU tokenizer configuration has two token "
"analyzers with id '%s'.", name)
raise UsageError("Syntax error in ICU tokenizer config.")
self.analysis[name] = TokenAnalyzerRule(section, norm, trans,
self.config)
@staticmethod
def _cfg_to_icu_rules(rules: Mapping[str, Any], section: str) -> str:
""" Load an ICU ruleset from the given section. If the section is a
simple string, it is interpreted as a file name and the rules are
loaded verbatim from the given file. The filename is expected to be
relative to the tokenizer rule file. If the section is a list, then
each entry is assumed to be a rule. All rules are concatenated and returned.
"""
content = _get_section(rules, section)
if content is None:
return ''
return ';'.join(flatten_config_list(content, section)) + ';'
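# Illustration (not in the original file) of the list form: a section like
#   normalization:
#     - ":: lower ()"
#     - ":: NFD ()"
# yields the single ';'-separated, ';'-terminated ruleset
#   ":: lower ();:: NFD ();"
# which is the form the ICU rule compiler expects.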
class TokenAnalyzerRule:
""" Factory for a single analysis module. The class saves the configuration
and creates a new token analyzer on request.
"""
def __init__(self, rules: Mapping[str, Any],
normalizer: Any, transliterator: Any,
config: Configuration) -> None:
analyzer_name = _get_section(rules, 'analyzer')
if not analyzer_name or not isinstance(analyzer_name, str):
raise UsageError("'analyzer' parameter needs to be simple string")
self._analysis_mod: AnalysisModule = \
config.load_plugin_module(analyzer_name, 'nominatim.tokenizer.token_analysis')
self.config = self._analysis_mod.configure(rules, normalizer,
transliterator)
def create(self, normalizer: Any, transliterator: Any) -> Analyzer:
""" Create a new analyser instance for the given rule.
"""
return self._analysis_mod.create(normalizer, transliterator, self.config)
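
A usage sketch (not part of the commit) of how the loader's pieces fit
together during an import; `config` is assumed to be an initialised
Configuration and `conn` an open connection from nominatim_core.db.connection:

```
loader = ICURuleLoader(config)
sanitizer = loader.make_sanitizer()        # applies the 'sanitizers' rules
analysis = loader.make_token_analysis()    # builds the ICU transliterators
loader.save_config_to_db(conn)             # freeze the rules for this database
```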

View File

@@ -0,0 +1,43 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Container class collecting all components required to transform an OSM name
into a Nominatim token.
"""
from typing import Mapping, Optional, TYPE_CHECKING
from icu import Transliterator
from .token_analysis.base import Analyzer
if TYPE_CHECKING:
from typing import Any
from .icu_rule_loader import TokenAnalyzerRule # pylint: disable=cyclic-import
class ICUTokenAnalysis:
""" Container class collecting the transliterators and token analysis
modules for a single Analyser instance.
"""
def __init__(self, norm_rules: str, trans_rules: str,
analysis_rules: Mapping[Optional[str], 'TokenAnalyzerRule']):
self.normalizer = Transliterator.createFromRules("icu_normalization",
norm_rules)
trans_rules += ";[:Space:]+ > ' '"
self.to_ascii = Transliterator.createFromRules("icu_to_ascii",
trans_rules)
self.search = Transliterator.createFromRules("icu_search",
norm_rules + trans_rules)
self.analysis = {name: arules.create(self.normalizer, self.to_ascii)
for name, arules in analysis_rules.items()}
def get_analyzer(self, name: Optional[str]) -> Analyzer:
""" Return the given named analyzer. If no analyzer with that
name exists, return the default analyzer.
"""
return self.analysis.get(name) or self.analysis[None]
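
For illustration only (not part of the commit), here is what the
transliterators built above do; the rule strings are stand-ins, not the
shipped Nominatim rules:

```
from icu import Transliterator

norm = Transliterator.createFromRules("demo_norm", ":: lower ();")
to_ascii = Transliterator.createFromRules(
    "demo_ascii", ":: Latin-ASCII ();[:Space:]+ > ' '")

name = "Königstraße  12"
normalized = norm.transliterate(name)        # 'königstraße  12'
print(to_ascii.transliterate(normalized))    # 'konigstrasse 12'
```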

View File

@@ -0,0 +1,952 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Tokenizer implementing normalisation as used before Nominatim 4 but using
libICU instead of the PostgreSQL module.
"""
from typing import Optional, Sequence, List, Tuple, Mapping, Any, cast, \
Dict, Set, Iterable
import itertools
import json
import logging
from pathlib import Path
from textwrap import dedent
from nominatim_core.db.connection import connect, Connection, Cursor
from nominatim_core.config import Configuration
from nominatim_core.db.utils import CopyBuffer
from nominatim_core.db.sql_preprocessor import SQLPreprocessor
from ..data.place_info import PlaceInfo
from ..data.place_name import PlaceName
from .icu_rule_loader import ICURuleLoader
from .place_sanitizer import PlaceSanitizer
from .icu_token_analysis import ICUTokenAnalysis
from .base import AbstractAnalyzer, AbstractTokenizer
DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"
LOG = logging.getLogger()
WORD_TYPES = (('country_names', 'C'),
('postcodes', 'P'),
('full_word', 'W'),
('housenumbers', 'H'))
def create(dsn: str, data_dir: Path) -> 'ICUTokenizer':
""" Create a new instance of the tokenizer provided by this module.
"""
return ICUTokenizer(dsn, data_dir)
class ICUTokenizer(AbstractTokenizer):
""" This tokenizer uses libICU to convert names and queries to ASCII.
Otherwise it uses the same algorithms and data structures as the
normalization routines in Nominatim 3.
"""
def __init__(self, dsn: str, data_dir: Path) -> None:
self.dsn = dsn
self.data_dir = data_dir
self.loader: Optional[ICURuleLoader] = None
def init_new_db(self, config: Configuration, init_db: bool = True) -> None:
""" Set up a new tokenizer for the database.
This copies all necessary data into the project directory to make
sure the tokenizer remains stable even over updates.
"""
self.loader = ICURuleLoader(config)
self._install_php(config.lib_dir.php, overwrite=True)
self._save_config()
if init_db:
self.update_sql_functions(config)
self._setup_db_tables(config)
self._create_base_indices(config, 'word')
def init_from_project(self, config: Configuration) -> None:
""" Initialise the tokenizer from the project directory.
"""
self.loader = ICURuleLoader(config)
with connect(self.dsn) as conn:
self.loader.load_config_from_db(conn)
self._install_php(config.lib_dir.php, overwrite=False)
def finalize_import(self, config: Configuration) -> None:
""" Do any required postprocessing to make the tokenizer data ready
for use.
"""
self._create_lookup_indices(config, 'word')
def update_sql_functions(self, config: Configuration) -> None:
""" Reimport the SQL functions for this tokenizer.
"""
with connect(self.dsn) as conn:
sqlp = SQLPreprocessor(conn, config)
sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')
def check_database(self, config: Configuration) -> None:
""" Check that the tokenizer is set up correctly.
"""
# Will throw an error if there is an issue.
self.init_from_project(config)
def update_statistics(self, config: Configuration, threads: int = 2) -> None:
""" Recompute frequencies for all name words.
"""
with connect(self.dsn) as conn:
if not conn.table_exists('search_name'):
return
with conn.cursor() as cur:
cur.execute('ANALYSE search_name')
if threads > 1:
cur.execute('SET max_parallel_workers_per_gather TO %s',
(min(threads, 6),))
if conn.server_version_tuple() < (12, 0):
LOG.info('Computing word frequencies')
cur.drop_table('word_frequencies')
cur.drop_table('addressword_frequencies')
cur.execute("""CREATE TEMP TABLE word_frequencies AS
SELECT unnest(name_vector) as id, count(*)
FROM search_name GROUP BY id""")
cur.execute('CREATE INDEX ON word_frequencies(id)')
cur.execute("""CREATE TEMP TABLE addressword_frequencies AS
SELECT unnest(nameaddress_vector) as id, count(*)
FROM search_name GROUP BY id""")
cur.execute('CREATE INDEX ON addressword_frequencies(id)')
cur.execute("""CREATE OR REPLACE FUNCTION word_freq_update(wid INTEGER,
INOUT info JSONB)
AS $$
DECLARE rec RECORD;
BEGIN
IF info is null THEN
info = '{}'::jsonb;
END IF;
FOR rec IN SELECT count FROM word_frequencies WHERE id = wid
LOOP
info = info || jsonb_build_object('count', rec.count);
END LOOP;
FOR rec IN SELECT count FROM addressword_frequencies WHERE id = wid
LOOP
info = info || jsonb_build_object('addr_count', rec.count);
END LOOP;
IF info = '{}'::jsonb THEN
info = null;
END IF;
END;
$$ LANGUAGE plpgsql IMMUTABLE;
""")
LOG.info('Update word table with recomputed frequencies')
cur.drop_table('tmp_word')
cur.execute("""CREATE TABLE tmp_word AS
SELECT word_id, word_token, type, word,
word_freq_update(word_id, info) as info
FROM word
""")
cur.drop_table('word_frequencies')
cur.drop_table('addressword_frequencies')
else:
LOG.info('Computing word frequencies')
cur.drop_table('word_frequencies')
cur.execute("""
CREATE TEMP TABLE word_frequencies AS
WITH word_freq AS MATERIALIZED (
SELECT unnest(name_vector) as id, count(*)
FROM search_name GROUP BY id),
addr_freq AS MATERIALIZED (
SELECT unnest(nameaddress_vector) as id, count(*)
FROM search_name GROUP BY id)
SELECT coalesce(a.id, w.id) as id,
(CASE WHEN w.count is null THEN '{}'::JSONB
ELSE jsonb_build_object('count', w.count) END
||
CASE WHEN a.count is null THEN '{}'::JSONB
ELSE jsonb_build_object('addr_count', a.count) END) as info
FROM word_freq w FULL JOIN addr_freq a ON a.id = w.id;
""")
cur.execute('CREATE UNIQUE INDEX ON word_frequencies(id) INCLUDE(info)')
cur.execute('ANALYSE word_frequencies')
LOG.info('Update word table with recomputed frequencies')
cur.drop_table('tmp_word')
cur.execute("""CREATE TABLE tmp_word AS
SELECT word_id, word_token, type, word,
(CASE WHEN wf.info is null THEN word.info
ELSE coalesce(word.info, '{}'::jsonb) || wf.info
END) as info
FROM word LEFT JOIN word_frequencies wf
ON word.word_id = wf.id
""")
cur.drop_table('word_frequencies')
with conn.cursor() as cur:
cur.execute('SET max_parallel_workers_per_gather TO 0')
sqlp = SQLPreprocessor(conn, config)
sqlp.run_string(conn,
'GRANT SELECT ON tmp_word TO "{{config.DATABASE_WEBUSER}}"')
conn.commit()
self._create_base_indices(config, 'tmp_word')
self._create_lookup_indices(config, 'tmp_word')
self._move_temporary_word_table('tmp_word')
def _cleanup_housenumbers(self) -> None:
""" Remove unused house numbers.
"""
with connect(self.dsn) as conn:
if not conn.table_exists('search_name'):
return
with conn.cursor(name="hnr_counter") as cur:
cur.execute("""SELECT DISTINCT word_id, coalesce(info->>'lookup', word_token)
FROM word
WHERE type = 'H'
AND NOT EXISTS(SELECT * FROM search_name
WHERE ARRAY[word.word_id] && name_vector)
AND (char_length(coalesce(word, word_token)) > 6
OR coalesce(word, word_token) not similar to '\\d+')
""")
candidates = {token: wid for wid, token in cur}
with conn.cursor(name="hnr_counter") as cur:
cur.execute("""SELECT housenumber FROM placex
WHERE housenumber is not null
AND (char_length(housenumber) > 6
OR housenumber not similar to '\\d+')
""")
for row in cur:
for hnr in row[0].split(';'):
candidates.pop(hnr, None)
LOG.info("There are %s outdated housenumbers.", len(candidates))
LOG.debug("Outdated housenumbers: %s", candidates.keys())
if candidates:
with conn.cursor() as cur:
cur.execute("""DELETE FROM word WHERE word_id = any(%s)""",
(list(candidates.values()), ))
conn.commit()
def update_word_tokens(self) -> None:
""" Remove unused tokens.
"""
LOG.warning("Cleaning up housenumber tokens.")
self._cleanup_housenumbers()
LOG.warning("Tokenizer house-keeping done.")
def name_analyzer(self) -> 'ICUNameAnalyzer':
""" Create a new analyzer for tokenizing names and queries
using this tokenizer. Analyzers are context managers and should
be used accordingly:
```
with tokenizer.name_analyzer() as analyzer:
analyzer.tokenize()
```
When used outside a with construct, the caller must make sure to
call the close() function before discarding the analyzer.
Analyzers are not thread-safe. You need to instantiate one per thread.
"""
assert self.loader is not None
return ICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
self.loader.make_token_analysis())
def most_frequent_words(self, conn: Connection, num: int) -> List[str]:
""" Return a list of the `num` most frequent full words
in the database.
"""
with conn.cursor() as cur:
cur.execute("""SELECT word, sum((info->>'count')::int) as count
FROM word WHERE type = 'W'
GROUP BY word
ORDER BY count DESC LIMIT %s""", (num,))
return list(s[0].split('@')[0] for s in cur)
def _install_php(self, phpdir: Optional[Path], overwrite: bool = True) -> None:
""" Install the php script for the tokenizer.
"""
if phpdir is not None:
assert self.loader is not None
php_file = self.data_dir / "tokenizer.php"
if not php_file.exists() or overwrite:
php_file.write_text(dedent(f"""\
<?php
@define('CONST_Max_Word_Frequency', 10000000);
@define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
@define('CONST_Transliteration', "{self.loader.get_search_rules()}");
require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""), encoding='utf-8')
def _save_config(self) -> None:
""" Save the configuration that needs to remain stable for the given
database as database properties.
"""
assert self.loader is not None
with connect(self.dsn) as conn:
self.loader.save_config_to_db(conn)
def _setup_db_tables(self, config: Configuration) -> None:
""" Set up the word table and fill it with pre-computed word
frequencies.
"""
with connect(self.dsn) as conn:
with conn.cursor() as cur:
cur.drop_table('word')
sqlp = SQLPreprocessor(conn, config)
sqlp.run_string(conn, """
CREATE TABLE word (
word_id INTEGER,
word_token text NOT NULL,
type text NOT NULL,
word text,
info jsonb
) {{db.tablespace.search_data}};
GRANT SELECT ON word TO "{{config.DATABASE_WEBUSER}}";
DROP SEQUENCE IF EXISTS seq_word;
CREATE SEQUENCE seq_word start 1;
GRANT SELECT ON seq_word to "{{config.DATABASE_WEBUSER}}";
""")
conn.commit()
def _create_base_indices(self, config: Configuration, table_name: str) -> None:
""" Set up the word table and fill it with pre-computed word
frequencies.
"""
with connect(self.dsn) as conn:
sqlp = SQLPreprocessor(conn, config)
sqlp.run_string(conn,
"""CREATE INDEX idx_{{table_name}}_word_token ON {{table_name}}
USING BTREE (word_token) {{db.tablespace.search_index}}""",
table_name=table_name)
for name, ctype in WORD_TYPES:
sqlp.run_string(conn,
"""CREATE INDEX idx_{{table_name}}_{{idx_name}} ON {{table_name}}
USING BTREE (word) {{db.tablespace.address_index}}
WHERE type = '{{column_type}}'
""",
table_name=table_name, idx_name=name,
column_type=ctype)
conn.commit()
def _create_lookup_indices(self, config: Configuration, table_name: str) -> None:
""" Create additional indexes used when running the API.
"""
with connect(self.dsn) as conn:
sqlp = SQLPreprocessor(conn, config)
# Index required for details lookup.
sqlp.run_string(conn, """
CREATE INDEX IF NOT EXISTS idx_{{table_name}}_word_id
ON {{table_name}} USING BTREE (word_id) {{db.tablespace.search_index}}
""",
table_name=table_name)
conn.commit()
def _move_temporary_word_table(self, old: str) -> None:
""" Rename all tables and indexes used by the tokenizer.
"""
with connect(self.dsn) as conn:
with conn.cursor() as cur:
cur.drop_table('word')
cur.execute(f"ALTER TABLE {old} RENAME TO word")
for idx in ('word_token', 'word_id'):
cur.execute(f"""ALTER INDEX idx_{old}_{idx}
RENAME TO idx_word_{idx}""")
for name, _ in WORD_TYPES:
cur.execute(f"""ALTER INDEX idx_{old}_{name}
RENAME TO idx_word_{name}""")
conn.commit()
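# Summary for orientation (illustrative, not new code): the swap at the end
# of update_statistics() amounts to
#   1. fill tmp_word with recomputed frequencies (SQL above),
#   2. build the same indices as on the live table
#      (_create_base_indices / _create_lookup_indices),
#   3. DROP TABLE word; ALTER TABLE tmp_word RENAME TO word; and rename
#      idx_tmp_word_* to idx_word_* accordingly.
# Lookups keep working on the old table until the final rename.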
class ICUNameAnalyzer(AbstractAnalyzer):
""" The ICU analyzer uses the ICU library for splitting names.
Each instance opens a connection to the database to request the
normalization.
"""
def __init__(self, dsn: str, sanitizer: PlaceSanitizer,
token_analysis: ICUTokenAnalysis) -> None:
self.conn: Optional[Connection] = connect(dsn).connection
self.conn.autocommit = True
self.sanitizer = sanitizer
self.token_analysis = token_analysis
self._cache = _TokenCache()
def close(self) -> None:
""" Free all resources used by the analyzer.
"""
if self.conn:
self.conn.close()
self.conn = None
def _search_normalized(self, name: str) -> str:
""" Return the search token transliteration of the given name.
"""
return cast(str, self.token_analysis.search.transliterate(name)).strip()
def _normalized(self, name: str) -> str:
""" Return the normalized version of the given name with all
non-relevant information removed.
"""
return cast(str, self.token_analysis.normalizer.transliterate(name)).strip()
def get_word_token_info(self, words: Sequence[str]) -> List[Tuple[str, str, int]]:
""" Return token information for the given list of words.
If a word starts with '#' it is assumed to be a full name,
otherwise it is taken to be a partial name.
The function returns a list of tuples with
(original word, word token, word id).
The function is used for testing and debugging only
and not necessarily efficient.
"""
assert self.conn is not None
full_tokens = {}
partial_tokens = {}
for word in words:
if word.startswith('#'):
full_tokens[word] = self._search_normalized(word[1:])
else:
partial_tokens[word] = self._search_normalized(word)
with self.conn.cursor() as cur:
cur.execute("""SELECT word_token, word_id
FROM word WHERE word_token = ANY(%s) and type = 'W'
""", (list(full_tokens.values()),))
full_ids = {r[0]: r[1] for r in cur}
cur.execute("""SELECT word_token, word_id
FROM word WHERE word_token = ANY(%s) and type = 'w'""",
(list(partial_tokens.values()),))
part_ids = {r[0]: r[1] for r in cur}
return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
+ [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]
def normalize_postcode(self, postcode: str) -> str:
""" Convert the postcode to a standardized form.
This function must yield exactly the same result as the SQL function
'token_normalized_postcode()'.
"""
return postcode.strip().upper()
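# Illustration (not in the original source): both sides of the contract
# must agree, e.g. normalize_postcode(' se10 8xy ') == 'SE10 8XY'.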
def update_postcodes_from_db(self) -> None:
""" Update postcode tokens in the word table from the location_postcode
table.
"""
assert self.conn is not None
analyzer = self.token_analysis.analysis.get('@postcode')
with self.conn.cursor() as cur:
# First get all postcode names currently in the word table.
cur.execute("SELECT DISTINCT word FROM word WHERE type = 'P'")
word_entries = set((entry[0] for entry in cur))
# Then compute the required postcode names from the postcode table.
needed_entries = set()
cur.execute("SELECT country_code, postcode FROM location_postcode")
for cc, postcode in cur:
info = PlaceInfo({'country_code': cc,
'class': 'place', 'type': 'postcode',
'address': {'postcode': postcode}})
address = self.sanitizer.process_names(info)[1]
for place in address:
if place.kind == 'postcode':
if analyzer is None:
postcode_name = place.name.strip().upper()
variant_base = None
else:
postcode_name = analyzer.get_canonical_id(place)
variant_base = place.get_attr("variant")
if variant_base:
needed_entries.add(f'{postcode_name}@{variant_base}')
else:
needed_entries.add(postcode_name)
break
# Now update the word table.
self._delete_unused_postcode_words(word_entries - needed_entries)
self._add_missing_postcode_words(needed_entries - word_entries)
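# Note (illustrative, not in the original): entries are stored as plain
# 'NAME' or as 'NAME@VARIANT' when the analyzer supplies a variant base,
# e.g. 'EC1A 1BB@ec1a 1bb'; _add_missing_postcode_words() below splits
# on '@' to recover both parts.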
def _delete_unused_postcode_words(self, tokens: Iterable[str]) -> None:
assert self.conn is not None
if tokens:
with self.conn.cursor() as cur:
cur.execute("DELETE FROM word WHERE type = 'P' and word = any(%s)",
(list(tokens), ))
def _add_missing_postcode_words(self, tokens: Iterable[str]) -> None:
assert self.conn is not None
if not tokens:
return
analyzer = self.token_analysis.analysis.get('@postcode')
terms = []
for postcode_name in tokens:
if '@' in postcode_name:
term, variant = postcode_name.split('@', 2)
term = self._search_normalized(term)
if analyzer is None:
variants = [term]
else:
variants = analyzer.compute_variants(variant)
if term not in variants:
variants.append(term)
else:
variants = [self._search_normalized(postcode_name)]
terms.append((postcode_name, variants))
if terms:
with self.conn.cursor() as cur:
cur.execute_values("""SELECT create_postcode_word(pc, var)
FROM (VALUES %s) AS v(pc, var)""",
terms)
def update_special_phrases(self, phrases: Iterable[Tuple[str, str, str, str]],
should_replace: bool) -> None:
""" Replace the search index for special phrases with the new phrases.
If `should_replace` is True, then the previous set of phrases will be
completely replaced. Otherwise the phrases are added to the
already existing ones.
"""
assert self.conn is not None
norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
for p in phrases))
with self.conn.cursor() as cur:
# Get the old phrases.
existing_phrases = set()
cur.execute("SELECT word, info FROM word WHERE type = 'S'")
for word, info in cur:
existing_phrases.add((word, info['class'], info['type'],
info.get('op') or '-'))
added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
if should_replace:
deleted = self._remove_special_phrases(cur, norm_phrases,
existing_phrases)
else:
deleted = 0
LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
len(norm_phrases), added, deleted)
def _add_special_phrases(self, cursor: Cursor,
new_phrases: Set[Tuple[str, str, str, str]],
existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
""" Add all phrases to the database that are not yet there.
"""
to_add = new_phrases - existing_phrases
added = 0
with CopyBuffer() as copystr:
for word, cls, typ, oper in to_add:
term = self._search_normalized(word)
if term:
copystr.add(term, 'S', word,
json.dumps({'class': cls, 'type': typ,
'op': oper if oper in ('in', 'near') else None}))
added += 1
copystr.copy_out(cursor, 'word',
columns=['word_token', 'type', 'word', 'info'])
return added
def _remove_special_phrases(self, cursor: Cursor,
new_phrases: Set[Tuple[str, str, str, str]],
existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
""" Remove all phrases from the database that are no longer in the
new phrase list.
"""
to_delete = existing_phrases - new_phrases
if to_delete:
cursor.execute_values(
""" DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
WHERE type = 'S' and word = name
and info->>'class' = in_class and info->>'type' = in_type
and ((op = '-' and info->>'op' is null) or op = info->>'op')
""", to_delete)
return len(to_delete)
def add_country_names(self, country_code: str, names: Mapping[str, str]) -> None:
""" Add default names for the given country to the search index.
"""
# Make sure any name preprocessing for country names applies.
info = PlaceInfo({'name': names, 'country_code': country_code,
'rank_address': 4, 'class': 'boundary',
'type': 'administrative'})
self._add_country_full_names(country_code,
self.sanitizer.process_names(info)[0],
internal=True)
def _add_country_full_names(self, country_code: str, names: Sequence[PlaceName],
internal: bool = False) -> None:
""" Add names for the given country from an already sanitized
name list.
"""
assert self.conn is not None
word_tokens = set()
for name in names:
norm_name = self._search_normalized(name.name)
if norm_name:
word_tokens.add(norm_name)
with self.conn.cursor() as cur:
# Get existing names
cur.execute("""SELECT word_token, coalesce(info ? 'internal', false) as is_internal
FROM word
WHERE type = 'C' and word = %s""",
(country_code, ))
# internal/external names
existing_tokens: Dict[bool, Set[str]] = {True: set(), False: set()}
for word in cur:
existing_tokens[word[1]].add(word[0])
# Delete names that no longer exist.
gone_tokens = existing_tokens[internal] - word_tokens
if internal:
gone_tokens.update(existing_tokens[False] & word_tokens)
if gone_tokens:
cur.execute("""DELETE FROM word
USING unnest(%s) as token
WHERE type = 'C' and word = %s
and word_token = token""",
(list(gone_tokens), country_code))
# Only add those names that are not yet in the list.
new_tokens = word_tokens - existing_tokens[True]
if not internal:
new_tokens -= existing_tokens[False]
if new_tokens:
if internal:
sql = """INSERT INTO word (word_token, type, word, info)
(SELECT token, 'C', %s, '{"internal": "yes"}'
FROM unnest(%s) as token)
"""
else:
sql = """INSERT INTO word (word_token, type, word)
(SELECT token, 'C', %s
FROM unnest(%s) as token)
"""
cur.execute(sql, (country_code, list(new_tokens)))
def process_place(self, place: PlaceInfo) -> Mapping[str, Any]:
""" Determine tokenizer information about the given place.
Returns a JSON-serializable structure that will be handed into
the database via the token_info field.
"""
token_info = _TokenInfo()
names, address = self.sanitizer.process_names(place)
if names:
token_info.set_names(*self._compute_name_tokens(names))
if place.is_country():
assert place.country_code is not None
self._add_country_full_names(place.country_code, names)
if address:
self._process_place_address(token_info, address)
return token_info.to_dict()
def _process_place_address(self, token_info: '_TokenInfo',
address: Sequence[PlaceName]) -> None:
for item in address:
if item.kind == 'postcode':
token_info.set_postcode(self._add_postcode(item))
elif item.kind == 'housenumber':
token_info.add_housenumber(*self._compute_housenumber_token(item))
elif item.kind == 'street':
token_info.add_street(self._retrieve_full_tokens(item.name))
elif item.kind == 'place':
if not item.suffix:
token_info.add_place(itertools.chain(*self._compute_name_tokens([item])))
elif not item.kind.startswith('_') and not item.suffix and \
item.kind not in ('country', 'full', 'inclusion'):
token_info.add_address_term(item.kind,
itertools.chain(*self._compute_name_tokens([item])))
def _compute_housenumber_token(self, hnr: PlaceName) -> Tuple[Optional[int], Optional[str]]:
""" Normalize the housenumber and return the word token and the
canonical form.
"""
assert self.conn is not None
analyzer = self.token_analysis.analysis.get('@housenumber')
result: Tuple[Optional[int], Optional[str]] = (None, None)
if analyzer is None:
# When no custom analyzer is set, simply normalize and transliterate
norm_name = self._search_normalized(hnr.name)
if norm_name:
result = self._cache.housenumbers.get(norm_name, result)
if result[0] is None:
with self.conn.cursor() as cur:
hid = cur.scalar("SELECT getorcreate_hnr_id(%s)", (norm_name, ))
result = hid, norm_name
self._cache.housenumbers[norm_name] = result
else:
# Otherwise use the analyzer to determine the canonical name.
# Per convention we use the first variant as the 'lookup name', the
# name that gets saved in the housenumber field of the place.
word_id = analyzer.get_canonical_id(hnr)
if word_id:
result = self._cache.housenumbers.get(word_id, result)
if result[0] is None:
variants = analyzer.compute_variants(word_id)
if variants:
with self.conn.cursor() as cur:
hid = cur.scalar("SELECT create_analyzed_hnr_id(%s, %s)",
(word_id, list(variants)))
result = hid, variants[0]
self._cache.housenumbers[word_id] = result
return result
def _retrieve_full_tokens(self, name: str) -> List[int]:
""" Get the full name token for the given name, if it exists.
The tokens are only retrieved for the standard analyzer.
"""
assert self.conn is not None
norm_name = self._search_normalized(name)
# return cached if possible
if norm_name in self._cache.fulls:
return self._cache.fulls[norm_name]
with self.conn.cursor() as cur:
cur.execute("SELECT word_id FROM word WHERE word_token = %s and type = 'W'",
(norm_name, ))
full = [row[0] for row in cur]
self._cache.fulls[norm_name] = full
return full
def _compute_name_tokens(self, names: Sequence[PlaceName]) -> Tuple[Set[int], Set[int]]:
""" Computes the full name and partial name tokens for the given
dictionary of names.
"""
assert self.conn is not None
full_tokens: Set[int] = set()
partial_tokens: Set[int] = set()
for name in names:
analyzer_id = name.get_attr('analyzer')
analyzer = self.token_analysis.get_analyzer(analyzer_id)
word_id = analyzer.get_canonical_id(name)
if analyzer_id is None:
token_id = word_id
else:
token_id = f'{word_id}@{analyzer_id}'
full, part = self._cache.names.get(token_id, (None, None))
if full is None:
variants = analyzer.compute_variants(word_id)
if not variants:
continue
with self.conn.cursor() as cur:
cur.execute("SELECT * FROM getorcreate_full_word(%s, %s)",
(token_id, variants))
full, part = cast(Tuple[int, List[int]], cur.fetchone())
self._cache.names[token_id] = (full, part)
assert part is not None
full_tokens.add(full)
partial_tokens.update(part)
return full_tokens, partial_tokens
def _add_postcode(self, item: PlaceName) -> Optional[str]:
""" Make sure the normalized postcode is present in the word table.
"""
assert self.conn is not None
analyzer = self.token_analysis.analysis.get('@postcode')
if analyzer is None:
postcode_name = item.name.strip().upper()
variant_base = None
else:
postcode_name = analyzer.get_canonical_id(item)
variant_base = item.get_attr("variant")
if variant_base:
postcode = f'{postcode_name}@{variant_base}'
else:
postcode = postcode_name
if postcode not in self._cache.postcodes:
term = self._search_normalized(postcode_name)
if not term:
return None
variants = {term}
if analyzer is not None and variant_base:
variants.update(analyzer.compute_variants(variant_base))
with self.conn.cursor() as cur:
cur.execute("SELECT create_postcode_word(%s, %s)",
(postcode, list(variants)))
self._cache.postcodes.add(postcode)
return postcode_name
class _TokenInfo:
""" Collect token information to be sent back to the database.
"""
def __init__(self) -> None:
self.names: Optional[str] = None
self.housenumbers: Set[str] = set()
self.housenumber_tokens: Set[int] = set()
self.street_tokens: Optional[Set[int]] = None
self.place_tokens: Set[int] = set()
self.address_tokens: Dict[str, str] = {}
self.postcode: Optional[str] = None
def _mk_array(self, tokens: Iterable[Any]) -> str:
return f"{{{','.join((str(s) for s in tokens))}}}"
def to_dict(self) -> Dict[str, Any]:
""" Return the token information in database importable format.
"""
out: Dict[str, Any] = {}
if self.names:
out['names'] = self.names
if self.housenumbers:
out['hnr'] = ';'.join(self.housenumbers)
out['hnr_tokens'] = self._mk_array(self.housenumber_tokens)
if self.street_tokens is not None:
out['street'] = self._mk_array(self.street_tokens)
if self.place_tokens:
out['place'] = self._mk_array(self.place_tokens)
if self.address_tokens:
out['addr'] = self.address_tokens
if self.postcode:
out['postcode'] = self.postcode
return out
def set_names(self, fulls: Iterable[int], partials: Iterable[int]) -> None:
""" Adds token information for the normalised names.
"""
self.names = self._mk_array(itertools.chain(fulls, partials))
def add_housenumber(self, token: Optional[int], hnr: Optional[str]) -> None:
""" Extract housenumber information from a list of normalised
housenumbers.
"""
if token:
assert hnr is not None
self.housenumbers.add(hnr)
self.housenumber_tokens.add(token)
def add_street(self, tokens: Iterable[int]) -> None:
""" Add addr:street match terms.
"""
if self.street_tokens is None:
self.street_tokens = set()
self.street_tokens.update(tokens)
def add_place(self, tokens: Iterable[int]) -> None:
""" Add addr:place search and match terms.
"""
self.place_tokens.update(tokens)
def add_address_term(self, key: str, partials: Iterable[int]) -> None:
""" Add additional address terms.
"""
array = self._mk_array(partials)
if len(array) > 2:
self.address_tokens[key] = array
def set_postcode(self, postcode: Optional[str]) -> None:
""" Set the postcode to the given one.
"""
self.postcode = postcode
class _TokenCache:
""" Cache for token information to avoid repeated database queries.
This cache is not thread-safe and needs to be instantiated per
analyzer.
"""
def __init__(self) -> None:
self.names: Dict[str, Tuple[int, List[int]]] = {}
self.partials: Dict[str, int] = {}
self.fulls: Dict[str, List[int]] = {}
self.postcodes: Set[str] = set()
self.housenumbers: Dict[str, Tuple[Optional[int], Optional[str]]] = {}
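
An end-to-end sketch (not part of the commit) of how the pieces above are
typically used; `dsn`, `project_dir` and `config` are assumed to be set up
for an already imported database:

```
tokenizer = create(dsn, project_dir / 'tokenizer')
tokenizer.init_from_project(config)
with tokenizer.name_analyzer() as analyzer:
    info = analyzer.process_place(PlaceInfo({
        'name': {'name': 'Main Street'},
        'address': {'housenumber': '12', 'postcode': '12345'},
    }))
    # info now contains 'names', 'hnr', 'hnr_tokens', 'postcode', ... as
    # assembled by _TokenInfo.to_dict() above.
    # For debugging, a leading '#' marks a full-word lookup:
    print(analyzer.get_word_token_info(['#Main Street', 'main']))
```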

View File

@@ -0,0 +1,681 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Tokenizer implementing normalisation as used before Nominatim 4.
"""
from typing import Optional, Sequence, List, Tuple, Mapping, Any, Callable, \
cast, Dict, Set, Iterable
from collections import OrderedDict
import logging
from pathlib import Path
import re
import shutil
from textwrap import dedent
from icu import Transliterator
import psycopg2
import psycopg2.extras
from nominatim_core.errors import UsageError
from nominatim_core.db.connection import connect, Connection
from nominatim_core.config import Configuration
from nominatim_core.db import properties
from nominatim_core.db import utils as db_utils
from nominatim_core.db.sql_preprocessor import SQLPreprocessor
from ..data.place_info import PlaceInfo
from .base import AbstractAnalyzer, AbstractTokenizer
DBCFG_NORMALIZATION = "tokenizer_normalization"
DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
LOG = logging.getLogger()
def create(dsn: str, data_dir: Path) -> 'LegacyTokenizer':
""" Create a new instance of the tokenizer provided by this module.
"""
return LegacyTokenizer(dsn, data_dir)
def _install_module(config_module_path: str, src_dir: Path, module_dir: Path) -> str:
""" Copies the PostgreSQL normalisation module into the project
directory if necessary. For historical reasons the module is
saved in the '/module' subdirectory and not with the other tokenizer
data.
The function detects when the installation is run from the
build directory. It doesn't touch the module in that case.
"""
# Custom module locations are simply used as is.
if config_module_path:
LOG.info("Using custom path for database module at '%s'", config_module_path)
return config_module_path
# Compatibility mode for builddir installations.
if module_dir.exists() and src_dir.samefile(module_dir):
LOG.info('Running from build directory. Leaving database module as is.')
return str(module_dir)
# In any other case install the module in the project directory.
if not module_dir.exists():
module_dir.mkdir()
destfile = module_dir / 'nominatim.so'
shutil.copy(str(src_dir / 'nominatim.so'), str(destfile))
destfile.chmod(0o755)
LOG.info('Database module installed at %s', str(destfile))
return str(module_dir)
def _check_module(module_dir: str, conn: Connection) -> None:
""" Try to use the PostgreSQL module to confirm that it is correctly
installed and accessible from PostgreSQL.
"""
with conn.cursor() as cur:
try:
cur.execute("""CREATE FUNCTION nominatim_test_import_func(text)
RETURNS text AS %s, 'transliteration'
LANGUAGE c IMMUTABLE STRICT;
DROP FUNCTION nominatim_test_import_func(text)
""", (f'{module_dir}/nominatim.so', ))
except psycopg2.DatabaseError as err:
LOG.fatal("Error accessing database module: %s", err)
raise UsageError("Database module cannot be accessed.") from err
class LegacyTokenizer(AbstractTokenizer):
""" The legacy tokenizer uses a special PostgreSQL module to normalize
names and queries. The tokenizer thus implements normalization through
calls to the database.
"""
def __init__(self, dsn: str, data_dir: Path) -> None:
self.dsn = dsn
self.data_dir = data_dir
self.normalization: Optional[str] = None
def init_new_db(self, config: Configuration, init_db: bool = True) -> None:
""" Set up a new tokenizer for the database.
This copies all necessary data into the project directory to make
sure the tokenizer remains stable even over updates.
"""
assert config.project_dir is not None
module_dir = _install_module(config.DATABASE_MODULE_PATH,
config.lib_dir.module,
config.project_dir / 'module')
self.normalization = config.TERM_NORMALIZATION
self._install_php(config, overwrite=True)
with connect(self.dsn) as conn:
_check_module(module_dir, conn)
self._save_config(conn, config)
conn.commit()
if init_db:
self.update_sql_functions(config)
self._init_db_tables(config)
def init_from_project(self, config: Configuration) -> None:
""" Initialise the tokenizer from the project directory.
"""
assert config.project_dir is not None
with connect(self.dsn) as conn:
self.normalization = properties.get_property(conn, DBCFG_NORMALIZATION)
if not (config.project_dir / 'module' / 'nominatim.so').exists():
_install_module(config.DATABASE_MODULE_PATH,
config.lib_dir.module,
config.project_dir / 'module')
self._install_php(config, overwrite=False)
def finalize_import(self, config: Configuration) -> None:
""" Do any required postprocessing to make the tokenizer data ready
for use.
"""
with connect(self.dsn) as conn:
sqlp = SQLPreprocessor(conn, config)
sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')
def update_sql_functions(self, config: Configuration) -> None:
""" Reimport the SQL functions for this tokenizer.
"""
assert config.project_dir is not None
with connect(self.dsn) as conn:
max_word_freq = properties.get_property(conn, DBCFG_MAXWORDFREQ)
modulepath = config.DATABASE_MODULE_PATH or \
str((config.project_dir / 'module').resolve())
sqlp = SQLPreprocessor(conn, config)
sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer.sql',
max_word_freq=max_word_freq,
modulepath=modulepath)
def check_database(self, _: Configuration) -> Optional[str]:
""" Check that the tokenizer is set up correctly.
"""
hint = """\
The Postgresql extension nominatim.so was not correctly loaded.
Error: {error}
Hints:
* Check the output of the CMake/make installation step
* Does nominatim.so exist?
* Does nominatim.so exist on the database server?
* Can nominatim.so be accessed by the database user?
"""
with connect(self.dsn) as conn:
with conn.cursor() as cur:
try:
out = cur.scalar("SELECT make_standard_name('a')")
except psycopg2.Error as err:
return hint.format(error=str(err))
if out != 'a':
return hint.format(error='Unexpected result for make_standard_name()')
return None
def migrate_database(self, config: Configuration) -> None:
""" Initialise the project directory of an existing database for
use with this tokenizer.
This is a special migration function for updating existing databases
to new software versions.
"""
assert config.project_dir is not None
self.normalization = config.TERM_NORMALIZATION
module_dir = _install_module(config.DATABASE_MODULE_PATH,
config.lib_dir.module,
config.project_dir / 'module')
with connect(self.dsn) as conn:
_check_module(module_dir, conn)
self._save_config(conn, config)
def update_statistics(self, config: Configuration, threads: int = 1) -> None:
""" Recompute the frequency of full words.
"""
with connect(self.dsn) as conn:
if conn.table_exists('search_name'):
with conn.cursor() as cur:
cur.drop_table("word_frequencies")
LOG.info("Computing word frequencies")
cur.execute("""CREATE TEMP TABLE word_frequencies AS
SELECT unnest(name_vector) as id, count(*)
FROM search_name GROUP BY id""")
cur.execute("CREATE INDEX ON word_frequencies(id)")
LOG.info("Update word table with recomputed frequencies")
cur.execute("""UPDATE word SET search_name_count = count
FROM word_frequencies
WHERE word_token like ' %' and word_id = id""")
cur.drop_table("word_frequencies")
conn.commit()
def update_word_tokens(self) -> None:
""" No house-keeping implemented for the legacy tokenizer.
"""
LOG.info("No tokenizer clean-up available.")
def name_analyzer(self) -> 'LegacyNameAnalyzer':
""" Create a new analyzer for tokenizing names and queries
using this tokenizer. Analyzers are context managers and should
be used accordingly:
```
with tokenizer.name_analyzer() as analyzer:
analyzer.tokenize()
```
When used outside a with construct, the caller must make sure to
call the close() function before discarding the analyzer.
Analyzers are not thread-safe. You need to instantiate one per thread.
"""
normalizer = Transliterator.createFromRules("phrase normalizer",
self.normalization)
return LegacyNameAnalyzer(self.dsn, normalizer)
def most_frequent_words(self, conn: Connection, num: int) -> List[str]:
""" Return a list of the `num` most frequent full words
in the database.
"""
with conn.cursor() as cur:
cur.execute(""" SELECT word FROM word WHERE word is not null
ORDER BY search_name_count DESC LIMIT %s""", (num,))
return list(s[0] for s in cur)
def _install_php(self, config: Configuration, overwrite: bool = True) -> None:
""" Install the php script for the tokenizer.
"""
if config.lib_dir.php is not None:
php_file = self.data_dir / "tokenizer.php"
if not php_file.exists() or overwrite:
php_file.write_text(dedent(f"""\
<?php
@define('CONST_Max_Word_Frequency', {config.MAX_WORD_FREQUENCY});
@define('CONST_Term_Normalization_Rules', "{config.TERM_NORMALIZATION}");
require_once('{config.lib_dir.php}/tokenizer/legacy_tokenizer.php');
"""), encoding='utf-8')
def _init_db_tables(self, config: Configuration) -> None:
""" Set up the word table and fill it with pre-computed word
frequencies.
"""
with connect(self.dsn) as conn:
sqlp = SQLPreprocessor(conn, config)
sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_tables.sql')
conn.commit()
LOG.warning("Precomputing word tokens")
db_utils.execute_file(self.dsn, config.lib_dir.data / 'words.sql')
def _save_config(self, conn: Connection, config: Configuration) -> None:
""" Save the configuration that needs to remain stable for the given
database as database properties.
"""
assert self.normalization is not None
properties.set_property(conn, DBCFG_NORMALIZATION, self.normalization)
properties.set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
class LegacyNameAnalyzer(AbstractAnalyzer):
""" The legacy analyzer uses the special Postgresql module for
splitting names.
Each instance opens a connection to the database to request the
normalization.
"""
def __init__(self, dsn: str, normalizer: Any):
self.conn: Optional[Connection] = connect(dsn).connection
self.conn.autocommit = True
self.normalizer = normalizer
psycopg2.extras.register_hstore(self.conn)
self._cache = _TokenCache(self.conn)
def close(self) -> None:
""" Free all resources used by the analyzer.
"""
if self.conn:
self.conn.close()
self.conn = None
def get_word_token_info(self, words: Sequence[str]) -> List[Tuple[str, str, int]]:
""" Return token information for the given list of words.
If a word starts with '#' it is assumed to be a full name,
otherwise it is taken to be a partial name.
The function returns a list of tuples with
(original word, word token, word id).
The function is used for testing and debugging only
and not necessarily efficient.
"""
assert self.conn is not None
with self.conn.cursor() as cur:
cur.execute("""SELECT t.term, word_token, word_id
FROM word, (SELECT unnest(%s::TEXT[]) as term) t
WHERE word_token = (CASE
WHEN left(t.term, 1) = '#' THEN
' ' || make_standard_name(substring(t.term from 2))
ELSE
make_standard_name(t.term)
END)
and class is null and country_code is null""",
(words, ))
return [(r[0], r[1], r[2]) for r in cur]
def normalize(self, phrase: str) -> str:
""" Normalize the given phrase, i.e. remove all properties that
are irrelevant for search.
"""
return cast(str, self.normalizer.transliterate(phrase))
def normalize_postcode(self, postcode: str) -> str:
""" Convert the postcode to a standardized form.
This function must yield exactly the same result as the SQL function
'token_normalized_postcode()'.
"""
return postcode.strip().upper()
def update_postcodes_from_db(self) -> None:
""" Update postcode tokens in the word table from the location_postcode
table.
"""
assert self.conn is not None
with self.conn.cursor() as cur:
# This query finds the rows in location_postcode and word that are
# missing from the other table.
cur.execute("""SELECT * FROM
(SELECT pc, word FROM
(SELECT distinct(postcode) as pc FROM location_postcode) p
FULL JOIN
(SELECT word FROM word
WHERE class ='place' and type = 'postcode') w
ON pc = word) x
WHERE pc is null or word is null""")
to_delete = []
to_add = []
for postcode, word in cur:
if postcode is None:
to_delete.append(word)
else:
to_add.append(postcode)
if to_delete:
cur.execute("""DELETE FROM WORD
WHERE class ='place' and type = 'postcode'
and word = any(%s)
""", (to_delete, ))
if to_add:
cur.execute("""SELECT count(create_postcode_id(pc))
FROM unnest(%s) as pc
""", (to_add, ))
def update_special_phrases(self, phrases: Iterable[Tuple[str, str, str, str]],
should_replace: bool) -> None:
""" Replace the search index for special phrases with the new phrases.
"""
assert self.conn is not None
norm_phrases = set(((self.normalize(p[0]), p[1], p[2], p[3])
for p in phrases))
with self.conn.cursor() as cur:
# Get the old phrases.
existing_phrases = set()
cur.execute("""SELECT word, class, type, operator FROM word
WHERE class != 'place'
OR (type != 'house' AND type != 'postcode')""")
for label, cls, typ, oper in cur:
existing_phrases.add((label, cls, typ, oper or '-'))
to_add = norm_phrases - existing_phrases
to_delete = existing_phrases - norm_phrases
if to_add:
cur.execute_values(
""" INSERT INTO word (word_id, word_token, word, class, type,
search_name_count, operator)
(SELECT nextval('seq_word'), ' ' || make_standard_name(name), name,
class, type, 0,
CASE WHEN op in ('in', 'near') THEN op ELSE null END
FROM (VALUES %s) as v(name, class, type, op))""",
to_add)
if to_delete and should_replace:
cur.execute_values(
""" DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
WHERE word = name and class = in_class and type = in_type
and ((op = '-' and operator is null) or op = operator)""",
to_delete)
LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
len(norm_phrases), len(to_add), len(to_delete))
def add_country_names(self, country_code: str, names: Mapping[str, str]) -> None:
""" Add names for the given country to the search index.
"""
assert self.conn is not None
with self.conn.cursor() as cur:
cur.execute(
"""INSERT INTO word (word_id, word_token, country_code)
(SELECT nextval('seq_word'), lookup_token, %s
FROM (SELECT DISTINCT ' ' || make_standard_name(n) as lookup_token
FROM unnest(%s) n) y
WHERE NOT EXISTS(SELECT * FROM word
WHERE word_token = lookup_token and country_code = %s))
""", (country_code, list(names.values()), country_code))
def process_place(self, place: PlaceInfo) -> Mapping[str, Any]:
""" Determine tokenizer information about the given place.
Returns a JSON-serialisable structure that will be handed into
the database via the token_info field.
"""
assert self.conn is not None
token_info = _TokenInfo(self._cache)
names = place.name
if names:
token_info.add_names(self.conn, names)
if place.is_country():
assert place.country_code is not None
self.add_country_names(place.country_code, names)
address = place.address
if address:
self._process_place_address(token_info, address)
return token_info.data
def _process_place_address(self, token_info: '_TokenInfo', address: Mapping[str, str]) -> None:
assert self.conn is not None
hnrs = []
addr_terms = []
for key, value in address.items():
if key == 'postcode':
# Make sure the normalized postcode is present in the word table.
if re.search(r'[:,;]', value) is None:
norm_pc = self.normalize_postcode(value)
token_info.set_postcode(norm_pc)
self._cache.add_postcode(self.conn, norm_pc)
elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
hnrs.append(value)
elif key == 'street':
token_info.add_street(self.conn, value)
elif key == 'place':
token_info.add_place(self.conn, value)
elif not key.startswith('_') \
and key not in ('country', 'full', 'inclusion'):
addr_terms.append((key, value))
if hnrs:
token_info.add_housenumbers(self.conn, hnrs)
if addr_terms:
token_info.add_address_terms(self.conn, addr_terms)
class _TokenInfo:
""" Collect token information to be sent back to the database.
"""
def __init__(self, cache: '_TokenCache') -> None:
self.cache = cache
self.data: Dict[str, Any] = {}
def add_names(self, conn: Connection, names: Mapping[str, str]) -> None:
""" Add token information for the names of the place.
"""
with conn.cursor() as cur:
# Create the token IDs for all names.
self.data['names'] = cur.scalar("SELECT make_keywords(%s)::text",
(names, ))
def add_housenumbers(self, conn: Connection, hnrs: Sequence[str]) -> None:
""" Extract housenumber information from the address.
"""
if len(hnrs) == 1:
token = self.cache.get_housenumber(hnrs[0])
if token is not None:
self.data['hnr_tokens'] = token
self.data['hnr'] = hnrs[0]
return
# split numbers if necessary
simple_list: List[str] = []
for hnr in hnrs:
simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))
if len(simple_list) > 1:
simple_list = list(set(simple_list))
with conn.cursor() as cur:
cur.execute("SELECT * FROM create_housenumbers(%s)", (simple_list, ))
result = cur.fetchone()
assert result is not None
self.data['hnr_tokens'], self.data['hnr'] = result
def set_postcode(self, postcode: str) -> None:
""" Set or replace the postcode token with the given value.
"""
self.data['postcode'] = postcode
def add_street(self, conn: Connection, street: str) -> None:
""" Add addr:street match terms.
"""
def _get_street(name: str) -> Optional[str]:
with conn.cursor() as cur:
return cast(Optional[str],
cur.scalar("SELECT word_ids_from_name(%s)::text", (name, )))
tokens = self.cache.streets.get(street, _get_street)
self.data['street'] = tokens or '{}'
def add_place(self, conn: Connection, place: str) -> None:
""" Add addr:place search and match terms.
"""
def _get_place(name: str) -> Tuple[List[int], List[int]]:
with conn.cursor() as cur:
cur.execute("""SELECT make_keywords(hstore('name' , %s))::text,
word_ids_from_name(%s)::text""",
(name, name))
return cast(Tuple[List[int], List[int]], cur.fetchone())
self.data['place_search'], self.data['place_match'] = \
self.cache.places.get(place, _get_place)
def add_address_terms(self, conn: Connection, terms: Sequence[Tuple[str, str]]) -> None:
""" Add additional address terms.
"""
def _get_address_term(name: str) -> Tuple[List[int], List[int]]:
with conn.cursor() as cur:
cur.execute("""SELECT addr_ids_from_name(%s)::text,
word_ids_from_name(%s)::text""",
(name, name))
return cast(Tuple[List[int], List[int]], cur.fetchone())
tokens = {}
for key, value in terms:
items = self.cache.address_terms.get(value, _get_address_term)
if items[0] or items[1]:
tokens[key] = items
if tokens:
self.data['addr'] = tokens
class _LRU:
""" Least recently used cache that accepts a generator function to
produce the item when there is a cache miss.
"""
def __init__(self, maxsize: int = 128):
self.data: 'OrderedDict[str, Any]' = OrderedDict()
self.maxsize = maxsize
def get(self, key: str, generator: Callable[[str], Any]) -> Any:
""" Get the item with the given key from the cache. If nothing
is found in the cache, generate the value through the
generator function and store it in the cache.
"""
value = self.data.get(key)
if value is not None:
self.data.move_to_end(key)
else:
value = generator(key)
if len(self.data) >= self.maxsize:
self.data.popitem(last=False)
self.data[key] = value
return value
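# Illustration (not in the original file) of the cache behaviour:
#   lru = _LRU(maxsize=2)
#   lru.get('a', str.upper)   # miss -> 'A'
#   lru.get('b', str.upper)   # miss -> 'B'
#   lru.get('a', str.upper)   # hit  -> 'A', refreshed as most recent
#   lru.get('c', str.upper)   # miss -> 'C', evicts 'b'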
class _TokenCache:
""" Cache for token information to avoid repeated database queries.
This cache is not thread-safe and needs to be instantiated per
analyzer.
"""
def __init__(self, conn: Connection):
# various LRU caches
self.streets = _LRU(maxsize=256)
self.places = _LRU(maxsize=128)
self.address_terms = _LRU(maxsize=1024)
# Look up housenumbers up to 100 and cache them
with conn.cursor() as cur:
cur.execute("""SELECT i, ARRAY[getorcreate_housenumber_id(i::text)]::text
FROM generate_series(1, 100) as i""")
self._cached_housenumbers: Dict[str, str] = {str(r[0]): r[1] for r in cur}
# For postcodes remember the ones that have already been added
self.postcodes: Set[str] = set()
def get_housenumber(self, number: str) -> Optional[str]:
""" Get a housenumber token from the cache.
"""
return self._cached_housenumbers.get(number)
def add_postcode(self, conn: Connection, postcode: str) -> None:
""" Make sure the given postcode is in the database.
"""
if postcode not in self.postcodes:
with conn.cursor() as cur:
cur.execute('SELECT create_postcode_id(%s)', (postcode, ))
self.postcodes.add(postcode)

View File

@@ -0,0 +1,53 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Handler for cleaning name and address tags in place information before it
is handed to the token analysis.
"""
from typing import Optional, List, Mapping, Sequence, Callable, Any, Tuple
from nominatim_core.errors import UsageError
from nominatim_core.config import Configuration
from .sanitizers.config import SanitizerConfig
from .sanitizers.base import SanitizerHandler, ProcessInfo
from ..data.place_name import PlaceName
from ..data.place_info import PlaceInfo
class PlaceSanitizer:
""" Controller class which applies sanitizer functions on the place
names and address before they are used by the token analysers.
"""
def __init__(self, rules: Optional[Sequence[Mapping[str, Any]]],
config: Configuration) -> None:
self.handlers: List[Callable[[ProcessInfo], None]] = []
if rules:
for func in rules:
if 'step' not in func:
raise UsageError("Sanitizer rule is missing the 'step' attribute.")
if not isinstance(func['step'], str):
raise UsageError("'step' attribute must be a simple string.")
module: SanitizerHandler = \
config.load_plugin_module(func['step'], 'nominatim.tokenizer.sanitizers')
self.handlers.append(module.create(SanitizerConfig(func)))
def process_names(self, place: PlaceInfo) -> Tuple[List[PlaceName], List[PlaceName]]:
""" Extract a sanitized list of names and address parts from the
given place. The function returns a tuple
(list of names, list of address names)
"""
obj = ProcessInfo(place)
for func in self.handlers:
func(obj)
return obj.names, obj.address
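
A configuration sketch (not part of the commit) showing how sanitizer rules
drive PlaceSanitizer; the step names below are assumptions for illustration,
since each must resolve to a module under nominatim.tokenizer.sanitizers:

```
rules = [
    {'step': 'clean-housenumbers', 'delimiters': ';,'},       # assumed name
    {'step': 'clean-postcodes', 'convert-to-address': True},  # assumed name
]
sanitizer = PlaceSanitizer(rules, config)          # config: a Configuration
names, address = sanitizer.process_names(place)    # place: a PlaceInfo
```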

View File

@@ -0,0 +1,64 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Common data types and protocols for sanitizers.
"""
from typing import Optional, List, Mapping, Callable
from nominatim_core.typing import Protocol, Final
from ...data.place_info import PlaceInfo
from ...data.place_name import PlaceName
from .config import SanitizerConfig
class ProcessInfo:
""" Container class for information handed into to handler functions.
The 'names' and 'address' members are mutable. A handler must change
them by either modifying the lists place or replacing the old content
with a new list.
"""
def __init__(self, place: PlaceInfo):
self.place: Final = place
self.names = self._convert_name_dict(place.name)
self.address = self._convert_name_dict(place.address)
@staticmethod
def _convert_name_dict(names: Optional[Mapping[str, str]]) -> List[PlaceName]:
""" Convert a dictionary of names into a list of PlaceNames.
The dictionary key is split into the primary part of the key
and the suffix (the part after an optional colon).
"""
out = []
if names:
for key, value in names.items():
parts = key.split(':', 1)
out.append(PlaceName(value.strip(),
parts[0].strip(),
parts[1].strip() if len(parts) > 1 else None))
return out
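# Illustration (not in the original file) of the key splitting:
#   ProcessInfo(PlaceInfo({'name': {'name': 'Paris', 'name:en': 'Paris'}}))
# produces names == [PlaceName('Paris', 'name', None),
#                    PlaceName('Paris', 'name', 'en')].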
class SanitizerHandler(Protocol):
""" Protocol for sanitizer modules.
"""
def create(self, config: SanitizerConfig) -> Callable[[ProcessInfo], None]:
"""
Create a function for sanitizing a place.
Arguments:
config: A dictionary with the additional configuration options
specified in the tokenizer configuration
Return:
The result must be a callable that takes a place description
and transforms name and address as required.
"""

View File

@@ -0,0 +1,80 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Sanitizer that preprocesses address tags for house numbers. The sanitizer
makes it possible to
* define which tags are to be considered house numbers (see 'filter-kind')
* split house number lists into individual numbers (see 'delimiters')
Arguments:
delimiters: Define the set of characters to be used for
splitting a list of house numbers into parts. (default: ',;')
filter-kind: Define the address tags that are considered to be a
house number. Either takes a single string or a list of strings,
where each string is a regular expression. An address item
is considered a house number if the 'kind' fully matches any
of the given regular expressions. (default: 'housenumber')
convert-to-name: Define house numbers that should be treated as a name
instead of a house number. Either takes a single string
or a list of strings, where each string is a regular
expression that must match the full house number value.
"""
from typing import Callable, Iterator, List
from ...data.place_name import PlaceName
from .base import ProcessInfo
from .config import SanitizerConfig
class _HousenumberSanitizer:
def __init__(self, config: SanitizerConfig) -> None:
self.filter_kind = config.get_filter('filter-kind', ['housenumber'])
self.split_regexp = config.get_delimiter()
self.filter_name = config.get_filter('convert-to-name', 'FAIL_ALL')
def __call__(self, obj: ProcessInfo) -> None:
if not obj.address:
return
new_address: List[PlaceName] = []
for item in obj.address:
if self.filter_kind(item.kind):
if self.filter_name(item.name):
obj.names.append(item.clone(kind='housenumber'))
else:
new_address.extend(item.clone(kind='housenumber', name=n)
for n in self.sanitize(item.name))
else:
# Don't touch other address items.
new_address.append(item)
obj.address = new_address
def sanitize(self, value: str) -> Iterator[str]:
""" Extract housenumbers in a regularized format from an OSM value.
The function works as a generator that yields all valid housenumbers
that can be created from the value.
"""
for hnr in self.split_regexp.split(value):
if hnr:
yield from self._regularize(hnr)
def _regularize(self, hnr: str) -> Iterator[str]:
yield hnr
def create(config: SanitizerConfig) -> Callable[[ProcessInfo], None]:
""" Create a housenumber processing function.
"""
return _HousenumberSanitizer(config)
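The splitting behaviour driven by the 'delimiters' option can be sketched standalone; the regular expression below mirrors what `SanitizerConfig.get_delimiter()` compiles for the default ',;' set (the sample house number value is invented):

```python
import re

delimiters = ',;'   # the default 'delimiters' setting
split_re = re.compile('\\s*[{}]+\\s*'.format(''.join('\\' + d for d in delimiters)))

print(split_re.split('45; 46a ,47'))   # ['45', '46a', '47']
```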

View File

@@ -0,0 +1,80 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Sanitizer that filters postcodes by their officially allowed pattern.
Arguments:
convert-to-address: If set to 'yes' (the default), then postcodes that do
not conform with their country-specific pattern are
converted to an address component. That means that
the postcode does not take part when computing the
postcode centroids of a country but is still searchable.
When set to 'no', non-conforming postcodes are not
searchable either.
default-pattern: Pattern to use when none is available for the
country in question. Warning: this pattern is not used for
objects that have no country assigned. These are always
assumed to have no postcode.
"""
from typing import Callable, Optional, Tuple
from ...data.postcode_format import PostcodeFormatter
from .base import ProcessInfo
from .config import SanitizerConfig
class _PostcodeSanitizer:
def __init__(self, config: SanitizerConfig) -> None:
self.convert_to_address = config.get_bool('convert-to-address', True)
self.matcher = PostcodeFormatter()
default_pattern = config.get('default-pattern')
if default_pattern is not None and isinstance(default_pattern, str):
self.matcher.set_default_pattern(default_pattern)
def __call__(self, obj: ProcessInfo) -> None:
if not obj.address:
return
postcodes = ((i, o) for i, o in enumerate(obj.address) if o.kind == 'postcode')
for pos, postcode in postcodes:
formatted = self.scan(postcode.name, obj.place.country_code)
if formatted is None:
if self.convert_to_address:
postcode.kind = 'unofficial_postcode'
else:
obj.address.pop(pos)
else:
postcode.name = formatted[0]
postcode.set_attr('variant', formatted[1])
def scan(self, postcode: str, country: Optional[str]) -> Optional[Tuple[str, str]]:
""" Check the postcode for correct formatting and return the
normalized version. Returns None if the postcode does not
correspond to the official format of the given country.
"""
match = self.matcher.match(country, postcode)
if match is None:
return None
assert country is not None
return self.matcher.normalize(country, match),\
' '.join(filter(lambda p: p is not None, match.groups()))
def create(config: SanitizerConfig) -> Callable[[ProcessInfo], None]:
""" Create a function that filters postcodes by their officially allowed pattern.
"""
return _PostcodeSanitizer(config)

View File

@@ -0,0 +1,46 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Sanitizer that preprocesses tags from the TIGER import.
It makes the following changes:
* remove state reference from tiger:county
"""
from typing import Callable
import re
from .base import ProcessInfo
from .config import SanitizerConfig
COUNTY_MATCH = re.compile('(.*), [A-Z][A-Z]')
def _clean_tiger_county(obj: ProcessInfo) -> None:
""" Remove the state reference from tiger:county tags.
This transforms a name like 'Hamilton, AL' into 'Hamilton'.
If no state reference is detected at the end, the name is left as is.
"""
if not obj.address:
return
for item in obj.address:
if item.kind == 'tiger' and item.suffix == 'county':
m = COUNTY_MATCH.fullmatch(item.name)
if m:
item.name = m[1]
# Switch kind and suffix, the split left them reversed.
item.kind = 'county'
item.suffix = 'tiger'
return
def create(_: SanitizerConfig) -> Callable[[ProcessInfo], None]:
""" Create a function that preprocesses tags from the TIGER import.
"""
return _clean_tiger_county
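A quick check of the matching behaviour (sample names invented): the state suffix is stripped only when the full name matches the pattern.

```python
import re

COUNTY_MATCH = re.compile('(.*), [A-Z][A-Z]')

for name in ('Hamilton, AL', 'Hamilton'):
    m = COUNTY_MATCH.fullmatch(name)
    print(name, '->', m[1] if m else name)
```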

View File

@@ -0,0 +1,151 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Configuration for Sanitizers.
"""
from typing import Sequence, Union, Optional, Pattern, Callable, Any, TYPE_CHECKING
from collections import UserDict
import re
from nominatim_core.errors import UsageError
# working around missing generics in Python < 3.8
# See https://github.com/python/typing/issues/60#issuecomment-869757075
if TYPE_CHECKING:
_BaseUserDict = UserDict[str, Any]
else:
_BaseUserDict = UserDict
class SanitizerConfig(_BaseUserDict):
""" The `SanitizerConfig` class is a read-only dictionary
with configuration options for the sanitizer.
In addition to the usual dictionary functions, the class provides
accessors to standard sanitizer options that are used by many of the
sanitizers.
"""
def get_string_list(self, param: str, default: Sequence[str] = tuple()) -> Sequence[str]:
""" Extract a configuration parameter as a string list.
Arguments:
param: Name of the configuration parameter.
default: Takes a tuple or list of strings which will
be returned if the parameter is missing in the
sanitizer configuration.
Note that if this default parameter is not
provided then an empty list is returned.
Returns:
If the parameter value is a simple string, it is returned as a
one-item list. If the parameter value does not exist, the given
default is returned. If the parameter value is a list, it is
checked to contain only strings before being returned.
"""
values = self.data.get(param, None)
if values is None:
return list(default)
if isinstance(values, str):
return [values] if values else []
if not isinstance(values, (list, tuple)):
raise UsageError(f"Parameter '{param}' must be string or list of strings.")
if any(not isinstance(value, str) for value in values):
raise UsageError(f"Parameter '{param}' must be string or list of strings.")
return values
def get_bool(self, param: str, default: Optional[bool] = None) -> bool:
""" Extract a configuration parameter as a boolean.
Arguments:
param: Name of the configuration parameter. The parameter must
contain one of the YAML boolean values or a
UsageError will be raised.
default: Value to return, when the parameter is missing.
When set to `None`, the parameter must be defined.
Returns:
Boolean value of the given parameter.
"""
value = self.data.get(param, default)
if not isinstance(value, bool):
raise UsageError(f"Parameter '{param}' must be a boolean value ('yes' or 'no').")
return value
def get_delimiter(self, default: str = ',;') -> Pattern[str]:
""" Return the 'delimiters' parameter in the configuration as a
compiled regular expression that can be used to split strings on
these delimiters.
Arguments:
default: Delimiters to be used when 'delimiters' parameter
is not explicitly configured.
Returns:
A regular expression pattern which can be used to
split a string. The regular expression makes sure that the
resulting names are stripped and that repeated delimiters
are ignored. It may still create empty fields on occasion. The
code needs to filter those.
"""
delimiter_set = set(self.data.get('delimiters', default))
if not delimiter_set:
raise UsageError("Empty 'delimiter' parameter not allowed for sanitizer.")
return re.compile('\\s*[{}]+\\s*'.format(''.join('\\' + d for d in delimiter_set)))
def get_filter(self, param: str, default: Union[str, Sequence[str]] = 'PASS_ALL'
) -> Callable[[str], bool]:
""" Returns a filter function for the given parameter of the sanitizer
configuration.
The value provided for the parameter in sanitizer configuration
should be a string or list of strings, where each string is a regular
expression. These regular expressions will later be used by the
filter function to filter strings.
Arguments:
param: The parameter for which the filter function
will be created.
default: Defines the behaviour of the filter function if the
parameter is missing in the sanitizer configuration.
Takes a string ('PASS_ALL' or 'FAIL_ALL') or a list of strings.
Any other string value or an empty list is not allowed
and will raise a ValueError. If the value is 'PASS_ALL', the
filter function lets all strings pass; if it is 'FAIL_ALL',
it lets no strings pass.
If the value is a list of strings, each string is treated
as a regular expression that the filter function matches
against.
By default, the filter function lets all strings pass.
Returns:
A filter function that takes a target string as the argument and
returns True if it fully matches any of the regular expressions,
and False otherwise.
"""
filters = self.get_string_list(param) or default
if filters == 'PASS_ALL':
return lambda _: True
if filters == 'FAIL_ALL':
return lambda _: False
if filters and isinstance(filters, (list, tuple)):
regexes = [re.compile(regex) for regex in filters]
return lambda target: any(regex.fullmatch(target) for regex in regexes)
raise ValueError("Default parameter must be a non-empty list or a string value \
('PASS_ALL' or 'FAIL_ALL').")
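The filter semantics of `get_filter()` reduce to a full-match test against any of the configured regular expressions; a minimal standalone sketch with an assumed filter list:

```python
import re

# assumed configuration: filter-kind: [housenumber, conscriptionnumber]
regexes = [re.compile(r) for r in ('housenumber', 'conscriptionnumber')]

def matches(target: str) -> bool:
    return any(rx.fullmatch(target) for rx in regexes)

print(matches('housenumber'))        # True
print(matches('housenumber:old'))    # False - a full match is required
```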

View File

@@ -0,0 +1,128 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Sanitizer which prevents certain tags from getting into the search index.
It removes tags that match all of the properties given below.
Arguments:
type: Define which type of tags should be considered for removal.
There are two types of tags: 'name' and 'address'.
Takes the string 'name' or 'address'. (default: 'name')
filter-kind: Define which 'kind' of tags should be removed.
Takes a string or list of strings where each
string is a regular expression. A tag is considered
to be a candidate for removal if its 'kind' property
fully matches any of the given regular expressions.
Note that by default all 'kind' of tags are considered.
suffix: Define the 'suffix' property of the tags which should be
removed. Takes a string or list of strings where each
string is a regular expression. A tag is considered to be a
candidate for removal if its 'suffix' property fully
matches any of the given regular expressions. Note that by
default tags with any suffix value are considered including
those which don't have a suffix at all.
name: Define the 'name' property corresponding to the 'kind' property
of the tag. Takes a string or list of strings where each string
is a regular expression. A tag is considered to be a candidate
for removal if its name fully matches any of the given regular
expressions. Note that by default tags with any 'name' are
considered.
country_code: Define the country code of places whose tags should be
considered for removal. Takes a string or list of strings
where each string is a two-letter lower-case country code.
Note that by default tags of places with any country code
are considered including those which don't have a country
code at all.
rank_address: Define the address rank of places whose tags should be
considered for removal. Takes a string or list of strings
where each string is a number or a range of numbers of the
form <from>-<to>.
Note that default is '0-30', which means that tags of all
places are considered.
See https://nominatim.org/release-docs/latest/customize/Ranking/#address-rank
to learn more about address rank.
"""
from typing import Callable, List, Tuple, Sequence
from ...data.place_name import PlaceName
from .base import ProcessInfo
from .config import SanitizerConfig
class _TagSanitizer:
def __init__(self, config: SanitizerConfig) -> None:
self.type = config.get('type', 'name')
self.filter_kind = config.get_filter('filter-kind')
self.country_codes = config.get_string_list('country_code', [])
self.filter_suffix = config.get_filter('suffix')
self.filter_name = config.get_filter('name')
self.allowed_ranks = self._set_allowed_ranks(
config.get_string_list("rank_address", ["0-30"])
)
self.has_country_code = config.get('country_code', None) is not None
def __call__(self, obj: ProcessInfo) -> None:
tags = obj.names if self.type == 'name' else obj.address
if not tags \
or not self.allowed_ranks[obj.place.rank_address] \
or self.has_country_code \
and obj.place.country_code not in self.country_codes:
return
filtered_tags: List[PlaceName] = []
for tag in tags:
if not self.filter_kind(tag.kind) \
or not self.filter_suffix(tag.suffix or '') \
or not self.filter_name(tag.name):
filtered_tags.append(tag)
if self.type == 'name':
obj.names = filtered_tags
else:
obj.address = filtered_tags
def _set_allowed_ranks(self, ranks: Sequence[str]) -> Tuple[bool, ...]:
""" Returns a tuple of 31 boolean values corresponding to the
address ranks 0-30. Value at index 'i' is True if rank 'i'
is present in the ranks or lies in the range of any of the
ranks provided in the sanitizer configuration, otherwise
the value is False.
"""
allowed_ranks = [False] * 31
for rank in ranks:
intvl = [int(x) for x in rank.split('-')]
start, end = intvl[0], intvl[0] if len(intvl) == 1 else intvl[1]
for i in range(start, end + 1):
allowed_ranks[i] = True
return tuple(allowed_ranks)
def create(config: SanitizerConfig) -> Callable[[ProcessInfo], None]:
""" Create a function to process removal of certain tags.
"""
return _TagSanitizer(config)

View File

@@ -0,0 +1,39 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Sanitizer that splits lists of names into their components.
Arguments:
delimiters: Define the set of characters to be used for
splitting the list. (default: ',;')
"""
from typing import Callable
from .base import ProcessInfo
from .config import SanitizerConfig
def create(config: SanitizerConfig) -> Callable[[ProcessInfo], None]:
""" Create a name processing function that splits name values with
multiple values into their components.
"""
regexp = config.get_delimiter()
def _process(obj: ProcessInfo) -> None:
if not obj.names:
return
new_names = []
for name in obj.names:
split_names = regexp.split(name.name)
if len(split_names) == 1:
new_names.append(name)
else:
new_names.extend(name.clone(name=n) for n in split_names if n)
obj.names = new_names
return _process

View File

@@ -0,0 +1,34 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
This sanitizer creates additional name variants for names that have
addendums in brackets (e.g. "Halle (Saale)"). The additional variant contains
only the main name part with the bracket part removed.
"""
from typing import Callable
from .base import ProcessInfo
from .config import SanitizerConfig
def create(_: SanitizerConfig) -> Callable[[ProcessInfo], None]:
""" Create a name processing function that creates additional name variants
for bracket addendums.
"""
def _process(obj: ProcessInfo) -> None:
""" Add variants for names that have a bracket extension.
"""
if obj.names:
new_names = []
for name in (n for n in obj.names if '(' in n.name):
new_name = name.name.split('(')[0].strip()
if new_name:
new_names.append(name.clone(name=new_name))
obj.names.extend(new_names)
return _process
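The variant rule in miniature (sample names invented): a name with a bracket addendum gains a variant without it, while a name that is nothing but a bracket remark gains none.

```python
for full in ('Halle (Saale)', '(St. Pancras)'):
    main = full.split('(')[0].strip()
    print(repr(full), '->', repr(main) if main else 'no variant added')
```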

View File

@@ -0,0 +1,99 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
This sanitizer sets the `analyzer` property depending on the
language of the tag. The language is taken from the suffix of the name.
If a name already has an analyzer tagged, then this is kept.
Arguments:
filter-kind: Restrict the sanitizer to names of the given tags.
The parameter expects a list of
regular expressions which are matched against 'kind'.
Note that a match against the full string is expected.
whitelist: Restrict the set of languages that should be tagged.
Expects a list of acceptable suffixes. When unset,
all 2- and 3-letter lower-case codes are accepted.
use-defaults: Configure what happens when the name has no suffix.
When set to 'all', a variant is created for
each of the default languages in the country
the feature is in. When set to 'mono', a variant is
only created, when exactly one language is spoken
in the country. The default is to do nothing with
the default languages of a country.
mode: Define how the variants are created and may be 'replace' or
'append'. When set to 'append' the original name (without
any analyzer tagged) is retained. (default: replace)
"""
from typing import Callable, Dict, Optional, List
from ...data import country_info
from .base import ProcessInfo
from .config import SanitizerConfig
class _AnalyzerByLanguage:
""" Processor for tagging the language of names in a place.
"""
def __init__(self, config: SanitizerConfig) -> None:
self.filter_kind = config.get_filter('filter-kind')
self.replace = config.get('mode', 'replace') != 'append'
self.whitelist = config.get('whitelist')
self._compute_default_languages(config.get('use-defaults', 'no'))
def _compute_default_languages(self, use_defaults: str) -> None:
self.deflangs: Dict[Optional[str], List[str]] = {}
if use_defaults in ('mono', 'all'):
for ccode, clangs in country_info.iterate('languages'):
if len(clangs) == 1 or use_defaults == 'all':
if self.whitelist:
self.deflangs[ccode] = [l for l in clangs if l in self.whitelist]
else:
self.deflangs[ccode] = clangs
def _suffix_matches(self, suffix: str) -> bool:
if self.whitelist is None:
return len(suffix) in (2, 3) and suffix.islower()
return suffix in self.whitelist
def __call__(self, obj: ProcessInfo) -> None:
if not obj.names:
return
more_names = []
for name in (n for n in obj.names
if not n.has_attr('analyzer') and self.filter_kind(n.kind)):
if name.suffix:
langs = [name.suffix] if self._suffix_matches(name.suffix) else None
else:
langs = self.deflangs.get(obj.place.country_code)
if langs:
if self.replace:
name.set_attr('analyzer', langs[0])
else:
more_names.append(name.clone(attr={'analyzer': langs[0]}))
more_names.extend(name.clone(attr={'analyzer': l}) for l in langs[1:])
obj.names.extend(more_names)
def create(config: SanitizerConfig) -> Callable[[ProcessInfo], None]:
""" Create a function that sets the analyzer property depending on the
language of the tag.
"""
return _AnalyzerByLanguage(config)

View File

@@ -0,0 +1,117 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
This sanitizer maps OSM data to Japanese block addresses.
It combines block_number and housenumber into a single housenumber,
and quarter and neighbourhood into a single place.
"""
from typing import Callable
from typing import List, Optional
from .base import ProcessInfo
from .config import SanitizerConfig
from ...data.place_name import PlaceName
def create(_: SanitizerConfig) -> Callable[[ProcessInfo], None]:
"""Set up the sanitizer
"""
return tag_japanese
def recombine_housenumber(
new_address: List[PlaceName],
tmp_housenumber: Optional[str],
tmp_blocknumber: Optional[str]
) -> List[PlaceName]:
""" Recombine the tag of housenumber by using housenumber and blocknumber
"""
if tmp_blocknumber and tmp_housenumber:
new_address.append(
PlaceName(
kind='housenumber',
name=f'{tmp_blocknumber}-{tmp_housenumber}',
suffix=''
)
)
elif tmp_blocknumber:
new_address.append(
PlaceName(
kind='housenumber',
name=tmp_blocknumber,
suffix=''
)
)
elif tmp_housenumber:
new_address.append(
PlaceName(
kind='housenumber',
name=tmp_housenumber,
suffix=''
)
)
return new_address
def recombine_place(
new_address: List[PlaceName],
tmp_neighbourhood: Optional[str],
tmp_quarter: Optional[str]
) -> List[PlaceName]:
""" Recombine the tag of place by using neighbourhood and quarter
"""
if tmp_neighbourhood and tmp_quarter:
new_address.append(
PlaceName(
kind='place',
name=f'{tmp_quarter}{tmp_neighbourhood}',
suffix=''
)
)
elif tmp_neighbourhood:
new_address.append(
PlaceName(
kind='place',
name=tmp_neighbourhood,
suffix=''
)
)
elif tmp_quarter:
new_address.append(
PlaceName(
kind='place',
name=tmp_quarter,
suffix=''
)
)
return new_address
def tag_japanese(obj: ProcessInfo) -> None:
"""Recombine kind of address
"""
if obj.place.country_code != 'jp':
return
tmp_housenumber = None
tmp_blocknumber = None
tmp_neighbourhood = None
tmp_quarter = None
new_address = []
for item in obj.address:
if item.kind == 'housenumber':
tmp_housenumber = item.name
elif item.kind == 'block_number':
tmp_blocknumber = item.name
elif item.kind == 'neighbourhood':
tmp_neighbourhood = item.name
elif item.kind == 'quarter':
tmp_quarter = item.name
else:
new_address.append(item)
new_address = recombine_housenumber(new_address, tmp_housenumber, tmp_blocknumber)
new_address = recombine_place(new_address, tmp_neighbourhood, tmp_quarter)
obj.address = [item for item in new_address if item.name is not None]
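The recombination rules in miniature, with hypothetical example values:

```python
# block_number and housenumber are joined with '-', quarter and
# neighbourhood are concatenated (values below are made up).
tmp_blocknumber, tmp_housenumber = '3', '2'
tmp_quarter, tmp_neighbourhood = '神南', '一丁目'

print('housenumber:', f'{tmp_blocknumber}-{tmp_housenumber}')   # '3-2'
print('place:', f'{tmp_quarter}{tmp_neighbourhood}')            # '神南一丁目'
```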

View File

@@ -0,0 +1,96 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Common data types and protocols for analysers.
"""
from typing import Mapping, List, Any
from nominatim_core.typing import Protocol
from ...data.place_name import PlaceName
class Analyzer(Protocol):
""" The `create()` function of an analysis module needs to return an
object that implements the following functions.
"""
def get_canonical_id(self, name: PlaceName) -> str:
""" Return the canonical form of the given name. The canonical ID must
be unique (the same ID must always yield the same variants) and
must be a form from which the variants can be derived.
Arguments:
name: Extended place name description as prepared by
the sanitizers.
Returns:
ID string with a canonical form of the name. The string may
be empty, when the analyzer cannot analyze the name at all,
for example because the character set in use does not match.
"""
def compute_variants(self, canonical_id: str) -> List[str]:
""" Compute the transliterated spelling variants for the given
canonical ID.
Arguments:
canonical_id: ID string previously computed with
`get_canonical_id()`.
Returns:
A list of possible spelling variants. All strings must have
been transformed with the global normalizer and
transliterator ICU rules. Otherwise they cannot be matched
against the input by the query frontend.
The list may be empty, when there are no useful
spelling variants. This may happen when an analyzer normally
only outputs additional variants to the canonical spelling
and no such variants exist for the given name.
"""
class AnalysisModule(Protocol):
""" The setup of the token analysis is split into two parts:
configuration and analyser factory. A token analysis module must
therefore implement the two functions described here.
"""
def configure(self, rules: Mapping[str, Any],
normalizer: Any, transliterator: Any) -> Any:
""" Prepare the configuration of the analysis module.
This function should prepare all data that can be shared
between instances of this analyser.
Arguments:
rules: A dictionary with the additional configuration options
as specified in the tokenizer configuration.
normalizer: an ICU Transliterator with the compiled
global normalization rules.
transliterator: an ICU Transliterator with the compiled
global transliteration rules.
Returns:
A data object with configuration data. This will be handed
as is into the `create()` function and may be
used freely by the analysis module as needed.
"""
def create(self, normalizer: Any, transliterator: Any, config: Any) -> Analyzer:
""" Create a new instance of the analyser.
A separate instance of the analyser is created for each thread
when used in multi-threading context.
Arguments:
normalizer: an ICU Transliterator with the compiled normalization
rules.
transliterator: an ICU Transliterator with the compiled
transliteration rules.
config: The object that was returned by the call to configure().
Returns:
A new analyzer instance. This must be an object that implements
the Analyzer protocol.
"""

View File

@@ -0,0 +1,139 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Parser for configuration for variants.
"""
from typing import Any, Iterator, Tuple, List, Optional, Set, NamedTuple
from collections import defaultdict
import itertools
import re
from nominatim_core.config import flatten_config_list
from nominatim_core.errors import UsageError
class ICUVariant(NamedTuple):
""" A single replacement rule for variant creation.
"""
source: str
replacement: str
def get_variant_config(in_rules: Any,
normalizer: Any) -> Tuple[List[Tuple[str, List[str]]], str]:
""" Convert the variant definition from the configuration into
replacement sets.
Returns a tuple containing the replacement set and the list of characters
used in the replacements.
"""
immediate = defaultdict(list)
chars: Set[str] = set()
if in_rules:
vset: Set[ICUVariant] = set()
rules = flatten_config_list(in_rules, 'variants')
vmaker = _VariantMaker(normalizer)
for section in rules:
for rule in (section.get('words') or []):
vset.update(vmaker.compute(rule))
# Intermediate reorder by source. Also compute required character set.
for variant in vset:
if variant.source[-1] == ' ' and variant.replacement[-1] == ' ':
replstr = variant.replacement[:-1]
else:
replstr = variant.replacement
immediate[variant.source].append(replstr)
chars.update(variant.source)
return list(immediate.items()), ''.join(chars)
class _VariantMaker:
""" Generator for all necessary ICUVariants from a single variant rule.
All text in rules is normalized to make sure the variants match later.
"""
def __init__(self, normalizer: Any) -> None:
self.norm = normalizer
def compute(self, rule: Any) -> Iterator[ICUVariant]:
""" Generator for all ICUVariant tuples from a single variant rule.
"""
parts = re.split(r'(\|)?([=-])>', rule)
if len(parts) != 4:
raise UsageError(f"Syntax error in variant rule: {rule}")
decompose = parts[1] is None
src_terms = [self._parse_variant_word(t) for t in parts[0].split(',')]
repl_terms = (self.norm.transliterate(t).strip() for t in parts[3].split(','))
# If the source should be kept, add a 1:1 replacement
if parts[2] == '-':
for src in src_terms:
if src:
for froms, tos in _create_variants(*src, src[0], decompose):
yield ICUVariant(froms, tos)
for src, repl in itertools.product(src_terms, repl_terms):
if src and repl:
for froms, tos in _create_variants(*src, repl, decompose):
yield ICUVariant(froms, tos)
def _parse_variant_word(self, name: str) -> Optional[Tuple[str, str, str]]:
name = name.strip()
match = re.fullmatch(r'([~^]?)([^~$^]*)([~$]?)', name)
if match is None or (match.group(1) == '~' and match.group(3) == '~'):
raise UsageError(f"Invalid variant word descriptor '{name}'")
norm_name = self.norm.transliterate(match.group(2)).strip()
if not norm_name:
return None
return norm_name, match.group(1), match.group(3)
_FLAG_MATCH = {'^': '^ ',
'$': ' ^',
'': ' '}
def _create_variants(src: str, preflag: str, postflag: str,
repl: str, decompose: bool) -> Iterator[Tuple[str, str]]:
if preflag == '~':
postfix = _FLAG_MATCH[postflag]
# suffix decomposition
src = src + postfix
repl = repl + postfix
yield src, repl
yield ' ' + src, ' ' + repl
if decompose:
yield src, ' ' + repl
yield ' ' + src, repl
elif postflag == '~':
# prefix decomposition
prefix = _FLAG_MATCH[preflag]
src = prefix + src
repl = prefix + repl
yield src, repl
yield src + ' ', repl + ' '
if decompose:
yield src, repl + ' '
yield src + ' ', repl
else:
prefix = _FLAG_MATCH[preflag]
postfix = _FLAG_MATCH[postflag]
yield prefix + src + postfix, prefix + repl + postfix
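How `_VariantMaker.compute()` tokenizes a rule follows directly from the split pattern; the two rules below are illustrative only:

```python
import re

for rule in ('~street => st', 'hauptstrasse |-> hauptstr'):
    parts = re.split(r'(\|)?([=-])>', rule)
    print(rule,
          '| decompose:', parts[1] is None,     # no '|' before the arrow
          '| keep original:', parts[2] == '-')  # '->' keeps a 1:1 variant
```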

View File

@@ -0,0 +1,150 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Generic processor for names that creates abbreviation variants.
"""
from typing import Mapping, Dict, Any, Iterable, Iterator, Optional, List, cast
import itertools
import datrie
from nominatim_core.errors import UsageError
from ...data.place_name import PlaceName
from .config_variants import get_variant_config
from .generic_mutation import MutationVariantGenerator
### Configuration section
def configure(rules: Mapping[str, Any], normalizer: Any, _: Any) -> Dict[str, Any]:
""" Extract and preprocess the configuration for this module.
"""
config: Dict[str, Any] = {}
config['replacements'], config['chars'] = get_variant_config(rules.get('variants'),
normalizer)
config['variant_only'] = rules.get('mode', '') == 'variant-only'
# parse mutation rules
config['mutations'] = []
for rule in rules.get('mutations', []):
if 'pattern' not in rule:
raise UsageError("Missing field 'pattern' in mutation configuration.")
if not isinstance(rule['pattern'], str):
raise UsageError("Field 'pattern' in mutation configuration "
"must be a simple text field.")
if 'replacements' not in rule:
raise UsageError("Missing field 'replacements' in mutation configuration.")
if not isinstance(rule['replacements'], list):
raise UsageError("Field 'replacements' in mutation configuration "
"must be a list of texts.")
config['mutations'].append((rule['pattern'], rule['replacements']))
return config
### Analysis section
def create(normalizer: Any, transliterator: Any,
config: Mapping[str, Any]) -> 'GenericTokenAnalysis':
""" Create a new token analysis instance for this module.
"""
return GenericTokenAnalysis(normalizer, transliterator, config)
class GenericTokenAnalysis:
""" Collects the different transformation rules for normalisation of names
and provides the functions to apply the transformations.
"""
def __init__(self, norm: Any, to_ascii: Any, config: Mapping[str, Any]) -> None:
self.norm = norm
self.to_ascii = to_ascii
self.variant_only = config['variant_only']
# Set up datrie
if config['replacements']:
self.replacements = datrie.Trie(config['chars'])
for src, repllist in config['replacements']:
self.replacements[src] = repllist
else:
self.replacements = None
# set up mutation rules
self.mutations = [MutationVariantGenerator(*cfg) for cfg in config['mutations']]
def get_canonical_id(self, name: PlaceName) -> str:
""" Return the normalized form of the name. This is the standard form
from which possible variants for the name can be derived.
"""
return cast(str, self.norm.transliterate(name.name)).strip()
def compute_variants(self, norm_name: str) -> List[str]:
""" Compute the spelling variants for the given normalized name
and transliterate the result.
"""
variants = self._generate_word_variants(norm_name)
for mutation in self.mutations:
variants = mutation.generate(variants)
return [name for name in self._transliterate_unique_list(norm_name, variants) if name]
def _transliterate_unique_list(self, norm_name: str,
iterable: Iterable[str]) -> Iterator[Optional[str]]:
seen = set()
if self.variant_only:
seen.add(norm_name)
for variant in map(str.strip, iterable):
if variant not in seen:
seen.add(variant)
yield self.to_ascii.transliterate(variant).strip()
def _generate_word_variants(self, norm_name: str) -> Iterable[str]:
baseform = '^ ' + norm_name + ' ^'
baselen = len(baseform)
partials = ['']
startpos = 0
if self.replacements is not None:
pos = 0
force_space = False
while pos < baselen:
full, repl = self.replacements.longest_prefix_item(baseform[pos:],
(None, None))
if full is not None:
done = baseform[startpos:pos]
partials = [v + done + r
for v, r in itertools.product(partials, repl)
if not force_space or r.startswith(' ')]
if len(partials) > 128:
# If too many variants are produced, they are unlikely
# to be helpful. Only use the original term.
startpos = 0
break
startpos = pos + len(full)
if full[-1] == ' ':
startpos -= 1
force_space = True
pos = startpos
else:
pos += 1
force_space = False
# No variants detected? Fast return.
if startpos == 0:
return (norm_name, )
if startpos < baselen:
return (part[1:] + baseform[startpos:-1] for part in partials)
return (part[1:-1] for part in partials)

View File

@@ -0,0 +1,57 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Creator for mutation variants for the generic token analysis.
"""
from typing import Sequence, Iterable, Iterator, Tuple
import itertools
import logging
import re
from nominatim_core.errors import UsageError
LOG = logging.getLogger()
def _zigzag(outer: Iterable[str], inner: Iterable[str]) -> Iterator[str]:
return itertools.chain.from_iterable(itertools.zip_longest(outer, inner, fillvalue=''))
class MutationVariantGenerator:
""" Generates name variants by applying a regular expression to the name
and replacing it with one or more variants. When the regular expression
matches more than once, each occurrence is replaced with all replacement
patterns.
"""
def __init__(self, pattern: str, replacements: Sequence[str]):
self.pattern = re.compile(pattern)
self.replacements = replacements
if self.pattern.groups > 0:
LOG.fatal("The mutation pattern %s contains a capturing group. "
"This is not allowed.", pattern)
raise UsageError("Bad mutation pattern in configuration.")
def generate(self, names: Iterable[str]) -> Iterator[str]:
""" Generator function for the name variants. 'names' is an iterable
over a set of names for which the variants are to be generated.
"""
for name in names:
parts = self.pattern.split(name)
if len(parts) == 1:
yield name
else:
for seps in self._fillers(len(parts)):
yield ''.join(_zigzag(parts, seps))
def _fillers(self, num_parts: int) -> Iterator[Tuple[str, ...]]:
""" Returns a generator for strings to join the given number of string
parts in all possible combinations.
"""
return itertools.product(self.replacements, repeat=num_parts - 1)
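A self-contained sketch of the expansion performed by `MutationVariantGenerator.generate()`, assuming a hypothetical mutation rule that replaces 'ä' with either itself or 'ae':

```python
import itertools
import re

pattern = re.compile('ä')        # hypothetical mutation pattern
replacements = ('ä', 'ae')

def generate(name):
    parts = pattern.split(name)
    for seps in itertools.product(replacements, repeat=len(parts) - 1):
        # interleave the fixed parts with one replacement per match
        yield ''.join(itertools.chain.from_iterable(
            itertools.zip_longest(parts, seps, fillvalue='')))

print(list(generate('bärgasse')))   # ['bärgasse', 'baergasse']
```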

View File

@@ -0,0 +1,70 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Specialized processor for housenumbers. Analyses common housenumber patterns
and creates variants for them.
"""
from typing import Any, List, cast
import re
from ...data.place_name import PlaceName
from .generic_mutation import MutationVariantGenerator
RE_NON_DIGIT = re.compile('[^0-9]')
RE_DIGIT_ALPHA = re.compile(r'(\d)\s*([^\d\s␣])')
RE_ALPHA_DIGIT = re.compile(r'([^\s\d␣])\s*(\d)')
RE_NAMED_PART = re.compile(r'[a-z]{4}')
### Configuration section
def configure(*_: Any) -> None:
""" All behaviour is currently hard-coded.
"""
return None
### Analysis section
def create(normalizer: Any, transliterator: Any, config: None) -> 'HousenumberTokenAnalysis': # pylint: disable=W0613
""" Create a new token analysis instance for this module.
"""
return HousenumberTokenAnalysis(normalizer, transliterator)
class HousenumberTokenAnalysis:
""" Detects common housenumber patterns and normalizes them.
"""
def __init__(self, norm: Any, trans: Any) -> None:
self.norm = norm
self.trans = trans
self.mutator = MutationVariantGenerator('␣', (' ', ''))
def get_canonical_id(self, name: PlaceName) -> str:
""" Return the normalized form of the housenumber.
"""
# shortcut for number-only numbers, which make up 90% of the data.
if RE_NON_DIGIT.search(name.name) is None:
return name.name
norm = cast(str, self.trans.transliterate(self.norm.transliterate(name.name)))
# If there is a significant non-numeric part, use as is.
if RE_NAMED_PART.search(norm) is None:
# Otherwise add optional spaces between digits and letters.
(norm_opt, cnt1) = RE_DIGIT_ALPHA.subn(r'\1␣\2', norm)
(norm_opt, cnt2) = RE_ALPHA_DIGIT.subn(r'\1␣\2', norm_opt)
# Avoid creating too many variants per number.
if cnt1 + cnt2 <= 4:
return norm_opt
return norm
def compute_variants(self, norm_name: str) -> List[str]:
""" Compute the spelling variants for the given normalized housenumber.
Generates variants for optional spaces (marked with '␣').
"""
return list(self.mutator.generate([norm_name]))
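Putting the pieces together for an assumed input '34 a': the canonical form marks the optional space with '␣', which the mutator later expands both ways.

```python
import re

RE_DIGIT_ALPHA = re.compile(r'(\d)\s*([^\d\s␣])')

canonical, _ = RE_DIGIT_ALPHA.subn(r'\1␣\2', '34 a')
print(canonical)                                        # '34␣a'
print([canonical.replace('␣', s) for s in (' ', '')])   # ['34 a', '34a']
```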

View File

@@ -0,0 +1,65 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2022 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Specialized processor for postcodes. Supports a 'lookup' variant of the
token, which produces variants with optional spaces.
"""
from typing import Any, List
from ...data.place_name import PlaceName
from .generic_mutation import MutationVariantGenerator
### Configuration section
def configure(*_: Any) -> None:
""" All behaviour is currently hard-coded.
"""
return None
### Analysis section
def create(normalizer: Any, transliterator: Any, config: None) -> 'PostcodeTokenAnalysis': # pylint: disable=W0613
""" Create a new token analysis instance for this module.
"""
return PostcodeTokenAnalysis(normalizer, transliterator)
class PostcodeTokenAnalysis:
""" Special normalization and variant generation for postcodes.
This analyser must not be used with anything but postcodes as
it follows some special rules: the canonical ID is the form that
is used for the output. `compute_variants` then needs to ensure that
the generated variants once more follow the standard normalization
and transliteration, so that postcodes are correctly recognised by
the search algorithm.
"""
def __init__(self, norm: Any, trans: Any) -> None:
self.norm = norm
self.trans = trans
self.mutator = MutationVariantGenerator(' ', (' ', ''))
def get_canonical_id(self, name: PlaceName) -> str:
""" Return the standard form of the postcode.
"""
return name.name.strip().upper()
def compute_variants(self, norm_name: str) -> List[str]:
""" Compute the spelling variants for the given normalized postcode.
Takes the canonical form of the postcode, normalizes it using the
standard rules and then creates variants of the result where
all spaces are optional.
"""
# Postcodes follow their own transliteration rules.
# Make sure at this point, that the terms are normalized in a way
# that they are searchable with the standard transliteration rules.
return [self.trans.transliterate(term) for term in
self.mutator.generate([self.norm.transliterate(norm_name)]) if term]
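The net effect for an assumed input 'ec1r 3hf', with simple upper-casing standing in for the real ICU normalization and transliteration rules:

```python
canonical = 'ec1r 3hf'.strip().upper()
print(canonical)                                        # 'EC1R 3HF'
# every space in the canonical form becomes optional:
print([canonical.replace(' ', s) for s in (' ', '')])   # ['EC1R 3HF', 'EC1R3HF']
```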

View File

@@ -0,0 +1,10 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Module with functions for importing, updating Nominatim databases
as well as general maintenance helpers.
"""

View File

@@ -0,0 +1,68 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Function to add additional OSM data from a file or the API into the database.
"""
from typing import Any, MutableMapping
from pathlib import Path
import logging
import urllib.parse
from nominatim_core.db.connection import connect
from nominatim_core.utils.url_utils import get_url
from .exec_utils import run_osm2pgsql
LOG = logging.getLogger()
def _run_osm2pgsql(dsn: str, options: MutableMapping[str, Any]) -> None:
run_osm2pgsql(options)
# Handle deletions
with connect(dsn) as conn:
with conn.cursor() as cur:
cur.execute('SELECT flush_deleted_places()')
conn.commit()
def add_data_from_file(dsn: str, fname: str, options: MutableMapping[str, Any]) -> int:
""" Adds data from a OSM file to the database. The file may be a normal
OSM file or a diff file in all formats supported by libosmium.
"""
options['import_file'] = Path(fname)
options['append'] = True
_run_osm2pgsql(dsn, options)
# No status update. We don't know where the file came from.
return 0
def add_osm_object(dsn: str, osm_type: str, osm_id: int, use_main_api: bool,
options: MutableMapping[str, Any]) -> int:
""" Add or update a single OSM object from the latest version of the
API.
"""
if use_main_api:
base_url = f'https://www.openstreetmap.org/api/0.6/{osm_type}/{osm_id}'
if osm_type in ('way', 'relation'):
base_url += '/full'
else:
# use Overpass API
if osm_type == 'node':
data = f'node({osm_id});out meta;'
elif osm_type == 'way':
data = f'(way({osm_id});>;);out meta;'
else:
data = f'(rel(id:{osm_id});>;);out meta;'
base_url = 'https://overpass-api.de/api/interpreter?' \
+ urllib.parse.urlencode({'data': data})
options['append'] = True
options['import_data'] = get_url(base_url).encode('utf-8')
_run_osm2pgsql(dsn, options)
return 0
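For reference, the Overpass URL built for a way looks like this (the way ID is made up):

```python
import urllib.parse

data = '(way(12345);>;);out meta;'
print('https://overpass-api.de/api/interpreter?'
      + urllib.parse.urlencode({'data': data}))
```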

View File

@@ -0,0 +1,106 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Functions for database analysis and maintenance.
"""
from typing import Optional, Tuple, Any, cast
import logging
from psycopg2.extras import Json, register_hstore
from psycopg2 import DataError
from nominatim_core.typing import DictCursorResult
from nominatim_core.config import Configuration
from nominatim_core.db.connection import connect, Cursor
from nominatim_core.errors import UsageError
from ..tokenizer import factory as tokenizer_factory
from ..data.place_info import PlaceInfo
LOG = logging.getLogger()
def _get_place_info(cursor: Cursor, osm_id: Optional[str],
place_id: Optional[int]) -> DictCursorResult:
sql = """SELECT place_id, extra.*
FROM placex, LATERAL placex_indexing_prepare(placex) as extra
"""
values: Tuple[Any, ...]
if osm_id:
osm_type = osm_id[0].upper()
if osm_type not in 'NWR' or not osm_id[1:].isdigit():
LOG.fatal('OSM ID must be of form <N|W|R><id>. Got: %s', osm_id)
raise UsageError("OSM ID parameter badly formatted")
sql += ' WHERE placex.osm_type = %s AND placex.osm_id = %s'
values = (osm_type, int(osm_id[1:]))
elif place_id is not None:
sql += ' WHERE placex.place_id = %s'
values = (place_id, )
else:
LOG.fatal("No OSM object given to index.")
raise UsageError("OSM object not found")
cursor.execute(sql + ' LIMIT 1', values)
if cursor.rowcount < 1:
LOG.fatal("OSM object %s not found in database.", osm_id)
raise UsageError("OSM object not found")
return cast(DictCursorResult, cursor.fetchone())
def analyse_indexing(config: Configuration, osm_id: Optional[str] = None,
place_id: Optional[int] = None) -> None:
""" Analyse indexing of a single Nominatim object.
"""
with connect(config.get_libpq_dsn()) as conn:
register_hstore(conn)
with conn.cursor() as cur:
place = _get_place_info(cur, osm_id, place_id)
cur.execute("update placex set indexed_status = 2 where place_id = %s",
(place['place_id'], ))
cur.execute("""SET auto_explain.log_min_duration = '0';
SET auto_explain.log_analyze = 'true';
SET auto_explain.log_nested_statements = 'true';
LOAD 'auto_explain';
SET client_min_messages = LOG;
SET log_min_messages = FATAL""")
tokenizer = tokenizer_factory.get_tokenizer_for_db(config)
with tokenizer.name_analyzer() as analyzer:
cur.execute("""UPDATE placex
SET indexed_status = 0, address = %s, token_info = %s,
name = %s, linked_place_id = %s
WHERE place_id = %s""",
(place['address'],
Json(analyzer.process_place(PlaceInfo(place))),
place['name'], place['linked_place_id'], place['place_id']))
# we do not want to keep the results
conn.rollback()
for msg in conn.notices:
print(msg)
def clean_deleted_relations(config: Configuration, age: str) -> None:
""" Clean deleted relations older than a given age
"""
with connect(config.get_libpq_dsn()) as conn:
with conn.cursor() as cur:
try:
cur.execute("""SELECT place_force_delete(p.place_id)
FROM import_polygon_delete d, placex p
WHERE p.osm_type = d.osm_type AND p.osm_id = d.osm_id
AND age(p.indexed_date) > %s::interval""",
(age, ))
except DataError as exc:
raise UsageError('Invalid PostgreSQL time interval format') from exc
conn.commit()

View File

@@ -0,0 +1,350 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Collection of functions that check if the database is complete and functional.
"""
from typing import Callable, Optional, Any, Union, Tuple, Mapping, List
from enum import Enum
from textwrap import dedent
from nominatim_core.config import Configuration
from nominatim_core.db.connection import connect, Connection
from nominatim_core.db import properties
from nominatim_core.errors import UsageError
from ..tokenizer import factory as tokenizer_factory
from . import freeze
from ..version import NOMINATIM_VERSION, parse_version
CHECKLIST = []
class CheckState(Enum):
""" Possible states of a check. FATAL stops check execution entirely.
"""
OK = 0
FAIL = 1
FATAL = 2
NOT_APPLICABLE = 3
WARN = 4
CheckResult = Union[CheckState, Tuple[CheckState, Mapping[str, Any]]]
CheckFunc = Callable[[Connection, Configuration], CheckResult]
def _check(hint: Optional[str] = None) -> Callable[[CheckFunc], CheckFunc]:
""" Decorator for checks. It adds the function to the list of
checks to execute and adds the code for printing progress messages.
"""
def decorator(func: CheckFunc) -> CheckFunc:
title = (func.__doc__ or '').split('\n', 1)[0].strip()
def run_check(conn: Connection, config: Configuration) -> CheckState:
print(title, end=' ... ')
ret = func(conn, config)
if isinstance(ret, tuple):
ret, params = ret
else:
params = {}
if ret == CheckState.OK:
print('\033[92mOK\033[0m')
elif ret == CheckState.WARN:
print('\033[93mWARNING\033[0m')
if hint:
print('')
print(dedent(hint.format(**params)))
elif ret == CheckState.NOT_APPLICABLE:
print('not applicable')
else:
print('\x1B[31mFailed\033[0m')
if hint:
print(dedent(hint.format(**params)))
return ret
CHECKLIST.append(run_check)
return run_check
return decorator
class _BadConnection:
def __init__(self, msg: str) -> None:
self.msg = msg
def close(self) -> None:
""" Dummy function to provide the implementation.
"""
def check_database(config: Configuration) -> int:
""" Run a number of checks on the database and return the status.
"""
try:
conn = connect(config.get_libpq_dsn()).connection
except UsageError as err:
conn = _BadConnection(str(err)) # type: ignore[assignment]
overall_result = 0
for check in CHECKLIST:
ret = check(conn, config)
if ret == CheckState.FATAL:
conn.close()
return 1
if ret in (CheckState.FATAL, CheckState.FAIL):
overall_result = 1
conn.close()
return overall_result
def _get_indexes(conn: Connection) -> List[str]:
indexes = ['idx_place_addressline_address_place_id',
'idx_placex_rank_search',
'idx_placex_rank_address',
'idx_placex_parent_place_id',
'idx_placex_geometry_reverse_lookuppolygon',
'idx_placex_geometry_placenode',
'idx_osmline_parent_place_id',
'idx_osmline_parent_osm_id',
'idx_postcode_id',
'idx_postcode_postcode'
]
if conn.table_exists('search_name'):
indexes.extend(('idx_search_name_nameaddress_vector',
'idx_search_name_name_vector',
'idx_search_name_centroid'))
if conn.server_version_tuple() >= (11, 0, 0):
indexes.extend(('idx_placex_housenumber',
'idx_osmline_parent_osm_id_with_hnr'))
if conn.table_exists('place'):
indexes.extend(('idx_location_area_country_place_id',
'idx_place_osm_unique',
'idx_placex_rank_address_sector',
'idx_placex_rank_boundaries_sector'))
return indexes
# CHECK FUNCTIONS
#
# Functions are executed in the order they appear here.
@_check(hint="""\
{error}
Hints:
* Is the database server started?
* Check the NOMINATIM_DATABASE_DSN variable in your local .env
* Try connecting to the database with the same settings
Project directory: {config.project_dir}
Current setting of NOMINATIM_DATABASE_DSN: {config.DATABASE_DSN}
""")
def check_connection(conn: Any, config: Configuration) -> CheckResult:
""" Checking database connection
"""
if isinstance(conn, _BadConnection):
return CheckState.FATAL, dict(error=conn.msg, config=config)
return CheckState.OK
@_check(hint="""\
Database version ({db_version}) doesn't match Nominatim version ({nom_version})
Hints:
* Are you connecting to the correct database?
{instruction}
Check the Migration chapter of the Administration Guide.
Project directory: {config.project_dir}
Current setting of NOMINATIM_DATABASE_DSN: {config.DATABASE_DSN}
""")
def check_database_version(conn: Connection, config: Configuration) -> CheckResult:
""" Checking database_version matches Nominatim software version
"""
if conn.table_exists('nominatim_properties'):
db_version_str = properties.get_property(conn, 'database_version')
else:
db_version_str = None
if db_version_str is not None:
db_version = parse_version(db_version_str)
if db_version == NOMINATIM_VERSION:
return CheckState.OK
instruction = (
'Run migrations: nominatim admin --migrate'
if db_version < NOMINATIM_VERSION
else 'You need to upgrade the Nominatim software.'
)
else:
instruction = ''
return CheckState.FATAL, dict(db_version=db_version_str,
nom_version=NOMINATIM_VERSION,
instruction=instruction,
config=config)
@_check(hint="""\
placex table not found
Hints:
* Are you connecting to the correct database?
* Did the import process finish without errors?
Project directory: {config.project_dir}
Current setting of NOMINATIM_DATABASE_DSN: {config.DATABASE_DSN}
""")
def check_placex_table(conn: Connection, config: Configuration) -> CheckResult:
""" Checking for placex table
"""
if conn.table_exists('placex'):
return CheckState.OK
return CheckState.FATAL, dict(config=config)
@_check(hint="""placex table has no data. Did the import finish successfully?""")
def check_placex_size(conn: Connection, _: Configuration) -> CheckResult:
""" Checking for placex content
"""
with conn.cursor() as cur:
cnt = cur.scalar('SELECT count(*) FROM (SELECT * FROM placex LIMIT 100) x')
return CheckState.OK if cnt > 0 else CheckState.FATAL
@_check(hint="""{msg}""")
def check_tokenizer(_: Connection, config: Configuration) -> CheckResult:
""" Checking that tokenizer works
"""
try:
tokenizer = tokenizer_factory.get_tokenizer_for_db(config)
except UsageError:
return CheckState.FAIL, dict(msg="""\
Cannot load tokenizer. Did the import finish successfully?""")
result = tokenizer.check_database(config)
if result is None:
return CheckState.OK
return CheckState.FAIL, dict(msg=result)
@_check(hint="""\
Wikipedia/Wikidata importance tables missing.
Quality of search results may be degraded. Reverse geocoding is unaffected.
See https://nominatim.org/release-docs/latest/admin/Import/#wikipediawikidata-rankings
""")
def check_existance_wikipedia(conn: Connection, _: Configuration) -> CheckResult:
""" Checking for wikipedia/wikidata data
"""
if not conn.table_exists('search_name') or not conn.table_exists('place'):
return CheckState.NOT_APPLICABLE
with conn.cursor() as cur:
if conn.table_exists('wikimedia_importance'):
cnt = cur.scalar('SELECT count(*) FROM wikimedia_importance')
else:
cnt = cur.scalar('SELECT count(*) FROM wikipedia_article')
return CheckState.WARN if cnt == 0 else CheckState.OK
@_check(hint="""\
The indexing didn't finish. {count} entries are not yet indexed.
To index the remaining entries, run: {index_cmd}
""")
def check_indexing(conn: Connection, _: Configuration) -> CheckResult:
""" Checking indexing status
"""
with conn.cursor() as cur:
cnt = cur.scalar('SELECT count(*) FROM placex WHERE indexed_status > 0')
if cnt == 0:
return CheckState.OK
if freeze.is_frozen(conn):
index_cmd="""\
Database is marked frozen, it cannot be updated.
Low counts of unindexed places are fine."""
return CheckState.WARN, dict(count=cnt, index_cmd=index_cmd)
if conn.index_exists('idx_placex_rank_search'):
# Likely just an interrupted update.
index_cmd = 'nominatim index'
else:
# Looks like the import process got interrupted.
index_cmd = 'nominatim import --continue indexing'
return CheckState.FAIL, dict(count=cnt, index_cmd=index_cmd)
@_check(hint="""\
The following indexes are missing:
{indexes}
Rerun the index creation with: nominatim import --continue db-postprocess
""")
def check_database_indexes(conn: Connection, _: Configuration) -> CheckResult:
""" Checking that database indexes are complete
"""
missing = []
for index in _get_indexes(conn):
if not conn.index_exists(index):
missing.append(index)
if missing:
return CheckState.FAIL, dict(indexes='\n '.join(missing))
return CheckState.OK
@_check(hint="""\
At least one index is invalid. That can happen, e.g. when index creation was
disrupted and later restarted. You should delete the affected indices
and recreate them.
Invalid indexes:
{indexes}
""")
def check_database_index_valid(conn: Connection, _: Configuration) -> CheckResult:
""" Checking that all database indexes are valid
"""
with conn.cursor() as cur:
cur.execute(""" SELECT relname FROM pg_class, pg_index
WHERE pg_index.indisvalid = false
AND pg_index.indexrelid = pg_class.oid""")
broken = [c[0] for c in cur]
if broken:
return CheckState.FAIL, dict(indexes='\n '.join(broken))
return CheckState.OK
@_check(hint="""\
{error}
Run TIGER import again: nominatim add-data --tiger-data <DIR>
""")
def check_tiger_table(conn: Connection, config: Configuration) -> CheckResult:
""" Checking TIGER external data table.
"""
if not config.get_bool('USE_US_TIGER_DATA'):
return CheckState.NOT_APPLICABLE
if not conn.table_exists('location_property_tiger'):
return CheckState.FAIL, dict(error='TIGER data table not found.')
with conn.cursor() as cur:
if cur.scalar('SELECT count(*) FROM location_property_tiger') == 0:
return CheckState.FAIL, dict(error='TIGER data table is empty.')
return CheckState.OK
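The `_check` decorator defined at the top of this file follows a plain registry pattern; a stripped-down, runnable sketch (all names hypothetical):

```python
CHECKS = []

def check(func):
    def run() -> bool:
        print((func.__doc__ or '').strip(), end=' ... ')
        ok = func()
        print('OK' if ok else 'Failed')
        return ok
    CHECKS.append(run)   # registration happens at decoration time
    return run

@check
def demo_check() -> bool:
    """ Checking nothing in particular """
    return True

for c in CHECKS:
    c()
```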

View File

@@ -0,0 +1,166 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Collection of host system information including software versions, memory,
storage, and database configuration.
"""
import os
import subprocess
import sys
from pathlib import Path
from typing import List, Optional, Tuple, Union
import psutil
from psycopg2.extensions import make_dsn, parse_dsn
from nominatim_core.config import Configuration
from nominatim_core.db.connection import connect
from ..version import NOMINATIM_VERSION
def convert_version(ver_tup: Tuple[int, int]) -> str:
"""converts tuple version (ver_tup) to a string representation"""
return ".".join(map(str, ver_tup))
def friendly_memory_string(mem: float) -> str:
"""Create a user friendly string for the amount of memory specified as mem"""
mem_magnitude = ("bytes", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")
mag = 0
# determine order of magnitude
while mem > 1000:
mem /= 1000
mag += 1
return f"{mem:.1f} {mem_magnitude[mag]}"
def run_command(cmd: Union[str, List[str]]) -> str:
"""Runs a command using the shell and returns the output from stdout"""
try:
if sys.version_info < (3, 7):
cap_out = subprocess.run(cmd, stdout=subprocess.PIPE, check=False)
else:
cap_out = subprocess.run(cmd, capture_output=True, check=False)
return cap_out.stdout.decode("utf-8")
except FileNotFoundError:
# non-Linux system should end up here
return f"Unknown (unable to find the '{cmd}' command)"
def os_name_info() -> str:
"""Obtain Operating System Name (and possibly the version)"""
os_info = None
# man page os-release(5) details meaning of the fields
if Path("/etc/os-release").is_file():
os_info = from_file_find_line_portion(
"/etc/os-release", "PRETTY_NAME", "=")
# alternative location
elif Path("/usr/lib/os-release").is_file():
os_info = from_file_find_line_portion(
"/usr/lib/os-release", "PRETTY_NAME", "="
)
# fallback on Python's os name
if os_info is None or os_info == "":
os_info = os.name
# if the above is insufficient, take a look at neofetch's approach to OS detection
return os_info
# Note: Intended to be used on informational files like /proc
def from_file_find_line_portion(
filename: str, start: str, sep: str, fieldnum: int = 1
) -> Optional[str]:
"""open filename, finds the line starting with the 'start' string.
Splits the line using separator and returns a "fieldnum" from the split."""
with open(filename, encoding='utf8') as file:
result = ""
for line in file:
if line.startswith(start):
result = line.split(sep)[fieldnum].strip()
return result
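# Illustrative example (file contents assumed): given the line
#   PRETTY_NAME="Ubuntu 22.04 LTS"
# in /etc/os-release, from_file_find_line_portion('/etc/os-release',
# 'PRETTY_NAME', '=') returns '"Ubuntu 22.04 LTS"'; surrounding whitespace
# is stripped but the quotes from the file are kept.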
def get_postgresql_config(version: int) -> str:
"""Retrieve postgres configuration file"""
try:
with open(f"/etc/postgresql/{version}/main/postgresql.conf", encoding='utf8') as file:
db_config = file.read()
file.close()
return db_config
except IOError:
return f"**Could not read '/etc/postgresql/{version}/main/postgresql.conf'**"
def report_system_information(config: Configuration) -> None:
"""Generate a report about the host system including software versions, memory,
storage, and database configuration."""
with connect(make_dsn(config.get_libpq_dsn(), dbname='postgres')) as conn:
postgresql_ver: str = convert_version(conn.server_version_tuple())
with conn.cursor() as cur:
num = cur.scalar("SELECT count(*) FROM pg_catalog.pg_database WHERE datname=%s",
(parse_dsn(config.get_libpq_dsn())['dbname'], ))
nominatim_db_exists = num == 1 if isinstance(num, int) else False
if nominatim_db_exists:
with connect(config.get_libpq_dsn()) as conn:
postgis_ver: str = convert_version(conn.postgis_version_tuple())
else:
postgis_ver = "Unable to connect to database"
postgresql_config: str = get_postgresql_config(int(float(postgresql_ver)))
# Note: psutil.disk_partitions() is similar to run_command("lsblk")
# Note: run_command("systemd-detect-virt") only works on Linux, on other OSes
# should give a message: "Unknown (unable to find the 'systemd-detect-virt' command)"
# Generates the Markdown report.
report = f"""
**Instructions**
Use this information in your issue report at https://github.com/osm-search/Nominatim/issues
Redirect the output to a file:
$ ./collect_os_info.py > report.md
**Software Environment:**
- Python version: {sys.version}
- Nominatim version: {NOMINATIM_VERSION!s}
- PostgreSQL version: {postgresql_ver}
- PostGIS version: {postgis_ver}
- OS: {os_name_info()}
**Hardware Configuration:**
- RAM: {friendly_memory_string(psutil.virtual_memory().total)}
- number of CPUs: {psutil.cpu_count(logical=False)}
- bare metal/AWS/other cloud service (per systemd-detect-virt(1)): {run_command("systemd-detect-virt")}
- type and size of disks:
**`df -h` - report file system disk space usage:**
```
{run_command(["df", "-h"])}
```
**lsblk - list block devices:**
```
{run_command("lsblk")}
```
**Postgresql Configuration:**
```
{postgresql_config}
```
**Notes**
Please add a note about anything above that is incorrect.
"""
print(report)

View File

@@ -0,0 +1,265 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Exporting a Nominatim database to SQLite.
"""
from typing import Set, Any
import datetime as dt
import logging
from pathlib import Path
import sqlalchemy as sa
import nominatim_api as napi
from nominatim_api.search.query_analyzer_factory import make_query_analyzer
from nominatim_core.typing import SaSelect, SaRow
from nominatim_core.db.sqlalchemy_types import Geometry, IntArray
LOG = logging.getLogger()
async def convert(project_dir: Path, outfile: Path, options: Set[str]) -> None:
""" Export an existing database to sqlite. The resulting database
will be usable against the Python frontend of Nominatim.
"""
api = napi.NominatimAPIAsync(project_dir)
try:
outapi = napi.NominatimAPIAsync(project_dir,
{'NOMINATIM_DATABASE_DSN': f"sqlite:dbname={outfile}",
'NOMINATIM_DATABASE_RW': '1'})
try:
async with api.begin() as src, outapi.begin() as dest:
writer = SqliteWriter(src, dest, options)
await writer.write()
finally:
await outapi.close()
finally:
await api.close()
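# A minimal usage sketch (paths and option set are assumptions, not part
# of this module):
#
#   import asyncio
#   from pathlib import Path
#
#   asyncio.run(convert(Path('.'), Path('nominatim.sqlite'), {'search'}))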
class SqliteWriter:
""" Worker class which creates a new SQLite database.
"""
def __init__(self, src: napi.SearchConnection,
dest: napi.SearchConnection, options: Set[str]) -> None:
self.src = src
self.dest = dest
self.options = options
async def write(self) -> None:
""" Create the database structure and copy the data from
the source database to the destination.
"""
LOG.warning('Setting up spatialite')
await self.dest.execute(sa.select(sa.func.InitSpatialMetaData(True, 'WGS84')))
await self.create_tables()
await self.copy_data()
if 'search' in self.options:
await self.create_word_table()
await self.create_indexes()
async def create_tables(self) -> None:
""" Set up the database tables.
"""
LOG.warning('Setting up tables')
if 'search' not in self.options:
self.dest.t.meta.remove(self.dest.t.search_name)
else:
await self.create_class_tables()
await self.dest.connection.run_sync(self.dest.t.meta.create_all)
# Convert all Geometry columns to Spatialite geometries
for table in self.dest.t.meta.sorted_tables:
for col in table.c:
if isinstance(col.type, Geometry):
await self.dest.execute(sa.select(
sa.func.RecoverGeometryColumn(table.name, col.name, 4326,
col.type.subtype.upper(), 'XY')))
async def create_class_tables(self) -> None:
""" Set up the table that serve class/type-specific geometries.
"""
sql = sa.text("""SELECT tablename FROM pg_tables
WHERE tablename LIKE 'place_classtype_%'""")
for res in await self.src.execute(sql):
for db in (self.src, self.dest):
sa.Table(res[0], db.t.meta,
sa.Column('place_id', sa.BigInteger),
sa.Column('centroid', Geometry))
async def create_word_table(self) -> None:
""" Create the word table.
This table needs the property information to determine the
            correct format. Therefore it needs to be created after all other
data has been copied.
"""
await make_query_analyzer(self.src)
await make_query_analyzer(self.dest)
src = self.src.t.meta.tables['word']
dest = self.dest.t.meta.tables['word']
await self.dest.connection.run_sync(dest.create)
LOG.warning("Copying word table")
async_result = await self.src.connection.stream(sa.select(src))
async for partition in async_result.partitions(10000):
data = [{k: getattr(r, k) for k in r._fields} for r in partition]
await self.dest.execute(dest.insert(), data)
await self.dest.connection.run_sync(sa.Index('idx_word_woken', dest.c.word_token).create)
async def copy_data(self) -> None:
""" Copy data for all registered tables.
"""
def _getfield(row: SaRow, key: str) -> Any:
value = getattr(row, key)
if isinstance(value, dt.datetime):
if value.tzinfo is not None:
value = value.astimezone(dt.timezone.utc)
return value
for table in self.dest.t.meta.sorted_tables:
LOG.warning("Copying '%s'", table.name)
async_result = await self.src.connection.stream(self.select_from(table.name))
async for partition in async_result.partitions(10000):
data = [{('class_' if k == 'class' else k): _getfield(r, k)
for k in r._fields}
for r in partition]
await self.dest.execute(table.insert(), data)
# Set up a minimal copy of pg_tables used to look up the class tables later.
pg_tables = sa.Table('pg_tables', self.dest.t.meta,
sa.Column('schemaname', sa.Text, default='public'),
sa.Column('tablename', sa.Text))
await self.dest.connection.run_sync(pg_tables.create)
data = [{'tablename': t} for t in self.dest.t.meta.tables]
await self.dest.execute(pg_tables.insert().values(data))
async def create_indexes(self) -> None:
""" Add indexes necessary for the frontend.
"""
# reverse place node lookup needs an extra table to simulate a
# partial index with adaptive buffering.
await self.dest.execute(sa.text(
""" CREATE TABLE placex_place_node_areas AS
SELECT place_id, ST_Expand(geometry,
14.0 * exp(-0.2 * rank_search) - 0.03) as geometry
FROM placex
WHERE rank_address between 5 and 25
and osm_type = 'N'
and linked_place_id is NULL """))
await self.dest.execute(sa.select(
sa.func.RecoverGeometryColumn('placex_place_node_areas', 'geometry',
4326, 'GEOMETRY', 'XY')))
await self.dest.execute(sa.select(sa.func.CreateSpatialIndex(
'placex_place_node_areas', 'geometry')))
# Remaining indexes.
await self.create_spatial_index('country_grid', 'geometry')
await self.create_spatial_index('placex', 'geometry')
await self.create_spatial_index('osmline', 'linegeo')
await self.create_spatial_index('tiger', 'linegeo')
await self.create_index('placex', 'place_id')
await self.create_index('placex', 'parent_place_id')
await self.create_index('placex', 'rank_address')
await self.create_index('addressline', 'place_id')
await self.create_index('postcode', 'place_id')
await self.create_index('osmline', 'place_id')
await self.create_index('tiger', 'place_id')
if 'search' in self.options:
await self.create_spatial_index('postcode', 'geometry')
await self.create_spatial_index('search_name', 'centroid')
await self.create_index('search_name', 'place_id')
await self.create_index('osmline', 'parent_place_id')
await self.create_index('tiger', 'parent_place_id')
await self.create_search_index()
for t in self.dest.t.meta.tables:
if t.startswith('place_classtype_'):
await self.dest.execute(sa.select(
sa.func.CreateSpatialIndex(t, 'centroid')))
async def create_spatial_index(self, table: str, column: str) -> None:
""" Create a spatial index on the given table and column.
"""
await self.dest.execute(sa.select(
sa.func.CreateSpatialIndex(getattr(self.dest.t, table).name, column)))
async def create_index(self, table_name: str, column: str) -> None:
""" Create a simple index on the given table and column.
"""
table = getattr(self.dest.t, table_name)
await self.dest.connection.run_sync(
sa.Index(f"idx_{table}_{column}", getattr(table.c, column)).create)
async def create_search_index(self) -> None:
""" Create the tables and indexes needed for word lookup.
"""
LOG.warning("Creating reverse search table")
rsn = sa.Table('reverse_search_name', self.dest.t.meta,
sa.Column('word', sa.Integer()),
sa.Column('column', sa.Text()),
sa.Column('places', IntArray))
await self.dest.connection.run_sync(rsn.create)
tsrc = self.src.t.search_name
for column in ('name_vector', 'nameaddress_vector'):
sql = sa.select(sa.func.unnest(getattr(tsrc.c, column)).label('word'),
sa.func.ArrayAgg(tsrc.c.place_id).label('places'))\
.group_by('word')
async_result = await self.src.connection.stream(sql)
async for partition in async_result.partitions(100):
data = []
for row in partition:
row.places.sort()
data.append({'word': row.word,
'column': column,
'places': row.places})
await self.dest.execute(rsn.insert(), data)
await self.dest.connection.run_sync(
sa.Index('idx_reverse_search_name_word', rsn.c.word).create)
def select_from(self, table: str) -> SaSelect:
""" Create the SQL statement to select the source columns and rows.
"""
columns = self.src.t.meta.tables[table].c
if table == 'placex':
# SQLite struggles with Geometries that are larger than 5MB,
# so simplify those.
return sa.select(*(c for c in columns if not isinstance(c.type, Geometry)),
sa.func.ST_AsText(columns.centroid).label('centroid'),
sa.func.ST_AsText(
sa.case((sa.func.ST_MemSize(columns.geometry) < 5000000,
columns.geometry),
else_=sa.func.ST_SimplifyPreserveTopology(
columns.geometry, 0.0001)
)).label('geometry'))
sql = sa.select(*(sa.func.ST_AsText(c).label(c.name)
if isinstance(c.type, Geometry) else c for c in columns))
return sql

View File

@@ -0,0 +1,272 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Functions for setting up and importing a new Nominatim database.
"""
from typing import Tuple, Optional, Union, Sequence, MutableMapping, Any
import logging
import os
import selectors
import subprocess
from pathlib import Path
import psutil
from psycopg2 import sql as pysql
from nominatim_core.errors import UsageError
from nominatim_core.config import Configuration
from nominatim_core.db.connection import connect, get_pg_env, Connection
from nominatim_core.db.async_connection import DBConnection
from nominatim_core.db.sql_preprocessor import SQLPreprocessor
from .exec_utils import run_osm2pgsql
from ..version import POSTGRESQL_REQUIRED_VERSION, POSTGIS_REQUIRED_VERSION
LOG = logging.getLogger()
def _require_version(module: str, actual: Tuple[int, int], expected: Tuple[int, int]) -> None:
""" Compares the version for the given module and raises an exception
if the actual version is too old.
"""
if actual < expected:
LOG.fatal('Minimum supported version of %s is %d.%d. '
'Found version %d.%d.',
module, expected[0], expected[1], actual[0], actual[1])
raise UsageError(f'{module} is too old.')
def _require_loaded(extension_name: str, conn: Connection) -> None:
""" Check that the given extension is loaded. """
if not conn.extension_loaded(extension_name):
LOG.fatal('Required module %s is not loaded.', extension_name)
raise UsageError(f'{extension_name} is not loaded.')
def check_existing_database_plugins(dsn: str) -> None:
""" Check that the database has the required plugins installed."""
with connect(dsn) as conn:
_require_version('PostgreSQL server',
conn.server_version_tuple(),
POSTGRESQL_REQUIRED_VERSION)
_require_version('PostGIS',
conn.postgis_version_tuple(),
POSTGIS_REQUIRED_VERSION)
_require_loaded('hstore', conn)
def setup_database_skeleton(dsn: str, rouser: Optional[str] = None) -> None:
""" Create a new database for Nominatim and populate it with the
essential extensions.
        The function fails when the database already exists or the
        PostgreSQL or PostGIS version is too old.
Uses `createdb` to create the database.
If 'rouser' is given, then the function also checks that the user
with that given name exists.
        The caller must have superuser rights.
"""
proc = subprocess.run(['createdb'], env=get_pg_env(dsn), check=False)
if proc.returncode != 0:
raise UsageError('Creating new database failed.')
with connect(dsn) as conn:
_require_version('PostgreSQL server',
conn.server_version_tuple(),
POSTGRESQL_REQUIRED_VERSION)
if rouser is not None:
with conn.cursor() as cur:
cnt = cur.scalar('SELECT count(*) FROM pg_user where usename = %s',
(rouser, ))
if cnt == 0:
LOG.fatal("Web user '%s' does not exist. Create it with:\n"
"\n createuser %s", rouser, rouser)
raise UsageError('Missing read-only user.')
# Create extensions.
with conn.cursor() as cur:
cur.execute('CREATE EXTENSION IF NOT EXISTS hstore')
cur.execute('CREATE EXTENSION IF NOT EXISTS postgis')
postgis_version = conn.postgis_version_tuple()
if postgis_version[0] >= 3:
cur.execute('CREATE EXTENSION IF NOT EXISTS postgis_raster')
conn.commit()
_require_version('PostGIS',
conn.postgis_version_tuple(),
POSTGIS_REQUIRED_VERSION)
def import_osm_data(osm_files: Union[Path, Sequence[Path]],
options: MutableMapping[str, Any],
drop: bool = False, ignore_errors: bool = False) -> None:
""" Import the given OSM files. 'options' contains the list of
default settings for osm2pgsql.
"""
options['import_file'] = osm_files
options['append'] = False
options['threads'] = 1
if not options['flatnode_file'] and options['osm2pgsql_cache'] == 0:
# Make some educated guesses about cache size based on the size
# of the import file and the available memory.
mem = psutil.virtual_memory()
fsize = 0
if isinstance(osm_files, list):
for fname in osm_files:
fsize += os.stat(str(fname)).st_size
else:
fsize = os.stat(str(osm_files)).st_size
options['osm2pgsql_cache'] = int(min((mem.available + mem.cached) * 0.75,
fsize * 2) / 1024 / 1024) + 1
run_osm2pgsql(options)
with connect(options['dsn']) as conn:
if not ignore_errors:
with conn.cursor() as cur:
cur.execute('SELECT * FROM place LIMIT 1')
if cur.rowcount == 0:
raise UsageError('No data imported by osm2pgsql.')
if drop:
conn.drop_table('planet_osm_nodes')
if drop and options['flatnode_file']:
Path(options['flatnode_file']).unlink()
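# Illustrative numbers for the cache heuristic above (machine specs are an
# assumption): importing a 10 GB PBF with 32 GB of free memory gives
# osm2pgsql_cache = min(0.75 * 32 GB, 2 * 10 GB) = 20 GB, which is passed
# to osm2pgsql as a megabyte value.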
def create_tables(conn: Connection, config: Configuration, reverse_only: bool = False) -> None:
""" Create the set of basic tables.
When `reverse_only` is True, then the main table for searching will
be skipped and only reverse search is possible.
"""
sql = SQLPreprocessor(conn, config)
sql.env.globals['db']['reverse_only'] = reverse_only
sql.run_sql_file(conn, 'tables.sql')
def create_table_triggers(conn: Connection, config: Configuration) -> None:
""" Create the triggers for the tables. The trigger functions must already
have been imported with refresh.create_functions().
"""
sql = SQLPreprocessor(conn, config)
sql.run_sql_file(conn, 'table-triggers.sql')
def create_partition_tables(conn: Connection, config: Configuration) -> None:
""" Create tables that have explicit partitioning.
"""
sql = SQLPreprocessor(conn, config)
sql.run_sql_file(conn, 'partition-tables.src.sql')
def truncate_data_tables(conn: Connection) -> None:
""" Truncate all data tables to prepare for a fresh load.
"""
with conn.cursor() as cur:
cur.execute('TRUNCATE placex')
cur.execute('TRUNCATE place_addressline')
cur.execute('TRUNCATE location_area')
cur.execute('TRUNCATE location_area_country')
cur.execute('TRUNCATE location_property_tiger')
cur.execute('TRUNCATE location_property_osmline')
cur.execute('TRUNCATE location_postcode')
if conn.table_exists('search_name'):
cur.execute('TRUNCATE search_name')
cur.execute('DROP SEQUENCE IF EXISTS seq_place')
cur.execute('CREATE SEQUENCE seq_place start 100000')
cur.execute("""SELECT tablename FROM pg_tables
WHERE tablename LIKE 'location_road_%'""")
for table in [r[0] for r in list(cur)]:
cur.execute('TRUNCATE ' + table)
conn.commit()
_COPY_COLUMNS = pysql.SQL(',').join(map(pysql.Identifier,
('osm_type', 'osm_id', 'class', 'type',
'name', 'admin_level', 'address',
'extratags', 'geometry')))
def load_data(dsn: str, threads: int) -> None:
""" Copy data into the word and placex table.
"""
sel = selectors.DefaultSelector()
    # Copy data from place to placex in <threads - 1> chunks.
place_threads = max(1, threads - 1)
for imod in range(place_threads):
conn = DBConnection(dsn)
conn.connect()
conn.perform(
pysql.SQL("""INSERT INTO placex ({columns})
SELECT {columns} FROM place
WHERE osm_id % {total} = {mod}
AND NOT (class='place' and (type='houses' or type='postcode'))
AND ST_IsValid(geometry)
""").format(columns=_COPY_COLUMNS,
total=pysql.Literal(place_threads),
mod=pysql.Literal(imod)))
sel.register(conn, selectors.EVENT_READ, conn)
# Address interpolations go into another table.
conn = DBConnection(dsn)
conn.connect()
conn.perform("""INSERT INTO location_property_osmline (osm_id, address, linegeo)
SELECT osm_id, address, geometry FROM place
WHERE class='place' and type='houses' and osm_type='W'
and ST_GeometryType(geometry) = 'ST_LineString'
""")
sel.register(conn, selectors.EVENT_READ, conn)
# Now wait for all of them to finish.
todo = place_threads + 1
while todo > 0:
for key, _ in sel.select(1):
conn = key.data
sel.unregister(conn)
conn.wait()
conn.close()
todo -= 1
print('.', end='', flush=True)
print('\n')
with connect(dsn) as syn_conn:
with syn_conn.cursor() as cur:
cur.execute('ANALYSE')
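# Illustrative partitioning (threads=4, so place_threads=3): the three
# connections copy the disjoint slices "osm_id % 3 = 0/1/2" of the place
# table concurrently while the selector loop above waits for completion.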
def create_search_indices(conn: Connection, config: Configuration,
drop: bool = False, threads: int = 1) -> None:
""" Create tables that have explicit partitioning.
"""
# If index creation failed and left an index invalid, they need to be
# cleaned out first, so that the script recreates them.
with conn.cursor() as cur:
cur.execute("""SELECT relname FROM pg_class, pg_index
WHERE pg_index.indisvalid = false
AND pg_index.indexrelid = pg_class.oid""")
bad_indices = [row[0] for row in list(cur)]
for idx in bad_indices:
LOG.info("Drop invalid index %s.", idx)
cur.execute(pysql.SQL('DROP INDEX {}').format(pysql.Identifier(idx)))
conn.commit()
sql = SQLPreprocessor(conn, config)
sql.run_parallel_sql_file(config.get_libpq_dsn(),
'indices.sql', min(8, threads), drop=drop)

View File

@@ -0,0 +1,84 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Helper functions for executing external programs.
"""
from typing import Any, Mapping
import logging
import os
import subprocess
import shutil
from nominatim_core.typing import StrPath
from nominatim_core.db.connection import get_pg_env
LOG = logging.getLogger()
def run_php_server(server_address: str, base_dir: StrPath) -> None:
""" Run the built-in server from the given directory.
"""
subprocess.run(['/usr/bin/env', 'php', '-S', server_address],
cwd=str(base_dir), check=True)
def run_osm2pgsql(options: Mapping[str, Any]) -> None:
""" Run osm2pgsql with the given options.
"""
env = get_pg_env(options['dsn'])
osm2pgsql_cmd = options['osm2pgsql']
if osm2pgsql_cmd is None:
osm2pgsql_cmd = shutil.which('osm2pgsql')
if osm2pgsql_cmd is None:
raise RuntimeError('osm2pgsql executable not found. Please install osm2pgsql first.')
cmd = [str(osm2pgsql_cmd),
'--slim',
'--log-progress', 'true',
'--number-processes', '1' if options['append'] else str(options['threads']),
'--cache', str(options['osm2pgsql_cache']),
'--style', str(options['osm2pgsql_style'])
]
if str(options['osm2pgsql_style']).endswith('.lua'):
env['LUA_PATH'] = ';'.join((str(options['osm2pgsql_style_path'] / '?.lua'),
os.environ.get('LUAPATH', ';')))
cmd.extend(('--output', 'flex'))
else:
cmd.extend(('--output', 'gazetteer', '--hstore', '--latlon'))
cmd.append('--append' if options['append'] else '--create')
if options['flatnode_file']:
cmd.extend(('--flat-nodes', options['flatnode_file']))
for key, param in (('slim_data', '--tablespace-slim-data'),
('slim_index', '--tablespace-slim-index'),
('main_data', '--tablespace-main-data'),
('main_index', '--tablespace-main-index')):
if options['tablespaces'][key]:
cmd.extend((param, options['tablespaces'][key]))
if options['tablespaces']['main_data']:
env['NOMINATIM_TABLESPACE_PLACE_DATA'] = options['tablespaces']['main_data']
if options['tablespaces']['main_index']:
env['NOMINATIM_TABLESPACE_PLACE_INDEX'] = options['tablespaces']['main_index']
if options.get('disable_jit', False):
env['PGOPTIONS'] = '-c jit=off -c max_parallel_workers_per_gather=0'
if 'import_data' in options:
cmd.extend(('-r', 'xml', '-'))
elif isinstance(options['import_file'], list):
for fname in options['import_file']:
cmd.append(str(fname))
else:
cmd.append(str(options['import_file']))
subprocess.run(cmd, cwd=options.get('cwd', '.'),
input=options.get('import_data'),
env=env, check=True)
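# A minimal sketch of the options mapping consumed above (all values are
# illustrative assumptions, not defaults of this module):
#
#   from pathlib import Path
#
#   options = {
#       'dsn': 'dbname=nominatim',
#       'osm2pgsql': None,                  # fall back to shutil.which()
#       'osm2pgsql_cache': 2048,            # in MB
#       'osm2pgsql_style': 'import.lua',    # a .lua style selects the flex output
#       'osm2pgsql_style_path': Path('.'),
#       'threads': 4,
#       'append': False,
#       'flatnode_file': '',
#       'tablespaces': {'slim_data': '', 'slim_index': '',
#                       'main_data': '', 'main_index': ''},
#       'import_file': Path('planet.osm.pbf'),
#   }
#   run_osm2pgsql(options)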

View File

@@ -0,0 +1,58 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Functions for removing unnecessary data from the database.
"""
from typing import Optional
from pathlib import Path
from psycopg2 import sql as pysql
from nominatim_core.db.connection import Connection
UPDATE_TABLES = [
'address_levels',
'gb_postcode',
'import_osmosis_log',
'import_polygon_%',
'location_area%',
'location_road%',
'place',
'planet_osm_%',
'search_name_%',
'us_postcode',
'wikipedia_%'
]
def drop_update_tables(conn: Connection) -> None:
""" Drop all tables only necessary for updating the database from
OSM replication data.
"""
parts = (pysql.SQL("(tablename LIKE {})").format(pysql.Literal(t)) for t in UPDATE_TABLES)
with conn.cursor() as cur:
cur.execute(pysql.SQL("SELECT tablename FROM pg_tables WHERE ")
+ pysql.SQL(' or ').join(parts))
tables = [r[0] for r in cur]
for table in tables:
cur.drop_table(table, cascade=True)
conn.commit()
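# The statement assembled above expands to roughly (illustrative):
#   SELECT tablename FROM pg_tables
#    WHERE (tablename LIKE 'address_levels') or (tablename LIKE 'gb_postcode')
#       or ... or (tablename LIKE 'wikipedia_%')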
def drop_flatnode_file(fpath: Optional[Path]) -> None:
""" Remove the flatnode file if it exists.
"""
if fpath and fpath.exists():
fpath.unlink()
def is_frozen(conn: Connection) -> bool:
""" Returns true if database is in a frozen state
"""
return conn.table_exists('place') is False

View File

@@ -0,0 +1,405 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Functions for database migration to newer software versions.
"""
from typing import List, Tuple, Callable, Any
import logging
from psycopg2 import sql as pysql
from nominatim_core.errors import UsageError
from nominatim_core.config import Configuration
from nominatim_core.db import properties
from nominatim_core.db.connection import connect, Connection
from ..version import NominatimVersion, NOMINATIM_VERSION, parse_version
from ..tokenizer import factory as tokenizer_factory
from . import refresh
LOG = logging.getLogger()
_MIGRATION_FUNCTIONS : List[Tuple[NominatimVersion, Callable[..., None]]] = []
def migrate(config: Configuration, paths: Any) -> int:
""" Check for the current database version and execute migrations,
        if necessary.
"""
with connect(config.get_libpq_dsn()) as conn:
if conn.table_exists('nominatim_properties'):
db_version_str = properties.get_property(conn, 'database_version')
else:
db_version_str = None
if db_version_str is not None:
db_version = parse_version(db_version_str)
if db_version == NOMINATIM_VERSION:
LOG.warning("Database already at latest version (%s)", db_version_str)
return 0
LOG.info("Detected database version: %s", db_version_str)
else:
db_version = _guess_version(conn)
for version, func in _MIGRATION_FUNCTIONS:
if db_version < version or \
(db_version == (3, 5, 0, 99) and version == (3, 5, 0, 99)):
title = func.__doc__ or ''
LOG.warning("Running: %s (%s)", title.split('\n', 1)[0], version)
kwargs = dict(conn=conn, config=config, paths=paths)
func(**kwargs)
conn.commit()
LOG.warning('Updating SQL functions.')
refresh.create_functions(conn, config)
tokenizer = tokenizer_factory.get_tokenizer_for_db(config)
tokenizer.update_sql_functions(config)
properties.set_property(conn, 'database_version', str(NOMINATIM_VERSION))
conn.commit()
return 0
def _guess_version(conn: Connection) -> NominatimVersion:
""" Guess a database version when there is no property table yet.
Only migrations for 3.6 and later are supported, so bail out
when the version seems older.
"""
with conn.cursor() as cur:
# In version 3.6, the country_name table was updated. Check for that.
cnt = cur.scalar("""SELECT count(*) FROM
(SELECT svals(name) FROM country_name
WHERE country_code = 'gb')x;
""")
if cnt < 100:
LOG.fatal('It looks like your database was imported with a version '
'prior to 3.6.0. Automatic migration not possible.')
raise UsageError('Migration not possible.')
return NominatimVersion(3, 5, 0, 99)
def _migration(major: int, minor: int, patch: int = 0,
dbpatch: int = 0) -> Callable[[Callable[..., None]], Callable[..., None]]:
""" Decorator for a single migration step. The parameters describe the
        version after which the migration is applicable, i.e. before changing
from the given version to the next, the migration is required.
All migrations are run in the order in which they are defined in this
file. Do not run global SQL scripts for migrations as you cannot be sure
that these scripts do the same in later versions.
Functions will always be reimported in full at the end of the migration
process, so the migration functions may leave a temporary state behind
there.
"""
def decorator(func: Callable[..., None]) -> Callable[..., None]:
version = NominatimVersion(major, minor, patch, dbpatch)
_MIGRATION_FUNCTIONS.append((version, func))
return func
return decorator
@_migration(3, 5, 0, 99)
def import_status_timestamp_change(conn: Connection, **_: Any) -> None:
""" Add timezone to timestamp in status table.
The import_status table has been changed to include timezone information
with the time stamp.
"""
with conn.cursor() as cur:
cur.execute("""ALTER TABLE import_status ALTER COLUMN lastimportdate
TYPE timestamp with time zone;""")
@_migration(3, 5, 0, 99)
def add_nominatim_property_table(conn: Connection, config: Configuration, **_: Any) -> None:
""" Add nominatim_property table.
"""
if not conn.table_exists('nominatim_properties'):
with conn.cursor() as cur:
cur.execute(pysql.SQL("""CREATE TABLE nominatim_properties (
property TEXT,
value TEXT);
GRANT SELECT ON TABLE nominatim_properties TO {};
""").format(pysql.Identifier(config.DATABASE_WEBUSER)))
@_migration(3, 6, 0, 0)
def change_housenumber_transliteration(conn: Connection, **_: Any) -> None:
""" Transliterate housenumbers.
The database schema switched from saving raw housenumbers in
placex.housenumber to saving transliterated ones.
Note: the function create_housenumber_id() has been dropped in later
versions.
"""
with conn.cursor() as cur:
cur.execute("""CREATE OR REPLACE FUNCTION create_housenumber_id(housenumber TEXT)
RETURNS TEXT AS $$
DECLARE
normtext TEXT;
BEGIN
SELECT array_to_string(array_agg(trans), ';')
INTO normtext
FROM (SELECT lookup_word as trans,
getorcreate_housenumber_id(lookup_word)
FROM (SELECT make_standard_name(h) as lookup_word
FROM regexp_split_to_table(housenumber, '[,;]') h) x) y;
return normtext;
END;
$$ LANGUAGE plpgsql STABLE STRICT;""")
cur.execute("DELETE FROM word WHERE class = 'place' and type = 'house'")
cur.execute("""UPDATE placex
SET housenumber = create_housenumber_id(housenumber)
WHERE housenumber is not null""")
@_migration(3, 7, 0, 0)
def switch_placenode_geometry_index(conn: Connection, **_: Any) -> None:
""" Replace idx_placex_geometry_reverse_placeNode index.
Make the index slightly more permissive, so that it can also be used
        when matching up boundaries and place nodes. It makes the
        idx_placex_adminname index unnecessary.
"""
with conn.cursor() as cur:
cur.execute(""" CREATE INDEX IF NOT EXISTS idx_placex_geometry_placenode ON placex
USING GIST (geometry)
WHERE osm_type = 'N' and rank_search < 26
and class = 'place' and type != 'postcode'
and linked_place_id is null""")
cur.execute(""" DROP INDEX IF EXISTS idx_placex_adminname """)
@_migration(3, 7, 0, 1)
def install_legacy_tokenizer(conn: Connection, config: Configuration, **_: Any) -> None:
""" Setup legacy tokenizer.
If no other tokenizer has been configured yet, then create the
configuration for the backwards-compatible legacy tokenizer
"""
if properties.get_property(conn, 'tokenizer') is None:
with conn.cursor() as cur:
for table in ('placex', 'location_property_osmline'):
has_column = cur.scalar("""SELECT count(*) FROM information_schema.columns
WHERE table_name = %s
and column_name = 'token_info'""",
(table, ))
if has_column == 0:
cur.execute(pysql.SQL('ALTER TABLE {} ADD COLUMN token_info JSONB')
.format(pysql.Identifier(table)))
tokenizer = tokenizer_factory.create_tokenizer(config, init_db=False,
module_name='legacy')
tokenizer.migrate_database(config) # type: ignore[attr-defined]
@_migration(4, 0, 99, 0)
def create_tiger_housenumber_index(conn: Connection, **_: Any) -> None:
""" Create idx_location_property_tiger_parent_place_id with included
house number.
The inclusion is needed for efficient lookup of housenumbers in
full address searches.
"""
if conn.server_version_tuple() >= (11, 0, 0):
with conn.cursor() as cur:
cur.execute(""" CREATE INDEX IF NOT EXISTS
idx_location_property_tiger_housenumber_migrated
ON location_property_tiger
USING btree(parent_place_id)
INCLUDE (startnumber, endnumber) """)
@_migration(4, 0, 99, 1)
def create_interpolation_index_on_place(conn: Connection, **_: Any) -> None:
""" Create idx_place_interpolations for lookup of interpolation lines
on updates.
"""
with conn.cursor() as cur:
cur.execute("""CREATE INDEX IF NOT EXISTS idx_place_interpolations
ON place USING gist(geometry)
WHERE osm_type = 'W' and address ? 'interpolation'""")
@_migration(4, 0, 99, 2)
def add_step_column_for_interpolation(conn: Connection, **_: Any) -> None:
""" Add a new column 'step' to the interpolations table.
Also converts the data into the stricter format which requires that
startnumbers comply with the odd/even requirements.
"""
if conn.table_has_column('location_property_osmline', 'step'):
return
with conn.cursor() as cur:
        # Mark as invalid all interpolations with no intermediate numbers.
cur.execute("""UPDATE location_property_osmline SET startnumber = null
WHERE endnumber - startnumber <= 1 """)
# Align the start numbers where odd/even does not match.
cur.execute("""UPDATE location_property_osmline
SET startnumber = startnumber + 1,
linegeo = ST_LineSubString(linegeo,
1.0 / (endnumber - startnumber)::float,
1)
WHERE (interpolationtype = 'odd' and startnumber % 2 = 0)
or (interpolationtype = 'even' and startnumber % 2 = 1)
""")
        # Mark as invalid odd/even interpolations with no intermediate numbers.
cur.execute("""UPDATE location_property_osmline SET startnumber = null
WHERE interpolationtype in ('odd', 'even')
and endnumber - startnumber = 2""")
# Finally add the new column and populate it.
cur.execute("ALTER TABLE location_property_osmline ADD COLUMN step SMALLINT")
cur.execute("""UPDATE location_property_osmline
SET step = CASE WHEN interpolationtype = 'all'
THEN 1 ELSE 2 END
""")
@_migration(4, 0, 99, 3)
def add_step_column_for_tiger(conn: Connection, **_: Any) -> None:
""" Add a new column 'step' to the tiger data table.
"""
if conn.table_has_column('location_property_tiger', 'step'):
return
with conn.cursor() as cur:
cur.execute("ALTER TABLE location_property_tiger ADD COLUMN step SMALLINT")
cur.execute("""UPDATE location_property_tiger
SET step = CASE WHEN interpolationtype = 'all'
THEN 1 ELSE 2 END
""")
@_migration(4, 0, 99, 4)
def add_derived_name_column_for_country_names(conn: Connection, **_: Any) -> None:
""" Add a new column 'derived_name' which in the future takes the
country names as imported from OSM data.
"""
if not conn.table_has_column('country_name', 'derived_name'):
with conn.cursor() as cur:
cur.execute("ALTER TABLE country_name ADD COLUMN derived_name public.HSTORE")
@_migration(4, 0, 99, 5)
def mark_internal_country_names(conn: Connection, config: Configuration, **_: Any) -> None:
""" Names from the country table should be marked as internal to prevent
them from being deleted. Only necessary for ICU tokenizer.
"""
import psycopg2.extras # pylint: disable=import-outside-toplevel
tokenizer = tokenizer_factory.get_tokenizer_for_db(config)
with tokenizer.name_analyzer() as analyzer:
with conn.cursor() as cur:
psycopg2.extras.register_hstore(cur)
cur.execute("SELECT country_code, name FROM country_name")
for country_code, names in cur:
if not names:
names = {}
names['countrycode'] = country_code
analyzer.add_country_names(country_code, names)
@_migration(4, 1, 99, 0)
def add_place_deletion_todo_table(conn: Connection, **_: Any) -> None:
""" Add helper table for deleting data on updates.
The table is only necessary when updates are possible, i.e.
the database is not in freeze mode.
"""
if conn.table_exists('place'):
with conn.cursor() as cur:
cur.execute("""CREATE TABLE IF NOT EXISTS place_to_be_deleted (
osm_type CHAR(1),
osm_id BIGINT,
class TEXT,
type TEXT,
deferred BOOLEAN)""")
@_migration(4, 1, 99, 1)
def split_pending_index(conn: Connection, **_: Any) -> None:
""" Reorganise indexes for pending updates.
"""
if conn.table_exists('place'):
with conn.cursor() as cur:
cur.execute("""CREATE INDEX IF NOT EXISTS idx_placex_rank_address_sector
ON placex USING BTREE (rank_address, geometry_sector)
WHERE indexed_status > 0""")
cur.execute("""CREATE INDEX IF NOT EXISTS idx_placex_rank_boundaries_sector
ON placex USING BTREE (rank_search, geometry_sector)
WHERE class = 'boundary' and type = 'administrative'
and indexed_status > 0""")
cur.execute("DROP INDEX IF EXISTS idx_placex_pendingsector")
@_migration(4, 2, 99, 0)
def enable_forward_dependencies(conn: Connection, **_: Any) -> None:
""" Create indexes for updates with forward dependency tracking (long-running).
"""
if conn.table_exists('planet_osm_ways'):
with conn.cursor() as cur:
cur.execute("""SELECT * FROM pg_indexes
WHERE tablename = 'planet_osm_ways'
and indexdef LIKE '%nodes%'""")
if cur.rowcount == 0:
cur.execute("""CREATE OR REPLACE FUNCTION public.planet_osm_index_bucket(bigint[])
RETURNS bigint[]
LANGUAGE sql IMMUTABLE
AS $function$
SELECT ARRAY(SELECT DISTINCT unnest($1) >> 5)
$function$""")
cur.execute("""CREATE INDEX planet_osm_ways_nodes_bucket_idx
ON planet_osm_ways
USING gin (planet_osm_index_bucket(nodes))
WITH (fastupdate=off)""")
cur.execute("""CREATE INDEX planet_osm_rels_parts_idx
ON planet_osm_rels USING gin (parts)
WITH (fastupdate=off)""")
cur.execute("ANALYZE planet_osm_ways")
@_migration(4, 2, 99, 1)
def add_improved_geometry_reverse_placenode_index(conn: Connection, **_: Any) -> None:
""" Create improved index for reverse lookup of place nodes.
"""
with conn.cursor() as cur:
cur.execute("""CREATE INDEX IF NOT EXISTS idx_placex_geometry_reverse_lookupPlaceNode
ON placex
USING gist (ST_Buffer(geometry, reverse_place_diameter(rank_search)))
WHERE rank_address between 4 and 25 AND type != 'postcode'
AND name is not null AND linked_place_id is null AND osm_type = 'N'
""")
@_migration(4, 4, 99, 0)
def create_postcode_area_lookup_index(conn: Connection, **_: Any) -> None:
""" Create index needed for looking up postcode areas from postocde points.
"""
with conn.cursor() as cur:
cur.execute("""CREATE INDEX IF NOT EXISTS idx_placex_postcode_areas
ON placex USING BTREE (country_code, postcode)
WHERE osm_type = 'R' AND class = 'boundary' AND type = 'postal_code'
""")
@_migration(4, 4, 99, 1)
def create_postcode_parent_index(conn: Connection, **_: Any) -> None:
""" Create index needed for updating postcodes when a parent changes.
"""
if conn.table_exists('planet_osm_ways'):
with conn.cursor() as cur:
cur.execute("""CREATE INDEX IF NOT EXISTS
idx_location_postcode_parent_place_id
ON location_postcode USING BTREE (parent_place_id)""")

View File

@@ -0,0 +1,234 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Functions for importing, updating and otherwise maintaining the table
of artificial postcode centroids.
"""
from typing import Optional, Tuple, Dict, List, TextIO
from collections import defaultdict
from pathlib import Path
import csv
import gzip
import logging
from math import isfinite
from psycopg2 import sql as pysql
from nominatim_core.db.connection import connect, Connection
from nominatim_core.utils.centroid import PointsCentroid
from ..data.postcode_format import PostcodeFormatter, CountryPostcodeMatcher
from ..tokenizer.base import AbstractAnalyzer, AbstractTokenizer
LOG = logging.getLogger()
def _to_float(numstr: str, max_value: float) -> float:
""" Convert the number in string into a float. The number is expected
to be in the range of [-max_value, max_value]. Otherwise rises a
ValueError.
"""
num = float(numstr)
if not isfinite(num) or num <= -max_value or num >= max_value:
raise ValueError()
return num
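# Illustrative behaviour (doctest-style, for reading only):
#   _to_float('8.5', 90.0)    -> 8.5
#   _to_float('90.5', 90.0)   -> ValueError (out of range)
#   _to_float('nan', 90.0)    -> ValueError (not finite)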
class _PostcodeCollector:
""" Collector for postcodes of a single country.
"""
def __init__(self, country: str, matcher: Optional[CountryPostcodeMatcher]):
self.country = country
self.matcher = matcher
self.collected: Dict[str, PointsCentroid] = defaultdict(PointsCentroid)
self.normalization_cache: Optional[Tuple[str, Optional[str]]] = None
def add(self, postcode: str, x: float, y: float) -> None:
""" Add the given postcode to the collection cache. If the postcode
already existed, it is overwritten with the new centroid.
"""
if self.matcher is not None:
normalized: Optional[str]
if self.normalization_cache and self.normalization_cache[0] == postcode:
normalized = self.normalization_cache[1]
else:
match = self.matcher.match(postcode)
normalized = self.matcher.normalize(match) if match else None
self.normalization_cache = (postcode, normalized)
if normalized:
self.collected[normalized] += (x, y)
def commit(self, conn: Connection, analyzer: AbstractAnalyzer, project_dir: Path) -> None:
""" Update postcodes for the country from the postcodes selected so far
as well as any externally supplied postcodes.
"""
self._update_from_external(analyzer, project_dir)
to_add, to_delete, to_update = self._compute_changes(conn)
LOG.info("Processing country '%s' (%s added, %s deleted, %s updated).",
self.country, len(to_add), len(to_delete), len(to_update))
with conn.cursor() as cur:
if to_add:
cur.execute_values(
"""INSERT INTO location_postcode
(place_id, indexed_status, country_code,
postcode, geometry) VALUES %s""",
to_add,
template=pysql.SQL("""(nextval('seq_place'), 1, {},
%s, 'SRID=4326;POINT(%s %s)')
""").format(pysql.Literal(self.country)))
if to_delete:
cur.execute("""DELETE FROM location_postcode
WHERE country_code = %s and postcode = any(%s)
""", (self.country, to_delete))
if to_update:
cur.execute_values(
pysql.SQL("""UPDATE location_postcode
SET indexed_status = 2,
geometry = ST_SetSRID(ST_Point(v.x, v.y), 4326)
FROM (VALUES %s) AS v (pc, x, y)
WHERE country_code = {} and postcode = pc
""").format(pysql.Literal(self.country)), to_update)
def _compute_changes(self, conn: Connection) \
-> Tuple[List[Tuple[str, float, float]], List[str], List[Tuple[str, float, float]]]:
""" Compute which postcodes from the collected postcodes have to be
added or modified and which from the location_postcode table
have to be deleted.
"""
to_update = []
to_delete = []
with conn.cursor() as cur:
cur.execute("""SELECT postcode, ST_X(geometry), ST_Y(geometry)
FROM location_postcode
WHERE country_code = %s""",
(self.country, ))
for postcode, x, y in cur:
pcobj = self.collected.pop(postcode, None)
if pcobj:
newx, newy = pcobj.centroid()
                    if abs(x - newx) > 0.0000001 or abs(y - newy) > 0.0000001:
to_update.append((postcode, newx, newy))
else:
to_delete.append(postcode)
to_add = [(k, *v.centroid()) for k, v in self.collected.items()]
self.collected = defaultdict(PointsCentroid)
return to_add, to_delete, to_update
def _update_from_external(self, analyzer: AbstractAnalyzer, project_dir: Path) -> None:
""" Look for an external postcode file for the active country in
the project directory and add missing postcodes when found.
"""
csvfile = self._open_external(project_dir)
if csvfile is None:
return
try:
reader = csv.DictReader(csvfile)
for row in reader:
if 'postcode' not in row or 'lat' not in row or 'lon' not in row:
LOG.warning("Bad format for external postcode file for country '%s'."
" Ignored.", self.country)
return
postcode = analyzer.normalize_postcode(row['postcode'])
if postcode not in self.collected:
try:
                        # Do the float conversion separately, it might throw
centroid = (_to_float(row['lon'], 180),
_to_float(row['lat'], 90))
self.collected[postcode] += centroid
except ValueError:
LOG.warning("Bad coordinates %s, %s in %s country postcode file.",
row['lat'], row['lon'], self.country)
finally:
csvfile.close()
def _open_external(self, project_dir: Path) -> Optional[TextIO]:
fname = project_dir / f'{self.country}_postcodes.csv'
if fname.is_file():
LOG.info("Using external postcode file '%s'.", fname)
return open(fname, 'r', encoding='utf-8')
fname = project_dir / f'{self.country}_postcodes.csv.gz'
if fname.is_file():
LOG.info("Using external postcode file '%s'.", fname)
return gzip.open(fname, 'rt')
return None
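# Expected layout of such an external file (illustrative; only the three
# columns read above are required, additional columns are ignored), e.g.
# for a file named de_postcodes.csv:
#
#   postcode,lat,lon
#   01067,51.0598,13.7232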
def update_postcodes(dsn: str, project_dir: Path, tokenizer: AbstractTokenizer) -> None:
""" Update the table of artificial postcodes.
Computes artificial postcode centroids from the placex table,
potentially enhances it with external data and then updates the
postcodes in the table 'location_postcode'.
"""
matcher = PostcodeFormatter()
with tokenizer.name_analyzer() as analyzer:
with connect(dsn) as conn:
# First get the list of countries that currently have postcodes.
# (Doing this before starting to insert, so it is fast on import.)
with conn.cursor() as cur:
cur.execute("SELECT DISTINCT country_code FROM location_postcode")
todo_countries = set((row[0] for row in cur))
# Recompute the list of valid postcodes from placex.
with conn.cursor(name="placex_postcodes") as cur:
cur.execute("""
SELECT cc, pc, ST_X(centroid), ST_Y(centroid)
FROM (SELECT
COALESCE(plx.country_code,
get_country_code(ST_Centroid(pl.geometry))) as cc,
pl.address->'postcode' as pc,
COALESCE(plx.centroid, ST_Centroid(pl.geometry)) as centroid
FROM place AS pl LEFT OUTER JOIN placex AS plx
ON pl.osm_id = plx.osm_id AND pl.osm_type = plx.osm_type
WHERE pl.address ? 'postcode' AND pl.geometry IS NOT null) xx
WHERE pc IS NOT null AND cc IS NOT null
ORDER BY cc, pc""")
collector = None
for country, postcode, x, y in cur:
if collector is None or country != collector.country:
if collector is not None:
collector.commit(conn, analyzer, project_dir)
collector = _PostcodeCollector(country, matcher.get_matcher(country))
todo_countries.discard(country)
collector.add(postcode, x, y)
if collector is not None:
collector.commit(conn, analyzer, project_dir)
# Now handle any countries that are only in the postcode table.
for country in todo_countries:
fmt = matcher.get_matcher(country)
_PostcodeCollector(country, fmt).commit(conn, analyzer, project_dir)
conn.commit()
analyzer.update_postcodes_from_db()
def can_compute(dsn: str) -> bool:
"""
Check that the place table exists so that
postcodes can be computed.
"""
with connect(dsn) as conn:
return conn.table_exists('place')

View File

@@ -0,0 +1,346 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Functions for bringing auxiliary data in the database up-to-date.
"""
from typing import MutableSequence, Tuple, Any, Type, Mapping, Sequence, List, cast
import csv
import gzip
import logging
from textwrap import dedent
from pathlib import Path
from psycopg2 import sql as pysql
from nominatim_core.config import Configuration
from nominatim_core.db.connection import Connection, connect
from nominatim_core.db.utils import execute_file, CopyBuffer
from nominatim_core.db.sql_preprocessor import SQLPreprocessor
from ..version import NOMINATIM_VERSION
LOG = logging.getLogger()
OSM_TYPE = {'N': 'node', 'W': 'way', 'R': 'relation'}
def _add_address_level_rows_from_entry(rows: MutableSequence[Tuple[Any, ...]],
entry: Mapping[str, Any]) -> None:
""" Converts a single entry from the JSON format for address rank
descriptions into a flat format suitable for inserting into a
PostgreSQL table and adds these lines to `rows`.
"""
countries = entry.get('countries') or (None, )
for key, values in entry['tags'].items():
for value, ranks in values.items():
if isinstance(ranks, list):
rank_search, rank_address = ranks
else:
rank_search = rank_address = ranks
if not value:
value = None
for country in countries:
rows.append((country, key, value, rank_search, rank_address))
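# Illustrative conversion (the entry is an assumed configuration snippet):
#   {"countries": ["de"], "tags": {"place": {"city": [16, 16], "": 30}}}
# is flattened into the rows
#   ('de', 'place', 'city', 16, 16) and ('de', 'place', None, 30, 30).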
def load_address_levels(conn: Connection, table: str, levels: Sequence[Mapping[str, Any]]) -> None:
""" Replace the `address_levels` table with the contents of `levels'.
A new table is created any previously existing table is dropped.
The table has the following columns:
country, class, type, rank_search, rank_address
"""
rows: List[Tuple[Any, ...]] = []
for entry in levels:
_add_address_level_rows_from_entry(rows, entry)
with conn.cursor() as cur:
cur.drop_table(table)
cur.execute(pysql.SQL("""CREATE TABLE {} (
country_code varchar(2),
class TEXT,
type TEXT,
rank_search SMALLINT,
rank_address SMALLINT)
""").format(pysql.Identifier(table)))
cur.execute_values(pysql.SQL("INSERT INTO {} VALUES %s")
.format(pysql.Identifier(table)), rows)
cur.execute(pysql.SQL('CREATE UNIQUE INDEX ON {} (country_code, class, type)')
.format(pysql.Identifier(table)))
conn.commit()
def load_address_levels_from_config(conn: Connection, config: Configuration) -> None:
""" Replace the `address_levels` table with the content as
defined in the given configuration. Uses the parameter
NOMINATIM_ADDRESS_LEVEL_CONFIG to determine the location of the
configuration file.
"""
cfg = config.load_sub_configuration('', config='ADDRESS_LEVEL_CONFIG')
load_address_levels(conn, 'address_levels', cfg)
def create_functions(conn: Connection, config: Configuration,
enable_diff_updates: bool = True,
enable_debug: bool = False) -> None:
""" (Re)create the PL/pgSQL functions.
"""
sql = SQLPreprocessor(conn, config)
sql.run_sql_file(conn, 'functions.sql',
disable_diff_updates=not enable_diff_updates,
debug=enable_debug)
WEBSITE_SCRIPTS = (
'deletable.php',
'details.php',
'lookup.php',
'polygons.php',
'reverse.php',
'search.php',
'status.php'
)
# constants needed by PHP scripts: PHP name, config name, type
PHP_CONST_DEFS = (
('Database_DSN', 'DATABASE_DSN', str),
('Default_Language', 'DEFAULT_LANGUAGE', str),
('Log_DB', 'LOG_DB', bool),
('Log_File', 'LOG_FILE', Path),
('NoAccessControl', 'CORS_NOACCESSCONTROL', bool),
('Places_Max_ID_count', 'LOOKUP_MAX_COUNT', int),
('PolygonOutput_MaximumTypes', 'POLYGON_OUTPUT_MAX_TYPES', int),
('Search_BatchMode', 'SEARCH_BATCH_MODE', bool),
('Search_NameOnlySearchFrequencyThreshold', 'SEARCH_NAME_ONLY_THRESHOLD', str),
('Use_US_Tiger_Data', 'USE_US_TIGER_DATA', bool),
('MapIcon_URL', 'MAPICON_URL', str),
('Search_WithinCountries', 'SEARCH_WITHIN_COUNTRIES', bool),
)
def import_wikipedia_articles(dsn: str, data_path: Path, ignore_errors: bool = False) -> int:
""" Replaces the wikipedia importance tables with new data.
The import is run in a single transaction so that the new data
        is replaced seamlessly.
Returns 0 if all was well and 1 if the importance file could not
be found. Throws an exception if there was an error reading the file.
"""
if import_importance_csv(dsn, data_path / 'wikimedia-importance.csv.gz') == 0 \
or import_importance_sql(dsn, data_path / 'wikimedia-importance.sql.gz',
ignore_errors) == 0:
return 0
return 1
def import_importance_csv(dsn: str, data_file: Path) -> int:
""" Replace wikipedia importance table with data from a
single CSV file.
The file must be a gzipped CSV and have the following columns:
language, title, importance, wikidata_id
Other columns may be present but will be ignored.
"""
if not data_file.exists():
return 1
    # Only import the first occurrence of a wikidata ID.
    # This keeps the indexes and the table small.
wd_done = set()
with connect(dsn) as conn:
with conn.cursor() as cur:
cur.drop_table('wikipedia_article')
cur.drop_table('wikipedia_redirect')
cur.drop_table('wikimedia_importance')
cur.execute("""CREATE TABLE wikimedia_importance (
language TEXT NOT NULL,
title TEXT NOT NULL,
importance double precision NOT NULL,
wikidata TEXT
) """)
with gzip.open(str(data_file), 'rt') as fd, CopyBuffer() as buf:
for row in csv.DictReader(fd, delimiter='\t', quotechar='|'):
wd_id = int(row['wikidata_id'][1:])
buf.add(row['language'], row['title'], row['importance'],
None if wd_id in wd_done else row['wikidata_id'])
wd_done.add(wd_id)
if buf.size() > 10000000:
with conn.cursor() as cur:
buf.copy_out(cur, 'wikimedia_importance',
columns=['language', 'title', 'importance',
'wikidata'])
with conn.cursor() as cur:
buf.copy_out(cur, 'wikimedia_importance',
columns=['language', 'title', 'importance', 'wikidata'])
with conn.cursor() as cur:
cur.execute("""CREATE INDEX IF NOT EXISTS idx_wikimedia_importance_title
ON wikimedia_importance (title)""")
cur.execute("""CREATE INDEX IF NOT EXISTS idx_wikimedia_importance_wikidata
ON wikimedia_importance (wikidata)
WHERE wikidata is not null""")
conn.commit()
return 0
def import_importance_sql(dsn: str, data_file: Path, ignore_errors: bool) -> int:
""" Replace wikipedia importance table with data from an SQL file.
"""
if not data_file.exists():
return 1
pre_code = """BEGIN;
DROP TABLE IF EXISTS "wikipedia_article";
DROP TABLE IF EXISTS "wikipedia_redirect";
DROP TABLE IF EXISTS "wikipedia_importance";
"""
post_code = "COMMIT"
execute_file(dsn, data_file, ignore_errors=ignore_errors,
pre_code=pre_code, post_code=post_code)
return 0
def import_secondary_importance(dsn: str, data_path: Path, ignore_errors: bool = False) -> int:
""" Replaces the secondary importance raster data table with new data.
Returns 0 if all was well and 1 if the raster SQL file could not
be found. Throws an exception if there was an error reading the file.
"""
datafile = data_path / 'secondary_importance.sql.gz'
if not datafile.exists():
return 1
with connect(dsn) as conn:
postgis_version = conn.postgis_version_tuple()
if postgis_version[0] < 3:
LOG.error('PostGIS version is too old for using OSM raster data.')
return 2
execute_file(dsn, datafile, ignore_errors=ignore_errors)
return 0
def recompute_importance(conn: Connection) -> None:
""" Recompute wikipedia links and importance for all entries in placex.
        This is a long-running operation that must not be executed in
parallel with updates.
"""
with conn.cursor() as cur:
cur.execute('ALTER TABLE placex DISABLE TRIGGER ALL')
cur.execute("""
UPDATE placex SET (wikipedia, importance) =
(SELECT wikipedia, importance
FROM compute_importance(extratags, country_code, rank_search, centroid))
""")
cur.execute("""
UPDATE placex s SET wikipedia = d.wikipedia, importance = d.importance
FROM placex d
WHERE s.place_id = d.linked_place_id and d.wikipedia is not null
and (s.wikipedia is null or s.importance < d.importance);
""")
cur.execute('ALTER TABLE placex ENABLE TRIGGER ALL')
conn.commit()
def _quote_php_variable(var_type: Type[Any], config: Configuration,
conf_name: str) -> str:
if var_type == bool:
return 'true' if config.get_bool(conf_name) else 'false'
if var_type == int:
return cast(str, getattr(config, conf_name))
if not getattr(config, conf_name):
return 'false'
if var_type == Path:
value = str(config.get_path(conf_name) or '')
else:
value = getattr(config, conf_name)
quoted = value.replace("'", "\\'")
return f"'{quoted}'"
def setup_website(basedir: Path, config: Configuration, conn: Connection) -> None:
""" Create the website script stubs.
"""
if config.lib_dir.php is None:
LOG.info("Python frontend does not require website setup. Skipping.")
return
if not basedir.exists():
LOG.info('Creating website directory.')
basedir.mkdir()
assert config.project_dir is not None
basedata = dedent(f"""\
<?php
@define('CONST_Debug', $_GET['debug'] ?? false);
@define('CONST_LibDir', '{config.lib_dir.php}');
@define('CONST_TokenizerDir', '{config.project_dir / 'tokenizer'}');
@define('CONST_NominatimVersion', '{NOMINATIM_VERSION!s}');
""")
for php_name, conf_name, var_type in PHP_CONST_DEFS:
varout = _quote_php_variable(var_type, config, conf_name)
basedata += f"@define('CONST_{php_name}', {varout});\n"
template = "\nrequire_once(CONST_LibDir.'/website/{}');\n"
search_name_table_exists = bool(conn and conn.table_exists('search_name'))
for script in WEBSITE_SCRIPTS:
if not search_name_table_exists and script == 'search.php':
out = template.format('reverse-only-search.php')
else:
out = template.format(script)
(basedir / script).write_text(basedata + out, 'utf-8')
def invalidate_osm_object(osm_type: str, osm_id: int, conn: Connection,
recursive: bool = True) -> None:
""" Mark the given OSM object for reindexing. When 'recursive' is set
to True (the default), then all dependent objects are marked for
reindexing as well.
        'osm_type' must be one of 'N' (node), 'W' (way) or 'R' (relation).
If the given object does not exist, then nothing happens.
"""
assert osm_type in ('N', 'R', 'W')
LOG.warning("Invalidating OSM %s %s%s.",
OSM_TYPE[osm_type], osm_id,
' and its dependent places' if recursive else '')
with conn.cursor() as cur:
if recursive:
sql = """SELECT place_force_update(place_id)
FROM placex WHERE osm_type = %s and osm_id = %s"""
else:
sql = """UPDATE placex SET indexed_status = 2
WHERE osm_type = %s and osm_id = %s"""
cur.execute(sql, (osm_type, osm_id))
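# A minimal usage sketch (DSN and OSM id are assumptions):
#
#   from nominatim_core.db.connection import connect
#
#   with connect('dbname=nominatim') as conn:
#       invalidate_osm_object('R', 62422, conn)   # relation plus dependants
#       conn.commit()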

View File

@@ -0,0 +1,206 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Functions for updating a database from a replication source.
"""
from typing import ContextManager, MutableMapping, Any, Generator, cast, Iterator
from contextlib import contextmanager
import datetime as dt
from enum import Enum
import logging
import time
import types
import urllib.request as urlrequest
import requests
from nominatim_core.errors import UsageError
from nominatim_core.db import status
from nominatim_core.db.connection import Connection, connect
from .exec_utils import run_osm2pgsql
try:
from osmium.replication.server import ReplicationServer
from osmium import WriteHandler
from osmium import version as pyo_version
except ImportError as exc:
logging.getLogger().critical("pyosmium not installed. Replication functions not available.\n"
"To install pyosmium via pip: pip3 install osmium")
raise UsageError("replication tools not available") from exc
LOG = logging.getLogger()
def init_replication(conn: Connection, base_url: str,
socket_timeout: int = 60) -> None:
""" Set up replication for the server at the given base URL.
"""
LOG.info("Using replication source: %s", base_url)
date = status.compute_database_date(conn)
# margin of error to make sure we get all data
date -= dt.timedelta(hours=3)
with _make_replication_server(base_url, socket_timeout) as repl:
seq = repl.timestamp_to_sequence(date)
if seq is None:
LOG.fatal("Cannot reach the configured replication service '%s'.\n"
"Does the URL point to a directory containing OSM update data?",
base_url)
raise UsageError("Failed to reach replication service")
status.set_status(conn, date=date, seq=seq)
LOG.warning("Updates initialised at sequence %s (%s)", seq, date)
def check_for_updates(conn: Connection, base_url: str,
socket_timeout: int = 60) -> int:
""" Check if new data is available from the replication service at the
given base URL.
"""
_, seq, _ = status.get_status(conn)
if seq is None:
LOG.error("Replication not set up. "
"Please run 'nominatim replication --init' first.")
return 254
with _make_replication_server(base_url, socket_timeout) as repl:
state = repl.get_state_info()
if state is None:
LOG.error("Cannot get state for URL %s.", base_url)
return 253
if state.sequence <= seq:
LOG.warning("Database is up to date.")
return 2
LOG.warning("New data available (%i => %i).", seq, state.sequence)
return 0
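# Sketch (illustrative only; DSN and URL are placeholders): poll a replication
# service and act on the return codes documented above (0 = new data,
# 2 = up to date, 25x = error).
if __name__ == '__main__':
    with connect('dbname=nominatim') as demo_conn:
        ret = check_for_updates(demo_conn,
                                'https://planet.openstreetmap.org/replication/minute')
        print('new data available' if ret == 0 else f'nothing to do (code {ret})')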
class UpdateState(Enum):
""" Possible states after an update has run.
"""
UP_TO_DATE = 0
MORE_PENDING = 2
NO_CHANGES = 3
def update(dsn: str, options: MutableMapping[str, Any],
socket_timeout: int = 60) -> UpdateState:
""" Update database from the next batch of data. Returns the state of
updates according to `UpdateState`.
"""
with connect(dsn) as conn:
startdate, startseq, indexed = status.get_status(conn)
conn.commit()
if startseq is None:
LOG.error("Replication not set up. "
"Please run 'nominatim replication --init' first.")
raise UsageError("Replication not set up.")
assert startdate is not None
if not indexed and options['indexed_only']:
LOG.info("Skipping update. There is data that needs indexing.")
return UpdateState.MORE_PENDING
last_since_update = dt.datetime.now(dt.timezone.utc) - startdate
update_interval = dt.timedelta(seconds=options['update_interval'])
if last_since_update < update_interval:
duration = (update_interval - last_since_update).seconds
LOG.warning("Sleeping for %s sec before next update.", duration)
time.sleep(duration)
if options['import_file'].exists():
options['import_file'].unlink()
# Read updates into file.
with _make_replication_server(options['base_url'], socket_timeout) as repl:
outhandler = WriteHandler(str(options['import_file']))
endseq = repl.apply_diffs(outhandler, startseq + 1,
max_size=options['max_diff_size'] * 1024)
outhandler.close()
if endseq is None:
return UpdateState.NO_CHANGES
with connect(dsn) as conn:
run_osm2pgsql_updates(conn, options)
        # Write the current status to the database.
endstate = repl.get_state_info(endseq)
status.set_status(conn, endstate.timestamp if endstate else None,
seq=endseq, indexed=False)
conn.commit()
return UpdateState.UP_TO_DATE
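# Sketch of a single update run (illustrative; besides the keys read above,
# the options mapping must also carry the osm2pgsql settings consumed by
# run_osm2pgsql(), which are omitted here, and replication must already be
# initialised):
if __name__ == '__main__':
    from pathlib import Path

    demo_options: MutableMapping[str, Any] = {
        'base_url': 'https://planet.openstreetmap.org/replication/minute',
        'update_interval': 75,          # seconds between runs
        'import_file': Path('/tmp/nominatim-update.osc.gz'),
        'max_diff_size': 50,            # MB of diffs per batch
        'indexed_only': False,
    }
    state = update('dbname=nominatim', demo_options)
    print('update finished with state', state.name)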
def run_osm2pgsql_updates(conn: Connection, options: MutableMapping[str, Any]) -> None:
""" Run osm2pgsql in append mode.
"""
# Remove any stale deletion marks.
with conn.cursor() as cur:
cur.execute('TRUNCATE place_to_be_deleted')
conn.commit()
# Consume updates with osm2pgsql.
options['append'] = True
options['disable_jit'] = conn.server_version_tuple() >= (11, 0)
run_osm2pgsql(options)
# Handle deletions
with conn.cursor() as cur:
cur.execute('SELECT flush_deleted_places()')
conn.commit()
def _make_replication_server(url: str, timeout: int) -> ContextManager[ReplicationServer]:
""" Returns a ReplicationServer in form of a context manager.
Creates a light wrapper around older versions of pyosmium that did
not support the context manager interface.
"""
if hasattr(ReplicationServer, '__enter__'):
# Patches the open_url function for pyosmium >= 3.2
# where the socket timeout is no longer respected.
def patched_open_url(self: ReplicationServer, url: urlrequest.Request) -> Any:
""" Download a resource from the given URL and return a byte sequence
of the content.
"""
headers = {"User-Agent" : f"Nominatim (pyosmium/{pyo_version.pyosmium_release})"}
if self.session is not None:
return self.session.get(url.get_full_url(),
headers=headers, timeout=timeout or None,
stream=True)
@contextmanager
def _get_url_with_session() -> Iterator[requests.Response]:
with requests.Session() as session:
request = session.get(url.get_full_url(),
headers=headers, timeout=timeout or None,
stream=True)
yield request
return _get_url_with_session()
repl = ReplicationServer(url)
setattr(repl, 'open_url', types.MethodType(patched_open_url, repl))
return cast(ContextManager[ReplicationServer], repl)
@contextmanager
def get_cm() -> Generator[ReplicationServer, None, None]:
yield ReplicationServer(url)
return get_cm()

View File

@@ -0,0 +1,78 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Contains the class which handles statistics for the
import of special phrases.
"""
import logging
LOG = logging.getLogger()
class SpecialPhrasesImporterStatistics():
"""
Class handling statistics of the import
process of special phrases.
"""
def __init__(self) -> None:
        self._initialize_values()
    def _initialize_values(self) -> None:
"""
Set all counts for the global
import to 0.
"""
self.tables_created = 0
self.tables_deleted = 0
self.tables_ignored = 0
self.invalids = 0
def notify_one_phrase_invalid(self) -> None:
"""
Add +1 to the count of invalid entries
fetched from the wiki.
"""
self.invalids += 1
def notify_one_table_created(self) -> None:
"""
Add +1 to the count of created tables.
"""
self.tables_created += 1
def notify_one_table_deleted(self) -> None:
"""
Add +1 to the count of deleted tables.
"""
self.tables_deleted += 1
def notify_one_table_ignored(self) -> None:
"""
Add +1 to the count of ignored tables.
"""
self.tables_ignored += 1
def notify_import_done(self) -> None:
"""
Print stats for the whole import process
and reset all values.
"""
LOG.info('====================================================================')
LOG.info('Final statistics of the import:')
LOG.info('- %s phrases were invalid.', self.invalids)
if self.invalids > 0:
LOG.info(' Those invalid phrases have been skipped.')
        LOG.info('- %s tables were ignored as they already exist in the database',
self.tables_ignored)
LOG.info('- %s tables were created', self.tables_created)
LOG.info('- %s tables were deleted from the database', self.tables_deleted)
if self.tables_deleted > 0:
LOG.info(' They were deleted as they are not valid anymore.')
if self.invalids > 0:
LOG.warning('%s phrases were invalid and have been skipped during the whole process.',
self.invalids)
        self._initialize_values()
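# Tiny illustration (not part of the original file): the importer calls the
# notify_* hooks while processing and logs the summary at the end.
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    stats = SpecialPhrasesImporterStatistics()
    stats.notify_one_table_created()
    stats.notify_one_phrase_invalid()
    stats.notify_import_done()   # logs the summary and resets all counters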

View File

@@ -0,0 +1,46 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Module containing the SPCsvLoader class.
The class allows loading phrases from a CSV file.
"""
from typing import Iterable
import csv
import os
from nominatim_core.errors import UsageError
from .special_phrase import SpecialPhrase
class SPCsvLoader:
"""
    Handles loading of special phrases from an external CSV file.
"""
def __init__(self, csv_path: str) -> None:
self.csv_path = csv_path
def generate_phrases(self) -> Iterable[SpecialPhrase]:
""" Open and parse the given csv file.
Create the corresponding SpecialPhrases.
"""
self._check_csv_validity()
with open(self.csv_path, encoding='utf-8') as fd:
reader = csv.DictReader(fd, delimiter=',')
for row in reader:
yield SpecialPhrase(row['phrase'], row['class'], row['type'], row['operator'])
def _check_csv_validity(self) -> None:
"""
Check that the csv file has the right extension.
"""
_, extension = os.path.splitext(self.csv_path)
if extension != '.csv':
raise UsageError(f'The file {self.csv_path} is not a csv file.')
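# Usage sketch (the file name is a placeholder): the CSV is expected to have
# the header columns phrase,class,type,operator, matching the DictReader above.
if __name__ == '__main__':
    loader = SPCsvLoader('special_phrases.csv')
    for sp in loader.generate_phrases():
        print(sp.p_label, sp.p_class, sp.p_type, sp.p_operator)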

View File

@@ -0,0 +1,274 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Module containing the class handling the import
of the special phrases.
Phrases are analyzed and imported into the database.
The phrases already present in the database which are no
longer valid are removed.
"""
from typing import Iterable, Tuple, Mapping, Sequence, Optional, Set
import logging
import re
from psycopg2.sql import Identifier, SQL
from nominatim_core.typing import Protocol
from nominatim_core.config import Configuration
from nominatim_core.db.connection import Connection
from .importer_statistics import SpecialPhrasesImporterStatistics
from .special_phrase import SpecialPhrase
from ...tokenizer.base import AbstractTokenizer
LOG = logging.getLogger()
def _classtype_table(phrase_class: str, phrase_type: str) -> str:
""" Return the name of the table for the given class and type.
"""
return f'place_classtype_{phrase_class}_{phrase_type}'
class SpecialPhraseLoader(Protocol):
""" Protocol for classes implementing a loader for special phrases.
"""
def generate_phrases(self) -> Iterable[SpecialPhrase]:
""" Generates all special phrase terms this loader can produce.
"""
class SPImporter():
# pylint: disable-msg=too-many-instance-attributes
"""
    Class handling the import of special phrases into the database.
    Takes an SP loader which loads the phrases from an external source.
"""
def __init__(self, config: Configuration, conn: Connection,
sp_loader: SpecialPhraseLoader) -> None:
self.config = config
self.db_connection = conn
self.sp_loader = sp_loader
self.statistics_handler = SpecialPhrasesImporterStatistics()
self.black_list, self.white_list = self._load_white_and_black_lists()
self.sanity_check_pattern = re.compile(r'^\w+$')
# This set will contain all existing phrases to be added.
# It contains tuples with the following format: (label, class, type, operator)
self.word_phrases: Set[Tuple[str, str, str, str]] = set()
        # This set will contain all existing place_classtype tables which don't match any
        # special phrase class/type on the wiki.
self.table_phrases_to_delete: Set[str] = set()
def import_phrases(self, tokenizer: AbstractTokenizer, should_replace: bool) -> None:
"""
Iterate through all SpecialPhrases extracted from the
loader and import them into the database.
        If should_replace is set to True, only the loaded phrases
        will be kept in the database. All other phrases already
        in the database will be removed.
"""
LOG.warning('Special phrases importation starting')
self._fetch_existing_place_classtype_tables()
# Store pairs of class/type for further processing
class_type_pairs = set()
for phrase in self.sp_loader.generate_phrases():
result = self._process_phrase(phrase)
if result:
class_type_pairs.add(result)
self._create_classtype_table_and_indexes(class_type_pairs)
if should_replace:
self._remove_non_existent_tables_from_db()
self.db_connection.commit()
with tokenizer.name_analyzer() as analyzer:
analyzer.update_special_phrases(self.word_phrases, should_replace)
LOG.warning('Import done.')
self.statistics_handler.notify_import_done()
def _fetch_existing_place_classtype_tables(self) -> None:
"""
Fetch existing place_classtype tables.
Fill the table_phrases_to_delete set of the class.
"""
query = """
SELECT table_name
FROM information_schema.tables
WHERE table_schema='public'
AND table_name like 'place_classtype_%';
"""
with self.db_connection.cursor() as db_cursor:
db_cursor.execute(SQL(query))
for row in db_cursor:
self.table_phrases_to_delete.add(row[0])
def _load_white_and_black_lists(self) \
-> Tuple[Mapping[str, Sequence[str]], Mapping[str, Sequence[str]]]:
"""
        Load white and black lists from phrase-settings.json.
"""
settings = self.config.load_sub_configuration('phrase-settings.json')
return settings['blackList'], settings['whiteList']
def _check_sanity(self, phrase: SpecialPhrase) -> bool:
"""
        Check sanity of the given inputs in case somebody added garbage to the wiki.
        If a bad class/type is detected, the phrase is reported and skipped.
"""
        class_matches = self.sanity_check_pattern.findall(phrase.p_class)
        type_matches = self.sanity_check_pattern.findall(phrase.p_type)
        if not class_matches or not type_matches:
LOG.warning("Bad class/type: %s=%s. It will not be imported",
phrase.p_class, phrase.p_type)
return False
return True
def _process_phrase(self, phrase: SpecialPhrase) -> Optional[Tuple[str, str]]:
"""
        Process the given phrase by checking it against the black and
        white lists and the sanity pattern.
        Return the class/type pair corresponding to the phrase, or None
        if the phrase was rejected.
"""
# blacklisting: disallow certain class/type combinations
if phrase.p_class in self.black_list.keys() \
and phrase.p_type in self.black_list[phrase.p_class]:
return None
# whitelisting: if class is in whitelist, allow only tags in the list
if phrase.p_class in self.white_list.keys() \
and phrase.p_type not in self.white_list[phrase.p_class]:
return None
# sanity check, in case somebody added garbage in the wiki
if not self._check_sanity(phrase):
self.statistics_handler.notify_one_phrase_invalid()
return None
self.word_phrases.add((phrase.p_label, phrase.p_class,
phrase.p_type, phrase.p_operator))
return (phrase.p_class, phrase.p_type)
def _create_classtype_table_and_indexes(self,
class_type_pairs: Iterable[Tuple[str, str]]) -> None:
"""
Create table place_classtype for each given pair.
Also create indexes on place_id and centroid.
"""
LOG.warning('Create tables and indexes...')
sql_tablespace = self.config.TABLESPACE_AUX_DATA
if sql_tablespace:
sql_tablespace = ' TABLESPACE ' + sql_tablespace
with self.db_connection.cursor() as db_cursor:
db_cursor.execute("CREATE INDEX idx_placex_classtype ON placex (class, type)")
for pair in class_type_pairs:
phrase_class = pair[0]
phrase_type = pair[1]
table_name = _classtype_table(phrase_class, phrase_type)
if table_name in self.table_phrases_to_delete:
self.statistics_handler.notify_one_table_ignored()
                # Remove this table from the ones to delete as it matches a
                # class/type still existing in the special phrases on the wiki,
                # so there is no need to create the table and indexes.
                self.table_phrases_to_delete.remove(table_name)
continue
# Table creation
self._create_place_classtype_table(sql_tablespace, phrase_class, phrase_type)
# Indexes creation
self._create_place_classtype_indexes(sql_tablespace, phrase_class, phrase_type)
# Grant access on read to the web user.
self._grant_access_to_webuser(phrase_class, phrase_type)
self.statistics_handler.notify_one_table_created()
with self.db_connection.cursor() as db_cursor:
db_cursor.execute("DROP INDEX idx_placex_classtype")
def _create_place_classtype_table(self, sql_tablespace: str,
phrase_class: str, phrase_type: str) -> None:
"""
        Create the place_classtype table for the given phrase_class/phrase_type
        if it doesn't exist yet.
"""
table_name = _classtype_table(phrase_class, phrase_type)
with self.db_connection.cursor() as cur:
cur.execute(SQL("""CREATE TABLE IF NOT EXISTS {} {} AS
SELECT place_id AS place_id,
st_centroid(geometry) AS centroid
FROM placex
WHERE class = %s AND type = %s
""").format(Identifier(table_name), SQL(sql_tablespace)),
(phrase_class, phrase_type))
def _create_place_classtype_indexes(self, sql_tablespace: str,
phrase_class: str, phrase_type: str) -> None:
"""
Create indexes on centroid and place_id for the place_classtype table.
"""
index_prefix = f'idx_place_classtype_{phrase_class}_{phrase_type}_'
base_table = _classtype_table(phrase_class, phrase_type)
# Index on centroid
if not self.db_connection.index_exists(index_prefix + 'centroid'):
with self.db_connection.cursor() as db_cursor:
db_cursor.execute(SQL("CREATE INDEX {} ON {} USING GIST (centroid) {}")
.format(Identifier(index_prefix + 'centroid'),
Identifier(base_table),
SQL(sql_tablespace)))
# Index on place_id
if not self.db_connection.index_exists(index_prefix + 'place_id'):
with self.db_connection.cursor() as db_cursor:
db_cursor.execute(SQL("CREATE INDEX {} ON {} USING btree(place_id) {}")
.format(Identifier(index_prefix + 'place_id'),
Identifier(base_table),
SQL(sql_tablespace)))
def _grant_access_to_webuser(self, phrase_class: str, phrase_type: str) -> None:
"""
Grant access on read to the table place_classtype for the webuser.
"""
table_name = _classtype_table(phrase_class, phrase_type)
with self.db_connection.cursor() as db_cursor:
db_cursor.execute(SQL("""GRANT SELECT ON {} TO {}""")
.format(Identifier(table_name),
Identifier(self.config.DATABASE_WEBUSER)))
def _remove_non_existent_tables_from_db(self) -> None:
"""
        Remove special phrases which no longer exist on the wiki
        by deleting their place_classtype tables.
"""
LOG.warning('Cleaning database...')
# Delete place_classtype tables corresponding to class/type which
# are not on the wiki anymore.
with self.db_connection.cursor() as db_cursor:
for table in self.table_phrases_to_delete:
self.statistics_handler.notify_one_table_deleted()
db_cursor.drop_table(table)
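# Minimal illustration of the SpecialPhraseLoader protocol (the demo loader
# is hypothetical, not part of Nominatim): any object with a matching
# generate_phrases() method can feed SPImporter.
class _DemoLoader:
    def generate_phrases(self) -> Iterable[SpecialPhrase]:
        yield SpecialPhrase('Bakeries', 'shop', 'bakery', 'near')
if __name__ == '__main__':
    demo_phrase = next(iter(_DemoLoader().generate_phrases()))
    print(_classtype_table(demo_phrase.p_class, demo_phrase.p_type))
    # -> place_classtype_shop_bakery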

View File

@@ -0,0 +1,68 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Module containing the SPWikiLoader class.
"""
from typing import Iterable
import re
import logging
from nominatim_core.config import Configuration
from nominatim_core.utils.url_utils import get_url
from .special_phrase import SpecialPhrase
LOG = logging.getLogger()
def _get_wiki_content(lang: str) -> str:
"""
Request and return the wiki page's content
corresponding to special phrases for a given lang.
    Example of a requested URL:
https://wiki.openstreetmap.org/wiki/Special:Export/Nominatim/Special_Phrases/EN
"""
url = 'https://wiki.openstreetmap.org/wiki/Special:Export/Nominatim/Special_Phrases/' \
+ lang.upper()
return get_url(url)
class SPWikiLoader:
"""
Handles loading of special phrases from the wiki.
"""
def __init__(self, config: Configuration) -> None:
self.config = config
        # Compile the regex here to improve performance.
        self.occurrence_pattern = re.compile(
r'\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([\-YN])'
)
# Hack around a bug where building=yes was imported with quotes into the wiki
self.type_fix_pattern = re.compile(r'\"|&quot;')
self.languages = self.config.get_str_list('LANGUAGES') or \
['af', 'ar', 'br', 'ca', 'cs', 'de', 'en', 'es',
'et', 'eu', 'fa', 'fi', 'fr', 'gl', 'hr', 'hu',
'ia', 'is', 'it', 'ja', 'mk', 'nl', 'no', 'pl',
'ps', 'pt', 'ru', 'sk', 'sl', 'sv', 'uk', 'vi',
'lv', 'tr']
def generate_phrases(self) -> Iterable[SpecialPhrase]:
""" Download the wiki pages for the configured languages
and extract the phrases from the page.
"""
for lang in self.languages:
LOG.warning('Importing phrases for lang: %s...', lang)
loaded_xml = _get_wiki_content(lang)
            # One match is of the format [label, class, type, operator, plural]
            matches = self.occurrence_pattern.findall(loaded_xml)
for match in matches:
yield SpecialPhrase(match[0],
match[1],
self.type_fix_pattern.sub('', match[2]),
match[3])
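# Illustration of the wiki table row matched by the pattern above (the sample
# row is made up but follows the documented column layout):
if __name__ == '__main__':
    demo_pattern = re.compile(
        r'\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([\-YN])')
    print(demo_pattern.findall('| Bakery || shop || bakery || near || Y'))
    # -> [('Bakery ', 'shop ', 'bakery ', 'near ', 'Y')]; the trailing blanks
    # are stripped later by SpecialPhrase.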

View File

@@ -0,0 +1,37 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Module containing the class SpecialPhrase.
This class is a model used to transfer a special phrase through
the loading and import process.
"""
from typing import Any
class SpecialPhrase:
"""
Model representing a special phrase.
"""
def __init__(self, p_label: str, p_class: str, p_type: str, p_operator: str) -> None:
self.p_label = p_label.strip()
self.p_class = p_class.strip()
self.p_type = p_type.strip()
        # Needed if some operators on the wiki are not written in English
p_operator = p_operator.strip().lower()
self.p_operator = '-' if p_operator not in ('near', 'in') else p_operator
def __eq__(self, other: Any) -> bool:
if not isinstance(other, SpecialPhrase):
return False
return self.p_label == other.p_label \
and self.p_class == other.p_class \
and self.p_type == other.p_type \
and self.p_operator == other.p_operator
def __hash__(self) -> int:
return hash((self.p_label, self.p_class, self.p_type, self.p_operator))
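# Quick illustration of the normalisation above (the values are made up):
if __name__ == '__main__':
    a = SpecialPhrase(' Bakery ', 'shop', 'bakery', 'NEAR')
    b = SpecialPhrase('Bakery', 'shop', 'bakery', 'near')
    print(a == b, a.p_operator)   # -> True near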

View File

@@ -0,0 +1,149 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Functions for importing Tiger data and handling tarball and directory files.
"""
from typing import Any, TextIO, List, Union, cast
import csv
import io
import logging
import os
import tarfile
from psycopg2.extras import Json
from nominatim_core.config import Configuration
from nominatim_core.db.connection import connect
from nominatim_core.db.async_connection import WorkerPool
from nominatim_core.db.sql_preprocessor import SQLPreprocessor
from nominatim_core.errors import UsageError
from ..data.place_info import PlaceInfo
from ..tokenizer.base import AbstractAnalyzer, AbstractTokenizer
from . import freeze
LOG = logging.getLogger()
class TigerInput:
""" Context manager that goes through Tiger input files which may
either be in a directory or gzipped together in a tar file.
"""
def __init__(self, data_dir: str) -> None:
self.tar_handle = None
self.files: List[Union[str, tarfile.TarInfo]] = []
if data_dir.endswith('.tar.gz'):
try:
self.tar_handle = tarfile.open(data_dir) # pylint: disable=consider-using-with
except tarfile.ReadError as err:
LOG.fatal("Cannot open '%s'. Is this a tar file?", data_dir)
raise UsageError("Cannot open Tiger data file.") from err
self.files = [i for i in self.tar_handle.getmembers() if i.name.endswith('.csv')]
LOG.warning("Found %d CSV files in tarfile with path %s", len(self.files), data_dir)
else:
files = os.listdir(data_dir)
self.files = [os.path.join(data_dir, i) for i in files if i.endswith('.csv')]
LOG.warning("Found %d CSV files in path %s", len(self.files), data_dir)
if not self.files:
LOG.warning("Tiger data import selected but no files found at %s", data_dir)
def __enter__(self) -> 'TigerInput':
return self
def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
if self.tar_handle:
self.tar_handle.close()
self.tar_handle = None
def next_file(self) -> TextIO:
""" Return a file handle to the next file to be processed.
Raises an IndexError if there is no file left.
"""
fname = self.files.pop(0)
if self.tar_handle is not None:
extracted = self.tar_handle.extractfile(fname)
assert extracted is not None
return io.TextIOWrapper(extracted)
return open(cast(str, fname), encoding='utf-8')
def __len__(self) -> int:
return len(self.files)
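# Usage sketch (the path is a placeholder): print the header line of every
# CSV file found in a Tiger data directory or tarball.
if __name__ == '__main__':
    with TigerInput('/data/tiger') as demo:
        while demo:
            with demo.next_file() as fd:
                print(fd.readline().rstrip())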
def handle_threaded_sql_statements(pool: WorkerPool, fd: TextIO,
analyzer: AbstractAnalyzer) -> None:
""" Handles sql statement with multiplexing
"""
lines = 0
# Using pool of database connections to execute sql statements
sql = "SELECT tiger_line_import(%s, %s, %s, %s, %s, %s)"
for row in csv.DictReader(fd, delimiter=';'):
try:
address = dict(street=row['street'], postcode=row['postcode'])
args = ('SRID=4326;' + row['geometry'],
int(row['from']), int(row['to']), row['interpolation'],
Json(analyzer.process_place(PlaceInfo({'address': address}))),
analyzer.normalize_postcode(row['postcode']))
except ValueError:
continue
pool.next_free_worker().perform(sql, args=args)
lines += 1
if lines == 1000:
print('.', end='', flush=True)
lines = 0
def add_tiger_data(data_dir: str, config: Configuration, threads: int,
tokenizer: AbstractTokenizer) -> int:
""" Import tiger data from directory or tar file `data dir`.
"""
dsn = config.get_libpq_dsn()
with connect(dsn) as conn:
is_frozen = freeze.is_frozen(conn)
conn.close()
if is_frozen:
raise UsageError("Tiger cannot be imported when database frozen (Github issue #3048)")
with TigerInput(data_dir) as tar:
if not tar:
return 1
with connect(dsn) as conn:
sql = SQLPreprocessor(conn, config)
sql.run_sql_file(conn, 'tiger_import_start.sql')
        # Read the files and, for each line, hand the SQL query
        # to one of <threads - 1> worker connections.
place_threads = max(1, threads - 1)
with WorkerPool(dsn, place_threads, ignore_sql_errors=True) as pool:
with tokenizer.name_analyzer() as analyzer:
while tar:
with tar.next_file() as fd:
handle_threaded_sql_statements(pool, fd, analyzer)
print('\n')
LOG.warning("Creating indexes on Tiger data")
with connect(dsn) as conn:
sql = SQLPreprocessor(conn, config)
sql.run_sql_file(conn, 'tiger_import_finish.sql')
return 0

View File

@@ -0,0 +1,62 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Version information for Nominatim.
"""
from typing import Optional, NamedTuple
class NominatimVersion(NamedTuple):
""" Version information for Nominatim. We follow semantic versioning.
Major, minor and patch_level refer to the last released version.
The database patch level tracks important changes between releases
and must always be increased when there is a change to the database or code
that requires a migration.
When adding a migration on the development branch, raise the patch level
to 99 to make sure that the migration is applied when updating from a
patch release to the next minor version. Patch releases usually shouldn't
    have migrations in them. When one is needed, make sure that the
    migration can be reapplied and set the migration version to the appropriate
    patch level when cherry-picking the commit with the migration.
"""
major: int
minor: int
patch_level: int
db_patch_level: int
def __str__(self) -> str:
return f"{self.major}.{self.minor}.{self.patch_level}-{self.db_patch_level}"
def release_version(self) -> str:
""" Return the release version in semantic versioning format.
The release version does not include the database patch version.
"""
return f"{self.major}.{self.minor}.{self.patch_level}"
NOMINATIM_VERSION = NominatimVersion(4, 4, 99, 1)
POSTGRESQL_REQUIRED_VERSION = (9, 6)
POSTGIS_REQUIRED_VERSION = (2, 2)
# CMake sets the variable @GIT_HASH@ by executing 'git log'. It is not run
# on every execution of 'make'.
# cmake/tool-installed.tmpl is used to build the binary 'nominatim'. Inside
# there is a call to set the variable value below.
GIT_COMMIT_HASH : Optional[str] = None
def parse_version(version: str) -> NominatimVersion:
""" Parse a version string into a version consisting of a tuple of
four ints: major, minor, patch level, database patch level
This is the reverse operation of `version_str()`.
"""
parts = version.split('.')
return NominatimVersion(*[int(x) for x in parts[:2] + parts[2].split('-')])
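# Round-trip illustration of the version helpers above:
if __name__ == '__main__':
    v = parse_version('4.4.99-1')
    assert v == NOMINATIM_VERSION
    print(str(v), v.release_version())   # -> 4.4.99-1 4.4.99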