Compare commits


175 Commits

Author SHA1 Message Date
Sarah Hoffmann
e943a2c8a4 prepare release 4.0.2 2023-02-20 17:41:33 +01:00
Sarah Hoffmann
95958458c6 harmonize flags for PHP's htmlspecialchars 2023-02-20 17:33:53 +01:00
Sarah Hoffmann
3c703c3f14 adapt PHP tests for debug output 2023-02-20 17:33:10 +01:00
Sarah Hoffmann
cb66887c3b properly encode special HTML characters in debug mode 2023-02-20 17:33:08 +01:00
Sarah Hoffmann
e56add9888 prepare 4.0.1 release 2021-11-22 14:18:54 +01:00
Sarah Hoffmann
9628df3031 Merge pull request #2528 from lonvia/allow-french-extra-housenumbers
Don't penalize French 'bis' housenumbers
2021-11-21 10:53:20 +01:00
Sarah Hoffmann
423f338d04 Merge pull request #2526 from lonvia/docs-moving-database
Add a section about moving the database to another machine
2021-11-19 21:14:53 +01:00
Sarah Hoffmann
3a2597e5c4 don't penalize French 'bis' housenumbers
House numbers of the form '9 bis' are common in France, so
be a bit more lenient before adding penalties to house numbers
with letters in them.

Fixes #2527.
2021-11-19 21:12:17 +01:00
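The rule described here can be sketched in a few lines of Python. This is only an illustration with invented names, not the actual Nominatim code:

```python
import re

# Plain digits, or digits followed by a French ordinal suffix such as
# '9 bis', stay unpenalized; other letters in a house number still add
# a penalty proportional to their count.
FRENCH_SUFFIX = re.compile(r'^\d+\s*(bis|ter|quater)$', re.IGNORECASE)

def housenumber_penalty(hnr: str) -> float:
    if hnr.isdigit() or FRENCH_SUFFIX.match(hnr):
        return 0.0
    return 0.1 * sum(c.isalpha() for c in hnr)

assert housenumber_penalty('9 bis') == 0.0
assert housenumber_penalty('9abc') > 0.0
```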
Sarah Hoffmann
641f261495 Merge pull request #2525 from lonvia/fix-replication-indexer
Fix instantiation of indexer for replication
2021-11-19 16:16:30 +01:00
Sarah Hoffmann
5884a6e7a6 add a section about moving the database to another machine 2021-11-19 16:11:32 +01:00
Sarah Hoffmann
10e979e841 only instantiate indexer once for replication
Also makes sure that the indexer object exists wherever it is needed.

See #2518.
2021-11-19 14:48:58 +01:00
Sarah Hoffmann
8dc1441635 Merge pull request #2517 from lonvia/transliteration-special-chars
ICU: avoid non-alphanumerical characters in transliteration
2021-11-11 07:42:42 +01:00
Sarah Hoffmann
c79dcfad9a make sure housenumbers are properly quoted 2021-11-10 20:44:28 +01:00
Sarah Hoffmann
1886952666 avoid special characters in word tokens
Transliterated tokens should consist only of ASCII letters
and numbers. Avoid any other characters.
2021-11-10 17:14:13 +01:00
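A minimal sketch of that constraint, assuming nothing about the real tokenizer code beyond what the message says:

```python
import re

# Keep only ASCII letters, digits and spaces in a transliterated token;
# any other character is dropped rather than written to the word table.
NON_TOKEN_CHARS = re.compile(r'[^a-z0-9 ]')

def clean_token(transliterated: str) -> str:
    return NON_TOKEN_CHARS.sub('', transliterated.lower()).strip()

print(clean_token('st.-petersburg'))  # -> 'stpetersburg'
```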
Sarah Hoffmann
7326b246b7 Merge pull request #2516 from lonvia/test-for-website-dir
Better error reporting when API script does not exist
2021-11-10 13:27:09 +01:00
Sarah Hoffmann
345c812e43 better error reporting when API script does not exist
Check if the API script exists on the expected location before
running php-cli. This way we can add a useful hint about the
project directory.

Fixes #2513.
2021-11-10 11:58:20 +01:00
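The check can be pictured like this; the helper name and directory layout are illustrative assumptions, not the actual CLI code:

```python
from pathlib import Path

def run_api_script(project_dir: Path, endpoint: str) -> None:
    # Verify the script exists before handing it to php-cli, so the user
    # gets a hint about the project directory instead of a PHP error.
    script = project_dir / 'website' / f'{endpoint}.php'
    if not script.is_file():
        raise SystemExit(f"API script not found: {script}\n"
                         "Did you run the command from your project directory?")
    # ... invoke php-cli with the script here ...
```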
Sarah Hoffmann
fd4ba3989e Merge pull request #2511 from lonvia/fix-combination-error-needs-address
Fix boolean combination of NeedsAddress flag
2021-11-06 12:11:55 +01:00
Sarah Hoffmann
e2d2571ad0 fix combination of NeedsAddress flag
When dealing with multiple partial terms, only keep the
flag, when all partial terms are so frequent as to need
an address.

Fixes #2510.
2021-11-05 22:18:37 +01:00
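In other words, the flag must be combined with a logical AND over all partials. A toy version of the rule, with invented names:

```python
from dataclasses import dataclass

@dataclass
class Partial:
    word: str
    needs_address: bool  # True when the partial is too frequent to search alone

def needs_address(partials: list) -> bool:
    # One sufficiently rare partial makes the search selective enough,
    # so the combined flag is only kept when *all* partials need an address.
    return all(p.needs_address for p in partials)

assert needs_address([Partial('rue', True), Partial('berlin', False)]) is False
```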
Sarah Hoffmann
d479a0585d prepare release 4.0.0 2021-11-02 20:27:55 +01:00
Sarah Hoffmann
addfae31b6 fix typo 2021-11-02 11:09:17 +01:00
Sarah Hoffmann
ccf61db726 Merge pull request #2502 from lonvia/improve-development-documentation
Extend developer's documentation
2021-11-01 16:12:23 +01:00
Sarah Hoffmann
5b86b2078a docs: add overview over indexing 2021-11-01 11:04:03 +01:00
Sarah Hoffmann
a069479340 docs: section about database layout
Replaces the import description, which by now basically
covered only the table layout.
2021-10-29 12:03:22 +02:00
Sarah Hoffmann
d11bf9288e Merge pull request #2498 from lonvia/ordering-for-unlisted-place-results
Include unlisted places in ordering by housenumber
2021-10-28 15:28:47 +02:00
Sarah Hoffmann
86eeb4d2ed Merge pull request #2497 from lonvia/docs-maintenance
docs: add new maintenance section
2021-10-28 11:33:34 +02:00
Sarah Hoffmann
2275fe59ab include unlisted places in ordering by housenumber
When ordering results by the fact that they have a housenumber,
also take cases into account where the housenumber is on the
place itself. This may happen when the search includes the name
of the place and the housenumber or for addr:place addresses
where the place is unlisted.
2021-10-28 11:27:31 +02:00
Sarah Hoffmann
48be8c33ba docs: add new maintenance section
currently used for postcode updates, word count updates and
deleted relations.
2021-10-28 09:22:37 +02:00
Sarah Hoffmann
d3d07128b2 Merge pull request #2495 from lonvia/fix-normalization-in-php
ICU: use correct normalization during search
2021-10-27 14:40:42 +02:00
Sarah Hoffmann
37eeccbf4c ICU: use normalization from config in PHP
The TERM_NORMALIZATION config option is no longer applicable.
That was already documented but not yet implemented.
2021-10-27 11:32:44 +02:00
Sarah Hoffmann
1722fc537f bdd: add tests for non-latin scripts 2021-10-26 17:29:03 +02:00
Sarah Hoffmann
b240b182cb Merge pull request #2493 from lonvia/handle-frequent-partials
Tune search queries with frequent partial words
2021-10-26 17:00:43 +02:00
Sarah Hoffmann
c0f347fc8c adapt BDD tests to stricter partial search 2021-10-26 15:52:57 +02:00
Sarah Hoffmann
53dbe58ada do not count words when in reverse-only mode 2021-10-26 12:00:13 +02:00
Sarah Hoffmann
2c4b798f9b further refactor setup to keep function small 2021-10-26 12:00:13 +02:00
Sarah Hoffmann
1cf14a8e94 searches for house numbers must have an address 2021-10-26 12:00:13 +02:00
Sarah Hoffmann
4864bf1509 disallow search for partials without address
Very frequent partial terms take too long to look up and
do not return any valuable results unless the search is
further narrowed down by an address.
2021-10-26 12:00:13 +02:00
Sarah Hoffmann
9934421442 make word count computation part of the import
Accurate word counts are now essential when using
the ICU tokenizer and don't hurt for the legacy one.

Adds about an hour to the import time.
2021-10-26 12:00:13 +02:00
Sarah Hoffmann
d7267c1603 actions: move ICU tests into its own run 2021-10-26 11:59:13 +02:00
Sarah Hoffmann
5c778c6d32 Merge pull request #2486 from lonvia/fix-special-phrases
Fix parsing of operator in special phrases
2021-10-25 21:45:08 +02:00
Sarah Hoffmann
85797acf1e ICU: add an index over word_ids
Needed for keyword lookup in the details response.
2021-10-25 21:33:27 +02:00
Sarah Hoffmann
c4f5c11a4e be case-insensitive about special phrase operator 2021-10-25 19:51:20 +02:00
Sarah Hoffmann
5a1c3dbea3 fix parsing of operator in special phrases
Because of unstripped input, the operators wouldn't match.
2021-10-25 19:46:30 +02:00
Sarah Hoffmann
8e439d3dd9 Merge pull request #2484 from lonvia/fix-index-use
Reverse: add index hints
2021-10-25 17:20:42 +02:00
Sarah Hoffmann
9ebf921c53 Merge pull request #2483 from lonvia/fix-warming
Fix warming for ICU tokenizer
2021-10-25 16:21:36 +02:00
Sarah Hoffmann
7bd9094aaa reverse: add index hints
The fairly complex where condition of idx_placex_geometry_placenode
won't always be matched by the query planner if the condition
part doesn't appear verbatim in the query.

Fixes #2480.
2021-10-25 15:01:03 +02:00
Sarah Hoffmann
16cc395f78 fix warming for ICU tokenizer
Running the warm-up search requests requires querying
the most frequent words. This must be done via the tokenizer
to honor the different formats of the word table.
2021-10-25 13:08:16 +02:00
Sarah Hoffmann
13e7398566 allow relative paths for log files 2021-10-25 10:26:05 +02:00
Sarah Hoffmann
8b90ee4364 Merge pull request #2476 from lonvia/harmonize-configuration-file-settings
Standardize handling of file names in configuration values
2021-10-24 10:57:48 +02:00
Sarah Hoffmann
1098ab732f allow relative paths for flatnode file 2021-10-22 17:32:51 +02:00
Sarah Hoffmann
507fdd4f40 switch IMPORT_STYLE to use generic file search
Allows relative paths wrt project directory.
2021-10-22 16:49:57 +02:00
Sarah Hoffmann
0ae8d7ac08 have ADDRESS_LEVEL_CONFIG use load_sub_configuration
This means that relative paths now are looked up in the
project directory.
2021-10-22 16:36:52 +02:00
Sarah Hoffmann
c77df2d1eb replace NOMINATIM_PHRASE_CONFIG with command line option 2021-10-22 14:41:14 +02:00
Sarah Hoffmann
cefae021db doc: clarify relative paths for tokenizer config 2021-10-21 16:38:06 +02:00
Sarah Hoffmann
771aee8cd8 Merge pull request #2475 from lonvia/catchup-mode
Add catch-up mode to replication and extend documentation for updating
2021-10-21 16:21:58 +02:00
Sarah Hoffmann
2d13d8b3b6 extend documentation for updating database
Explains the different modes and adds hints for
setting up a systemd job.
2021-10-21 12:14:47 +02:00
Sarah Hoffmann
c1fa70639b add new replication mode catch-up
This mode gets updates until the server reports no new diffs
anymore.

Also adds additional indexing, when the main indexing step left
a couple of objects to process. This happens only when the
next update is expected to be more than 40min away.
2021-10-20 22:05:15 +02:00
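The core of such a catch-up loop might look as follows; the two callbacks stand in for the real replication machinery:

```python
def catch_up(fetch_next_diff, apply_diff) -> None:
    # Keep applying diffs until the server reports no newer data.
    while True:
        diff = fetch_next_diff()
        if diff is None:   # caught up with the server
            return
        apply_diff(diff)
```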
Sarah Hoffmann
12643c5986 run Tiger import with parallel threads per default 2021-10-19 15:00:26 +02:00
Sarah Hoffmann
a0f5613a23 Merge pull request #2472 from lonvia/word-count-computation
Fix word count computation for ICU tokenizer
2021-10-19 14:58:57 +02:00
Sarah Hoffmann
824562357b adapt tests for new word count mechanism 2021-10-19 12:03:48 +02:00
Sarah Hoffmann
ec7184c533 icu: no longer precompute terms
The ICU analyzer no longer drops frequent partials, so it is no
longer necessary to know the frequencies in advance.
2021-10-19 11:52:28 +02:00
Sarah Hoffmann
e8e2502e2f make word recount a tokenizer-specific function 2021-10-19 11:21:16 +02:00
Sarah Hoffmann
c86cfefc48 Merge pull request #2471 from lonvia/update-install-rules
Reorganise, update and extend documentation
2021-10-19 09:11:16 +02:00
Sarah Hoffmann
2635fe8b4c docs: fix more links 2021-10-18 17:26:14 +02:00
Sarah Hoffmann
632436d54d docs: refer to our new Settings chapter in the import instructions 2021-10-18 17:02:52 +02:00
Sarah Hoffmann
74be6828dd check and fix all links in documentation 2021-10-18 16:53:24 +02:00
Sarah Hoffmann
f4acfed48f add extended documentation of settings 2021-10-18 16:30:52 +02:00
Sarah Hoffmann
91e1c1bea8 docs: update overview pages 2021-10-18 09:04:06 +02:00
Sarah Hoffmann
bbb9a41ea4 docs: move place ranking into customization part 2021-10-18 09:04:06 +02:00
Sarah Hoffmann
f6418887b2 docs: nominatim-ui has a new place for custom config 2021-10-18 09:04:06 +02:00
Sarah Hoffmann
a3f8a097a1 docs: move import style description to customize section 2021-10-18 09:04:06 +02:00
Sarah Hoffmann
751563644f docs: make customization chapter a separate section 2021-10-18 09:04:01 +02:00
Sarah Hoffmann
e52b801cd0 fix typo 2021-10-18 09:03:07 +02:00
Sarah Hoffmann
445a6428a6 docs: remove the development warning for ICU tokenizer 2021-10-18 09:03:07 +02:00
Sarah Hoffmann
d59b26dad7 docs: add a warning about using --no-updates with TIGER data 2021-10-18 09:03:07 +02:00
Sarah Hoffmann
47417d1871 update and extend man page
Provide extended descriptions for most subcommands.
2021-10-18 09:03:07 +02:00
Sarah Hoffmann
381aecb952 rename manual directory to man
Avoids confusion between 'docs' and 'manual'.
2021-10-18 09:03:07 +02:00
Sarah Hoffmann
45344575c6 add munin scripts and ICU subrules to installation 2021-10-18 09:03:07 +02:00
Sarah Hoffmann
83381625bd Merge pull request #2469 from lonvia/fix-tablespace-assignment
Fix template expressions for tablespaces
2021-10-15 18:20:43 +02:00
Sarah Hoffmann
552fb16cb2 fix template expressions for tablespaces 2021-10-15 15:11:09 +02:00
Sarah Hoffmann
75c631f080 Merge pull request #2450 from mtmail/tiger-data-2021
US TIGER data 2021 released
2021-10-11 19:22:15 +02:00
Sarah Hoffmann
e2464fdf62 Merge pull request #2465 from lonvia/use-spgist-index
Use SP-GIST for building index
2021-10-11 10:48:44 +02:00
Sarah Hoffmann
9ff98073db remove outdated country_languages.php 2021-10-10 21:58:43 +02:00
Sarah Hoffmann
98ee5def37 add recommendation for Postgis 3+ 2021-10-10 21:55:38 +02:00
Sarah Hoffmann
3649487f5e use SP-GIST index for building index where available
Point-in-polygon queries are much faster with a SP-GIST geometry
index, so use that for the index used to check if a housenumber
is inside a building.

Only available with Postgis 3. There is an automatic fallback to
GIST for Postgis 2.
2021-10-10 21:55:38 +02:00
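The switch could be expressed roughly like this; the index name and condition are illustrative, not copied from the actual SQL:

```python
def building_index_sql(postgis_major: int) -> str:
    # SP-GIST speeds up point-in-polygon lookups on PostGIS 3+;
    # older PostGIS versions fall back to a plain GIST index.
    method = 'SPGIST' if postgis_major >= 3 else 'GIST'
    return ('CREATE INDEX idx_placex_buildings ON placex '
            f'USING {method} (geometry) WHERE rank_address = 30')
```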
Sarah Hoffmann
4b007ae740 Merge pull request #2460 from lonvia/multiple-analyzers
Add support for multiple token analyzers
2021-10-09 14:41:09 +02:00
Sarah Hoffmann
6c79a60e19 add documentation for new configuration of ICU tokenizer 2021-10-07 11:55:53 +02:00
Sarah Hoffmann
2a94bfc703 fix argument description for check_database 2021-10-07 09:49:13 +02:00
Sarah Hoffmann
299934fd2a reorganize and complete tests around generic token analysis 2021-10-06 17:03:37 +02:00
Sarah Hoffmann
b18d042832 add tests for sanitizer tagging language 2021-10-06 12:29:25 +02:00
Sarah Hoffmann
97a10ec218 apply variants by languages
Adds a tagger for names by language so that the analyzer of that
language is used. Variants are thus applied only to names in
the specific language and only to name tags, no longer to
reference-like tags.
2021-10-06 11:09:54 +02:00
Sarah Hoffmann
d35400a7d7 use analyser provided in the 'analyzer' property
Implements per-name choice of analyzer. If a non-default
analyzer is chosen, then the 'word' identifier is extended
with the name of the analyzer, so that we still have unique
items.
2021-10-05 14:10:32 +02:00
Sarah Hoffmann
92f6ec2328 remove support for properties on variants
Those are not going to be used in the near future, so no need to
carry that code around just now.
2021-10-05 10:29:36 +02:00
Sarah Hoffmann
9ba2019470 precompute replacements while loading configuration 2021-10-05 10:20:08 +02:00
Sarah Hoffmann
c171d88194 move parsing of token analysis config to analyzer
Adds a second callback for the analyzer which is responsible
for parsing the configuration rules and converting it to
whatever format necessary. This way, each analyzer implementation
can define its own configuration rules.
2021-10-04 18:31:58 +02:00
Sarah Hoffmann
7cfcbacfc7 make token analyzers configurable modules
Adds a mandatory section 'analyzer' to the token-analysis entries
which defines which analyzer to use. Currently there is exactly
one, 'generic', which implements the former ICUNameProcessor.
2021-10-04 17:37:34 +02:00
Sarah Hoffmann
52847b61a3 extend ICU config to accommodate multiple analyzers
Adds parsing of multiple variant lists from the configuration.
Every entry except one must have a unique 'id' parameter to
distinguish the entries. The entry without an id is considered
the default. Currently only the list without an id is used
for analysis.
2021-10-04 16:40:28 +02:00
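A sketch of that validation rule, with a hypothetical parser function:

```python
def parse_analyzer_rules(rules: list) -> dict:
    # Map analyzer id -> rule; the entry without an 'id' is the default.
    analyzers = {}
    for rule in rules:
        key = rule.get('id')
        if key in analyzers:
            raise ValueError(f"duplicate token-analysis id: {key!r}")
        analyzers[key] = rule
    if None not in analyzers:
        raise ValueError("missing default token-analysis entry (one without 'id')")
    return analyzers
```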
Sarah Hoffmann
5a36559834 move flatten_config_list into config module
For general usage by other modules.
2021-10-04 11:56:54 +02:00
Sarah Hoffmann
19d4e047f6 Merge pull request #2458 from lonvia/add-tokenizer-preprocessing
Add a "sanitation" step for name and address tags before token processing
2021-10-01 21:53:34 +02:00
Sarah Hoffmann
6b348d43c6 replace test variable for PG env tests
'tty' was removed in PG14 and causes an error.
2021-10-01 12:27:24 +02:00
Sarah Hoffmann
732cd27d2e add unit tests for new sanitizer functions 2021-10-01 12:27:24 +02:00
Sarah Hoffmann
8171fe4571 introduce sanitizer step before token analysis
Sanitizer functions allow transforming name and address tags before
they are handed to the tokenizer. These transformations are visible
only to the tokenizer and thus only have an influence on the
search terms and address match terms for a place.

Currently two sanitizers are implemented, which are responsible for
splitting names with multiple values and removing bracket additions.
Both were previously hard-coded in the tokenizer.
2021-10-01 12:27:24 +02:00
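A simplified rendition of the two sanitizers, assuming names are passed around as a plain tag-to-value dictionary:

```python
import re

def split_multivalue(names: dict) -> dict:
    # Split 'a;b' style values into separate name entries.
    out = {}
    for tag, value in names.items():
        for i, part in enumerate(v.strip() for v in value.split(';')):
            out[tag if i == 0 else f'{tag}:{i}'] = part
    return out

def strip_brackets(names: dict) -> dict:
    # Remove bracketed additions like 'Museumsinsel (Berlin)'.
    return {tag: re.sub(r'\s*\(.*\)', '', value).strip()
            for tag, value in names.items()}

names = {'name': 'Museumsinsel (Berlin);Museum Island'}
print(strip_brackets(split_multivalue(names)))
# -> {'name': 'Museumsinsel', 'name:1': 'Museum Island'}
```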
Sarah Hoffmann
16daa57e47 unify ICUNameProcessorRules and ICURuleLoader
There is no need for the additional layer of indirection that
the ICUNameProcessorRules class adds. The ICURuleLoader can
fill the database properties directly.
2021-10-01 12:27:24 +02:00
Sarah Hoffmann
5e5addcdbf fix typo 2021-09-29 14:16:09 +02:00
Sarah Hoffmann
be65c8303f export more data for the tokenizer name preparation
Adds class, type, country and rank to the exported information
and removes the rather odd hack for countries. Whether a place
represents a country boundary can now be computed by the tokenizer.
2021-09-29 11:54:14 +02:00
Sarah Hoffmann
231250f2eb add wrapper class for place data passed to tokenizer
This is mostly for convenience and documentation purposes.
2021-09-29 11:54:07 +02:00
Sarah Hoffmann
d44a428b74 Merge pull request #2455 from lonvia/adjust-address-levels-slovakia
Adjust address levels for boundaries in Slovakia
2021-09-28 11:21:08 +02:00
Sarah Hoffmann
40f9d52ad8 Merge pull request #2454 from lonvia/sort-out-token-assignment-in-sql
ICU tokenizer: switch match method to using partial terms
2021-09-28 09:45:15 +02:00
Sarah Hoffmann
7f3b05c179 adjust address levels for boundaries in Slovakia
Levels chosen according to the OSM wiki. Mainly moves admin_level 6
to county level and admin_level 8 to city/town level. Higher
levels are adjusted accordingly.

Fixes #2453.
2021-09-27 23:32:11 +02:00
Sarah Hoffmann
09c9fad6c3 adapt tests to new ICU address token handling 2021-09-27 17:36:23 +02:00
Sarah Hoffmann
bb18479d5b remove unused parameter 2021-09-27 14:58:43 +02:00
Sarah Hoffmann
779ea8ac62 Merge pull request #2452 from lonvia/update-houses-on-street-name-change
Force update of surrounding houses when street or place name changes
2021-09-27 14:55:50 +02:00
Sarah Hoffmann
bd7c7ddad0 icu tokenizer: switch to matching against partial names
When matching address parts from addr:* tags against place names,
the address names were so far converted to full names and compared
to the place names. This can become problematic with the new
ICU tokenizer once we introduce creation of different variants
depending on the place name context. It wouldn't be clear which
variant to produce to get a match, so we would have to create all of
them. To work around this issue, switch to using the partial terms
for matching. This introduces a larger fuzziness between matches but
that shouldn't be a problem because matching is always geographically
restricted.

The search terms created for address parts have a different problem:
they are already created before we even know if they are going to be
used. This can lead to spurious entries in the word table, which slows
down searching. This problem can also be circumvented by using only
partial terms for the search terms. In terms of searching that means
that the address terms would not get the full-word boost, but given
that the case where an address part does not exist as an OSM object
should be the exception, this is likely acceptable.
2021-09-27 11:36:19 +02:00
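Reduced to its essence, matching on partials means comparing token sets rather than full names; a toy version under that assumption:

```python
def name_matches(addr_partials: set, place_partials: set) -> bool:
    # A non-empty set of address partials must be contained in the
    # place's partials; geographic restriction keeps fuzziness in check.
    return bool(addr_partials) and addr_partials <= place_partials

assert name_matches({'rue', 'nationale'}, {'la', 'rue', 'nationale'})
```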
Sarah Hoffmann
c6fdcf9b0d adapt documentation for SQL tokenizer interface 2021-09-27 11:36:19 +02:00
Sarah Hoffmann
59fe74ddf6 move name matching into tokenizer module
Instead of requesting the match tokens from the tokenizer
when looking for parent streets/places and address parts,
hand in the saved tokens and ask if they match. This gives
the tokenizer more freedom to decide how name matching
should be done.
2021-09-27 11:36:19 +02:00
Sarah Hoffmann
6d7c067461 force update on rank30 children when place name changes
Name changes may have an effect on parenting. Don't update
surrounding rank30 objects with addr:place tags as this is
potentially too expensive.
2021-09-27 11:04:17 +02:00
Sarah Hoffmann
316205e455 force update of surrounding houses when street name changes
When the street changes its name then this may cause changes
in the parenting of rank-30 objects with an addr:street
tag.

Fixes #2242.
2021-09-27 10:22:41 +02:00
marc tobias
834ae0a93f US TIGER data 2021 released 2021-09-25 00:05:17 +02:00
Sarah Hoffmann
d562f11298 slightly increase radius to look for postcodes 2021-09-24 23:56:42 +02:00
Sarah Hoffmann
972628c751 Merge pull request #2449 from lonvia/address-ranking-spain
Adjust address ranks for Spain
2021-09-24 22:48:21 +02:00
Sarah Hoffmann
09b1db63f4 adjust address ranks for Spain
Adjusts levels for boundaries according to the list on
https://wiki.openstreetmap.org/wiki/Tag:boundary%3Dadministrative

* no admin_level 5, so drop that from addresses
* admin_level 6 has the province
* admin_level 7 has the county when it exists

Also reranks place=province so that it matches up with
admin_level 6 and introduces place=civil_parish which
is used as a place node for some admin_level=9 boundaries
in Galicia.
2021-09-24 18:39:44 +02:00
Sarah Hoffmann
e9d54f752c Merge pull request #2447 from lonvia/fix-dynamic-address-assignment
Fix dynamic assignment of address parts
2021-09-19 15:57:28 +02:00
Sarah Hoffmann
c335025167 CI: install locale for CentOS 2021-09-19 13:49:11 +02:00
Sarah Hoffmann
2b2109c89a Remove the installation warning
Installation has become a lot easier.
2021-09-19 13:01:32 +02:00
Sarah Hoffmann
56124546a6 fix dynamic assignment of address parts
A boolean check for dynamic changes of address parts is not
sufficient. The order of choice should be:

 1. an addr:* part matches the name
 2. the address part surrounds the object
 3. the address part was declared as isaddress

The implementation uses a slightly different ordering
to avoid geometry checks unless strictly necessary (isaddress
is false and no matching address).

See #2446.
2021-09-19 12:34:39 +02:00
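The described ordering might be coded like this; the attribute names and the shapely-style geometry call are illustrative assumptions:

```python
def is_address_part(part, place) -> bool:
    if part.name in place.addr_tags.values():  # 1. an addr:* part matches the name
        return True
    if part.isaddress:                         # 3. declared as isaddress
        return True
    # 2. the geometry check runs last, as it is the expensive test
    return part.geometry.contains(place.centroid)
```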
Sarah Hoffmann
336258ecf8 Merge pull request #2440 from lonvia/generic-config-loader
Add generic loader for YAML configuration files
2021-09-04 17:41:15 +02:00
Sarah Hoffmann
b894d2c04a fix indent 2021-09-04 10:30:35 +02:00
Sarah Hoffmann
8e1d4818ac use yaml config loader for country info 2021-09-04 00:22:55 +02:00
Sarah Hoffmann
28c98584c1 add tests for generic YAML config reader 2021-09-03 22:31:30 +02:00
Sarah Hoffmann
1c42780bb5 introduce generic YAML config loader
Adds a function to the Configuration class to load a YAML
file. This means that searching for the file is generalised
and works the same now for all configuration files. Changes
the search logic, so that it is always possible to have a
custom version of the configuration file in the project
directory.

Move ICU tokenizer to use new load function.
2021-09-03 18:20:07 +02:00
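A condensed sketch of that lookup order (not the real Configuration class, and assuming PyYAML for parsing):

```python
from pathlib import Path

import yaml

def load_sub_configuration(project_dir: Path, default_dir: Path, name: str):
    # The project directory is searched first, so a customized copy of any
    # configuration file transparently overrides the shipped default.
    for candidate in (project_dir / name, default_dir / name):
        if candidate.is_file():
            return yaml.safe_load(candidate.read_text())
    raise FileNotFoundError(name)
```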
Sarah Hoffmann
18554dfed7 Merge pull request #2437 from lonvia/tweak-ranking-searches
Some more tweaks for search interpretation
2021-09-03 14:16:23 +02:00
Sarah Hoffmann
2e493fec46 Merge pull request #2436 from lonvia/country-configuration
Move configuration of default languages into a configuration file
2021-09-03 08:55:36 +02:00
Sarah Hoffmann
98c2e08add reduce penalty for special searches by name
Additional penalty for special terms with operator None
should only go to near searches. To reduce the number
of produced searches, restrict the none operator to
appear only in conjunction with the name.
2021-09-03 08:50:38 +02:00
Sarah Hoffmann
94d3dee369 further increase penalty on housenumbers without numbers
Make the penalty dependent on the length of the token:
no penalty for one-letter house numbers and an increasing one
for more letters.
2021-09-02 18:11:49 +02:00
Sarah Hoffmann
7e7dd769fd remove language and partition from name import 2021-09-02 14:41:11 +02:00
Sarah Hoffmann
79da96b369 read partition and languages from config file 2021-09-02 14:41:11 +02:00
Sarah Hoffmann
78fcabade8 move country name generation to country_info module 2021-09-02 14:41:11 +02:00
Sarah Hoffmann
284645f505 move generation of country tables in own module 2021-09-02 14:41:11 +02:00
Sarah Hoffmann
0b349761a8 add country configuration
The new configuration saves the default language(s) originally
maintained in the OSM wiki as well as the partition information.
2021-09-02 14:41:11 +02:00
Sarah Hoffmann
d18794931a Merge pull request #2435 from lonvia/simplified-to-traditional-chinese
icu: normalise simplified to traditional chinese
2021-08-31 15:29:26 +02:00
Sarah Hoffmann
b7d4ff3201 icu: normalise simplified to traditional chinese
The conversion is unambiguous in most cases, so the
information loss is minimal.
2021-08-31 11:18:34 +02:00
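ICU ships this conversion as a standard transform, so the effect can be reproduced directly, assuming the PyICU package is available:

```python
from icu import Transliterator

# ICU's built-in Simplified-to-Traditional Chinese transform.
trans = Transliterator.createInstance('Simplified-Traditional')
print(trans.transliterate('广州'))  # -> '廣州'
```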
Sarah Hoffmann
4c6d674e03 Merge pull request #2434 from lonvia/vagrant-scripts-in-actions
Test installation instructions via CI
2021-08-29 10:11:59 +02:00
Sarah Hoffmann
2c97af8021 CI: use packaged source also for test runs 2021-08-24 10:10:01 +02:00
Sarah Hoffmann
832f75a55e CI: unify jobs for different vagrant scripts 2021-08-24 10:10:01 +02:00
Sarah Hoffmann
4e77969545 add workflow for centos 8 2021-08-24 10:10:01 +02:00
Sarah Hoffmann
6ebbbfee61 CI: use vagrant scripts for import tests
Use vanilla docker images of Ubuntu and leave the setup
to the vagrant scripts. Then do the usual import tests.

Also fixes a couple of issues found with the scripts.
2021-08-24 10:10:01 +02:00
Sarah Hoffmann
0fabeefc3e Merge pull request #2432 from Mastercuber/patch-1
Added postcode
2021-08-22 09:32:31 +02:00
Mastercuber
c70d72f06b Added postcode
Added postcode to the list of addressdetails
2021-08-22 02:52:41 +02:00
Sarah Hoffmann
cc141bf1a5 Add link to fixthemap to issue template 2021-08-21 20:36:16 +02:00
Sarah Hoffmann
199532c802 Merge pull request #2429 from lonvia/place-name-to-admin-boundary
Indexing: move linking of places to the preparation stage
2021-08-21 10:21:39 +02:00
Sarah Hoffmann
28ee3d0949 move linking of places to the preparation stage
Linked places may bring in extra names. These names need to be
processed by the tokenizer. That means that the linking needs
to be done before the data is handed to the tokenizer. Move finding
the linked place into the preparation stage and update the name
fields. Everything else is still done in the indexing stage.
2021-08-20 22:44:17 +02:00
Sarah Hoffmann
925195725d Merge pull request #2428 from lonvia/rename-icu-tokenizer
Rename legacy_icu tokenizer to icu tokenizer
2021-08-18 15:02:19 +02:00
Sarah Hoffmann
f6d22df76e adapt CI workflow to new tokenizer name 2021-08-18 09:08:20 +02:00
Sarah Hoffmann
118858a55e rename legacy_icu tokenizer to icu tokenizer
The new icu tokenizer is now no longer compatible with the old
legacy tokenizer in terms of data structures. Therefore there
is also no longer a need to refer to the legacy tokenizer in the
name.
2021-08-17 23:11:47 +02:00
Sarah Hoffmann
656c1291b1 Merge pull request #2427 from lonvia/remove-us-states-special-casing
Move US state hack into legacy tokenizer
2021-08-17 21:55:32 +02:00
Sarah Hoffmann
f00b8dd1c3 move special hack for US states to legacy tokenizer
The hack for IL, AL and LA is only needed because these abbreviations
are removed by the legacy tokenizer as a stop word. There is no need
to keep the hack for future tokenizers. Move it therefore to the
token extraction function.
2021-08-17 14:28:55 +02:00
Sarah Hoffmann
5f2b9e317a add tests for US state hacks
IL, AL and LA are replaced with the US state in Geocode because
the old tokenizer would simply remove the abbreviations otherwise.
2021-08-17 10:49:07 +02:00
Sarah Hoffmann
4ae5ba7fc4 Merge pull request #2425 from lonvia/tokenizer-documentation
Introduce official Tokenizer API
2021-08-17 09:38:03 +02:00
Sarah Hoffmann
3656eed9ad add mkdocstrings requirement for building docs
mkdocstrings also needs access to the Python sources, so set
a PYTHONPATH accordingly. This makes running mkdocs directly
a bit awkward, therefore add a `make serve-doc` target.
2021-08-16 11:51:49 +02:00
Sarah Hoffmann
2e82a6ce03 docs: extend explanation of query phrase 2021-08-16 11:51:49 +02:00
Sarah Hoffmann
c4b8a3b768 add documentation for PHP part of tokenizer 2021-08-16 11:51:49 +02:00
Sarah Hoffmann
1147b83b22 php: make word list a first-class object
This separates the logic of creating word sets from the Phrase
class. A tokenizer may now derive the word sets any way it
likes. The SimpleWordList class provides a standard implementation
for splitting phrases on spaces.
2021-08-16 11:51:49 +02:00
Sarah Hoffmann
0fb8eade13 remove country restriction from tokenizer
Restricting tokens due to the search context is better done in
the generic search part instead of repeating the same test in
every tokenizer implementation.
2021-08-16 11:41:54 +02:00
Sarah Hoffmann
78d11fe628 document tokenizer SQL interface 2021-08-16 11:41:54 +02:00
Sarah Hoffmann
90b40fc3e6 define formal public Python interface for tokenizer
This introduces an abstract class for the Tokenizer/Analyzer
for documentation purposes.
2021-08-16 11:41:54 +02:00
Sarah Hoffmann
e25e268e2e docs: querying and tokenizers 2021-08-16 08:59:44 +02:00
Sarah Hoffmann
68bff31cc9 docs: add developer doc page for Tokenizer 2021-08-16 08:58:56 +02:00
Sarah Hoffmann
31d9545702 Merge pull request #2424 from lonvia/multi-country-import
Update instructions for importing multiple regions
2021-08-16 08:48:28 +02:00
Sarah Hoffmann
e449071a35 Merge pull request #2423 from hummeltech/patch-1
Fix old paths for `phpcs` when using `make test`
2021-08-15 22:00:50 +02:00
Sarah Hoffmann
23e3724abb ignore words without id for status 2021-08-15 21:59:36 +02:00
Sarah Hoffmann
75a5c7013f split up large setup function 2021-08-15 12:24:13 +02:00
Sarah Hoffmann
56d24085f9 port multi-region update scripts to nominatim tool
Also updates the documentation. For the simple case of just
importing multiple regions, provide simplified instructions
that use the new multi-file import feature.

Fixes #2365.
2021-08-14 23:55:48 +02:00
Sarah Hoffmann
95b82af42a update osm2pgsql to 1.5.1 2021-08-14 22:46:35 +02:00
Sarah Hoffmann
87dedde5d6 allow multiple files for the import command
The files are forwarded to osm2pgsql which is now able to merge
them correctly.
2021-08-14 21:42:21 +02:00
David Hummel
8b6489c60e Fix old paths for phpcs when using make test
These paths no longer exist since db3ced17bb, they are now all located under `lib-php`
2021-08-12 13:34:18 -07:00
Sarah Hoffmann
bf4f05fff3 Merge pull request #2413 from osm-search/helm-chart
Installation docs - link to Kubernetes install project
2021-08-08 11:09:36 +02:00
165 changed files with 8782 additions and 2811 deletions


@@ -7,6 +7,8 @@ assignees: ''
---
<!-- Note: this template is for reporting problems with searching. If you have found an issue with the data, you need to report/fix the issue directly in OpenStreetMap. See https://www.openstreetmap.org/fixthemap for details. -->
## What did you search for?
<!-- Please try to provide a link to your search. You can go to https://nominatim.openstreetmap.org and repeat your search there. If you originally found the issue somewhere else, please tell us what software/website you were using. -->
@@ -15,11 +17,11 @@ assignees: ''
## What result did you expect?
-**Is the result in the right place and just named wrongly?**
+**When the result in the right place and just named wrongly:**
<!-- Please tell us the display name you expected. -->
-**Is the result missing completely?**
+**When the result missing completely:**
<!-- Make sure that the data you are looking for is in OpenStreetMap. Provide a link to the OpenStreetMap object or if you cannot get it, a link to the map on https://openstreetmap.org where you expect the result to be.


@@ -3,7 +3,38 @@ name: CI Tests
on: [ push, pull_request ]
jobs:
create-archive:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
with:
submodules: true
- uses: actions/cache@v2
with:
path: |
data/country_osm_grid.sql.gz
key: nominatim-country-data-1
- name: Package tarball
run: |
if [ ! -f data/country_osm_grid.sql.gz ]; then
wget --no-verbose -O data/country_osm_grid.sql.gz https://www.nominatim.org/data/country_grid.sql.gz
fi
cd ..
tar czf nominatim-src.tar.bz2 Nominatim
mv nominatim-src.tar.bz2 Nominatim
- name: 'Upload Artifact'
uses: actions/upload-artifact@v2
with:
name: full-source
path: nominatim-src.tar.bz2
retention-days: 1
tests:
needs: create-archive
strategy:
matrix:
ubuntu: [18, 20]
@@ -22,10 +53,12 @@ jobs:
runs-on: ubuntu-${{ matrix.ubuntu }}.04
steps:
- uses: actions/checkout@v2
- uses: actions/download-artifact@v2
with:
submodules: true
path: Nominatim
name: full-source
- name: Unpack Nominatim
run: tar xf nominatim-src.tar.bz2
- name: Setup PHP
uses: shivammathur/setup-php@v2
@@ -39,18 +72,6 @@ jobs:
python-version: 3.6
if: matrix.ubuntu == 18
- name: Get Date
id: get-date
run: |
echo "::set-output name=date::$(/bin/date -u "+%Y%W")"
shell: bash
- uses: actions/cache@v2
with:
path: |
country_grid.sql.gz
key: nominatim-country-data-${{ steps.get-date.outputs.date }}
- uses: ./Nominatim/.github/actions/setup-postgresql
with:
postgresql-version: ${{ matrix.postgresql }}
@@ -65,8 +86,7 @@ jobs:
if: matrix.ubuntu == 20
- name: Install test prerequsites
-run: |
-pip3 install pylint==2.6.0 pytest pytest-cov behave==1.2.6
+run: pip3 install pylint==2.6.0 pytest pytest-cov behave==1.2.6
if: matrix.ubuntu == 18
- name: PHP linting
@@ -103,11 +123,6 @@ jobs:
working-directory: Nominatim/test/bdd
if: matrix.ubuntu == 18
- name: BDD tests (legacy_icu tokenizer)
run: |
behave -DREMOVE_TEMPLATE=1 -DBUILDDIR=$GITHUB_WORKSPACE/build -DTOKENIZER=legacy_icu --format=progress3
working-directory: Nominatim/test/bdd
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v1
with:
@@ -119,43 +134,35 @@ jobs:
verbose: true
if: matrix.ubuntu == 20
-import:
+icu-test:
needs: create-archive
strategy:
matrix:
-ubuntu: [18, 20]
+ubuntu: [20]
include:
- ubuntu: 18
postgresql: 9.5
postgis: 2.5
- ubuntu: 20
postgresql: 13
postgis: 3
pytest: py.test-3
php: 7.4
runs-on: ubuntu-${{ matrix.ubuntu }}.04
steps:
- uses: actions/checkout@v2
- uses: actions/download-artifact@v2
with:
submodules: true
path: Nominatim
name: full-source
- name: Get Date
id: get-date
run: |
echo "::set-output name=date::$(/bin/date -u "+%Y%W")"
shell: bash
- name: Unpack Nominatim
run: tar xf nominatim-src.tar.bz2
- uses: actions/cache@v2
- name: Setup PHP
uses: shivammathur/setup-php@v2
with:
path: |
country_grid.sql.gz
key: nominatim-country-data-${{ steps.get-date.outputs.date }}
- uses: actions/cache@v2
with:
path: |
monaco-latest.osm.pbf
key: nominatim-test-data-${{ steps.get-date.outputs.date }}
php-version: ${{ matrix.php }}
coverage: xdebug
tools: phpunit, phpcs, composer
- uses: actions/setup-python@v2
with:
@@ -166,52 +173,148 @@ jobs:
with:
postgresql-version: ${{ matrix.postgresql }}
postgis-version: ${{ matrix.postgis }}
- uses: ./Nominatim/.github/actions/build-nominatim
with:
ubuntu: ${{ matrix.ubuntu }}
- name: Clean installation
run: rm -rf Nominatim build
- name: Install test prerequsites
run: sudo apt-get install -y -qq python3-behave
if: matrix.ubuntu == 20
- name: Install test prerequsites
run: pip3 install behave==1.2.6
if: matrix.ubuntu == 18
- name: BDD tests (icu tokenizer)
run: |
behave -DREMOVE_TEMPLATE=1 -DBUILDDIR=$GITHUB_WORKSPACE/build -DTOKENIZER=icu --format=progress3
working-directory: Nominatim/test/bdd
install:
runs-on: ubuntu-latest
needs: create-archive
strategy:
matrix:
name: [Ubuntu-18, Ubuntu-20, Centos-8]
include:
- name: Ubuntu-18
flavour: ubuntu
image: "ubuntu:18.04"
ubuntu: 18
install_mode: install-nginx
- name: Ubuntu-20
flavour: ubuntu
image: "ubuntu:20.04"
ubuntu: 20
install_mode: install-apache
- name: Centos-8
flavour: centos
image: "centos:8"
container:
image: ${{ matrix.image }}
env:
LANG: en_US.UTF-8
defaults:
run:
shell: sudo -Hu nominatim bash --noprofile --norc -eo pipefail {0}
steps:
- name: Prepare container (Ubuntu)
run: |
export APT_LISTCHANGES_FRONTEND=none
export DEBIAN_FRONTEND=noninteractive
apt-get update -qq
apt-get install -y git sudo wget
ln -snf /usr/share/zoneinfo/$CONTAINER_TIMEZONE /etc/localtime && echo $CONTAINER_TIMEZONE > /etc/timezone
shell: bash
if: matrix.flavour == 'ubuntu'
- name: Prepare container (CentOS)
run: |
dnf update -y
dnf install -y sudo glibc-langpack-en
shell: bash
if: matrix.flavour == 'centos'
- name: Setup import user
run: |
useradd -m nominatim
echo 'nominatim ALL=(ALL:ALL) NOPASSWD: ALL' > /etc/sudoers.d/nominiatim
echo "/home/nominatim/Nominatim/vagrant/Install-on-${OS}.sh no $INSTALL_MODE" > /home/nominatim/vagrant.sh
shell: bash
env:
OS: ${{ matrix.name }}
INSTALL_MODE: ${{ matrix.install_mode }}
- uses: actions/download-artifact@v2
with:
name: full-source
path: /home/nominatim
- name: Install Nominatim
run: |
export USERNAME=nominatim
export USERHOME=/home/nominatim
export NOSYSTEMD=yes
export HAVE_SELINUX=no
tar xf nominatim-src.tar.bz2
. vagrant.sh
working-directory: /home/nominatim
- name: Prepare import environment
run: |
if [ ! -f monaco-latest.osm.pbf ]; then
wget --no-verbose https://download.geofabrik.de/europe/monaco-latest.osm.pbf
fi
mkdir data-env
cd data-env
shell: bash
mv Nominatim/test/testdb/apidb-test-data.pbf test.pbf
rm -rf Nominatim
mkdir data-env-reverse
working-directory: /home/nominatim
- name: Prepare import environment (CentOS)
run: |
sudo ln -s /usr/local/bin/nominatim /usr/bin/nominatim
echo NOMINATIM_DATABASE_WEBUSER="apache" > nominatim-project/.env
cp nominatim-project/.env data-env-reverse/.env
working-directory: /home/nominatim
if: matrix.flavour == 'centos'
- name: Import
run: nominatim import --osm-file ../monaco-latest.osm.pbf
shell: bash
working-directory: data-env
run: nominatim import --osm-file ../test.pbf
working-directory: /home/nominatim/nominatim-project
- name: Import special phrases
run: nominatim special-phrases --import-from-wiki
working-directory: data-env
working-directory: /home/nominatim/nominatim-project
- name: Check full import
run: nominatim admin --check-database
working-directory: data-env
working-directory: /home/nominatim/nominatim-project
- name: Warm up database
run: nominatim admin --warm
working-directory: data-env
working-directory: /home/nominatim/nominatim-project
- name: Prepare update (Ubuntu)
run: apt-get install -y python3-pip
shell: bash
if: matrix.flavour == 'ubuntu'
- name: Run update
run: |
nominatim replication --init
nominatim replication --once
working-directory: data-env
pip3 install --user osmium
nominatim replication --init
NOMINATIM_REPLICATION_MAX_DIFF=1 nominatim replication --once
working-directory: /home/nominatim/nominatim-project
- name: Run reverse-only import
run : nominatim import --osm-file ../monaco-latest.osm.pbf --reverse-only --no-updates
working-directory: data-env
env:
NOMINATIM_DATABASE_DSN: pgsql:dbname=reverse
run : |
echo 'NOMINATIM_DATABASE_DSN="pgsql:dbname=reverse"' >> .env
nominatim import --osm-file ../test.pbf --reverse-only --no-updates
working-directory: /home/nominatim/data-env-reverse
- name: Check reverse import
run: nominatim admin --check-database
working-directory: data-env
working-directory: /home/nominatim/data-env-reverse

.gitignore

@@ -1,12 +1,9 @@
*.log
*.pyc
build
settings/local.php
docs/develop/*.png
data/wiki_import.sql
data/wiki_specialphrases.sql
data/osmosischange.osc
build
.vagrant
data/country_osm_grid.sql.gz


@@ -18,9 +18,9 @@ list(APPEND CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake")
project(nominatim)
-set(NOMINATIM_VERSION_MAJOR 3)
-set(NOMINATIM_VERSION_MINOR 7)
-set(NOMINATIM_VERSION_PATCH 0)
+set(NOMINATIM_VERSION_MAJOR 4)
+set(NOMINATIM_VERSION_MINOR 0)
+set(NOMINATIM_VERSION_PATCH 2)
set(NOMINATIM_VERSION "${NOMINATIM_VERSION_MAJOR}.${NOMINATIM_VERSION_MINOR}.${NOMINATIM_VERSION_PATCH}")
@@ -38,6 +38,7 @@ set(BUILD_TESTS on CACHE BOOL "Build test suite")
set(BUILD_DOCS on CACHE BOOL "Build documentation")
set(BUILD_MANPAGE on CACHE BOOL "Build Manual Page")
set(BUILD_OSM2PGSQL on CACHE BOOL "Build osm2pgsql (expert only)")
set(INSTALL_MUNIN_PLUGINS on CACHE BOOL "Install Munin plugins for supervising Nominatim")
#-----------------------------------------------------------------------------
# osm2pgsql (imports/updates only)
@@ -153,7 +154,7 @@ if (BUILD_TESTS)
if (PHPCS)
message(STATUS "Using phpcs binary ${PHPCS}")
add_test(NAME phpcs
-COMMAND ${PHPCS} --report-width=120 --colors lib website utils
+COMMAND ${PHPCS} --report-width=120 --colors lib-php
WORKING_DIRECTORY ${PROJECT_SOURCE_DIR})
else()
message(WARNING "phpcs not found. PHP linting tests disabled." )
@@ -199,7 +200,7 @@ endif()
#-----------------------------------------------------------------------------
if (BUILD_MANPAGE)
-add_subdirectory(manual)
+add_subdirectory(man)
endif()
#-----------------------------------------------------------------------------
@@ -211,6 +212,7 @@ include(GNUInstallDirs)
set(NOMINATIM_DATADIR ${CMAKE_INSTALL_FULL_DATADIR}/${PROJECT_NAME})
set(NOMINATIM_LIBDIR ${CMAKE_INSTALL_FULL_LIBDIR}/${PROJECT_NAME})
set(NOMINATIM_CONFIGDIR ${CMAKE_INSTALL_FULL_SYSCONFDIR}/${PROJECT_NAME})
set(NOMINATIM_MUNINDIR ${CMAKE_INSTALL_FULL_DATADIR}/munin/plugins)
if (BUILD_IMPORTER)
configure_file(${PROJECT_SOURCE_DIR}/cmake/tool-installed.tmpl installed.bin)
@@ -258,6 +260,16 @@ install(FILES settings/env.defaults
settings/import-address.style
settings/import-full.style
settings/import-extratags.style
-settings/legacy_icu_tokenizer.yaml
-settings/icu-rules/extended-unicode-to-asccii.yaml
+settings/icu_tokenizer.yaml
settings/country_settings.yaml
DESTINATION ${NOMINATIM_CONFIGDIR})
install(DIRECTORY settings/icu-rules
DESTINATION ${NOMINATIM_CONFIGDIR})
if (INSTALL_MUNIN_PLUGINS)
install(FILES munin/nominatim_importlag
munin/nominatim_query_speed
munin/nominatim_requests
DESTINATION ${NOMINATIM_MUNINDIR})
endif()


@@ -1,3 +1,65 @@
4.0.2
* fix XSS vulnerability in debug view
4.0.1
* fix initialisation error in replication script
* ICU tokenizer: avoid any special characters in word tokens
* better error message when API php script does not exist
* fix quoting of house numbers in SQL queries
* small fixes and improvements in search query parsing
* add documentation for moving the database to a different machine
4.0.0
* refactor name token computation and introduce ICU tokenizer
* name processing now happens in the indexer outside the DB
* reorganizes abbreviation handling and moves it to the indexing phases
* adds preprocessing of names
* add country-specific ranking for Spain, Slovakia
* partially switch to using SP-GIST indexes
* better updating of dependent addresses for name changes in streets
* remove unused/broken tables for external housenumbers
* move external postcodes to CSV format and no longer save them in tables
(adds support for postcodes for arbitrary countries)
* remove postcode helper entries from placex (thanks @AntoJvlt)
* change required format for TIGER data to CSV
* move configuration of default languages from wiki into config file
* expect customized configuration files in project directory by default
* disable search API for reverse-only import (thanks @darkshredder)
* port most of maintenance/import code to Python and remove PHP utils
* add catch-up mode for replication
* add updating of special phrases (thanks @AntoJvlt)
* add support for special phrases in CSV files (thanks @AntoJvlt)
* switch to case-independent matching between place and boundary names
* remove disabling of reverse query parsing
* minor tweaks to search algorithm to avoid more false positives
* major overhaul of the administrator and developer documentation
* add security disclosure policy
* add testing of installation scripts via CI
* drop support for Python < 3.6 and Postgresql < 9.5
3.7.3
* fix XSS vulnerability in debug view
3.7.2
* fix database check for reverse-only imports
* do not error out in status API result when import date is missing
* add array_key_last function for PHP < 7.3 (thanks to @woodpeck)
* fix more url when server name is unknown (thanks to @mogita)
* commit changes to replication log table
3.7.1
* fix smaller issues with special phrases import (thanks @AntoJvlt)
* add index to speed up continued indexing during import
* fix index on location_property_tiger(parent_place_id) (thanks @changpingc)
* make sure Python code is backward-compatible with Python 3.5
* various documentation fixes
3.7.0
* switch to dotenv for configuration file
@@ -20,7 +82,6 @@
* add non-key indexes to speed up housenumber + street searches
* switch housenumber field in placex to save transliterated names
3.6.0
* add full support for searching by and displaying of addr:* tags


@@ -20,14 +20,6 @@ https://nominatim.org/release-docs/develop/ .
Installation
============
**Nominatim is a complex piece of software and runs in a complex environment.
Installing and running Nominatim is something for experienced system
administrators only who can do some trouble-shooting themselves. We are sorry,
but we can not provide installation support. We are all doing this in our free
time and there is just so much of that time to go around. Do not open issues in
our bug tracker if you need help. Use the discussions forum
or ask for help on [help.openstreetmap.org](https://help.openstreetmap.org/).**
The latest stable release can be downloaded from https://nominatim.org.
There you can also find [installation instructions for the release](https://nominatim.org/release-docs/latest/admin/Installation), as well as an extensive [Troubleshooting/FAQ section](https://nominatim.org/release-docs/latest/admin/Faq/).

File diff suppressed because one or more lines are too long


@@ -10,6 +10,7 @@ set (DOC_SOURCES
admin
develop
api
customize
index.md
extra.css
styles.css
@@ -26,7 +27,10 @@ ADD_CUSTOM_TARGET(doc
COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/bash2md.sh ${PROJECT_SOURCE_DIR}/vagrant/Install-on-Centos-8.sh ${CMAKE_CURRENT_BINARY_DIR}/appendix/Install-on-Centos-8.md
COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/bash2md.sh ${PROJECT_SOURCE_DIR}/vagrant/Install-on-Ubuntu-18.sh ${CMAKE_CURRENT_BINARY_DIR}/appendix/Install-on-Ubuntu-18.md
COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/bash2md.sh ${PROJECT_SOURCE_DIR}/vagrant/Install-on-Ubuntu-20.sh ${CMAKE_CURRENT_BINARY_DIR}/appendix/Install-on-Ubuntu-20.md
-COMMAND mkdocs build -d ${CMAKE_CURRENT_BINARY_DIR}/../site-html -f ${CMAKE_CURRENT_BINARY_DIR}/../mkdocs.yml
+COMMAND PYTHONPATH=${PROJECT_SOURCE_DIR} mkdocs build -d ${CMAKE_CURRENT_BINARY_DIR}/../site-html -f ${CMAKE_CURRENT_BINARY_DIR}/../mkdocs.yml
)
ADD_CUSTOM_TARGET(serve-doc
COMMAND PYTHONPATH=${PROJECT_SOURCE_DIR} mkdocs serve
WORKING_DIRECTORY ${PROJECT_BINARY_DIR}
)


@@ -5,9 +5,34 @@ your Nominatim database. It is assumed that you have already successfully
installed the Nominatim software itself, if not return to the
[installation page](Installation.md).
-## Importing multiple regions
+## Importing multiple regions (without updates)
To import multiple regions in your database, you need to configure and run `utils/import_multiple_regions.sh` file. This script will set up the update directory which has the following structure:
To import multiple regions in your database you can simply give multiple
OSM files to the import command:
```
nominatim import --osm-file file1.pbf --osm-file file2.pbf
```
If you already have imported a file and want to add another one, you can
use the add-data function to import the additional data as follows:
```
nominatim add-data --file <FILE>
nominatim refresh --postcodes
nominatim index -j <NUMBER OF THREADS>
```
Please note that adding additional data is always significantly slower than
the original import.
## Importing multiple regions (with updates)
If you want to import multiple regions _and_ be able to keep them up-to-date
with updates, then you can use the scripts provided in the `utils` directory.
These scripts will set up an `update` directory in your project directory,
which has the following structure:
```bash
update
@@ -17,7 +42,6 @@ update
   │   └── monaco
   │   └── sequence.state
   └── tmp
├── combined.osm.pbf
└── europe
├── andorra-latest.osm.pbf
└── monaco-latest.osm.pbf
@@ -25,87 +49,59 @@ update
```
-The `sequence.state` files will contain the sequence ID, which will be used by pyosmium to get updates. The tmp folder is used for import dump.
+The `sequence.state` files contain the sequence ID for each region. They will
+be used by pyosmium to get updates. The `tmp` folder is used for import dump and
+can be deleted once the import is complete.
### Configuring multiple regions
The file `import_multiple_regions.sh` needs to be edited as per your requirement:
1. List of countries. eg:
COUNTRIES="europe/monaco europe/andorra"
2. Path to Build directory. eg:
NOMINATIMBUILD="/srv/nominatim/build"
3. Path to Update directory. eg:
UPDATEDIR="/srv/nominatim/update"
4. Replication URL. eg:
BASEURL="https://download.geofabrik.de"
DOWNCOUNTRYPOSTFIX="-latest.osm.pbf"
### Setting up multiple regions
!!! tip
If your database already exists and you want to add more countries,
replace the setting up part
`${SETUPFILE} --osm-file ${UPDATEDIR}/tmp/combined.osm.pbf --all 2>&1`
with `${UPDATEFILE} --import-file ${UPDATEDIR}/tmp/combined.osm.pbf --index --index-instances N 2>&1`
where N is the numbers of CPUs in your system.
Create a project directory as described for the
[simple import](Import.md#creating-the-project-directory). If necessary,
you can also add an `.env` configuration with customized options. In particular,
you need to make sure that `NOMINATIM_REPLICATION_UPDATE_INTERVAL` and
`NOMINATIM_REPLICATION_RECHECK_INTERVAL` are set according to the update
interval of the extract server you use.
Run the following command from your Nominatim directory after configuring the file.
Copy the scripts `utils/import_multiple_regions.sh` and `utils/update_database.sh`
into the project directory.
bash ./utils/import_multiple_regions.sh
Now customize both files as per your requirements
!!! danger "Important"
This file uses osmium-tool. It must be installed before executing the import script.
Installation instructions can be found [here](https://osmcode.org/osmium-tool/manual.html#installation).
### Updating multiple regions
To import multiple regions in your database, you need to configure and run ```utils/update_database.sh```.
This uses the update directory set up while setting up the DB.
### Configuring multiple regions
The file `update_database.sh` needs to be edited as per your requirement:
-1. List of countries. eg:
+1. List of countries. e.g.
COUNTRIES="europe/monaco europe/andorra"
-2. Path to Build directory. eg:
+2. URL to the service providing the extracts and updates. eg:
NOMINATIMBUILD="/srv/nominatim/build"
3. Path to Update directory. eg:
UPDATEDIR="/srv/nominatim/update"
4. Replication URL. eg:
BASEURL="https://download.geofabrik.de"
DOWNCOUNTRYPOSTFIX="-updates"
DOWNCOUNTRYPOSTFIX="-latest.osm.pbf"
-5. Followup can be set according to your installation. eg: For Photon,
+5. Followup in the update script can be set according to your installation.
+E.g. for Photon,
FOLLOWUP="curl http://localhost:2322/nominatim-update"
will handle the indexing.
To start the initial import, change into the project directory and run
```
bash import_multiple_regions.sh
```
### Updating the database
Run the following command from your Nominatim directory after configuring the file.
Change into the project directory and run the following command:
-bash ./utils/update_database.sh
+bash update_database.sh
-This will get diffs from the replication server, import diffs and index the database. The default replication server in the script([Geofabrik](https://download.geofabrik.de)) provides daily updates.
+This will get diffs from the replication server, import diffs and index
+the database. The default replication server in the
+script([Geofabrik](https://download.geofabrik.de)) provides daily updates.
-## Importing Nominatim to an external PostgreSQL database
+## Using an external PostgreSQL database
You can install Nominatim using a database that runs on a different server when
you have physical access to the file system on the other server. Nominatim
@@ -113,6 +109,11 @@ uses a custom normalization library that needs to be made accessible to the
PostgreSQL server. This section explains how to set up the normalization
library.
!!! note
The external module is only needed when using the legacy tokenizer.
If you have chosen the ICU tokenizer, then you can ignore this section
and follow the standard import documentation.
### Option 1: Compiling the library on the database server
The most sure way to get a working library is to compile it on the database
@@ -170,4 +171,45 @@ NOMINATIM_DATABASE_MODULE_PATH="<directory on the database server where nominati
```
Now change the `NOMINATIM_DATABASE_DSN` to point to your remote server and continue
-to follow the [standard instructions for importing](/admin/Import).
+to follow the [standard instructions for importing](Import.md).
## Moving the database to another machine
For some configurations it may be useful to run the import on one machine, then
move the database to another machine and run the Nominatim service from there.
For example, you might want to use a large machine to be able to run the import
quickly but only want a smaller machine for production because there is not so
much load. Or you might want to do the import once and then replicate the
database to many machines.
The important thing to keep in mind when transferring the Nominatim installation
is that you need to transfer the database _and the project directory_. Both
parts are essential for your installation.
The Nominatim database can be transferred using the `pg_dump`/`pg_restore` tool.
Make sure to use the same version of PostgreSQL and PostGIS on source and
target machine.
!!! note
Before creating a dump of your Nominatim database, consider running
`nominatim freeze` first. Your database loses the ability to receive further
data updates but the resulting database is only about a third of the size
of a full database.
Next install Nominatim on the target machine by following the standard installation
instructions. Again make sure to use the same version as the source machine.
You can now copy the project directory from the source machine to the new machine.
If necessary, edit the `.env` file to point it to the restored database.
Finally run
nominatim refresh --website
to make sure that the local installation of Nominatim will be used.
If you are using the legacy tokenizer you might also have to switch to the
PostgreSQL module that was compiled on your target machine. If you get errors
that PostgreSQL cannot find or access `nominatim.so` then copy the installed
version into the `module` directory of your project directory. The installed
copy can usually be found under `/usr/local/lib/nominatim/module/nominatim.so`.


@@ -1,101 +0,0 @@
# Customization of the Database
This section explains in detail how to configure a Nominatim import and
the various means to use external data.
## External postcode data
Nominatim creates a table of known postcode centroids during import. This table
is used for searches of postcodes and for adding postcodes to places where the
OSM data does not provide one. These postcode centroids are mainly computed
from the OSM data itself. In addition, Nominatim supports reading postcode
information from an external CSV file, to supplement the postcodes that are
missing in OSM.
To enable external postcode support, simply put one CSV file per country into
your project directory and name it `<CC>_postcodes.csv`. `<CC>` must be the
two-letter country code for which to apply the file. The file may also be
gzipped. Then it must be called `<CC>_postcodes.csv.gz`.
The CSV file must use commas as a delimiter and have a header line. Nominatim
expects three columns to be present: `postcode`, `lat` and `lon`. All other
columns are ignored. `lon` and `lat` must describe the x and y coordinates of the
postcode centroids in WGS84.
The postcode files are loaded only when there is data for the given country
in your database. For example, if there is a `us_postcodes.csv` file in your
project directory but you import only an excerpt of Italy, then the US postcodes
will simply be ignored.
As a rule, the external postcode data should be put into the project directory
**before** starting the initial import. Still, you can add, remove and update the
external postcode data at any time. Simply
run:
```
nominatim refresh --postcodes
```
to make the changes visible in your database. Be aware, however, that the changes
only have an immediate effect on searches for postcodes. Postcodes that were
added to places are only updated, when they are reindexed. That usually happens
only during replication updates.
## Installing Tiger housenumber data for the US
Nominatim is able to use the official [TIGER](https://www.census.gov/geographies/mapping-files/time-series/geo/tiger-line-file.html)
address set to complement the OSM house number data in the US. You can add
TIGER data to your own Nominatim instance by following these steps. The
entire US adds about 10GB to your database.
1. Get preprocessed TIGER 2020 data:
cd $PROJECT_DIR
wget https://nominatim.org/data/tiger2020-nominatim-preprocessed.csv.tar.gz
2. Import the data into your Nominatim database:
nominatim add-data --tiger-data tiger2020-nominatim-preprocessed.csv.tar.gz
3. Enable use of the Tiger data in your `.env` by adding:
echo NOMINATIM_USE_US_TIGER_DATA=yes >> .env
4. Apply the new settings:
nominatim refresh --functions
See the [developer's guide](../develop/data-sources.md#us-census-tiger) for more
information on how the data got preprocessed.
## Special phrases import
As described in the [Importation chapter](Import.md), it is possible to
import special phrases from the wiki with the following command:
```sh
nominatim special-phrases --import-from-wiki
```
It is also possible to import phrases from a CSV file. To do so, use the
following command:
```sh
nominatim special-phrases --import-from-csv <csv file>
```
Note that both of the import commands above replace the phrases in your database.
This means that if you import phrases from a CSV file, only the phrases
present in the CSV file will be kept in the database. All other phrases will
be removed.
If you want to only add new phrases and not update the other ones you can add
the argument `--no-replace` to the import command. For example:
```sh
nominatim special-phrases --import-from-csv <csv file> --no-replace
```
This will add the phrases present in the csv file into the database without
removing the other ones.

View File

@@ -134,7 +134,7 @@ On CentOS v7 the PostgreSQL server is started with `systemd`. Check if
`/usr/lib/systemd/system/httpd.service` contains a line `PrivateTmp=true`. If
so then Apache cannot see the `/tmp/.s.PGSQL.5432` file. It's a good security
feature, so use the
[preferred solution](../appendix/Install-on-Centos-7/#adding-selinux-security-settings).
[preferred solution](../appendix/Install-on-Centos-7.md#adding-selinux-security-settings).
However, you can solve this the quick and dirty way by commenting out that line and then run
@@ -182,7 +182,7 @@ by everybody, e.g.
Try `chmod a+r nominatim.so; chmod a+x nominatim.so`.
When running SELinux, make sure that the
[context is set up correctly](../appendix/Install-on-Centos-7/#adding-selinux-security-settings).
[context is set up correctly](../appendix/Install-on-Centos-7.md#adding-selinux-security-settings).
When you recently updated your operating system, updated PostgreSQL to
a new version or moved files (e.g. the build directory) you should

View File

@@ -47,8 +47,9 @@ You can also set the same configuration via environment variables. All
settings have a `NOMINATIM_` prefix to avoid conflicts with other environment
variables.
There are lots of configuration settings you can tweak. Have a look
at `Nominatim/settings/env.default` for a full list. Most should have a sensible default.
There are lots of configuration settings you can tweak. A full reference
can be found in the chapter [Configuration Settings](../customize/Settings.md).
Most should have a sensible default.
#### Flatnode files
@@ -95,7 +96,7 @@ This data can be optionally downloaded into the project directory:
wget https://www.nominatim.org/data/us_postcodes.csv.gz
You can also add your own custom postcode sources, see
[Customization of postcodes](Customization.md#external-postcode-data).
[Customization of postcodes](../customize/Postcodes.md).
## Choosing the data to import
@@ -111,7 +112,7 @@ If you only need geocoding for a smaller region, then precomputed OSM extracts
are a good way to reduce the database size and import time.
[Geofabrik](https://download.geofabrik.de) offers extracts for most countries.
They even have daily updates which can be used with the update process described
[in the next section](../Update). There are also
[in the next section](Update.md). There are also
[other providers for extracts](https://wiki.openstreetmap.org/wiki/Planet.osm#Downloading).
Please be aware that some extracts are not cut exactly along the country
@@ -137,6 +138,14 @@ Note that you still need to provide for sufficient disk space for the initial
import. So this option is particularly interesting if you plan to transfer the
database or reuse the space later.
!!! warning
The data structures for updates are also required when adding additional data
after the import, for example [TIGER housenumber data](../customize/Tiger.md).
If you plan to use those, you must not use the `--no-updates` parameter.
Do a normal import, add the external data and once you are done with
everything run `nominatim freeze`.
### Reverse-only Imports
If you only want to use the Nominatim database for reverse lookups or
@@ -152,15 +161,15 @@ Nominatim normally sets up a full search database containing administrative
boundaries, places, streets, addresses and POI data. There are also other
import styles available which only read selected data:
* **settings/import-admin.style**
* **admin**
Only import administrative boundaries and places.
* **settings/import-street.style**
* **street**
Like the admin style but also adds streets.
* **settings/import-address.style**
* **address**
Import all data necessary to compute addresses down to house number level.
* **settings/import-full.style**
* **full**
Default style that also includes points of interest.
* **settings/import-extratags.style**
* **extratags**
Like the full style but also adds most of the OSM tags into the extratags
column.
@@ -183,8 +192,8 @@ full | 54h | 640 GB | 330 GB
extratags | 54h | 650 GB | 340 GB
You can also customize the styles further.
A [description of the style format](../develop/Import.md#configuring-the-import)
can be found in the development section.
A [description of the style format](../customize/Import-Styles.md)
can be found in the customization guide.
## Initial import of the data
@@ -200,7 +209,7 @@ nominatim import --osm-file <data file> 2>&1 | tee setup.log
```
The **project directory** is the one that you have set up at the beginning.
See [creating the project directory](Import#creating-the-project-directory).
See [creating the project directory](#creating-the-project-directory).
### Notes on full planet imports
@@ -219,7 +228,7 @@ to load the OSM data into the PostgreSQL database. This step is very demanding
in terms of RAM usage. osm2pgsql and PostgreSQL are running in parallel at
this point. PostgreSQL blocks at least the part of RAM that has been configured
with the `shared_buffers` parameter during
[PostgreSQL tuning](Installation#postgresql-tuning)
[PostgreSQL tuning](Installation.md#postgresql-tuning)
and needs some memory on top of that. osm2pgsql needs at least 2GB of RAM for
its internal data structures, potentially more when it has to process very large
relations. In addition it needs to maintain a cache for node locations. The size
@@ -238,7 +247,8 @@ reduce the cache size or even consider using a flatnode file.
### Testing the installation
Run this script to verify all required tables and indices got created successfully.
Run this script to verify that all required tables and indices got created
successfully.
```sh
nominatim admin --check-database
@@ -261,23 +271,10 @@ reverse query, e.g. `http://localhost:8088/reverse.php?lat=27.1750090510034&lon=
To run Nominatim via webservers like Apache or nginx, please read the
[Deployment chapter](Deployment.md).
## Tuning the database
Accurate word frequency information for search terms helps PostgreSQL's query
planner to make the right decisions. Recomputing them can improve the performance
of forward geocoding in particular under high load. To recompute word counts run:
```sh
nominatim refresh --word-counts
```
This will take a couple of hours for a full planet installation. You can
also defer that step to a later point in time when you realise that
performance becomes an issue. Just make sure that updates are stopped before
running this function.
## Adding search through category phrases
If you want to be able to search for places by their type through
[special key phrases](https://wiki.openstreetmap.org/wiki/Nominatim/Special_Phrases)
[special phrases](https://wiki.openstreetmap.org/wiki/Nominatim/Special_Phrases)
you also need to import these key phrases like this:
```sh
@@ -288,4 +285,4 @@ Note that this command downloads the phrases from the wiki link above. You
need internet access for the step.
You can also import special phrases from a csv file, for more
information please read the [Customization chapter](Customization.md).
information please see the [Customization part](../customize/Special-Phrases.md).

View File

@@ -24,6 +24,10 @@ and can't offer support.
### Software
!!! Warning
For larger installations you **must have** PostgreSQL 11+ and PostGIS 3+,
otherwise import and queries will be slow to the point of being unusable.
For compiling:
* [cmake](https://cmake.org/)
@@ -39,7 +43,7 @@ For compiling:
For running Nominatim:
* [PostgreSQL](https://www.postgresql.org) (9.5+ will work, 11+ strongly recommended)
* [PostGIS](https://postgis.net) (2.2+)
* [PostGIS](https://postgis.net) (2.2+ will work, 3.0+ strongly recommended)
* [Python 3](https://www.python.org/) (3.6+)
* [Psycopg2](https://www.psycopg.org) (2.7+)
* [Python Dotenv](https://github.com/theskumar/python-dotenv)

51
docs/admin/Maintenance.md Normal file
View File

@@ -0,0 +1,51 @@
This chapter describes the various operations the Nominatim database administrator
may use to clean and maintain the database. None of these operations is mandatory
but they may help improve the performance and accuracy of results.
## Updating postcodes
Command: `nominatim refresh --postcodes`
Postcode centroids (aka 'calculated postcodes') are generated by looking at all
postcodes of a country, grouping them and calculating the geometric centroid.
There is currently no logic to deal with extreme outliers (typos or other
mistakes in OSM data). There is also no check whether a postcode adheres to a
country's format, e.g. that Swiss postcodes have 4 digits.
When running regular updates, postcode results can be improved by running
this command on a regular basis. Note that only the postcode table and the
postcode search terms are updated. The postcode that is assigned to each place
is only updated when the place is updated.
The command takes around 70min to run on the planet and needs ca. 40GB of
temporary disk space.
## Updating word counts
Command: `nominatim refresh --word-counts`
Nominatim keeps frequency statistics about all search terms it indexes. These
statistics are currently used to optimise queries to the database. Thus better
statistics mean better performance. Word counts are created once after import
and are usually sufficient even when running regular updates. You might want
to rerun the statistics computation when adding larger amounts of new data,
for example, when adding an additional country via `nominatim add-data`.
## Removing large deleted objects
Nominatim refuses to delete very large areas because often these deletions are
accidental and are reverted within hours. Instead the deletions are logged in
the `import_polygon_delete` table and left to the administrator to clean up.
There is currently no command to do that. You can use the following SQL
query to force a deletion on all objects that have been deleted more than
a certain timespan ago (here: 1 month):
```sql
SELECT place_force_delete(p.place_id) FROM import_polygon_delete d, placex p
WHERE p.osm_type = d.osm_type and p.osm_id = d.osm_id
and age(p.indexed_date) > '1 month'::interval
```

View File

@@ -15,6 +15,27 @@ breaking changes. **Please read them before running the migration.**
If you are migrating from a version <3.6, then you still have to follow
the manual migration steps up to 3.6.
## 3.7.0 -> 4.0.0
### NOMINATIM_PHRASE_CONFIG removed
Custom blacklist configurations for special phrases now need to be passed
via the `--config` parameter to `nominatim special-phrases`. Alternatively
you can put your custom configuration in the project directory in a file
named `phrase-settings.json`.
Version 4.0 also removes the automatic converter for the PHP format of
the configuration used in older versions. If you are updating from Nominatim < 3.7
and still work with a custom `phrase-settings.php`, you need to manually
convert it into the JSON format.
### PHP utils removed
The old PHP utils have now been removed completely. You need to switch to
the appropriate functions of the nominatim command line tool. See
[Introducing `nominatim` command line tool](#introducing-nominatim-command-line-tool)
below.
## 3.6.0 -> 3.7.0
### New format and name of configuration file
@@ -80,7 +101,7 @@ done
The debugging UI is no longer directly provided with Nominatim. Instead we
now provide a simple Javascript application. Please refer to
[Setting up the Nominatim UI](../Setup-Nominatim-UI) for details on how to
[Setting up the Nominatim UI](Setup-Nominatim-UI.md) for details on how to
set up the UI.
The icons served together with the API responses have been moved to the

View File

@@ -16,13 +16,14 @@ and run it. Grab the latest release from
[nominatim-ui's Github release page](https://github.com/osm-search/nominatim-ui/releases)
and unpack it. You can use `nominatim-ui-x.x.x.tar.gz` or `nominatim-ui-x.x.x.zip`.
Copy the example configuration into the right place:
Next you need to adapt the UI to your installation. Custom settings need to be
put into `dist/theme/config.theme.js`. At a minimum you need to
set `Nominatim_API_Endpoint` to point to your Nominatim installation:
cd nominatim-ui
cp dist/config.example.js dist/config.js
echo "Nominatim_Config.Nominatim_API_Endpoint='https:\\myserver.org\nominatim';" > dist/theme/config.theme.js
Now adapt the configuration to your needs. You need at least
to change the `Nominatim_API_Endpoint` to point to your Nominatim installation.
For the full set of available settings, have a look at `dist/config.defaults.js`.
Then you can just test it locally by spinning up a webserver in the `dist`
directory. For example, with Python:

View File

@@ -10,18 +10,21 @@ For a list of other methods to add or update data see the output of
If you have configured a flatnode file for the import, then you
need to keep this flatnode file around for updates.
#### Installing the newest version of Pyosmium
### Installing the newest version of Pyosmium
It is recommended to install Pyosmium via pip. Make sure to use python3.
The replication process uses
[Pyosmium](https://docs.osmcode.org/pyosmium/latest/updating_osm_data.html)
to download update data from the server.
It is recommended to install Pyosmium via pip.
Run (as the same user who will later run the updates):
```sh
pip3 install --user osmium
```
#### Setting up the update process
### Setting up the update process
Next the update needs to be initialised. By default Nominatim is configured
Next the update process needs to be initialised. By default Nominatim is configured
to update using the global minutely diffs.
If you want a different update source you will need to add some settings
@@ -45,12 +48,119 @@ what you expect.
The `replication --init` command needs to be rerun whenever the replication
service is changed.
#### Updating Nominatim
### Updating Nominatim
The following command will keep your database constantly up to date:
Nominatim supports different modes for retrieving update data from the
server. Which one you want to use depends on your exact setup and how often you
want to retrieve updates.
These instructions are for using a single source of updates. If you have
imported multiple country extracts and want to keep them
up-to-date, [Advanced installations section](Advanced-Installations.md)
contains instructions to set up and update multiple country extracts.
#### Continuous updates
This is the easiest mode. Simply run the replication command without any
parameters:
    nominatim replication
If you have imported multiple country extracts and want to keep them
up-to-date, [Advanced installations section](Advanced-Installations.md) contains instructions
to set up and update multiple country extracts.
The update application keeps running forever and retrieves and applies
new updates from the server as they are published.
You can run this command as a simple systemd service. Create a service
description like the following in `/etc/systemd/system/nominatim-updates.service`:
```
[Unit]
Description=Continuous updates of Nominatim
[Service]
WorkingDirectory=/srv/nominatim
ExecStart=nominatim replication
StandardOutput=append:/var/log/nominatim-updates.log
StandardError=append:/var/log/nominatim-updates.error.log
User=nominatim
Group=nominatim
Type=simple
[Install]
WantedBy=multi-user.target
```
Replace the `WorkingDirectory` with your project directory. Also adapt user
and group names as required.
Now activate the service and start the updates:
```
sudo systemctl daemon-reload
sudo systemctl enable nominatim-updates
sudo systemctl start nominatim-updates
```
#### One-time mode
When the `--once` parameter is given, then Nominatim will download exactly one
batch of updates and then exit. This one-time mode still respects the
`NOMINATIM_REPLICATION_UPDATE_INTERVAL` that you have set. If according to
the update interval no new data has been published yet, it will go to sleep
until the next expected update and only then attempt to download the next batch.
The one-time mode is particularly useful if you want to run updates continuously
but need to schedule other work in between updates. For example, the main
service at osm.org uses it to regularly recompute postcodes -- a process that
must not be run while updates are in progress. Its update script
looks like this:
```sh
#!/bin/bash

# Switch to your project directory.
cd /srv/nominatim

while true; do
  nominatim replication --once
  if [ -f "/srv/nominatim/schedule-maintenance" ]; then
    rm /srv/nominatim/schedule-maintenance
    nominatim refresh --postcodes
  fi
done
```
A cron job then creates the file `/srv/nominatim/schedule-maintenance` once per night.
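A matching crontab entry could look like this (the time of day is arbitrary):

```sh
# Trigger the nightly postcode maintenance at 2am.
0 2 * * * touch /srv/nominatim/schedule-maintenance
```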
#### Catch-up mode
With the `--catch-up` parameter, Nominatim will immediately try to download
all changes from the server until the database is up-to-date. The catch-up mode
still respects the parameter `NOMINATIM_REPLICATION_MAX_DIFF`. It downloads and
applies the changes in appropriate batches until all is done.
The catch-up mode is foremost useful to bring the database up to speed after the
initial import. Given that the service is usually not yet in production at this
point, you can temporarily be a bit more generous with the batch size and
number of threads you use for the updates by running catch-up like this:
```
cd /srv/nominatim
NOMINATIM_REPLICATION_MAX_DIFF=5000 nominatim replication --catch-up --threads 15
```
The catch-up mode is also useful when you want to apply updates at a lower
frequency than what the source publishes. You can set up a cron job to run
replication catch-up at whatever interval you desire.
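For example, a daily catch-up could be scheduled with a crontab entry along
these lines (path and time are illustrative):

```sh
# Run a catch-up every day at 3:05am from the project directory.
5 3 * * * cd /srv/nominatim && nominatim replication --catch-up >> /var/log/nominatim-updates.log 2>&1
```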
!!! hint
When running scheduled updates with catch-up, it is a good idea to choose
a replication source with an update interval that is an order of magnitude
shorter. For example, if you want to update once a day, use an hourly updated
source. This makes sure that you don't miss an entire day of updates when
the source is unexpectedly late to publish its update.
If you want to use a source with the same update frequency (e.g. a daily
updated source with daily updates), use the continuous update mode instead.
It keeps re-requesting the newest update until it is published.

View File

@@ -35,7 +35,7 @@ it contains the county/state/country across the border.
#### 3. I get different counties/states/countries when I change the zoom parameter in the reverse query. How is that possible?
This is basically the same problem as in the previous answer.
The zoom level influences at which [search rank](https://wiki.openstreetmap.org/wiki/Nominatim/Development_overview#Country_to_street_level) Nominatim starts looking
The zoom level influences at which [search rank](../customize/Ranking.md#search-rank) Nominatim starts looking
for the closest object. So the closest house number maybe on one side of the
border while the closest street is on the other. As the address details contain
the address of the closest object found, you might sometimes get one result,

View File

@@ -290,6 +290,7 @@ with a designation label. Per default the following labels may appear:
* emergency, historic, military, natural, landuse, place, railway,
man_made, aerialway, boundary, amenity, aeroway, club, craft, leisure,
office, mountain_pass, shop, tourism, bridge, tunnel, waterway
* postcode
They roughly correspond to the classification of the OpenStreetMap data
according to either the `place` tag or the main key of the object.

View File

@@ -27,8 +27,8 @@ The search term may be specified with two different sets of parameters:
Free-form query string to search for.
Free-form queries are processed first left-to-right and then right-to-left if that fails. So you may search for
[pilkington avenue, birmingham](//nominatim.openstreetmap.org/search?q=pilkington+avenue,birmingham) as well as for
[birmingham, pilkington avenue](//nominatim.openstreetmap.org/search?q=birmingham,+pilkington+avenue).
[pilkington avenue, birmingham](https://nominatim.openstreetmap.org/search?q=pilkington+avenue,birmingham) as well as for
[birmingham, pilkington avenue](https://nominatim.openstreetmap.org/search?q=birmingham,+pilkington+avenue).
Commas are optional, but improve performance by reducing the complexity of the search.

View File

@@ -1,38 +1,24 @@
# OSM Data Import
OSM data is initially imported using [osm2pgsql](https://osm2pgsql.org).
Nominatim uses its own data output style 'gazetteer', which differs from the
output style created for map rendering.
## Database Layout
The gazetteer style produces a single table `place` with the following rows:
* `osm_type` - kind of OSM object (**N** - node, **W** - way, **R** - relation)
* `osm_id` - original OSM ID
* `class` - key of principal tag defining the object type
* `type` - value of principal tag defining the object type
* `name` - collection of tags that contain a name or reference
* `admin_level` - numerical value of the tagged administrative level
* `address` - collection of tags defining the address of an object
* `extratags` - collection of additional interesting tags that are not
directly relevant for searching
* `geometry` - geometry of the object (in WGS84)
A single OSM object may appear multiple times in this table when it is tagged
with multiple tags that may constitute a principal tag. Take for example a
motorway bridge. In OSM, this would be a way which is tagged with
`highway=motorway` and `bridge=yes`. This way would appear in the `place` table
once with `class` of `highway` and once with a `class` of `bridge`. Thus the
*unique key* for `place` is (`osm_type`, `osm_id`, `class`).
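You can observe this directly in the database; the following sketch assumes the
default database name and a hypothetical way id:

```sh
# List the rows that a (hypothetical) motorway bridge way produced in 'place'.
psql -d nominatim -c "SELECT osm_type, osm_id, class, type FROM place WHERE osm_type = 'W' AND osm_id = 1234"
```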
## Configuring the Import
How tags are interpreted and assigned to the different `place` columns can be
configured via the import style configuration file (`NOMINATIM_IMPORT_STYLE`). This
Which OSM objects are added to the database and which of the tags are used
can be configured via the import style configuration file. This
is a JSON file which contains a list of rules which are matched against every
tag of every object and then assign each tag its specific role.
The style to use is given by the `NOMINATIM_IMPORT_STYLE` configuration
option. There are a number of default styles, which are explained in detail
in the [Import section](../admin/Import.md#filtering-imported-data). These
standard styles may be referenced by their name.
You can also create your own custom style. Put the style file into your
project directory and then set `NOMINATIM_IMPORT_STYLE` to the name of the file.
It is always recommended to start with one of the standard styles and customize
those. You find the standard styles under the name `import-<stylename>.style`
in the standard Nominatim configuration path (usually `/etc/nominatim` or
`/usr/local/etc/nominatim`).
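A minimal way to set this up, assuming the standard styles are installed under
`/usr/local/etc/nominatim`, could look like this:

```sh
# Run from the project directory: copy the full style and register the copy.
cp /usr/local/etc/nominatim/import-full.style my-custom.style
echo "NOMINATIM_IMPORT_STYLE=my-custom.style" >> .env
```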
The remainder of the page describes the format of the file.
### Configuration Rules
A single rule looks like this:
@@ -159,9 +145,6 @@ A rule can define as many of these properties for one match as it likes. For
example, if the property is `"main,extra"` then the tag will open a new row
but also have the tag appear in the list of extra tags.
There are a number of pre-defined styles in the `settings/` directory. It is
advisable to start from one of these styles when defining your own.
### Changing the Style of Existing Databases
There is normally no issue changing the style of a database that is already

View File

@@ -0,0 +1,20 @@
Nominatim comes with a predefined set of configuration options that should
work for most standard installations. If you have special requirements, there
are many places where the configuration can be adapted. This chapter describes
the following configurable parts:
* [Global Settings](Settings.md) has a detailed description of all parameters that
can be set in your local `.env` configuration
* [Import styles](Import-Styles.md) explains how to write your own import style
in order to control what kind of OSM data will be imported
* [Place ranking](Ranking.md) describes the configuration around classifying
places in terms of their importance and their role in an address
* [Tokenizers](Tokenizers.md) describes the configuration of the module
responsible for analysing and indexing names
* [Special Phrases](Special-Phrases.md) are common nouns or phrases that
can be used in search to identify a class of places
There are also guides for adding the following external data:
* [US house numbers from the TIGER dataset](Tiger.md)
* [External postcodes](Postcodes.md)

View File

@@ -0,0 +1,37 @@
# External postcode data
Nominatim creates a table of known postcode centroids during import. This table
is used for searches of postcodes and for adding postcodes to places where the
OSM data does not provide one. These postcode centroids are mainly computed
from the OSM data itself. In addition, Nominatim supports reading postcode
information from an external CSV file, to supplement the postcodes that are
missing in OSM.
To enable external postcode support, simply put one CSV file per country into
your project directory and name it `<CC>_postcodes.csv`. `<CC>` must be the
two-letter country code of the country the file applies to. The file may also be
gzipped, in which case it must be named `<CC>_postcodes.csv.gz`.
The CSV file must use commas as a delimiter and have a header line. Nominatim
expects three columns to be present: `postcode`, `lat` and `lon`. All other
columns are ignored. `lon` and `lat` must describe the x and y coordinates of the
postcode centroids in WGS84.
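For illustration, a hand-written postcode file for Germany could be created like
this (the coordinates are made-up examples):

```sh
# Create a minimal de_postcodes.csv in the project directory.
cat > de_postcodes.csv << 'EOF'
postcode,lat,lon
10117,52.5170,13.3889
80331,48.1374,11.5755
EOF
```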
The postcode files are loaded only when there is data for the given country
in your database. For example, if there is a `us_postcodes.csv` file in your
project directory but you import only an excerpt of Italy, then the US postcodes
will simply be ignored.
As a rule, the external postcode data should be put into the project directory
**before** starting the initial import. Still, you can add, remove and update the
external postcode data at any time. Simply
run:
```
nominatim refresh --postcodes
```
to make the changes visible in your database. Be aware, however, that the changes
only have an immediate effect on searches for postcodes. Postcodes that were
added to places are only updated when the places are reindexed, which usually
happens only during replication updates.

View File

@@ -1,8 +1,7 @@
# Place Ranking in Nominatim
Nominatim uses two metrics to rank a place: search rank and address rank.
Both can be assigned a value between 0 and 30. They serve slightly
different purposes, which are explained in this chapter.
This chapter explains what place ranking means and how it can be customized.
## Search rank

649
docs/customize/Settings.md Normal file
View File

@@ -0,0 +1,649 @@
This section provides a reference of all configuration parameters that can
be used with Nominatim.
# Configuring Nominatim
Nominatim uses [dotenv](https://github.com/theskumar/python-dotenv) to manage
its configuration settings. There are two means to set configuration
variables: through an `.env` configuration file or through an environment
variable.
The `.env` configuration file needs to be placed into the
[project directory](../admin/Import.md#creating-the-project-directory). It
must contain configuration parameters in `<parameter>=<value>` format.
Please refer to the dotenv documentation for details.
The configuration options may also be set in the form of shell environment
variables. This is particularly useful when you want to temporarily change
a configuration option. For example, to force the replication service to
download the next change, you can temporarily disable the update interval:

    NOMINATIM_REPLICATION_UPDATE_INTERVAL=0 nominatim replication --once
If a configuration option is defined both through the `.env` file and an
environment variable, then the latter takes precedence.
## Configuration Parameter Reference
### Import and Database Settings
#### NOMINATIM_DATABASE_DSN
| Summary | |
| -------------- | --------------------------------------------------- |
| **Description:** | Database connection string |
| **Format:** | string: `pgsql:<param1>=<value1>;<param2>=<value2>;...` |
| **Default:** | pgsql:dbname=nominatim |
| **After Changes:** | run `nominatim refresh --website` |
Sets the connection parameters for the Nominatim database. At a minimum
the name of the database (`dbname`) is required. You can set any additional
parameter that is understood by libpq. See the [Postgres documentation](https://www.postgresql.org/docs/current/libpq-connect.html#LIBPQ-PARAMKEYWORDS) for a full list.
!!! note
It is usually recommended not to set the password directly in this
configuration parameter. Use a
[password file](https://www.postgresql.org/docs/current/libpq-pgpass.html)
instead.
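A typical value for a database on a separate server might look like this
(host and user being illustrative):

    NOMINATIM_DATABASE_DSN=pgsql:dbname=nominatim;host=db.example.com;user=nominatim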
#### NOMINATIM_DATABASE_WEBUSER
| Summary | |
| -------------- | --------------------------------------------------- |
| **Description:** | Database query user |
| **Format:** | string |
| **Default:** | www-data |
| **After Changes:** | cannot be changed after import |
Defines the name of the database user that will run search queries. Usually
this is the user under which the webserver is executed. When running Nominatim
via php-fpm, you can also define a separate query user. The Postgres user
needs to be set up before starting the import.
Nominatim grants minimal rights to this user to all tables that are needed
for running geocoding queries.
#### NOMINATIM_DATABASE_MODULE_PATH
| Summary | |
| -------------- | --------------------------------------------------- |
| **Description:** | Directory where to find the PostgreSQL server module |
| **Format:** | path |
| **Default:** | _empty_ (use `<project_directory>/module`) |
| **After Changes:** | run `nominatim refresh --functions` |
| **Comment:** | Legacy tokenizer only |
Defines the directory in which the PostgreSQL server module `nominatim.so`
is stored. The directory and module must be accessible by the PostgreSQL
server.
For information on how to use this setting when working with external databases,
see [Advanced Installations](../admin/Advanced-Installations.md).
The option is only used by the Legacy tokenizer and ignored otherwise.
#### NOMINATIM_TOKENIZER
| Summary | |
| -------------- | --------------------------------------------------- |
| **Description:** | Tokenizer used for normalizing and parsing queries and names |
| **Format:** | string |
| **Default:** | legacy |
| **After Changes:** | cannot be changed after import |
Sets the tokenizer type to use for the import. For more information on
available tokenizers and how they are configured, see
[Tokenizers](../customize/Tokenizers.md).
#### NOMINATIM_TOKENIZER_CONFIG
| Summary | |
| -------------- | --------------------------------------------------- |
| **Description:** | Configuration file for the tokenizer |
| **Format:** | path |
| **Default:** | _empty_ (default file depends on tokenizer) |
| **After Changes:** | see documentation for each tokenizer |
Points to the file with additional configuration for the tokenizer.
See the [Tokenizer](../customize/Tokenizers.md) descriptions for details
on the file format.
If a relative path is given, then the file is searched first relative to the
project directory and then in the global settings directory.
#### NOMINATIM_MAX_WORD_FREQUENCY
| Summary | |
| -------------- | --------------------------------------------------- |
| **Description:** | Number of occurrences before a word is considered frequent |
| **Format:** | int |
| **Default:** | 50000 |
| **After Changes:** | cannot be changed after import |
| **Comment:** | Legacy tokenizer only |
The word frequency count is used by the Legacy tokenizer to automatically
identify _stop words_. Any partial term that occurs more often than what
is defined in this setting is effectively ignored during search.
#### NOMINATIM_LIMIT_REINDEXING
| Summary | |
| -------------- | --------------------------------------------------- |
| **Description:** | Avoid invalidating large areas |
| **Format:** | bool |
| **Default:** | yes |
Nominatim computes the address of each place at indexing time. This has the
advantage of making search faster but also means that more objects need to
be invalidated when the data changes. For example, changing the name of
the state of Florida would require recomputing every single address point
in the state to make the new name searchable in conjunction with addresses.
Setting this option to 'yes' means that Nominatim skips reindexing of contained
objects when the area becomes too large.
#### NOMINATIM_LANGUAGES
| Summary | |
| -------------- | --------------------------------------------------- |
| **Description:** | Restrict search languages |
| **Format:** | string: comma-separated list of language codes |
| **Default:** | _empty_ |
Normally Nominatim will include all language variants of name:XX
in the search index. Set this to a comma-separated list of language
codes to restrict the import to a subset of languages.
Currently only affects the initial import of country names and special phrases.
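For example, to restrict the import to English and French variants:

    NOMINATIM_LANGUAGES=en,fr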
#### NOMINATIM_TERM_NORMALIZATION
| Summary | |
| -------------- | --------------------------------------------------- |
| **Description:** | Rules for normalizing terms for comparisons |
| **Format:** | string: semicolon-separated list of ICU rules |
| **Default:** | :: NFD (); [[:Nonspacing Mark:] [:Cf:]] >; :: lower (); [[:Punctuation:][:Space:]]+ > ' '; :: NFC (); |
| **Comment:** | Legacy tokenizer only |
[Special phrases](Special-Phrases.md) have stricter matching requirements than
normal search terms. They must appear exactly in the query after this term
normalization has been applied.
Only has an effect on the Legacy tokenizer. For the ICU tokenizer the rules
defined in the
[normalization section](Tokenizers.md#normalization-and-transliteration)
will be used.
#### NOMINATIM_USE_US_TIGER_DATA
| Summary | |
| -------------- | --------------------------------------------------- |
| **Description:** | Enable searching for Tiger house number data |
| **Format:** | boolean |
| **Default:** | no |
| **After Changes:** | run `nominatim refresh --functions` |
When this setting is enabled, search and reverse queries also take data
from [Tiger house number data](Tiger.md) into account.
#### NOMINATIM_USE_AUX_LOCATION_DATA
| Summary | |
| -------------- | --------------------------------------------------- |
| **Description:** | Enable searching in external house number tables |
| **Format:** | boolean |
| **Default:** | no |
| **After Changes:** | run `nominatim refresh --functions` |
| **Comment:** | Do not use. |
When this setting is enabled, search queries also take data from external
house number tables into account.
*Warning:* This feature is currently unmaintained and should not be used.
#### NOMINATIM_HTTP_PROXY
| Summary | |
| -------------- | --------------------------------------------------- |
| **Description:** | Use HTTP proxy when downloading data |
| **Format:** | boolean |
| **Default:** | no |
When this setting is enabled and at least
[NOMINATIM_HTTP_PROXY_HOST](#nominatim_http_proxy_host) and
[NOMINATIM_HTTP_PROXY_PORT](#nominatim_http_proxy_port) are set, the
configured proxy will be used when downloading external data like
replication diffs.
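A complete proxy configuration in your `.env` might look like this (the host
name being illustrative):

    NOMINATIM_HTTP_PROXY=yes
    NOMINATIM_HTTP_PROXY_HOST=proxy.example.com
    NOMINATIM_HTTP_PROXY_PORT=3128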
#### NOMINATIM_HTTP_PROXY_HOST
| Summary | |
| -------------- | --------------------------------------------------- |
| **Description:** | Host name of the proxy to use |
| **Format:** | string |
| **Default:** | _empty_ |
When [NOMINATIM_HTTP_PROXY](#nominatim_http_proxy) is enabled, this setting
configures the proxy host name.
#### NOMINATIM_HTTP_PROXY_PORT
| Summary | |
| -------------- | --------------------------------------------------- |
| **Description:** | Port number of the proxy to use |
| **Format:** | integer |
| **Default:** | 3128 |
When [NOMINATIM_HTTP_PROXY](#nominatim_http_proxy) is enabled, this setting
configures the port number to use with the proxy.
#### NOMINATIM_HTTP_PROXY_LOGIN
| Summary | |
| -------------- | --------------------------------------------------- |
| **Description:** | Username for proxies that require login |
| **Format:** | string |
| **Default:** | _empty_ |
When [NOMINATIM_HTTP_PROXY](#nominatim_http_proxy) is enabled, use this
setting to define the username for proxies that require a login.
#### NOMINATIM_HTTP_PROXY_PASSWORD
| Summary | |
| -------------- | --------------------------------------------------- |
| **Description:** | Password for proxies that require login |
| **Format:** | string |
| **Default:** | _empty_ |
When [NOMINATIM_HTTP_PROXY](#nominatim_http_proxy) is enabled, use this
setting to define the password for proxies that require a login.
#### NOMINATIM_OSM2PGSQL_BINARY
| Summary | |
| -------------- | --------------------------------------------------- |
| **Description:** | Location of the osm2pgsql binary |
| **Format:** | path |
| **Default:** | _empty_ (use binary shipped with Nominatim) |
| **Comment:** | EXPERT ONLY |
Nominatim uses [osm2pgsql](https://osm2pgsql.org) to load the OSM data
initially into the database. Nominatim comes bundled with a version of
osm2pgsql that is guaranteed to be compatible. Use this setting to use
a different binary instead. You should do this only when you know exactly
what you are doing. If the osm2pgsql version is not compatible, then the
result is undefined.
#### NOMINATIM_WIKIPEDIA_DATA_PATH
| Summary | |
| -------------- | --------------------------------------------------- |
| **Description:** | Directory with the wikipedia importance data |
| **Format:** | path |
| **Default:** | _empty_ (project directory) |
Set a custom location for the
[wikipedia ranking file](../admin/Import.md#wikipediawikidata-rankings). When
unset, Nominatim expects the data to be saved in the project directory.
#### NOMINATIM_ADDRESS_LEVEL_CONFIG
| Summary | |
| -------------- | --------------------------------------------------- |
| **Description:** | Configuration file for rank assignments |
| **Format:** | path |
| **Default:** | address-levels.json |
The _address level configuration_ defines the rank assignments for places. See
[Place Ranking](Ranking.md) for a detailed explanation of what rank assignments
are and what the configuration file must look like.
When a relative path is given, then the file is searched first relative to the
project directory and then in the global settings directory.
#### NOMINATIM_IMPORT_STYLE
| Summary | |
| -------------- | --------------------------------------------------- |
| **Description:** | Configuration to use for the initial OSM data import |
| **Format:** | string or path |
| **Default:** | extratags |
The _style configuration_ describes which OSM objects and tags are taken
into consideration for the search database. Nominatim comes with a set
of pre-configured styles that may be selected here by name.
You can also write your own custom style and point the setting to the file
with the style. When a relative path is given, then the style file is searched
first relative to the project directory and then in the global settings
directory.
See [Import Styles](Import-Styles.md)
for more information on the available internal styles and the format of the
configuration file.
#### NOMINATIM_FLATNODE_FILE
| Summary | |
| -------------- | --------------------------------------------------- |
| **Description:** | Location of osm2pgsql flatnode file |
| **Format:** | path |
| **Default:** | _empty_ (do not use a flatnode file) |
| **After Changes:** | Only change when moving the file physically. |
The osm2pgsql flatnode file is a file that efficiently stores the geographic
location of OSM nodes. For larger imports it can significantly speed up
the import. When this option is unset, osm2pgsql uses a PostgreSQL table
to store the locations.
When a relative path is given, then the flatnode file is created/searched
relative to the project directory.
!!! warning
The flatnode file is not only used during the initial import but also
when adding new data with `nominatim add-data` or `nominatim replication`.
Make sure you keep the flatnode file around and this setting unmodified,
if you plan to add more data or run regular updates.
#### NOMINATIM_TABLESPACE_*
| Summary | |
| -------------- | --------------------------------------------------- |
| **Description:** | Group of settings for distributing the database over tablespaces |
| **Format:** | string |
| **Default:** | _empty_ (do not use a table space) |
| **After Changes:** | no effect after initial import |
Nominatim allows you to distribute the search database over up to 10 different
[PostgreSQL tablespaces](https://www.postgresql.org/docs/current/manage-ag-tablespaces.html).
If you use this option, make sure that the tablespaces exist before starting
the import.
The available tablespace groups are:
NOMINATIM_TABLESPACE_SEARCH_DATA
: Data used by the geocoding frontend.
NOMINATIM_TABLESPACE_SEARCH_INDEX
: Indexes used by the geocoding frontend.
NOMINATIM_TABLESPACE_OSM_DATA
: Raw OSM data cache used for import and updates.
NOMINATIM_TABLESPACE_OSM_INDEX
: Indexes on the raw OSM data cache.
NOMINATIM_TABLESPACE_PLACE_DATA
: Data table with the pre-filtered but still unprocessed OSM data.
Used only during imports and updates.
NOMINATIM_TABLESPACE_PLACE_INDEX
: Indexes on raw data table. Used only during imports and updates.
NOMINATIM_TABLESPACE_ADDRESS_DATA
: Data tables used for computing search terms and addresses of places
during import and updates.
NOMINATIM_TABLESPACE_ADDRESS_INDEX
: Indexes on the data tables for search term and address computation.
Used only for import and updates.
NOMINATIM_TABLESPACE_AUX_DATA
: Auxiliary data tables for non-OSM data, e.g. for Tiger house number data.
NOMINATIM_TABLESPACE_AUX_INDEX
: Indexes on auxiliary data tables.
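As a sketch, a tablespace might be created and assigned like this before the
import (name and location are illustrative; the directory must exist and be
owned by the postgres user):

```sh
# Create the tablespace in PostgreSQL, then point Nominatim at it.
sudo -u postgres psql -c "CREATE TABLESPACE searchdata LOCATION '/ssd/tablespaces/searchdata'"
echo "NOMINATIM_TABLESPACE_SEARCH_DATA=searchdata" >> .env
```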
### Replication Update Settings
#### NOMINATIM_REPLICATION_URL
| Summary | |
| -------------- | --------------------------------------------------- |
| **Description:** | Base URL of the replication service |
| **Format:** | url |
| **Default:** | https://planet.openstreetmap.org/replication/minute |
| **After Changes:** | run `nominatim replication --init` |
Replication services deliver updates to OSM data. Use this setting to choose
which replication service to use. See [Updates](../admin/Update.md) for more
information on how to set up regular updates.
#### NOMINATIM_REPLICATION_MAX_DIFF
| Summary | |
| -------------- | --------------------------------------------------- |
| **Description:** | Maximum amount of data to download per update cycle (in MB) |
| **Format:** | integer |
| **Default:** | 50 |
| **After Changes:** | restart the replication process |
At each update cycle Nominatim downloads diffs until either no more diffs
are available on the server (i.e. the database is up-to-date) or the limit
given in this setting is exceeded. Nominatim guarantees to download at least
one diff if one is available, no matter how small the setting.
The default for this setting is fairly conservative because Nominatim keeps
all data downloaded in one cycle in RAM. Using large values in a production
server may interfere badly with the search frontend because it evicts data
from RAM that is needed for speedy answers to incoming requests. It is usually
a better idea to keep this setting lower and run multiple update cycles
to catch up with updates.
When catching up in non-production mode, for example after the initial import,
the setting can easily be changed temporarily on the command line:
    NOMINATIM_REPLICATION_MAX_DIFF=3000 nominatim replication
#### NOMINATIM_REPLICATION_UPDATE_INTERVAL
| Summary | |
| -------------- | --------------------------------------------------- |
| **Description:** | Publication interval of the replication service (in seconds) |
| **Format:** | integer |
| **Default:** | 75 |
| **After Changes:** | restart the replication process |
This setting determines when Nominatim will next attempt to download a new
update. The time is computed from the publication date of the last diff
downloaded. Setting this to a slightly higher value than the actual
publication interval avoids unnecessary rechecks.
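For example, when following the standard hourly replication service, a value
slightly above one hour avoids pointless rechecks (remember to run
`nominatim replication --init` after changing the URL):

    NOMINATIM_REPLICATION_URL=https://planet.openstreetmap.org/replication/hour
    NOMINATIM_REPLICATION_UPDATE_INTERVAL=3700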
#### NOMINATIM_REPLICATION_RECHECK_INTERVAL
| Summary | |
| -------------- | --------------------------------------------------- |
| **Description:** | Wait time to recheck for a pending update (in seconds) |
| **Format:** | integer |
| **Default:** | 60 |
| **After Changes:** | restart the replication process |
When replication updates are run in continuous mode (using `nominatim replication`),
this setting determines how long Nominatim waits until it looks for updates
again when updates were not available on the server.
Note that this is different from
[NOMINATIM_REPLICATION_UPDATE_INTERVAL](#nominatim_replication_update_interval).
Nominatim will never attempt to query for new updates for UPDATE_INTERVAL
seconds after the current database date. Only after the update interval has
passed does it ask for new data. If no new data is found at that point, it
waits for RECHECK_INTERVAL seconds before trying again.
### API Settings
#### NOMINATIM_CORS_NOACCESSCONTROL
| Summary | |
| -------------- | --------------------------------------------------- |
| **Description:** | Send permissive CORS access headers |
| **Format:** | boolean |
| **Default:** | yes |
| **After Changes:** | run `nominatim refresh --website` |
When this setting is enabled, API HTTP responses include the HTTP
[CORS](https://en.wikipedia.org/wiki/CORS) headers
`access-control-allow-origin: *` and `access-control-allow-methods: OPTIONS,GET`.
#### NOMINATIM_MAPICON_URL
| Summary | |
| -------------- | --------------------------------------------------- |
| **Description:** | URL prefix for static icon images |
| **Format:** | url |
| **Default:** | _empty_ |
| **After Changes:** | run `nominatim refresh --website` |
When a mapicon URL is configured, then Nominatim includes an additional `icon`
field in the responses, pointing to an appropriate icon for the place type.
Map icons used to be included in Nominatim itself but now have moved to the
[nominatim-ui](https://github.com/osm-search/nominatim-ui/) project. If you
want the URL to be included in API responses, make the `/mapicon`
directory of the project available under a public URL and point this setting
to the directory.
#### NOMINATIM_DEFAULT_LANGUAGE
| Summary | |
| -------------- | --------------------------------------------------- |
| **Description:** | Language of responses when no language is requested |
| **Format:** | language code |
| **Default:** | _empty_ (use the local language of the feature) |
| **After Changes:** | run `nominatim refresh --website` |
Nominatim localizes the place names in responses when the corresponding
translation is available. Users can request a custom language setting through
the HTTP Accept-Language header or through the explicit parameter
[accept-language](../api/Search.md#language-of-results). If neither is
given, it falls back to this setting. If the setting is also empty, then
the local language (in OSM: the name tag without any language suffix) is
used.
#### NOMINATIM_SEARCH_BATCH_MODE
| Summary | |
| -------------- | --------------------------------------------------- |
| **Description:** | Enable a special batch query mode |
| **Format:** | boolean |
| **Default:** | no |
| **After Changes:** | run `nominatim refresh --website` |
This feature is currently undocumented and potentially broken.
#### NOMINATIM_SEARCH_NAME_ONLY_THRESHOLD
| Summary | |
| -------------- | --------------------------------------------------- |
| **Description:** | Threshold for switching the search index lookup strategy |
| **Format:** | integer |
| **Default:** | 500 |
| **After Changes:** | run `nominatim refresh --website` |
This setting defines the threshold above which a name is no longer considered
rare. When searching for places with rare names, only the name is used
for place lookups. Otherwise the name and any address information is used.
This setting only has an effect after `nominatim refresh --word-counts` has
been called to compute the word frequencies.
#### NOMINATIM_LOOKUP_MAX_COUNT
| Summary | |
| -------------- | --------------------------------------------------- |
| **Description:** | Maximum number of OSM ids accepted by /lookup |
| **Format:** | integer |
| **Default:** | 50 |
| **After Changes:** | run `nominatim refresh --website` |
The /lookup endpoint accepts a list of ids to look up address details for. This
setting restricts the number of places a user may look up with a single
request.
#### NOMINATIM_POLYGON_OUTPUT_MAX_TYPES
| Summary | |
| -------------- | --------------------------------------------------- |
| **Description:** | Number of different geometry formats that may be returned |
| **Format:** | integer |
| **Default:** | 1 |
| **After Changes:** | run `nominatim refresh --website` |
Nominatim supports returning full geometries of places. The geometries may
be requested in different formats with one of the
[`polygon_*` parameters](../api/Search.md#polygon-output). Use this
setting to restrict the number of geometry types that may be requested
with a single query.
Setting this parameter to 0 disables polygon output completely.
### Logging Settings
#### NOMINATIM_LOG_DB
| Summary | |
| -------------- | --------------------------------------------------- |
| **Description:** | Log requests into the database |
| **Format:** | boolean |
| **Default:** | no |
| **After Changes:** | run `nominatim refresh --website` |
Enable logging requests into a database table with this setting. The logs
can be found in the table `new_query_log`.
When using this logging method, it is advisable to set up a job that
regularly clears out old logging information. Nominatim will not do that
on its own.
Can be used at the same time as NOMINATIM_LOG_FILE.
#### NOMINATIM_LOG_FILE
| Summary | |
| -------------- | --------------------------------------------------- |
| **Description:** | Log requests into a file |
| **Format:** | path |
| **Default:** | _empty_ (logging disabled) |
| **After Changes:** | run `nominatim refresh --website` |
Enable logging of requests into a file by setting this option to the name
of the log file. A relative file name is assumed to be relative to
the project directory.
The entries in the log file have the following format:
    <request time> <execution time in s> <number of results> <type> "<query string>"
Request time is the time when the request was started. The execution time is
given in seconds and corresponds to the time the query took executing in PHP.
`type` contains the name of the endpoint used.
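An entry for a search request might then look like this (illustrative values):

    2021-11-10 11:58:20 0.2 10 search "pilkington avenue, birmingham"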
Can be used at the same time as NOMINATIM_LOG_DB.

View File

@@ -0,0 +1,34 @@
# Special phrases
## Importing OSM user-maintained special phrases
As described in the [Import section](../admin/Import.md), it is possible to
import special phrases from the wiki with the following command:
```sh
nominatim special-phrases --import-from-wiki
```
## Importing custom special phrases
It is also possible to import phrases from a CSV file. To do so, use the
following command:
```sh
nominatim special-phrases --import-from-csv <csv file>
```
Note that both of the import commands above replace the phrases in your database.
This means that if you import phrases from a CSV file, only the phrases
present in the CSV file will be kept in the database. All other phrases will
be removed.
If you want to only add new phrases and not update the other ones you can add
the argument `--no-replace` to the import command. For example:
```sh
nominatim special-phrases --import-from-csv <csv file> --no-replace
```
This will add the phrases present in the csv file into the database without
removing the other ones.
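As a sketch, assuming the file uses `phrase`, `class`, `type` and `operator`
columns (an assumption to verify against your Nominatim version), an import
without replacing existing phrases could look like this:

```sh
# Sketch only: the column layout below is an assumption.
cat > my_phrases.csv << 'EOF'
phrase,class,type,operator
bakery,shop,bakery,-
bakeries,shop,bakery,-
EOF
nominatim special-phrases --import-from-csv my_phrases.csv --no-replace
```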

28
docs/customize/Tiger.md Normal file
View File

@@ -0,0 +1,28 @@
# Installing TIGER housenumber data for the US
Nominatim is able to use the official [TIGER](https://www.census.gov/geographies/mapping-files/time-series/geo/tiger-line-file.html)
address set to complement the OSM house number data in the US. You can add
TIGER data to your own Nominatim instance by following these steps. The
entire US adds about 10GB to your database.
1. Get preprocessed TIGER 2021 data:

        cd $PROJECT_DIR
        wget https://nominatim.org/data/tiger2021-nominatim-preprocessed.csv.tar.gz

2. Import the data into your Nominatim database:

        nominatim add-data --tiger-data tiger2021-nominatim-preprocessed.csv.tar.gz

3. Enable use of the Tiger data in your `.env` by adding:

        echo NOMINATIM_USE_US_TIGER_DATA=yes >> .env

4. Apply the new settings:

        nominatim refresh --functions
See the [TIGER-data project](https://github.com/osm-search/TIGER-data) for more
information on how the data got preprocessed.

View File

@@ -37,39 +37,42 @@ NOMINATIM_DATABASE_MODULE_PATH=<path to directory where nominatim.so resides>
```
This is in particular useful when the database runs on a different server.
See [Advanced installations](Advanced-Installations.md#importing-nominatim-to-an-external-postgresql-database) for details.
See [Advanced installations](../admin/Advanced-Installations.md#importing-nominatim-to-an-external-postgresql-database) for details.
There are no other configuration options for the legacy tokenizer. All
normalization functions are hard-coded.
## ICU tokenizer
!!! danger
This tokenizer is currently in active development and still subject
to backwards-incompatible changes.
The ICU tokenizer uses the [ICU library](http://site.icu-project.org/) to
normalize names and queries. It also offers configurable decomposition and
abbreviation handling.
To enable the tokenizer add the following line to your project configuration:
```
NOMINATIM_TOKENIZER=icu
```
### How it works
On import the tokenizer processes names in the following four stages:
On import the tokenizer processes names in the following three stages:
1. The **Normalization** part removes all non-relevant information from the
input.
2. Incoming names are now converted to **full names**. This process is currently
hard coded and mostly serves to handle name tags from OSM that contain
multiple names (e.g. [Biel/Bienne](https://www.openstreetmap.org/node/240097197)).
3. Next the tokenizer creates **variants** from the full names. These variants
cover decomposition and abbreviation handling. Variants are saved to the
database, so that it is not necessary to create the variants for a search
query.
4. The final **Tokenization** step converts the names to a simple ASCII form,
potentially removing further spelling variants for better matching.
1. During the **Sanitizer step** incoming names are cleaned up and converted to
**full names**. This step can be used to regularize spelling, split multi-name
tags into their parts and tag names with additional attributes. See the
[Sanitizers section](#sanitizers) below for available cleaning routines.
2. The **Normalization** part removes all information from the full names
that are not relevant for search.
3. The **Token analysis** step takes the normalized full names and creates
all transliterated variants under which the name should be searchable.
See the [Token analysis](#token-analysis) section below for more
information.
At query time only stage 1) and 4) are used. The query is normalized and
tokenized and the resulting string used for searching in the database.
During query time, only normalization and transliteration are relevant.
An incoming query is first split into name chunks (this usually means splitting
the string at the commas) and then each part is normalised and transliterated.
The result is used to look up places in the search index.
### Configuration
@@ -87,21 +90,36 @@ normalization:
transliteration:
- !include /etc/nominatim/icu-rules/extended-unicode-to-asccii.yaml
- ":: Ascii ()"
variants:
- language: de
words:
- ~haus => haus
- ~strasse -> str
- language: en
words:
- road -> rd
- bridge -> bdge,br,brdg,bri,brg
sanitizers:
- step: split-name-list
token-analysis:
- analyzer: generic
variants:
- !include icu-rules/variants-ca.yaml
- words:
- road -> rd
- bridge -> bdge,br,brdg,bri,brg
```
The configuration file contains three sections:
`normalization`, `transliteration`, `variants`.
The configuration file contains four sections:
`normalization`, `transliteration`, `sanitizers` and `token-analysis`.
The normalization and transliteration sections each must contain a list of
#### Normalization and Transliteration
The normalization and transliteration sections each define a set of
ICU rules that are applied to the names.
The **normalization** rules are applied after the sanitizer step. They should remove
any information that is not relevant for search at all. Usual rules to be
applied here are: lower-casing, removal of special characters, cleanup of
spaces.
The **transliteration** rules are applied at the end of the tokenization
process to transform the name into an ASCII representation. Transliteration can
be useful to allow for further fuzzy matching, especially between different
scripts.
Each section must contain a list of
[ICU transformation rules](https://unicode-org.github.io/icu/userguide/transforms/general/rules.html).
The rules are applied in the order in which they appear in the file.
You can also include additional rules from an external yaml file using the
@@ -113,6 +131,85 @@ and may again include other files.
YAML syntax. You should therefore always enclose the ICU rules in
double-quotes.
#### Sanitizers
The sanitizers section defines an ordered list of functions that are applied
to the name and address tags before they are further processed by the tokenizer.
They allow cleaning up the tagging and bringing it to a standardized form more
suitable for building the search index.
!!! hint
Sanitizers only have an effect on how the search index is built. They
do not change the information about each place that is saved in the
database. In particular, they have no influence on how the results are
displayed. The returned results always show the original information as
stored in the OpenStreetMap database.
Each entry contains information about a sanitizer to be applied. It has a
mandatory parameter `step` which gives the name of the sanitizer. Depending
on the type, it may have additional parameters to configure its operation.
The order of the list matters. The sanitizers are applied exactly in the order
that is configured. Each sanitizer works on the results of the previous one.
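For illustration, a configuration chaining two of the sanitizers listed below
could look like this (the `delimiters` parameter is given for illustration only):

```yaml
sanitizers:
    - step: split-name-list
      delimiters: ";,"
    - step: strip-brace-terms
```

Here the multi-name tags are split first and brace terms are then stripped from
each of the resulting names.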
The following is a list of sanitizers that are shipped with Nominatim.
##### split-name-list
::: nominatim.tokenizer.sanitizers.split_name_list
selection:
members: False
rendering:
heading_level: 6
##### strip-brace-terms
::: nominatim.tokenizer.sanitizers.strip_brace_terms
selection:
members: False
rendering:
heading_level: 6
##### tag-analyzer-by-language
::: nominatim.tokenizer.sanitizers.tag_analyzer_by_language
selection:
members: False
rendering:
heading_level: 6
#### Token Analysis
Token analyzers take a full name and transform it into one or more normalized
forms that are then saved in the search index. In its simplest form, the
analyzer only applies the transliteration rules. More complex analyzers
create additional spelling variants of a name. This is useful to handle
decomposition and abbreviation.
The ICU tokenizer may use different analyzers for different names. To select
the analyzer to be used, the name must be tagged with the `analyzer` attribute
by a sanitizer (see for example the
[tag-analyzer-by-language sanitizer](#tag-analyzer-by-language)).
The token-analysis section contains the list of configured analyzers. Each
analyzer must have an `id` parameter that uniquely identifies the analyzer.
The only exception is the default analyzer that is used when no special
analyzer was selected.
Different analyzer implementations may exist. To select the implementation,
the `analyzer` parameter must be set. Currently there is only one implementation,
`generic`, which is described in the following.
##### Generic token analyzer
The generic analyzer is able to create variants from a list of given
abbreviation and decomposition replacements. It takes one optional parameter
`variants` which lists the replacements to apply. If the section is
omitted, then the generic analyzer becomes a simple analyzer that only
applies the transliteration.
The variants section defines lists of replacements which create alternative
spellings of a name. To create the variants, a name is scanned from left to
right and the longest matching replacement is applied until the end of the
@@ -138,7 +235,7 @@ term.
words in the configuration because then it is possible to change the
rules for normalization later without having to adapt the variant rules.
#### Decomposition
###### Decomposition
In its standard form, only full words match against the source. There
is a special notation to match the prefix and suffix of a word:
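A short sketch of the notation (the rules are illustrative):

```yaml
- ~strasse => str   # matches "strasse" as a full word and in suffix position
- hinter~ => hntr   # matches "hinter" as a full word and in prefix position
```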
@@ -165,7 +262,7 @@ To avoid automatic decomposition, use the '|' notation:
simply changes "hauptstrasse" to "hauptstr" and "rote strasse" to "rote str".
#### Initial and final terms
###### Initial and final terms
It is also possible to restrict replacements to the beginning and end of a
name:
@@ -178,7 +275,7 @@ name:
So the first example would trigger a replacement for "south 45th street" but
not for "the south beach restaurant".
#### Replacements vs. variants
###### Replacements vs. variants
The replacement syntax `source => target` works as a pure replacement. It changes
the name instead of creating a variant. To create an additional version, you'd

View File

@@ -0,0 +1,167 @@
# Database Layout
## Import tables
OSM data is initially imported using [osm2pgsql](https://osm2pgsql.org).
Nominatim uses its own data output style 'gazetteer', which differs from the
output style created for map rendering.
The import process creates the following tables:
![osm2pgsql tables](osm2pgsql-tables.svg)
The `planet_osm_*` tables are the usual backing tables for OSM data. Note
that Nominatim uses them to look up special relations and to find nodes on
ways.
The gazetteer style produces a single table `place` as output with the following
columns:
* `osm_type` - kind of OSM object (**N** - node, **W** - way, **R** - relation)
* `osm_id` - original OSM ID
* `class` - key of principal tag defining the object type
* `type` - value of principal tag defining the object type
* `name` - collection of tags that contain a name or reference
* `admin_level` - numerical value of the tagged administrative level
* `address` - collection of tags defining the address of an object
* `extratags` - collection of additional interesting tags that are not
directly relevant for searching
* `geometry` - geometry of the object (in WGS84)
A single OSM object may appear multiple times in this table when it is tagged
with multiple tags that may constitute a principal tag. Take for example a
motorway bridge. In OSM, this would be a way which is tagged with
`highway=motorway` and `bridge=yes`. This way would appear in the `place` table
once with a `class` of `highway` and once with a `class` of `bridge`. Thus the
*unique key* for `place` is (`osm_type`, `osm_id`, `class`).
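As a concrete illustration of the duplicate entries (the OSM id is made up):

```sql
SELECT osm_type, osm_id, class, type
  FROM place
 WHERE osm_type = 'W' AND osm_id = 4711;
-- Returns two rows for a way tagged highway=motorway + bridge=yes:
--  W | 4711 | highway | motorway
--  W | 4711 | bridge  | yes
```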
How raw OSM tags are mapped to the columns in the place table is to a certain
degree configurable. See [Customizing Import Styles](../customize/Import-Styles.md)
for more information.
## Search tables
The following tables carry all information needed to do the search:
![search tables](search-tables.svg)
The **placex** table is the central table that saves all information about the
searchable places in Nominatim. The basic columns are the same as for the
place table and have the same meaning. The placex table adds the following
additional columns:
* `place_id` - the internal unique ID to identify the place
* `partition` - the id to use with partitioned tables (see below)
* `geometry_sector` - a location hash used for geographically close ordering
* `parent_place_id` - the next higher place in the address hierarchy, only
relevant for POI-type places (with rank 30)
* `linked_place_id` - place ID of the place this object has been merged with.
When this ID is set, then the place is invisible for search.
* `importance` - measure how well known the place is
* `rank_search`, `rank_address` - search and address rank (see [Customizing ranking](../customize/Ranking.md))
* `wikipedia` - the wikipedia page used for computing the importance of the place
* `country_code` - the country the place is located in
* `housenumber` - normalized housenumber, if the place has one
* `postcode` - computed postcode for the place
* `indexed_status` - processing status of the place (0 - ready, 1 - freshly inserted, 2 - needs updating, 100 - needs deletion)
* `indexed_date` - timestamp when the place was processed last
* `centroid` - a point feature for the place
The **location_property_osmline** table is a special table for
[address interpolations](https://wiki.openstreetmap.org/wiki/Addresses#Using_interpolation).
The columns have the same meaning and use as the columns with the same name in
the placex table. Only three columns are special:
* `startnumber` and `endnumber` - beginning and end of the number range
for the interpolation
* `interpolationtype` - a string `odd`, `even` or `all` to indicate
the interval between the numbers
Address interpolations are always ways in OSM, which is why there is no column
`osm_type`.
The **location_postcode** table holds computed centroids of all postcodes that
can be found in the OSM data. The meaning of the columns is again the same
as that of the placex table.
Every place needs an address, a set of surrounding places that describe the
location of the place. The set of address places is made up of OSM places
themselves. The **place_addressline** table cross-references for each place
all the places that make up its address. Two columns define the address
relation:
* `place_id` - reference to the place being addressed
* `address_place_id` - reference to the place serving as an address part
Most of the columns cache information from the placex entry of the address
part. The exceptions are:
* `fromarea` - is true if the address part has an area geometry and can
therefore be considered precise
* `isaddress` - is true if the address part should show up in the address
output. Sometimes there are multiple places competing for the same address
type (e.g. multiple cities) and this field resolves the tie.
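Under this layout, the address parts of a place can be listed with a query
along the following lines (the place id is hypothetical):

```sql
-- Lists the address parts of a place, from the most local to the
-- least local part.
SELECT p.name -> 'name' AS address_part, a.isaddress
  FROM place_addressline a
  JOIN placex p ON p.place_id = a.address_place_id
 WHERE a.place_id = 12345
 ORDER BY a.cached_rank_address DESC;
```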
The **search_name** table contains the search index proper. It saves for each
place the terms with which the place can be found. The terms are split into
the name itself and all terms that make up the address. The table mirrors some
of the columns from placex for faster lookup.
Search terms are not saved as strings. Each term is assigned an integer and those
integers are saved in the name and address vectors of the search_name table. The
**word** table serves as the lookup table from string to such a word ID. The
exact content of the word table depends on the [tokenizer](Tokenizers.md) used.
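A sketch of how the two tables interact (token ID and term are made up; the
exact columns of the word table are tokenizer-specific):

```sql
-- Find the ID of a search term ...
SELECT word_id FROM word WHERE word_token = 'hauptstr';
-- ... and all places whose name contains that term (assuming id 1234):
SELECT place_id FROM search_name WHERE name_vector @> ARRAY[1234];
```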
## Address computation tables
Next to the main search tables, there is a set of secondary helper tables used
to compute the address relations between places. These tables are partitioned.
Each country is assigned a partition number in the country_name table (see
below) and the data is then split between a set of tables, one for each
partition. Note that Nominatim still manually manages partitioned tables.
Native support for partitions in PostgreSQL only became usable with version 13.
It will be a little while before Nominatim drops support for older versions.
![address tables](address-tables.svg)
The **search_name_X** tables are used to look up streets that appear in the
`addr:street` tag.
The **location_area_large_X** tables are used to look up larger areas
(administrative boundaries and place nodes) either through their geographic
closeness or through `addr:*` entries.
The **location_road_X** tables are used to find the closest street for a
dependent place.
All three tables cache specific information from the placex table for their
selected subset of places:
* `keywords` and `name_vector` contain lists of term ids (from the word table)
that the full name of the place should match against
* `isguess` is true for places that are not described by an area
All other columns reflect their counterpart in the placex table.
## Static data tables
Nominatim also creates a number of static tables at import:
* `nominatim_properties` saves settings that must not be changed after
import
* `address_levels` saves the rank information from the
[ranking configuration](../customize/Ranking.md)
* `country_name` contains a fallback of names for all countries, their
default languages and saves the assignment of countries to partitions.
* `country_osm_grid` provides a fallback for country geometries
## Auxiliary data tables
Finally there are some tables for auxiliary data:
* `location_property_tiger` - saves house numbers from the TIGER import. Its
layout is similar to that of `location_property_osmline`.
* `place_class_*` tables are helper tables to facilitate lookup of POIs
by their class and type. They exist because it is not possible to create
combined indexes with geometries.

View File

@@ -38,6 +38,7 @@ It has the following additional requirements:
The documentation is built with mkdocs:
* [mkdocs](https://www.mkdocs.org/) >= 1.1.2
* [mkdocstrings](https://mkdocstrings.github.io/)
### Installing prerequisites on Ubuntu/Debian
@@ -51,7 +52,7 @@ To install all necessary packages run:
sudo apt install php-cgi phpunit php-codesniffer \
python3-pip python3-setuptools python3-dev pylint
pip3 install --user behave mkdocs pytest
pip3 install --user behave mkdocs mkdocstrings pytest
```
The `mkdocs` executable will be located in `.local/bin`. You may have to add
@@ -113,7 +114,7 @@ symlinks (see `CMakeLists.txt` for the exact steps).
Now you can start a webserver for local testing
```
build> mkdocs serve
build> make serve-doc
[server:296] Serving on http://127.0.0.1:8000
[handlers:62] Start watching changes
```
@@ -122,7 +123,7 @@ If you develop inside a Vagrant virtual machine, use a port that is forwarded
to your host:
```
build> mkdocs serve --dev-addr 0.0.0.0:8088
build> PYTHONPATH=$SRCDIR mkdocs serve --dev-addr 0.0.0.0:8088
[server:296] Serving on http://0.0.0.0:8088
[handlers:62] Start watching changes
```

docs/develop/Indexing.md (new file, 152 lines)
View File

@@ -0,0 +1,152 @@
# Indexing Places
In Nominatim, the word __indexing__ refers to the process that takes the raw
OpenStreetMap data from the place table, enriches it with address information
and creates the search indexes. This section explains the basic data flow.
## Initial import
After osm2pgsql has loaded the raw OSM data into the place table,
the data is copied to the final search tables placex and location_property_osmline.
While they are copied, some basic properties are added:
* country_code, geometry_sector and partition
* initial search and address rank
In addition, the column `indexed_status` is set to `1`, marking the place as one
that needs to be indexed.
All this happens in the triggers `placex_insert` and `osmline_insert`.
## Indexing
The main workhorse of the data import is the indexing step, where Nominatim
takes every place from the placex and location_property_osmline tables where
the indexed_status != 0 and computes the search terms and the address parts
of the place.
The indexing happens in three major steps:
1. **Data preparation** - The indexer gets the data for the place to be indexed
from the database.
2. **Search name processing** - The prepared data is given to the
tokenizer which computes the search terms from the names
and potentially other information.
3. **Address processing** - The indexer then hands the prepared data and the
tokenizer information back to the database via an `INSERT` statement which
also sets the indexed_status to `0`. This triggers the update triggers
`placex_update`/`osmline_update` which do the work of computing address
parts and filling all the search tables.
When computing the address terms of a place, Nominatim relies on the processed
search names of all the address parts. That is why places are processed in rank
order, from smallest rank to largest. To ensure correct handling of linked
place nodes, administrative boundaries are processed before all other places.
Apart from these restrictions, each place can be indexed independently
from the others. This allows a large degree of parallelization during the indexing.
It also means that the indexing process can be interrupted at any time and
will simply pick up where it left off when restarted.
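Conceptually, one indexing pass per rank then boils down to a query of this
shape (a simplified sketch; boundaries and interpolations are handled in
separate passes):

```sql
-- Fetch all pending places of the rank currently being processed;
-- the surrounding loop runs from the lowest to the highest rank.
SELECT place_id
  FROM placex
 WHERE indexed_status > 0 AND rank_address = 16
 ORDER BY geometry_sector;
```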
### Data preparation
The data preparation step computes and retrieves all data for a place that
might be needed for the next step of processing the search name. That includes
* location information (country code)
* place classification (class, type, ranks)
* names (including names of linked places)
* address information (`addr:*` tags)
Data preparation is implemented in PL/pgSQL, mostly in the functions
`placex_indexing_prepare()` and `get_interpolation_address()`.
#### `addr:*` tag inheritance
Nominatim has limited support for inheriting address tags from a building
to POIs inside the building. This only works when the address tags are on the
building outline. Any rank 30 object inside such a building or on its outline
inherits all address tags when it does not have any address tags of its own.
The inheritance is computed in the data preparation step.
### Search name processing
The prepared place information is handed to the tokenizer next. This is a
Python module responsible for processing the names from both name and address
terms and building up the word index from them. The process is explained in
more detail in the [Tokenizer chapter](Tokenizers.md).
### Address processing
Finally, the preprocessed place information and the results of the search name
processing are written back to the database. At this point the update triggers
of the placex/location_property_osmline tables take over and fill all the
dependent tables. This makes up the most work-intensive part of the indexing.
Nominatim distinguishes between dependent and independent places.
**Dependent places** are all places on rank 30: house numbers, POIs etc. These
places don't have a full address of their own. Instead they are attached to
a parent street or place and use the information of the parent for searching
and displaying information. All other places are **independent places**: streets,
parks, water bodies, suburbs, cities, states etc. They receive a full address
of their own.
The address processing for both types of places is very different.
#### Independent places
To compute the address of an independent place, Nominatim searches for all
places that at least partially cover it.
For places with an area, that area is used to check for coverage. For place
nodes an artificial square area is computed according to the rank of
the place. The lower the rank, the larger the area. The `location_area_large_X`
tables are there to facilitate the lookup. All places that can function as
the address of another place are saved in those tables.
`addr:*` and `isin:*` tags are taken into account to compute the address, too.
Nominatim will give preference to places with the same name as in these tags
when looking for places in the vicinity. If there are no matching place names
at all, then the tags are at least added to the search index. That means that
the names will not be shown in the result as the 'address' of the place, but
searching by them still works.
Independent places are always added to the global search index `search_name`.
#### Dependent places
Dependent places skip the full address computation for performance reasons.
Instead they just find a parent place to attach themselves to.
![parenting of dependent places](parenting-flow.svg)
By default a POI
or house number will be attached to the closest street. That can be any major
or minor street indexed by Nominatim. In the default configuration that means
that it can attach itself to a footway but only when it has a name.
When the dependent place has an `addr:street` tag, then Nominatim will first
try to find a street with the same name before falling back to the closest
street.
There are also addresses in OSM where the housenumber does not belong
to a street at all. These have an `addr:place` tag. For these places, Nominatim
tries to find a place with the given name in the indexed places with an
address rank between 16 and 25. If none is found, then the dependent place
is attached to the closest place in that category and the addr:place name is
added as an *unlisted* place, which indicates to Nominatim that it needs to add
it to the address output, no matter what. This special case is necessary to
cover addresses that don't really refer to an existing object.
When an address has both the `addr:street` and `addr:place` tags, then Nominatim
assumes that the `addr:place` tag in fact should be the city part of the address
and gives the POI the usual street number address.
Dependent places are only added to the global search index `search_name` when
they have either a name themselves or when they have address tags that are not
covered by the places that make up their address. The latter ensures that
addresses are always searchable by those address tags.

View File

@@ -1,45 +0,0 @@
# Postcodes in Nominatim
The blog post
[Nominatim and Postcodes](https://www.openstreetmap.org/user/lonvia/diary/43143)
describes the handling implemented since Nominatim 3.1.
Postcode centroids (aka 'calculated postcodes') are generated by looking at all
postcodes of a country, grouping them and calculating the geometric centroid.
There is currently no logic to deal with extreme outliers (typos or other
mistakes in OSM data). There is also no check if a postcode adheres to a
country's format, e.g. if Swiss postcodes are 4 digits.
## Regularly updating calculated postcodes
The script to rerun the calculation is
`nominatim refresh --postcodes`
and runs once per night on nominatim.openstreetmap.org.
## Finding places that share a specific postcode
In the Nominatim database run
```sql
SELECT address->'postcode' as pc,
osm_type, osm_id, class, type,
st_x(centroid) as lon, st_y(centroid) as lat
FROM placex
WHERE country_code='fr'
AND upper(trim (both ' ' from address->'postcode')) = '33210';
```
Alternatively on [Overpass](https://overpass-turbo.eu/) run the following query
```
[out:json][timeout:250];
area["name"="France"]->.boundaryarea;
(
nwr(area.boundaryarea)["addr:postcode"="33210"];
);
out body;
>;
out skel qt;
```

docs/develop/Tokenizers.md (new file, 332 lines)
View File

@@ -0,0 +1,332 @@
# Tokenizers
The tokenizer is the component of Nominatim that is responsible for
analysing names of OSM objects and queries. Nominatim provides different
tokenizers that use different strategies for normalisation. This page describes
how tokenizers are expected to work and the public API that needs to be
implemented when creating a new tokenizer. For information on how to configure
a specific tokenizer for a database see the
[tokenizer chapter in the Customization Guide](../customize/Tokenizers.md).
## Generic Architecture
### About Search Tokens
Search in Nominatim is organised around search tokens. Such a token represents
a string that can be part of the search query. Tokens are used so that the search
index does not need to be organised around strings. Instead the database saves
for each place which tokens match this place's name, address, house number etc.
To be able to distinguish between these different types of information stored
with the place, a search token also always has a certain type: name, house number,
postcode etc.
During search an incoming query is transformed into an ordered list of such
search tokens (or rather many lists, see below) and this list is then converted
into a database query to find the right place.
It is the core task of the tokenizer to create, manage and assign the search
tokens. The tokenizer is involved in two distinct operations:
* __at import time__: scanning names of OSM objects, normalizing them and
building up the list of search tokens.
* __at query time__: scanning the query and returning the appropriate search
tokens.
### Importing
The indexer is responsible for enriching an OSM object (or place) with all data
required for geocoding. It is split into two parts: the controller collects
the places that require updating, enriches the place information as required
and hands the place to PostgreSQL. The controller is part of the Nominatim
library written in Python. Within PostgreSQL, the `placex_update`
trigger is responsible for filling out all secondary tables with extra geocoding
information. This part is written in PL/pgSQL.
The tokenizer is involved in both parts. When the indexer prepares a place,
it hands it over to the tokenizer to inspect the names and create all the
search tokens applicable for the place. This usually involves updating the
tokenizer's internal token lists and creating a list of all token IDs for
the specific place. This list is later needed in the PL/pgSQL part where the
indexer needs to add the token IDs to the appropriate search tables. To be
able to communicate the list between the Python part and the pl/pgSQL trigger,
the `placex` table contains a special JSONB column `token_info` which is there
for the exclusive use of the tokenizer.
The Python part of the tokenizer returns structured information about the
tokens of a place to the indexer, which converts it to JSON and inserts it into
the `token_info` column. The content of the column is then handed to the PL/pgSQL
callbacks of the tokenizer, which extract the required information. Usually
the tokenizer then removes all information from the `token_info` structure,
so that no information is ever persistently saved in the table. After all, any
information that went in should have been processed and put into secondary tables.
This is however not a hard requirement. If the tokenizer needs to store
additional information about a place permanently, it may do so in the
`token_info` column. It just may never execute searches over it and
consequently not create any special indexes on it.
### Querying
At query time, Nominatim builds up multiple _interpretations_ of the search
query. Each of these interpretations is tried against the database in order
of the likelihood with which they match to the search query. The first
interpretation that yields results wins.
The interpretations are encapsulated in the `SearchDescription` class. An
instance of this class is created by applying a sequence of
_search tokens_ to an initially empty SearchDescription. It is the
responsibility of the tokenizer to parse the search query and derive all
possible sequences of search tokens. To that end the tokenizer needs to parse
the search query and look up matching words in its own data structures.
## Tokenizer API
The following section describes the functions that need to be implemented
for a custom tokenizer implementation.
!!! warning
This API is currently in early alpha status. While this API is meant to
be a public API on which other tokenizers may be implemented, the API is
far from stable at the moment.
### Directory Structure
Nominatim expects two files for a tokenizer:
* `nominatim/tokenizer/<NAME>_tokenizer.py` containing the Python part of the
implementation
* `lib-php/tokenizer/<NAME>_tokenizer.php` with the PHP part of the
implementation
where `<NAME>` is a unique name for the tokenizer consisting of only lower-case
letters, digits and underscore. A tokenizer also needs to install some SQL
functions. By convention, these should be placed in `lib-sql/tokenizer`.
If the tokenizer has a default configuration file, this should be saved as
`settings/<NAME>_tokenizer.<SUFFIX>`.
### Configuration and Persistence
Tokenizers may define custom settings for their configuration. All settings
must be prefixed with `NOMINATIM_TOKENIZER_`. Settings may be transient or
persistent. Transient settings are loaded from the configuration file when
Nominatim is started and may thus be changed at any time. Persistent settings
are tied to a database installation and must only be read during installation
time. If they are needed for the runtime then they must be saved into the
`nominatim_properties` table and later loaded from there.
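For example, a persistent setting could be written during installation like
this (the property name is illustrative; the table is assumed to have the two
text columns `property` and `value`):

```sql
INSERT INTO nominatim_properties (property, value)
     VALUES ('tokenizer_example_setting', '42');
```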
### The Python module
The Python module is expected to export a single factory function:
```python
def create(dsn: str, data_dir: Path) -> AbstractTokenizer
```
The `dsn` parameter contains the DSN of the Nominatim database. The `data_dir`
is a directory in the project directory that the tokenizer may use to save
database-specific data. The function must return an instance of the tokenizer
class as defined below.
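A minimal module skeleton following this contract could look like the following
sketch (only the `create()` signature is prescribed by the API; class and
attribute names are illustrative):

```python
from pathlib import Path

from nominatim.tokenizer.base import AbstractTokenizer


class MyTokenizer(AbstractTokenizer):
    """Sketch only - the abstract methods of AbstractTokenizer still
       need to be implemented for a working tokenizer.
    """

    def __init__(self, dsn: str, data_dir: Path) -> None:
        self.dsn = dsn            # DSN of the Nominatim database
        self.data_dir = data_dir  # directory for database-specific data


def create(dsn: str, data_dir: Path) -> AbstractTokenizer:
    """Factory function called by Nominatim to instantiate the tokenizer."""
    return MyTokenizer(dsn, data_dir)
```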
### Python Tokenizer Class
All tokenizers must inherit from `nominatim.tokenizer.base.AbstractTokenizer`
and implement the abstract functions defined there.
::: nominatim.tokenizer.base.AbstractTokenizer
rendering:
heading_level: 4
### Python Analyzer Class
::: nominatim.tokenizer.base.AbstractAnalyzer
rendering:
heading_level: 4
### PL/pgSQL Functions
The tokenizer must provide access functions for the `token_info` column
to the indexer which extracts the necessary information for the global
search tables. If the tokenizer needs additional SQL functions for private
use, then these functions must be prefixed with `token_` in order to ensure
that there are no naming conflicts with the SQL indexer code.
The following functions are expected:
```sql
FUNCTION token_get_name_search_tokens(info JSONB) RETURNS INTEGER[]
```
Return an array of token IDs of search terms that should match
the name(s) for the given place. These tokens are used to look up the place
by name and, where the place functions as part of an address for another place,
by address. Must return NULL when the place has no name.
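For a tokenizer that stores its name token IDs in a `names` field of
`token_info`, the implementation can be a simple sketch like this (the field
name is an assumption of the example):

```sql
CREATE OR REPLACE FUNCTION token_get_name_search_tokens(info JSONB)
  RETURNS INTEGER[] AS $$
  -- STRICT takes care of NULL input; a missing 'names' field also
  -- yields NULL, as the contract requires.
  SELECT (info->>'names')::INTEGER[]
$$ LANGUAGE SQL IMMUTABLE STRICT;
```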
```sql
FUNCTION token_get_name_match_tokens(info JSONB) RETURNS INTEGER[]
```
Return an array of token IDs of full names of the place that should be used
to match addresses. The list of match tokens is usually more strict than
search tokens as it is used to find a match between two OSM tag values which
are expected to contain matching full names. Partial terms should not be
used for match tokens. Must return NULL when the place has no name.
```sql
FUNCTION token_get_housenumber_search_tokens(info JSONB) RETURNS INTEGER[]
```
Return an array of token IDs of house number tokens that apply to the place.
Note that a place may have multiple house numbers, for example when apartments
each have their own number. Must be NULL when the place has no house numbers.
```sql
FUNCTION token_normalized_housenumber(info JSONB) RETURNS TEXT
```
Return the house number(s) in the normalized form that can be matched against
a house number token text. If a place has multiple house numbers they must
be listed with a semicolon as delimiter. Must be NULL when the place has no
house numbers.
```sql
FUNCTION token_matches_street(info JSONB, street_tokens INTEGER[]) RETURNS BOOLEAN
```
Check if the given tokens (previously saved from `token_get_name_match_tokens()`)
match against the `addr:street` tag name. Must return either NULL or FALSE
when the place has no `addr:street` tag.
```sql
FUNCTION token_matches_place(info JSONB, place_tokens INTEGER[]) RETURNS BOOLEAN
```
Check if the given tokens (previously saved from `token_get_name_match_tokens()`)
match against the `addr:place` tag name. Must return either NULL or FALSE
when the place has no `addr:place` tag.
```sql
FUNCTION token_addr_place_search_tokens(info JSONB) RETURNS INTEGER[]
```
Return the search token IDs extracted from the `addr:place` tag. These tokens
are used for searches by address when no matching place can be found in the
database. Must be NULL when the place has no `addr:place` tag.
```sql
FUNCTION token_get_address_keys(info JSONB) RETURNS SETOF TEXT
```
Return the set of keys for which address information is provided. This
should correspond to the list of (relevant) `addr:*` tags with the `addr:`
prefix removed or the keys used in the `address` dictionary of the place info.
```sql
FUNCTION token_get_address_search_tokens(info JSONB, key TEXT) RETURNS INTEGER[]
```
Return the array of search tokens for the given address part. `key` can be
expected to be one of those returned with `token_get_address_keys()`. The
search tokens are added to the address search vector of the place when no
corresponding OSM object could be found for the given address part from which
to copy the name information.
```sql
FUNCTION token_matches_address(info JSONB, key TEXT, tokens INTEGER[]) RETURNS BOOLEAN
```
Check if the given tokens match against the address part `key`.
__Warning:__ the tokens that are handed in are the lists previously saved
from `token_get_name_search_tokens()`, _not_ from the match token list. This
is an historical oddity which will be fixed at some point in the future.
Currently, tokenizers are encouraged to make sure that matching works against
both the search token list and the match token list.
```sql
FUNCTION token_normalized_postcode(postcode TEXT) RETURNS TEXT
```
Return the normalized version of the given postcode. This function must return
the same value as the Python function `AbstractAnalyzer->normalize_postcode()`.
```sql
FUNCTION token_strip_info(info JSONB) RETURNS JSONB
```
Return the part of the `token_info` field that should be stored in the database
permanently. The indexer calls this function when all processing is done and
replaces the content of the `token_info` column with the returned value before
the trigger stores the information in the database. May return NULL if no
information should be stored permanently.
### PHP Tokenizer class
The PHP tokenizer class is instantiated once per request and is responsible
for analyzing the incoming query. Multiple requests may be in flight in
parallel.
The class is expected to be found under the
name of `\Nominatim\Tokenizer`. To find the class the PHP code includes the file
`tokenizer/tokenizer.php` in the project directory. This file must be created
when the tokenizer is first set up on import. The file should initialize any
configuration variables by setting PHP constants and then require the file
with the actual implementation of the tokenizer.
The tokenizer class must implement the following functions:
```php
public function __construct(object &$oDB)
```
The constructor of the class receives a database connection that can be used
to query persistent data in the database.
```php
public function checkStatus()
```
Check that the tokenizer can access its persistent data structures. If there
is an issue, throw an `\Exception`.
```php
public function normalizeString(string $sTerm) : string
```
Normalize a string to a form to be used for comparisons when reordering results.
Nominatim reweighs results by how well the final display string matches the actual
query. Before comparing result and query, names and query are normalised with
this function. The tokenizer can thus remove all properties that should not be
taken into account for reweighing, e.g. special characters or case.
```php
public function tokensForSpecialTerm(string $sTerm) : array
```
Return the list of special term tokens that match the given term.
```php
public function extractTokensFromPhrases(array &$aPhrases) : TokenList
```
Parse the given phrases, splitting them into word lists and retrieve the
matching tokens.
The phrase array may take on two forms. In unstructured searches (using `q=`
parameter) the search query is split at the commas and the elements are
put into a sorted list. For structured searches the phrase array is an
associative array where the key designates the type of the term (street, city,
county etc.). The tokenizer may ignore the phrase type at this stage in parsing.
Matching phrase type and appropriate search token type will be done later
when the SearchDescription is built.
For each phrase in the list of phrases, the function must analyse the phrase
string and then call `setWordSets()` to communicate the result of the analysis.
A word set is a list of strings, where each string refers to a search token.
A phrase may have multiple interpretations. Therefore a list of word sets is
usually attached to the phrase. The search tokens themselves are returned
by the function in an associative array, where the key corresponds to the
strings given in the word sets. The value is a list of search tokens. Thus
a single string in the list of word sets may refer to multiple search tokens.
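To illustrate with made-up values: for the phrase "hauptstr 134" the tokenizer
might register two possible segmentations:

```php
// Both segmentations are handed to the phrase as word sets ...
$oPhrase->setWordSets(array(
    array('hauptstr 134'),    // interpreted as one combined term
    array('hauptstr', '134')  // interpreted as street + house number
));
// ... while the returned associative array maps each string to its
// matching search tokens:
// array('hauptstr 134' => [...], 'hauptstr' => [...], '134' => [...])
```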

View File

@@ -0,0 +1,35 @@
@startuml
skinparam monochrome true
skinparam ObjectFontStyle bold
map search_name_X {
place_id => BIGINT
address_rank => SMALLINT
name_vector => INT[]
centroid => GEOMETRY
}
map location_area_large_X {
place_id => BIGINT
keywords => INT[]
partition => SMALLINT
rank_search => SMALLINT
rank_address => SMALLINT
country_code => VARCHAR(2)
isguess => BOOLEAN
postcode => TEXT
centroid => POINT
geometry => GEOMETRY
}
map location_road_X {
place_id => BIGINT
partition => SMALLINT
country_code => VARCHAR(2)
geometry => GEOMETRY
}
search_name_X -[hidden]> location_area_large_X
location_area_large_X -[hidden]> location_road_X
@enduml

[rendered SVG diagram added (11 KiB); image diff not shown]

View File

@@ -0,0 +1,44 @@
@startuml
skinparam monochrome true
skinparam ObjectFontStyle bold
map planet_osm_nodes #eee {
id => BIGINT
lat => INT
lon => INT
}
map planet_osm_ways #eee {
id => BIGINT
nodes => BIGINT[]
tags => TEXT[]
}
map planet_osm_rels #eee {
id => BIGINT
parts => BIGINT[]
members => TEXT[]
tags => TEXT[]
way_off => SMALLINT
rel_off => SMALLINT
}
map place {
osm_type => CHAR(1)
osm_id => BIGINT
class => TEXT
type => TEXT
name => HSTORE
address => HSTORE
extratags => HSTORE
admin_level => SMALLINT
geometry => GEOMETRY
}
planet_osm_nodes -[hidden]> planet_osm_ways
planet_osm_ways -[hidden]> planet_osm_rels
planet_osm_ways -[hidden]-> place
planet_osm_nodes::id <- planet_osm_ways::nodes
@enduml

[rendered SVG diagram added (13 KiB); image diff not shown]

View File

@@ -0,0 +1,31 @@
@startuml
skinparam monochrome true
start
if (has 'addr:street'?) then (yes)
if (street with that name\n nearby?) then (yes)
:**Use closest street**
**with same name**;
kill
else (no)
:**Use closest**\n**street**;
kill
endif
elseif (has 'addr:place'?) then (yes)
if (place with that name\n nearby?) then (yes)
:**Use closest place**
**with same name**;
kill
else (no)
:add addr:place to address;
:**Use closest place**\n**rank 16 to 25**;
kill
endif
else (otherwise)
:**Use closest**\n**street**;
kill
endif
@enduml

[rendered SVG diagram added (9.8 KiB); image diff not shown]

View File

@@ -0,0 +1,99 @@
@startuml
skinparam monochrome true
skinparam ObjectFontStyle bold
left to right direction
map placex {
place_id => BIGINT
osm_type => CHAR(1)
osm_id => BIGINT
class => TEXT
type => TEXT
name => HSTORE
address => HSTORE
extratags => HSTORE
admin_level => SMALLINT
partition => SMALLINT
geometry_sector => INT
parent_place_id => BIGINT
linked_place_id => BIGINT
importance => DOUBLE
rank_search => SMALLINT
rank_address => SMALLINT
wikipedia => TEXT
country_code => VARCHAR(2)
housenumber => TEXT
postcode => TEXT
indexed_status => SMALLINT
indexed_date => TIMESTAMP
centroid => GEOMETRY
geometry => GEOMETRY
}
map search_name {
place_id => BIGINT
importance => DOUBLE
search_rank => SMALLINT
address_rank => SMALLINT
name_vector => INT[]
nameaddress_vector => INT[]
country_code => VARCHAR(2)
centroid => GEOMETRY
}
map word {
word_id => INT
word_token => TEXT
... =>
}
map location_property_osmline {
place_id => BIGINT
osm_id => BIGINT
startnumber => INT
endnumber => INT
interpolationtype => TEXT
address => HSTORE
partition => SMALLINT
geometry_sector => INT
parent_place_id => BIGINT
country_code => VARCHAR(2)
postcode => TEXT
indexed_status => SMALLINT
indexed_date => TIMESTAMP
linegeo => GEOMETRY
}
map place_addressline {
place_id => BIGINT
address_place_id => BIGINT
distance => DOUBLE
cached_rank_address => SMALLINT
fromarea => BOOLEAN
isaddress => BOOLEAN
}
map location_postcode {
place_id => BIGINT
postcode => TEXT
parent_place_id => BIGINT
rank_search => SMALLINT
rank_address => SMALLINT
indexed_status => SMALLINT
indexed_date => TIMESTAMP
geometry => GEOMETRY
}
placex::place_id <-- search_name::place_id
placex::place_id <-- place_addressline::place_id
placex::place_id <-- place_addressline::address_place_id
search_name::name_vector --> word::word_id
search_name::nameaddress_vector --> word::word_id
place_addressline -[hidden]> location_property_osmline
search_name -[hidden]> place_addressline
location_property_osmline -[hidden]-> location_postcode
@enduml

[rendered SVG diagram added (35 KiB); image diff not shown]

View File

@@ -13,3 +13,11 @@ th, td {
th {
background-color: #eee;
}
/* Indentation for mkdocstrings.
div.doc-contents:not(.first) {
padding-left: 25px;
border-left: 4px solid rgba(230, 230, 230);
margin-bottom: 60px;
}*/

View File

@@ -1,8 +1,10 @@
Nominatim (from the Latin, 'by name') is a tool to search OSM data by name and address and to generate synthetic addresses of OSM points (reverse geocoding).
This guide comes in three parts:
This guide comes in four parts:
* __[API reference](api/Overview.md)__ for users of Nominatim
* __[Administration Guide](admin/Installation.md)__ for those who want
to install their own Nominatim server
* __[Customization Guide](customize/Overview.md)__ for those who want to
adapt their own installation to their special requirements
* __[Developer's Guide](develop/overview.md)__ for developers of the software

View File

@@ -19,18 +19,26 @@ pages:
- 'Import' : 'admin/Import.md'
- 'Update' : 'admin/Update.md'
- 'Deploy' : 'admin/Deployment.md'
- 'Customize Imports' : 'admin/Customization.md'
- 'Tokenizers' : 'admin/Tokenizers.md'
- 'Nominatim UI' : 'admin/Setup-Nominatim-UI.md'
- 'Advanced Installations' : 'admin/Advanced-Installations.md'
- 'Maintenance' : 'admin/Maintenance.md'
- 'Migration from older Versions' : 'admin/Migration.md'
- 'Troubleshooting' : 'admin/Faq.md'
- 'Customization Guide':
- 'Overview': 'customize/Overview.md'
- 'Import Styles': 'customize/Import-Styles.md'
- 'Configuration Settings': 'customize/Settings.md'
- 'Place Ranking' : 'customize/Ranking.md'
- 'Tokenizers' : 'customize/Tokenizers.md'
- 'Special Phrases': 'customize/Special-Phrases.md'
- 'External data: US housenumbers from TIGER': 'customize/Tiger.md'
- 'External data: Postcodes': 'customize/Postcodes.md'
- 'Developers Guide':
- 'Setup for Development' : 'develop/Development-Environment.md'
- 'Architecture Overview' : 'develop/overview.md'
- 'OSM Data Import' : 'develop/Import.md'
- 'Place Ranking' : 'develop/Ranking.md'
- 'Postcodes' : 'develop/Postcodes.md'
- 'Database Layout' : 'develop/Database-Layout.md'
- 'Indexing' : 'develop/Indexing.md'
- 'Tokenizers' : 'develop/Tokenizers.md'
- 'Setup for Development' : 'develop/Development-Environment.md'
- 'Testing' : 'develop/Testing.md'
- 'External Data Sources': 'develop/data-sources.md'
- 'Appendix':
@@ -41,6 +49,15 @@ pages:
markdown_extensions:
- codehilite
- admonition
- def_list
- toc:
permalink:
extra_css: [extra.css, styles.css]
plugins:
- search
- mkdocstrings:
handlers:
python:
rendering:
show_source: false
show_signature_annotations: false

View File

@@ -127,7 +127,7 @@ class Debug
public static function printSQL($sSQL)
{
echo '<p><tt><font color="#aaa">'.$sSQL.'</font></tt></p>'."\n";
echo '<p><tt><font color="#aaa">'.htmlspecialchars($sSQL, ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML401).'</font></tt></p>'."\n";
}
private static function outputVar($mVar, $sPreNL)
@@ -170,11 +170,12 @@ class Debug
}
if (is_string($mVar)) {
echo "'$mVar'";
return strlen($mVar) + 2;
$sOut = "'$mVar'";
} else {
$sOut = (string)$mVar;
}
echo (string)$mVar;
return strlen((string)$mVar);
echo htmlspecialchars($sOut, ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML401);
return strlen($sOut);
}
}

View File

@@ -498,7 +498,6 @@ class Geocode
if ($this->aCountryCodes) {
$oCtx->setCountryList($this->aCountryCodes);
}
$this->oTokenizer->setCountryRestriction($this->aCountryCodes);
Debug::newSection('Query Preprocessing');
@@ -507,13 +506,6 @@ class Geocode
userError('Query string is not UTF-8 encoded.');
}
// Conflicts between US state abreviations and various words for 'the' in different languages
if (isset($this->aLangPrefOrder['name:en'])) {
$sQuery = preg_replace('/(^|,)\s*il\s*(,|$)/i', '\1illinois\2', $sQuery);
$sQuery = preg_replace('/(^|,)\s*al\s*(,|$)/i', '\1alabama\2', $sQuery);
$sQuery = preg_replace('/(^|,)\s*la\s*(,|$)/i', '\1louisiana\2', $sQuery);
}
// Do we have anything that looks like a lat/lon pair?
$sQuery = $oCtx->setNearPointFromQuery($sQuery);

View File

@@ -9,29 +9,14 @@ namespace Nominatim;
*/
class Phrase
{
const MAX_WORDSET_LEN = 20;
const MAX_WORDSETS = 100;
// Complete phrase as a string.
// Complete phrase as a string (guaranteed to have no leading or trailing
// spaces).
private $sPhrase;
// Element type for structured searches.
private $sPhraseType;
// Possible segmentations of the phrase.
private $aWordSets;
public static function cmpByArraylen($aA, $aB)
{
$iALen = count($aA);
$iBLen = count($aB);
if ($iALen == $iBLen) {
return 0;
}
return ($iALen < $iBLen) ? -1 : 1;
}
public function __construct($sPhrase, $sPhraseType)
{
$this->sPhrase = trim($sPhrase);
@@ -57,6 +42,11 @@ class Phrase
return $this->sPhraseType;
}
public function setWordSets($aWordSets)
{
$this->aWordSets = $aWordSets;
}
/**
* Return the array of possible segmentations of the phrase.
*
@@ -80,61 +70,6 @@ class Phrase
}
}
public function computeWordSets($aWords, $oTokens)
{
$iNumWords = count($aWords);
if ($iNumWords == 0) {
$this->aWordSets = null;
return;
}
// Caches the word set for the partial phrase up to word i.
$aSetCache = array_fill(0, $iNumWords, array());
// Initialise first element of cache. There can only be the word.
if ($oTokens->containsAny($aWords[0])) {
$aSetCache[0][] = array($aWords[0]);
}
// Now do the next elements using what we already have.
for ($i = 1; $i < $iNumWords; $i++) {
for ($j = $i; $j > 0; $j--) {
$sPartial = $j == $i ? $aWords[$j] : $aWords[$j].' '.$sPartial;
if (!empty($aSetCache[$j - 1]) && $oTokens->containsAny($sPartial)) {
$aPartial = array($sPartial);
foreach ($aSetCache[$j - 1] as $aSet) {
if (count($aSet) < Phrase::MAX_WORDSET_LEN) {
$aSetCache[$i][] = array_merge($aSet, $aPartial);
}
}
if (count($aSetCache[$i]) > 2 * Phrase::MAX_WORDSETS) {
usort(
$aSetCache[$i],
array('\Nominatim\Phrase', 'cmpByArraylen')
);
$aSetCache[$i] = array_slice(
$aSetCache[$i],
0,
Phrase::MAX_WORDSETS
);
}
}
}
// finally the current full phrase
$sPartial = $aWords[0].' '.$sPartial;
if ($oTokens->containsAny($sPartial)) {
$aSetCache[$i][] = array($sPartial);
}
}
$this->aWordSets = $aSetCache[$iNumWords - 1];
usort($this->aWordSets, array('\Nominatim\Phrase', 'cmpByArraylen'));
$this->aWordSets = array_slice($this->aWordSets, 0, Phrase::MAX_WORDSETS);
}
public function debugInfo()
{
return array(

View File

@@ -111,6 +111,7 @@ class ReverseGeocode
$sSQL .= ' FROM placex';
$sSQL .= ' WHERE osm_type = \'N\'';
$sSQL .= ' AND country_code = \''.$sCountryCode.'\'';
$sSQL .= ' AND rank_search < 26 '; // needed to select right index
$sSQL .= ' AND rank_search between 5 and ' .min(25, $iMaxRank);
$sSQL .= ' AND class = \'place\' AND type != \'postcode\'';
$sSQL .= ' AND name IS NOT NULL ';
@@ -206,6 +207,7 @@ class ReverseGeocode
// for place nodes at rank_address 16
$sSQL .= ' AND rank_search > '.$iRankSearch;
$sSQL .= ' AND rank_search <= '.$iMaxRank;
$sSQL .= ' AND rank_search < 26 '; // needed to select right index
$sSQL .= ' AND rank_address > 0';
$sSQL .= ' AND class = \'place\'';
$sSQL .= ' AND type != \'postcode\'';

View File

@@ -28,6 +28,8 @@ class SearchContext
public $sqlViewboxLarge = '';
/// Reference along a route (as SQL).
public $sqlViewboxCentre = '';
/// List of countries to restrict search to (as array).
public $aCountryList = null;
/// List of countries to restrict search to (as SQL).
public $sqlCountryList = '';
/// List of place IDs to exclude (as SQL).
@@ -187,6 +189,7 @@ class SearchContext
public function setCountryList($aCountries)
{
$this->sqlCountryList = '('.join(',', array_map('addQuotes', $aCountries)).')';
$this->aCountryList = $aCountries;
}
/**
@@ -279,6 +282,19 @@ class SearchContext
return '';
}
/**
* Check if the given country is covered by the search context.
*
* @param string $sCountryCode Country code of the country to check.
*
* @return True, if no country code restrictions are set or the
* country is included in the country list.
*/
public function isCountryApplicable($sCountryCode)
{
return $this->aCountryList === null || in_array($sCountryCode, $this->aCountryList);
}
public function debugInfo()
{
return array(

View File

@@ -19,6 +19,8 @@ class SearchDescription
private $aName = array();
/// True if the name is rare enough to force index use on name.
private $bRareName = false;
/// True if the name requires to be accompanied by address terms.
private $bNameNeedsAddress = false;
/// List of word ids making up the address of the object.
private $aAddress = array();
/// List of word ids that appear in the name but should be ignored.
@@ -113,6 +115,9 @@ class SearchDescription
return false;
}
}
if ($this->bNameNeedsAddress && empty($this->aAddress)) {
return false;
}
return true;
}
@@ -231,6 +236,7 @@ class SearchDescription
{
$this->aName[$iId] = $iId;
$this->bRareName = $bRareName;
$this->bNameNeedsAddress = false;
}
/**
@@ -240,11 +246,19 @@ class SearchDescription
* @param integer iID ID of term to add.
* @param bool bSearchable Term should be used to search for result
* (i.e. term is not a stop word).
* @param bool bNeedsAddress True if the term is too unspecific to be used
* in a stand-alone search without an address
* to narrow down the search.
* @param integer iPhraseNumber Index of phrase, where the partial term
* appears.
*/
public function addPartialNameToken($iId, $bSearchable, $iPhraseNumber)
public function addPartialNameToken($iId, $bSearchable, $bNeedsAddress, $iPhraseNumber)
{
if (empty($this->aName)) {
$this->bNameNeedsAddress = $bNeedsAddress;
} else {
$this->bNameNeedsAddress &= $bNeedsAddress;
}
if ($bSearchable) {
$this->aName[$iId] = $iId;
} else {
@@ -310,6 +324,7 @@ class SearchDescription
{
$this->aAddress = array_merge($this->aAddress, $this->aName);
$this->bRareName = false;
$this->bNameNeedsAddress = true;
$this->aName = array($iId => $iId);
$this->iNamePhrase = -1;
}
@@ -566,32 +581,37 @@ class SearchDescription
// Sort by existence of the requested house number but only if not
// too many results are expected for the street, i.e. if the result
// will be narrowed down by an address. Remeber that with ordering
// will be narrowed down by an address. Remember that with ordering
// every single result has to be checked.
if ($this->sHouseNumber && ($this->bRareName || !empty($this->aAddress) || $this->sPostcode)) {
$sHouseNumberRegex = '\\\\m'.$this->sHouseNumber.'\\\\M';
$aOrder[] = ' (';
$aOrder[0] .= 'EXISTS(';
$aOrder[0] .= ' SELECT place_id';
$aOrder[0] .= ' FROM placex';
$aOrder[0] .= ' WHERE parent_place_id = search_name.place_id';
$aOrder[0] .= " AND housenumber ~* E'".$sHouseNumberRegex."'";
$aOrder[0] .= ' LIMIT 1';
$aOrder[0] .= ') ';
// also housenumbers from interpolation lines table are needed
if (preg_match('/[0-9]+/', $this->sHouseNumber)) {
$iHouseNumber = intval($this->sHouseNumber);
$aOrder[0] .= 'OR EXISTS(';
$aOrder[0] .= ' SELECT place_id ';
$aOrder[0] .= ' FROM location_property_osmline ';
$aOrder[0] .= ' WHERE parent_place_id = search_name.place_id';
$aOrder[0] .= ' AND startnumber is not NULL';
$aOrder[0] .= ' AND '.$iHouseNumber.'>=startnumber ';
$aOrder[0] .= ' AND '.$iHouseNumber.'<=endnumber ';
$aOrder[0] .= ' LIMIT 1';
$aOrder[0] .= ')';
$sHouseNumberRegex = $oDB->getDBQuoted('\\\\m'.$this->sHouseNumber.'\\\\M');
// Housenumbers on streets and places.
$sChildHnr = 'SELECT * FROM placex WHERE parent_place_id = search_name.place_id';
$sChildHnr .= ' AND housenumber ~* E'.$sHouseNumberRegex;
// Interpolations on streets and places.
if (preg_match('/^[0-9]+$/', $this->sHouseNumber)) {
$sIpolHnr = 'SELECT * FROM location_property_osmline ';
$sIpolHnr .= 'WHERE parent_place_id = search_name.place_id ';
$sIpolHnr .= ' AND startnumber is not NULL';
$sIpolHnr .= ' AND '.$this->sHouseNumber.'>=startnumber ';
$sIpolHnr .= ' AND '.$this->sHouseNumber.'<=endnumber ';
} else {
$sIpolHnr = false;
}
$aOrder[0] .= ') DESC';
// Housenumbers on the object iteself for unlisted places.
$sSelfHnr = 'SELECT * FROM placex WHERE place_id = search_name.place_id';
$sSelfHnr .= ' AND housenumber ~* E'.$sHouseNumberRegex;
$sSql = '(CASE WHEN address_rank = 30 THEN EXISTS('.$sSelfHnr.') ';
$sSql .= ' ELSE EXISTS('.$sChildHnr.') ';
if ($sIpolHnr) {
$sSql .= 'OR EXISTS('.$sIpolHnr.') ';
}
$sSql .= 'END) DESC';
$aOrder[] = $sSql;
}
if (!empty($this->aName)) {
@@ -624,7 +644,7 @@ class SearchDescription
$aOrder[] = $this->oContext->distanceSQL('centroid');
} elseif ($this->sPostcode) {
if (empty($this->aAddress)) {
$aTerms[] = "EXISTS(SELECT place_id FROM location_postcode p WHERE p.postcode = '".$this->sPostcode."' AND ST_DWithin(search_name.centroid, p.geometry, 0.1))";
$aTerms[] = "EXISTS(SELECT place_id FROM location_postcode p WHERE p.postcode = '".$this->sPostcode."' AND ST_DWithin(search_name.centroid, p.geometry, 0.12))";
} else {
$aOrder[] = "(SELECT min(ST_Distance(search_name.centroid, p.geometry)) FROM location_postcode p WHERE p.postcode = '".$this->sPostcode."')";
}
@@ -719,9 +739,9 @@ class SearchDescription
return $aResults;
}
$sHouseNumberRegex = '\\\\m'.$this->sHouseNumber.'\\\\M';
$sHouseNumberRegex = $oDB->getDBQuoted('\\\\m'.$this->sHouseNumber.'\\\\M');
$sSQL = 'SELECT place_id FROM placex WHERE';
$sSQL .= " housenumber ~* E'".$sHouseNumberRegex."'";
$sSQL .= ' housenumber ~* E'.$sHouseNumberRegex;
$sSQL .= ' AND ('.join(' OR ', $aIDCondition).')';
$sSQL .= $this->oContext->excludeSQL(' AND place_id');

lib-php/SimpleWordList.php (new file, 131 lines)
View File

@@ -0,0 +1,131 @@
<?php
namespace Nominatim;
/**
* A word list creator based on simple splitting by space.
*
* Creates possible permutations of split phrases by finding all combination
* of splitting the phrase on space boundaries.
*/
class SimpleWordList
{
const MAX_WORDSET_LEN = 20;
const MAX_WORDSETS = 100;
// The phrase as a list of simple terms (without spaces).
private $aWords;
/**
* Create a new word list
*
* @param string sPhrase Phrase to create the word list from. The phrase is
* expected to be normalised, so that there are no
* subsequent spaces.
*/
public function __construct($sPhrase)
{
if (strlen($sPhrase) > 0) {
$this->aWords = explode(' ', $sPhrase);
} else {
$this->aWords = array();
}
}
/**
* Get all possible tokens that are present in this word list.
*
* @return array The list of string tokens in the word list.
*/
public function getTokens()
{
$aTokens = array();
$iNumWords = count($this->aWords);
for ($i = 0; $i < $iNumWords; $i++) {
$sPhrase = $this->aWords[$i];
$aTokens[$sPhrase] = $sPhrase;
for ($j = $i + 1; $j < $iNumWords; $j++) {
$sPhrase .= ' '.$this->aWords[$j];
$aTokens[$sPhrase] = $sPhrase;
}
}
return $aTokens;
}
/**
* Compute all possible permutations of phrase splits that result in
* words which are in the token list.
*/
public function getWordSets($oTokens)
{
$iNumWords = count($this->aWords);
if ($iNumWords == 0) {
return null;
}
// Caches the word set for the partial phrase up to word i.
$aSetCache = array_fill(0, $iNumWords, array());
// Initialise first element of cache. There can only be the word.
if ($oTokens->containsAny($this->aWords[0])) {
$aSetCache[0][] = array($this->aWords[0]);
}
// Now do the next elements using what we already have.
for ($i = 1; $i < $iNumWords; $i++) {
for ($j = $i; $j > 0; $j--) {
$sPartial = $j == $i ? $this->aWords[$j] : $this->aWords[$j].' '.$sPartial;
if (!empty($aSetCache[$j - 1]) && $oTokens->containsAny($sPartial)) {
$aPartial = array($sPartial);
foreach ($aSetCache[$j - 1] as $aSet) {
if (count($aSet) < SimpleWordList::MAX_WORDSET_LEN) {
$aSetCache[$i][] = array_merge($aSet, $aPartial);
}
}
if (count($aSetCache[$i]) > 2 * SimpleWordList::MAX_WORDSETS) {
usort(
$aSetCache[$i],
array('\Nominatim\SimpleWordList', 'cmpByArraylen')
);
$aSetCache[$i] = array_slice(
$aSetCache[$i],
0,
SimpleWordList::MAX_WORDSETS
);
}
}
}
// finally the current full phrase
$sPartial = $this->aWords[0].' '.$sPartial;
if ($oTokens->containsAny($sPartial)) {
$aSetCache[$i][] = array($sPartial);
}
}
$aWordSets = $aSetCache[$iNumWords - 1];
usort($aWordSets, array('\Nominatim\SimpleWordList', 'cmpByArraylen'));
return array_slice($aWordSets, 0, SimpleWordList::MAX_WORDSETS);
}
public static function cmpByArraylen($aA, $aB)
{
$iALen = count($aA);
$iBLen = count($aB);
if ($iALen == $iBLen) {
return 0;
}
return ($iALen < $iBLen) ? -1 : 1;
}
public function debugInfo()
{
return $this->aWords;
}
}
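A minimal usage sketch (not part of the change set; EveryTokenKnown is a hypothetical stand-in for the tokenizer's token list, of which getWordSets() only calls containsAny()):
// Hypothetical token container that pretends every sub-phrase is known.
class EveryTokenKnown
{
    public function containsAny($sTerm)
    {
        return true;
    }
}
$oList = new \Nominatim\SimpleWordList('main street london');
// getTokens() enumerates every contiguous sub-phrase:
// 'main', 'main street', 'main street london', 'street',
// 'street london', 'london'.
$aTokens = $oList->getTokens();
// getWordSets() returns the split permutations, smallest sets first:
// ('main street london'), ('main street', 'london'),
// ('main', 'street london'), ('main', 'street', 'london').
$aSets = $oList->getWordSets(new EveryTokenKnown());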


@@ -36,7 +36,9 @@ class Country
*/
public function isExtendable($oSearch, $oPosition)
{
return !$oSearch->hasCountry() && $oPosition->maybePhrase('country');
return !$oSearch->hasCountry()
&& $oPosition->maybePhrase('country')
&& $oSearch->getContext()->isCountryApplicable($this->sCountryCode);
}
/**


@@ -58,8 +58,8 @@ class HouseNumber
// up of numbers, add a penalty
$iSearchCost = 1;
if (preg_match('/\\d/', $this->sToken) === 0
|| preg_match_all('/[^0-9]/', $this->sToken, $aMatches) > 2) {
$iSearchCost++;
|| preg_match_all('/[^0-9 ]/', $this->sToken, $aMatches) > 3) {
$iSearchCost += strlen($this->sToken) - 1;
}
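// Worked example (not part of the diff): for the token '9 bis',
// preg_match('/\d/') finds a digit and preg_match_all('/[^0-9 ]/')
// counts three characters (b, i, s); 3 is not greater than 3, so the
// token escapes the length-based penalty.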
if (!$oSearch->hasOperator(\Nominatim\Operator::NONE)) {
$iSearchCost++;


@@ -90,6 +90,7 @@ class Partial
$oNewSearch->addPartialNameToken(
$this->iId,
$this->iSearchNameCount < CONST_Max_Word_Frequency,
$this->iSearchNameCount > CONST_Search_NameOnlySearchFrequencyThreshold,
$oPosition->getPhrase()
);


@@ -44,7 +44,10 @@ class SpecialTerm
*/
public function isExtendable($oSearch, $oPosition)
{
return !$oSearch->hasOperator() && $oPosition->isPhrase('');
return !$oSearch->hasOperator()
&& $oPosition->isPhrase('')
&& ($this->iOperator != \Nominatim\Operator::NONE
|| (!$oSearch->hasAddress() && !$oSearch->hasHousenumber() && !$oSearch->hasCountry()));
}
/**
@@ -66,8 +69,8 @@ class SpecialTerm
$iOp = \Nominatim\Operator::NAME;
} else {
$iOp = \Nominatim\Operator::NEAR;
$iSearchCost += 2;
}
$iSearchCost += 2;
} elseif (!$oPosition->isFirstToken() && !$oPosition->isLastToken()) {
$iSearchCost += 2;
}


@@ -1,34 +0,0 @@
<?php
@define('CONST_LibDir', dirname(dirname(__FILE__)));
require_once(CONST_LibDir.'/init-cmd.php');
ini_set('memory_limit', '800M');
ini_set('display_errors', 'stderr');
$aCMDOptions
= array(
'Import country language data from osm wiki',
array('help', 'h', 0, 1, 0, 0, false, 'Show Help'),
array('quiet', 'q', 0, 1, 0, 0, 'bool', 'Quiet output'),
array('verbose', 'v', 0, 1, 0, 0, 'bool', 'Verbose output'),
array('project-dir', '', 0, 1, 1, 1, 'realpath', 'Base directory of the Nominatim installation (default: .)'),
);
getCmdOpt($_SERVER['argv'], $aCMDOptions, $aCMDResult, true, true);
loadSettings($aCMDResult['project-dir'] ?? getcwd());
setupHTTPProxy();
if (true) {
$sURL = 'https://wiki.openstreetmap.org/wiki/Special:Export/Nominatim/Country_Codes';
$sWikiPageXML = file_get_contents($sURL);
if (preg_match_all('#\\| ([a-z]{2}) \\|\\| [^|]+\\|\\| ([a-z,]+)#', $sWikiPageXML, $aMatches, PREG_SET_ORDER)) {
foreach ($aMatches as $aMatch) {
$aLanguages = explode(',', $aMatch[2]);
foreach ($aLanguages as $i => $s) {
$aLanguages[$i] = '"'.pg_escape_string($s).'"';
}
echo "UPDATE country_name set country_default_language_codes = '{".join(',', $aLanguages)."}' where country_code = '".pg_escape_string($aMatch[1])."';\n";
}
}
}


@@ -86,8 +86,13 @@ if (!$aResult['reverse-only']) {
if ($bVerbose) {
echo "\n";
}
$oTokenizer = new \Nominatim\Tokenizer($oDB);
$aWords = $oTokenizer->mostFrequentWords(1000);
$sSQL = 'SELECT word FROM word WHERE word is not null ORDER BY search_name_count DESC LIMIT 1000';
foreach ($oDB->getCol($sSQL) as $sWord) {
foreach ($aWords as $sWord) {
if ($bVerbose) {
echo "$sWord = ";
}


@@ -1,21 +0,0 @@
<?php
$phpPhraseSettingsFile = $argv[1];
$jsonPhraseSettingsFile = dirname($phpPhraseSettingsFile).'/'.basename($phpPhraseSettingsFile, '.php').'.json';
if (file_exists($phpPhraseSettingsFile) && !file_exists($jsonPhraseSettingsFile)) {
include $phpPhraseSettingsFile;
$data = array();
if (isset($aTagsBlacklist)) {
$data['blackList'] = $aTagsBlacklist;
}
if (isset($aTagsWhitelist)) {
$data['whiteList'] = $aTagsWhitelist;
}
$jsonFile = fopen($jsonPhraseSettingsFile, 'w');
fwrite($jsonFile, json_encode($data));
fclose($jsonFile);
}


@@ -2,13 +2,14 @@
namespace Nominatim;
require_once(CONST_LibDir.'/SimpleWordList.php');
class Tokenizer
{
private $oDB;
private $oNormalizer;
private $oTransliterator;
private $aCountryRestriction;
public function __construct(&$oDB)
{
@@ -19,7 +20,7 @@ class Tokenizer
public function checkStatus()
{
$sSQL = 'SELECT word_id FROM word limit 1';
$sSQL = 'SELECT word_id FROM word WHERE word_id is not null limit 1';
$iWordID = $this->oDB->getOne($sSQL);
if ($iWordID === false) {
throw new \Exception('Query failed', 703);
@@ -30,12 +31,6 @@ class Tokenizer
}
public function setCountryRestriction($aCountries)
{
$this->aCountryRestriction = $aCountries;
}
public function normalizeString($sTerm)
{
if ($this->oNormalizer === null) {
@@ -45,6 +40,15 @@ class Tokenizer
return $this->oNormalizer->transliterate($sTerm);
}
public function mostFrequentWords($iNum)
{
$sSQL = "SELECT word FROM word WHERE type = 'W'";
$sSQL .= "ORDER BY info->'count' DESC LIMIT ".$iNum;
return $this->oDB->getCol($sSQL);
}
private function makeStandardWord($sTerm)
{
return trim($this->oTransliterator->transliterate(' '.$sTerm.' '));
@@ -88,13 +92,10 @@ class Tokenizer
$sNormQuery .= ','.$this->normalizeString($oPhrase->getPhrase());
$sPhrase = $this->makeStandardWord($oPhrase->getPhrase());
Debug::printVar('Phrase', $sPhrase);
if (strlen($sPhrase) > 0) {
$aWords = explode(' ', $sPhrase);
Tokenizer::addTokens($aTokens, $aWords);
$aWordLists[] = $aWords;
} else {
$aWordLists[] = array();
}
$oWordList = new SimpleWordList($sPhrase);
$aTokens = array_merge($aTokens, $oWordList->getTokens());
$aWordLists[] = $oWordList;
}
Debug::printVar('Tokens', $aTokens);
@@ -103,7 +104,7 @@ class Tokenizer
$oValidTokens = $this->computeValidTokens($aTokens, $sNormQuery);
foreach ($aPhrases as $iPhrase => $oPhrase) {
$oPhrase->computeWordSets($aWordLists[$iPhrase], $oValidTokens);
$oPhrase->setWordSets($aWordLists[$iPhrase]->getWordSets($oValidTokens));
}
return $oValidTokens;
@@ -162,10 +163,7 @@ class Tokenizer
switch ($aWord['type']) {
case 'C': // country name tokens
if ($aWord['word'] !== null
&& (!$this->aCountryRestriction
|| in_array($aWord['word'], $this->aCountryRestriction))
) {
if ($aWord['word'] !== null) {
$oValidTokens->addToken(
$sTok,
new Token\Country($iId, $aWord['word'])
@@ -220,27 +218,4 @@ class Tokenizer
}
}
}
/**
* Add the tokens from this phrase to the given list of tokens.
*
* @param string[] $aTokens List of tokens to append.
*
* @return void
*/
private static function addTokens(&$aTokens, $aWords)
{
$iNumWords = count($aWords);
for ($i = 0; $i < $iNumWords; $i++) {
$sPhrase = $aWords[$i];
$aTokens[$sPhrase] = $sPhrase;
for ($j = $i + 1; $j < $iNumWords; $j++) {
$sPhrase .= ' '.$aWords[$j];
$aTokens[$sPhrase] = $sPhrase;
}
}
}
}


@@ -2,12 +2,13 @@
namespace Nominatim;
require_once(CONST_LibDir.'/SimpleWordList.php');
class Tokenizer
{
private $oDB;
private $oNormalizer = null;
private $aCountryRestriction = null;
public function __construct(&$oDB)
{
@@ -37,12 +38,6 @@ class Tokenizer
}
public function setCountryRestriction($aCountries)
{
$this->aCountryRestriction = $aCountries;
}
public function normalizeString($sTerm)
{
if ($this->oNormalizer === null) {
@@ -53,6 +48,14 @@ class Tokenizer
}
public function mostFrequentWords($iNum)
{
$sSQL = 'SELECT word FROM word WHERE word is not null ';
$sSQL .= 'ORDER BY search_name_count DESC LIMIT '.$iNum;
return $this->oDB->getCol($sSQL);
}
public function tokensForSpecialTerm($sTerm)
{
$aResults = array();
@@ -92,6 +95,23 @@ class Tokenizer
$sNormQuery .= ','.$this->normalizeString($oPhrase->getPhrase());
$sSQL .= 'make_standard_name(:' .$iPhrase.') as p'.$iPhrase.',';
$aParams[':'.$iPhrase] = $oPhrase->getPhrase();
// Conflicts between US state abbreviations and various words
// for 'the' in different languages
switch (strtolower($oPhrase->getPhrase())) {
case 'il':
$aParams[':'.$iPhrase] = 'illinois';
break;
case 'al':
$aParams[':'.$iPhrase] = 'alabama';
break;
case 'la':
$aParams[':'.$iPhrase] = 'louisiana';
break;
default:
$aParams[':'.$iPhrase] = $oPhrase->getPhrase();
break;
}
}
$sSQL = substr($sSQL, 0, -1);
@@ -106,13 +126,14 @@ class Tokenizer
$aWordLists = array();
$aTokens = array();
foreach ($aNormPhrases as $sPhrase) {
if (strlen($sPhrase) > 0) {
$aWords = explode(' ', $sPhrase);
Tokenizer::addTokens($aTokens, $aWords);
$aWordLists[] = $aWords;
} else {
$aWordLists[] = array();
$oWordList = new SimpleWordList($sPhrase);
foreach ($oWordList->getTokens() as $sToken) {
$aTokens[' '.$sToken] = ' '.$sToken;
$aTokens[$sToken] = $sToken;
}
$aWordLists[] = $oWordList;
}
Debug::printVar('Tokens', $aTokens);
@@ -121,7 +142,7 @@ class Tokenizer
$oValidTokens = $this->computeValidTokens($aTokens, $sNormQuery);
foreach ($aPhrases as $iPhrase => $oPhrase) {
$oPhrase->computeWordSets($aWordLists[$iPhrase], $oValidTokens);
$oPhrase->setWordSets($aWordLists[$iPhrase]->getWordSets($oValidTokens));
}
return $oValidTokens;
@@ -206,12 +227,7 @@ class Tokenizer
);
}
} elseif ($aWord['country_code']) {
// Filter country tokens that do not match restricted countries.
if (!$this->aCountryRestriction
|| in_array($aWord['country_code'], $this->aCountryRestriction)
) {
$oToken = new Token\Country($iId, $aWord['country_code']);
}
$oToken = new Token\Country($iId, $aWord['country_code']);
} elseif ($aWord['word_token'][0] == ' ') {
$oToken = new Token\Word(
$iId,
@@ -238,29 +254,4 @@ class Tokenizer
}
}
}
/**
* Add the tokens from this phrase to the given list of tokens.
*
* @param string[] $aTokens List of tokens to append.
*
* @return void
*/
private static function addTokens(&$aTokens, $aWords)
{
$iNumWords = count($aWords);
for ($i = 0; $i < $iNumWords; $i++) {
$sPhrase = $aWords[$i];
$aTokens[' '.$sPhrase] = ' '.$sPhrase;
$aTokens[$sPhrase] = $sPhrase;
for ($j = $i + 1; $j < $iNumWords; $j++) {
$sPhrase .= ' '.$aWords[$j];
$aTokens[' '.$sPhrase] = ' '.$sPhrase;
$aTokens[$sPhrase] = $sPhrase;
}
}
}
}


@@ -223,11 +223,13 @@ BEGIN
OR placex.country_code = place.country_code)
ORDER BY rank_address desc,
(place_addressline.place_id = in_place_id) desc,
(fromarea and place.centroid is not null and not isaddress
and (place.address is null or avals(name) && avals(place.address))
and ST_Contains(geometry, place.centroid)) desc,
isaddress desc, fromarea desc,
distance asc, rank_search desc
(CASE WHEN coalesce((avals(name) && avals(place.address)), False) THEN 2
WHEN isaddress THEN 0
WHEN fromarea
and place.centroid is not null
and ST_Contains(geometry, place.centroid) THEN 1
ELSE -1 END) desc,
fromarea desc, distance asc, rank_search desc
LOOP
-- RAISE WARNING '%',location;
location_isaddress := location.rank_address != current_rank_address;
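A hedged restatement of the sort priority encoded by the new CASE (not part of the change set):
-- Sort value, highest first:
--   2 : the place's address tag values overlap the candidate's names
--   1 : the candidate is an area containing the place's centroid
--       (and was not already an address part)
--   0 : the candidate was marked isaddress on an earlier pass
--  -1 : everything else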


@@ -43,7 +43,7 @@ LANGUAGE plpgsql STABLE;
-- find the parent road of the cut road parts
CREATE OR REPLACE FUNCTION get_interpolation_parent(street INTEGER[], place INTEGER[],
CREATE OR REPLACE FUNCTION get_interpolation_parent(token_info JSONB,
partition SMALLINT,
centroid GEOMETRY, geom GEOMETRY)
RETURNS BIGINT
@@ -52,7 +52,7 @@ DECLARE
parent_place_id BIGINT;
location RECORD;
BEGIN
parent_place_id := find_parent_for_address(street, place, partition, centroid);
parent_place_id := find_parent_for_address(token_info, partition, centroid);
IF parent_place_id is null THEN
FOR location IN SELECT place_id FROM placex
@@ -155,9 +155,8 @@ BEGIN
NEW.interpolationtype = NEW.address->'interpolation';
place_centroid := ST_PointOnSurface(NEW.linegeo);
NEW.parent_place_id = get_interpolation_parent(token_addr_street_match_tokens(NEW.token_info),
token_addr_place_match_tokens(NEW.token_info),
NEW.partition, place_centroid, NEW.linegeo);
NEW.parent_place_id = get_interpolation_parent(NEW.token_info, NEW.partition,
place_centroid, NEW.linegeo);
interpol_postcode := token_normalized_postcode(NEW.address->'postcode');


@@ -66,7 +66,7 @@ LANGUAGE plpgsql STABLE;
CREATE OR REPLACE FUNCTION get_address_place(in_partition SMALLINT, feature GEOMETRY,
from_rank SMALLINT, to_rank SMALLINT,
extent FLOAT, tokens INT[])
extent FLOAT, token_info JSONB, key TEXT)
RETURNS nearfeaturecentr
AS $$
DECLARE
@@ -80,7 +80,7 @@ BEGIN
FROM location_area_large_{{ partition }}
WHERE geometry && ST_Expand(feature, extent)
AND rank_address between from_rank and to_rank
AND tokens && keywords
AND token_matches_address(token_info, key, keywords)
GROUP BY place_id, keywords, rank_address, rank_search, isguess, postcode, centroid
ORDER BY bool_or(ST_Intersects(geometry, feature)), distance LIMIT 1;
RETURN r;
@@ -148,18 +148,21 @@ LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION getNearestNamedRoadPlaceId(in_partition INTEGER,
point GEOMETRY,
isin_token INTEGER[])
token_info JSONB)
RETURNS BIGINT
AS $$
DECLARE
parent BIGINT;
BEGIN
IF not token_has_addr_street(token_info) THEN
RETURN NULL;
END IF;
{% for partition in db.partitions %}
IF in_partition = {{ partition }} THEN
SELECT place_id FROM search_name_{{ partition }}
INTO parent
WHERE name_vector && isin_token
WHERE token_matches_street(token_info, name_vector)
AND centroid && ST_Expand(point, 0.015)
AND address_rank between 26 and 27
ORDER BY ST_Distance(centroid, point) ASC limit 1;
@@ -174,19 +177,22 @@ LANGUAGE plpgsql STABLE;
CREATE OR REPLACE FUNCTION getNearestNamedPlacePlaceId(in_partition INTEGER,
point GEOMETRY,
isin_token INTEGER[])
token_info JSONB)
RETURNS BIGINT
AS $$
DECLARE
parent BIGINT;
BEGIN
IF not token_has_addr_place(token_info) THEN
RETURN NULL;
END IF;
{% for partition in db.partitions %}
IF in_partition = {{ partition }} THEN
SELECT place_id
INTO parent
FROM search_name_{{ partition }}
WHERE name_vector && isin_token
WHERE token_matches_place(token_info, name_vector)
AND centroid && ST_Expand(point, 0.04)
AND address_rank between 16 and 25
ORDER BY ST_Distance(centroid, point) ASC limit 1;


@@ -247,6 +247,7 @@ BEGIN
indexed_status = 2,
geometry = NEW.geometry
where place_id = existingplacex.place_id;
-- if a node (=> house) that is part of an interpolation line changes (e.g. the street attribute) => mark this line for reparenting
-- (already here, because interpolation lines are reindexed before nodes, so in the second call it would be too late)
IF NEW.osm_type='N'
@@ -270,6 +271,26 @@ BEGIN
and x.class = p.class;
END IF;
IF coalesce(existing.name::text, '') != coalesce(NEW.name::text, '')
THEN
IF existingplacex.rank_address between 26 and 27 THEN
-- When streets change their name, this may have an effect on POI objects
-- with addr:street tags.
UPDATE placex SET indexed_status = 2
WHERE indexed_status = 0 and address ? 'street'
and parent_place_id = existingplacex.place_id;
UPDATE placex SET indexed_status = 2
WHERE indexed_status = 0 and rank_search = 30 and address ? 'street'
and ST_DWithin(NEW.geometry, geometry, 0.002);
ELSEIF existingplacex.rank_address between 16 and 25 THEN
-- When places change their name, this may have an effect on POI objects
-- with addr:place tags.
UPDATE placex SET indexed_status = 2
WHERE indexed_status = 0 and address ? 'place' and rank_search = 30
and parent_place_id = existingplacex.place_id;
-- No update of surrounding objects, potentially too expensive.
END IF;
END IF;
END IF;
-- Abort the add (we modified the existing place instead)


@@ -1,27 +1,33 @@
-- Trigger functions for the placex table.
-- Information returned by update preparation.
DROP TYPE IF EXISTS prepare_update_info CASCADE;
CREATE TYPE prepare_update_info AS (
name HSTORE,
address HSTORE,
rank_address SMALLINT,
country_code TEXT,
class TEXT,
type TEXT,
linked_place_id BIGINT
);
-- Retrieve the data needed by the indexer for updating the place.
--
-- Return parameters:
-- name list of names
-- address list of address tags, either from the object or a surrounding
-- building
-- country_feature If the place is a country feature, this contains the
-- country code, otherwise it is null.
CREATE OR REPLACE FUNCTION placex_prepare_update(p placex,
OUT name HSTORE,
OUT address HSTORE,
OUT country_feature VARCHAR)
CREATE OR REPLACE FUNCTION placex_indexing_prepare(p placex)
RETURNS prepare_update_info
AS $$
DECLARE
location RECORD;
result prepare_update_info;
BEGIN
-- For POI nodes, check if the address should be derived from a surrounding
-- building.
IF p.rank_search < 30 OR p.osm_type != 'N' OR p.address is not null THEN
address := p.address;
result.address := p.address;
ELSE
-- The additional && condition works around the misguided query
-- planner of postgis 3.0.
SELECT placex.address || hstore('_inherited', '') INTO address
SELECT placex.address || hstore('_inherited', '') INTO result.address
FROM placex
WHERE ST_Covers(geometry, p.centroid)
and geometry && p.centroid
@@ -31,15 +37,26 @@ BEGIN
LIMIT 1;
END IF;
address := address - '_unlisted_place'::TEXT;
name := p.name;
result.address := result.address - '_unlisted_place'::TEXT;
result.name := p.name;
result.class := p.class;
result.type := p.type;
result.country_code := p.country_code;
result.rank_address := p.rank_address;
country_feature := CASE WHEN p.admin_level = 2
and p.class = 'boundary' and p.type = 'administrative'
and p.osm_type = 'R'
THEN p.country_code
ELSE null
END;
-- Names of linked places need to be merged in, so search for a linkable
-- place already here.
SELECT * INTO location FROM find_linked_place(p);
IF location.place_id is not NULL THEN
result.linked_place_id := location.place_id;
IF NOT location.name IS NULL THEN
result.name := location.name || result.name;
END IF;
END IF;
RETURN result;
END;
$$
LANGUAGE plpgsql STABLE;
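A hedged sketch of invoking the reworked function (place id invented); the indexer now receives name, address, class/type, country code and any linked place in a single prepare_update_info row:
SELECT (placex_indexing_prepare(p)).*
  FROM placex p
 WHERE p.place_id = 12345;  -- hypothetical place id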
@@ -89,8 +106,7 @@ CREATE OR REPLACE FUNCTION find_parent_for_poi(poi_osm_type CHAR(1),
poi_osm_id BIGINT,
poi_partition SMALLINT,
bbox GEOMETRY,
addr_street INTEGER[],
addr_place INTEGER[],
token_info JSONB,
is_place_addr BOOLEAN)
RETURNS BIGINT
AS $$
@@ -104,8 +120,7 @@ BEGIN
parent_place_id := find_associated_street(poi_osm_type, poi_osm_id);
IF parent_place_id is null THEN
parent_place_id := find_parent_for_address(addr_street, addr_place,
poi_partition, bbox);
parent_place_id := find_parent_for_address(token_info, poi_partition, bbox);
END IF;
IF parent_place_id is null and poi_osm_type = 'N' THEN
@@ -318,13 +333,14 @@ BEGIN
WHERE s.place_id = parent_place_id;
FOR addr_item IN
SELECT (get_addr_tag_rank(key, country)).*, match_tokens, search_tokens
FROM token_get_address_tokens(token_info)
WHERE not search_tokens <@ parent_address_vector
SELECT (get_addr_tag_rank(key, country)).*, key,
token_get_address_search_tokens(token_info, key) as search_tokens
FROM token_get_address_keys(token_info) as key
WHERE not token_get_address_search_tokens(token_info, key) <@ parent_address_vector
LOOP
addr_place := get_address_place(in_partition, geometry,
addr_item.from_rank, addr_item.to_rank,
addr_item.extent, addr_item.match_tokens);
addr_item.extent, token_info, addr_item.key);
IF addr_place is null THEN
-- No place found in OSM that matches. Make it at least searchable.
@@ -432,14 +448,16 @@ BEGIN
FOR location IN
SELECT (get_address_place(partition, geometry, from_rank, to_rank,
extent, match_tokens)).*, search_tokens
FROM (SELECT (get_addr_tag_rank(key, country)).*, match_tokens, search_tokens
FROM token_get_address_tokens(token_info)) x
extent, token_info, key)).*, key
FROM (SELECT (get_addr_tag_rank(key, country)).*, key
FROM token_get_address_keys(token_info) as key) x
ORDER BY rank_address, distance, isguess desc
LOOP
IF location.place_id is null THEN
{% if not db.reverse_only %}
nameaddress_vector := array_merge(nameaddress_vector, location.search_tokens);
nameaddress_vector := array_merge(nameaddress_vector,
token_get_address_search_tokens(token_info,
location.key));
{% endif %}
ELSE
{% if not db.reverse_only %}
@@ -674,15 +692,14 @@ DECLARE
parent_address_level SMALLINT;
place_address_level SMALLINT;
addr_street INTEGER[];
addr_place INTEGER[];
max_rank SMALLINT;
name_vector INTEGER[];
nameaddress_vector INTEGER[];
addr_nameaddress_vector INTEGER[];
linked_place BIGINT;
linked_node_id BIGINT;
linked_importance FLOAT;
linked_wikipedia TEXT;
@@ -718,9 +735,14 @@ BEGIN
NEW.extratags := NEW.extratags - 'linked_place'::TEXT;
-- NEW.linked_place_id contains the precomputed linkee. Save this and restore
-- the previous link status.
linked_place := NEW.linked_place_id;
NEW.linked_place_id := OLD.linked_place_id;
IF NEW.linked_place_id is not null THEN
NEW.token_info := null;
{% if debug %}RAISE WARNING 'place already linked to %', NEW.linked_place_id;{% endif %}
{% if debug %}RAISE WARNING 'place already linked to %', OLD.linked_place_id;{% endif %}
RETURN NEW;
END IF;
@@ -838,8 +860,6 @@ BEGIN
END IF;
NEW.housenumber := token_normalized_housenumber(NEW.token_info);
addr_street := token_addr_street_match_tokens(NEW.token_info);
addr_place := token_addr_place_match_tokens(NEW.token_info);
NEW.postcode := null;
@@ -885,7 +905,7 @@ BEGIN
NEW.parent_place_id := find_parent_for_poi(NEW.osm_type, NEW.osm_id,
NEW.partition,
ST_Envelope(NEW.geometry),
addr_street, addr_place,
NEW.token_info,
is_place_address);
-- If we found the road take a shortcut here.
@@ -956,8 +976,9 @@ BEGIN
-- ---------------------------------------------------------------------------
-- Full indexing
{% if debug %}RAISE WARNING 'Using full index mode for % %', NEW.osm_type, NEW.osm_id;{% endif %}
SELECT * INTO location FROM find_linked_place(NEW);
IF location.place_id is not null THEN
IF linked_place is not null THEN
SELECT * INTO location FROM placex WHERE place_id = linked_place;
{% if debug %}RAISE WARNING 'Linked %', location;{% endif %}
-- Use the linked point as the centre point of the geometry,
@@ -974,11 +995,6 @@ BEGIN
NEW.rank_address := location.rank_address;
END IF;
-- merge in the label name
IF NOT location.name IS NULL THEN
NEW.name := location.name || NEW.name;
END IF;
-- merge in extra tags
NEW.extratags := hstore('linked_' || location.class, location.type)
|| coalesce(location.extratags, ''::hstore)


@@ -215,13 +215,12 @@ LANGUAGE plpgsql STABLE;
-- Find the parent of an address with addr:street/addr:place tag.
--
-- \param street Value of addr:street or NULL if tag is missing.
-- \param place Value of addr:place or NULL if tag is missing.
-- \param token_info Naming info with the address information.
-- \param partition Partition where to search the parent.
-- \param centroid Location of the address.
--
-- \return Place ID of the parent if one was found, NULL otherwise.
CREATE OR REPLACE FUNCTION find_parent_for_address(street INTEGER[], place INTEGER[],
CREATE OR REPLACE FUNCTION find_parent_for_address(token_info JSONB,
partition SMALLINT,
centroid GEOMETRY)
RETURNS BIGINT
@@ -229,30 +228,22 @@ CREATE OR REPLACE FUNCTION find_parent_for_address(street INTEGER[], place INTEG
DECLARE
parent_place_id BIGINT;
BEGIN
IF street is not null THEN
-- Check for addr:street attributes
-- Note that addr:street links can only be indexed, once the street itself is indexed
parent_place_id := getNearestNamedRoadPlaceId(partition, centroid, street);
IF parent_place_id is not null THEN
{% if debug %}RAISE WARNING 'Get parent form addr:street: %', parent_place_id;{% endif %}
RETURN parent_place_id;
END IF;
-- Check for addr:street attributes
parent_place_id := getNearestNamedRoadPlaceId(partition, centroid, token_info);
IF parent_place_id is not null THEN
{% if debug %}RAISE WARNING 'Get parent from addr:street: %', parent_place_id;{% endif %}
RETURN parent_place_id;
END IF;
-- Check for addr:place attributes.
IF place is not null THEN
parent_place_id := getNearestNamedPlacePlaceId(partition, centroid, place);
IF parent_place_id is not null THEN
{% if debug %}RAISE WARNING 'Get parent form addr:place: %', parent_place_id;{% endif %}
RETURN parent_place_id;
END IF;
END IF;
RETURN NULL;
parent_place_id := getNearestNamedPlacePlaceId(partition, centroid, token_info);
{% if debug %}RAISE WARNING 'Get parent from addr:place: %', parent_place_id;{% endif %}
RETURN parent_place_id;
END;
$$
LANGUAGE plpgsql STABLE;
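A hedged example call (token ids and coordinates invented): with the JSONB signature the caller no longer extracts match-token arrays itself; addr:street is tried first and addr:place serves as the fallback:
SELECT find_parent_for_address(
    '{"street": "{201,202}"}'::JSONB,  -- invented token info
    112::SMALLINT,                     -- partition
    ST_SetSRID(ST_MakePoint(2.2945, 48.8584), 4326));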
CREATE OR REPLACE FUNCTION delete_location(OLD_place_id BIGINT)
RETURNS BOOLEAN
AS $$


@@ -155,11 +155,11 @@ CREATE INDEX idx_placex_linked_place_id ON placex USING BTREE (linked_place_id)
CREATE INDEX idx_placex_rank_search ON placex USING BTREE (rank_search, geometry_sector) {{db.tablespace.address_index}};
CREATE INDEX idx_placex_geometry ON placex USING GIST (geometry) {{db.tablespace.search_index}};
CREATE INDEX idx_placex_geometry_buildings ON placex
USING GIST (geometry) {{db.tablespace.search_index}}
USING {{postgres.spgist_geom}} (geometry) {{db.tablespace.search_index}}
WHERE address is not null and rank_search = 30
and ST_GeometryType(geometry) in ('ST_Polygon','ST_MultiPolygon');
CREATE INDEX idx_placex_geometry_placenode ON placex
USING GIST (geometry) {{db.tablespace.search_index}}
USING {{postgres.spgist_geom}} (geometry) {{db.tablespace.search_index}}
WHERE osm_type = 'N' and rank_search < 26
and class = 'place' and type != 'postcode' and linked_place_id is null;
CREATE INDEX idx_placex_wikidata on placex USING BTREE ((extratags -> 'wikidata')) {{db.tablespace.address_index}} WHERE extratags ? 'wikidata' and class = 'place' and osm_type = 'N' and rank_search < 26;


@@ -14,7 +14,6 @@ DECLARE
out_partition INTEGER;
out_parent_place_id BIGINT;
location RECORD;
address_street_word_ids INTEGER[];
BEGIN
@@ -54,13 +53,9 @@ BEGIN
place_centroid := ST_Centroid(linegeo);
out_partition := get_partition('us');
out_parent_place_id := null;
address_street_word_ids := token_addr_street_match_tokens(token_info);
IF address_street_word_ids IS NOT NULL THEN
out_parent_place_id := getNearestNamedRoadPlaceId(out_partition, place_centroid,
address_street_word_ids);
END IF;
out_parent_place_id := getNearestNamedRoadPlaceId(out_partition, place_centroid,
token_info);
IF out_parent_place_id IS NULL THEN
SELECT getNearestParallelRoadFeature(out_partition, linegeo)


@@ -34,40 +34,59 @@ AS $$
$$ LANGUAGE SQL IMMUTABLE STRICT;
CREATE OR REPLACE FUNCTION token_addr_street_match_tokens(info JSONB)
RETURNS INTEGER[]
CREATE OR REPLACE FUNCTION token_has_addr_street(info JSONB)
RETURNS BOOLEAN
AS $$
SELECT (info->>'street')::INTEGER[]
SELECT info->>'street' is not null;
$$ LANGUAGE SQL IMMUTABLE;
CREATE OR REPLACE FUNCTION token_has_addr_place(info JSONB)
RETURNS BOOLEAN
AS $$
SELECT info->>'place' is not null;
$$ LANGUAGE SQL IMMUTABLE;
CREATE OR REPLACE FUNCTION token_matches_street(info JSONB, street_tokens INTEGER[])
RETURNS BOOLEAN
AS $$
SELECT (info->>'street')::INTEGER[] <@ street_tokens
$$ LANGUAGE SQL IMMUTABLE STRICT;
CREATE OR REPLACE FUNCTION token_addr_place_match_tokens(info JSONB)
RETURNS INTEGER[]
CREATE OR REPLACE FUNCTION token_matches_place(info JSONB, place_tokens INTEGER[])
RETURNS BOOLEAN
AS $$
SELECT (info->>'place_match')::INTEGER[]
SELECT (info->>'place')::INTEGER[] <@ place_tokens
$$ LANGUAGE SQL IMMUTABLE STRICT;
CREATE OR REPLACE FUNCTION token_addr_place_search_tokens(info JSONB)
RETURNS INTEGER[]
AS $$
SELECT (info->>'place_search')::INTEGER[]
SELECT (info->>'place')::INTEGER[]
$$ LANGUAGE SQL IMMUTABLE STRICT;
DROP TYPE IF EXISTS token_addresstoken CASCADE;
CREATE TYPE token_addresstoken AS (
key TEXT,
match_tokens INT[],
search_tokens INT[]
);
CREATE OR REPLACE FUNCTION token_get_address_tokens(info JSONB)
RETURNS SETOF token_addresstoken
CREATE OR REPLACE FUNCTION token_get_address_keys(info JSONB)
RETURNS SETOF TEXT
AS $$
SELECT key, (value->>1)::int[] as match_tokens,
(value->>0)::int[] as search_tokens
FROM jsonb_each(info->'addr');
SELECT * FROM jsonb_object_keys(info->'addr');
$$ LANGUAGE SQL IMMUTABLE STRICT;
CREATE OR REPLACE FUNCTION token_get_address_search_tokens(info JSONB, key TEXT)
RETURNS INTEGER[]
AS $$
SELECT (info->'addr'->>key)::INTEGER[];
$$ LANGUAGE SQL IMMUTABLE STRICT;
CREATE OR REPLACE FUNCTION token_matches_address(info JSONB, key TEXT, tokens INTEGER[])
RETURNS BOOLEAN
AS $$
SELECT (info->'addr'->>key)::INTEGER[] <@ tokens;
$$ LANGUAGE SQL IMMUTABLE STRICT;
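A hedged illustration of the new layout (token ids invented): the ICU tokenizer stores the token arrays as text inside token_info, so the helpers reduce to a containment test on integer arrays:
SELECT token_matches_street('{"street": "{101,102}"}'::JSONB,
                            ARRAY[100, 101, 102]);
-- true, because {101,102} <@ {100,101,102}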
@@ -127,15 +146,34 @@ BEGIN
VALUES (term_id, term, 'w', json_build_object('count', term_count));
END IF;
IF term_count < {{ max_word_freq }} THEN
partial_tokens := array_merge(partial_tokens, ARRAY[term_id]);
END IF;
partial_tokens := array_merge(partial_tokens, ARRAY[term_id]);
END LOOP;
END;
$$
LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION getorcreate_partial_word(partial TEXT)
RETURNS INTEGER
AS $$
DECLARE
token INTEGER;
BEGIN
SELECT min(word_id) INTO token
FROM word WHERE word_token = partial and type = 'w';
IF token IS NULL THEN
token := nextval('seq_word');
INSERT INTO word (word_id, word_token, type, info)
VALUES (token, partial, 'w', json_build_object('count', 0));
END IF;
RETURN token;
END;
$$
LANGUAGE plpgsql;
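A minimal sketch of calling the new helper (term invented): it returns the id of an existing partial-word entry or creates one with a zero count:
SELECT getorcreate_partial_word('nord');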
CREATE OR REPLACE FUNCTION getorcreate_hnr_id(lookup_term TEXT)
RETURNS INTEGER
AS $$


@@ -34,17 +34,31 @@ AS $$
$$ LANGUAGE SQL IMMUTABLE STRICT;
CREATE OR REPLACE FUNCTION token_addr_street_match_tokens(info JSONB)
RETURNS INTEGER[]
CREATE OR REPLACE FUNCTION token_has_addr_street(info JSONB)
RETURNS BOOLEAN
AS $$
SELECT (info->>'street')::INTEGER[]
SELECT info->>'street' is not null;
$$ LANGUAGE SQL IMMUTABLE;
CREATE OR REPLACE FUNCTION token_has_addr_place(info JSONB)
RETURNS BOOLEAN
AS $$
SELECT info->>'place_match' is not null;
$$ LANGUAGE SQL IMMUTABLE;
CREATE OR REPLACE FUNCTION token_matches_street(info JSONB, street_tokens INTEGER[])
RETURNS BOOLEAN
AS $$
SELECT (info->>'street')::INTEGER[] && street_tokens
$$ LANGUAGE SQL IMMUTABLE STRICT;
CREATE OR REPLACE FUNCTION token_addr_place_match_tokens(info JSONB)
RETURNS INTEGER[]
CREATE OR REPLACE FUNCTION token_matches_place(info JSONB, place_tokens INTEGER[])
RETURNS BOOLEAN
AS $$
SELECT (info->>'place_match')::INTEGER[]
SELECT (info->>'place_match')::INTEGER[] && place_tokens
$$ LANGUAGE SQL IMMUTABLE STRICT;
@@ -55,19 +69,24 @@ AS $$
$$ LANGUAGE SQL IMMUTABLE STRICT;
DROP TYPE IF EXISTS token_addresstoken CASCADE;
CREATE TYPE token_addresstoken AS (
key TEXT,
match_tokens INT[],
search_tokens INT[]
);
CREATE OR REPLACE FUNCTION token_get_address_tokens(info JSONB)
RETURNS SETOF token_addresstoken
CREATE OR REPLACE FUNCTION token_get_address_keys(info JSONB)
RETURNS SETOF TEXT
AS $$
SELECT key, (value->>1)::int[] as match_tokens,
(value->>0)::int[] as search_tokens
FROM jsonb_each(info->'addr');
SELECT * FROM jsonb_object_keys(info->'addr');
$$ LANGUAGE SQL IMMUTABLE STRICT;
CREATE OR REPLACE FUNCTION token_get_address_search_tokens(info JSONB, key TEXT)
RETURNS INTEGER[]
AS $$
SELECT (info->'addr'->key->>0)::INTEGER[];
$$ LANGUAGE SQL IMMUTABLE STRICT;
CREATE OR REPLACE FUNCTION token_matches_address(info JSONB, key TEXT, tokens INTEGER[])
RETURNS BOOLEAN
AS $$
SELECT (info->'addr'->key->>1)::INTEGER[] && tokens;
$$ LANGUAGE SQL IMMUTABLE STRICT;
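For comparison, a hedged sketch for the legacy layout (values invented): each addr entry keeps a [search, match] pair of token arrays, and matching tests for overlap (&&) instead of containment:
SELECT token_matches_address(
    '{"addr": {"city": ["{7,8}", "{9}"]}}'::JSONB,
    'city',
    ARRAY[9, 10]);
-- true, because the match tokens {9} overlap {9,10}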


@@ -1,2 +1,3 @@
-- Required for details lookup.
CREATE INDEX IF NOT EXISTS idx_word_word_id
ON word USING BTREE (word_id) {{db.tablespace.search_index}};


@@ -1,11 +0,0 @@
DROP TABLE IF EXISTS word_frequencies;
CREATE TABLE word_frequencies AS
SELECT unnest(name_vector) as id, count(*) FROM search_name GROUP BY id;
CREATE INDEX idx_word_frequencies ON word_frequencies(id);
UPDATE word SET search_name_count = count
FROM word_frequencies
WHERE word_token like ' %' and word_id = id;
DROP TABLE word_frequencies;


@@ -1,6 +1,6 @@
# Creates and installs manual page
configure_file(${PROJECT_SOURCE_DIR}/manual/create-manpage.tmpl create_manpage.py)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/create-manpage.tmpl create_manpage.py)
find_program(ARGPARSEMANPAGE argparse-manpage)
@@ -8,8 +8,8 @@ ADD_CUSTOM_TARGET(manpage
COMMAND ${ARGPARSEMANPAGE} --pyfile ${CMAKE_CURRENT_BINARY_DIR}/create_manpage.py
--function get_parser --project-name Nominatim
--url https://nominatim.org > ${CMAKE_CURRENT_SOURCE_DIR}/nominatim.1
COMMAND sed -i '/.SH AUTHORS/I,+2 d' ${CMAKE_CURRENT_SOURCE_DIR}/nominatim.1
--author 'the Nominatim developer community'
--author-email info@nominatim.org
)
install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/nominatim.1 DESTINATION share/man/man1 )


@@ -6,7 +6,9 @@ nominatim
[-h] {import,freeze,replication,special-phrases,add-data,index,refresh,admin,export,serve,search,reverse,lookup,details,status} ...
.SH DESCRIPTION
Command\-line tools for importing, updating, administering and
.br
querying the Nominatim database.
.br
.SH OPTIONS
@@ -45,7 +47,7 @@ nominatim
Start a simple web server for serving the API.
.TP
\fBnominatim\fR \fI\,search\/\fR
Execute API search query.
Execute a search query.
.TP
\fBnominatim\fR \fI\,reverse\/\fR
Execute API reverse query.
@@ -66,6 +68,15 @@ usage: nominatim import [-h] [-q] [-v] [--project-dir DIR] [-j NUM]
[--index-noanalyse]
Create a new Nominatim database from an OSM file.
.br
.br
This sub\-command sets up a new Nominatim database from scratch starting
.br
with creating a new database in Postgresql. The user running this command
.br
needs superuser rights on the database.
.br
@@ -88,7 +99,7 @@ Number of parallel threads to use
.TP
\fB\-\-osm\-file\fR FILE
OSM file to be imported.
OSM file to be imported (repeat for importing multiple files)
.TP
\fB\-\-continue\fR {load\-data,indexing,db\-postprocess}
@@ -116,19 +127,27 @@ Continue import even when errors in SQL are present
.TP
\fB\-\-index\-noanalyse\fR
Do not perform analyse operations during index
Do not perform analyse operations during index (expert only)
.SH OPTIONS 'nominatim freeze'
usage: nominatim freeze [-h] [-q] [-v] [--project-dir DIR] [-j NUM]
Make database read\-only.
.br
.br
About half of the data in the Nominatim database is kept only to be able to
.br
keep the data up\-to\-date with new changes made in OpenStreetMap. This
.br
command drops all this data and only keeps the part needed for geocoding
.br
itself.
.br
.br
This command has the same effect as the `\-\-no\-updates` option for imports.
.br
@@ -157,6 +176,33 @@ usage: nominatim replication [-h] [-q] [-v] [--project-dir DIR] [-j NUM]
[--socket-timeout SOCKET_TIMEOUT]
Update the database using an online replication service.
.br
.br
An OSM replication service is an online service that provides regular
.br
updates (OSM diff files) for the planet or regions of the planet. The OSMF
.br
provides the primary replication service for the full planet at
.br
https://planet.osm.org/replication/ but there are other providers of
.br
extracts of OSM data who provide such a service as well.
.br
.br
This sub\-command lets you set up such a replication service and download
.br
and import updates at regular intervals. You need to call '\-\-init' once to
.br
set up the process or whenever you change the replication configuration
.br
parameters. Without any arguments, the sub\-command will go into a loop and
.br
continuously apply updates as they become available. Giving `\-\-once` just
.br
downloads and imports the next batch of updates.
.br
@@ -195,7 +241,7 @@ Download and apply updates only once. When not set, updates are continuously app
.TP
\fB\-\-no\-index\fR
Do not index the new data. Only applicable together with \-\-once
Do not index the new data. Only usable together with \-\-once
.TP
\fB\-\-osm2pgsql\-cache\fR SIZE
@@ -203,13 +249,47 @@ Size of cache to be used by osm2pgsql (in MB)
.TP
\fB\-\-socket\-timeout\fR \fI\,SOCKET_TIMEOUT\/\fR
Set timeout for file downloads.
Set timeout for file downloads
.SH OPTIONS 'nominatim special-phrases'
usage: nominatim special-phrases [-h] [-q] [-v] [--project-dir DIR] [-j NUM]
[--import-from-wiki]
[--import-from-wiki] [--import-from-csv FILE]
[--no-replace]
Import special phrases.
.br
.br
Special phrases are search terms that narrow down the type of object
.br
that should be searched. For example, you might want to search for
.br
'Hotels in Barcelona'. The OSM wiki has a selection of special phrases
.br
in many languages, which can be imported with this command.
.br
.br
You can also provide your own phrases in a CSV file. The file needs to have
.br
the following five columns:
.br
* phrase \- the term expected for searching
.br
* class \- the OSM tag key of the object type
.br
* type \- the OSM tag value of the object type
.br
* operator \- the kind of search to be done (one of: in, near, name, \-)
.br
* plural \- whether the term is a plural or not (Y/N)
.br
.br
An example file can be found in the Nominatim sources at
.br
'test/testdb/full_en_phrases_test.csv'.
.br
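A minimal, invented illustration of the expected layout (header row plus two phrases):
.br
phrase,class,type,operator,plural
.br
Animal shelter,amenity,animal_shelter,\-,N
.br
Hotels in,tourism,hotel,in,Y
.br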
@@ -232,17 +312,48 @@ Number of parallel threads to use
.TP
\fB\-\-import\-from\-wiki\fR
Import special phrases from the OSM wiki to the database.
Import special phrases from the OSM wiki to the database
.TP
\fB\-\-import\-from\-csv\fR FILE
Import special phrases from a CSV file
.TP
\fB\-\-no\-replace\fR
Keep the old phrases and only add the new ones
.SH OPTIONS 'nominatim add-data'
usage: nominatim add-data [-h] [-q] [-v] [--project-dir DIR] [-j NUM]
(--file FILE | --diff FILE | --node ID | --way ID | --relation ID | --tiger-data DIR)
[--use-main-api]
[--use-main-api] [--osm2pgsql-cache SIZE]
[--socket-timeout SOCKET_TIMEOUT]
Add additional data from a file or an online source.
.br
Data is only imported, not indexed. You need to call `nominatim index`
to complete the process.
.br
This command lets you add or update the search data in the database.
.br
The data can come either from an OSM file or single OSM objects can
.br
directly be downloaded from the OSM API. This function only loads the
.br
data into the database. Afterwards it still needs to be integrated
.br
in the search index. Use the `nominatim index` command for that.
.br
.br
The command can also be used to add external non\-OSM data to the
.br
database. At the moment the only supported format is TIGER housenumber
.br
data. See the online documentation at
.br
https://nominatim.org/release\-docs/latest/admin/Import/#installing\-tiger\-housenumber\-data\-for\-the\-us
.br
for more information.
.br
@@ -265,11 +376,11 @@ Number of parallel threads to use
.TP
\fB\-\-file\fR FILE
Import data from an OSM file
Import data from an OSM file or diff file
.TP
\fB\-\-diff\fR FILE
Import data from an OSM diff file
Import data from an OSM diff file (deprecated: use \-\-file)
.TP
\fB\-\-node\fR ID
@@ -285,18 +396,37 @@ Import a single relation from the API
.TP
\fB\-\-tiger\-data\fR DIR
Add housenumbers from the US TIGER census database.
Add housenumbers from the US TIGER census database
.TP
\fB\-\-use\-main\-api\fR
Use OSM API instead of Overpass to download objects
.TP
\fB\-\-osm2pgsql\-cache\fR SIZE
Size of cache to be used by osm2pgsql (in MB)
.TP
\fB\-\-socket\-timeout\fR \fI\,SOCKET_TIMEOUT\/\fR
Set timeout for file downloads
.SH OPTIONS 'nominatim index'
usage: nominatim index [-h] [-q] [-v] [--project-dir DIR] [-j NUM]
[--boundaries-only] [--no-boundaries] [--minrank RANK]
[--maxrank RANK]
Reindex all new and modified data.
.br
.br
Indexing is the process of computing the address and search terms for
.br
the places in the database. Every time data is added or changed, indexing
.br
needs to be run. Imports and replication updates automatically take care
.br
of indexing. For other cases, this function lets you run indexing manually.
.br
@@ -341,8 +471,23 @@ usage: nominatim refresh [-h] [-q] [-v] [--project-dir DIR] [-j NUM]
[--enable-debug-statements]
Recompute auxiliary data used by the indexing process.
.br
These functions must not be run in parallel with other update commands.
.br
This sub\-command updates various static data and functions in the database.
.br
It usually needs to be run after changing various aspects of the
.br
configuration. The configuration documentation will mention the exact
.br
command to use in such cases.
.br
.br
Warning: the 'update' command must not be run in parallel with other update
.br
commands like 'replication' or 'add\-data'.
.br
@@ -381,7 +526,7 @@ Update the PL/pgSQL functions in the database
.TP
\fB\-\-wiki\-data\fR
Update Wikipedia/data importance numbers.
Update Wikipedia/data importance numbers
.TP
\fB\-\-importance\fR
@@ -406,6 +551,7 @@ usage: nominatim admin [-h] [-q] [-v] [--project-dir DIR] [-j NUM]
[--osm-id OSM_ID | --place-id PLACE_ID]
Analyse and maintain the database.
.br
@@ -428,19 +574,19 @@ Number of parallel threads to use
.TP
\fB\-\-warm\fR
Warm database caches for search and reverse queries.
Warm database caches for search and reverse queries
.TP
\fB\-\-check\-database\fR
Check that the database is complete and operational.
Check that the database is complete and operational
.TP
\fB\-\-migrate\fR
Migrate the database to a new software version.
Migrate the database to a new software version
.TP
\fB\-\-analyse\-indexing\fR
Print performance analysis of the indexing process.
Print performance analysis of the indexing process
.TP
\fB\-\-search\-only\fR
@@ -468,6 +614,7 @@ usage: nominatim export [-h] [-q] [-v] [--project-dir DIR] [-j NUM]
[--restrict-to-osm-relation ID]
Export addresses as CSV file from the database.
.br
@@ -525,12 +672,19 @@ usage: nominatim serve [-h] [-q] [-v] [--project-dir DIR] [-j NUM]
[--server SERVER]
Start a simple web server for serving the API.
.br
.br
This command starts the built\-in PHP webserver to serve the website
.br
from the current project directory. This webserver is only suitable
for testing and develop. Do not use it in production setups!
.br
for testing and development. Do not use it in production setups!
.br
.br
By default, the webserver can be accessed at: http://127.0.0.1:8088
.br
@@ -568,7 +722,18 @@ usage: nominatim search [-h] [-q] [-v] [--project-dir DIR] [-j NUM]
[--exclude_place_ids ID,..] [--limit LIMIT]
[--viewbox X1,Y1,X2,Y2] [--bounded] [--no-dedupe]
Execute API search query.
Execute a search query.
.br
.br
This command works exactly the same as if calling the /search endpoint on
.br
the web API. See the online documentation for more details on the
.br
various parameters:
.br
https://nominatim.org/release\-docs/latest/api/Search/
.br
@@ -623,15 +788,15 @@ Format of result
.TP
\fB\-\-addressdetails\fR
Include a breakdown of the address into elements.
Include a breakdown of the address into elements
.TP
\fB\-\-extratags\fR
Include additional information if available (e.g. wikipedia link, opening hours).
Include additional information if available (e.g. wikipedia link, opening hours)
.TP
\fB\-\-namedetails\fR
Include a list of alternative names.
Include a list of alternative names
.TP
\fB\-\-lang\fR LANGS, \fB\-\-accept\-language\fR LANGS
@@ -639,7 +804,7 @@ Preferred language order for presenting search results
.TP
\fB\-\-polygon\-output\fR {geojson,kml,svg,text}
Output geometry of results as a GeoJSON, KML, SVG or WKT.
Output geometry of results as a GeoJSON, KML, SVG or WKT
.TP
\fB\-\-polygon\-threshold\fR TOLERANCE
Simplify output geometry. Parameter is the difference tolerance in degrees.
.TP
\fB\-\-countrycodes\fR CC,..
Limit search results to one or more countries.
Limit search results to one or more countries
.TP
\fB\-\-exclude_place_ids\fR ID,..
@@ -679,6 +844,17 @@ usage: nominatim reverse [-h] [-q] [-v] [--project-dir DIR] [-j NUM] --lat LAT
[--polygon-threshold TOLERANCE]
Execute API reverse query.
.br
.br
This command works exactly the same as if calling the /reverse endpoint on
.br
the web API. See the online documentation for more details on the
.br
various parameters:
.br
https://nominatim.org/release\-docs/latest/api/Reverse/
.br
@@ -717,15 +893,15 @@ Format of result
.TP
\fB\-\-addressdetails\fR
Include a breakdown of the address into elements.
Include a breakdown of the address into elements
.TP
\fB\-\-extratags\fR
Include additional information if available (e.g. wikipedia link, opening hours).
Include additional information if available (e.g. wikipedia link, opening hours)
.TP
\fB\-\-namedetails\fR
Include a list of alternative names.
Include a list of alternative names
.TP
\fB\-\-lang\fR LANGS, \fB\-\-accept\-language\fR LANGS
@@ -733,7 +909,7 @@ Preferred language order for presenting search results
.TP
\fB\-\-polygon\-output\fR {geojson,kml,svg,text}
Output geometry of results as a GeoJSON, KML, SVG or WKT.
Output geometry of results as a GeoJSON, KML, SVG or WKT
.TP
\fB\-\-polygon\-threshold\fR TOLERANCE
@@ -748,6 +924,17 @@ usage: nominatim lookup [-h] [-q] [-v] [--project-dir DIR] [-j NUM] --id OSMID
[--polygon-threshold TOLERANCE]
Execute API lookup query.
.br
.br
This command works exactly the same as if calling the /lookup endpoint on
.br
the web API. See the online documentation for more details on the
.br
various parameters:
.br
https://nominatim.org/release\-docs/latest/api/Lookup/
.br
@@ -778,15 +965,15 @@ Format of result
.TP
\fB\-\-addressdetails\fR
Include a breakdown of the address into elements.
Include a breakdown of the address into elements
.TP
\fB\-\-extratags\fR
Include additional information if available (e.g. wikipedia link, opening hours).
Include additional information if available (e.g. wikipedia link, opening hours)
.TP
\fB\-\-namedetails\fR
Include a list of alternative names.
Include a list of alternative names
.TP
\fB\-\-lang\fR LANGS, \fB\-\-accept\-language\fR LANGS
@@ -794,7 +981,7 @@ Preferred language order for presenting search results
.TP
\fB\-\-polygon\-output\fR {geojson,kml,svg,text}
Output geometry of results as a GeoJSON, KML, SVG or WKT.
Output geometry of results as a GeoJSON, KML, SVG or WKT
.TP
\fB\-\-polygon\-threshold\fR TOLERANCE
@@ -809,6 +996,17 @@ usage: nominatim details [-h] [-q] [-v] [--project-dir DIR] [-j NUM]
[--lang LANGS]
Execute API details query.
.br
.br
This command works exactly the same as if calling the /details endpoint on
.br
the web API. See the online documentation for more details on the
.br
various parameters:
.br
https://nominatim.org/release\-docs/latest/api/Details/
.br
@@ -843,7 +1041,7 @@ Look up the OSM relation with the given ID.
.TP
\fB\-\-place_id\fR \fI\,PLACE_ID\/\fR, \fB\-p\fR \fI\,PLACE_ID\/\fR
Database internal identifier of the OSM object to look up.
Database internal identifier of the OSM object to look up
.TP
\fB\-\-class\fR \fI\,OBJECT_CLASS\/\fR
Class type to disambiguate multiple entries of the same object.
.TP
\fB\-\-addressdetails\fR
Include a breakdown of the address into elements.
Include a breakdown of the address into elements
.TP
\fB\-\-keywords\fR
Include a list of name keywords and address keywords.
Include a list of name keywords and address keywords
.TP
\fB\-\-linkedplaces\fR
Include details of places that are linked with this one.
Include details of places that are linked with this one
.TP
\fB\-\-hierarchy\fR
Include details of places lower in the address hierarchy.
Include details of places lower in the address hierarchy
.TP
\fB\-\-group_hierarchy\fR
Group the places by type.
Group the places by type
.TP
\fB\-\-polygon_geojson\fR
Include geometry of result.
Include geometry of result
.TP
\fB\-\-lang\fR LANGS, \fB\-\-accept\-language\fR LANGS
@@ -882,6 +1080,17 @@ usage: nominatim status [-h] [-q] [-v] [--project-dir DIR] [-j NUM]
[--format {text,json}]
Execute API status query.
.br
.br
This command works exactly the same as if calling the /status endpoint on
.br
the web API. See the online documentation for more details on the
.br
various parameters:
.br
https://nominatim.org/release\-docs/latest/api/Status/
.br
@@ -906,6 +1115,9 @@ Number of parallel threads to use
\fB\-\-format\fR {text,json}
Format of result
.SH AUTHORS
.B Nominatim
was written by the Nominatim developer community <info@nominatim.org>.
.SH DISTRIBUTION
The latest version of Nominatim may be downloaded from
.UR https://nominatim.org


@@ -176,7 +176,7 @@ class AdminServe:
This command starts the built-in PHP webserver to serve the website
from the current project directory. This webserver is only suitable
for testing and develop. Do not use it in production setups!
for testing and development. Do not use it in production setups!
By default, the webserver can be accessed at: http://127.0.0.1:8088
"""


@@ -3,6 +3,8 @@ Implementation of the 'add-data' subcommand.
"""
import logging
import psutil
# Do not repeat documentation of subcommand classes.
# pylint: disable=C0111
# Using non-top-level imports to avoid potentially unused imports.
@@ -14,8 +16,17 @@ class UpdateAddData:
"""\
Add additional data from a file or an online source.
Data is only imported, not indexed. You need to call `nominatim index`
to complete the process.
This command lets you add or update the search data in the database.
The data can come either from an OSM file or single OSM objects can
directly be downloaded from the OSM API. This function only loads the
data into the database. Afterwards it still needs to be integrated
in the search index. Use the `nominatim index` command for that.
The command can also be used to add external non-OSM data to the
database. At the moment the only supported format is TIGER housenumber
data. See the online documentation at
https://nominatim.org/release-docs/latest/admin/Import/#installing-tiger-housenumber-data-for-the-us
for more information.
"""
@staticmethod
@@ -33,14 +44,14 @@ class UpdateAddData:
group.add_argument('--relation', metavar='ID', type=int,
help='Import a single relation from the API')
group.add_argument('--tiger-data', metavar='DIR',
help='Add housenumbers from the US TIGER census database.')
help='Add housenumbers from the US TIGER census database')
group = parser.add_argument_group('Extra arguments')
group.add_argument('--use-main-api', action='store_true',
help='Use OSM API instead of Overpass to download objects')
group.add_argument('--osm2pgsql-cache', metavar='SIZE', type=int,
help='Size of cache to be used by osm2pgsql (in MB)')
group.add_argument('--socket-timeout', dest='socket_timeout', type=int, default=60,
help='Set timeout for file downloads.')
help='Set timeout for file downloads')
@staticmethod
def run(args):
@@ -50,7 +61,8 @@ class UpdateAddData:
if args.tiger_data:
tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)
return tiger_data.add_tiger_data(args.tiger_data,
args.config, args.threads or 1,
args.config,
args.threads or psutil.cpu_count() or 1,
tokenizer)
osm2pgsql_params = args.osm2pgsql_options(default_cache=1000, default_threads=1)


@@ -23,13 +23,13 @@ class AdminFuncs:
group = parser.add_argument_group('Admin tasks')
objs = group.add_mutually_exclusive_group(required=True)
objs.add_argument('--warm', action='store_true',
help='Warm database caches for search and reverse queries.')
help='Warm database caches for search and reverse queries')
objs.add_argument('--check-database', action='store_true',
help='Check that the database is complete and operational.')
help='Check that the database is complete and operational')
objs.add_argument('--migrate', action='store_true',
help='Migrate the database to a new software version.')
help='Migrate the database to a new software version')
objs.add_argument('--analyse-indexing', action='store_true',
help='Print performance analysis of the indexing process.')
help='Print performance analysis of the indexing process')
group = parser.add_argument_group('Arguments for cache warming')
group.add_argument('--search-only', action='store_const', dest='target',
const='search',


@@ -4,6 +4,7 @@ Subcommand definitions for API calls from the command line.
import logging
from nominatim.tools.exec_utils import run_api_script
from nominatim.errors import UsageError
# Do not repeat documentation of subcommand classes.
# pylint: disable=C0111
@@ -20,19 +21,19 @@ STRUCTURED_QUERY = (
)
EXTRADATA_PARAMS = (
('addressdetails', 'Include a breakdown of the address into elements.'),
('addressdetails', 'Include a breakdown of the address into elements'),
('extratags', ("Include additional information if available "
"(e.g. wikipedia link, opening hours).")),
('namedetails', 'Include a list of alternative names.')
"(e.g. wikipedia link, opening hours)")),
('namedetails', 'Include a list of alternative names')
)
DETAILS_SWITCHES = (
('addressdetails', 'Include a breakdown of the address into elements.'),
('keywords', 'Include a list of name keywords and address keywords.'),
('linkedplaces', 'Include details of places that are linked with this one.'),
('hierarchy', 'Include details of places lower in the address hierarchy.'),
('group_hierarchy', 'Group the places by type.'),
('polygon_geojson', 'Include geometry of result.')
('addressdetails', 'Include a breakdown of the address into elements'),
('keywords', 'Include a list of name keywords and address keywords'),
('linkedplaces', 'Include details of places that are linked with this one'),
('hierarchy', 'Include details of places lower in the address hierarchy'),
('group_hierarchy', 'Group the places by type'),
('polygon_geojson', 'Include geometry of result')
)
def _add_api_output_arguments(parser):
@@ -47,15 +48,32 @@ def _add_api_output_arguments(parser):
help='Preferred language order for presenting search results')
group.add_argument('--polygon-output',
choices=['geojson', 'kml', 'svg', 'text'],
help='Output geometry of results as a GeoJSON, KML, SVG or WKT.')
help='Output geometry of results as a GeoJSON, KML, SVG or WKT')
group.add_argument('--polygon-threshold', type=float, metavar='TOLERANCE',
help=("Simplify output geometry."
"Parameter is difference tolerance in degrees."))
def _run_api(endpoint, args, params):
script_file = args.project_dir / 'website' / (endpoint + '.php')
if not script_file.exists():
LOG.error("Cannot find API script file.\n\n"
"Make sure to run 'nominatim' from the project directory \n"
"or use the option --project-dir.")
raise UsageError("API script not found.")
return run_api_script(endpoint, args.project_dir,
phpcgi_bin=args.phpcgi_path, params=params)
class APISearch:
"""\
Execute API search query.
Execute a search query.
This command works exactly the same as if calling the /search endpoint on
the web API. See the online documentation for more details on the
various parameters:
https://nominatim.org/release-docs/latest/api/Search/
"""
@staticmethod
@@ -70,7 +88,7 @@ class APISearch:
group = parser.add_argument_group('Result limitation')
group.add_argument('--countrycodes', metavar='CC,..',
help='Limit search results to one or more countries.')
help='Limit search results to one or more countries')
group.add_argument('--exclude_place_ids', metavar='ID,..',
help='List of search objects to be excluded')
group.add_argument('--limit', type=int,
@@ -109,12 +127,16 @@ class APISearch:
if not args.dedupe:
params['dedupe'] = '0'
return run_api_script('search', args.project_dir,
phpcgi_bin=args.phpcgi_path, params=params)
return _run_api('search', args, params)
class APIReverse:
"""\
Execute API reverse query.
This command works exactly the same as if calling the /reverse endpoint on
the web API. See the online documentation for more details on the
various parameters:
https://nominatim.org/release-docs/latest/api/Reverse/
"""
@staticmethod
@@ -148,13 +170,17 @@ class APIReverse:
if args.polygon_threshold:
params['polygon_threshold'] = args.polygon_threshold
return run_api_script('reverse', args.project_dir,
phpcgi_bin=args.phpcgi_path, params=params)
return _run_api('reverse', args, params)
class APILookup:
"""\
Execute API lookup query.
This command works exactly the same as if calling the /lookup endpoint on
the web API. See the online documentation for more details on the
various parameters:
https://nominatim.org/release-docs/latest/api/Lookup/
"""
@staticmethod
@@ -183,13 +209,17 @@ class APILookup:
if args.polygon_threshold:
params['polygon_threshold'] = args.polygon_threshold
return run_api_script('lookup', args.project_dir,
phpcgi_bin=args.phpcgi_path, params=params)
return _run_api('lookup', args, params)
class APIDetails:
"""\
Execute API details query.
This command works exactly the same as if calling the /details endpoint on
the web API. See the online documentation for more details on the
various parameters:
https://nominatim.org/release-docs/latest/api/Details/
"""
@staticmethod
@@ -203,7 +233,7 @@ class APIDetails:
objs.add_argument('--relation', '-r', type=int,
help="Look up the OSM relation with the given ID.")
objs.add_argument('--place_id', '-p', type=int,
help='Database internal identifier of the OSM object to look up.')
help='Database internal identifier of the OSM object to look up')
group.add_argument('--class', dest='object_class',
help=("Class type to disambiguated multiple entries "
"of the same object."))
@@ -229,13 +259,17 @@ class APIDetails:
for name, _ in DETAILS_SWITCHES:
params[name] = '1' if getattr(args, name) else '0'
return run_api_script('details', args.project_dir,
phpcgi_bin=args.phpcgi_path, params=params)
return _run_api('details', args, params)
class APIStatus:
"""\
Execute API status query.
This command works exactly the same as if calling the /status endpoint on
the web API. See the online documentation for more details on the
various parameters:
https://nominatim.org/release-docs/latest/api/Status/
"""
@staticmethod
@@ -246,6 +280,4 @@ class APIStatus:
@staticmethod
def run(args):
return run_api_script('status', args.project_dir,
phpcgi_bin=args.phpcgi_path,
params=dict(format=args.format))
return _run_api('status', args, dict(format=args.format))
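All five API commands now delegate to a single helper. A minimal sketch of `_run_api`, reconstructed from the fragment at the top of this hunk (the `website` path is inferred from the website setup code further down; the error message is abbreviated):

```
def _run_api(endpoint, args, params):
    script_file = args.project_dir / 'website' / (endpoint + '.php')

    if not script_file.exists():
        LOG.error("Cannot find API script file.\n\n"
                  "Make sure to run 'nominatim' from the project directory \n"
                  "or use the option --project-dir.")
        raise UsageError("API script not found.")

    return run_api_script(endpoint, args.project_dir,
                          phpcgi_bin=args.phpcgi_path, params=params)
```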

View File

@@ -1,7 +1,12 @@
"""
Provides custom functions over command-line arguments.
"""
import logging
from pathlib import Path
from nominatim.errors import UsageError
LOG = logging.getLogger()
class NominatimArgs:
""" Customized namespace class for the nominatim command line tool
@@ -18,10 +23,27 @@ class NominatimArgs:
osm2pgsql_style=self.config.get_import_style_file(),
threads=self.threads or default_threads,
dsn=self.config.get_libpq_dsn(),
flatnode_file=self.config.FLATNODE_FILE,
flatnode_file=str(self.config.get_path('FLATNODE_FILE')),
tablespaces=dict(slim_data=self.config.TABLESPACE_OSM_DATA,
slim_index=self.config.TABLESPACE_OSM_INDEX,
main_data=self.config.TABLESPACE_PLACE_DATA,
main_index=self.config.TABLESPACE_PLACE_INDEX
)
)
def get_osm_file_list(self):
""" Return the --osm-file argument as a list of Paths or None
if no argument was given. The function also checks if the files
exist and raises a UsageError if one cannot be found.
"""
if not self.osm_file:
return None
files = [Path(f) for f in self.osm_file]
for fname in files:
if not fname.is_file():
LOG.fatal("OSM file '%s' does not exist.", fname)
raise UsageError('Cannot access file.')
return files
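A typical call site for this helper, mirroring how `SetupAll.run` uses it further down (sketch only):

```
files = args.get_osm_file_list()   # None when --osm-file was not given

if files is not None:
    # hand the validated list straight to osm2pgsql
    database_import.import_osm_data(files, args.osm2pgsql_options(0, 1))
```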

View File

@@ -31,6 +31,6 @@ class SetupFreeze:
with connect(args.config.get_libpq_dsn()) as conn:
freeze.drop_update_tables(conn)
freeze.drop_flatnode_file(args.config.FLATNODE_FILE)
freeze.drop_flatnode_file(str(args.config.get_path('FLATNODE_FILE')))
return 0

View File

@@ -15,6 +15,11 @@ from nominatim.db.connection import connect
class UpdateIndex:
"""\
Reindex all new and modified data.
Indexing is the process of computing the address and search terms for
the places in the database. Every time data is added or changed, indexing
needs to be run. Imports and replication updates automatically take care
of indexing. For other cases, this command allows you to run indexing manually.
"""
@staticmethod

View File

@@ -17,7 +17,13 @@ class UpdateRefresh:
"""\
Recompute auxiliary data used by the indexing process.
These functions must not be run in parallel with other update commands.
This sub-command updates various static data and functions in the database.
It usually needs to be run after changing various aspects of the
configuration. The configuration documentation will mention the exact
command to use in such cases.
Warning: the 'update' command must not be run in parallel with other update
commands like 'replication' or 'add-data'.
"""
def __init__(self):
self.tokenizer = None
@@ -34,7 +40,7 @@ class UpdateRefresh:
group.add_argument('--functions', action='store_true',
help='Update the PL/pgSQL functions in the database')
group.add_argument('--wiki-data', action='store_true',
help='Update Wikipedia/data importance numbers.')
help='Update Wikipedia/data importance numbers')
group.add_argument('--importance', action='store_true',
help='Recompute place importances (expensive!)')
group.add_argument('--website', action='store_true',
@@ -65,14 +71,13 @@ class UpdateRefresh:
"Postcode updates on a frozen database is not possible.")
if args.word_counts:
LOG.warning('Recompute frequency of full-word search terms')
refresh.recompute_word_counts(args.config.get_libpq_dsn(), args.sqllib_dir)
LOG.warning('Recompute word statistics')
self._get_tokenizer(args.config).update_statistics()
if args.address_levels:
cfg = Path(args.config.ADDRESS_LEVEL_CONFIG)
LOG.warning('Updating address levels from %s', cfg)
LOG.warning('Updating address levels')
with connect(args.config.get_libpq_dsn()) as conn:
refresh.load_address_levels_from_file(conn, cfg)
refresh.load_address_levels_from_config(conn, args.config)
if args.functions:
LOG.warning('Create functions')

View File

@@ -20,6 +20,19 @@ LOG = logging.getLogger()
class UpdateReplication:
"""\
Update the database using an online replication service.
An OSM replication service is an online service that provides regular
updates (OSM diff files) for the planet or for the extract they provide. The OSMF
provides the primary replication service for the full planet at
https://planet.osm.org/replication/ but there are other providers of
extracts of OSM data who provide such a service as well.
This sub-command allows you to set up such a replication service and to download
and import updates at regular intervals. You need to call '--init' once to
set up the process or whenever you change the replication configuration
parameters. Without any arguments, the sub-command will go into a loop and
continuously apply updates as they become available. Giving `--once` just
downloads and imports the next batch of updates.
"""
@staticmethod
@@ -29,22 +42,25 @@ class UpdateReplication:
help='Initialise the update process')
group.add_argument('--no-update-functions', dest='update_functions',
action='store_false',
help=("Do not update the trigger function to "
"support differential updates."))
help="Do not update the trigger function to "
"support differential updates (EXPERT)")
group = parser.add_argument_group('Arguments for updates')
group.add_argument('--check-for-updates', action='store_true',
help='Check if new updates are available and exit')
group.add_argument('--once', action='store_true',
help=("Download and apply updates only once. When "
"not set, updates are continuously applied"))
help="Download and apply updates only once. When "
"not set, updates are continuously applied")
group.add_argument('--catch-up', action='store_true',
help="Download and apply updates until no new "
"data is available on the server")
group.add_argument('--no-index', action='store_false', dest='do_index',
help=("Do not index the new data. Only applicable "
help=("Do not index the new data. Only usable "
"together with --once"))
group.add_argument('--osm2pgsql-cache', metavar='SIZE', type=int,
help='Size of cache to be used by osm2pgsql (in MB)')
group = parser.add_argument_group('Download parameters')
group.add_argument('--socket-timeout', dest='socket_timeout', type=int, default=60,
help='Set timeout for file downloads.')
help='Set timeout for file downloads')
@staticmethod
def _init_replication(args):
@@ -79,28 +95,40 @@ class UpdateReplication:
round_time(end - start_import),
round_time(end - batchdate))
@staticmethod
def _compute_update_interval(args):
if args.catch_up:
return 0
update_interval = args.config.get_int('REPLICATION_UPDATE_INTERVAL')
# Sanity check to not overwhelm the Geofabrik servers.
if 'download.geofabrik.de' in args.config.REPLICATION_URL\
and update_interval < 86400:
LOG.fatal("Update interval too low for download.geofabrik.de.\n"
"Please check install documentation "
"(https://nominatim.org/release-docs/latest/admin/Import-and-Update#"
"setting-up-the-update-process).")
raise UsageError("Invalid replication update interval setting.")
return update_interval
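To illustrate the rules above with made-up values: `--catch-up` always yields an interval of 0; a `NOMINATIM_REPLICATION_URL` pointing at download.geofabrik.de combined with `NOMINATIM_REPLICATION_UPDATE_INTERVAL=3600` aborts with a UsageError, while 86400 (one day) or more passes the check.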
@staticmethod
def _update(args):
from ..tools import replication
from ..indexer.indexer import Indexer
from ..tokenizer import factory as tokenizer_factory
update_interval = UpdateReplication._compute_update_interval(args)
params = args.osm2pgsql_options(default_cache=2000, default_threads=1)
params.update(base_url=args.config.REPLICATION_URL,
update_interval=args.config.get_int('REPLICATION_UPDATE_INTERVAL'),
update_interval=update_interval,
import_file=args.project_dir / 'osmosischange.osc',
max_diff_size=args.config.get_int('REPLICATION_MAX_DIFF'),
indexed_only=not args.once)
# Sanity check to not overwhelm the Geofabrik servers.
if 'download.geofabrik.de' in params['base_url']\
and params['update_interval'] < 86400:
LOG.fatal("Update interval too low for download.geofabrik.de.\n"
"Please check install documentation "
"(https://nominatim.org/release-docs/latest/admin/Import-and-Update#"
"setting-up-the-update-process).")
raise UsageError("Invalid replication update interval setting.")
if not args.once:
if not args.do_index:
LOG.fatal("Indexing cannot be disabled when running updates continuously.")
@@ -108,6 +136,7 @@ class UpdateReplication:
recheck_interval = args.config.get_int('REPLICATION_RECHECK_INTERVAL')
tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)
indexer = Indexer(args.config.get_libpq_dsn(), tokenizer, args.threads or 1)
while True:
with connect(args.config.get_libpq_dsn()) as conn:
@@ -120,10 +149,7 @@ class UpdateReplication:
if state is not replication.UpdateState.NO_CHANGES and args.do_index:
index_start = dt.datetime.now(dt.timezone.utc)
indexer = Indexer(args.config.get_libpq_dsn(), tokenizer,
args.threads or 1)
indexer.index_boundaries(0, 30)
indexer.index_by_rank(0, 30)
indexer.index_full(analyse=False)
with connect(args.config.get_libpq_dsn()) as conn:
status.set_indexed(conn, True)
@@ -132,10 +158,15 @@ class UpdateReplication:
else:
index_start = None
if state is replication.UpdateState.NO_CHANGES and \
args.catch_up or update_interval > 40*60:
while indexer.has_pending():
indexer.index_full(analyse=False)
if LOG.isEnabledFor(logging.WARNING):
UpdateReplication._report_update(batchdate, start, index_start)
if args.once:
if args.once or (args.catch_up and state is replication.UpdateState.NO_CHANGES):
break
if state is replication.UpdateState.NO_CHANGES:

View File

@@ -9,7 +9,6 @@ import psutil
from nominatim.db.connection import connect
from nominatim.db import status, properties
from nominatim.version import NOMINATIM_VERSION
from nominatim.errors import UsageError
# Do not repeat documentation of subcommand classes.
# pylint: disable=C0111
@@ -21,14 +20,19 @@ LOG = logging.getLogger()
class SetupAll:
"""\
Create a new Nominatim database from an OSM file.
This sub-command sets up a new Nominatim database from scratch starting
with creating a new database in PostgreSQL. The user running this command
needs superuser rights on the database.
"""
@staticmethod
def add_args(parser):
group_name = parser.add_argument_group('Required arguments')
group = group_name.add_mutually_exclusive_group(required=True)
group.add_argument('--osm-file', metavar='FILE',
help='OSM file to be imported.')
group.add_argument('--osm-file', metavar='FILE', action='append',
help='OSM file to be imported'
' (repeat for importing multiple files)')
group.add_argument('--continue', dest='continue_at',
choices=['load-data', 'indexing', 'db-postprocess'],
help='Continue an import that was interrupted')
@@ -47,46 +51,35 @@ class SetupAll:
group.add_argument('--ignore-errors', action='store_true',
help='Continue import even when errors in SQL are present')
group.add_argument('--index-noanalyse', action='store_true',
help='Do not perform analyse operations during index')
help='Do not perform analyse operations during index (expert only)')
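With `--osm-file` now repeatable, several extracts can be imported in one go, e.g. (file names invented):

```
nominatim import --osm-file monaco.osm.pbf --osm-file andorra.osm.pbf
```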
@staticmethod
def run(args): # pylint: disable=too-many-statements
from ..tools import database_import, refresh, postcodes, freeze
def run(args):
from ..tools import database_import, refresh, postcodes, freeze, country_info
from ..indexer.indexer import Indexer
from ..tokenizer import factory as tokenizer_factory
if args.osm_file and not Path(args.osm_file).is_file():
LOG.fatal("OSM file '%s' does not exist.", args.osm_file)
raise UsageError('Cannot access file.')
country_info.setup_country_config(args.config)
if args.continue_at is None:
files = args.get_osm_file_list()
LOG.warning('Creating database')
database_import.setup_database_skeleton(args.config.get_libpq_dsn(),
args.data_dir,
args.no_partitions,
rouser=args.config.DATABASE_WEBUSER)
LOG.warning('Setting up country tables')
country_info.setup_country_tables(args.config.get_libpq_dsn(),
args.data_dir,
args.no_partitions)
LOG.warning('Importing OSM data file')
database_import.import_osm_data(Path(args.osm_file),
database_import.import_osm_data(files,
args.osm2pgsql_options(0, 1),
drop=args.no_updates,
ignore_errors=args.ignore_errors)
with connect(args.config.get_libpq_dsn()) as conn:
LOG.warning('Create functions (1st pass)')
refresh.create_functions(conn, args.config, False, False)
LOG.warning('Create tables')
database_import.create_tables(conn, args.config,
reverse_only=args.reverse_only)
refresh.load_address_levels_from_file(conn, Path(args.config.ADDRESS_LEVEL_CONFIG))
LOG.warning('Create functions (2nd pass)')
refresh.create_functions(conn, args.config, False, False)
LOG.warning('Create table triggers')
database_import.create_table_triggers(conn, args.config)
LOG.warning('Create partition tables')
database_import.create_partition_tables(conn, args.config)
LOG.warning('Create functions (3rd pass)')
refresh.create_functions(conn, args.config, False, False)
SetupAll._setup_tables(args.config, args.reverse_only)
LOG.warning('Importing wikipedia importance data')
data_path = Path(args.config.WIKIPEDIA_DATA_PATH or args.project_dir)
@@ -105,12 +98,7 @@ class SetupAll:
args.threads or psutil.cpu_count() or 1)
LOG.warning("Setting up tokenizer")
if args.continue_at is None or args.continue_at == 'load-data':
# (re)initialise the tokenizer data
tokenizer = tokenizer_factory.create_tokenizer(args.config)
else:
# just load the tokenizer
tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)
tokenizer = SetupAll._get_tokenizer(args.continue_at, args.config)
if args.continue_at is None or args.continue_at == 'load-data':
LOG.warning('Calculate postcodes')
@@ -131,33 +119,60 @@ class SetupAll:
database_import.create_search_indices(conn, args.config,
drop=args.no_updates)
LOG.warning('Create search index for default country names.')
database_import.create_country_names(conn, tokenizer,
args.config.LANGUAGES)
conn.commit()
country_info.create_country_names(conn, tokenizer,
args.config.LANGUAGES)
if args.no_updates:
freeze.drop_update_tables(conn)
tokenizer.finalize_import(args.config)
LOG.warning('Recompute word counts')
tokenizer.update_statistics()
webdir = args.project_dir / 'website'
LOG.warning('Setup website at %s', webdir)
with connect(args.config.get_libpq_dsn()) as conn:
refresh.setup_website(webdir, args.config, conn)
with connect(args.config.get_libpq_dsn()) as conn:
try:
dbdate = status.compute_database_date(conn)
status.set_status(conn, dbdate)
LOG.info('Database is at %s.', dbdate)
except Exception as exc: # pylint: disable=broad-except
LOG.error('Cannot determine date of database: %s', exc)
properties.set_property(conn, 'database_version',
'{0[0]}.{0[1]}.{0[2]}-{0[3]}'.format(NOMINATIM_VERSION))
SetupAll._set_database_date(args.config.get_libpq_dsn())
return 0
@staticmethod
def _setup_tables(config, reverse_only):
""" Set up the basic database layout: tables, indexes and functions.
"""
from ..tools import database_import, refresh
with connect(config.get_libpq_dsn()) as conn:
LOG.warning('Create functions (1st pass)')
refresh.create_functions(conn, config, False, False)
LOG.warning('Create tables')
database_import.create_tables(conn, config, reverse_only=reverse_only)
refresh.load_address_levels_from_config(conn, config)
LOG.warning('Create functions (2nd pass)')
refresh.create_functions(conn, config, False, False)
LOG.warning('Create table triggers')
database_import.create_table_triggers(conn, config)
LOG.warning('Create partition tables')
database_import.create_partition_tables(conn, config)
LOG.warning('Create functions (3rd pass)')
refresh.create_functions(conn, config, False, False)
@staticmethod
def _get_tokenizer(continue_at, config):
""" Set up a new tokenizer or load an already initialised one.
"""
from ..tokenizer import factory as tokenizer_factory
if continue_at is None or continue_at == 'load-data':
# (re)initialise the tokenizer data
return tokenizer_factory.create_tokenizer(config)
# just load the tokenizer
return tokenizer_factory.get_tokenizer_for_db(config)
@staticmethod
def _create_pending_index(conn, tablespace):
""" Add a supporting index for finding places still to be indexed.
@@ -178,3 +193,19 @@ class SetupAll:
{} WHERE indexed_status > 0
""".format(tablespace))
conn.commit()
@staticmethod
def _set_database_date(dsn):
""" Determine the database date and set the status accordingly.
"""
with connect(dsn) as conn:
try:
dbdate = status.compute_database_date(conn)
status.set_status(conn, dbdate)
LOG.info('Database is at %s.', dbdate)
except Exception as exc: # pylint: disable=broad-except
LOG.error('Cannot determine date of database: %s', exc)
properties.set_property(conn, 'database_version',
'{0[0]}.{0[1]}.{0[2]}-{0[3]}'.format(NOMINATIM_VERSION))

View File

@@ -19,16 +19,42 @@ LOG = logging.getLogger()
class ImportSpecialPhrases:
"""\
Import special phrases.
Special phrases are search terms that narrow down the type of object
that should be searched. For example, you might want to search for
'Hotels in Barcelona'. The OSM wiki has a selection of special phrases
in many languages, which can be imported with this command.
You can also provide your own phrases in a CSV file. The file needs to have
the following five columns:
* phrase - the term expected for searching
* class - the OSM tag key of the object type
* type - the OSM tag value of the object type
* operator - the kind of search to be done (one of: in, near, name, -)
* plural - whether the term is a plural or not (Y/N)
An example file can be found in the Nominatim sources at
'test/testdb/full_en_phrases_test.csv'.
The import can be further configured to ignore specific key/value pairs.
This is particularly useful when importing phrases from the wiki. The
default configuration excludes some very common tags like building=yes.
The configuration can be customized by putting a file `phrase-settings.json`
with custom rules into the project directory or by using the `--config`
option to point to another configuration file.
"""
@staticmethod
def add_args(parser):
group = parser.add_argument_group('Input arguments')
group.add_argument('--import-from-wiki', action='store_true',
help='Import special phrases from the OSM wiki to the database.')
help='Import special phrases from the OSM wiki to the database')
group.add_argument('--import-from-csv', metavar='FILE',
help='Import special phrases from a CSV file.')
help='Import special phrases from a CSV file')
group.add_argument('--no-replace', action='store_true',
help='Keep the old phrases and only add the new ones.')
help='Keep the old phrases and only add the new ones')
group.add_argument('--config', action='store',
help='Configuration file for black/white listing '
'(default: phrase-settings.json)')
@staticmethod
def run(args):
@@ -56,5 +82,5 @@ class ImportSpecialPhrases:
should_replace = not args.no_replace
with connect(args.config.get_libpq_dsn()) as db_connection:
SPImporter(
args.config, args.phplib_dir, db_connection, loader
args.config, db_connection, loader
).import_phrases(tokenizer, should_replace)

View File

@@ -4,6 +4,8 @@ Nominatim configuration accessor.
import logging
import os
from pathlib import Path
import json
import yaml
from dotenv import dotenv_values
@@ -11,6 +13,27 @@ from nominatim.errors import UsageError
LOG = logging.getLogger()
def flatten_config_list(content, section=''):
""" Flatten YAML configuration lists that contain include sections
which are lists themselves.
"""
if not content:
return []
if not isinstance(content, list):
raise UsageError(f"List expected in section '{section}'.")
output = []
for ele in content:
if isinstance(ele, list):
output.extend(flatten_config_list(ele, section))
else:
output.append(ele)
return output
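A quick illustration of the flattening behaviour on hypothetical data:

```
>>> flatten_config_list([['a', 'b'], 'c', [['d']]])
['a', 'b', 'c', 'd']
```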
class Configuration:
""" Load and manage the project configuration.
@@ -33,12 +56,6 @@ class Configuration:
if project_dir is not None and (project_dir / '.env').is_file():
self._config.update(dotenv_values(str((project_dir / '.env').resolve())))
# Add defaults for variables that are left empty to set the default.
# They may still be overwritten by environment variables.
if not self._config['NOMINATIM_ADDRESS_LEVEL_CONFIG']:
self._config['NOMINATIM_ADDRESS_LEVEL_CONFIG'] = \
str(config_dir / 'address-levels.json')
class _LibDirs:
pass
@@ -53,7 +70,10 @@ class Configuration:
def __getattr__(self, name):
name = 'NOMINATIM_' + name
return self.environ.get(name) or self._config[name]
if name in self.environ:
return self.environ[name]
return self._config[name]
def get_bool(self, name):
""" Return the given configuration parameter as a boolean.
@@ -73,6 +93,23 @@ class Configuration:
raise UsageError("Configuration error.") from exp
def get_path(self, name):
""" Return the given configuration parameter as a Path.
If a relative path is configured, then the function converts this
into an absolute path with the project directory as root path.
If the configuration is unset, a falsy value is returned.
"""
value = self.__getattr__(name)
if value:
value = Path(value)
if not value.is_absolute():
value = self.project_dir / value
value = value.resolve()
return value
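For illustration (paths invented): with the project directory at `/srv/nominatim` and `NOMINATIM_FLATNODE_FILE=flatnodes/nodes.bin`, `config.get_path('FLATNODE_FILE')` resolves to `/srv/nominatim/flatnodes/nodes.bin`, while an unset option returns a falsy value instead of a Path.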
def get_libpq_dsn(self):
""" Get configured database DSN converted into the key/value format
understood by libpq and psycopg.
@@ -103,7 +140,7 @@ class Configuration:
if style in ('admin', 'street', 'address', 'full', 'extratags'):
return self.config_dir / 'import-{}.style'.format(style)
return Path(style)
return self.find_config_file('', 'IMPORT_STYLE')
def get_os_env(self):
@@ -114,3 +151,98 @@ class Configuration:
env.update(self.environ)
return env
def load_sub_configuration(self, filename, config=None):
""" Load additional configuration from a file. `filename` is the name
of the configuration file. The file is first searched in the
project directory and then in the global settings directory.
If `config` is set, then the name of the configuration file can
be additionally given through a .env configuration option. When
the option is set, then only the file named there is loaded:
if the name is an absolute path, the file name is taken as is,
if the name is relative, it is taken to be relative to the
project directory.
The format of the file is determined from the filename suffix.
Currently files in YAML format ('.yaml'/'.yml') and JSON format ('.json') are supported.
YAML files support a special '!include' construct. When the
directive is given, the value is taken to be a filename, the file
is loaded using this function and inserted at that position in the
configuration tree.
"""
configfile = self.find_config_file(filename, config)
if configfile.suffix in ('.yaml', '.yml'):
return self._load_from_yaml(configfile)
if configfile.suffix == '.json':
with configfile.open('r') as cfg:
return json.load(cfg)
raise UsageError(f"Config file '{configfile}' has unknown format.")
def find_config_file(self, filename, config=None):
""" Resolve the location of a configuration file given a filename and
an optional configuration option with the file name.
Raises a UsageError when the file cannot be found or is not
a regular file.
"""
if config is not None:
cfg_filename = self.__getattr__(config)
if cfg_filename:
cfg_filename = Path(cfg_filename)
if cfg_filename.is_absolute():
cfg_filename = cfg_filename.resolve()
if not cfg_filename.is_file():
LOG.fatal("Cannot find config file '%s'.", cfg_filename)
raise UsageError("Config file not found.")
return cfg_filename
filename = cfg_filename
search_paths = [self.project_dir, self.config_dir]
for path in search_paths:
if path is not None and (path / filename).is_file():
return path / filename
LOG.fatal("Configuration file '%s' not found.\nDirectories searched: %s",
filename, search_paths)
raise UsageError("Config file not found.")
def _load_from_yaml(self, cfgfile):
""" Load a YAML configuration file. This installs a special handler that
allows including other YAML files using the '!include' operator.
"""
yaml.add_constructor('!include', self._yaml_include_representer,
Loader=yaml.SafeLoader)
return yaml.safe_load(cfgfile.read_text(encoding='utf-8'))
def _yaml_include_representer(self, loader, node):
""" Handler for the '!include' operator in YAML files.
When the filename is relative, then the file is first searched in the
project directory and then in the global settings directory.
"""
fname = loader.construct_scalar(node)
if Path(fname).is_absolute():
configfile = Path(fname)
else:
configfile = self.find_config_file(loader.construct_scalar(node))
if configfile.suffix != '.yaml':
LOG.fatal("Format error while reading '%s': only YAML format supported.",
configfile)
raise UsageError("Cannot handle config file format.")
return yaml.safe_load(configfile.read_text(encoding='utf-8'))
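As an illustration of the '!include' mechanism (file names invented):

```
# icu_tokenizer.yaml
normalization: !include icu-rules/normalization.yaml
transliteration:
    - ":: Latin ()"
```

Relative include names are resolved through `find_config_file`, so the included file may live in the project directory or the global settings directory.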

View File

@@ -36,7 +36,7 @@ def _setup_tablespace_sql(config):
tspace = getattr(config, 'TABLESPACE_{}_{}'.format(subset, kind))
if tspace:
tspace = 'TABLESPACE "{}"'.format(tspace)
out['{}_{}'.format(subset.lower, kind.lower())] = tspace
out['{}_{}'.format(subset.lower(), kind.lower())] = tspace
return out
@@ -46,8 +46,10 @@ def _setup_postgresql_features(conn):
depend on the database version.
"""
pg_version = conn.server_version_tuple()
postgis_version = conn.postgis_version_tuple()
return {
'has_index_non_key_column': pg_version >= (11, 0, 0)
'has_index_non_key_column': pg_version >= (11, 0, 0),
'spgist_geom' : 'SPGIST' if postgis_version >= (3, 0) else 'GIST'
}
class SQLPreprocessor:

View File

@@ -91,6 +91,17 @@ class Indexer:
self.num_threads = num_threads
def has_pending(self):
""" Check if any data still needs indexing.
This function must only be used after the import has finished.
Otherwise it will be very expensive.
"""
with connect(self.dsn) as conn:
with conn.cursor() as cur:
cur.execute("SELECT 'a' FROM placex WHERE indexed_status > 0 LIMIT 1")
return cur.rowcount > 0
def index_full(self, analyse=True):
""" Index the complete database. This will first index boundaries
followed by all other objects. When `analyse` is True, then the

View File

@@ -0,0 +1,68 @@
"""
Wrapper around place information the indexer gets from the database and hands to
the tokenizer.
"""
import psycopg2.extras
class PlaceInfo:
""" Data class containing all information the tokenizer gets about a
place it should process the names for.
"""
def __init__(self, info):
self._info = info
def analyze(self, analyzer):
""" Process this place with the given tokenizer and return the
result in psycopg2-compatible Json.
"""
return psycopg2.extras.Json(analyzer.process_place(self))
@property
def name(self):
""" A dictionary with the names of the place or None if the place
has no names.
"""
return self._info.get('name')
@property
def address(self):
""" A dictionary with the address elements of the place
or None if no address information is available.
"""
return self._info.get('address')
@property
def country_code(self):
""" The country code of the country the place is in. Guaranteed
to be a two-letter lower-case string, or None if no country
could be found.
"""
return self._info.get('country_code')
@property
def rank_address(self):
""" The computed rank address before rank correction.
"""
return self._info.get('rank_address')
def is_a(self, key, value):
""" Check if the place's primary tag corresponds to the given
key and value.
"""
return self._info.get('class') == key and self._info.get('type') == value
def is_country(self):
""" Check if the place is a valid country boundary.
"""
return self.rank_address == 4 \
and self.is_a('boundary', 'administrative') \
and self.country_code is not None
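A minimal sketch of the wrapper in use (sample data invented):

```
place = PlaceInfo({'name': {'name': 'Bergstraße'},
                   'address': {'city': 'Heidelberg'},
                   'country_code': 'de',
                   'class': 'highway', 'type': 'residential',
                   'rank_address': 26})

assert place.is_a('highway', 'residential')
assert not place.is_country()   # rank 26 is not a country boundary
```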

View File

@@ -4,18 +4,21 @@ tasks.
"""
import functools
import psycopg2.extras
from psycopg2 import sql as pysql
from nominatim.indexer.place_info import PlaceInfo
# pylint: disable=C0111
def _mk_valuelist(template, num):
return pysql.SQL(',').join([pysql.SQL(template)] * num)
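For example, `_mk_valuelist(AbstractPlacexRunner.UPDATE_LINE, 2)` (see the constant defined just below) yields the fragment `(%s, %s::hstore, %s::hstore, %s::int, %s::jsonb),(%s, %s::hstore, %s::hstore, %s::int, %s::jsonb)`, ready to be interpolated into the `VALUES {}` clause of the update statement.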
class AbstractPlacexRunner:
""" Returns SQL commands for indexing of the placex table.
"""
SELECT_SQL = pysql.SQL('SELECT place_id FROM placex ')
UPDATE_LINE = "(%s, %s::hstore, %s::hstore, %s::int, %s::jsonb)"
def __init__(self, rank, analyzer):
self.rank = rank
@@ -27,15 +30,16 @@ class AbstractPlacexRunner:
def _index_sql(num_places):
return pysql.SQL(
""" UPDATE placex
SET indexed_status = 0, address = v.addr, token_info = v.ti
FROM (VALUES {}) as v(id, addr, ti)
SET indexed_status = 0, address = v.addr, token_info = v.ti,
name = v.name, linked_place_id = v.linked_place_id
FROM (VALUES {}) as v(id, name, addr, linked_place_id, ti)
WHERE place_id = v.id
""").format(_mk_valuelist("(%s, %s::hstore, %s::jsonb)", num_places))
""").format(_mk_valuelist(AbstractPlacexRunner.UPDATE_LINE, num_places))
@staticmethod
def get_place_details(worker, ids):
worker.perform("""SELECT place_id, (placex_prepare_update(placex)).*
worker.perform("""SELECT place_id, (placex_indexing_prepare(placex)).*
FROM placex WHERE place_id IN %s""",
(tuple((p[0] for p in ids)), ))
@@ -43,8 +47,9 @@ class AbstractPlacexRunner:
def index_places(self, worker, places):
values = []
for place in places:
values.extend((place[x] for x in ('place_id', 'address')))
values.append(psycopg2.extras.Json(self.analyzer.process_place(place)))
for field in ('place_id', 'name', 'address', 'linked_place_id'):
values.append(place[field])
values.append(PlaceInfo(place).analyze(self.analyzer))
worker.perform(self._index_sql(len(places)), values)
@@ -138,7 +143,7 @@ class InterpolationRunner:
values = []
for place in places:
values.extend((place[x] for x in ('place_id', 'address')))
values.append(psycopg2.extras.Json(self.analyzer.process_place(place)))
values.append(PlaceInfo(place).analyze(self.analyzer))
worker.perform(self._index_sql(len(places)), values)

232
nominatim/tokenizer/base.py Normal file
View File

@@ -0,0 +1,232 @@
"""
Abstract class definitions for tokenizers. These base classes are here
mainly for documentation purposes.
"""
from abc import ABC, abstractmethod
from typing import List, Tuple, Dict, Any
from nominatim.config import Configuration
from nominatim.indexer.place_info import PlaceInfo
# pylint: disable=unnecessary-pass
class AbstractAnalyzer(ABC):
""" The analyzer provides the functions for analysing names and building
the token database.
Analyzers are instantiated on a per-thread basis. Access to global data
structures must be synchronised accordingly.
"""
def __enter__(self) -> 'AbstractAnalyzer':
return self
def __exit__(self, exc_type, exc_value, traceback) -> None:
self.close()
@abstractmethod
def close(self) -> None:
""" Free all resources used by the analyzer.
"""
pass
@abstractmethod
def get_word_token_info(self, words: List[str]) -> List[Tuple[str, str, int]]:
""" Return token information for the given list of words.
The function is used for testing and debugging only
and does not need to be particularly efficient.
Arguments:
words: A list of words to look up the tokens for.
If a word starts with # it is assumed to be a full name
otherwise it is a partial term.
Returns:
The function returns the list of all tuples that could be
found for the given words. Each list entry is a tuple of
(original word, word token, word id).
"""
pass
@abstractmethod
def normalize_postcode(self, postcode: str) -> str:
""" Convert the postcode to its standardized form.
This function must yield exactly the same result as the SQL function
`token_normalized_postcode()`.
Arguments:
postcode: The postcode to be normalized.
Returns:
The given postcode after normalization.
"""
pass
@abstractmethod
def update_postcodes_from_db(self) -> None:
""" Update the tokenizer's postcode tokens from the current content
of the `location_postcode` table.
"""
pass
@abstractmethod
def update_special_phrases(self, phrases: List[Tuple[str, str, str, str]],
should_replace: bool) -> None:
""" Update the tokenizer's special phrase tokens from the given
list of special phrases.
Arguments:
phrases: The new list of special phrases. Each entry is
a tuple of (phrase, class, type, operator).
should_replace: If true, replace the current list of phrases.
When false, just add the given phrases to the
ones that already exist.
"""
pass
@abstractmethod
def add_country_names(self, country_code: str, names: Dict[str, str]):
""" Add the given names to the tokenizer's list of country tokens.
Arguments:
country_code: two-letter country code for the country the names
refer to.
names: Dictionary of name type to name.
"""
pass
@abstractmethod
def process_place(self, place: PlaceInfo) -> Any:
""" Extract tokens for the given place and compute the
information to be handed to the PL/pgSQL processor for building
the search index.
Arguments:
place: Place information retrieved from the database.
Returns:
A JSON-serialisable structure that will be handed into
the database via the `token_info` field.
"""
class AbstractTokenizer(ABC):
""" The tokenizer instance is the central instance of the tokenizer in
the system. There will only be a single instance of the tokenizer
active at any time.
"""
@abstractmethod
def init_new_db(self, config: Configuration, init_db: bool = True) -> None:
""" Set up a new tokenizer for the database.
The function should copy all necessary data into the project
directory or save it in the property table to make sure that
the tokenizer remains stable over updates.
Arguments:
config: Read-only object with configuration options.
init_db: When set to False, then initialisation of database
tables should be skipped. This option is only required for
migration purposes and can be safely ignored by custom
tokenizers.
TODO: can we move the init_db parameter somewhere else?
"""
pass
@abstractmethod
def init_from_project(self, config: Configuration) -> None:
""" Initialise the tokenizer from an existing database setup.
The function should load all previously saved configuration from
the project directory and/or the property table.
Arguments:
config: Read-only object with configuration options.
"""
pass
@abstractmethod
def finalize_import(self, config: Configuration) -> None:
""" This function is called at the very end of an import when all
data has been imported and indexed. The tokenizer may create
at this point any additional indexes and data structures needed
during query time.
Arguments:
config: Read-only object with configuration options.
"""
pass
@abstractmethod
def update_sql_functions(self, config: Configuration) -> None:
""" Update the SQL part of the tokenizer. This function is called
automatically on migrations or may be called explicitly by the
user through the `nominatim refresh --functions` command.
The tokenizer must only update the code of the tokenizer. The
data structures or data itself must not be changed by this function.
Arguments:
config: Read-only object with configuration options.
"""
pass
@abstractmethod
def check_database(self, config: Configuration) -> str:
""" Check that the database is set up correctly and ready for being
queried.
Arguments:
config: Read-only object with configuration options.
Returns:
If an issue was found, return an error message with the
description of the issue as well as hints for the user on
how to resolve the issue. If everything is okay, return `None`.
"""
pass
@abstractmethod
def update_statistics(self) -> None:
""" Recompute any tokenizer statistics necessary for efficient lookup.
This function is meant to be called from time to time by the user
to improve performance. However, the tokenizer must not depend on
it to be called in order to work.
"""
pass
@abstractmethod
def name_analyzer(self) -> AbstractAnalyzer:
""" Create a new analyzer for tokenizing names and queries
using this tokenizer. Analyzers are context managers and should
be used accordingly:
```
with tokenizer.name_analyzer() as analyzer:
analyzer.tokenize()
```
When used outside the with construct, the caller must make sure to
call the close() function before destroying the analyzer.
"""
pass

View File

@@ -85,6 +85,6 @@ def get_tokenizer_for_db(config):
tokenizer_module = _import_tokenizer(name)
tokenizer = tokenizer_module.create(config.get_libpq_dsn(), basedir)
tokenizer.init_from_project()
tokenizer.init_from_project(config)
return tokenizer

View File

@@ -1,146 +0,0 @@
"""
Processor for names that are imported into the database based on the
ICU library.
"""
from collections import defaultdict
import itertools
from icu import Transliterator
import datrie
from nominatim.db.properties import set_property, get_property
from nominatim.tokenizer import icu_variants as variants
DBCFG_IMPORT_NORM_RULES = "tokenizer_import_normalisation"
DBCFG_IMPORT_TRANS_RULES = "tokenizer_import_transliteration"
DBCFG_IMPORT_REPLACEMENTS = "tokenizer_import_replacements"
DBCFG_SEARCH_STD_RULES = "tokenizer_search_standardization"
class ICUNameProcessorRules:
""" Data object that saves the rules needed for the name processor.
The rules can either be initialised through an ICURuleLoader or
be loaded from a database when a connection is given.
"""
def __init__(self, loader=None, conn=None):
if loader is not None:
self.norm_rules = loader.get_normalization_rules()
self.trans_rules = loader.get_transliteration_rules()
self.replacements = loader.get_replacement_pairs()
self.search_rules = loader.get_search_rules()
elif conn is not None:
self.norm_rules = get_property(conn, DBCFG_IMPORT_NORM_RULES)
self.trans_rules = get_property(conn, DBCFG_IMPORT_TRANS_RULES)
self.replacements = \
variants.unpickle_variant_set(get_property(conn, DBCFG_IMPORT_REPLACEMENTS))
self.search_rules = get_property(conn, DBCFG_SEARCH_STD_RULES)
else:
assert False, "Parameter loader or conn required."
def save_rules(self, conn):
""" Save the rules in the property table of the given database.
The rules can be loaded again by handing a connection to
the constructor of the class.
"""
set_property(conn, DBCFG_IMPORT_NORM_RULES, self.norm_rules)
set_property(conn, DBCFG_IMPORT_TRANS_RULES, self.trans_rules)
set_property(conn, DBCFG_IMPORT_REPLACEMENTS,
variants.pickle_variant_set(self.replacements))
set_property(conn, DBCFG_SEARCH_STD_RULES, self.search_rules)
class ICUNameProcessor:
""" Collects the different transformation rules for normalisation of names
and provides the functions to apply the transformations.
"""
def __init__(self, rules):
self.normalizer = Transliterator.createFromRules("icu_normalization",
rules.norm_rules)
self.to_ascii = Transliterator.createFromRules("icu_to_ascii",
rules.trans_rules +
";[:Space:]+ > ' '")
self.search = Transliterator.createFromRules("icu_search",
rules.search_rules)
# Intermediate reorder by source. Also compute required character set.
immediate = defaultdict(list)
chars = set()
for variant in rules.replacements:
if variant.source[-1] == ' ' and variant.replacement[-1] == ' ':
replstr = variant.replacement[:-1]
else:
replstr = variant.replacement
immediate[variant.source].append(replstr)
chars.update(variant.source)
# Then copy to datrie
self.replacements = datrie.Trie(''.join(chars))
for src, repllist in immediate.items():
self.replacements[src] = repllist
def get_normalized(self, name):
""" Normalize the given name, i.e. remove all elements not relevant
for search.
"""
return self.normalizer.transliterate(name).strip()
def get_variants_ascii(self, norm_name):
""" Compute the spelling variants for the given normalized name
and transliterate the result.
"""
baseform = '^ ' + norm_name + ' ^'
partials = ['']
startpos = 0
pos = 0
force_space = False
while pos < len(baseform):
full, repl = self.replacements.longest_prefix_item(baseform[pos:],
(None, None))
if full is not None:
done = baseform[startpos:pos]
partials = [v + done + r
for v, r in itertools.product(partials, repl)
if not force_space or r.startswith(' ')]
if len(partials) > 128:
# If too many variants are produced, they are unlikely
# to be helpful. Only use the original term.
startpos = 0
break
startpos = pos + len(full)
if full[-1] == ' ':
startpos -= 1
force_space = True
pos = startpos
else:
pos += 1
force_space = False
# No variants detected? Fast return.
if startpos == 0:
trans_name = self.to_ascii.transliterate(norm_name).strip()
return [trans_name] if trans_name else []
return self._compute_result_set(partials, baseform[startpos:])
def _compute_result_set(self, partials, prefix):
results = set()
for variant in partials:
vname = variant + prefix
trans_name = self.to_ascii.transliterate(vname[1:-1]).strip()
if trans_name:
results.add(trans_name)
return list(results)
def get_search_normalized(self, name):
""" Return the normalized version of the name (including transliteration)
to be applied at search time.
"""
return self.search.transliterate(' ' + name + ' ').strip()

View File

@@ -1,61 +1,86 @@
"""
Helper class to create ICU rules from a configuration file.
"""
import importlib
import io
import json
import logging
import itertools
from pathlib import Path
import re
import yaml
from icu import Transliterator
from nominatim.config import flatten_config_list
from nominatim.db.properties import set_property, get_property
from nominatim.errors import UsageError
import nominatim.tokenizer.icu_variants as variants
from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
from nominatim.tokenizer.icu_token_analysis import ICUTokenAnalysis
import nominatim.tools.country_info
LOG = logging.getLogger()
def _flatten_yaml_list(content):
if not content:
return []
if not isinstance(content, list):
raise UsageError("List expected in ICU yaml configuration.")
output = []
for ele in content:
if isinstance(ele, list):
output.extend(_flatten_yaml_list(ele))
else:
output.append(ele)
return output
DBCFG_IMPORT_NORM_RULES = "tokenizer_import_normalisation"
DBCFG_IMPORT_TRANS_RULES = "tokenizer_import_transliteration"
DBCFG_IMPORT_ANALYSIS_RULES = "tokenizer_import_analysis_rules"
class VariantRule:
""" Saves a single variant expansion.
An expansion consists of the normalized replacement term and
a dictionary of properties that describe when the expansion applies.
def _get_section(rules, section):
""" Get the section named 'section' from the rules. If the section does
not exist, raise a usage error with a meaningful message.
"""
if section not in rules:
LOG.fatal("Section '%s' not found in tokenizer config.", section)
raise UsageError("Syntax error in tokenizer configuration file.")
def __init__(self, replacement, properties):
self.replacement = replacement
self.properties = properties or {}
return rules[section]
class ICURuleLoader:
""" Compiler for ICU rules from a tokenizer configuration file.
"""
def __init__(self, configfile):
self.configfile = configfile
self.variants = set()
def __init__(self, config):
rules = config.load_sub_configuration('icu_tokenizer.yaml',
config='TOKENIZER_CONFIG')
if configfile.suffix == '.yaml':
self._load_from_yaml()
else:
raise UsageError("Unknown format of tokenizer configuration.")
# Make sure country information is available to analyzers and sanitizers.
nominatim.tools.country_info.setup_country_config(config)
self.normalization_rules = self._cfg_to_icu_rules(rules, 'normalization')
self.transliteration_rules = self._cfg_to_icu_rules(rules, 'transliteration')
self.analysis_rules = _get_section(rules, 'token-analysis')
self._setup_analysis()
# Load optional sanitizer rule set.
self.sanitizer_rules = rules.get('sanitizers', [])
def load_config_from_db(self, conn):
""" Get previously saved parts of the configuration from the
database.
"""
self.normalization_rules = get_property(conn, DBCFG_IMPORT_NORM_RULES)
self.transliteration_rules = get_property(conn, DBCFG_IMPORT_TRANS_RULES)
self.analysis_rules = json.loads(get_property(conn, DBCFG_IMPORT_ANALYSIS_RULES))
self._setup_analysis()
def save_config_to_db(self, conn):
""" Save the part of the configuration that cannot be changed into
the database.
"""
set_property(conn, DBCFG_IMPORT_NORM_RULES, self.normalization_rules)
set_property(conn, DBCFG_IMPORT_TRANS_RULES, self.transliteration_rules)
set_property(conn, DBCFG_IMPORT_ANALYSIS_RULES, json.dumps(self.analysis_rules))
def make_sanitizer(self):
""" Create a place sanitizer from the configured rules.
"""
return PlaceSanitizer(self.sanitizer_rules)
def make_token_analysis(self):
""" Create a token analyser from the reviouly loaded rules.
"""
return ICUTokenAnalysis(self.normalization_rules,
self.transliteration_rules, self.analysis)
def get_search_rules(self):
@@ -70,177 +95,66 @@ class ICURuleLoader:
rules.write(self.transliteration_rules)
return rules.getvalue()
def get_normalization_rules(self):
""" Return rules for normalisation of a term.
"""
return self.normalization_rules
def get_transliteration_rules(self):
""" Return the rules for converting a string into its asciii representation.
"""
return self.transliteration_rules
def get_replacement_pairs(self):
""" Return the list of possible compound decompositions with
application of abbreviations included.
The result is a list of pairs: the first item is the sequence to
replace, the second is a list of replacements.
def _setup_analysis(self):
""" Process the rules used for creating the various token analyzers.
"""
return self.variants
self.analysis = {}
def _yaml_include_representer(self, loader, node):
value = loader.construct_scalar(node)
if not isinstance(self.analysis_rules, list):
raise UsageError("Configuration section 'token-analysis' must be a list.")
if Path(value).is_absolute():
content = Path(value).read_text()
else:
content = (self.configfile.parent / value).read_text()
return yaml.safe_load(content)
for section in self.analysis_rules:
name = section.get('id', None)
if name in self.analysis:
if name is None:
LOG.fatal("ICU tokenizer configuration has two default token analyzers.")
else:
LOG.fatal("ICU tokenizer configuration has two token "
"analyzers with id '%s'.", name)
raise UsageError("Syntax error in ICU tokenizer config.")
self.analysis[name] = TokenAnalyzerRule(section, self.normalization_rules)
def _load_from_yaml(self):
yaml.add_constructor('!include', self._yaml_include_representer,
Loader=yaml.SafeLoader)
rules = yaml.safe_load(self.configfile.read_text())
self.normalization_rules = self._cfg_to_icu_rules(rules, 'normalization')
self.transliteration_rules = self._cfg_to_icu_rules(rules, 'transliteration')
self._parse_variant_list(self._get_section(rules, 'variants'))
def _get_section(self, rules, section):
""" Get the section named 'section' from the rules. If the section does
not exist, raise a usage error with a meaningful message.
"""
if section not in rules:
LOG.fatal("Section '%s' not found in tokenizer config '%s'.",
section, str(self.configfile))
raise UsageError("Syntax error in tokenizer configuration file.")
return rules[section]
def _cfg_to_icu_rules(self, rules, section):
@staticmethod
def _cfg_to_icu_rules(rules, section):
""" Load an ICU ruleset from the given section. If the section is a
simple string, it is interpreted as a file name and the rules are
loaded verbatim from the given file. The filename is expected to be
relative to the tokenizer rule file. If the section is a list then
each line is assumed to be a rule. All rules are concatenated and returned.
"""
content = self._get_section(rules, section)
content = _get_section(rules, section)
if content is None:
return ''
return ';'.join(_flatten_yaml_list(content)) + ';'
return ';'.join(flatten_config_list(content, section)) + ';'
def _parse_variant_list(self, rules):
self.variants.clear()
if not rules:
return
rules = _flatten_yaml_list(rules)
vmaker = _VariantMaker(self.normalization_rules)
properties = []
for section in rules:
# Create the property field and deduplicate against existing
# instances.
props = variants.ICUVariantProperties.from_rules(section)
for existing in properties:
if existing == props:
props = existing
break
else:
properties.append(props)
for rule in (section.get('words') or []):
self.variants.update(vmaker.compute(rule, props))
class _VariantMaker:
""" Generater for all necessary ICUVariants from a single variant rule.
All text in rules is normalized to make sure the variants match later.
class TokenAnalyzerRule:
""" Factory for a single analysis module. The class saves the configuration
and creates a new token analyzer on request.
"""
def __init__(self, norm_rules):
self.norm = Transliterator.createFromRules("rule_loader_normalization",
norm_rules)
def __init__(self, rules, normalization_rules):
# Find the analysis module
module_name = 'nominatim.tokenizer.token_analysis.' \
+ _get_section(rules, 'analyzer').replace('-', '_')
analysis_mod = importlib.import_module(module_name)
self.create = analysis_mod.create
def compute(self, rule, props):
""" Generator for all ICUVariant tuples from a single variant rule.
"""
parts = re.split(r'(\|)?([=-])>', rule)
if len(parts) != 4:
raise UsageError("Syntax error in variant rule: " + rule)
decompose = parts[1] is None
src_terms = [self._parse_variant_word(t) for t in parts[0].split(',')]
repl_terms = (self.norm.transliterate(t.strip()) for t in parts[3].split(','))
# If the source should be kept, add a 1:1 replacement
if parts[2] == '-':
for src in src_terms:
if src:
for froms, tos in _create_variants(*src, src[0], decompose):
yield variants.ICUVariant(froms, tos, props)
for src, repl in itertools.product(src_terms, repl_terms):
if src and repl:
for froms, tos in _create_variants(*src, repl, decompose):
yield variants.ICUVariant(froms, tos, props)
def _parse_variant_word(self, name):
name = name.strip()
match = re.fullmatch(r'([~^]?)([^~$^]*)([~$]?)', name)
if match is None or (match.group(1) == '~' and match.group(3) == '~'):
raise UsageError("Invalid variant word descriptor '{}'".format(name))
norm_name = self.norm.transliterate(match.group(2))
if not norm_name:
return None
return norm_name, match.group(1), match.group(3)
_FLAG_MATCH = {'^': '^ ',
'$': ' ^',
'': ' '}
def _create_variants(src, preflag, postflag, repl, decompose):
if preflag == '~':
postfix = _FLAG_MATCH[postflag]
# suffix decomposition
src = src + postfix
repl = repl + postfix
yield src, repl
yield ' ' + src, ' ' + repl
if decompose:
yield src, ' ' + repl
yield ' ' + src, repl
elif postflag == '~':
# prefix decomposition
prefix = _FLAG_MATCH[preflag]
src = prefix + src
repl = prefix + repl
yield src, repl
yield src + ' ', repl + ' '
if decompose:
yield src, repl + ' '
yield src + ' ', repl
else:
prefix = _FLAG_MATCH[preflag]
postfix = _FLAG_MATCH[postflag]
yield prefix + src + postfix, prefix + repl + postfix
# Load the configuration.
self.config = analysis_mod.configure(rules, normalization_rules)
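For example, a `token-analysis` entry with `analyzer: generic` resolves to the module `nominatim.tokenizer.token_analysis.generic`, whose `create()` and `configure()` functions are picked up here; a hyphenated name would have its '-' replaced by '_' first.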

View File

@@ -0,0 +1,23 @@
"""
Container class collecting all components required to transform an OSM name
into a Nominatim token.
"""
from icu import Transliterator
class ICUTokenAnalysis:
""" Container class collecting the transliterators and token analysis
modules for a single NameAnalyser instance.
"""
def __init__(self, norm_rules, trans_rules, analysis_rules):
self.normalizer = Transliterator.createFromRules("icu_normalization",
norm_rules)
trans_rules += ";[:Space:]+ > ' '"
self.to_ascii = Transliterator.createFromRules("icu_to_ascii",
trans_rules)
self.search = Transliterator.createFromRules("icu_search",
norm_rules + trans_rules)
self.analysis = {name: arules.create(self.to_ascii, arules.config)
for name, arules in analysis_rules.items()}

View File

@@ -2,22 +2,19 @@
Tokenizer implementing normalisation as used before Nominatim 4 but using
libICU instead of the PostgreSQL module.
"""
from collections import Counter
import itertools
import json
import logging
import re
from textwrap import dedent
from pathlib import Path
from nominatim.db.connection import connect
from nominatim.db.properties import set_property, get_property
from nominatim.db.utils import CopyBuffer
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.indexer.place_info import PlaceInfo
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules
from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"
LOG = logging.getLogger()
@@ -28,7 +25,7 @@ def create(dsn, data_dir):
return LegacyICUTokenizer(dsn, data_dir)
class LegacyICUTokenizer:
class LegacyICUTokenizer(AbstractTokenizer):
""" This tokenizer uses libICU to covert names and queries to ASCII.
Otherwise it uses the same algorithms and data structures as the
normalization routines in Nominatim 3.
@@ -37,9 +34,7 @@ class LegacyICUTokenizer:
def __init__(self, dsn, data_dir):
self.dsn = dsn
self.data_dir = data_dir
self.naming_rules = None
self.term_normalization = None
self.max_word_frequency = None
self.loader = None
def init_new_db(self, config, init_db=True):
@@ -48,58 +43,67 @@ class LegacyICUTokenizer:
This copies all necessary data into the project directory to make
sure the tokenizer remains stable even over updates.
"""
if config.TOKENIZER_CONFIG:
cfgfile = Path(config.TOKENIZER_CONFIG)
else:
cfgfile = config.config_dir / 'legacy_icu_tokenizer.yaml'
loader = ICURuleLoader(cfgfile)
self.naming_rules = ICUNameProcessorRules(loader=loader)
self.term_normalization = config.TERM_NORMALIZATION
self.max_word_frequency = config.MAX_WORD_FREQUENCY
self.loader = ICURuleLoader(config)
self._install_php(config.lib_dir.php)
self._save_config(config)
self._save_config()
if init_db:
self.update_sql_functions(config)
self._init_db_tables(config)
def init_from_project(self):
def init_from_project(self, config):
""" Initialise the tokenizer from the project directory.
"""
self.loader = ICURuleLoader(config)
with connect(self.dsn) as conn:
self.naming_rules = ICUNameProcessorRules(conn=conn)
self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION)
self.max_word_frequency = get_property(conn, DBCFG_MAXWORDFREQ)
self.loader.load_config_from_db(conn)
def finalize_import(self, _):
def finalize_import(self, config):
""" Do any required postprocessing to make the tokenizer data ready
for use.
"""
with connect(self.dsn) as conn:
sqlp = SQLPreprocessor(conn, config)
sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')
def update_sql_functions(self, config):
""" Reimport the SQL functions for this tokenizer.
"""
with connect(self.dsn) as conn:
max_word_freq = get_property(conn, DBCFG_MAXWORDFREQ)
sqlp = SQLPreprocessor(conn, config)
sqlp.run_sql_file(conn, 'tokenizer/legacy_icu_tokenizer.sql',
max_word_freq=max_word_freq)
sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')
def check_database(self):
def check_database(self, config):
""" Check that the tokenizer is set up correctly.
"""
self.init_from_project()
# Will throw an error if there is an issue.
self.init_from_project(config)
if self.naming_rules is None:
return "Configuration for tokenizer 'legacy_icu' are missing."
return None
def update_statistics(self):
""" Recompute frequencies for all name words.
"""
with connect(self.dsn) as conn:
if conn.table_exists('search_name'):
with conn.cursor() as cur:
cur.drop_table("word_frequencies")
LOG.info("Computing word frequencies")
cur.execute("""CREATE TEMP TABLE word_frequencies AS
SELECT unnest(name_vector) as id, count(*)
FROM search_name GROUP BY id""")
cur.execute("CREATE INDEX ON word_frequencies(id)")
LOG.info("Update word table with recomputed frequencies")
cur.execute("""UPDATE word
SET info = info || jsonb_build_object('count', count)
FROM word_frequencies WHERE word_id = id""")
cur.drop_table("word_frequencies")
conn.commit()
def name_analyzer(self):
@@ -117,7 +121,8 @@ class LegacyICUTokenizer:
Analyzers are not thread-safe. You need to instantiate one per thread.
"""
return LegacyICUNameAnalyzer(self.dsn, ICUNameProcessor(self.naming_rules))
return LegacyICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
self.loader.make_token_analysis())
def _install_php(self, phpdir):
@@ -126,21 +131,18 @@ class LegacyICUTokenizer:
php_file = self.data_dir / "tokenizer.php"
php_file.write_text(dedent(f"""\
<?php
@define('CONST_Max_Word_Frequency', {self.max_word_frequency});
@define('CONST_Term_Normalization_Rules', "{self.term_normalization}");
@define('CONST_Transliteration', "{self.naming_rules.search_rules}");
require_once('{phpdir}/tokenizer/legacy_icu_tokenizer.php');"""))
@define('CONST_Max_Word_Frequency', 10000000);
@define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
@define('CONST_Transliteration', "{self.loader.get_search_rules()}");
require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))
def _save_config(self, config):
def _save_config(self):
""" Save the configuration that needs to remain stable for the given
database as database properties.
"""
with connect(self.dsn) as conn:
self.naming_rules.save_rules(conn)
set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
set_property(conn, DBCFG_TERM_NORMALIZATION, self.term_normalization)
self.loader.save_config_to_db(conn)
def _init_db_tables(self, config):
@@ -152,69 +154,23 @@ class LegacyICUTokenizer:
sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
conn.commit()
LOG.warning("Precomputing word tokens")
# get partial words and their frequencies
words = self._count_partial_terms(conn)
# copy them back into the word table
with CopyBuffer() as copystr:
for term, cnt in words.items():
copystr.add('w', term, json.dumps({'count': cnt}))
with conn.cursor() as cur:
copystr.copy_out(cur, 'word',
columns=['type', 'word_token', 'info'])
cur.execute("""UPDATE word SET word_id = nextval('seq_word')
WHERE word_id is null and type = 'w'""")
conn.commit()
def _count_partial_terms(self, conn):
""" Count the partial terms from the names in the place table.
"""
words = Counter()
name_proc = ICUNameProcessor(self.naming_rules)
with conn.cursor(name="words") as cur:
cur.execute(""" SELECT v, count(*) FROM
(SELECT svals(name) as v FROM place)x
WHERE length(v) < 75 GROUP BY v""")
for name, cnt in cur:
terms = set()
for word in name_proc.get_variants_ascii(name_proc.get_normalized(name)):
if ' ' in word:
terms.update(word.split())
for term in terms:
words[term] += cnt
return words
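In effect, each distinct word of a multi-word variant is credited once per occurrence of the name. A self-contained illustration, with made-up name frequencies standing in for the place-table query and the variant expansion:

from collections import Counter

# Made-up name frequencies standing in for the place-table query.
name_counts = {'rue de la gare': 3, 'gare du nord': 2}

words = Counter()
for name, cnt in name_counts.items():
    terms = set()
    if ' ' in name:
        terms.update(name.split())
    for term in terms:
        words[term] += cnt

print(words['gare'])  # 5: credited for both names, once per occurrence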
class LegacyICUNameAnalyzer:
class LegacyICUNameAnalyzer(AbstractAnalyzer):
""" The legacy analyzer uses the ICU library for splitting names.
Each instance opens a connection to the database to request the
normalization.
"""
def __init__(self, dsn, name_proc):
def __init__(self, dsn, sanitizer, token_analysis):
self.conn = connect(dsn).connection
self.conn.autocommit = True
self.name_processor = name_proc
self.sanitizer = sanitizer
self.token_analysis = token_analysis
self._cache = _TokenCache()
def __enter__(self):
return self
def __exit__(self, exc_type, exc_value, traceback):
self.close()
def close(self):
""" Free all resources used by the analyzer.
"""
@@ -223,6 +179,19 @@ class LegacyICUNameAnalyzer:
self.conn = None
def _search_normalized(self, name):
""" Return the search token transliteration of the given name.
"""
return self.token_analysis.search.transliterate(name).strip()
def _normalized(self, name):
""" Return the normalized version of the given name with all
non-relevant information removed.
"""
return self.token_analysis.normalizer.transliterate(name).strip()
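Both helpers wrap ICU transliterators supplied by the loader. A minimal PyICU sketch of the two-step idea, with deliberately simplified rules (Nominatim's real rule chains come from the ICU tokenizer configuration):

from icu import Transliterator

# Simplified stand-ins for the loader's normalization and search rules.
normalizer = Transliterator.createInstance('Any-Lower')
search = Transliterator.createInstance('Any-Latin; Latin-ASCII; Any-Lower')

name = 'Königstraße'
print(normalizer.transliterate(name).strip())  # königstraße
print(search.transliterate(name).strip())      # konigstrasse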
def get_word_token_info(self, words):
""" Return token information for the given list of words.
If a word starts with # it is assumed to be a full name
@@ -238,9 +207,9 @@ class LegacyICUNameAnalyzer:
partial_tokens = {}
for word in words:
if word.startswith('#'):
full_tokens[word] = self.name_processor.get_search_normalized(word[1:])
full_tokens[word] = self._search_normalized(word[1:])
else:
partial_tokens[word] = self.name_processor.get_search_normalized(word)
partial_tokens[word] = self._search_normalized(word)
with self.conn.cursor() as cur:
cur.execute("""SELECT word_token, word_id
@@ -271,7 +240,7 @@ class LegacyICUNameAnalyzer:
This function takes minor shortcuts on transliteration.
"""
return self.name_processor.get_search_normalized(hnr)
return self._search_normalized(hnr)
def update_postcodes_from_db(self):
""" Update postcode tokens in the word table from the location_postcode
@@ -294,7 +263,7 @@ class LegacyICUNameAnalyzer:
if postcode is None:
to_delete.append(word)
else:
copystr.add(self.name_processor.get_search_normalized(postcode),
copystr.add(self._search_normalized(postcode),
'P', postcode)
if to_delete:
@@ -312,7 +281,7 @@ class LegacyICUNameAnalyzer:
completely replaced. Otherwise the phrases are added to the
already existing ones.
"""
norm_phrases = set(((self.name_processor.get_normalized(p[0]), p[1], p[2], p[3])
norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
for p in phrases))
with self.conn.cursor() as cur:
@@ -342,7 +311,7 @@ class LegacyICUNameAnalyzer:
added = 0
with CopyBuffer() as copystr:
for word, cls, typ, oper in to_add:
term = self.name_processor.get_search_normalized(word)
term = self._search_normalized(word)
if term:
copystr.add(term, 'S', word,
json.dumps({'class': cls, 'type': typ,
@@ -376,9 +345,21 @@ class LegacyICUNameAnalyzer:
def add_country_names(self, country_code, names):
""" Add names for the given country to the search index.
"""
# Make sure any name preprocessing for country names applies.
info = PlaceInfo({'name': names, 'country_code': country_code,
'rank_address': 4, 'class': 'boundary',
'type': 'administrative'})
self._add_country_full_names(country_code,
self.sanitizer.process_names(info)[0])
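Because the wrapper builds a PlaceInfo, country names now receive the same sanitizer treatment as any other place. A hedged usage sketch (the tokenizer object and the name dict are illustrative):

# Country names pass through the configured sanitizer steps first.
with tokenizer.name_analyzer() as analyzer:  # tokenizer: a LegacyICUTokenizer
    analyzer.add_country_names('fr', {'name': 'France', 'name:en': 'France'})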
def _add_country_full_names(self, country_code, names):
""" Add names for the given country from an already sanitized
name list.
"""
word_tokens = set()
for name in self._compute_full_names(names):
norm_name = self.name_processor.get_search_normalized(name)
for name in names:
norm_name = self._search_normalized(name.name)
if norm_name:
word_tokens.add(norm_name)
@@ -404,23 +385,21 @@ class LegacyICUNameAnalyzer:
def process_place(self, place):
""" Determine tokenizer information about the given place.
Returns a JSON-serialisable structure that will be handed into
Returns a JSON-serializable structure that will be handed into
the database via the token_info field.
"""
token_info = _TokenInfo(self._cache)
names = place.get('name')
names, address = self.sanitizer.process_names(place)
if names:
fulls, partials = self._compute_name_tokens(names)
token_info.add_names(fulls, partials)
country_feature = place.get('country_feature')
if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
self.add_country_names(country_feature.lower(), names)
if place.is_country():
self._add_country_full_names(place.country_code, names)
address = place.get('address')
if address:
self._process_place_address(token_info, address)
@@ -430,18 +409,18 @@ class LegacyICUNameAnalyzer:
def _process_place_address(self, token_info, address):
hnrs = []
addr_terms = []
for key, value in address.items():
if key == 'postcode':
self._add_postcode(value)
elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
hnrs.append(value)
elif key == 'street':
token_info.add_street(*self._compute_name_tokens({'name': value}))
elif key == 'place':
token_info.add_place(*self._compute_name_tokens({'name': value}))
elif not key.startswith('_') and \
key not in ('country', 'full'):
addr_terms.append((key, *self._compute_name_tokens({'name': value})))
for item in address:
if item.kind == 'postcode':
self._add_postcode(item.name)
elif item.kind in ('housenumber', 'streetnumber', 'conscriptionnumber'):
hnrs.append(item.name)
elif item.kind == 'street':
token_info.add_street(self._compute_partial_tokens(item.name))
elif item.kind == 'place':
token_info.add_place(self._compute_partial_tokens(item.name))
elif not item.kind.startswith('_') and \
item.kind not in ('country', 'full'):
addr_terms.append((item.kind, self._compute_partial_tokens(item.name)))
if hnrs:
hnrs = self._split_housenumbers(hnrs)
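Concretely, the kind dispatch sorts one address into postcode, house-number, street and generic term buckets. A toy run with a stand-in tokenizer (tokens() replaces _compute_partial_tokens):

# Stand-in for _compute_partial_tokens: just lower-cased words.
def tokens(name):
    return name.lower().split()

address = [('postcode', '75010'), ('housenumber', '9 bis'),
           ('street', 'Rue de la Gare'), ('city', 'Paris')]
hnrs, addr_terms, street_tokens = [], [], None
for kind, name in address:
    if kind == 'postcode':
        postcode = name                       # goes to _add_postcode()
    elif kind in ('housenumber', 'streetnumber', 'conscriptionnumber'):
        hnrs.append(name)
    elif kind == 'street':
        street_tokens = tokens(name)
    elif not kind.startswith('_') and kind not in ('country', 'full'):
        addr_terms.append((kind, tokens(name)))

print(hnrs, street_tokens, addr_terms)
# ['9 bis'] ['rue', 'de', 'la', 'gare'] [('city', ['paris'])]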
@@ -451,28 +430,61 @@ class LegacyICUNameAnalyzer:
token_info.add_address_terms(addr_terms)
def _compute_partial_tokens(self, name):
""" Normalize the given term, split it into partial words and return
the token list for them.
"""
norm_name = self._search_normalized(name)
tokens = []
need_lookup = []
for partial in norm_name.split():
token = self._cache.partials.get(partial)
if token:
tokens.append(token)
else:
need_lookup.append(partial)
if need_lookup:
with self.conn.cursor() as cur:
cur.execute("""SELECT word, getorcreate_partial_word(word)
FROM unnest(%s) word""",
(need_lookup, ))
for partial, token in cur:
tokens.append(token)
self._cache.partials[partial] = token
return tokens
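All words missing from the cache go to the database in a single batched unnest() call. The same cache-then-batch pattern in isolation (db_lookup stands in for the getorcreate_partial_word() SQL function):

cache = {}

def db_lookup(words):
    # Stand-in for getorcreate_partial_word(); invents token ids.
    return {w: abs(hash(w)) % 100000 for w in words}

def compute_partial_tokens(norm_name):
    tokens, need_lookup = [], []
    for partial in norm_name.split():
        token = cache.get(partial)
        if token is not None:
            tokens.append(token)
        else:
            need_lookup.append(partial)
    # One batched lookup for everything the cache did not have.
    for partial, token in db_lookup(need_lookup).items():
        tokens.append(token)
        cache[partial] = token
    return tokens

compute_partial_tokens('rue de la gare')   # four lookups, then cached
compute_partial_tokens('rue de la poste')  # only 'poste' is looked up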
def _compute_name_tokens(self, names):
""" Computes the full name and partial name tokens for the given
dictionary of names.
"""
full_names = self._compute_full_names(names)
full_tokens = set()
partial_tokens = set()
for name in full_names:
norm_name = self.name_processor.get_normalized(name)
full, part = self._cache.names.get(norm_name, (None, None))
for name in names:
analyzer_id = name.get_attr('analyzer')
norm_name = self._normalized(name.name)
if analyzer_id is None:
token_id = norm_name
else:
token_id = f'{norm_name}@{analyzer_id}'
full, part = self._cache.names.get(token_id, (None, None))
if full is None:
variants = self.name_processor.get_variants_ascii(norm_name)
variants = self.token_analysis.analysis[analyzer_id].get_variants_ascii(norm_name)
if not variants:
continue
with self.conn.cursor() as cur:
cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
(norm_name, variants))
(token_id, variants))
full, part = cur.fetchone()
self._cache.names[norm_name] = (full, part)
self._cache.names[token_id] = (full, part)
full_tokens.add(full)
partial_tokens.update(part)
@@ -480,23 +492,6 @@ class LegacyICUNameAnalyzer:
return full_tokens, partial_tokens
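The qualified cache key keeps tokens from different analysis modules apart:

def token_key(norm_name, analyzer_id=None):
    # Plain normalized name for the default analyzer,
    # 'name@analyzer' when the sanitizer attached an 'analyzer' attribute.
    return norm_name if analyzer_id is None else f'{norm_name}@{analyzer_id}'

assert token_key('gare du nord') == 'gare du nord'
assert token_key('gare du nord', 'fr') == 'gare du nord@fr'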
@staticmethod
def _compute_full_names(names):
""" Return the set of all full name word ids to be used with the
given dictionary of names.
"""
full_names = set()
for name in (n.strip() for ns in names.values() for n in re.split('[;,]', ns)):
if name:
full_names.add(name)
brace_idx = name.find('(')
if brace_idx >= 0:
full_names.add(name[:brace_idx].strip())
return full_names
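Its job moves into the new sanitizer steps. For reference, a runnable worked example of what the removed splitting produced:

import re

def compute_full_names(names):
    # Condensed copy of the removed helper, for illustration only.
    full_names = set()
    for name in (n.strip() for ns in names.values() for n in re.split('[;,]', ns)):
        if name:
            full_names.add(name)
            brace_idx = name.find('(')
            if brace_idx >= 0:
                full_names.add(name[:brace_idx].strip())
    return full_names

print(compute_full_names({'name': 'Brauhaus (am Markt);Altes Brauhaus'}))
# {'Brauhaus (am Markt)', 'Brauhaus', 'Altes Brauhaus'}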
def _add_postcode(self, postcode):
""" Make sure the normalized postcode is present in the word table.
"""
@@ -504,7 +499,7 @@ class LegacyICUNameAnalyzer:
postcode = self.normalize_postcode(postcode)
if postcode not in self._cache.postcodes:
term = self.name_processor.get_search_normalized(postcode)
term = self._search_normalized(postcode)
if not term:
return
@@ -563,30 +558,25 @@ class _TokenInfo:
self.data['hnr'] = ';'.join(hnrs)
def add_street(self, fulls, _):
def add_street(self, tokens):
""" Add addr:street match terms.
"""
if fulls:
self.data['street'] = self._mk_array(fulls)
if tokens:
self.data['street'] = self._mk_array(tokens)
def add_place(self, fulls, partials):
def add_place(self, tokens):
""" Add addr:place search and match terms.
"""
if fulls:
self.data['place_search'] = self._mk_array(itertools.chain(fulls, partials))
self.data['place_match'] = self._mk_array(fulls)
if tokens:
self.data['place'] = self._mk_array(tokens)
def add_address_terms(self, terms):
""" Add additional address terms.
"""
tokens = {}
for key, fulls, partials in terms:
if fulls:
tokens[key] = [self._mk_array(itertools.chain(fulls, partials)),
self._mk_array(fulls)]
tokens = {key: self._mk_array(partials)
for key, partials in terms if partials}
if tokens:
self.data['addr'] = tokens
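With these changes every bucket holds a single token array. A sketch of the resulting token_info payload (token ids invented; _mk_array is assumed to render PostgreSQL array literals):

# Shape of _TokenInfo.data after the change (token ids invented).
data = {
    'hnr': '9;9 bis',
    'street': '{101,102,103}',
    'place': '{201}',
    'addr': {'city': '{301}', 'suburb': '{302,303}'},
}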
@@ -600,6 +590,7 @@ class _TokenCache:
"""
def __init__(self):
self.names = {}
self.partials = {}
self.postcodes = set()
self.housenumbers = {}


@@ -1,57 +0,0 @@
"""
Data structures for saving variant expansions for ICU tokenizer.
"""
from collections import namedtuple
import json
_ICU_VARIANT_PORPERTY_FIELDS = ['lang']
class ICUVariantProperties(namedtuple('_ICUVariantProperties', _ICU_VARIANT_PORPERTY_FIELDS)):
""" Data container for saving properties that describe when a variant
should be applied.
Property instances are hashable.
"""
@classmethod
def from_rules(cls, _):
""" Create a new property type from a generic dictionary.
The function only takes into account the properties that are
understood presently and ignores all others.
"""
return cls(lang=None)
ICUVariant = namedtuple('ICUVariant', ['source', 'replacement', 'properties'])
def pickle_variant_set(variants):
""" Serializes an iterable of variant rules to a string.
"""
# Create a list of property sets so they don't need to be duplicated.
properties = {}
pid = 1
for variant in variants:
if variant.properties not in properties:
properties[variant.properties] = pid
pid += 1
# Convert the variants into a simple list.
variants = [(v.source, v.replacement, properties[v.properties]) for v in variants]
# Convert everything to JSON.
return json.dumps({'properties': {v: k._asdict() for k, v in properties.items()},
'variants': variants})
def unpickle_variant_set(variant_string):
""" Deserializes a variant string that was previously created with
pickle_variant_set() into a set of ICUVariants.
"""
data = json.loads(variant_string)
properties = {int(k): ICUVariantProperties.from_rules(v)
for k, v in data['properties'].items()}
return set((ICUVariant(src, repl, properties[pid]) for src, repl, pid in data['variants']))
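The pair is a straight JSON round-trip with property sets de-duplicated by id. A usage sketch against the removed definitions above:

# Two variants sharing one property set survive the round-trip intact.
props = ICUVariantProperties.from_rules({})
variants = {ICUVariant('strasse', 'str', props),
            ICUVariant('strasse', 'strasse', props)}
assert unpickle_variant_set(pickle_variant_set(variants)) == variants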


@@ -16,6 +16,7 @@ from nominatim.db import properties
from nominatim.db import utils as db_utils
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.errors import UsageError
from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
DBCFG_NORMALIZATION = "tokenizer_normalization"
DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
@@ -76,7 +77,7 @@ def _check_module(module_dir, conn):
raise UsageError("Database module cannot be accessed.") from err
class LegacyTokenizer:
class LegacyTokenizer(AbstractTokenizer):
""" The legacy tokenizer uses a special PostgreSQL module to normalize
names and queries. The tokenizer thus implements normalization through
calls to the database.
@@ -112,7 +113,7 @@ class LegacyTokenizer:
self._init_db_tables(config)
def init_from_project(self):
def init_from_project(self, _):
""" Initialise the tokenizer from the project directory.
"""
with connect(self.dsn) as conn:
@@ -141,7 +142,7 @@ class LegacyTokenizer:
modulepath=modulepath)
def check_database(self):
def check_database(self, _):
""" Check that the tokenizer is set up correctly.
"""
hint = """\
@@ -185,6 +186,25 @@ class LegacyTokenizer:
self._save_config(conn, config)
def update_statistics(self):
""" Recompute the frequency of full words.
"""
with connect(self.dsn) as conn:
if conn.table_exists('search_name'):
with conn.cursor() as cur:
cur.drop_table("word_frequencies")
LOG.info("Computing word frequencies")
cur.execute("""CREATE TEMP TABLE word_frequencies AS
SELECT unnest(name_vector) as id, count(*)
FROM search_name GROUP BY id""")
cur.execute("CREATE INDEX ON word_frequencies(id)")
LOG.info("Update word table with recomputed frequencies")
cur.execute("""UPDATE word SET search_name_count = count
FROM word_frequencies
WHERE word_token like ' %' and word_id = id""")
cur.drop_table("word_frequencies")
conn.commit()
def name_analyzer(self):
""" Create a new analyzer for tokenizing names and queries
using this tokenizer. Analyzers are context managers and should
@@ -238,7 +258,7 @@ class LegacyTokenizer:
properties.set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
class LegacyNameAnalyzer:
class LegacyNameAnalyzer(AbstractAnalyzer):
""" The legacy analyzer uses the special Postgresql module for
splitting names.
@@ -255,14 +275,6 @@ class LegacyNameAnalyzer:
self._cache = _TokenCache(self.conn)
def __enter__(self):
return self
def __exit__(self, exc_type, exc_value, traceback):
self.close()
def close(self):
""" Free all resources used by the analyzer.
"""
@@ -412,16 +424,15 @@ class LegacyNameAnalyzer:
"""
token_info = _TokenInfo(self._cache)
names = place.get('name')
names = place.name
if names:
token_info.add_names(self.conn, names)
country_feature = place.get('country_feature')
if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
self.add_country_names(country_feature.lower(), names)
if place.is_country():
self.add_country_names(place.country_code, names)
address = place.get('address')
address = place.address
if address:
self._process_place_address(token_info, address)


@@ -0,0 +1,127 @@
"""
Handler for cleaning name and address tags in place information before it
is handed to the token analysis.
"""
import importlib
from nominatim.errors import UsageError
class PlaceName:
""" A searchable name for a place together with properties.
Every name object saves the name proper and two basic properties:
* 'kind' describes the name of the OSM key used without any suffixes
(i.e. the part after the colon removed)
* 'suffix' contains the suffix of the OSM tag, if any. The suffix
is the part of the key after the first colon.
In addition to that, the name may have arbitrary additional attributes.
Which attributes are used depends on the token analyser.
"""
def __init__(self, name, kind, suffix):
self.name = name
self.kind = kind
self.suffix = suffix
self.attr = {}
def __repr__(self):
return f"PlaceName(name='{self.name}',kind='{self.kind}',suffix='{self.suffix}')"
def clone(self, name=None, kind=None, suffix=None, attr=None):
""" Create a deep copy of the place name, optionally with the
given parameters replaced. In the attribute list only the given
keys are updated. The list is not replaced completely.
In particular, the function cannot be used to remove an
attribute from a place name.
"""
newobj = PlaceName(name or self.name,
kind or self.kind,
suffix or self.suffix)
newobj.attr.update(self.attr)
if attr:
newobj.attr.update(attr)
return newobj
def set_attr(self, key, value):
""" Add the given property to the name. If the property was already
set, then the value is overwritten.
"""
self.attr[key] = value
def get_attr(self, key, default=None):
""" Return the given property or the value of 'default' if it
is not set.
"""
return self.attr.get(key, default)
def has_attr(self, key):
""" Check if the given attribute is set.
"""
return key in self.attr
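A small usage sketch of PlaceName (values invented):

# As the sanitizer would build it from the OSM tag 'name:de'.
n = PlaceName('Köln', 'name', 'de')
n.set_attr('analyzer', 'de')

# clone() keeps everything that is not explicitly overridden.
m = n.clone(name='Cologne')
assert m.suffix == 'de' and m.get_attr('analyzer') == 'de'
assert not n.has_attr('missing')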
class _ProcessInfo:
""" Container class for information handed into to handler functions.
The 'names' and 'address' members are mutable. A handler must change
them by either modifying the lists in place or replacing the old content
with a new list.
"""
def __init__(self, place):
self.place = place
self.names = self._convert_name_dict(place.name)
self.address = self._convert_name_dict(place.address)
@staticmethod
def _convert_name_dict(names):
""" Convert a dictionary of names into a list of PlaceNames.
The dictionary key is split into the primary part of the key
and the suffix (the part after an optional colon).
"""
out = []
if names:
for key, value in names.items():
parts = key.split(':', 1)
out.append(PlaceName(value.strip(),
parts[0].strip(),
parts[1].strip() if len(parts) > 1 else None))
return out
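For instance, splitting on the first colon maps OSM-style keys onto kind/suffix pairs (an illustrative call of the private helper):

names = {'name': 'Köln', 'name:en': 'Cologne'}
for p in _ProcessInfo._convert_name_dict(names):
    print(p)
# PlaceName(name='Köln',kind='name',suffix='None')
# PlaceName(name='Cologne',kind='name',suffix='en')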
class PlaceSanitizer:
""" Controller class which applies sanitizer functions on the place
names and address before they are used by the token analysers.
"""
def __init__(self, rules):
self.handlers = []
if rules:
for func in rules:
if 'step' not in func:
raise UsageError("Sanitizer rule is missing the 'step' attribute.")
module_name = 'nominatim.tokenizer.sanitizers.' + func['step'].replace('-', '_')
handler_module = importlib.import_module(module_name)
self.handlers.append(handler_module.create(func))
def process_names(self, place):
""" Extract a sanitized list of names and address parts from the
given place. The function returns a tuple
(list of names, list of address names)
"""
obj = _ProcessInfo(place)
for func in self.handlers:
func(obj)
return obj.names, obj.address
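Handlers are resolved by mapping each rule's 'step' name onto a module under nominatim.tokenizer.sanitizers and calling its create(). A hedged configuration sketch (the step names refer to the new sanitizer modules of this changeset; place is assumed to be a PlaceInfo):

rules = [{'step': 'split-name-list'},
         {'step': 'strip-brace-terms'}]
sanitizer = PlaceSanitizer(rules)
# Returns sanitized PlaceName lists for names and address parts.
names, address = sanitizer.process_names(place)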
