Merge pull request #2707 from lonvia/make-icu-tokenizer-the-default

Make ICU tokenizer the default
2026-02-26 11:08:13 +00:00 · 2022-05-11 08:52:49 +02:00
parent b332b1ae23 c6a426a885
commit 5ff35d9984
21 changed files with 80 additions and 63 deletions
--- a/.github/actions/build-nominatim/action.yml
+++ b/.github/actions/build-nominatim/action.yml
@@ -5,6 +5,10 @@ inputs:
        description: 'Version of Ubuntu to install on'
        required: false
        default: '20'
    cmake-args:
        description: 'Additional options to hand to cmake'
        required: false
        default: ''
 runs:
    using: "composite"
@@ -21,18 +25,13 @@ runs:
          shell: bash
          env:
            UBUNTUVER: ${{ inputs.ubuntu }}
-
+            CMAKE_ARGS: ${{ inputs.cmake-args }}
        - name: Download dependencies
          run: |
              if [ ! -f country_grid.sql.gz ]; then
                  wget --no-verbose https://www.nominatim.org/data/country_grid.sql.gz
              fi
              cp country_grid.sql.gz Nominatim/data/country_osm_grid.sql.gz
          shell: bash
        - name: Configure
-          run: mkdir build && cd build && cmake ../Nominatim
+          run: mkdir build && cd build && cmake $CMAKE_ARGS ../Nominatim
          shell: bash
          env:
            CMAKE_ARGS: ${{ inputs.cmake-args }}
        - name: Build
          run: |
--- a/.github/actions/setup-postgresql/action.yml
+++ b/.github/actions/setup-postgresql/action.yml
@@ -22,7 +22,7 @@ runs:
        - name: Install PostgreSQL
          run: |
-              sudo apt-get install -y -qq --no-install-suggests --no-install-recommends postgresql-client-${PGVER} postgresql-${PGVER}-postgis-${POSTGISVER} postgresql-${PGVER}-postgis-${POSTGISVER}-scripts postgresql-contrib-${PGVER} postgresql-${PGVER} postgresql-server-dev-${PGVER}
+              sudo apt-get install -y -qq --no-install-suggests --no-install-recommends postgresql-client-${PGVER} postgresql-${PGVER}-postgis-${POSTGISVER} postgresql-${PGVER}-postgis-${POSTGISVER}-scripts postgresql-contrib-${PGVER} postgresql-${PGVER}
          shell: bash
          env:
              PGVER: ${{ inputs.postgresql-version }}
--- a/.github/workflows/ci-tests.yml
+++ b/.github/workflows/ci-tests.yml
@@ -113,19 +113,9 @@ jobs:
              working-directory: Nominatim/test/bdd
-    icu-test:
+    legacy-test:
        needs: create-archive
-        strategy:
+        runs-on: ubuntu-20.04
            matrix:
                ubuntu: [20]
                include:
                    - ubuntu: 20
                      postgresql: 13
                      postgis: 3
                      pytest: py.test-3
                      php: 7.4
        runs-on: ubuntu-${{ matrix.ubuntu }}.04
        steps:
            - uses: actions/download-artifact@v2
@@ -138,35 +128,27 @@ jobs:
            - name: Setup PHP
              uses: shivammathur/setup-php@v2
              with:
-                  php-version: ${{ matrix.php }}
+                  php-version: 7.4
                  coverage: xdebug
                  tools: phpunit, phpcs, composer
            - uses: actions/setup-python@v2
              with:
                python-version: 3.6
              if: matrix.ubuntu == 18
            - uses: ./Nominatim/.github/actions/setup-postgresql
              with:
-                  postgresql-version: ${{ matrix.postgresql }}
+                  postgresql-version: 13
-                  postgis-version: ${{ matrix.postgis }}
+                  postgis-version: 3
            - name: Install Postgresql server dev
              run: sudo apt-get install postgresql-server-dev-13
            - uses: ./Nominatim/.github/actions/build-nominatim
              with:
-                  ubuntu: ${{ matrix.ubuntu }}
+                  ubuntu: 20
                  cmake-args: -DBUILD_MODULE=on
            - name: Install test prerequsites
              run: sudo apt-get install -y -qq python3-behave
              if: matrix.ubuntu == 20
-            - name: Install test prerequsites
+            - name: BDD tests (legacy tokenizer)
              run: pip3 install behave==1.2.6
              if: matrix.ubuntu == 18
            - name: BDD tests (icu tokenizer)
              run: |
-                  behave -DREMOVE_TEMPLATE=1 -DBUILDDIR=$GITHUB_WORKSPACE/build -DTOKENIZER=icu --format=progress3
+                  behave -DREMOVE_TEMPLATE=1 -DBUILDDIR=$GITHUB_WORKSPACE/build -DTOKENIZER=legacy --format=progress3
              working-directory: Nominatim/test/bdd
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -44,7 +44,7 @@ endif()
 set(BUILD_IMPORTER on CACHE BOOL "Build everything for importing/updating the database")
 set(BUILD_API on CACHE BOOL "Build everything for the API server")
-set(BUILD_MODULE on CACHE BOOL "Build PostgreSQL module")
+set(BUILD_MODULE off CACHE BOOL "Build PostgreSQL module for legacy tokenizer")
 set(BUILD_TESTS on CACHE BOOL "Build test suite")
 set(BUILD_DOCS on CACHE BOOL "Build documentation")
 set(BUILD_MANPAGE on CACHE BOOL "Build Manual Page")
--- a/docs/admin/Installation.md
+++ b/docs/admin/Installation.md
@@ -158,6 +158,17 @@ make
 sudo make install
 ```
 !!! warning
    The default installation no longer compiles the PostgreSQL module that
    is needed for the legacy tokenizer from older Nominatim versions. If you
    are upgrading an older database or want to run the
    [legacy tokenizer](../customize/Tokenizers.md#legacy-tokenizer) for
    some other reason, you need to enable the PostgreSQL module via
    cmake: `cmake -DBUILD_MODULE=on ../Nominatim`. To compile the module
    you need to have the server development headers for PostgreSQL installed.
    On Ubuntu/Debian run: `sudo apt install postgresql-server-dev-<postgresql version>`
 Nominatim installs itself into `/usr/local` per default. To choose a different
 installation directory add `-DCMAKE_INSTALL_PREFIX=<install root>` to the
 cmake command. Make sure that the `bin` directory is available in your path
--- a/docs/admin/Migration.md
+++ b/docs/admin/Migration.md
@@ -17,6 +17,14 @@ breaking changes. **Please read them before running the migration.**
 ## 4.0.0 -> master
 ### ICU tokenizer is the new default
 Nominatim now installs the [ICU tokenizer](../customize/Tokenizers.md#icu-tokenizer)
 by default. This only has an effect on newly installed databases. An older
 database keeps its installed tokenizer when it is updated. If you still
 run with the legacy tokenizer, make sure to compile Nominatim with the
 PostgreSQL module, see [Installation](Installation.md#building-nominatim).
 ### geocodejson output changed
 The `type` field of the geocodejson output has changed. It now contains
--- a/docs/customize/Tokenizers.md
+++ b/docs/customize/Tokenizers.md
@@ -19,7 +19,22 @@ they can be configured.
 The legacy tokenizer implements the analysis algorithms of older Nominatim
 versions. It uses a special Postgresql module to normalize names and queries.
-This tokenizer is currently the default.
+This tokenizer is automatically installed and used when upgrading an older
 database. It should not be used for new installations anymore.
 ### Compiling the PostgreSQL module
 The tokenizer needs a special C module for PostgreSQL which is not compiled
 by default. If you need the legacy tokenizer, compile Nominatim as follows:
 ```
 mkdir build
 cd build
 cmake -DBUILD_MODULE=on ../Nominatim
 make
 ```
 ### Enabling the tokenizer
 To enable the tokenizer add the following line to your project configuration:
@@ -47,6 +62,7 @@ normalization functions are hard-coded.
 The ICU tokenizer uses the [ICU library](http://site.icu-project.org/) to
 normalize names and queries. It also offers configurable decomposition and
 abbreviation handling.
 This tokenizer is currently the default.
 To enable the tokenizer add the following line to your project configuration:
--- a/nominatim/config.py
+++ b/nominatim/config.py
@@ -187,7 +187,7 @@ class Configuration:
        if configfile.suffix in ('.yaml', '.yml'):
            result = self._load_from_yaml(configfile)
        elif configfile.suffix == '.json':
-            with configfile.open('r') as cfg:
+            with configfile.open('r', encoding='utf-8') as cfg:
                result = json.load(cfg)
        else:
            raise UsageError(f"Config file '{configfile}' has unknown format.")
--- a/nominatim/tokenizer/icu_tokenizer.py
+++ b/nominatim/tokenizer/icu_tokenizer.py
@@ -187,7 +187,7 @@ class LegacyICUTokenizer(AbstractTokenizer):
                @define('CONST_Max_Word_Frequency', 10000000);
                @define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
                @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
-                require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))
+                require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""), encoding='utf-8')
    def _save_config(self):
--- a/nominatim/tokenizer/legacy_tokenizer.py
+++ b/nominatim/tokenizer/legacy_tokenizer.py
@@ -255,7 +255,7 @@ class LegacyTokenizer(AbstractTokenizer):
                @define('CONST_Max_Word_Frequency', {0.MAX_WORD_FREQUENCY});
                @define('CONST_Term_Normalization_Rules', "{0.TERM_NORMALIZATION}");
                require_once('{0.lib_dir.php}/tokenizer/legacy_tokenizer.php');
-                """.format(config)))
+                """.format(config)), encoding='utf-8')
    def _init_db_tables(self, config):
--- a/settings/env.defaults
+++ b/settings/env.defaults
@@ -21,8 +21,8 @@ NOMINATIM_DATABASE_MODULE_PATH=
 # Tokenizer used for normalizing and parsing queries and names.
 # The tokenizer is set up during import and cannot be changed afterwards
 # without a reimport.
-# Currently available tokenizers: legacy
+# Currently available tokenizers: icu, legacy
-NOMINATIM_TOKENIZER="legacy"
+NOMINATIM_TOKENIZER="icu"
 # Number of occurrences of a word before it is considered frequent.
 # Similar to the concept of stop words. Frequent partial words get ignored
--- a/test/bdd/environment.py
+++ b/test/bdd/environment.py
@@ -59,5 +59,5 @@ def after_scenario(context, scenario):
 def before_tag(context, tag):
    if tag == 'fail-legacy':
-        if context.config.userdata['TOKENIZER'] in (None, 'legacy'):
+        if context.config.userdata['TOKENIZER'] == 'legacy':
            context.scenario.skip("Not implemented in legacy tokenizer")
--- a/test/bdd/steps/nominatim_environment.py
+++ b/test/bdd/steps/nominatim_environment.py
@@ -207,7 +207,7 @@ class NominatimEnvironment:
                    self.run_nominatim('add-data', '--tiger-data', str((testdata / 'tiger').resolve()))
                    self.run_nominatim('freeze')
-                    if self.tokenizer != 'icu':
+                    if self.tokenizer == 'legacy':
                        phrase_file = str((testdata / 'specialphrases_testdb.sql').resolve())
                        run_script(['psql', '-d', self.api_test_db, '-f', phrase_file])
                    else:
--- a/test/bdd/steps/steps_db_ops.py
+++ b/test/bdd/steps/steps_db_ops.py
@@ -266,7 +266,7 @@ def check_word_table_for_postcodes(context, exclude, postcodes):
    plist.sort()
    with context.db.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
-        if nctx.tokenizer == 'icu':
+        if nctx.tokenizer != 'legacy':
            cur.execute("SELECT word FROM word WHERE type = 'P' and word = any(%s)",
                        (plist,))
        else:
--- a/test/python/conftest.py
+++ b/test/python/conftest.py
@@ -211,11 +211,6 @@ def osmline_table(temp_db_with_extensions, table_factory):
                     country_code VARCHAR(2)""")
@pytest.fixture
 def word_table(temp_db_conn):
    return mocks.MockWordTable(temp_db_conn)
@pytest.fixture
 def sql_preprocessor_cfg(tmp_path, table_factory, temp_db_with_extensions):
    table_factory('country_name', 'partition INT', ((0, ), (1, ), (2, )))
--- a/test/python/mocks.py
+++ b/test/python/mocks.py
@@ -14,7 +14,7 @@ import psycopg2.extras
 from nominatim.db import properties
 # This must always point to the mock word table for the default tokenizer.
-from mock_legacy_word_table import MockLegacyWordTable as MockWordTable
+from mock_icu_word_table import MockIcuWordTable as MockWordTable
 class MockPlacexTable:
    """ A placex table for testing.
--- a/test/python/tools/test_database_import.py
+++ b/test/python/tools/test_database_import.py
@@ -179,7 +179,7 @@ def test_truncate_database_tables(temp_db_conn, temp_db_cursor, table_factory, w
@pytest.mark.parametrize("threads", (1, 5))
 def test_load_data(dsn, place_row, placex_table, osmline_table,
-                   word_table, temp_db_cursor, threads):
+                   temp_db_cursor, threads):
    for func in ('precompute_words', 'getorcreate_housenumber_id', 'make_standard_name'):
        temp_db_cursor.execute(f"""CREATE FUNCTION {func} (src TEXT)
                                  RETURNS TEXT AS $$ SELECT 'a'::TEXT $$ LANGUAGE SQL
--- a/test/python/tools/test_migration.py
+++ b/test/python/tools/test_migration.py
@@ -14,6 +14,8 @@ from nominatim.tools import migration
 from nominatim.errors import UsageError
 import nominatim.version
 from mock_legacy_word_table import MockLegacyWordTable
 class DummyTokenizer:
    def update_sql_functions(self, config):
@@ -26,6 +28,10 @@ def postprocess_mock(monkeypatch):
    monkeypatch.setattr(migration.tokenizer_factory, 'get_tokenizer_for_db',
                        lambda *args: DummyTokenizer())
@pytest.fixture
 def legacy_word_table(temp_db_conn):
    return MockLegacyWordTable(temp_db_conn)
 def test_no_migration_old_versions(temp_db_with_extensions, table_factory, def_config):
    table_factory('country_name', 'name HSTORE, country_code TEXT')
@@ -156,7 +162,7 @@ def test_add_nominatim_property_table_repeat(temp_db_conn, temp_db_cursor,
 def test_change_housenumber_transliteration(temp_db_conn, temp_db_cursor,
-                                            word_table, placex_table):
+                                            legacy_word_table, placex_table):
    placex_table.add(housenumber='3A')
    temp_db_cursor.execute("""CREATE OR REPLACE FUNCTION make_standard_name(name TEXT)
--- a/test/python/tools/test_postcodes.py
+++ b/test/python/tools/test_postcodes.py
@@ -65,7 +65,7 @@ def tokenizer():
    return dummy_tokenizer.DummyTokenizer(None, None)
@pytest.fixture
-def postcode_table(temp_db_conn, placex_table, word_table):
+def postcode_table(temp_db_conn, placex_table):
    return MockPostcodeTable(temp_db_conn)
--- a/vagrant/Install-on-Ubuntu-18.sh
+++ b/vagrant/Install-on-Ubuntu-18.sh
@@ -25,10 +25,10 @@ export DEBIAN_FRONTEND=noninteractive #DOCS:
    sudo apt install -y build-essential cmake g++ libboost-dev libboost-system-dev \
                        libboost-filesystem-dev libexpat1-dev zlib1g-dev\
                        libbz2-dev libpq-dev libproj-dev \
-                        postgresql-server-dev-10 postgresql-10-postgis-2.4 \
+                        postgresql-10-postgis-2.4 \
                        postgresql-contrib-10 postgresql-10-postgis-scripts \
                        php php-pgsql php-intl libicu-dev python3-pip \
-                        python3-psutil python3-jinja2 python3-icu git
+                        python3-psutil python3-jinja2 python3-yaml python3-icu git
 # Some of the Python packages that come with Ubuntu 18.04 are too old, so
 # install the latest version from pip:
--- a/vagrant/Install-on-Ubuntu-20.sh
+++ b/vagrant/Install-on-Ubuntu-20.sh
@@ -24,11 +24,11 @@ export DEBIAN_FRONTEND=noninteractive #DOCS:
    sudo apt install -y build-essential cmake g++ libboost-dev libboost-system-dev \
                        libboost-filesystem-dev libexpat1-dev zlib1g-dev \
                        libbz2-dev libpq-dev libproj-dev \
-                        postgresql-server-dev-12 postgresql-12-postgis-3 \
+                        postgresql-12-postgis-3 \
                        postgresql-contrib-12 postgresql-12-postgis-3-scripts \
                        php php-pgsql php-intl libicu-dev python3-dotenv \
                        python3-psycopg2 python3-psutil python3-jinja2 \
-                        python3-icu python3-datrie git
+                        python3-icu python3-datrie python3-yaml git
 #
 # System Configuration