Merge pull request #2707 from lonvia/make-icu-tokenizer-the-default

Make ICU tokenizer the default
This commit is contained in:
Sarah Hoffmann
2022-05-11 08:52:49 +02:00
committed by GitHub
21 changed files with 80 additions and 63 deletions

View File

@@ -5,6 +5,10 @@ inputs:
description: 'Version of Ubuntu to install on' description: 'Version of Ubuntu to install on'
required: false required: false
default: '20' default: '20'
cmake-args:
description: 'Additional options to hand to cmake'
required: false
default: ''
runs: runs:
using: "composite" using: "composite"
@@ -21,18 +25,13 @@ runs:
shell: bash shell: bash
env: env:
UBUNTUVER: ${{ inputs.ubuntu }} UBUNTUVER: ${{ inputs.ubuntu }}
CMAKE_ARGS: ${{ inputs.cmake-args }}
- name: Download dependencies
run: |
if [ ! -f country_grid.sql.gz ]; then
wget --no-verbose https://www.nominatim.org/data/country_grid.sql.gz
fi
cp country_grid.sql.gz Nominatim/data/country_osm_grid.sql.gz
shell: bash
- name: Configure - name: Configure
run: mkdir build && cd build && cmake ../Nominatim run: mkdir build && cd build && cmake $CMAKE_ARGS ../Nominatim
shell: bash shell: bash
env:
CMAKE_ARGS: ${{ inputs.cmake-args }}
- name: Build - name: Build
run: | run: |

View File

@@ -22,7 +22,7 @@ runs:
- name: Install PostgreSQL - name: Install PostgreSQL
run: | run: |
sudo apt-get install -y -qq --no-install-suggests --no-install-recommends postgresql-client-${PGVER} postgresql-${PGVER}-postgis-${POSTGISVER} postgresql-${PGVER}-postgis-${POSTGISVER}-scripts postgresql-contrib-${PGVER} postgresql-${PGVER} postgresql-server-dev-${PGVER} sudo apt-get install -y -qq --no-install-suggests --no-install-recommends postgresql-client-${PGVER} postgresql-${PGVER}-postgis-${POSTGISVER} postgresql-${PGVER}-postgis-${POSTGISVER}-scripts postgresql-contrib-${PGVER} postgresql-${PGVER}
shell: bash shell: bash
env: env:
PGVER: ${{ inputs.postgresql-version }} PGVER: ${{ inputs.postgresql-version }}

View File

@@ -113,19 +113,9 @@ jobs:
working-directory: Nominatim/test/bdd working-directory: Nominatim/test/bdd
icu-test: legacy-test:
needs: create-archive needs: create-archive
strategy: runs-on: ubuntu-20.04
matrix:
ubuntu: [20]
include:
- ubuntu: 20
postgresql: 13
postgis: 3
pytest: py.test-3
php: 7.4
runs-on: ubuntu-${{ matrix.ubuntu }}.04
steps: steps:
- uses: actions/download-artifact@v2 - uses: actions/download-artifact@v2
@@ -138,35 +128,27 @@ jobs:
- name: Setup PHP - name: Setup PHP
uses: shivammathur/setup-php@v2 uses: shivammathur/setup-php@v2
with: with:
php-version: ${{ matrix.php }} php-version: 7.4
coverage: xdebug
tools: phpunit, phpcs, composer
- uses: actions/setup-python@v2
with:
python-version: 3.6
if: matrix.ubuntu == 18
- uses: ./Nominatim/.github/actions/setup-postgresql - uses: ./Nominatim/.github/actions/setup-postgresql
with: with:
postgresql-version: ${{ matrix.postgresql }} postgresql-version: 13
postgis-version: ${{ matrix.postgis }} postgis-version: 3
- name: Install Postgresql server dev
run: sudo apt-get install postgresql-server-dev-13
- uses: ./Nominatim/.github/actions/build-nominatim - uses: ./Nominatim/.github/actions/build-nominatim
with: with:
ubuntu: ${{ matrix.ubuntu }} ubuntu: 20
cmake-args: -DBUILD_MODULE=on
- name: Install test prerequsites - name: Install test prerequsites
run: sudo apt-get install -y -qq python3-behave run: sudo apt-get install -y -qq python3-behave
if: matrix.ubuntu == 20
- name: Install test prerequsites - name: BDD tests (legacy tokenizer)
run: pip3 install behave==1.2.6
if: matrix.ubuntu == 18
- name: BDD tests (icu tokenizer)
run: | run: |
behave -DREMOVE_TEMPLATE=1 -DBUILDDIR=$GITHUB_WORKSPACE/build -DTOKENIZER=icu --format=progress3 behave -DREMOVE_TEMPLATE=1 -DBUILDDIR=$GITHUB_WORKSPACE/build -DTOKENIZER=legacy --format=progress3
working-directory: Nominatim/test/bdd working-directory: Nominatim/test/bdd

View File

@@ -44,7 +44,7 @@ endif()
set(BUILD_IMPORTER on CACHE BOOL "Build everything for importing/updating the database") set(BUILD_IMPORTER on CACHE BOOL "Build everything for importing/updating the database")
set(BUILD_API on CACHE BOOL "Build everything for the API server") set(BUILD_API on CACHE BOOL "Build everything for the API server")
set(BUILD_MODULE on CACHE BOOL "Build PostgreSQL module") set(BUILD_MODULE off CACHE BOOL "Build PostgreSQL module for legacy tokenizer")
set(BUILD_TESTS on CACHE BOOL "Build test suite") set(BUILD_TESTS on CACHE BOOL "Build test suite")
set(BUILD_DOCS on CACHE BOOL "Build documentation") set(BUILD_DOCS on CACHE BOOL "Build documentation")
set(BUILD_MANPAGE on CACHE BOOL "Build Manual Page") set(BUILD_MANPAGE on CACHE BOOL "Build Manual Page")

View File

@@ -158,6 +158,17 @@ make
sudo make install sudo make install
``` ```
!!! warning
The default installation no longer compiles the PostgreSQL module that
is needed for the legacy tokenizer from older Nominatim versions. If you
are upgrading an older database or want to run the
[legacy tokenizer](../customize/Tokenizers.md#legacy-tokenizer) for
some other reason, you need to enable the PostgreSQL module via
cmake: `cmake -DBUILD_MODULE=on ../Nominatim`. To compile the module
you need to have the server development headers for PostgreSQL installed.
On Ubuntu/Debian run: `sudo apt install postgresql-server-dev-<postgresql version>`
Nominatim installs itself into `/usr/local` per default. To choose a different Nominatim installs itself into `/usr/local` per default. To choose a different
installation directory add `-DCMAKE_INSTALL_PREFIX=<install root>` to the installation directory add `-DCMAKE_INSTALL_PREFIX=<install root>` to the
cmake command. Make sure that the `bin` directory is available in your path cmake command. Make sure that the `bin` directory is available in your path

View File

@@ -17,6 +17,14 @@ breaking changes. **Please read them before running the migration.**
## 4.0.0 -> master ## 4.0.0 -> master
### ICU tokenizer is the new default
Nominatim now installs the [ICU tokenizer](../customize/Tokenizers.md#icu-tokenizer)
by default. This only has an effect on newly installed databases. When
updating an older database, the database keeps its installed tokenizer. If you still
run with the legacy tokenizer, make sure to compile Nominatim with the
PostgreSQL module, see [Installation](Installation.md#building-nominatim).
### geocodejson output changed ### geocodejson output changed
The `type` field of the geocodejson output has changed. It now contains The `type` field of the geocodejson output has changed. It now contains

View File

@@ -19,7 +19,22 @@ they can be configured.
The legacy tokenizer implements the analysis algorithms of older Nominatim The legacy tokenizer implements the analysis algorithms of older Nominatim
versions. It uses a special Postgresql module to normalize names and queries. versions. It uses a special Postgresql module to normalize names and queries.
This tokenizer is currently the default. This tokenizer is automatically installed and used when upgrading an older
database. It should not be used for new installations anymore.
### Compiling the PostgreSQL module
The tokenizer needs a special C module for PostgreSQL which is not compiled
by default. If you need the legacy tokenizer, compile Nominatim as follows:
```
mkdir build
cd build
cmake -DBUILD_MODULE=on ../Nominatim
make
```
### Enabling the tokenizer
To enable the tokenizer add the following line to your project configuration: To enable the tokenizer add the following line to your project configuration:
@@ -47,6 +62,7 @@ normalization functions are hard-coded.
The ICU tokenizer uses the [ICU library](http://site.icu-project.org/) to The ICU tokenizer uses the [ICU library](http://site.icu-project.org/) to
normalize names and queries. It also offers configurable decomposition and normalize names and queries. It also offers configurable decomposition and
abbreviation handling. abbreviation handling.
This tokenizer is currently the default.
To enable the tokenizer add the following line to your project configuration: To enable the tokenizer add the following line to your project configuration:

View File

@@ -187,7 +187,7 @@ class Configuration:
if configfile.suffix in ('.yaml', '.yml'): if configfile.suffix in ('.yaml', '.yml'):
result = self._load_from_yaml(configfile) result = self._load_from_yaml(configfile)
elif configfile.suffix == '.json': elif configfile.suffix == '.json':
with configfile.open('r') as cfg: with configfile.open('r', encoding='utf-8') as cfg:
result = json.load(cfg) result = json.load(cfg)
else: else:
raise UsageError(f"Config file '{configfile}' has unknown format.") raise UsageError(f"Config file '{configfile}' has unknown format.")

View File

@@ -187,7 +187,7 @@ class LegacyICUTokenizer(AbstractTokenizer):
@define('CONST_Max_Word_Frequency', 10000000); @define('CONST_Max_Word_Frequency', 10000000);
@define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}"); @define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
@define('CONST_Transliteration', "{self.loader.get_search_rules()}"); @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
require_once('{phpdir}/tokenizer/icu_tokenizer.php');""")) require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""), encoding='utf-8')
def _save_config(self): def _save_config(self):

View File

@@ -255,7 +255,7 @@ class LegacyTokenizer(AbstractTokenizer):
@define('CONST_Max_Word_Frequency', {0.MAX_WORD_FREQUENCY}); @define('CONST_Max_Word_Frequency', {0.MAX_WORD_FREQUENCY});
@define('CONST_Term_Normalization_Rules', "{0.TERM_NORMALIZATION}"); @define('CONST_Term_Normalization_Rules', "{0.TERM_NORMALIZATION}");
require_once('{0.lib_dir.php}/tokenizer/legacy_tokenizer.php'); require_once('{0.lib_dir.php}/tokenizer/legacy_tokenizer.php');
""".format(config))) """.format(config)), encoding='utf-8')
def _init_db_tables(self, config): def _init_db_tables(self, config):

View File

@@ -21,8 +21,8 @@ NOMINATIM_DATABASE_MODULE_PATH=
# Tokenizer used for normalizing and parsing queries and names. # Tokenizer used for normalizing and parsing queries and names.
# The tokenizer is set up during import and cannot be changed afterwards # The tokenizer is set up during import and cannot be changed afterwards
# without a reimport. # without a reimport.
# Currently available tokenizers: legacy # Currently available tokenizers: icu, legacy
NOMINATIM_TOKENIZER="legacy" NOMINATIM_TOKENIZER="icu"
# Number of occurrences of a word before it is considered frequent. # Number of occurrences of a word before it is considered frequent.
# Similar to the concept of stop words. Frequent partial words get ignored # Similar to the concept of stop words. Frequent partial words get ignored

View File

@@ -59,5 +59,5 @@ def after_scenario(context, scenario):
def before_tag(context, tag): def before_tag(context, tag):
if tag == 'fail-legacy': if tag == 'fail-legacy':
if context.config.userdata['TOKENIZER'] in (None, 'legacy'): if context.config.userdata['TOKENIZER'] == 'legacy':
context.scenario.skip("Not implemented in legacy tokenizer") context.scenario.skip("Not implemented in legacy tokenizer")

View File

@@ -207,7 +207,7 @@ class NominatimEnvironment:
self.run_nominatim('add-data', '--tiger-data', str((testdata / 'tiger').resolve())) self.run_nominatim('add-data', '--tiger-data', str((testdata / 'tiger').resolve()))
self.run_nominatim('freeze') self.run_nominatim('freeze')
if self.tokenizer != 'icu': if self.tokenizer == 'legacy':
phrase_file = str((testdata / 'specialphrases_testdb.sql').resolve()) phrase_file = str((testdata / 'specialphrases_testdb.sql').resolve())
run_script(['psql', '-d', self.api_test_db, '-f', phrase_file]) run_script(['psql', '-d', self.api_test_db, '-f', phrase_file])
else: else:

View File

@@ -266,7 +266,7 @@ def check_word_table_for_postcodes(context, exclude, postcodes):
plist.sort() plist.sort()
with context.db.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur: with context.db.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
if nctx.tokenizer == 'icu': if nctx.tokenizer != 'legacy':
cur.execute("SELECT word FROM word WHERE type = 'P' and word = any(%s)", cur.execute("SELECT word FROM word WHERE type = 'P' and word = any(%s)",
(plist,)) (plist,))
else: else:

View File

@@ -211,11 +211,6 @@ def osmline_table(temp_db_with_extensions, table_factory):
country_code VARCHAR(2)""") country_code VARCHAR(2)""")
@pytest.fixture
def word_table(temp_db_conn):
return mocks.MockWordTable(temp_db_conn)
@pytest.fixture @pytest.fixture
def sql_preprocessor_cfg(tmp_path, table_factory, temp_db_with_extensions): def sql_preprocessor_cfg(tmp_path, table_factory, temp_db_with_extensions):
table_factory('country_name', 'partition INT', ((0, ), (1, ), (2, ))) table_factory('country_name', 'partition INT', ((0, ), (1, ), (2, )))

View File

@@ -14,7 +14,7 @@ import psycopg2.extras
from nominatim.db import properties from nominatim.db import properties
# This must always point to the mock word table for the default tokenizer. # This must always point to the mock word table for the default tokenizer.
from mock_legacy_word_table import MockLegacyWordTable as MockWordTable from mock_icu_word_table import MockIcuWordTable as MockWordTable
class MockPlacexTable: class MockPlacexTable:
""" A placex table for testing. """ A placex table for testing.

View File

@@ -179,7 +179,7 @@ def test_truncate_database_tables(temp_db_conn, temp_db_cursor, table_factory, w
@pytest.mark.parametrize("threads", (1, 5)) @pytest.mark.parametrize("threads", (1, 5))
def test_load_data(dsn, place_row, placex_table, osmline_table, def test_load_data(dsn, place_row, placex_table, osmline_table,
word_table, temp_db_cursor, threads): temp_db_cursor, threads):
for func in ('precompute_words', 'getorcreate_housenumber_id', 'make_standard_name'): for func in ('precompute_words', 'getorcreate_housenumber_id', 'make_standard_name'):
temp_db_cursor.execute(f"""CREATE FUNCTION {func} (src TEXT) temp_db_cursor.execute(f"""CREATE FUNCTION {func} (src TEXT)
RETURNS TEXT AS $$ SELECT 'a'::TEXT $$ LANGUAGE SQL RETURNS TEXT AS $$ SELECT 'a'::TEXT $$ LANGUAGE SQL

View File

@@ -14,6 +14,8 @@ from nominatim.tools import migration
from nominatim.errors import UsageError from nominatim.errors import UsageError
import nominatim.version import nominatim.version
from mock_legacy_word_table import MockLegacyWordTable
class DummyTokenizer: class DummyTokenizer:
def update_sql_functions(self, config): def update_sql_functions(self, config):
@@ -26,6 +28,10 @@ def postprocess_mock(monkeypatch):
monkeypatch.setattr(migration.tokenizer_factory, 'get_tokenizer_for_db', monkeypatch.setattr(migration.tokenizer_factory, 'get_tokenizer_for_db',
lambda *args: DummyTokenizer()) lambda *args: DummyTokenizer())
@pytest.fixture
def legacy_word_table(temp_db_conn):
return MockLegacyWordTable(temp_db_conn)
def test_no_migration_old_versions(temp_db_with_extensions, table_factory, def_config): def test_no_migration_old_versions(temp_db_with_extensions, table_factory, def_config):
table_factory('country_name', 'name HSTORE, country_code TEXT') table_factory('country_name', 'name HSTORE, country_code TEXT')
@@ -156,7 +162,7 @@ def test_add_nominatim_property_table_repeat(temp_db_conn, temp_db_cursor,
def test_change_housenumber_transliteration(temp_db_conn, temp_db_cursor, def test_change_housenumber_transliteration(temp_db_conn, temp_db_cursor,
word_table, placex_table): legacy_word_table, placex_table):
placex_table.add(housenumber='3A') placex_table.add(housenumber='3A')
temp_db_cursor.execute("""CREATE OR REPLACE FUNCTION make_standard_name(name TEXT) temp_db_cursor.execute("""CREATE OR REPLACE FUNCTION make_standard_name(name TEXT)

View File

@@ -65,7 +65,7 @@ def tokenizer():
return dummy_tokenizer.DummyTokenizer(None, None) return dummy_tokenizer.DummyTokenizer(None, None)
@pytest.fixture @pytest.fixture
def postcode_table(temp_db_conn, placex_table, word_table): def postcode_table(temp_db_conn, placex_table):
return MockPostcodeTable(temp_db_conn) return MockPostcodeTable(temp_db_conn)

View File

@@ -25,10 +25,10 @@ export DEBIAN_FRONTEND=noninteractive #DOCS:
sudo apt install -y build-essential cmake g++ libboost-dev libboost-system-dev \ sudo apt install -y build-essential cmake g++ libboost-dev libboost-system-dev \
libboost-filesystem-dev libexpat1-dev zlib1g-dev\ libboost-filesystem-dev libexpat1-dev zlib1g-dev\
libbz2-dev libpq-dev libproj-dev \ libbz2-dev libpq-dev libproj-dev \
postgresql-server-dev-10 postgresql-10-postgis-2.4 \ postgresql-10-postgis-2.4 \
postgresql-contrib-10 postgresql-10-postgis-scripts \ postgresql-contrib-10 postgresql-10-postgis-scripts \
php php-pgsql php-intl libicu-dev python3-pip \ php php-pgsql php-intl libicu-dev python3-pip \
python3-psutil python3-jinja2 python3-icu git python3-psutil python3-jinja2 python3-yaml python3-icu git
# Some of the Python packages that come with Ubuntu 18.04 are too old, so # Some of the Python packages that come with Ubuntu 18.04 are too old, so
# install the latest version from pip: # install the latest version from pip:

View File

@@ -24,11 +24,11 @@ export DEBIAN_FRONTEND=noninteractive #DOCS:
sudo apt install -y build-essential cmake g++ libboost-dev libboost-system-dev \ sudo apt install -y build-essential cmake g++ libboost-dev libboost-system-dev \
libboost-filesystem-dev libexpat1-dev zlib1g-dev \ libboost-filesystem-dev libexpat1-dev zlib1g-dev \
libbz2-dev libpq-dev libproj-dev \ libbz2-dev libpq-dev libproj-dev \
postgresql-server-dev-12 postgresql-12-postgis-3 \ postgresql-12-postgis-3 \
postgresql-contrib-12 postgresql-12-postgis-3-scripts \ postgresql-contrib-12 postgresql-12-postgis-3-scripts \
php php-pgsql php-intl libicu-dev python3-dotenv \ php php-pgsql php-intl libicu-dev python3-dotenv \
python3-psycopg2 python3-psutil python3-jinja2 \ python3-psycopg2 python3-psutil python3-jinja2 \
python3-icu python3-datrie git python3-icu python3-datrie python3-yaml git
# #
# System Configuration # System Configuration