remove legacy tokenizer and direct tests

2024-09-21 11:38:08 +02:00
parent e92e03e2e6
commit b87d6226fb
9 changed files with 0 additions and 2360 deletions
--- a/test/python/api/search/test_legacy_query_analyzer.py
+++ b/test/python/api/search/test_legacy_query_analyzer.py
@@ -1,241 +0,0 @@
-# SPDX-License-Identifier: GPL-3.0-or-later
-#
-# This file is part of Nominatim. (https://nominatim.org)
-#
-# Copyright (C) 2024 by the Nominatim developer community.
-# For a full list of authors see the git log.
-"""
-Tests for query analyzer for legacy tokenizer.
-"""
-import pytest
-import pytest_asyncio
-
-from nominatim_api import NominatimAPIAsync
-from nominatim_api.search.query import Phrase, PhraseType, TokenType, BreakType
-import nominatim_api.search.legacy_tokenizer as tok
-from nominatim_api.logging import set_log_output, get_and_disable
-
-
-async def add_word(conn, word_id, word_token, word, count):
-    t = conn.t.meta.tables['word']
-    await conn.execute(t.insert(), {'word_id': word_id,
-                                    'word_token': word_token,
-                                    'search_name_count': count,
-                                    'word': word})
-
-
-async def add_housenumber(conn, word_id, hnr):
-    t = conn.t.meta.tables['word']
-    await conn.execute(t.insert(), {'word_id': word_id,
-                                    'word_token': ' ' + hnr,
-                                    'word': hnr,
-                                    'class': 'place',
-                                    'type': 'house'})
-
-
-async def add_postcode(conn, word_id, postcode):
-    t = conn.t.meta.tables['word']
-    await conn.execute(t.insert(), {'word_id': word_id,
-                                    'word_token': ' ' + postcode,
-                                    'word': postcode,
-                                    'class': 'place',
-                                    'type': 'postcode'})
-
-
-async def add_special_term(conn, word_id, word_token, cls, typ, op):
-    t = conn.t.meta.tables['word']
-    await conn.execute(t.insert(), {'word_id': word_id,
-                                    'word_token': word_token,
-                                    'word': word_token,
-                                    'class': cls,
-                                    'type': typ,
-                                    'operator': op})
-
-
-def make_phrase(query):
-    return [Phrase(PhraseType.NONE, s) for s in query.split(',')]
-
-
-@pytest_asyncio.fixture
-async def conn(table_factory, temp_db_cursor):
-    """ Create an asynchronous SQLAlchemy engine for the test DB.
-    """
-    table_factory('nominatim_properties',
-                  definition='property TEXT, value TEXT',
-                  content=(('tokenizer_maxwordfreq', '10000'), ))
-    table_factory('word',
-                  definition="""word_id INT, word_token TEXT, word TEXT,
-                                class TEXT, type TEXT, country_code TEXT,
-                                search_name_count INT, operator TEXT
-                             """)
-
-    temp_db_cursor.execute("""CREATE OR REPLACE FUNCTION make_standard_name(name TEXT)
-                              RETURNS TEXT AS $$ SELECT lower(name); $$ LANGUAGE SQL;""")
-
-    async with NominatimAPIAsync() as api:
-        async with api.begin() as conn:
-            yield conn
-
-
-@pytest.mark.asyncio
-async def test_empty_phrase(conn):
-    ana = await tok.create_query_analyzer(conn)
-
-    query = await ana.analyze_query([])
-
-    assert len(query.source) == 0
-    assert query.num_token_slots() == 0
-
-
-@pytest.mark.asyncio
-async def test_single_phrase_with_unknown_terms(conn):
-    ana = await tok.create_query_analyzer(conn)
-
-    await add_word(conn, 1, 'foo', 'FOO', 3)
-
-    query = await ana.analyze_query(make_phrase('foo BAR'))
-
-    assert len(query.source) == 1
-    assert query.source[0].ptype == PhraseType.NONE
-    assert query.source[0].text == 'foo bar'
-
-    assert query.num_token_slots() == 2
-    assert len(query.nodes[0].starting) == 1
-    assert not query.nodes[1].starting
-
-
-@pytest.mark.asyncio
-async def test_multiple_phrases(conn):
-    ana = await tok.create_query_analyzer(conn)
-
-    await add_word(conn, 1, 'one', 'one', 13)
-    await add_word(conn, 2, 'two', 'two', 45)
-    await add_word(conn, 100, 'one two', 'one two', 3)
-    await add_word(conn, 3, 'three', 'three', 4584)
-
-    query = await ana.analyze_query(make_phrase('one two,three'))
-
-    assert len(query.source) == 2
-
-
-@pytest.mark.asyncio
-async def test_housenumber_token(conn):
-    ana = await tok.create_query_analyzer(conn)
-
-    await add_housenumber(conn, 556, '45 a')
-
-    query = await ana.analyze_query(make_phrase('45 A'))
-
-    assert query.num_token_slots() == 2
-    assert len(query.nodes[0].starting) == 2
-
-    query.nodes[0].starting.sort(key=lambda tl: tl.end)
-
-    hn1 = query.nodes[0].starting[0]
-    assert hn1.ttype == TokenType.HOUSENUMBER
-    assert hn1.end == 1
-    assert hn1.tokens[0].token == 0
-
-    hn2 = query.nodes[0].starting[1]
-    assert hn2.ttype == TokenType.HOUSENUMBER
-    assert hn2.end == 2
-    assert hn2.tokens[0].token == 556
-
-
-@pytest.mark.asyncio
-async def test_postcode_token(conn):
-    ana = await tok.create_query_analyzer(conn)
-
-    await add_postcode(conn, 34, '45ax')
-
-    query = await ana.analyze_query(make_phrase('45AX'))
-
-    assert query.num_token_slots() == 1
-    assert [tl.ttype for tl in query.nodes[0].starting] == [TokenType.POSTCODE]
-
-
-@pytest.mark.asyncio
-async def test_partial_tokens(conn):
-    ana = await tok.create_query_analyzer(conn)
-
-    await add_word(conn, 1, ' foo', 'foo', 99)
-    await add_word(conn, 1, 'foo', 'FOO', 99)
-    await add_word(conn, 1, 'bar', 'FOO', 990000)
-
-    query = await ana.analyze_query(make_phrase('foo bar'))
-
-    assert query.num_token_slots() == 2
-
-    first = query.nodes[0].starting
-    first.sort(key=lambda tl: tl.tokens[0].penalty)
-    assert [tl.ttype for tl in first] == [TokenType.WORD, TokenType.PARTIAL]
-    assert all(tl.tokens[0].lookup_word == 'foo' for tl in first)
-
-    second = query.nodes[1].starting
-    assert [tl.ttype for tl in second] == [TokenType.PARTIAL]
-    assert not second[0].tokens[0].is_indexed
-
-
-@pytest.mark.asyncio
-@pytest.mark.parametrize('term,order', [('23456', ['POSTCODE', 'HOUSENUMBER', 'WORD', 'PARTIAL']),
-                                        ('3', ['HOUSENUMBER', 'POSTCODE', 'WORD', 'PARTIAL'])
-                                       ])
-async def test_penalty_postcodes_and_housenumbers(conn, term, order):
-    ana = await tok.create_query_analyzer(conn)
-
-    await add_postcode(conn, 1, term)
-    await add_housenumber(conn, 2, term)
-    await add_word(conn, 3, term, term, 5)
-    await add_word(conn, 4, ' ' + term, term, 1)
-
-    query = await ana.analyze_query(make_phrase(term))
-
-    assert query.num_token_slots() == 1
-
-    torder = [(tl.tokens[0].penalty, tl.ttype.name) for tl in query.nodes[0].starting]
-    torder.sort()
-
-    assert [t[1] for t in torder] == order
-
-
-@pytest.mark.asyncio
-async def test_category_words_only_at_beginning(conn):
-    ana = await tok.create_query_analyzer(conn)
-
-    await add_special_term(conn, 1, 'foo', 'amenity', 'restaurant', 'in')
-    await add_word(conn, 2, ' bar', 'BAR', 1)
-
-    query = await ana.analyze_query(make_phrase('foo BAR foo'))
-
-    assert query.num_token_slots() == 3
-    assert len(query.nodes[0].starting) == 1
-    assert query.nodes[0].starting[0].ttype == TokenType.NEAR_ITEM
-    assert not query.nodes[2].starting
-
-
-@pytest.mark.asyncio
-async def test_qualifier_words(conn):
-    ana = await tok.create_query_analyzer(conn)
-
-    await add_special_term(conn, 1, 'foo', 'amenity', 'restaurant', '-')
-    await add_word(conn, 2, ' bar', 'w', None)
-
-    query = await ana.analyze_query(make_phrase('foo BAR foo BAR foo'))
-
-    assert query.num_token_slots() == 5
-    assert set(t.ttype for t in query.nodes[0].starting) == {TokenType.NEAR_ITEM, TokenType.QUALIFIER}
-    assert set(t.ttype for t in query.nodes[2].starting) == {TokenType.QUALIFIER}
-    assert set(t.ttype for t in query.nodes[4].starting) == {TokenType.NEAR_ITEM, TokenType.QUALIFIER}
-
-
-@pytest.mark.asyncio
-@pytest.mark.parametrize('logtype', ['text', 'html'])
-async def test_log_output(conn, logtype):
-    ana = await tok.create_query_analyzer(conn)
-
-    await add_word(conn, 1, 'foo', 'FOO', 99)
-
-    set_log_output(logtype)
-    await ana.analyze_query(make_phrase('foo'))
-
-    assert get_and_disable()
--- a/test/python/mock_legacy_word_table.py
+++ b/test/python/mock_legacy_word_table.py
@@ -1,99 +0,0 @@
-# SPDX-License-Identifier: GPL-3.0-or-later
-#
-# This file is part of Nominatim. (https://nominatim.org)
-#
-# Copyright (C) 2024 by the Nominatim developer community.
-# For a full list of authors see the git log.
-"""
-Legacy word table for testing with functions to prefil and test contents
-of the table.
-"""
-from nominatim_db.db.connection import execute_scalar
-
-class MockLegacyWordTable:
-    """ A word table for testing using legacy word table structure.
-    """
-    def __init__(self, conn):
-        self.conn = conn
-        with conn.cursor() as cur:
-            cur.execute("""CREATE TABLE word (word_id INTEGER,
-                                              word_token text,
-                                              word text,
-                                              class text,
-                                              type text,
-                                              country_code varchar(2),
-                                              search_name_count INTEGER,
-                                              operator TEXT)""")
-
-        conn.commit()
-
-    def add_full_word(self, word_id, word, word_token=None):
-        with self.conn.cursor() as cur:
-            cur.execute("""INSERT INTO word (word_id, word_token, word)
-                           VALUES (%s, %s, %s)
-                        """, (word_id, ' ' + (word_token or word), word))
-        self.conn.commit()
-
-
-    def add_special(self, word_token, word, cls, typ, oper):
-        with self.conn.cursor() as cur:
-            cur.execute("""INSERT INTO word (word_token, word, class, type, operator)
-                              VALUES (%s, %s, %s, %s, %s)
-                        """, (word_token, word, cls, typ, oper))
-        self.conn.commit()
-
-
-    def add_country(self, country_code, word_token):
-        with self.conn.cursor() as cur:
-            cur.execute("INSERT INTO word (word_token, country_code) VALUES(%s, %s)",
-                        (word_token, country_code))
-        self.conn.commit()
-
-
-    def add_postcode(self, word_token, postcode):
-        with self.conn.cursor() as cur:
-            cur.execute("""INSERT INTO word (word_token, word, class, type)
-                              VALUES (%s, %s, 'place', 'postcode')
-                        """, (word_token, postcode))
-        self.conn.commit()
-
-
-    def count(self):
-        return execute_scalar(self.conn, "SELECT count(*) FROM word")
-
-
-    def count_special(self):
-        return execute_scalar(self.conn, "SELECT count(*) FROM word WHERE class != 'place'")
-
-
-    def get_special(self):
-        with self.conn.cursor() as cur:
-            cur.execute("""SELECT word_token, word, class as cls, type, operator
-                           FROM word WHERE class != 'place'""")
-            result = set((tuple(row) for row in cur))
-            assert len(result) == cur.rowcount, "Word table has duplicates."
-            return result
-
-
-    def get_country(self):
-        with self.conn.cursor() as cur:
-            cur.execute("""SELECT country_code, word_token
-                           FROM word WHERE country_code is not null""")
-            result = set((tuple(row) for row in cur))
-            assert len(result) == cur.rowcount, "Word table has duplicates."
-            return result
-
-
-    def get_postcodes(self):
-        with self.conn.cursor() as cur:
-            cur.execute("""SELECT word FROM word
-                           WHERE class = 'place' and type = 'postcode'""")
-            return set((row[0] for row in cur))
-
-    def get_partial_words(self):
-        with self.conn.cursor() as cur:
-            cur.execute("""SELECT word_token, search_name_count FROM word
-                           WHERE class is null and country_code is null
-                                 and not word_token like ' %'""")
-            return set((tuple(row) for row in cur))
-
--- a/test/python/tokenizer/test_legacy.py
+++ b/test/python/tokenizer/test_legacy.py
@@ -1,591 +0,0 @@
-# SPDX-License-Identifier: GPL-3.0-or-later
-#
-# This file is part of Nominatim. (https://nominatim.org)
-#
-# Copyright (C) 2024 by the Nominatim developer community.
-# For a full list of authors see the git log.
-"""
-Test for legacy tokenizer.
-"""
-import shutil
-import re
-
-import pytest
-
-from nominatim_db.data.place_info import PlaceInfo
-from nominatim_db.tokenizer import legacy_tokenizer
-from nominatim_db.db import properties
-from nominatim_db.errors import UsageError
-
-from mock_legacy_word_table import MockLegacyWordTable
-
-# Force use of legacy word table
-@pytest.fixture
-def word_table(temp_db_conn):
-    return MockLegacyWordTable(temp_db_conn)
-
-
-@pytest.fixture
-def test_config(project_env, tmp_path):
-    module_dir = tmp_path / 'module_src'
-    module_dir.mkdir()
-    (module_dir / 'nominatim.so').write_text('TEST nominatim.so')
-
-    project_env.lib_dir.module = module_dir
-
-    sqldir = tmp_path / 'sql'
-    sqldir.mkdir()
-    (sqldir / 'tokenizer').mkdir()
-
-    # Get the original SQL but replace make_standard_name to avoid module use.
-    init_sql = (project_env.lib_dir.sql / 'tokenizer' / 'legacy_tokenizer.sql').read_text()
-    for fn in ('transliteration', 'gettokenstring'):
-        init_sql = re.sub(f'CREATE OR REPLACE FUNCTION {fn}[^;]*;',
-                          '', init_sql, re.DOTALL)
-    init_sql += """
-                   CREATE OR REPLACE FUNCTION make_standard_name(name TEXT)
-                   RETURNS TEXT AS $$ SELECT lower(name); $$ LANGUAGE SQL;
-
-                """
-    # Also load util functions. Some are needed by the tokenizer.
-    init_sql += (project_env.lib_dir.sql / 'functions' / 'utils.sql').read_text()
-    (sqldir / 'tokenizer' / 'legacy_tokenizer.sql').write_text(init_sql)
-
-    (sqldir / 'words.sql').write_text("SELECT 'a'")
-
-    shutil.copy(str(project_env.lib_dir.sql / 'tokenizer' / 'legacy_tokenizer_tables.sql'),
-                str(sqldir / 'tokenizer' / 'legacy_tokenizer_tables.sql'))
-
-    project_env.lib_dir.sql = sqldir
-    project_env.lib_dir.data = sqldir
-
-    return project_env
-
-
-@pytest.fixture
-def tokenizer_factory(dsn, tmp_path, property_table):
-    (tmp_path / 'tokenizer').mkdir()
-
-    def _maker():
-        return legacy_tokenizer.create(dsn, tmp_path / 'tokenizer')
-
-    return _maker
-
-
-@pytest.fixture
-def tokenizer_setup(tokenizer_factory, test_config, monkeypatch, sql_preprocessor):
-    monkeypatch.setattr(legacy_tokenizer, '_check_module', lambda m, c: None)
-    tok = tokenizer_factory()
-    tok.init_new_db(test_config)
-
-
-@pytest.fixture
-def analyzer(tokenizer_factory, test_config, monkeypatch, sql_preprocessor,
-             word_table, temp_db_with_extensions, tmp_path):
-    monkeypatch.setattr(legacy_tokenizer, '_check_module', lambda m, c: None)
-    monkeypatch.setenv('NOMINATIM_TERM_NORMALIZATION', ':: lower();')
-    tok = tokenizer_factory()
-    tok.init_new_db(test_config)
-    monkeypatch.undo()
-
-    with tok.name_analyzer() as analyzer:
-        yield analyzer
-
-
-@pytest.fixture
-def make_standard_name(temp_db_cursor):
-    temp_db_cursor.execute("""CREATE OR REPLACE FUNCTION make_standard_name(name TEXT)
-                              RETURNS TEXT AS $$ SELECT '#' || lower(name) || '#'; $$ LANGUAGE SQL""")
-
-
-@pytest.fixture
-def create_postcode_id(temp_db_cursor):
-    temp_db_cursor.execute("""CREATE OR REPLACE FUNCTION create_postcode_id(postcode TEXT)
-                              RETURNS BOOLEAN AS $$
-                              INSERT INTO word (word_token, word, class, type)
-                                VALUES (' ' || postcode, postcode, 'place', 'postcode')
-                              RETURNING True;
-                              $$ LANGUAGE SQL""")
-
-
-def test_init_new(tokenizer_factory, test_config, monkeypatch,
-                  temp_db_conn, sql_preprocessor):
-    monkeypatch.setenv('NOMINATIM_TERM_NORMALIZATION', 'xxvv')
-    monkeypatch.setattr(legacy_tokenizer, '_check_module', lambda m, c: None)
-
-    tok = tokenizer_factory()
-    tok.init_new_db(test_config)
-
-    assert properties.get_property(temp_db_conn, legacy_tokenizer.DBCFG_NORMALIZATION) == 'xxvv'
-
-    outfile = test_config.project_dir / 'module' / 'nominatim.so'
-
-    assert outfile.exists()
-    assert outfile.read_text() == 'TEST nominatim.so'
-    assert outfile.stat().st_mode == 33261
-
-
-def test_init_module_load_failed(tokenizer_factory, test_config):
-    tok = tokenizer_factory()
-
-    with pytest.raises(UsageError):
-        tok.init_new_db(test_config)
-
-
-def test_init_module_custom(tokenizer_factory, test_config,
-                            monkeypatch, tmp_path, sql_preprocessor):
-    module_dir = (tmp_path / 'custom').resolve()
-    module_dir.mkdir()
-    (module_dir/ 'nominatim.so').write_text('CUSTOM nomiantim.so')
-
-    monkeypatch.setenv('NOMINATIM_DATABASE_MODULE_PATH', str(module_dir))
-    monkeypatch.setattr(legacy_tokenizer, '_check_module', lambda m, c: None)
-
-    tok = tokenizer_factory()
-    tok.init_new_db(test_config)
-
-    assert not (test_config.project_dir / 'module').exists()
-
-
-def test_init_from_project(tokenizer_setup, tokenizer_factory, test_config):
-    tok = tokenizer_factory()
-
-    tok.init_from_project(test_config)
-
-    assert tok.normalization is not None
-
-
-def test_update_sql_functions(sql_preprocessor, temp_db_conn,
-                              tokenizer_factory, test_config, table_factory,
-                              monkeypatch, temp_db_cursor):
-    monkeypatch.setenv('NOMINATIM_MAX_WORD_FREQUENCY', '1133')
-    monkeypatch.setattr(legacy_tokenizer, '_check_module', lambda m, c: None)
-    tok = tokenizer_factory()
-    tok.init_new_db(test_config)
-    monkeypatch.undo()
-
-    assert properties.get_property(temp_db_conn, legacy_tokenizer.DBCFG_MAXWORDFREQ) == '1133'
-
-    table_factory('test', 'txt TEXT')
-
-    func_file = test_config.lib_dir.sql / 'tokenizer' / 'legacy_tokenizer.sql'
-    func_file.write_text("""INSERT INTO test VALUES ('{{max_word_freq}}'),
-                                                   ('{{modulepath}}')""")
-
-    tok.update_sql_functions(test_config)
-
-    test_content = temp_db_cursor.row_set('SELECT * FROM test')
-    assert test_content == set((('1133', ), (str(test_config.project_dir / 'module'), )))
-
-
-def test_finalize_import(tokenizer_factory, temp_db_conn,
-                         temp_db_cursor, test_config, monkeypatch,
-                         sql_preprocessor_cfg):
-    monkeypatch.setattr(legacy_tokenizer, '_check_module', lambda m, c: None)
-
-    func_file = test_config.lib_dir.sql / 'tokenizer' / 'legacy_tokenizer_indices.sql'
-    func_file.write_text("""CREATE FUNCTION test() RETURNS TEXT
-                            AS $$ SELECT 'b'::text $$ LANGUAGE SQL""")
-
-    tok = tokenizer_factory()
-    tok.init_new_db(test_config)
-
-    tok.finalize_import(test_config)
-
-    temp_db_cursor.scalar('SELECT test()') == 'b'
-
-
-def test_migrate_database(tokenizer_factory, test_config, temp_db_conn, monkeypatch):
-    monkeypatch.setattr(legacy_tokenizer, '_check_module', lambda m, c: None)
-    tok = tokenizer_factory()
-    tok.migrate_database(test_config)
-
-    assert properties.get_property(temp_db_conn, legacy_tokenizer.DBCFG_MAXWORDFREQ) is not None
-    assert properties.get_property(temp_db_conn, legacy_tokenizer.DBCFG_NORMALIZATION) is not None
-
-    outfile = test_config.project_dir / 'module' / 'nominatim.so'
-
-    assert outfile.exists()
-    assert outfile.read_text() == 'TEST nominatim.so'
-    assert outfile.stat().st_mode == 33261
-
-
-def test_check_database(test_config, tokenizer_factory, monkeypatch,
-                        temp_db_cursor, sql_preprocessor_cfg):
-    monkeypatch.setattr(legacy_tokenizer, '_check_module', lambda m, c: None)
-    tok = tokenizer_factory()
-    tok.init_new_db(test_config)
-
-    assert tok.check_database(False) is None
-
-
-def test_check_database_no_tokenizer(test_config, tokenizer_factory):
-    tok = tokenizer_factory()
-
-    assert tok.check_database(False) is not None
-
-
-def test_check_database_bad_setup(test_config, tokenizer_factory, monkeypatch,
-                                  temp_db_cursor, sql_preprocessor_cfg):
-    monkeypatch.setattr(legacy_tokenizer, '_check_module', lambda m, c: None)
-    tok = tokenizer_factory()
-    tok.init_new_db(test_config)
-
-    # Inject a bad transliteration.
-    temp_db_cursor.execute("""CREATE OR REPLACE FUNCTION make_standard_name(name TEXT)
-                              RETURNS TEXT AS $$ SELECT 'garbage'::text; $$ LANGUAGE SQL""")
-
-    assert tok.check_database(False) is not None
-
-
-def test_update_statistics_reverse_only(word_table, tokenizer_factory, test_config):
-    tok = tokenizer_factory()
-    tok.update_statistics(test_config)
-
-
-def test_update_statistics(word_table, table_factory, temp_db_cursor, tokenizer_factory, test_config):
-    word_table.add_full_word(1000, 'hello')
-    table_factory('search_name',
-                  'place_id BIGINT, name_vector INT[]',
-                  [(12, [1000])])
-    tok = tokenizer_factory()
-
-    tok.update_statistics(test_config)
-
-    assert temp_db_cursor.scalar("""SELECT count(*) FROM word
-                                    WHERE word_token like ' %' and
-                                          search_name_count > 0""") > 0
-
-
-def test_update_word_tokens(tokenizer_factory):
-    tok = tokenizer_factory()
-
-    # This is a noop and should just pass.
-    tok.update_word_tokens()
-
-
-def test_normalize(analyzer):
-    assert analyzer.normalize('TEsT') == 'test'
-
-
-def test_update_postcodes_from_db_empty(analyzer, table_factory, word_table,
-                                        create_postcode_id):
-    table_factory('location_postcode', 'postcode TEXT',
-                  content=(('1234',), ('12 34',), ('AB23',), ('1234',)))
-
-    analyzer.update_postcodes_from_db()
-
-    assert word_table.get_postcodes() == {'1234', '12 34', 'AB23'}
-
-
-def test_update_postcodes_from_db_add_and_remove(analyzer, table_factory, word_table,
-                                                 create_postcode_id):
-    table_factory('location_postcode', 'postcode TEXT',
-                  content=(('1234',), ('45BC', ), ('XX45', )))
-    word_table.add_postcode(' 1234', '1234')
-    word_table.add_postcode(' 5678', '5678')
-
-    analyzer.update_postcodes_from_db()
-
-    assert word_table.get_postcodes() == {'1234', '45BC', 'XX45'}
-
-
-def test_update_special_phrase_empty_table(analyzer, word_table, make_standard_name):
-    analyzer.update_special_phrases([
-        ("König bei", "amenity", "royal", "near"),
-        ("Könige", "amenity", "royal", "-"),
-        ("könige", "amenity", "royal", "-"),
-        ("strasse", "highway", "primary", "in")
-    ], True)
-
-    assert word_table.get_special() \
-               == set(((' #könig bei#', 'könig bei', 'amenity', 'royal', 'near'),
-                       (' #könige#', 'könige', 'amenity', 'royal', None),
-                       (' #strasse#', 'strasse', 'highway', 'primary', 'in')))
-
-
-def test_update_special_phrase_delete_all(analyzer, word_table, make_standard_name):
-    word_table.add_special(' #foo#', 'foo', 'amenity', 'prison', 'in')
-    word_table.add_special(' #bar#', 'bar', 'highway', 'road', None)
-
-    assert word_table.count_special() == 2
-
-    analyzer.update_special_phrases([], True)
-
-    assert word_table.count_special() == 0
-
-
-def test_update_special_phrases_no_replace(analyzer, word_table, make_standard_name):
-    word_table.add_special(' #foo#', 'foo', 'amenity', 'prison', 'in')
-    word_table.add_special(' #bar#', 'bar', 'highway', 'road', None)
-
-    assert word_table.count_special() == 2
-
-    analyzer.update_special_phrases([], False)
-
-    assert word_table.count_special() == 2
-
-
-def test_update_special_phrase_modify(analyzer, word_table, make_standard_name):
-    word_table.add_special(' #foo#', 'foo', 'amenity', 'prison', 'in')
-    word_table.add_special(' #bar#', 'bar', 'highway', 'road', None)
-
-    assert word_table.count_special() == 2
-
-    analyzer.update_special_phrases([
-        ('prison', 'amenity', 'prison', 'in'),
-        ('bar', 'highway', 'road', '-'),
-        ('garden', 'leisure', 'garden', 'near')
-    ], True)
-
-    assert word_table.get_special() \
-               == set(((' #prison#', 'prison', 'amenity', 'prison', 'in'),
-                       (' #bar#', 'bar', 'highway', 'road', None),
-                       (' #garden#', 'garden', 'leisure', 'garden', 'near')))
-
-
-def test_add_country_names(analyzer, word_table, make_standard_name):
-    analyzer.add_country_names('de', {'name': 'Germany',
-                                      'name:de': 'Deutschland',
-                                      'short_name': 'germany'})
-
-    assert word_table.get_country() \
-               == {('de', ' #germany#'),
-                   ('de', ' #deutschland#')}
-
-
-def test_add_more_country_names(analyzer, word_table, make_standard_name):
-    word_table.add_country('fr', ' #france#')
-    word_table.add_country('it', ' #italy#')
-    word_table.add_country('it', ' #itala#')
-
-    analyzer.add_country_names('it', {'name': 'Italy', 'ref': 'IT'})
-
-    assert word_table.get_country() \
-               == {('fr', ' #france#'),
-                   ('it', ' #italy#'),
-                   ('it', ' #itala#'),
-                   ('it', ' #it#')}
-
-
-@pytest.mark.parametrize('pcode', ['12345', 'AB 123', '34-345'])
-def test_process_place_postcode(analyzer, create_postcode_id, word_table, pcode):
-    analyzer.process_place(PlaceInfo({'address': {'postcode' : pcode}}))
-
-    assert word_table.get_postcodes() == {pcode, }
-
-
-@pytest.mark.parametrize('pcode', ['12:23', 'ab;cd;f', '123;836'])
-def test_process_place_bad_postcode(analyzer, create_postcode_id, word_table, pcode):
-    analyzer.process_place(PlaceInfo({'address': {'postcode' : pcode}}))
-
-    assert not word_table.get_postcodes()
-
-
-class TestHousenumberName:
-
-    @staticmethod
-    @pytest.fixture(autouse=True)
-    def setup_create_housenumbers(temp_db_cursor):
-        temp_db_cursor.execute("""CREATE OR REPLACE FUNCTION create_housenumbers(
-                                      housenumbers TEXT[],
-                                      OUT tokens TEXT, OUT normtext TEXT)
-                                  AS $$
-                                  SELECT housenumbers::TEXT, array_to_string(housenumbers, ';')
-                                  $$ LANGUAGE SQL""")
-
-
-    @staticmethod
-    @pytest.mark.parametrize('hnr', ['123a', '1', '101'])
-    def test_process_place_housenumbers_simple(analyzer, hnr):
-        info = analyzer.process_place(PlaceInfo({'address': {'housenumber' : hnr}}))
-
-        assert info['hnr'] == hnr
-        assert info['hnr_tokens'].startswith("{")
-
-
-    @staticmethod
-    def test_process_place_housenumbers_lists(analyzer):
-        info = analyzer.process_place(PlaceInfo({'address': {'conscriptionnumber' : '1; 2;3'}}))
-
-        assert set(info['hnr'].split(';')) == set(('1', '2', '3'))
-
-
-    @staticmethod
-    def test_process_place_housenumbers_duplicates(analyzer):
-        info = analyzer.process_place(PlaceInfo({'address': {'housenumber' : '134',
-                                                   'conscriptionnumber' : '134',
-                                                   'streetnumber' : '99a'}}))
-
-        assert set(info['hnr'].split(';')) == set(('134', '99a'))
-
-
-class TestPlaceNames:
-
-    @pytest.fixture(autouse=True)
-    def setup(self, analyzer):
-        self.analyzer = analyzer
-
-
-    def expect_name_terms(self, info, *expected_terms):
-        tokens = self.analyzer.get_word_token_info(list(expected_terms))
-        for token in tokens:
-            assert token[2] is not None, "No token for {0}".format(token)
-
-        assert eval(info['names']) == set((t[2] for t in tokens)),\
-               f"Expected: {tokens}\nGot: {info['names']}"
-
-
-    def process_named_place(self, names):
-        return self.analyzer.process_place(PlaceInfo({'name': names}))
-
-
-    def test_simple_names(self):
-        info = self.process_named_place({'name': 'Soft bAr', 'ref': '34'})
-
-        self.expect_name_terms(info, '#Soft bAr', '#34', 'Soft', 'bAr', '34')
-
-
-    @pytest.mark.parametrize('sep', [',' , ';'])
-    def test_names_with_separator(self, sep):
-        info = self.process_named_place({'name': sep.join(('New York', 'Big Apple'))})
-
-        self.expect_name_terms(info, '#New York', '#Big Apple',
-                                     'new', 'york', 'big', 'apple')
-
-
-    def test_full_names_with_bracket(self):
-        info = self.process_named_place({'name': 'Houseboat (left)'})
-
-        self.expect_name_terms(info, '#Houseboat (left)', '#Houseboat',
-                                     'houseboat', '(left)')
-
-
-    def test_country_name(self, word_table):
-        place = PlaceInfo({'name' : {'name': 'Norge'},
-                           'country_code': 'no',
-                           'rank_address': 4,
-                           'class': 'boundary',
-                           'type': 'administrative'})
-
-        info = self.analyzer.process_place(place)
-
-        self.expect_name_terms(info, '#norge', 'norge')
-        assert word_table.get_country() == {('no', ' norge')}
-
-
-class TestPlaceAddress:
-
-    @pytest.fixture(autouse=True)
-    def setup(self, analyzer):
-        self.analyzer = analyzer
-
-
-    @pytest.fixture
-    def getorcreate_hnr_id(self, temp_db_cursor):
-        temp_db_cursor.execute("""CREATE SEQUENCE seq_hnr start 1;
-                                  CREATE OR REPLACE FUNCTION getorcreate_housenumber_id(lookup_word TEXT)
-                                  RETURNS INTEGER AS $$
-                                  SELECT -nextval('seq_hnr')::INTEGER; $$ LANGUAGE SQL""")
-
-    def process_address(self, **kwargs):
-        return self.analyzer.process_place(PlaceInfo({'address': kwargs}))
-
-
-    def name_token_set(self, *expected_terms):
-        tokens = self.analyzer.get_word_token_info(list(expected_terms))
-        for token in tokens:
-            assert token[2] is not None, "No token for {0}".format(token)
-
-        return set((t[2] for t in tokens))
-
-
-    @pytest.mark.parametrize('pcode', ['12345', 'AB 123', '34-345'])
-    def test_process_place_postcode(self, word_table, pcode):
-        self.process_address(postcode=pcode)
-
-        assert word_table.get_postcodes() == {pcode, }
-
-
-    @pytest.mark.parametrize('pcode', ['12:23', 'ab;cd;f', '123;836'])
-    def test_process_place_bad_postcode(self, word_table, pcode):
-        self.process_address(postcode=pcode)
-
-        assert not word_table.get_postcodes()
-
-
-    @pytest.mark.parametrize('hnr', ['123a', '0', '101'])
-    def test_process_place_housenumbers_simple(self, hnr, getorcreate_hnr_id):
-        info = self.process_address(housenumber=hnr)
-
-        assert info['hnr'] == hnr.lower()
-        assert info['hnr_tokens'] == "{-1}"
-
-
-    def test_process_place_housenumbers_lists(self, getorcreate_hnr_id):
-        info = self.process_address(conscriptionnumber='1; 2;3')
-
-        assert set(info['hnr'].split(';')) == set(('1', '2', '3'))
-        assert info['hnr_tokens'] == "{-1,-2,-3}"
-
-
-    def test_process_place_housenumbers_duplicates(self, getorcreate_hnr_id):
-        info = self.process_address(housenumber='134',
-                                    conscriptionnumber='134',
-                                    streetnumber='99A')
-
-        assert set(info['hnr'].split(';')) == set(('134', '99a'))
-        assert info['hnr_tokens'] == "{-1,-2}"
-
-
-    def test_process_place_street(self):
-        # legacy tokenizer only indexes known names
-        self.analyzer.process_place(PlaceInfo({'name': {'name' : 'Grand Road'}}))
-        info = self.process_address(street='Grand Road')
-
-        assert eval(info['street']) == self.name_token_set('#Grand Road')
-
-
-    def test_process_place_street_empty(self):
-        info = self.process_address(street='🜵')
-
-        assert info['street'] == '{}'
-
-
-    def test_process_place_place(self):
-        self.analyzer.process_place(PlaceInfo({'name': {'name' : 'Honu Lulu'}}))
-        info = self.process_address(place='Honu Lulu')
-
-        assert eval(info['place_search']) == self.name_token_set('#Honu Lulu',
-                                                                 'Honu', 'Lulu')
-        assert eval(info['place_match']) == self.name_token_set('#Honu Lulu')
-
-
-    def test_process_place_place_empty(self):
-        info = self.process_address(place='🜵')
-
-        assert 'place' not in info
-
-
-    def test_process_place_address_terms(self):
-        for name in ('Zwickau', 'Haupstraße', 'Sachsen'):
-            self.analyzer.process_place(PlaceInfo({'name': {'name' : name}}))
-        info = self.process_address(country='de', city='Zwickau', state='Sachsen',
-                                    suburb='Zwickau', street='Hauptstr',
-                                    full='right behind the church')
-
-        city = self.name_token_set('ZWICKAU')
-        state = self.name_token_set('SACHSEN')
-
-        print(info)
-        result = {k: eval(v[0]) for k,v in info['addr'].items()}
-
-        assert result == {'city': city, 'suburb': city, 'state': state}
-
-
-    def test_process_place_address_terms_empty(self):
-        info = self.process_address(country='de', city=' ', street='Hauptstr',
-                                    full='right behind the church')
-
-        assert 'addr' not in info
-
--- a/test/python/tools/test_migration.py
+++ b/test/python/tools/test_migration.py
@@ -14,8 +14,6 @@ from nominatim_db.errors import UsageError
 from nominatim_db.db.connection import server_version_tuple
 import nominatim_db.version

-from mock_legacy_word_table import MockLegacyWordTable
-
 class DummyTokenizer:

    def update_sql_functions(self, config):
@@ -28,10 +26,6 @@ def postprocess_mock(monkeypatch):
    monkeypatch.setattr(migration.tokenizer_factory, 'get_tokenizer_for_db',
                        lambda *args: DummyTokenizer())

-@pytest.fixture
-def legacy_word_table(temp_db_conn):
-    return MockLegacyWordTable(temp_db_conn)
-

 def test_no_migration_old_versions(temp_db_with_extensions, table_factory, def_config):
    table_factory('country_name', 'name HSTORE, country_code TEXT')
@@ -145,26 +139,6 @@ def test_add_nominatim_property_table_repeat(temp_db_conn, temp_db_cursor,
    assert temp_db_cursor.table_exists('nominatim_properties')


-def test_change_housenumber_transliteration(temp_db_conn, temp_db_cursor,
-                                            legacy_word_table, placex_table):
-    placex_table.add(housenumber='3A')
-
-    temp_db_cursor.execute("""CREATE OR REPLACE FUNCTION make_standard_name(name TEXT)
-                              RETURNS TEXT AS $$ SELECT lower(name) $$ LANGUAGE SQL """)
-    temp_db_cursor.execute("""CREATE OR REPLACE FUNCTION getorcreate_housenumber_id(lookup_word TEXT)
-                              RETURNS INTEGER AS $$ SELECT 4325 $$ LANGUAGE SQL """)
-
-    migration.change_housenumber_transliteration(temp_db_conn)
-    temp_db_conn.commit()
-
-    assert temp_db_cursor.scalar('SELECT housenumber from placex') == '3a'
-
-    migration.change_housenumber_transliteration(temp_db_conn)
-    temp_db_conn.commit()
-
-    assert temp_db_cursor.scalar('SELECT housenumber from placex') == '3a'
-
-
 def test_switch_placenode_geometry_index(temp_db_conn, temp_db_cursor, placex_table):
    temp_db_cursor.execute("""CREATE INDEX idx_placex_adminname
                              ON placex (place_id)""")