enable all API tests for sqlite and port missing features

2026-03-13 14:24:08 +00:00 · 2023-12-06 20:56:21 +01:00
parent 0d840c8d4e
commit 6d39563b87
15 changed files with 514 additions and 230 deletions
--- a/nominatim/api/core.py
+++ b/nominatim/api/core.py
@@ -19,6 +19,7 @@ import sqlalchemy.ext.asyncio as sa_asyncio
 from nominatim.errors import UsageError
 from nominatim.db.sqlalchemy_schema import SearchTables
 from nominatim.db.async_core_library import PGCORE_LIB, PGCORE_ERROR
+import nominatim.db.sqlite_functions
 from nominatim.config import Configuration
 from nominatim.api.connection import SearchConnection
 from nominatim.api.status import get_status, StatusResult
@@ -122,6 +123,7 @@ class NominatimAPIAsync: #pylint: disable=too-many-instance-attributes
                @sa.event.listens_for(engine.sync_engine, "connect")
                def _on_sqlite_connect(dbapi_con: Any, _: Any) -> None:
                    dbapi_con.run_async(lambda conn: conn.enable_load_extension(True))
+                    nominatim.db.sqlite_functions.install_custom_functions(dbapi_con)
                    cursor = dbapi_con.cursor()
                    cursor.execute("SELECT load_extension('mod_spatialite')")
                    cursor.execute('SELECT SetDecimalPrecision(7)')
--- a/nominatim/api/search/db_search_lookups.py
+++ b/nominatim/api/search/db_search_lookups.py
@@ -26,18 +26,38 @@ class LookupAll(LookupType):
    inherit_cache = True

    def __init__(self, table: SaFromClause, column: str, tokens: List[int]) -> None:
-        super().__init__(getattr(table.c, column),
+        super().__init__(table.c.place_id, getattr(table.c, column), column,
                         sa.type_coerce(tokens, IntArray))


@compiles(LookupAll) # type: ignore[no-untyped-call, misc]
 def _default_lookup_all(element: LookupAll,
                        compiler: 'sa.Compiled', **kw: Any) -> str:
-    col, tokens = list(element.clauses)
+    _, col, _, tokens = list(element.clauses)
    return "(%s @> %s)" % (compiler.process(col, **kw),
                           compiler.process(tokens, **kw))


+@compiles(LookupAll, 'sqlite') # type: ignore[no-untyped-call, misc]
+def _sqlite_lookup_all(element: LookupAll,
+                        compiler: 'sa.Compiled', **kw: Any) -> str:
+    """ Compile a LookupAll element for the SQLite dialect.
+
+        The generated expression first restricts the place ID to the
+        candidates computed from the table 'reverse_search_name': the
+        'places' lists of all rows matching one of the tokens and the
+        column name are fed, shortest list first, into the aggregate
+        'array_intersect_fuzzy'. Each candidate is then verified with
+        'array_contains' against the token column itself.
+
+        The token list is rendered twice in the resulting SQL.
+    """
+    # Clauses in the order they are handed to the constructor:
+    # place ID column, token column, name of the token column, tokens.
+    place, col, colname, tokens = list(element.clauses)
+    return "(%s IN (SELECT CAST(value as bigint) FROM"\
+           " (SELECT array_intersect_fuzzy(places) as p FROM"\
+           "   (SELECT places FROM reverse_search_name"\
+           "   WHERE word IN (SELECT value FROM json_each('[' || %s || ']'))"\
+           "     AND column = %s"\
+           "   ORDER BY length(places)) as x) as u,"\
+           " json_each('[' || u.p || ']'))"\
+           " AND array_contains(%s, %s))"\
+             % (compiler.process(place, **kw),
+                compiler.process(tokens, **kw),
+                compiler.process(colname, **kw),
+                compiler.process(col, **kw),
+                compiler.process(tokens, **kw)
+                )
+
+

 class LookupAny(LookupType):
    """ Find all entries that contain at least one of the given tokens.
@@ -46,17 +66,28 @@ class LookupAny(LookupType):
    inherit_cache = True

    def __init__(self, table: SaFromClause, column: str, tokens: List[int]) -> None:
-        super().__init__(getattr(table.c, column),
+        super().__init__(table.c.place_id, getattr(table.c, column), column,
                         sa.type_coerce(tokens, IntArray))

-
@compiles(LookupAny) # type: ignore[no-untyped-call, misc]
 def _default_lookup_any(element: LookupAny,
                        compiler: 'sa.Compiled', **kw: Any) -> str:
-    col, tokens = list(element.clauses)
+    _, col, _, tokens = list(element.clauses)
    return "(%s && %s)" % (compiler.process(col, **kw),
                           compiler.process(tokens, **kw))

+@compiles(LookupAny, 'sqlite') # type: ignore[no-untyped-call, misc]
+def _sqlite_lookup_any(element: LookupAny,
+                        compiler: 'sa.Compiled', **kw: Any) -> str:
+    """ Compile a LookupAny element for the SQLite dialect.
+
+        The place ID is matched against the union of the 'places' lists
+        of all rows in 'reverse_search_name' that belong to one of the
+        tokens and to the given column name. The union is computed with
+        the aggregate 'array_union'. The token column itself is not
+        used in the generated SQL.
+    """
+    # Clauses: place ID column, token column (unused), column name, tokens.
+    place, _, colname, tokens = list(element.clauses)
+    return "%s IN (SELECT CAST(value as bigint) FROM"\
+           " (SELECT array_union(places) as p FROM reverse_search_name"\
+           "   WHERE word IN (SELECT value FROM json_each('[' || %s || ']'))"\
+           "     AND column = %s) as u,"\
+           " json_each('[' || u.p || ']'))" % (compiler.process(place, **kw),
+                                               compiler.process(tokens, **kw),
+                                               compiler.process(colname, **kw))
+


 class Restrict(LookupType):
@@ -76,3 +107,8 @@ def _default_restrict(element: Restrict,
    arg1, arg2 = list(element.clauses)
    return "(coalesce(null, %s) @> %s)" % (compiler.process(arg1, **kw),
                                           compiler.process(arg2, **kw))
+
+@compiles(Restrict, 'sqlite') # type: ignore[no-untyped-call, misc]
+def _sqlite_restrict(element: Restrict,
+                        compiler: 'sa.Compiled', **kw: Any) -> str:
+    """ Compile a Restrict element for the SQLite dialect as a call to
+        'array_contains' with the element's clauses as arguments.
+    """
+    return "array_contains(%s)" % compiler.process(element.clauses, **kw)
--- a/nominatim/api/search/db_searches.py
+++ b/nominatim/api/search/db_searches.py
@@ -11,7 +11,6 @@ from typing import List, Tuple, AsyncIterator, Dict, Any, Callable
 import abc

 import sqlalchemy as sa
-from sqlalchemy.dialects.postgresql import array_agg

 from nominatim.typing import SaFromClause, SaScalarSelect, SaColumn, \
                             SaExpression, SaSelect, SaLambdaSelect, SaRow, SaBind
@@ -19,7 +18,7 @@ from nominatim.api.connection import SearchConnection
 from nominatim.api.types import SearchDetails, DataLayer, GeometryFormat, Bbox
 import nominatim.api.results as nres
 from nominatim.api.search.db_search_fields import SearchData, WeightedCategories
-from nominatim.db.sqlalchemy_types import Geometry
+from nominatim.db.sqlalchemy_types import Geometry, IntArray

 #pylint: disable=singleton-comparison,not-callable
 #pylint: disable=too-many-branches,too-many-arguments,too-many-locals,too-many-statements
@@ -110,7 +109,7 @@ def _add_geometry_columns(sql: SaLambdaSelect, col: SaColumn, details: SearchDet

 def _make_interpolation_subquery(table: SaFromClause, inner: SaFromClause,
                                 numerals: List[int], details: SearchDetails) -> SaScalarSelect:
-    all_ids = array_agg(table.c.place_id) # type: ignore[no-untyped-call]
+    all_ids = sa.func.ArrayAgg(table.c.place_id)
    sql = sa.select(all_ids).where(table.c.parent_place_id == inner.c.place_id)

    if len(numerals) == 1:
@@ -134,9 +133,7 @@ def _filter_by_layer(table: SaFromClause, layers: DataLayer) -> SaColumn:
        orexpr.append(no_index(table.c.rank_address).between(1, 30))
    elif layers & DataLayer.ADDRESS:
        orexpr.append(no_index(table.c.rank_address).between(1, 29))
-        orexpr.append(sa.and_(no_index(table.c.rank_address) == 30,
-                              sa.or_(table.c.housenumber != None,
-                                     table.c.address.has_key('addr:housename'))))
+        orexpr.append(sa.func.IsAddressPoint(table))
    elif layers & DataLayer.POI:
        orexpr.append(sa.and_(no_index(table.c.rank_address) == 30,
                              table.c.class_.not_in(('place', 'building'))))
@@ -188,12 +185,21 @@ async def _get_placex_housenumbers(conn: SearchConnection,
        yield result


+def _int_list_to_subquery(inp: List[int]) -> 'sa.Subquery':
+    """ Create a subselect that returns the given list of integers
+        as rows in the column 'nr'.
+    """
+    # The list is handed over as one JSON value and expanded into rows
+    # by the table-valued function JsonArrayEach.
+    vtab = sa.func.JsonArrayEach(sa.type_coerce(inp, sa.JSON))\
+               .table_valued(sa.column('value', type_=sa.JSON)) # type: ignore[no-untyped-call]
+    # Each JSON element is cast to text first and then to an integer.
+    return sa.select(sa.cast(sa.cast(vtab.c.value, sa.Text), sa.Integer).label('nr')).subquery()
+
+
 async def _get_osmline(conn: SearchConnection, place_ids: List[int],
                       numerals: List[int],
                       details: SearchDetails) -> AsyncIterator[nres.SearchResult]:
    t = conn.t.osmline
-    values = sa.values(sa.Column('nr', sa.Integer()), name='housenumber')\
-               .data([(n,) for n in numerals])
+
+    values = _int_list_to_subquery(numerals)
    sql = sa.select(t.c.place_id, t.c.osm_id,
                    t.c.parent_place_id, t.c.address,
                    values.c.nr.label('housenumber'),
@@ -216,8 +222,7 @@ async def _get_tiger(conn: SearchConnection, place_ids: List[int],
                     numerals: List[int], osm_id: int,
                     details: SearchDetails) -> AsyncIterator[nres.SearchResult]:
    t = conn.t.tiger
-    values = sa.values(sa.Column('nr', sa.Integer()), name='housenumber')\
-               .data([(n,) for n in numerals])
+    values = _int_list_to_subquery(numerals)
    sql = sa.select(t.c.place_id, t.c.parent_place_id,
                    sa.literal('W').label('osm_type'),
                    sa.literal(osm_id).label('osm_id'),
@@ -573,7 +578,8 @@ class PostcodeSearch(AbstractSearch):
            tsearch = conn.t.search_name
            sql = sql.where(tsearch.c.place_id == t.c.parent_place_id)\
                     .where((tsearch.c.name_vector + tsearch.c.nameaddress_vector)
-                                     .contains(self.lookups[0].tokens))
+                                     .contains(sa.type_coerce(self.lookups[0].tokens,
+                                                              IntArray)))

        for ranking in self.rankings:
            penalty += ranking.sql_penalty(conn.t.search_name)
@@ -692,10 +698,10 @@ class PlaceSearch(AbstractSearch):
            sql = sql.order_by(sa.text('accuracy'))

        if self.housenumbers:
-            hnr_regexp = f"\\m({'|'.join(self.housenumbers.values)})\\M"
+            hnr_list = '|'.join(self.housenumbers.values)
            sql = sql.where(tsearch.c.address_rank.between(16, 30))\
                     .where(sa.or_(tsearch.c.address_rank < 30,
-                                   t.c.housenumber.op('~*')(hnr_regexp)))
+                                   sa.func.RegexpWord(hnr_list, t.c.housenumber)))

            # Cross check for housenumbers, need to do that on a rather large
            # set. Worst case there are 40.000 main streets in OSM.
@@ -703,10 +709,10 @@ class PlaceSearch(AbstractSearch):

            # Housenumbers from placex
            thnr = conn.t.placex.alias('hnr')
-            pid_list = array_agg(thnr.c.place_id) # type: ignore[no-untyped-call]
+            pid_list = sa.func.ArrayAgg(thnr.c.place_id)
            place_sql = sa.select(pid_list)\
                          .where(thnr.c.parent_place_id == inner.c.place_id)\
-                          .where(thnr.c.housenumber.op('~*')(hnr_regexp))\
+                          .where(sa.func.RegexpWord(hnr_list, thnr.c.housenumber))\
                          .where(thnr.c.linked_place_id == None)\
                          .where(thnr.c.indexed_status == 0)

--- a/nominatim/db/sqlalchemy_functions.py
+++ b/nominatim/db/sqlalchemy_functions.py
@@ -188,6 +188,7 @@ def sqlite_json_array_each(element: JsonArrayEach, compiler: 'sa.Compiled', **kw
    return "json_each(%s)" % compiler.process(element.clauses, **kw)


+
 class Greatest(sa.sql.functions.GenericFunction[Any]):
    """ Function to compute maximum of all its input parameters.
    """
@@ -198,3 +199,23 @@ class Greatest(sa.sql.functions.GenericFunction[Any]):
@compiles(Greatest, 'sqlite') # type: ignore[no-untyped-call, misc]
 def sqlite_greatest(element: Greatest, compiler: 'sa.Compiled', **kw: Any) -> str:
    return "max(%s)" % compiler.process(element.clauses, **kw)
+
+
+
+class RegexpWord(sa.sql.functions.GenericFunction[Any]):
+    """ Check if a full word is in a given string.
+
+        The first argument is the body of a regular expression (the
+        callers pass alternatives joined with '|'), the second argument
+        is the string to search in.
+    """
+    name = 'RegexpWord'
+    inherit_cache = True
+
+
+@compiles(RegexpWord, 'postgresql') # type: ignore[no-untyped-call, misc]
+def postgres_regexp_nocase(element: RegexpWord, compiler: 'sa.Compiled', **kw: Any) -> str:
+    # Renders: <string> ~* ('\m(' || <words> || ')\M')::text
+    # i.e. the case-insensitive match operator with the word list
+    # wrapped between the markers \m and \M.
+    arg1, arg2 = list(element.clauses)
+    return "%s ~* ('\\m(' || %s  || ')\\M')::text" % (compiler.process(arg2, **kw), compiler.process(arg1, **kw))
+
+
+@compiles(RegexpWord, 'sqlite') # type: ignore[no-untyped-call, misc]
+def sqlite_regexp_nocase(element: RegexpWord, compiler: 'sa.Compiled', **kw: Any) -> str:
+    # Renders: regexp('\b(' || <words> || ')\b', <string>)
+    # NOTE(review): whether this match is case-insensitive depends on the
+    # regexp() implementation available on the connection - confirm.
+    arg1, arg2 = list(element.clauses)
+    return "regexp('\\b(' || %s  || ')\\b', %s)" % (compiler.process(arg1, **kw), compiler.process(arg2, **kw))
--- a/nominatim/db/sqlalchemy_types/int_array.py
+++ b/nominatim/db/sqlalchemy_types/int_array.py
@@ -57,22 +57,16 @@ class IntArray(sa.types.TypeDecorator[Any]):
            """ Concate the array with the given array. If one of the
                operants is null, the value of the other will be returned.
            """
-            return sa.func.array_cat(self, other, type_=IntArray)
+            return ArrayCat(self.expr, other)


        def contains(self, other: SaColumn, **kwargs: Any) -> 'sa.ColumnOperators':
            """ Return true if the array contains all the value of the argument
                array.
            """
-            return cast('sa.ColumnOperators', self.op('@>', is_comparison=True)(other))
+            return ArrayContains(self.expr, other)


-        def overlaps(self, other: SaColumn) -> 'sa.Operators':
-            """ Return true if at least one value of the argument is contained
-                in the array.
-            """
-            return self.op('&&', is_comparison=True)(other)
-

 class ArrayAgg(sa.sql.functions.GenericFunction[Any]):
    """ Aggregate function to collect elements in an array.
@@ -82,6 +76,48 @@ class ArrayAgg(sa.sql.functions.GenericFunction[Any]):
    name = 'array_agg'
    inherit_cache = True

+
@compiles(ArrayAgg, 'sqlite') # type: ignore[no-untyped-call, misc]
 def sqlite_array_agg(element: ArrayAgg, compiler: 'sa.Compiled', **kw: Any) -> str:
    return "group_concat(%s, ',')" % compiler.process(element.clauses, **kw)
+
+
+
+class ArrayContains(sa.sql.expression.FunctionElement[Any]):
+    """ Function to check if an array is fully contained in another.
+
+        The first argument is the containing array, the second the
+        array whose elements must all be present.
+    """
+    name = 'ArrayContains'
+    inherit_cache = True
+
+
+@compiles(ArrayContains) # type: ignore[no-untyped-call, misc]
+def generic_array_contains(element: ArrayContains, compiler: 'sa.Compiled', **kw: Any) -> str:
+    # Default rendering uses the array containment operator '@>'.
+    arg1, arg2 = list(element.clauses)
+    return "(%s @> %s)" % (compiler.process(arg1, **kw),
+                           compiler.process(arg2, **kw))
+
+
+@compiles(ArrayContains, 'sqlite') # type: ignore[no-untyped-call, misc]
+def sqlite_array_contains(element: ArrayContains, compiler: 'sa.Compiled', **kw: Any) -> str:
+    # SQLite rendering calls the function 'array_contains' with the
+    # clauses as arguments.
+    return "array_contains(%s)" % compiler.process(element.clauses, **kw)
+
+
+
+class ArrayCat(sa.sql.expression.FunctionElement[Any]):
+    """ Function to concatenate two arrays. The result is an IntArray.
+    """
+    type = IntArray()
+    identifier = 'ArrayCat'
+    inherit_cache = True
+
+
+@compiles(ArrayCat) # type: ignore[no-untyped-call, misc]
+def generic_array_cat(element: ArrayCat, compiler: 'sa.Compiled', **kw: Any) -> str:
+    # Default rendering calls the SQL function 'array_cat'.
+    return "array_cat(%s)" % compiler.process(element.clauses, **kw)
+
+
+@compiles(ArrayCat, 'sqlite') # type: ignore[no-untyped-call, misc]
+def sqlite_array_cat(element: ArrayCat, compiler: 'sa.Compiled', **kw: Any) -> str:
+    # SQLite rendering joins the two comma-separated lists with a comma.
+    # NOTE(review): the '||' operator yields NULL when either operand is
+    # NULL, so a NULL operand does not return the other array here -
+    # confirm that callers never pass NULL columns.
+    arg1, arg2 = list(element.clauses)
+    return "(%s || ',' || %s)" % (compiler.process(arg1, **kw), compiler.process(arg2, **kw))
+
--- a/nominatim/db/sqlalchemy_types/key_value.py
+++ b/nominatim/db/sqlalchemy_types/key_value.py
@@ -10,6 +10,7 @@ A custom type that implements a simple key-value store of strings.
 from typing import Any

 import sqlalchemy as sa
+from sqlalchemy.ext.compiler import compiles
 from sqlalchemy.dialects.postgresql import HSTORE
 from sqlalchemy.dialects.sqlite import JSON as sqlite_json

@@ -37,11 +38,25 @@ class KeyValueStore(sa.types.TypeDecorator[Any]):
                one, overwriting values where necessary. When the argument
                is null, nothing happens.
            """
-            return self.op('||')(sa.func.coalesce(other,
-                                                  sa.type_coerce('', KeyValueStore)))
+            return KeyValueConcat(self.expr, other)
+
+
+class KeyValueConcat(sa.sql.expression.FunctionElement[Any]):
+    """ Return the merged key-value store from the input parameters.
+
+        A null second argument is replaced by an empty store in both
+        compiled variants.
+    """
+    type = KeyValueStore()
+    name = 'JsonConcat'
+    inherit_cache = True
+
+@compiles(KeyValueConcat) # type: ignore[no-untyped-call, misc]
+def default_json_concat(element: KeyValueConcat, compiler: 'sa.Compiled', **kw: Any) -> str:
+    # Default rendering: '||' on hstore values, with an empty hstore
+    # standing in for a null second argument.
+    arg1, arg2 = list(element.clauses)
+    return "(%s || coalesce(%s, ''::hstore))" % (compiler.process(arg1, **kw), compiler.process(arg2, **kw))
+
+@compiles(KeyValueConcat, 'sqlite') # type: ignore[no-untyped-call, misc]
+def sqlite_json_concat(element: KeyValueConcat, compiler: 'sa.Compiled', **kw: Any) -> str:
+    # SQLite rendering: json_patch(), with an empty JSON object standing
+    # in for a null second argument.
+    arg1, arg2 = list(element.clauses)
+    return "json_patch(%s, coalesce(%s, '{}'))" % (compiler.process(arg1, **kw), compiler.process(arg2, **kw))
+


-        def has_key(self, key: SaColumn) -> 'sa.Operators':
-            """ Return true if the key is cotained in the store.
-            """
-            return self.op('?', is_comparison=True)(key)
--- a/nominatim/db/sqlite_functions.py
+++ b/nominatim/db/sqlite_functions.py
@@ -0,0 +1,122 @@
+# SPDX-License-Identifier: GPL-3.0-or-later
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2023 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Custom functions for SQLite.
+"""
+from typing import cast, Optional, Set, Any
+import json
+
+# pylint: disable=protected-access
+
+def weigh_search(search_vector: Optional[str], rankings: str, default: float) -> float:
+    """ Custom weight function for search results.
+
+        'search_vector' is a comma-separated list of integers or None.
+        'rankings' is a JSON array of pairs [weight, [token, ...]].
+        Returns the weight of the first pair whose tokens are all
+        contained in the search vector. When no pair matches or the
+        search vector is None, 'default' is returned.
+    """
+    if search_vector is not None:
+        # A set keeps every membership test below at constant cost,
+        # independent of the length of the search vector.
+        svec = {int(x) for x in search_vector.split(',')}
+        for rank in json.loads(rankings):
+            if all(r in svec for r in rank[1]):
+                return cast(float, rank[0])
+
+    return default
+
+
+class ArrayIntersectFuzzy:
+    """ Compute the array of common elements of all input integer arrays.
+        Very large input parameters may be ignored to speed up
+        computation. Therefore, the result is a superset of common elements.
+
+        Input and output arrays are given as comma-separated lists.
+    """
+    def __init__(self) -> None:
+        # First non-null input array, kept in its string form.
+        self.first = ''
+        # Running intersection; only created once a second array is processed.
+        self.values: Optional[Set[int]] = None
+
+    def step(self, value: Optional[str]) -> None:
+        """ Add the next array to the intersection.
+
+            Null values are skipped. The first array is always kept,
+            later arrays are only taken into account when their string
+            form is shorter than 10000000 characters.
+        """
+        if value is not None:
+            if not self.first:
+                self.first = value
+            elif len(value) < 10000000:
+                if self.values is None:
+                    # Parse the first array lazily, only when it is needed.
+                    self.values = {int(x) for x in self.first.split(',')}
+                self.values.intersection_update((int(x) for x in value.split(',')))
+
+    def finalize(self) -> str:
+        """ Return the final result.
+
+            When no intersection was computed, the first array is
+            returned unchanged (an empty string if there was no input).
+        """
+        if self.values is not None:
+            return ','.join(map(str, self.values))
+
+        return self.first
+
+
+class ArrayUnion:
+    """ Aggregate collecting the distinct elements of all input arrays.
+
+        Arrays are passed in and returned as strings of comma-separated
+        lists. Null inputs are ignored.
+    """
+    def __init__(self) -> None:
+        self.values: Optional[Set[str]] = None
+
+    def step(self, value: Optional[str]) -> None:
+        """ Merge the elements of the next array into the result.
+        """
+        if value is None:
+            return
+
+        elements = value.split(',')
+        if self.values is None:
+            self.values = set(elements)
+        else:
+            self.values.update(elements)
+
+    def finalize(self) -> str:
+        """ Return the collected elements as a comma-separated list.
+        """
+        if self.values is None:
+            return ''
+
+        return ','.join(self.values)
+
+
+def array_contains(container: Optional[str], containee: Optional[str]) -> Optional[bool]:
+    """ Is the array 'containee' completely contained in array 'container'.
+
+        Both arrays are given as comma-separated lists. Elements are
+        compared as strings. Returns None when either argument is None.
+    """
+    if container is None or containee is None:
+        return None
+
+    # Build a real set once so that each lookup below is constant time.
+    vset = set(container.split(','))
+    return all(v in vset for v in containee.split(','))
+
+
+def array_pair_contains(container1: Optional[str], container2: Optional[str],
+                        containee: Optional[str]) -> Optional[bool]:
+    """ Is the array 'containee' completely contained in the union of
+        array 'container1' and array 'container2'.
+
+        All arrays are given as comma-separated lists. Elements are
+        compared as strings. Returns None when any argument is None.
+    """
+    if container1 is None or container2 is None or containee is None:
+        return None
+
+    # Build the union as a real set once so that each lookup below is
+    # constant time.
+    vset = set(container1.split(',')).union(container2.split(','))
+    return all(v in vset for v in containee.split(','))
+
+
+def install_custom_functions(conn: Any) -> None:
+    """ Install helper functions for Nominatim into the given SQLite
+        database connection.
+
+        Registers the scalar functions 'weigh_search', 'array_contains'
+        and 'array_pair_contains' as deterministic functions and the
+        aggregates 'array_intersect_fuzzy' and 'array_union'.
+    """
+    # NOTE(review): 'conn' is presumably the DBAPI connection handed to
+    # SQLAlchemy's "connect" event for the async SQLite engine - confirm.
+    conn.create_function('weigh_search', 3, weigh_search, deterministic=True)
+    conn.create_function('array_contains', 2, array_contains, deterministic=True)
+    conn.create_function('array_pair_contains', 3, array_pair_contains, deterministic=True)
+    _create_aggregate(conn, 'array_intersect_fuzzy', 1, ArrayIntersectFuzzy)
+    _create_aggregate(conn, 'array_union', 1, ArrayUnion)
+
+
+async def _make_aggregate(aioconn: Any, *args: Any) -> None:
+    """ Call create_aggregate() with the given arguments on the
+        connection wrapped by 'aioconn'.
+    """
+    # Uses private attributes of the async connection object.
+    # NOTE(review): presumably needed because the wrapper does not expose
+    # create_aggregate() itself - confirm against the driver in use.
+    await aioconn._execute(aioconn._conn.create_aggregate, *args)
+
+
+def _create_aggregate(conn: Any, name: str, nargs: int, aggregate: Any) -> None:
+    """ Register the class 'aggregate' as aggregate function 'name'
+        taking 'nargs' arguments on the connection 'conn'.
+
+        Any exception raised on the way is handed to the connection's
+        own exception handler.
+    """
+    try:
+        # Run the coroutine synchronously through the connection's await_().
+        conn.await_(_make_aggregate(conn._connection, name, nargs, aggregate))
+    except Exception as error: # pylint: disable=broad-exception-caught
+        conn._handle_exception(error)
--- a/nominatim/tools/convert_sqlite.py
+++ b/nominatim/tools/convert_sqlite.py
@@ -205,15 +205,15 @@ class SqliteWriter:
    async def create_search_index(self) -> None:
        """ Create the tables and indexes needed for word lookup.
        """
+        LOG.warning("Creating reverse search table")
+        rsn = sa.Table('reverse_search_name', self.dest.t.meta,
+                       sa.Column('word', sa.Integer()),
+                       sa.Column('column', sa.Text()),
+                       sa.Column('places', IntArray))
+        await self.dest.connection.run_sync(rsn.create)
+
        tsrc = self.src.t.search_name
        for column in ('name_vector', 'nameaddress_vector'):
-            table_name = f'reverse_search_{column}'
-            LOG.warning("Creating reverse search %s", table_name)
-            rsn = sa.Table(table_name, self.dest.t.meta,
-                           sa.Column('word', sa.Integer()),
-                           sa.Column('places', IntArray))
-            await self.dest.connection.run_sync(rsn.create)
-
            sql = sa.select(sa.func.unnest(getattr(tsrc.c, column)).label('word'),
                            sa.func.ArrayAgg(tsrc.c.place_id).label('places'))\
                    .group_by('word')
@@ -224,11 +224,12 @@ class SqliteWriter:
                for row in partition:
                    row.places.sort()
                    data.append({'word': row.word,
+                                 'column': column,
                                 'places': row.places})
                await self.dest.execute(rsn.insert(), data)

-            await self.dest.connection.run_sync(
-                sa.Index(f'idx_reverse_search_{column}_word', rsn.c.word).create)
+        await self.dest.connection.run_sync(
+            sa.Index('idx_reverse_search_name_word', rsn.c.word).create)


    def select_from(self, table: str) -> SaSelect: