bdd: run full import on tests

This uncovered a couple of outdated/wrong tests which have been fixed, too.
2026-02-26 11:08:13 +00:00 · 2022-02-24 11:35:21 +01:00
parent a9e3329c39
commit f74228830d
10 changed files with 115 additions and 137 deletions
--- a/nominatim/tokenizer/icu_tokenizer.py
+++ b/nominatim/tokenizer/icu_tokenizer.py
@@ -390,17 +390,18 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
    def add_country_names(self, country_code, names):
-        """ Add names for the given country to the search index.
+        """ Add default names for the given country to the search index.
        """
        # Make sure any name preprocessing for country names applies.
        info = PlaceInfo({'name': names, 'country_code': country_code,
                          'rank_address': 4, 'class': 'boundary',
                          'type': 'administrative'})
        self._add_country_full_names(country_code,
-                                     self.sanitizer.process_names(info)[0])
+                                     self.sanitizer.process_names(info)[0],
                                     internal=True)
-    def _add_country_full_names(self, country_code, names):
+    def _add_country_full_names(self, country_code, names, internal=False):
        """ Add names for the given country from an already sanitized
            name list.
        """
@@ -412,21 +413,18 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
        with self.conn.cursor() as cur:
            # Get existing names
-            cur.execute("""SELECT word_token FROM word
+            cur.execute("""SELECT word_token, coalesce(info ? 'internal', false) as is_internal
-                            WHERE type = 'C' and word = %s""",
+                             FROM word
                             WHERE type = 'C' and word = %s""",
                        (country_code, ))
-            existing_tokens = {t[0] for t in cur}
+            existing_tokens = {True: set(), False: set()} # internal/external names
-
+            for word in cur:
-            # Only add those names that are not yet in the list.
+                existing_tokens[word[1]].add(word[0])
            new_tokens = word_tokens - existing_tokens
            if new_tokens:
                cur.execute("""INSERT INTO word (word_token, type, word)
                               (SELECT token, 'C', %s
                                FROM unnest(%s) as token)
                            """, (country_code, list(new_tokens)))
            # Delete names that no longer exist.
-            gone_tokens = existing_tokens - word_tokens
+            gone_tokens = existing_tokens[internal] - word_tokens
            if internal:
                gone_tokens.update(existing_tokens[False] & word_tokens)
            if gone_tokens:
                cur.execute("""DELETE FROM word
                               USING unnest(%s) as token
@@ -434,6 +432,23 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
                                     and word_token = token""",
                            (list(gone_tokens), country_code))
            # Only add those names that are not yet in the list.
            new_tokens = word_tokens - existing_tokens[True]
            if not internal:
                new_tokens -= existing_tokens[False]
            if new_tokens:
                if internal:
                    sql = """INSERT INTO word (word_token, type, word, info)
                               (SELECT token, 'C', %s, '{"internal": "yes"}'
                                  FROM unnest(%s) as token)
                           """
                else:
                    sql = """INSERT INTO word (word_token, type, word)
                                   (SELECT token, 'C', %s
                                    FROM unnest(%s) as token)
                          """
                cur.execute(sql, (country_code, list(new_tokens)))
    def process_place(self, place):
        """ Determine tokenizer information about the given place.
--- a/test/bdd/db/import/placex.feature
+++ b/test/bdd/db/import/placex.feature
@@ -61,70 +61,6 @@ Feature: Import into placex
        When importing
        Then placex has no entry for R1
    Scenario: search and address ranks for GB post codes correctly assigned
        Given the places
         | osm  | class | type     | postcode | geometry |
         | N1   | place | postcode | E45 2CD  | country:gb |
         | N2   | place | postcode | E45 2    | country:gb |
         | N3   | place | postcode | Y45      | country:gb |
        When importing
        Then placex contains
         | object | addr+postcode | country_code | rank_search | rank_address |
         | N1     | E45 2CD       | gb           | 25          | 5 |
         | N2     | E45 2         | gb           | 23          | 5 |
         | N3     | Y45           | gb           | 21          | 5 |
    Scenario: wrongly formatted GB postcodes are down-ranked
        Given the places
         | osm  | class | type     | postcode | geometry |
         | N1   | place | postcode | EA452CD  | country:gb |
         | N2   | place | postcode | E45 23   | country:gb |
        When importing
        Then placex contains
         | object | country_code | rank_search | rank_address |
         | N1     | gb           | 30          | 30 |
         | N2     | gb           | 30          | 30 |
    Scenario: search and address rank for DE postcodes correctly assigned
        Given the places
         | osm | class | type     | postcode | geometry |
         | N1  | place | postcode | 56427    | country:de |
         | N2  | place | postcode | 5642     | country:de |
         | N3  | place | postcode | 5642A    | country:de |
         | N4  | place | postcode | 564276   | country:de |
        When importing
        Then placex contains
         | object | country_code | rank_search | rank_address |
         | N1     | de           | 21          | 11 |
         | N2     | de           | 30          | 30 |
         | N3     | de           | 30          | 30 |
         | N4     | de           | 30          | 30 |
    Scenario: search and address rank for other postcodes are correctly assigned
        Given the places
         | osm | class | type     | postcode | geometry |
         | N1  | place | postcode | 1        | country:ca |
         | N2  | place | postcode | X3       | country:ca |
         | N3  | place | postcode | 543      | country:ca |
         | N4  | place | postcode | 54dc     | country:ca |
         | N5  | place | postcode | 12345    | country:ca |
         | N6  | place | postcode | 55TT667  | country:ca |
         | N7  | place | postcode | 123-65   | country:ca |
         | N8  | place | postcode | 12 445 4 | country:ca |
         | N9  | place | postcode | A1:bc10  | country:ca |
        When importing
        Then placex contains
         | object | country_code | rank_search | rank_address |
         | N1     | ca           | 21          | 11 |
         | N2     | ca           | 21          | 11 |
         | N3     | ca           | 21          | 11 |
         | N4     | ca           | 21          | 11 |
         | N5     | ca           | 21          | 11 |
         | N6     | ca           | 21          | 11 |
         | N7     | ca           | 25          | 11 |
         | N8     | ca           | 25          | 11 |
         | N9     | ca           | 25          | 11 |
    Scenario: search and address ranks for boundaries are correctly assigned
        Given the named places
          | osm | class    | type |
--- a/test/bdd/db/import/postcodes.feature
+++ b/test/bdd/db/import/postcodes.feature
@@ -153,4 +153,70 @@ Feature: Import of postcodes
        When sending search query "E4 7EA"
        Then results contain
           | type     | display_name |
-           | postcode | E4 7EA      |
+           | postcode | E4 7EA       |
    Scenario: search and address ranks for GB post codes correctly assigned
        Given the places
         | osm  | class | type     | postcode | geometry |
         | N1   | place | postcode | E45 2CD  | country:gb |
         | N2   | place | postcode | E45 2    | country:gb |
         | N3   | place | postcode | Y45      | country:gb |
        When importing
        Then location_postcode contains exactly
         | postcode | country | rank_search | rank_address |
         | E45 2CD  | gb      | 25          | 5 |
         | E45 2    | gb      | 23          | 5 |
         | Y45      | gb      | 21          | 5 |
    Scenario: wrongly formatted GB postcodes are down-ranked
        Given the places
         | osm  | class | type     | postcode | geometry |
         | N1   | place | postcode | EA452CD  | country:gb |
         | N2   | place | postcode | E45 23   | country:gb |
        When importing
        Then location_postcode contains exactly
         | postcode | country | rank_search | rank_address |
         | EA452CD  | gb      | 30          | 30 |
         | E45 23   | gb      | 30          | 30 |
    Scenario: search and address rank for DE postcodes correctly assigned
        Given the places
         | osm | class | type     | postcode | geometry |
         | N1  | place | postcode | 56427    | country:de |
         | N2  | place | postcode | 5642     | country:de |
         | N3  | place | postcode | 5642A    | country:de |
         | N4  | place | postcode | 564276   | country:de |
        When importing
        Then location_postcode contains exactly
         | postcode | country | rank_search | rank_address |
         | 56427    | de      | 21          | 11 |
         | 5642     | de      | 30          | 30 |
         | 5642A    | de      | 30          | 30 |
         | 564276   | de      | 30          | 30 |
    Scenario: search and address rank for other postcodes are correctly assigned
        Given the places
         | osm | class | type     | postcode | geometry |
         | N1  | place | postcode | 1        | country:ca |
         | N2  | place | postcode | X3       | country:ca |
         | N3  | place | postcode | 543      | country:ca |
         | N4  | place | postcode | 54dc     | country:ca |
         | N5  | place | postcode | 12345    | country:ca |
         | N6  | place | postcode | 55TT667  | country:ca |
         | N7  | place | postcode | 123-65   | country:ca |
         | N8  | place | postcode | 12 445 4 | country:ca |
         | N9  | place | postcode | A1:bc10  | country:ca |
        When importing
        Then location_postcode contains exactly
         | postcode | country | rank_search | rank_address |
         | 1        | ca      | 21          | 11 |
         | X3       | ca      | 21          | 11 |
         | 543      | ca      | 21          | 11 |
         | 54DC     | ca      | 21          | 11 |
         | 12345    | ca      | 21          | 11 |
         | 55TT667  | ca      | 21          | 11 |
         | 123-65   | ca      | 25          | 11 |
         | 12 445 4 | ca      | 25          | 11 |
         | A1:BC10  | ca      | 25          | 11 |
--- a/test/bdd/db/import/rank_computation.feature
+++ b/test/bdd/db/import/rank_computation.feature
@@ -16,7 +16,6 @@ Feature: Rank assignment
          | N18  | place     | city      | 0 0 |
          | N19  | place     | island    | 0 0 |
          | N36  | place     | house     | 0 0 |
          | N38  | place     | houses    | 0 0 |
        And the named places
          | osm  | class     | type      | extra+capital | geometry |
          | N101 | place     | city      | yes           | 0 0 |
@@ -35,7 +34,6 @@ Feature: Rank assignment
          | N19    | 17          | 0 |
          | N101   | 15          | 16 |
          | N36    | 30          | 30 |
          | N38    | 28          | 0 |
    Scenario: Ranks for boundaries are assigned according to admin level
        Given the named places
--- a/test/bdd/db/query/normalization.feature
+++ b/test/bdd/db/query/normalization.feature
@@ -164,8 +164,8 @@ Feature: Import and search of names
    Scenario: Unprintable characters in postcodes are ignored
        Given the named places
-            | osm  | class   | type   | address |
+            | osm  | class   | type   | address                    | geometry   |
-            | N234 | amenity | prison | 'postcode' : u'1234\u200e' |
+            | N234 | amenity | prison | 'postcode' : u'1234\u200e' | country:de |
        When importing
        And sending search query "1234"
        Then result 0 has not attributes osm_type
--- a/test/bdd/db/query/search_simple.feature
+++ b/test/bdd/db/query/search_simple.feature
@@ -25,7 +25,7 @@ Feature: Searching of simple objects
          | osm | class    | type        | postcode | geometry |
          | R1  | boundary | postal_code | 54321    | poly-area:1.0 |
        And sending search query "12345"
-        Then result 0 has not attributes osm_type
+        Then exactly 0 results are returned
        When sending search query "54321"
        Then results contain
         | ID | osm |
--- a/test/bdd/db/update/simple.feature
+++ b/test/bdd/db/update/simple.feature
@@ -63,9 +63,7 @@ Feature: Update of simple objects
          | osm | class | type     | postcode | geometry |
          | N3  | place | postcode | 12345    | 1 -1 |
        When importing
-        Then placex contains
+        Then placex has no entry for N3
          | object | class | type |
          | N3     | place | postcode |
        When updating places
          | osm | class | type  | postcode | housenr | geometry |
          | N3  | place | house | 12345    | 13      | 1 -1 |
--- a/test/bdd/osm2pgsql/update/relation.feature
+++ b/test/bdd/osm2pgsql/update/relation.feature
@@ -139,21 +139,3 @@ Feature: Update of relations by osm2pgsql
        Then place contains
          | object | addr+country | name           |
          | R1     | XX           | 'name' : 'Foo' |
    Scenario: Country boundary names are extended when country_code known
        When loading osm data
          """
          n200 Tamenity=prison x0 y0
          n201 x0 y0.0001
          n202 x0.0001 y0.0001
          n203 x0.0001 y0
          """
        And updating osm data
          """
          w1 Nn200,n201,n202,n203,n200
          r1 Ttype=boundary,boundary=administrative,name=Foo,country_code=ch,admin_level=2 Mw1@
          """
        Then place contains
            | object | addr+country | name+name:de | name+name |
            | R1     | ch           | Schweiz      | Foo       |
--- a/test/bdd/steps/steps_db_ops.py
+++ b/test/bdd/steps/steps_db_ops.py
@@ -93,29 +93,8 @@ def add_data_to_planet_ways(context):
 def import_and_index_data_from_place_table(context):
    """ Import data previously set up in the place table.
    """
-    nctx = context.nominatim
+    context.nominatim.run_nominatim('import', '--continue', 'load-data',
-
+                                              '--index-noanalyse', '-q')
    tokenizer = tokenizer_factory.create_tokenizer(nctx.get_test_config())
    context.nominatim.copy_from_place(context.db)
    # XXX use tool function as soon as it is ported
    with context.db.cursor() as cur:
        with (context.nominatim.src_dir / 'lib-sql' / 'postcode_tables.sql').open('r') as fd:
            cur.execute(fd.read())
        cur.execute("""
            INSERT INTO location_postcode
             (place_id, indexed_status, country_code, postcode, geometry)
            SELECT nextval('seq_place'), 1, country_code,
                   upper(trim (both ' ' from address->'postcode')) as pc,
                   ST_Centroid(ST_Collect(ST_Centroid(geometry)))
              FROM placex
             WHERE address ? 'postcode' AND address->'postcode' NOT SIMILAR TO '%(,|;)%'
                   AND geometry IS NOT null
             GROUP BY country_code, pc""")
    # Call directly as the refresh function does not include postcodes.
    indexer.LOG.setLevel(logging.ERROR)
    indexer.Indexer(context.nominatim.get_libpq_dsn(), tokenizer, 1).index_full(analyse=False)
    check_database_integrity(context)
@@ -268,7 +247,7 @@ def check_location_postcode(context):
        for row in context.table:
            db_row = results.get((row['country'],row['postcode']))
            assert db_row is not None, \
-                "Missing row for country '{r['country']}' postcode '{r['postcode']}'.".format(r=row)
+                f"Missing row for country '{row['country']}' postcode '{row['postcode']}'."
            db_row.assert_row(row, ('country', 'postcode'))
@@ -333,12 +312,13 @@ def check_place_addressline_exclude(context):
    with context.db.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
        for row in context.table:
            pid = NominatimID(row['object']).get_place_id(cur)
-            apid = NominatimID(row['address']).get_place_id(cur)
+            apid = NominatimID(row['address']).get_place_id(cur, allow_empty=True)
-            cur.execute(""" SELECT * FROM place_addressline
+            if apid is not None:
-                            WHERE place_id = %s AND address_place_id = %s""",
+                cur.execute(""" SELECT * FROM place_addressline
-                        (pid, apid))
+                                WHERE place_id = %s AND address_place_id = %s""",
-            assert cur.rowcount == 0, \
+                            (pid, apid))
-                "Row found for place %s and address %s" % (row['object'], row['address'])
+                assert cur.rowcount == 0, \
                    "Row found for place %s and address %s" % (row['object'], row['address'])
@then("W(?P<oid>\d+) expands to(?P<neg> no)? interpolation")
 def check_location_property_osmline(context, oid, neg):
--- a/test/bdd/steps/table_compare.py
+++ b/test/bdd/steps/table_compare.py
@@ -62,11 +62,14 @@ class NominatimID:
                    ','.join(['*'] + (extra_columns or [])), table)
        cur.execute(query, (pid, ))
-    def get_place_id(self, cur):
+    def get_place_id(self, cur, allow_empty=False):
        """ Look up the place id for the ID. Throws an assertion if the ID
            is not unique.
        """
        self.query_osm_id(cur, "SELECT place_id FROM placex WHERE {}")
        if cur.rowcount == 0 and allow_empty:
            return None
        assert cur.rowcount == 1, \
               "Place ID {!s} not unique. Found {} entries.".format(self, cur.rowcount)