bdd: run full import on tests

This uncovered a couple of outdated/wrong tests which have been
fixed, too.
This commit is contained in:
Sarah Hoffmann
2022-02-24 11:35:21 +01:00
parent a9e3329c39
commit f74228830d
10 changed files with 115 additions and 137 deletions

View File

@@ -390,17 +390,18 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
def add_country_names(self, country_code, names): def add_country_names(self, country_code, names):
""" Add names for the given country to the search index. """ Add default names for the given country to the search index.
""" """
# Make sure any name preprocessing for country names applies. # Make sure any name preprocessing for country names applies.
info = PlaceInfo({'name': names, 'country_code': country_code, info = PlaceInfo({'name': names, 'country_code': country_code,
'rank_address': 4, 'class': 'boundary', 'rank_address': 4, 'class': 'boundary',
'type': 'administrative'}) 'type': 'administrative'})
self._add_country_full_names(country_code, self._add_country_full_names(country_code,
self.sanitizer.process_names(info)[0]) self.sanitizer.process_names(info)[0],
internal=True)
def _add_country_full_names(self, country_code, names): def _add_country_full_names(self, country_code, names, internal=False):
""" Add names for the given country from an already sanitized """ Add names for the given country from an already sanitized
name list. name list.
""" """
@@ -412,21 +413,18 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
with self.conn.cursor() as cur: with self.conn.cursor() as cur:
# Get existing names # Get existing names
cur.execute("""SELECT word_token FROM word cur.execute("""SELECT word_token, coalesce(info ? 'internal', false) as is_internal
WHERE type = 'C' and word = %s""", FROM word
WHERE type = 'C' and word = %s""",
(country_code, )) (country_code, ))
existing_tokens = {t[0] for t in cur} existing_tokens = {True: set(), False: set()} # internal/external names
for word in cur:
# Only add those names that are not yet in the list. existing_tokens[word[1]].add(word[0])
new_tokens = word_tokens - existing_tokens
if new_tokens:
cur.execute("""INSERT INTO word (word_token, type, word)
(SELECT token, 'C', %s
FROM unnest(%s) as token)
""", (country_code, list(new_tokens)))
# Delete names that no longer exist. # Delete names that no longer exist.
gone_tokens = existing_tokens - word_tokens gone_tokens = existing_tokens[internal] - word_tokens
if internal:
gone_tokens.update(existing_tokens[False] & word_tokens)
if gone_tokens: if gone_tokens:
cur.execute("""DELETE FROM word cur.execute("""DELETE FROM word
USING unnest(%s) as token USING unnest(%s) as token
@@ -434,6 +432,23 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
and word_token = token""", and word_token = token""",
(list(gone_tokens), country_code)) (list(gone_tokens), country_code))
# Only add those names that are not yet in the list.
new_tokens = word_tokens - existing_tokens[True]
if not internal:
new_tokens -= existing_tokens[False]
if new_tokens:
if internal:
sql = """INSERT INTO word (word_token, type, word, info)
(SELECT token, 'C', %s, '{"internal": "yes"}'
FROM unnest(%s) as token)
"""
else:
sql = """INSERT INTO word (word_token, type, word)
(SELECT token, 'C', %s
FROM unnest(%s) as token)
"""
cur.execute(sql, (country_code, list(new_tokens)))
def process_place(self, place): def process_place(self, place):
""" Determine tokenizer information about the given place. """ Determine tokenizer information about the given place.

View File

@@ -61,70 +61,6 @@ Feature: Import into placex
When importing When importing
Then placex has no entry for R1 Then placex has no entry for R1
Scenario: search and address ranks for GB post codes correctly assigned
Given the places
| osm | class | type | postcode | geometry |
| N1 | place | postcode | E45 2CD | country:gb |
| N2 | place | postcode | E45 2 | country:gb |
| N3 | place | postcode | Y45 | country:gb |
When importing
Then placex contains
| object | addr+postcode | country_code | rank_search | rank_address |
| N1 | E45 2CD | gb | 25 | 5 |
| N2 | E45 2 | gb | 23 | 5 |
| N3 | Y45 | gb | 21 | 5 |
Scenario: wrongly formatted GB postcodes are down-ranked
Given the places
| osm | class | type | postcode | geometry |
| N1 | place | postcode | EA452CD | country:gb |
| N2 | place | postcode | E45 23 | country:gb |
When importing
Then placex contains
| object | country_code | rank_search | rank_address |
| N1 | gb | 30 | 30 |
| N2 | gb | 30 | 30 |
Scenario: search and address rank for DE postcodes correctly assigned
Given the places
| osm | class | type | postcode | geometry |
| N1 | place | postcode | 56427 | country:de |
| N2 | place | postcode | 5642 | country:de |
| N3 | place | postcode | 5642A | country:de |
| N4 | place | postcode | 564276 | country:de |
When importing
Then placex contains
| object | country_code | rank_search | rank_address |
| N1 | de | 21 | 11 |
| N2 | de | 30 | 30 |
| N3 | de | 30 | 30 |
| N4 | de | 30 | 30 |
Scenario: search and address rank for other postcodes are correctly assigned
Given the places
| osm | class | type | postcode | geometry |
| N1 | place | postcode | 1 | country:ca |
| N2 | place | postcode | X3 | country:ca |
| N3 | place | postcode | 543 | country:ca |
| N4 | place | postcode | 54dc | country:ca |
| N5 | place | postcode | 12345 | country:ca |
| N6 | place | postcode | 55TT667 | country:ca |
| N7 | place | postcode | 123-65 | country:ca |
| N8 | place | postcode | 12 445 4 | country:ca |
| N9 | place | postcode | A1:bc10 | country:ca |
When importing
Then placex contains
| object | country_code | rank_search | rank_address |
| N1 | ca | 21 | 11 |
| N2 | ca | 21 | 11 |
| N3 | ca | 21 | 11 |
| N4 | ca | 21 | 11 |
| N5 | ca | 21 | 11 |
| N6 | ca | 21 | 11 |
| N7 | ca | 25 | 11 |
| N8 | ca | 25 | 11 |
| N9 | ca | 25 | 11 |
Scenario: search and address ranks for boundaries are correctly assigned Scenario: search and address ranks for boundaries are correctly assigned
Given the named places Given the named places
| osm | class | type | | osm | class | type |

View File

@@ -153,4 +153,70 @@ Feature: Import of postcodes
When sending search query "E4 7EA" When sending search query "E4 7EA"
Then results contain Then results contain
| type | display_name | | type | display_name |
| postcode | E4 7EA | | postcode | E4 7EA |
Scenario: search and address ranks for GB post codes correctly assigned
Given the places
| osm | class | type | postcode | geometry |
| N1 | place | postcode | E45 2CD | country:gb |
| N2 | place | postcode | E45 2 | country:gb |
| N3 | place | postcode | Y45 | country:gb |
When importing
Then location_postcode contains exactly
| postcode | country | rank_search | rank_address |
| E45 2CD | gb | 25 | 5 |
| E45 2 | gb | 23 | 5 |
| Y45 | gb | 21 | 5 |
Scenario: wrongly formatted GB postcodes are down-ranked
Given the places
| osm | class | type | postcode | geometry |
| N1 | place | postcode | EA452CD | country:gb |
| N2 | place | postcode | E45 23 | country:gb |
When importing
Then location_postcode contains exactly
| postcode | country | rank_search | rank_address |
| EA452CD | gb | 30 | 30 |
| E45 23 | gb | 30 | 30 |
Scenario: search and address rank for DE postcodes correctly assigned
Given the places
| osm | class | type | postcode | geometry |
| N1 | place | postcode | 56427 | country:de |
| N2 | place | postcode | 5642 | country:de |
| N3 | place | postcode | 5642A | country:de |
| N4 | place | postcode | 564276 | country:de |
When importing
Then location_postcode contains exactly
| postcode | country | rank_search | rank_address |
| 56427 | de | 21 | 11 |
| 5642 | de | 30 | 30 |
| 5642A | de | 30 | 30 |
| 564276 | de | 30 | 30 |
Scenario: search and address rank for other postcodes are correctly assigned
Given the places
| osm | class | type | postcode | geometry |
| N1 | place | postcode | 1 | country:ca |
| N2 | place | postcode | X3 | country:ca |
| N3 | place | postcode | 543 | country:ca |
| N4 | place | postcode | 54dc | country:ca |
| N5 | place | postcode | 12345 | country:ca |
| N6 | place | postcode | 55TT667 | country:ca |
| N7 | place | postcode | 123-65 | country:ca |
| N8 | place | postcode | 12 445 4 | country:ca |
| N9 | place | postcode | A1:bc10 | country:ca |
When importing
Then location_postcode contains exactly
| postcode | country | rank_search | rank_address |
| 1 | ca | 21 | 11 |
| X3 | ca | 21 | 11 |
| 543 | ca | 21 | 11 |
| 54DC | ca | 21 | 11 |
| 12345 | ca | 21 | 11 |
| 55TT667 | ca | 21 | 11 |
| 123-65 | ca | 25 | 11 |
| 12 445 4 | ca | 25 | 11 |
| A1:BC10 | ca | 25 | 11 |

View File

@@ -16,7 +16,6 @@ Feature: Rank assignment
| N18 | place | city | 0 0 | | N18 | place | city | 0 0 |
| N19 | place | island | 0 0 | | N19 | place | island | 0 0 |
| N36 | place | house | 0 0 | | N36 | place | house | 0 0 |
| N38 | place | houses | 0 0 |
And the named places And the named places
| osm | class | type | extra+capital | geometry | | osm | class | type | extra+capital | geometry |
| N101 | place | city | yes | 0 0 | | N101 | place | city | yes | 0 0 |
@@ -35,7 +34,6 @@ Feature: Rank assignment
| N19 | 17 | 0 | | N19 | 17 | 0 |
| N101 | 15 | 16 | | N101 | 15 | 16 |
| N36 | 30 | 30 | | N36 | 30 | 30 |
| N38 | 28 | 0 |
Scenario: Ranks for boundaries are assigned according to admin level Scenario: Ranks for boundaries are assigned according to admin level
Given the named places Given the named places

View File

@@ -164,8 +164,8 @@ Feature: Import and search of names
Scenario: Unprintable characters in postcodes are ignored Scenario: Unprintable characters in postcodes are ignored
Given the named places Given the named places
| osm | class | type | address | | osm | class | type | address | geometry |
| N234 | amenity | prison | 'postcode' : u'1234\u200e' | | N234 | amenity | prison | 'postcode' : u'1234\u200e' | country:de |
When importing When importing
And sending search query "1234" And sending search query "1234"
Then result 0 has not attributes osm_type Then result 0 has not attributes osm_type

View File

@@ -25,7 +25,7 @@ Feature: Searching of simple objects
| osm | class | type | postcode | geometry | | osm | class | type | postcode | geometry |
| R1 | boundary | postal_code | 54321 | poly-area:1.0 | | R1 | boundary | postal_code | 54321 | poly-area:1.0 |
And sending search query "12345" And sending search query "12345"
Then result 0 has not attributes osm_type Then exactly 0 results are returned
When sending search query "54321" When sending search query "54321"
Then results contain Then results contain
| ID | osm | | ID | osm |

View File

@@ -63,9 +63,7 @@ Feature: Update of simple objects
| osm | class | type | postcode | geometry | | osm | class | type | postcode | geometry |
| N3 | place | postcode | 12345 | 1 -1 | | N3 | place | postcode | 12345 | 1 -1 |
When importing When importing
Then placex contains Then placex has no entry for N3
| object | class | type |
| N3 | place | postcode |
When updating places When updating places
| osm | class | type | postcode | housenr | geometry | | osm | class | type | postcode | housenr | geometry |
| N3 | place | house | 12345 | 13 | 1 -1 | | N3 | place | house | 12345 | 13 | 1 -1 |

View File

@@ -139,21 +139,3 @@ Feature: Update of relations by osm2pgsql
Then place contains Then place contains
| object | addr+country | name | | object | addr+country | name |
| R1 | XX | 'name' : 'Foo' | | R1 | XX | 'name' : 'Foo' |
Scenario: Country boundary names are extended when country_code known
When loading osm data
"""
n200 Tamenity=prison x0 y0
n201 x0 y0.0001
n202 x0.0001 y0.0001
n203 x0.0001 y0
"""
And updating osm data
"""
w1 Nn200,n201,n202,n203,n200
r1 Ttype=boundary,boundary=administrative,name=Foo,country_code=ch,admin_level=2 Mw1@
"""
Then place contains
| object | addr+country | name+name:de | name+name |
| R1 | ch | Schweiz | Foo |

View File

@@ -93,29 +93,8 @@ def add_data_to_planet_ways(context):
def import_and_index_data_from_place_table(context): def import_and_index_data_from_place_table(context):
""" Import data previously set up in the place table. """ Import data previously set up in the place table.
""" """
nctx = context.nominatim context.nominatim.run_nominatim('import', '--continue', 'load-data',
'--index-noanalyse', '-q')
tokenizer = tokenizer_factory.create_tokenizer(nctx.get_test_config())
context.nominatim.copy_from_place(context.db)
# XXX use tool function as soon as it is ported
with context.db.cursor() as cur:
with (context.nominatim.src_dir / 'lib-sql' / 'postcode_tables.sql').open('r') as fd:
cur.execute(fd.read())
cur.execute("""
INSERT INTO location_postcode
(place_id, indexed_status, country_code, postcode, geometry)
SELECT nextval('seq_place'), 1, country_code,
upper(trim (both ' ' from address->'postcode')) as pc,
ST_Centroid(ST_Collect(ST_Centroid(geometry)))
FROM placex
WHERE address ? 'postcode' AND address->'postcode' NOT SIMILAR TO '%(,|;)%'
AND geometry IS NOT null
GROUP BY country_code, pc""")
# Call directly as the refresh function does not include postcodes.
indexer.LOG.setLevel(logging.ERROR)
indexer.Indexer(context.nominatim.get_libpq_dsn(), tokenizer, 1).index_full(analyse=False)
check_database_integrity(context) check_database_integrity(context)
@@ -268,7 +247,7 @@ def check_location_postcode(context):
for row in context.table: for row in context.table:
db_row = results.get((row['country'],row['postcode'])) db_row = results.get((row['country'],row['postcode']))
assert db_row is not None, \ assert db_row is not None, \
"Missing row for country '{r['country']}' postcode '{r['postcode']}'.".format(r=row) f"Missing row for country '{row['country']}' postcode '{row['postcode']}'."
db_row.assert_row(row, ('country', 'postcode')) db_row.assert_row(row, ('country', 'postcode'))
@@ -333,12 +312,13 @@ def check_place_addressline_exclude(context):
with context.db.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur: with context.db.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
for row in context.table: for row in context.table:
pid = NominatimID(row['object']).get_place_id(cur) pid = NominatimID(row['object']).get_place_id(cur)
apid = NominatimID(row['address']).get_place_id(cur) apid = NominatimID(row['address']).get_place_id(cur, allow_empty=True)
cur.execute(""" SELECT * FROM place_addressline if apid is not None:
WHERE place_id = %s AND address_place_id = %s""", cur.execute(""" SELECT * FROM place_addressline
(pid, apid)) WHERE place_id = %s AND address_place_id = %s""",
assert cur.rowcount == 0, \ (pid, apid))
"Row found for place %s and address %s" % (row['object'], row['address']) assert cur.rowcount == 0, \
"Row found for place %s and address %s" % (row['object'], row['address'])
@then("W(?P<oid>\d+) expands to(?P<neg> no)? interpolation") @then("W(?P<oid>\d+) expands to(?P<neg> no)? interpolation")
def check_location_property_osmline(context, oid, neg): def check_location_property_osmline(context, oid, neg):

View File

@@ -62,11 +62,14 @@ class NominatimID:
','.join(['*'] + (extra_columns or [])), table) ','.join(['*'] + (extra_columns or [])), table)
cur.execute(query, (pid, )) cur.execute(query, (pid, ))
def get_place_id(self, cur): def get_place_id(self, cur, allow_empty=False):
""" Look up the place id for the ID. Throws an assertion if the ID """ Look up the place id for the ID. Throws an assertion if the ID
is not unique. is not unique.
""" """
self.query_osm_id(cur, "SELECT place_id FROM placex WHERE {}") self.query_osm_id(cur, "SELECT place_id FROM placex WHERE {}")
if cur.rowcount == 0 and allow_empty:
return None
assert cur.rowcount == 1, \ assert cur.rowcount == 1, \
"Place ID {!s} not unique. Found {} entries.".format(self, cur.rowcount) "Place ID {!s} not unique. Found {} entries.".format(self, cur.rowcount)