diff --git a/src/nominatim_api/search/db_search_builder.py b/src/nominatim_api/search/db_search_builder.py
index f90c6d7f..591d32ca 100644
--- a/src/nominatim_api/search/db_search_builder.py
+++ b/src/nominatim_api/search/db_search_builder.py
@@ -374,7 +374,7 @@ class SearchBuilder:
             tokens = self.get_country_tokens(assignment.country)
             if not tokens:
                 return None
-            sdata.set_strings('countries', tokens)
+            sdata.set_countries(tokens)
             sdata.penalty += self.query.get_in_word_penalty(assignment.country)
         elif self.details.countries:
             sdata.countries = dbf.WeightedStrings(self.details.countries,
diff --git a/src/nominatim_api/search/db_search_fields.py b/src/nominatim_api/search/db_search_fields.py
index 669e2a5e..70b8ad7b 100644
--- a/src/nominatim_api/search/db_search_fields.py
+++ b/src/nominatim_api/search/db_search_fields.py
@@ -244,6 +244,21 @@ class SearchData:
 
             setattr(self, field, wstrs)
 
+    def set_countries(self, tokens: List[Token]) -> None:
+        """ Set the WeightedStrings properties for countries. Multiple
+            entries for the same country are deduplicated and the minimum
+            penalty is used. Adapts the global penalty, so that the
+            minimum penalty is 0.
+        """
+        if tokens:
+            min_penalty = min(t.penalty for t in tokens)
+            self.penalty += min_penalty
+            countries: dict[str, float] = {}
+            for t in tokens:
+                cc = t.get_country()
+                countries[cc] = min(t.penalty - min_penalty, countries.get(cc, 10000))
+            self.countries = WeightedStrings(list(countries.keys()), list(countries.values()))
+
     def set_qualifiers(self, tokens: List[Token]) -> None:
         """ Set the qulaifier field from the given tokens.
         """
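The new `set_countries` differs from the generic `set_strings` in that several tokens may map to the same country code, so penalties are deduplicated per country rather than kept per token. A minimal standalone sketch of that logic, with `StubToken` as a hypothetical stand-in for the real `Token` class (the `10000` sentinel is taken from the diff):

```python
from dataclasses import dataclass
from typing import Dict, List, Tuple


@dataclass
class StubToken:
    """ Hypothetical stand-in for a country token: only the two
        attributes that set_countries() actually uses.
    """
    penalty: float
    cc: str

    def get_country(self) -> str:
        return self.cc


def dedup_countries(tokens: List[StubToken]) -> Tuple[float, Dict[str, float]]:
    """ Mirror of the set_countries() core: shift penalties so the
        cheapest token costs 0 (the shift moves into the global search
        penalty) and keep only the cheapest entry per country code.
    """
    min_penalty = min(t.penalty for t in tokens)
    countries: Dict[str, float] = {}
    for t in tokens:
        cc = t.get_country()
        countries[cc] = min(t.penalty - min_penalty, countries.get(cc, 10000))
    return min_penalty, countries


# Two tokens for 'de' and one for 'at': the 0.25 minimum moves into
# the global penalty and 'de' keeps its cheaper (now zero) entry.
shift, weights = dedup_countries([StubToken(0.5, 'de'),
                                  StubToken(0.25, 'de'),
                                  StubToken(0.75, 'at')])
assert shift == 0.25
assert weights == {'de': 0.0, 'at': 0.5}
```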
""" + @abstractmethod + def get_country(self) -> str: + """ Return the country code this tojen is associated with + (currently for country tokens only). + """ + @dataclasses.dataclass class TokenRange: diff --git a/src/nominatim_db/tokenizer/icu_tokenizer.py b/src/nominatim_db/tokenizer/icu_tokenizer.py index b7fa8682..5d90bb27 100644 --- a/src/nominatim_db/tokenizer/icu_tokenizer.py +++ b/src/nominatim_db/tokenizer/icu_tokenizer.py @@ -475,20 +475,23 @@ class ICUNameAnalyzer(AbstractAnalyzer): assert self.conn is not None word_tokens = set() for name in names: - norm_name = self._search_normalized(name.name) - if norm_name: - word_tokens.add(norm_name) + norm_name = self._normalized(name.name) + token_name = self._search_normalized(name.name) + if norm_name and token_name: + word_tokens.add((token_name, norm_name)) with self.conn.cursor() as cur: # Get existing names - cur.execute("""SELECT word_token, coalesce(info ? 'internal', false) as is_internal + cur.execute("""SELECT word_token, + word as lookup, + coalesce(info ? 'internal', false) as is_internal FROM word - WHERE type = 'C' and word = %s""", + WHERE type = 'C' and info->>'cc' = %s""", (country_code, )) # internal/external names - existing_tokens: Dict[bool, Set[str]] = {True: set(), False: set()} + existing_tokens: Dict[bool, Set[Tuple[str, str]]] = {True: set(), False: set()} for word in cur: - existing_tokens[word[1]].add(word[0]) + existing_tokens[word[2]].add((word[0], word[1])) # Delete names that no longer exist. gone_tokens = existing_tokens[internal] - word_tokens @@ -496,10 +499,10 @@ class ICUNameAnalyzer(AbstractAnalyzer): gone_tokens.update(existing_tokens[False] & word_tokens) if gone_tokens: cur.execute("""DELETE FROM word - USING unnest(%s::text[]) as token - WHERE type = 'C' and word = %s - and word_token = token""", - (list(gone_tokens), country_code)) + USING jsonb_array_elements(%s) as data + WHERE type = 'C' and info->>'cc' = %s + and word_token = data->>0 and word = data->>1""", + (Jsonb(list(gone_tokens)), country_code)) # Only add those names that are not yet in the list. new_tokens = word_tokens - existing_tokens[True] @@ -508,15 +511,17 @@ class ICUNameAnalyzer(AbstractAnalyzer): if new_tokens: if internal: sql = """INSERT INTO word (word_token, type, word, info) - (SELECT token, 'C', %s, '{"internal": "yes"}' - FROM unnest(%s::text[]) as token) + (SELECT data->>0, 'C', data->>1, + jsonb_build_object('internal', 'yes', 'cc', %s::text) + FROM jsonb_array_elements(%s) as data) """ else: - sql = """INSERT INTO word (word_token, type, word) - (SELECT token, 'C', %s - FROM unnest(%s::text[]) as token) + sql = """INSERT INTO word (word_token, type, word, info) + (SELECT data->>0, 'C', data->>1, + jsonb_build_object('cc', %s::text) + FROM jsonb_array_elements(%s) as data) """ - cur.execute(sql, (country_code, list(new_tokens))) + cur.execute(sql, (country_code, Jsonb(list(new_tokens)))) def process_place(self, place: PlaceInfo) -> Mapping[str, Any]: """ Determine tokenizer information about the given place. diff --git a/test/python/mock_icu_word_table.py b/test/python/mock_icu_word_table.py index b26025a0..083246cb 100644 --- a/test/python/mock_icu_word_table.py +++ b/test/python/mock_icu_word_table.py @@ -10,6 +10,8 @@ of the table. """ from nominatim_db.db.connection import execute_scalar +from psycopg.types.json import Jsonb + class MockIcuWordTable: """ A word table for testing using legacy word table structure. 
diff --git a/src/nominatim_api/search/query.py b/src/nominatim_api/search/query.py
index f64dd1db..c08e6499 100644
--- a/src/nominatim_api/search/query.py
+++ b/src/nominatim_api/search/query.py
@@ -127,6 +127,12 @@ class Token(ABC):
             category objects.
         """
 
+    @abstractmethod
+    def get_country(self) -> str:
+        """ Return the country code this token is associated with
+            (currently for country tokens only).
+        """
+
 
 @dataclasses.dataclass
 class TokenRange:
diff --git a/src/nominatim_db/tokenizer/icu_tokenizer.py b/src/nominatim_db/tokenizer/icu_tokenizer.py
index b7fa8682..5d90bb27 100644
--- a/src/nominatim_db/tokenizer/icu_tokenizer.py
+++ b/src/nominatim_db/tokenizer/icu_tokenizer.py
@@ -475,20 +475,23 @@ class ICUNameAnalyzer(AbstractAnalyzer):
         assert self.conn is not None
         word_tokens = set()
         for name in names:
-            norm_name = self._search_normalized(name.name)
-            if norm_name:
-                word_tokens.add(norm_name)
+            norm_name = self._normalized(name.name)
+            token_name = self._search_normalized(name.name)
+            if norm_name and token_name:
+                word_tokens.add((token_name, norm_name))
 
         with self.conn.cursor() as cur:
             # Get existing names
-            cur.execute("""SELECT word_token, coalesce(info ? 'internal', false) as is_internal
+            cur.execute("""SELECT word_token,
+                                  word as lookup,
+                                  coalesce(info ? 'internal', false) as is_internal
                            FROM word
-                           WHERE type = 'C' and word = %s""",
+                           WHERE type = 'C' and info->>'cc' = %s""",
                         (country_code, ))
             # internal/external names
-            existing_tokens: Dict[bool, Set[str]] = {True: set(), False: set()}
+            existing_tokens: Dict[bool, Set[Tuple[str, str]]] = {True: set(), False: set()}
             for word in cur:
-                existing_tokens[word[1]].add(word[0])
+                existing_tokens[word[2]].add((word[0], word[1]))
 
             # Delete names that no longer exist.
             gone_tokens = existing_tokens[internal] - word_tokens
@@ -496,10 +499,10 @@ class ICUNameAnalyzer(AbstractAnalyzer):
                 gone_tokens.update(existing_tokens[False] & word_tokens)
             if gone_tokens:
                 cur.execute("""DELETE FROM word
-                               USING unnest(%s::text[]) as token
-                               WHERE type = 'C' and word = %s
-                                     and word_token = token""",
-                            (list(gone_tokens), country_code))
+                               USING jsonb_array_elements(%s) as data
+                               WHERE type = 'C' and info->>'cc' = %s
+                                     and word_token = data->>0 and word = data->>1""",
+                            (Jsonb(list(gone_tokens)), country_code))
 
             # Only add those names that are not yet in the list.
             new_tokens = word_tokens - existing_tokens[True]
@@ -508,15 +511,17 @@ class ICUNameAnalyzer(AbstractAnalyzer):
             if new_tokens:
                 if internal:
                     sql = """INSERT INTO word (word_token, type, word, info)
-                               (SELECT token, 'C', %s, '{"internal": "yes"}'
-                                  FROM unnest(%s::text[]) as token)
+                               (SELECT data->>0, 'C', data->>1,
+                                       jsonb_build_object('internal', 'yes', 'cc', %s::text)
+                                  FROM jsonb_array_elements(%s) as data)
                           """
                 else:
-                    sql = """INSERT INTO word (word_token, type, word)
-                               (SELECT token, 'C', %s
-                                  FROM unnest(%s::text[]) as token)
+                    sql = """INSERT INTO word (word_token, type, word, info)
+                               (SELECT data->>0, 'C', data->>1,
+                                       jsonb_build_object('cc', %s::text)
+                                  FROM jsonb_array_elements(%s) as data)
                           """
-                cur.execute(sql, (country_code, list(new_tokens)))
+                cur.execute(sql, (country_code, Jsonb(list(new_tokens))))
 
     def process_place(self, place: PlaceInfo) -> Mapping[str, Any]:
         """ Determine tokenizer information about the given place.
diff --git a/test/python/mock_icu_word_table.py b/test/python/mock_icu_word_table.py
index b26025a0..083246cb 100644
--- a/test/python/mock_icu_word_table.py
+++ b/test/python/mock_icu_word_table.py
@@ -10,6 +10,8 @@ of the table.
 """
 from nominatim_db.db.connection import execute_scalar
 
+from psycopg.types.json import Jsonb
+
 
 class MockIcuWordTable:
     """ A word table for testing using legacy word table structure.
@@ -42,11 +44,11 @@ class MockIcuWordTable:
                        """, (word_token, word, cls, typ, oper))
         self.conn.commit()
 
-    def add_country(self, country_code, word_token):
+    def add_country(self, country_code, word_token, lookup):
         with self.conn.cursor() as cur:
-            cur.execute("""INSERT INTO word (word_token, type, word)
-                           VALUES(%s, 'C', %s)""",
-                        (word_token, country_code))
+            cur.execute("""INSERT INTO word (word_token, type, word, info)
+                           VALUES(%s, 'C', %s, %s)""",
+                        (word_token, lookup, Jsonb({'cc': country_code})))
         self.conn.commit()
 
     def add_postcode(self, word_token, postcode):
@@ -93,7 +95,7 @@ class MockIcuWordTable:
 
     def get_country(self):
        with self.conn.cursor() as cur:
-            cur.execute("SELECT word, word_token FROM word WHERE type = 'C'")
+            cur.execute("SELECT info->>'cc', word_token, word FROM word WHERE type = 'C'")
             result = set((tuple(row) for row in cur))
             assert len(result) == cur.rowcount, "Word table has duplicates."
             return result
diff --git a/test/python/tokenizer/test_icu.py b/test/python/tokenizer/test_icu.py
index 6d2e9ce7..39796822 100644
--- a/test/python/tokenizer/test_icu.py
+++ b/test/python/tokenizer/test_icu.py
@@ -343,16 +343,18 @@ def test_add_country_names_new(analyzer, word_table):
     with analyzer() as anl:
         anl.add_country_names('es', {'name': 'Espagña', 'name:en': 'Spain'})
 
-    assert word_table.get_country() == {('es', 'ESPAGÑA'), ('es', 'SPAIN')}
+    assert word_table.get_country() == {('es', 'ESPAGÑA', 'Espagña'),
+                                        ('es', 'SPAIN', 'Spain')}
 
 
 def test_add_country_names_extend(analyzer, word_table):
-    word_table.add_country('ch', 'SCHWEIZ')
+    word_table.add_country('ch', 'SCHWEIZ', 'Schweiz')
 
     with analyzer() as anl:
         anl.add_country_names('ch', {'name': 'Schweiz', 'name:fr': 'Suisse'})
 
-    assert word_table.get_country() == {('ch', 'SCHWEIZ'), ('ch', 'SUISSE')}
+    assert word_table.get_country() == {('ch', 'SCHWEIZ', 'Schweiz'),
+                                        ('ch', 'SUISSE', 'Suisse')}
 
 
 class TestPlaceNames:
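The switch from `unnest(%s::text[])` to `jsonb_array_elements(%s)` throughout is what lets a single query parameter carry (word_token, lookup word) pairs: psycopg serializes Python tuples as JSON arrays, which the SQL side unpacks positionally. A minimal round-trip sketch, assuming psycopg 3 and a reachable database (the DSN is a placeholder):

```python
import psycopg
from psycopg.types.json import Jsonb

pairs = [('SCHWEIZ', 'Schweiz'), ('SUISSE', 'Suisse')]

# Placeholder DSN: point it at any available PostgreSQL database.
with psycopg.connect('dbname=test') as conn:
    with conn.cursor() as cur:
        # Each tuple arrives as a two-element JSON array; ->> 0 and
        # ->> 1 extract token and lookup word back out as text.
        cur.execute("""SELECT data->>0 AS word_token, data->>1 AS lookup
                         FROM jsonb_array_elements(%s) AS data""",
                    (Jsonb(pairs),))
        assert cur.fetchall() == pairs
```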