forked from hans/Nominatim
Merge pull request #3894 from lonvia/country-names-with-word-lookup
Add normalized form of country names to country tokens in word table
This commit is contained in:
@@ -15,7 +15,7 @@ classifiers = [
|
|||||||
"Operating System :: OS Independent",
|
"Operating System :: OS Independent",
|
||||||
]
|
]
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"psycopg",
|
"psycopg<3.3",
|
||||||
"python-dotenv",
|
"python-dotenv",
|
||||||
"jinja2",
|
"jinja2",
|
||||||
"pyYAML>=5.1",
|
"pyYAML>=5.1",
|
||||||
|
|||||||
@@ -374,7 +374,7 @@ class SearchBuilder:
|
|||||||
tokens = self.get_country_tokens(assignment.country)
|
tokens = self.get_country_tokens(assignment.country)
|
||||||
if not tokens:
|
if not tokens:
|
||||||
return None
|
return None
|
||||||
sdata.set_strings('countries', tokens)
|
sdata.set_countries(tokens)
|
||||||
sdata.penalty += self.query.get_in_word_penalty(assignment.country)
|
sdata.penalty += self.query.get_in_word_penalty(assignment.country)
|
||||||
elif self.details.countries:
|
elif self.details.countries:
|
||||||
sdata.countries = dbf.WeightedStrings(self.details.countries,
|
sdata.countries = dbf.WeightedStrings(self.details.countries,
|
||||||
|
|||||||
@@ -244,6 +244,21 @@ class SearchData:
|
|||||||
|
|
||||||
setattr(self, field, wstrs)
|
setattr(self, field, wstrs)
|
||||||
|
|
||||||
|
def set_countries(self, tokens: List[Token]) -> None:
|
||||||
|
""" Set the WeightedStrings properties for countries. Multiple
|
||||||
|
entries for the same country are deduplicated and the minimum
|
||||||
|
penalty is used. Adapts the global penalty, so that the
|
||||||
|
minimum penalty is 0.
|
||||||
|
"""
|
||||||
|
if tokens:
|
||||||
|
min_penalty = min(t.penalty for t in tokens)
|
||||||
|
self.penalty += min_penalty
|
||||||
|
countries: dict[str, float] = {}
|
||||||
|
for t in tokens:
|
||||||
|
cc = t.get_country()
|
||||||
|
countries[cc] = min(t.penalty - min_penalty, countries.get(cc, 10000))
|
||||||
|
self.countries = WeightedStrings(list(countries.keys()), list(countries.values()))
|
||||||
|
|
||||||
def set_qualifiers(self, tokens: List[Token]) -> None:
|
def set_qualifiers(self, tokens: List[Token]) -> None:
|
||||||
""" Set the qualifier field from the given tokens.
|
""" Set the qualifier field from the given tokens.
|
||||||
"""
|
"""
|
||||||
|
|||||||
@@ -59,12 +59,16 @@ class ICUToken(qmod.Token):
|
|||||||
assert self.info
|
assert self.info
|
||||||
return self.info.get('class', ''), self.info.get('type', '')
|
return self.info.get('class', ''), self.info.get('type', '')
|
||||||
|
|
||||||
def rematch(self, norm: str) -> None:
|
def get_country(self) -> str:
|
||||||
|
assert self.info
|
||||||
|
return cast(str, self.info.get('cc', ''))
|
||||||
|
|
||||||
|
def match_penalty(self, norm: str) -> float:
|
||||||
""" Check how well the token matches the given normalized string
|
""" Check how well the token matches the given normalized string
|
||||||
and add a penalty, if necessary.
|
and return the resulting penalty.
|
||||||
"""
|
"""
|
||||||
if not self.lookup_word:
|
if not self.lookup_word:
|
||||||
return
|
return 0.0
|
||||||
|
|
||||||
seq = difflib.SequenceMatcher(a=self.lookup_word, b=norm)
|
seq = difflib.SequenceMatcher(a=self.lookup_word, b=norm)
|
||||||
distance = 0
|
distance = 0
|
||||||
@@ -75,7 +79,7 @@ class ICUToken(qmod.Token):
|
|||||||
distance += max((ato-afrom), (bto-bfrom))
|
distance += max((ato-afrom), (bto-bfrom))
|
||||||
elif tag != 'equal':
|
elif tag != 'equal':
|
||||||
distance += abs((ato-afrom) - (bto-bfrom))
|
distance += abs((ato-afrom) - (bto-bfrom))
|
||||||
self.penalty += (distance/len(self.lookup_word))
|
return (distance/len(self.lookup_word))
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def from_db_row(row: SaRow) -> 'ICUToken':
|
def from_db_row(row: SaRow) -> 'ICUToken':
|
||||||
@@ -330,9 +334,10 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
|
|||||||
norm = ''.join(f"{n.term_normalized}{'' if n.btype == qmod.BREAK_TOKEN else ' '}"
|
norm = ''.join(f"{n.term_normalized}{'' if n.btype == qmod.BREAK_TOKEN else ' '}"
|
||||||
for n in query.nodes[start + 1:end + 1]).strip()
|
for n in query.nodes[start + 1:end + 1]).strip()
|
||||||
for ttype, tokens in tlist.items():
|
for ttype, tokens in tlist.items():
|
||||||
if ttype != qmod.TOKEN_COUNTRY:
|
for token in tokens:
|
||||||
for token in tokens:
|
itok = cast(ICUToken, token)
|
||||||
cast(ICUToken, token).rematch(norm)
|
itok.penalty += itok.match_penalty(norm) * \
|
||||||
|
(1 if ttype in (qmod.TOKEN_WORD, qmod.TOKEN_PARTIAL) else 2)
|
||||||
|
|
||||||
def compute_break_penalties(self, query: qmod.QueryStruct) -> None:
|
def compute_break_penalties(self, query: qmod.QueryStruct) -> None:
|
||||||
""" Set the break penalties for the nodes in the query.
|
""" Set the break penalties for the nodes in the query.
|
||||||
|
|||||||
@@ -127,6 +127,12 @@ class Token(ABC):
|
|||||||
category objects.
|
category objects.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
def get_country(self) -> str:
|
||||||
|
""" Return the country code this token is associated with
|
||||||
|
(currently for country tokens only).
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
@dataclasses.dataclass
|
@dataclasses.dataclass
|
||||||
class TokenRange:
|
class TokenRange:
|
||||||
|
|||||||
@@ -475,20 +475,23 @@ class ICUNameAnalyzer(AbstractAnalyzer):
|
|||||||
assert self.conn is not None
|
assert self.conn is not None
|
||||||
word_tokens = set()
|
word_tokens = set()
|
||||||
for name in names:
|
for name in names:
|
||||||
norm_name = self._search_normalized(name.name)
|
norm_name = self._normalized(name.name)
|
||||||
if norm_name:
|
token_name = self._search_normalized(name.name)
|
||||||
word_tokens.add(norm_name)
|
if norm_name and token_name:
|
||||||
|
word_tokens.add((token_name, norm_name))
|
||||||
|
|
||||||
with self.conn.cursor() as cur:
|
with self.conn.cursor() as cur:
|
||||||
# Get existing names
|
# Get existing names
|
||||||
cur.execute("""SELECT word_token, coalesce(info ? 'internal', false) as is_internal
|
cur.execute("""SELECT word_token,
|
||||||
|
word as lookup,
|
||||||
|
coalesce(info ? 'internal', false) as is_internal
|
||||||
FROM word
|
FROM word
|
||||||
WHERE type = 'C' and word = %s""",
|
WHERE type = 'C' and info->>'cc' = %s""",
|
||||||
(country_code, ))
|
(country_code, ))
|
||||||
# internal/external names
|
# internal/external names
|
||||||
existing_tokens: Dict[bool, Set[str]] = {True: set(), False: set()}
|
existing_tokens: Dict[bool, Set[Tuple[str, str]]] = {True: set(), False: set()}
|
||||||
for word in cur:
|
for word in cur:
|
||||||
existing_tokens[word[1]].add(word[0])
|
existing_tokens[word[2]].add((word[0], word[1]))
|
||||||
|
|
||||||
# Delete names that no longer exist.
|
# Delete names that no longer exist.
|
||||||
gone_tokens = existing_tokens[internal] - word_tokens
|
gone_tokens = existing_tokens[internal] - word_tokens
|
||||||
@@ -496,10 +499,10 @@ class ICUNameAnalyzer(AbstractAnalyzer):
|
|||||||
gone_tokens.update(existing_tokens[False] & word_tokens)
|
gone_tokens.update(existing_tokens[False] & word_tokens)
|
||||||
if gone_tokens:
|
if gone_tokens:
|
||||||
cur.execute("""DELETE FROM word
|
cur.execute("""DELETE FROM word
|
||||||
USING unnest(%s::text[]) as token
|
USING jsonb_array_elements(%s) as data
|
||||||
WHERE type = 'C' and word = %s
|
WHERE type = 'C' and info->>'cc' = %s
|
||||||
and word_token = token""",
|
and word_token = data->>0 and word = data->>1""",
|
||||||
(list(gone_tokens), country_code))
|
(Jsonb(list(gone_tokens)), country_code))
|
||||||
|
|
||||||
# Only add those names that are not yet in the list.
|
# Only add those names that are not yet in the list.
|
||||||
new_tokens = word_tokens - existing_tokens[True]
|
new_tokens = word_tokens - existing_tokens[True]
|
||||||
@@ -508,15 +511,17 @@ class ICUNameAnalyzer(AbstractAnalyzer):
|
|||||||
if new_tokens:
|
if new_tokens:
|
||||||
if internal:
|
if internal:
|
||||||
sql = """INSERT INTO word (word_token, type, word, info)
|
sql = """INSERT INTO word (word_token, type, word, info)
|
||||||
(SELECT token, 'C', %s, '{"internal": "yes"}'
|
(SELECT data->>0, 'C', data->>1,
|
||||||
FROM unnest(%s::text[]) as token)
|
jsonb_build_object('internal', 'yes', 'cc', %s::text)
|
||||||
|
FROM jsonb_array_elements(%s) as data)
|
||||||
"""
|
"""
|
||||||
else:
|
else:
|
||||||
sql = """INSERT INTO word (word_token, type, word)
|
sql = """INSERT INTO word (word_token, type, word, info)
|
||||||
(SELECT token, 'C', %s
|
(SELECT data->>0, 'C', data->>1,
|
||||||
FROM unnest(%s::text[]) as token)
|
jsonb_build_object('cc', %s::text)
|
||||||
|
FROM jsonb_array_elements(%s) as data)
|
||||||
"""
|
"""
|
||||||
cur.execute(sql, (country_code, list(new_tokens)))
|
cur.execute(sql, (country_code, Jsonb(list(new_tokens))))
|
||||||
|
|
||||||
def process_place(self, place: PlaceInfo) -> Mapping[str, Any]:
|
def process_place(self, place: PlaceInfo) -> Mapping[str, Any]:
|
||||||
""" Determine tokenizer information about the given place.
|
""" Determine tokenizer information about the given place.
|
||||||
|
|||||||
@@ -2,7 +2,7 @@
|
|||||||
#
|
#
|
||||||
# This file is part of Nominatim. (https://nominatim.org)
|
# This file is part of Nominatim. (https://nominatim.org)
|
||||||
#
|
#
|
||||||
# Copyright (C) 2024 by the Nominatim developer community.
|
# Copyright (C) 2025 by the Nominatim developer community.
|
||||||
# For a full list of authors see the git log.
|
# For a full list of authors see the git log.
|
||||||
"""
|
"""
|
||||||
Functions for database migration to newer software versions.
|
Functions for database migration to newer software versions.
|
||||||
@@ -18,6 +18,7 @@ from ..db.connection import connect, Connection, \
|
|||||||
from ..db.sql_preprocessor import SQLPreprocessor
|
from ..db.sql_preprocessor import SQLPreprocessor
|
||||||
from ..version import NominatimVersion, NOMINATIM_VERSION, parse_version
|
from ..version import NominatimVersion, NOMINATIM_VERSION, parse_version
|
||||||
from ..tokenizer import factory as tokenizer_factory
|
from ..tokenizer import factory as tokenizer_factory
|
||||||
|
from ..data.country_info import create_country_names, setup_country_config
|
||||||
from . import refresh
|
from . import refresh
|
||||||
|
|
||||||
LOG = logging.getLogger()
|
LOG = logging.getLogger()
|
||||||
@@ -156,3 +157,25 @@ def create_place_entrance_table(conn: Connection, config: Configuration, **_: An
|
|||||||
CREATE UNIQUE INDEX place_entrance_osm_id_idx ON place_entrance
|
CREATE UNIQUE INDEX place_entrance_osm_id_idx ON place_entrance
|
||||||
USING BTREE (osm_id);
|
USING BTREE (osm_id);
|
||||||
""")
|
""")
|
||||||
|
|
||||||
|
|
||||||
|
@_migration(5, 2, 99, 1)
|
||||||
|
def convert_country_tokens(conn: Connection, config: Configuration, **_: Any) -> None:
|
||||||
|
""" Convert country word tokens
|
||||||
|
|
||||||
|
Country tokens now save the country in the info field instead of the
|
||||||
|
word. This migration removes all country tokens from the word table
|
||||||
|
and reimports the default country names. This means that custom names
|
||||||
|
are lost. If you need them back, invalidate the OSM objects containing
|
||||||
|
the names by setting indexed_status to 2 and then reindex the database.
|
||||||
|
"""
|
||||||
|
tokenizer = tokenizer_factory.get_tokenizer_for_db(config)
|
||||||
|
# There is only one tokenizer at the time of migration, so we make
|
||||||
|
# some assumptions here about the structure of the database. This will
|
||||||
|
# fail if somebody has written a custom tokenizer.
|
||||||
|
with conn.cursor() as cur:
|
||||||
|
cur.execute("DELETE FROM word WHERE type = 'C'")
|
||||||
|
conn.commit()
|
||||||
|
|
||||||
|
setup_country_config(config)
|
||||||
|
create_country_names(conn, tokenizer, config.get_str_list('LANGUAGES'))
|
||||||
|
|||||||
@@ -55,7 +55,7 @@ def parse_version(version: str) -> NominatimVersion:
|
|||||||
return NominatimVersion(*[int(x) for x in parts[:2] + parts[2].split('-')])
|
return NominatimVersion(*[int(x) for x in parts[:2] + parts[2].split('-')])
|
||||||
|
|
||||||
|
|
||||||
NOMINATIM_VERSION = parse_version('5.2.0-0')
|
NOMINATIM_VERSION = parse_version('5.2.99-0')
|
||||||
|
|
||||||
POSTGRESQL_REQUIRED_VERSION = (12, 0)
|
POSTGRESQL_REQUIRED_VERSION = (12, 0)
|
||||||
POSTGIS_REQUIRED_VERSION = (3, 0)
|
POSTGIS_REQUIRED_VERSION = (3, 0)
|
||||||
|
|||||||
@@ -17,6 +17,9 @@ class MyToken(query.Token):
|
|||||||
def get_category(self):
|
def get_category(self):
|
||||||
return 'this', 'that'
|
return 'this', 'that'
|
||||||
|
|
||||||
|
def get_country(self):
|
||||||
|
return 'cc'
|
||||||
|
|
||||||
|
|
||||||
def mktoken(tid: int):
|
def mktoken(tid: int):
|
||||||
return MyToken(penalty=3.0, token=tid, count=1, addr_count=1,
|
return MyToken(penalty=3.0, token=tid, count=1, addr_count=1,
|
||||||
|
|||||||
@@ -21,6 +21,9 @@ class MyToken(Token):
|
|||||||
def get_category(self):
|
def get_category(self):
|
||||||
return 'this', 'that'
|
return 'this', 'that'
|
||||||
|
|
||||||
|
def get_country(self):
|
||||||
|
return self.lookup_word
|
||||||
|
|
||||||
|
|
||||||
def make_query(*args):
|
def make_query(*args):
|
||||||
q = QueryStruct([Phrase(qmod.PHRASE_ANY, '')])
|
q = QueryStruct([Phrase(qmod.PHRASE_ANY, '')])
|
||||||
|
|||||||
@@ -20,6 +20,9 @@ class MyToken(Token):
|
|||||||
def get_category(self):
|
def get_category(self):
|
||||||
return 'this', 'that'
|
return 'this', 'that'
|
||||||
|
|
||||||
|
def get_country(self):
|
||||||
|
return 'cc'
|
||||||
|
|
||||||
|
|
||||||
def make_query(*args):
|
def make_query(*args):
|
||||||
q = QueryStruct([Phrase(args[0][1], '')])
|
q = QueryStruct([Phrase(args[0][1], '')])
|
||||||
|
|||||||
@@ -99,7 +99,7 @@ def test_address_simple_places(apiobj, frontend, atype, address, search):
|
|||||||
|
|
||||||
|
|
||||||
def test_address_country(apiobj, frontend):
|
def test_address_country(apiobj, frontend):
|
||||||
apiobj.add_word_table([(None, 'ro', 'C', 'ro', None)])
|
apiobj.add_word_table([(None, 'ro', 'C', 'ro', {'cc': 'ro'})])
|
||||||
apiobj.add_country('ro', 'POLYGON((0 0, 0 1, 1 1, 1 0, 0 0))')
|
apiobj.add_country('ro', 'POLYGON((0 0, 0 1, 1 1, 1 0, 0 0))')
|
||||||
apiobj.add_country_name('ro', {'name': 'România'})
|
apiobj.add_country_name('ro', {'name': 'România'})
|
||||||
|
|
||||||
|
|||||||
@@ -10,6 +10,8 @@ of the table.
|
|||||||
"""
|
"""
|
||||||
from nominatim_db.db.connection import execute_scalar
|
from nominatim_db.db.connection import execute_scalar
|
||||||
|
|
||||||
|
from psycopg.types.json import Jsonb
|
||||||
|
|
||||||
|
|
||||||
class MockIcuWordTable:
|
class MockIcuWordTable:
|
||||||
""" A word table for testing using legacy word table structure.
|
""" A word table for testing using legacy word table structure.
|
||||||
@@ -42,11 +44,11 @@ class MockIcuWordTable:
|
|||||||
""", (word_token, word, cls, typ, oper))
|
""", (word_token, word, cls, typ, oper))
|
||||||
self.conn.commit()
|
self.conn.commit()
|
||||||
|
|
||||||
def add_country(self, country_code, word_token):
|
def add_country(self, country_code, word_token, lookup):
|
||||||
with self.conn.cursor() as cur:
|
with self.conn.cursor() as cur:
|
||||||
cur.execute("""INSERT INTO word (word_token, type, word)
|
cur.execute("""INSERT INTO word (word_token, type, word, info)
|
||||||
VALUES(%s, 'C', %s)""",
|
VALUES(%s, 'C', %s, %s)""",
|
||||||
(word_token, country_code))
|
(word_token, lookup, Jsonb({'cc': country_code})))
|
||||||
self.conn.commit()
|
self.conn.commit()
|
||||||
|
|
||||||
def add_postcode(self, word_token, postcode):
|
def add_postcode(self, word_token, postcode):
|
||||||
@@ -93,7 +95,7 @@ class MockIcuWordTable:
|
|||||||
|
|
||||||
def get_country(self):
|
def get_country(self):
|
||||||
with self.conn.cursor() as cur:
|
with self.conn.cursor() as cur:
|
||||||
cur.execute("SELECT word, word_token FROM word WHERE type = 'C'")
|
cur.execute("SELECT info->>'cc', word_token, word FROM word WHERE type = 'C'")
|
||||||
result = set((tuple(row) for row in cur))
|
result = set((tuple(row) for row in cur))
|
||||||
assert len(result) == cur.rowcount, "Word table has duplicates."
|
assert len(result) == cur.rowcount, "Word table has duplicates."
|
||||||
return result
|
return result
|
||||||
|
|||||||
@@ -343,16 +343,18 @@ def test_add_country_names_new(analyzer, word_table):
|
|||||||
with analyzer() as anl:
|
with analyzer() as anl:
|
||||||
anl.add_country_names('es', {'name': 'Espagña', 'name:en': 'Spain'})
|
anl.add_country_names('es', {'name': 'Espagña', 'name:en': 'Spain'})
|
||||||
|
|
||||||
assert word_table.get_country() == {('es', 'ESPAGÑA'), ('es', 'SPAIN')}
|
assert word_table.get_country() == {('es', 'ESPAGÑA', 'Espagña'),
|
||||||
|
('es', 'SPAIN', 'Spain')}
|
||||||
|
|
||||||
|
|
||||||
def test_add_country_names_extend(analyzer, word_table):
|
def test_add_country_names_extend(analyzer, word_table):
|
||||||
word_table.add_country('ch', 'SCHWEIZ')
|
word_table.add_country('ch', 'SCHWEIZ', 'Schweiz')
|
||||||
|
|
||||||
with analyzer() as anl:
|
with analyzer() as anl:
|
||||||
anl.add_country_names('ch', {'name': 'Schweiz', 'name:fr': 'Suisse'})
|
anl.add_country_names('ch', {'name': 'Schweiz', 'name:fr': 'Suisse'})
|
||||||
|
|
||||||
assert word_table.get_country() == {('ch', 'SCHWEIZ'), ('ch', 'SUISSE')}
|
assert word_table.get_country() == {('ch', 'SCHWEIZ', 'Schweiz'),
|
||||||
|
('ch', 'SUISSE', 'Suisse')}
|
||||||
|
|
||||||
|
|
||||||
class TestPlaceNames:
|
class TestPlaceNames:
|
||||||
@@ -403,7 +405,7 @@ class TestPlaceNames:
|
|||||||
info = self.analyzer.process_place(place)
|
info = self.analyzer.process_place(place)
|
||||||
|
|
||||||
self.expect_name_terms(info, '#norge', 'norge')
|
self.expect_name_terms(info, '#norge', 'norge')
|
||||||
assert word_table.get_country() == {('no', 'NORGE')}
|
assert word_table.get_country() == {('no', 'NORGE', 'Norge')}
|
||||||
|
|
||||||
|
|
||||||
class TestPlaceAddress:
|
class TestPlaceAddress:
|
||||||
|
|||||||
Reference in New Issue
Block a user