mirror of
https://github.com/osm-search/Nominatim.git
synced 2026-03-11 21:34:06 +00:00
add normalised country name to word table
Country tokens now follow the usual convention of having the normalized version in the word column and the extra info about the country code in the info column.
This commit is contained in:
@@ -374,7 +374,7 @@ class SearchBuilder:
|
|||||||
tokens = self.get_country_tokens(assignment.country)
|
tokens = self.get_country_tokens(assignment.country)
|
||||||
if not tokens:
|
if not tokens:
|
||||||
return None
|
return None
|
||||||
sdata.set_strings('countries', tokens)
|
sdata.set_countries(tokens)
|
||||||
sdata.penalty += self.query.get_in_word_penalty(assignment.country)
|
sdata.penalty += self.query.get_in_word_penalty(assignment.country)
|
||||||
elif self.details.countries:
|
elif self.details.countries:
|
||||||
sdata.countries = dbf.WeightedStrings(self.details.countries,
|
sdata.countries = dbf.WeightedStrings(self.details.countries,
|
||||||
|
|||||||
@@ -244,6 +244,21 @@ class SearchData:
|
|||||||
|
|
||||||
setattr(self, field, wstrs)
|
setattr(self, field, wstrs)
|
||||||
|
|
||||||
|
def set_countries(self, tokens: List[Token]) -> None:
|
||||||
|
""" Set the WeightedStrings properties for countries. Multiple
|
||||||
|
entries for the same country are deduplicated and the minimum
|
||||||
|
penalty is used. Adapts the global penalty, so that the
|
||||||
|
minimum penalty is 0.
|
||||||
|
"""
|
||||||
|
if tokens:
|
||||||
|
min_penalty = min(t.penalty for t in tokens)
|
||||||
|
self.penalty += min_penalty
|
||||||
|
countries: dict[str, float] = {}
|
||||||
|
for t in tokens:
|
||||||
|
cc = t.get_country()
|
||||||
|
countries[cc] = min(t.penalty - min_penalty, countries.get(cc, 10000))
|
||||||
|
self.countries = WeightedStrings(list(countries.keys()), list(countries.values()))
|
||||||
|
|
||||||
def set_qualifiers(self, tokens: List[Token]) -> None:
|
def set_qualifiers(self, tokens: List[Token]) -> None:
|
||||||
""" Set the qulaifier field from the given tokens.
|
""" Set the qulaifier field from the given tokens.
|
||||||
"""
|
"""
|
||||||
|
|||||||
@@ -59,12 +59,16 @@ class ICUToken(qmod.Token):
|
|||||||
assert self.info
|
assert self.info
|
||||||
return self.info.get('class', ''), self.info.get('type', '')
|
return self.info.get('class', ''), self.info.get('type', '')
|
||||||
|
|
||||||
def rematch(self, norm: str) -> None:
|
def get_country(self) -> str:
|
||||||
|
assert self.info
|
||||||
|
return cast(str, self.info.get('cc', ''))
|
||||||
|
|
||||||
|
def match_penalty(self, norm: str) -> float:
|
||||||
""" Check how well the token matches the given normalized string
|
""" Check how well the token matches the given normalized string
|
||||||
and add a penalty, if necessary.
|
and add a penalty, if necessary.
|
||||||
"""
|
"""
|
||||||
if not self.lookup_word:
|
if not self.lookup_word:
|
||||||
return
|
return 0.0
|
||||||
|
|
||||||
seq = difflib.SequenceMatcher(a=self.lookup_word, b=norm)
|
seq = difflib.SequenceMatcher(a=self.lookup_word, b=norm)
|
||||||
distance = 0
|
distance = 0
|
||||||
@@ -75,7 +79,7 @@ class ICUToken(qmod.Token):
|
|||||||
distance += max((ato-afrom), (bto-bfrom))
|
distance += max((ato-afrom), (bto-bfrom))
|
||||||
elif tag != 'equal':
|
elif tag != 'equal':
|
||||||
distance += abs((ato-afrom) - (bto-bfrom))
|
distance += abs((ato-afrom) - (bto-bfrom))
|
||||||
self.penalty += (distance/len(self.lookup_word))
|
return (distance/len(self.lookup_word))
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def from_db_row(row: SaRow) -> 'ICUToken':
|
def from_db_row(row: SaRow) -> 'ICUToken':
|
||||||
@@ -330,9 +334,10 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
|
|||||||
norm = ''.join(f"{n.term_normalized}{'' if n.btype == qmod.BREAK_TOKEN else ' '}"
|
norm = ''.join(f"{n.term_normalized}{'' if n.btype == qmod.BREAK_TOKEN else ' '}"
|
||||||
for n in query.nodes[start + 1:end + 1]).strip()
|
for n in query.nodes[start + 1:end + 1]).strip()
|
||||||
for ttype, tokens in tlist.items():
|
for ttype, tokens in tlist.items():
|
||||||
if ttype != qmod.TOKEN_COUNTRY:
|
for token in tokens:
|
||||||
for token in tokens:
|
itok = cast(ICUToken, token)
|
||||||
cast(ICUToken, token).rematch(norm)
|
itok.penalty += itok.match_penalty(norm) * \
|
||||||
|
(1 if ttype in (qmod.TOKEN_WORD, qmod.TOKEN_PARTIAL) else 2)
|
||||||
|
|
||||||
def compute_break_penalties(self, query: qmod.QueryStruct) -> None:
|
def compute_break_penalties(self, query: qmod.QueryStruct) -> None:
|
||||||
""" Set the break penalties for the nodes in the query.
|
""" Set the break penalties for the nodes in the query.
|
||||||
|
|||||||
@@ -127,6 +127,12 @@ class Token(ABC):
|
|||||||
category objects.
|
category objects.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
def get_country(self) -> str:
|
||||||
|
""" Return the country code this tojen is associated with
|
||||||
|
(currently for country tokens only).
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
@dataclasses.dataclass
|
@dataclasses.dataclass
|
||||||
class TokenRange:
|
class TokenRange:
|
||||||
|
|||||||
@@ -475,20 +475,23 @@ class ICUNameAnalyzer(AbstractAnalyzer):
|
|||||||
assert self.conn is not None
|
assert self.conn is not None
|
||||||
word_tokens = set()
|
word_tokens = set()
|
||||||
for name in names:
|
for name in names:
|
||||||
norm_name = self._search_normalized(name.name)
|
norm_name = self._normalized(name.name)
|
||||||
if norm_name:
|
token_name = self._search_normalized(name.name)
|
||||||
word_tokens.add(norm_name)
|
if norm_name and token_name:
|
||||||
|
word_tokens.add((token_name, norm_name))
|
||||||
|
|
||||||
with self.conn.cursor() as cur:
|
with self.conn.cursor() as cur:
|
||||||
# Get existing names
|
# Get existing names
|
||||||
cur.execute("""SELECT word_token, coalesce(info ? 'internal', false) as is_internal
|
cur.execute("""SELECT word_token,
|
||||||
|
word as lookup,
|
||||||
|
coalesce(info ? 'internal', false) as is_internal
|
||||||
FROM word
|
FROM word
|
||||||
WHERE type = 'C' and word = %s""",
|
WHERE type = 'C' and info->>'cc' = %s""",
|
||||||
(country_code, ))
|
(country_code, ))
|
||||||
# internal/external names
|
# internal/external names
|
||||||
existing_tokens: Dict[bool, Set[str]] = {True: set(), False: set()}
|
existing_tokens: Dict[bool, Set[Tuple[str, str]]] = {True: set(), False: set()}
|
||||||
for word in cur:
|
for word in cur:
|
||||||
existing_tokens[word[1]].add(word[0])
|
existing_tokens[word[2]].add((word[0], word[1]))
|
||||||
|
|
||||||
# Delete names that no longer exist.
|
# Delete names that no longer exist.
|
||||||
gone_tokens = existing_tokens[internal] - word_tokens
|
gone_tokens = existing_tokens[internal] - word_tokens
|
||||||
@@ -496,10 +499,10 @@ class ICUNameAnalyzer(AbstractAnalyzer):
|
|||||||
gone_tokens.update(existing_tokens[False] & word_tokens)
|
gone_tokens.update(existing_tokens[False] & word_tokens)
|
||||||
if gone_tokens:
|
if gone_tokens:
|
||||||
cur.execute("""DELETE FROM word
|
cur.execute("""DELETE FROM word
|
||||||
USING unnest(%s::text[]) as token
|
USING jsonb_array_elements(%s) as data
|
||||||
WHERE type = 'C' and word = %s
|
WHERE type = 'C' and info->>'cc' = %s
|
||||||
and word_token = token""",
|
and word_token = data->>0 and word = data->>1""",
|
||||||
(list(gone_tokens), country_code))
|
(Jsonb(list(gone_tokens)), country_code))
|
||||||
|
|
||||||
# Only add those names that are not yet in the list.
|
# Only add those names that are not yet in the list.
|
||||||
new_tokens = word_tokens - existing_tokens[True]
|
new_tokens = word_tokens - existing_tokens[True]
|
||||||
@@ -508,15 +511,17 @@ class ICUNameAnalyzer(AbstractAnalyzer):
|
|||||||
if new_tokens:
|
if new_tokens:
|
||||||
if internal:
|
if internal:
|
||||||
sql = """INSERT INTO word (word_token, type, word, info)
|
sql = """INSERT INTO word (word_token, type, word, info)
|
||||||
(SELECT token, 'C', %s, '{"internal": "yes"}'
|
(SELECT data->>0, 'C', data->>1,
|
||||||
FROM unnest(%s::text[]) as token)
|
jsonb_build_object('internal', 'yes', 'cc', %s::text)
|
||||||
|
FROM jsonb_array_elements(%s) as data)
|
||||||
"""
|
"""
|
||||||
else:
|
else:
|
||||||
sql = """INSERT INTO word (word_token, type, word)
|
sql = """INSERT INTO word (word_token, type, word, info)
|
||||||
(SELECT token, 'C', %s
|
(SELECT data->>0, 'C', data->>1,
|
||||||
FROM unnest(%s::text[]) as token)
|
jsonb_build_object('cc', %s::text)
|
||||||
|
FROM jsonb_array_elements(%s) as data)
|
||||||
"""
|
"""
|
||||||
cur.execute(sql, (country_code, list(new_tokens)))
|
cur.execute(sql, (country_code, Jsonb(list(new_tokens))))
|
||||||
|
|
||||||
def process_place(self, place: PlaceInfo) -> Mapping[str, Any]:
|
def process_place(self, place: PlaceInfo) -> Mapping[str, Any]:
|
||||||
""" Determine tokenizer information about the given place.
|
""" Determine tokenizer information about the given place.
|
||||||
|
|||||||
@@ -10,6 +10,8 @@ of the table.
|
|||||||
"""
|
"""
|
||||||
from nominatim_db.db.connection import execute_scalar
|
from nominatim_db.db.connection import execute_scalar
|
||||||
|
|
||||||
|
from psycopg.types.json import Jsonb
|
||||||
|
|
||||||
|
|
||||||
class MockIcuWordTable:
|
class MockIcuWordTable:
|
||||||
""" A word table for testing using legacy word table structure.
|
""" A word table for testing using legacy word table structure.
|
||||||
@@ -42,11 +44,11 @@ class MockIcuWordTable:
|
|||||||
""", (word_token, word, cls, typ, oper))
|
""", (word_token, word, cls, typ, oper))
|
||||||
self.conn.commit()
|
self.conn.commit()
|
||||||
|
|
||||||
def add_country(self, country_code, word_token):
|
def add_country(self, country_code, word_token, lookup):
|
||||||
with self.conn.cursor() as cur:
|
with self.conn.cursor() as cur:
|
||||||
cur.execute("""INSERT INTO word (word_token, type, word)
|
cur.execute("""INSERT INTO word (word_token, type, word, info)
|
||||||
VALUES(%s, 'C', %s)""",
|
VALUES(%s, 'C', %s, %s)""",
|
||||||
(word_token, country_code))
|
(word_token, lookup, Jsonb({'cc': country_code})))
|
||||||
self.conn.commit()
|
self.conn.commit()
|
||||||
|
|
||||||
def add_postcode(self, word_token, postcode):
|
def add_postcode(self, word_token, postcode):
|
||||||
@@ -93,7 +95,7 @@ class MockIcuWordTable:
|
|||||||
|
|
||||||
def get_country(self):
|
def get_country(self):
|
||||||
with self.conn.cursor() as cur:
|
with self.conn.cursor() as cur:
|
||||||
cur.execute("SELECT word, word_token FROM word WHERE type = 'C'")
|
cur.execute("SELECT info->>'cc', word_token, word FROM word WHERE type = 'C'")
|
||||||
result = set((tuple(row) for row in cur))
|
result = set((tuple(row) for row in cur))
|
||||||
assert len(result) == cur.rowcount, "Word table has duplicates."
|
assert len(result) == cur.rowcount, "Word table has duplicates."
|
||||||
return result
|
return result
|
||||||
|
|||||||
@@ -343,16 +343,18 @@ def test_add_country_names_new(analyzer, word_table):
|
|||||||
with analyzer() as anl:
|
with analyzer() as anl:
|
||||||
anl.add_country_names('es', {'name': 'Espagña', 'name:en': 'Spain'})
|
anl.add_country_names('es', {'name': 'Espagña', 'name:en': 'Spain'})
|
||||||
|
|
||||||
assert word_table.get_country() == {('es', 'ESPAGÑA'), ('es', 'SPAIN')}
|
assert word_table.get_country() == {('es', 'ESPAGÑA', 'Espagña'),
|
||||||
|
('es', 'SPAIN', 'Spain')}
|
||||||
|
|
||||||
|
|
||||||
def test_add_country_names_extend(analyzer, word_table):
|
def test_add_country_names_extend(analyzer, word_table):
|
||||||
word_table.add_country('ch', 'SCHWEIZ')
|
word_table.add_country('ch', 'SCHWEIZ', 'Schweiz')
|
||||||
|
|
||||||
with analyzer() as anl:
|
with analyzer() as anl:
|
||||||
anl.add_country_names('ch', {'name': 'Schweiz', 'name:fr': 'Suisse'})
|
anl.add_country_names('ch', {'name': 'Schweiz', 'name:fr': 'Suisse'})
|
||||||
|
|
||||||
assert word_table.get_country() == {('ch', 'SCHWEIZ'), ('ch', 'SUISSE')}
|
assert word_table.get_country() == {('ch', 'SCHWEIZ', 'Schweiz'),
|
||||||
|
('ch', 'SUISSE', 'Suisse')}
|
||||||
|
|
||||||
|
|
||||||
class TestPlaceNames:
|
class TestPlaceNames:
|
||||||
|
|||||||
Reference in New Issue
Block a user