mirror of
https://github.com/osm-search/Nominatim.git
synced 2026-02-14 18:37:58 +00:00
Compare commits
7 Commits
922667b650
...
96d04e3a2e
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
96d04e3a2e | ||
|
|
23db1ab981 | ||
|
|
cd1b1736a9 | ||
|
|
9447c90b09 | ||
|
|
81c6cb72e6 | ||
|
|
f2a122c5c0 | ||
|
|
57ef0e1f98 |
@@ -117,6 +117,7 @@ module.MAIN_TAGS.all_boundaries = {
|
||||
boundary = {'named',
|
||||
place = 'delete',
|
||||
land_area = 'delete',
|
||||
protected_area = 'fallback',
|
||||
postal_code = 'always'},
|
||||
landuse = 'fallback',
|
||||
place = 'always'
|
||||
@@ -198,7 +199,7 @@ module.MAIN_TAGS_POIS = function (group)
|
||||
no = group},
|
||||
landuse = {cemetery = 'always'},
|
||||
leisure = {'always',
|
||||
nature_reserve = 'fallback',
|
||||
nature_reserve = 'named',
|
||||
swimming_pool = 'named',
|
||||
garden = 'named',
|
||||
common = 'named',
|
||||
|
||||
@@ -15,7 +15,7 @@ classifiers = [
|
||||
"Operating System :: OS Independent",
|
||||
]
|
||||
dependencies = [
|
||||
"psycopg",
|
||||
"psycopg<3.3",
|
||||
"python-dotenv",
|
||||
"jinja2",
|
||||
"pyYAML>=5.1",
|
||||
|
||||
@@ -374,7 +374,7 @@ class SearchBuilder:
|
||||
tokens = self.get_country_tokens(assignment.country)
|
||||
if not tokens:
|
||||
return None
|
||||
sdata.set_strings('countries', tokens)
|
||||
sdata.set_countries(tokens)
|
||||
sdata.penalty += self.query.get_in_word_penalty(assignment.country)
|
||||
elif self.details.countries:
|
||||
sdata.countries = dbf.WeightedStrings(self.details.countries,
|
||||
|
||||
@@ -244,6 +244,21 @@ class SearchData:
|
||||
|
||||
setattr(self, field, wstrs)
|
||||
|
||||
def set_countries(self, tokens: List[Token]) -> None:
|
||||
""" Set the WeightedStrings properties for countries. Multiple
|
||||
entries for the same country are deduplicated and the minimum
|
||||
penalty is used. Adapts the global penalty, so that the
|
||||
minimum penalty is 0.
|
||||
"""
|
||||
if tokens:
|
||||
min_penalty = min(t.penalty for t in tokens)
|
||||
self.penalty += min_penalty
|
||||
countries: dict[str, float] = {}
|
||||
for t in tokens:
|
||||
cc = t.get_country()
|
||||
countries[cc] = min(t.penalty - min_penalty, countries.get(cc, 10000))
|
||||
self.countries = WeightedStrings(list(countries.keys()), list(countries.values()))
|
||||
|
||||
def set_qualifiers(self, tokens: List[Token]) -> None:
|
||||
""" Set the qualifier field from the given tokens.
|
||||
"""
|
||||
|
||||
@@ -59,12 +59,16 @@ class ICUToken(qmod.Token):
|
||||
assert self.info
|
||||
return self.info.get('class', ''), self.info.get('type', '')
|
||||
|
||||
def rematch(self, norm: str) -> None:
|
||||
def get_country(self) -> str:
|
||||
assert self.info
|
||||
return cast(str, self.info.get('cc', ''))
|
||||
|
||||
def match_penalty(self, norm: str) -> float:
|
||||
""" Check how well the token matches the given normalized string
|
||||
and add a penalty, if necessary.
|
||||
"""
|
||||
if not self.lookup_word:
|
||||
return
|
||||
return 0.0
|
||||
|
||||
seq = difflib.SequenceMatcher(a=self.lookup_word, b=norm)
|
||||
distance = 0
|
||||
@@ -75,7 +79,7 @@ class ICUToken(qmod.Token):
|
||||
distance += max((ato-afrom), (bto-bfrom))
|
||||
elif tag != 'equal':
|
||||
distance += abs((ato-afrom) - (bto-bfrom))
|
||||
self.penalty += (distance/len(self.lookup_word))
|
||||
return (distance/len(self.lookup_word))
|
||||
|
||||
@staticmethod
|
||||
def from_db_row(row: SaRow) -> 'ICUToken':
|
||||
@@ -330,9 +334,10 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
|
||||
norm = ''.join(f"{n.term_normalized}{'' if n.btype == qmod.BREAK_TOKEN else ' '}"
|
||||
for n in query.nodes[start + 1:end + 1]).strip()
|
||||
for ttype, tokens in tlist.items():
|
||||
if ttype != qmod.TOKEN_COUNTRY:
|
||||
for token in tokens:
|
||||
cast(ICUToken, token).rematch(norm)
|
||||
for token in tokens:
|
||||
itok = cast(ICUToken, token)
|
||||
itok.penalty += itok.match_penalty(norm) * \
|
||||
(1 if ttype in (qmod.TOKEN_WORD, qmod.TOKEN_PARTIAL) else 2)
|
||||
|
||||
def compute_break_penalties(self, query: qmod.QueryStruct) -> None:
|
||||
""" Set the break penalties for the nodes in the query.
|
||||
|
||||
@@ -127,6 +127,12 @@ class Token(ABC):
|
||||
category objects.
|
||||
"""
|
||||
|
||||
@abstractmethod
|
||||
def get_country(self) -> str:
|
||||
""" Return the country code this token is associated with
|
||||
(currently for country tokens only).
|
||||
"""
|
||||
|
||||
|
||||
@dataclasses.dataclass
|
||||
class TokenRange:
|
||||
|
||||
@@ -475,20 +475,23 @@ class ICUNameAnalyzer(AbstractAnalyzer):
|
||||
assert self.conn is not None
|
||||
word_tokens = set()
|
||||
for name in names:
|
||||
norm_name = self._search_normalized(name.name)
|
||||
if norm_name:
|
||||
word_tokens.add(norm_name)
|
||||
norm_name = self._normalized(name.name)
|
||||
token_name = self._search_normalized(name.name)
|
||||
if norm_name and token_name:
|
||||
word_tokens.add((token_name, norm_name))
|
||||
|
||||
with self.conn.cursor() as cur:
|
||||
# Get existing names
|
||||
cur.execute("""SELECT word_token, coalesce(info ? 'internal', false) as is_internal
|
||||
cur.execute("""SELECT word_token,
|
||||
word as lookup,
|
||||
coalesce(info ? 'internal', false) as is_internal
|
||||
FROM word
|
||||
WHERE type = 'C' and word = %s""",
|
||||
WHERE type = 'C' and info->>'cc' = %s""",
|
||||
(country_code, ))
|
||||
# internal/external names
|
||||
existing_tokens: Dict[bool, Set[str]] = {True: set(), False: set()}
|
||||
existing_tokens: Dict[bool, Set[Tuple[str, str]]] = {True: set(), False: set()}
|
||||
for word in cur:
|
||||
existing_tokens[word[1]].add(word[0])
|
||||
existing_tokens[word[2]].add((word[0], word[1]))
|
||||
|
||||
# Delete names that no longer exist.
|
||||
gone_tokens = existing_tokens[internal] - word_tokens
|
||||
@@ -496,10 +499,10 @@ class ICUNameAnalyzer(AbstractAnalyzer):
|
||||
gone_tokens.update(existing_tokens[False] & word_tokens)
|
||||
if gone_tokens:
|
||||
cur.execute("""DELETE FROM word
|
||||
USING unnest(%s::text[]) as token
|
||||
WHERE type = 'C' and word = %s
|
||||
and word_token = token""",
|
||||
(list(gone_tokens), country_code))
|
||||
USING jsonb_array_elements(%s) as data
|
||||
WHERE type = 'C' and info->>'cc' = %s
|
||||
and word_token = data->>0 and word = data->>1""",
|
||||
(Jsonb(list(gone_tokens)), country_code))
|
||||
|
||||
# Only add those names that are not yet in the list.
|
||||
new_tokens = word_tokens - existing_tokens[True]
|
||||
@@ -508,15 +511,17 @@ class ICUNameAnalyzer(AbstractAnalyzer):
|
||||
if new_tokens:
|
||||
if internal:
|
||||
sql = """INSERT INTO word (word_token, type, word, info)
|
||||
(SELECT token, 'C', %s, '{"internal": "yes"}'
|
||||
FROM unnest(%s::text[]) as token)
|
||||
(SELECT data->>0, 'C', data->>1,
|
||||
jsonb_build_object('internal', 'yes', 'cc', %s::text)
|
||||
FROM jsonb_array_elements(%s) as data)
|
||||
"""
|
||||
else:
|
||||
sql = """INSERT INTO word (word_token, type, word)
|
||||
(SELECT token, 'C', %s
|
||||
FROM unnest(%s::text[]) as token)
|
||||
sql = """INSERT INTO word (word_token, type, word, info)
|
||||
(SELECT data->>0, 'C', data->>1,
|
||||
jsonb_build_object('cc', %s::text)
|
||||
FROM jsonb_array_elements(%s) as data)
|
||||
"""
|
||||
cur.execute(sql, (country_code, list(new_tokens)))
|
||||
cur.execute(sql, (country_code, Jsonb(list(new_tokens))))
|
||||
|
||||
def process_place(self, place: PlaceInfo) -> Mapping[str, Any]:
|
||||
""" Determine tokenizer information about the given place.
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
#
|
||||
# This file is part of Nominatim. (https://nominatim.org)
|
||||
#
|
||||
# Copyright (C) 2024 by the Nominatim developer community.
|
||||
# Copyright (C) 2025 by the Nominatim developer community.
|
||||
# For a full list of authors see the git log.
|
||||
"""
|
||||
Functions for database migration to newer software versions.
|
||||
@@ -18,6 +18,7 @@ from ..db.connection import connect, Connection, \
|
||||
from ..db.sql_preprocessor import SQLPreprocessor
|
||||
from ..version import NominatimVersion, NOMINATIM_VERSION, parse_version
|
||||
from ..tokenizer import factory as tokenizer_factory
|
||||
from ..data.country_info import create_country_names, setup_country_config
|
||||
from . import refresh
|
||||
|
||||
LOG = logging.getLogger()
|
||||
@@ -156,3 +157,25 @@ def create_place_entrance_table(conn: Connection, config: Configuration, **_: An
|
||||
CREATE UNIQUE INDEX place_entrance_osm_id_idx ON place_entrance
|
||||
USING BTREE (osm_id);
|
||||
""")
|
||||
|
||||
|
||||
@_migration(5, 2, 99, 1)
|
||||
def convert_country_tokens(conn: Connection, config: Configuration, **_: Any) -> None:
|
||||
""" Convert country word tokens
|
||||
|
||||
Country tokens now save the country in the info field instead of the
|
||||
word. This migration removes all country tokens from the word table
|
||||
and reimports the default country names. This means that custom names
|
||||
are lost. If you need them back, invalidate the OSM objects containing
|
||||
the names by setting indexed_status to 2 and then reindex the database.
|
||||
"""
|
||||
tokenizer = tokenizer_factory.get_tokenizer_for_db(config)
|
||||
# There is only one tokenizer at the time of migration, so we make
|
||||
# some assumptions here about the structure of the database. This will
|
||||
# fail if somebody has written a custom tokenizer.
|
||||
with conn.cursor() as cur:
|
||||
cur.execute("DELETE FROM word WHERE type = 'C'")
|
||||
conn.commit()
|
||||
|
||||
setup_country_config(config)
|
||||
create_country_names(conn, tokenizer, config.get_str_list('LANGUAGES'))
|
||||
|
||||
@@ -55,7 +55,7 @@ def parse_version(version: str) -> NominatimVersion:
|
||||
return NominatimVersion(*[int(x) for x in parts[:2] + parts[2].split('-')])
|
||||
|
||||
|
||||
NOMINATIM_VERSION = parse_version('5.2.0-0')
|
||||
NOMINATIM_VERSION = parse_version('5.2.99-0')
|
||||
|
||||
POSTGRESQL_REQUIRED_VERSION = (12, 0)
|
||||
POSTGIS_REQUIRED_VERSION = (3, 0)
|
||||
|
||||
@@ -17,6 +17,9 @@ class MyToken(query.Token):
|
||||
def get_category(self):
|
||||
return 'this', 'that'
|
||||
|
||||
def get_country(self):
|
||||
return 'cc'
|
||||
|
||||
|
||||
def mktoken(tid: int):
|
||||
return MyToken(penalty=3.0, token=tid, count=1, addr_count=1,
|
||||
|
||||
@@ -21,6 +21,9 @@ class MyToken(Token):
|
||||
def get_category(self):
|
||||
return 'this', 'that'
|
||||
|
||||
def get_country(self):
|
||||
return self.lookup_word
|
||||
|
||||
|
||||
def make_query(*args):
|
||||
q = QueryStruct([Phrase(qmod.PHRASE_ANY, '')])
|
||||
|
||||
@@ -20,6 +20,9 @@ class MyToken(Token):
|
||||
def get_category(self):
|
||||
return 'this', 'that'
|
||||
|
||||
def get_country(self):
|
||||
return 'cc'
|
||||
|
||||
|
||||
def make_query(*args):
|
||||
q = QueryStruct([Phrase(args[0][1], '')])
|
||||
|
||||
@@ -99,7 +99,7 @@ def test_address_simple_places(apiobj, frontend, atype, address, search):
|
||||
|
||||
|
||||
def test_address_country(apiobj, frontend):
|
||||
apiobj.add_word_table([(None, 'ro', 'C', 'ro', None)])
|
||||
apiobj.add_word_table([(None, 'ro', 'C', 'ro', {'cc': 'ro'})])
|
||||
apiobj.add_country('ro', 'POLYGON((0 0, 0 1, 1 1, 1 0, 0 0))')
|
||||
apiobj.add_country_name('ro', {'name': 'România'})
|
||||
|
||||
|
||||
@@ -10,6 +10,8 @@ of the table.
|
||||
"""
|
||||
from nominatim_db.db.connection import execute_scalar
|
||||
|
||||
from psycopg.types.json import Jsonb
|
||||
|
||||
|
||||
class MockIcuWordTable:
|
||||
""" A word table for testing using legacy word table structure.
|
||||
@@ -42,11 +44,11 @@ class MockIcuWordTable:
|
||||
""", (word_token, word, cls, typ, oper))
|
||||
self.conn.commit()
|
||||
|
||||
def add_country(self, country_code, word_token):
|
||||
def add_country(self, country_code, word_token, lookup):
|
||||
with self.conn.cursor() as cur:
|
||||
cur.execute("""INSERT INTO word (word_token, type, word)
|
||||
VALUES(%s, 'C', %s)""",
|
||||
(word_token, country_code))
|
||||
cur.execute("""INSERT INTO word (word_token, type, word, info)
|
||||
VALUES(%s, 'C', %s, %s)""",
|
||||
(word_token, lookup, Jsonb({'cc': country_code})))
|
||||
self.conn.commit()
|
||||
|
||||
def add_postcode(self, word_token, postcode):
|
||||
@@ -93,7 +95,7 @@ class MockIcuWordTable:
|
||||
|
||||
def get_country(self):
|
||||
with self.conn.cursor() as cur:
|
||||
cur.execute("SELECT word, word_token FROM word WHERE type = 'C'")
|
||||
cur.execute("SELECT info->>'cc', word_token, word FROM word WHERE type = 'C'")
|
||||
result = set((tuple(row) for row in cur))
|
||||
assert len(result) == cur.rowcount, "Word table has duplicates."
|
||||
return result
|
||||
|
||||
@@ -343,16 +343,18 @@ def test_add_country_names_new(analyzer, word_table):
|
||||
with analyzer() as anl:
|
||||
anl.add_country_names('es', {'name': 'Espagña', 'name:en': 'Spain'})
|
||||
|
||||
assert word_table.get_country() == {('es', 'ESPAGÑA'), ('es', 'SPAIN')}
|
||||
assert word_table.get_country() == {('es', 'ESPAGÑA', 'Espagña'),
|
||||
('es', 'SPAIN', 'Spain')}
|
||||
|
||||
|
||||
def test_add_country_names_extend(analyzer, word_table):
|
||||
word_table.add_country('ch', 'SCHWEIZ')
|
||||
word_table.add_country('ch', 'SCHWEIZ', 'Schweiz')
|
||||
|
||||
with analyzer() as anl:
|
||||
anl.add_country_names('ch', {'name': 'Schweiz', 'name:fr': 'Suisse'})
|
||||
|
||||
assert word_table.get_country() == {('ch', 'SCHWEIZ'), ('ch', 'SUISSE')}
|
||||
assert word_table.get_country() == {('ch', 'SCHWEIZ', 'Schweiz'),
|
||||
('ch', 'SUISSE', 'Suisse')}
|
||||
|
||||
|
||||
class TestPlaceNames:
|
||||
@@ -403,7 +405,7 @@ class TestPlaceNames:
|
||||
info = self.analyzer.process_place(place)
|
||||
|
||||
self.expect_name_terms(info, '#norge', 'norge')
|
||||
assert word_table.get_country() == {('no', 'NORGE')}
|
||||
assert word_table.get_country() == {('no', 'NORGE', 'Norge')}
|
||||
|
||||
|
||||
class TestPlaceAddress:
|
||||
|
||||
Reference in New Issue
Block a user