support arbitrary prefixes in country name list

This means we can now get rid of the last special cases for names.
This commit is contained in:
Sarah Hoffmann
2022-05-05 17:16:15 +02:00
parent 3a8ddf736e
commit 9d468f6da0
4 changed files with 148 additions and 80 deletions

View File

@@ -11,6 +11,30 @@ import psycopg2.extras
from nominatim.db import utils as db_utils from nominatim.db import utils as db_utils
from nominatim.db.connection import connect from nominatim.db.connection import connect
from nominatim.errors import UsageError
def _flatten_name_list(names):
if names is None:
return {}
if not isinstance(names, dict):
raise UsageError("Expected key-value list for names in country_settings.py")
flat = {}
for prefix, remain in names.items():
if isinstance(remain, str):
flat[prefix] = remain
elif not isinstance(remain, dict):
raise UsageError("Entries in names must be key-value lists.")
else:
for suffix, name in remain.items():
if suffix == 'default':
flat[prefix] = name
else:
flat[f'{prefix}:{suffix}'] = name
return flat
class _CountryInfo: class _CountryInfo:
@@ -19,7 +43,7 @@ class _CountryInfo:
def __init__(self): def __init__(self):
self._info = {} self._info = {}
self._key_prefix = 'name'
def load(self, config): def load(self, config):
""" Load the country properties from the configuration files, """ Load the country properties from the configuration files,
@@ -27,26 +51,26 @@ class _CountryInfo:
""" """
if not self._info: if not self._info:
self._info = config.load_sub_configuration('country_settings.yaml') self._info = config.load_sub_configuration('country_settings.yaml')
# Convert languages into a list for simpler handling.
for prop in self._info.values(): for prop in self._info.values():
# Convert languages into a list for simpler handling.
if 'languages' not in prop: if 'languages' not in prop:
prop['languages'] = [] prop['languages'] = []
elif not isinstance(prop['languages'], list): elif not isinstance(prop['languages'], list):
prop['languages'] = [x.strip() prop['languages'] = [x.strip()
for x in prop['languages'].split(',')] for x in prop['languages'].split(',')]
if 'names' not in prop or prop['names'] is None: prop['names'] = _flatten_name_list(prop.get('names'))
prop['names'] = {self._key_prefix: {}}
def items(self): def items(self):
""" Return tuples of (country_code, property dict) as iterable. """ Return tuples of (country_code, property dict) as iterable.
""" """
return self._info.items() return self._info.items()
def key_prefix(self): def get(self, country_code):
""" Return the prefix that will be attached to the keys of the country """ Get country information for the country with the given country code.
names values when storing them in the database
""" """
return self._key_prefix return self._info.get(country_code, {})
_COUNTRY_INFO = _CountryInfo() _COUNTRY_INFO = _CountryInfo()
@@ -72,9 +96,6 @@ def setup_country_tables(dsn, sql_dir, ignore_partitions=False):
""" """
db_utils.execute_file(dsn, sql_dir / 'country_osm_grid.sql.gz') db_utils.execute_file(dsn, sql_dir / 'country_osm_grid.sql.gz')
def add_prefix_to_keys(names, prefix):
return {prefix+':'+k: v for k, v in names.items()}
params = [] params = []
for ccode, props in _COUNTRY_INFO.items(): for ccode, props in _COUNTRY_INFO.items():
if ccode is not None and props is not None: if ccode is not None and props is not None:
@@ -84,9 +105,8 @@ def setup_country_tables(dsn, sql_dir, ignore_partitions=False):
partition = props.get('partition') partition = props.get('partition')
lang = props['languages'][0] if len( lang = props['languages'][0] if len(
props['languages']) == 1 else None props['languages']) == 1 else None
name = add_prefix_to_keys(props.get('names').get(
_COUNTRY_INFO.key_prefix()), _COUNTRY_INFO.key_prefix()) params.append((ccode, props['names'], lang, partition))
params.append((ccode, name, lang, partition))
with connect(dsn) as conn: with connect(dsn) as conn:
with conn.cursor() as cur: with conn.cursor() as cur:
psycopg2.extras.register_hstore(cur) psycopg2.extras.register_hstore(cur)
@@ -115,9 +135,8 @@ def create_country_names(conn, tokenizer, languages=None):
languages = languages.split(',') languages = languages.split(',')
def _include_key(key): def _include_key(key):
return key == _COUNTRY_INFO.key_prefix() or \ return ':' not in key or not languages or \
(key.startswith(_COUNTRY_INFO.key_prefix()+':') and key[key.index(':') + 1:] in languages
(not languages or key[len(_COUNTRY_INFO.key_prefix())+1:] in languages))
with conn.cursor() as cur: with conn.cursor() as cur:
psycopg2.extras.register_hstore(cur) psycopg2.extras.register_hstore(cur)
@@ -127,15 +146,10 @@ def create_country_names(conn, tokenizer, languages=None):
with tokenizer.name_analyzer() as analyzer: with tokenizer.name_analyzer() as analyzer:
for code, name in cur: for code, name in cur:
names = {'countrycode': code} names = {'countrycode': code}
if code == 'gb':
names['short_name'] = 'UK'
if code == 'us':
names['short_name'] = 'United States'
# country names (only in languages as provided) # country names (only in languages as provided)
if name: if name:
names.update(((k, v) names.update({k : v for k, v in name.items() if _include_key(k)})
for k, v in name.items() if _include_key(k)))
analyzer.add_country_names(code, names) analyzer.add_country_names(code, names)

View File

@@ -1,3 +1,5 @@
short_name:
default: UK
name: name:
default: United Kingdom default: United Kingdom
ab: Британиа Ду ab: Британиа Ду

View File

@@ -1,3 +1,5 @@
short_name:
default: USA
name: name:
default: United States default: United States
ab: Америка Еиду Аштатқәа ab: Америка Еиду Аштатқәа

View File

@@ -7,20 +7,30 @@
""" """
Tests for functions that handle country properties. Tests for functions that handle country properties.
""" """
from textwrap import dedent
import pytest import pytest
from nominatim.tools import country_info from nominatim.tools import country_info
@pytest.fixture
def read_config(def_config): def loaded_country(def_config):
country_info.setup_country_config(def_config) country_info.setup_country_config(def_config)
@pytest.fixture
def env_with_country_config(project_env):
def _mk_config(cfg):
(project_env.project_dir / 'country_settings.yaml').write_text(dedent(cfg))
return project_env
return _mk_config
@pytest.mark.parametrize("no_partitions", (True, False)) @pytest.mark.parametrize("no_partitions", (True, False))
def test_setup_country_tables(src_dir, temp_db_with_extensions, dsn, temp_db_cursor, def test_setup_country_tables(src_dir, temp_db_with_extensions, dsn, temp_db_cursor,
def_config, no_partitions): loaded_country, no_partitions):
read_config(def_config)
country_info.setup_country_tables(dsn, src_dir / 'data', no_partitions) country_info.setup_country_tables(dsn, src_dir / 'data', no_partitions)
assert temp_db_cursor.table_exists('country_name') assert temp_db_cursor.table_exists('country_name')
@@ -41,8 +51,7 @@ def test_setup_country_tables(src_dir, temp_db_with_extensions, dsn, temp_db_cur
@pytest.mark.parametrize("languages", (None, ' fr,en')) @pytest.mark.parametrize("languages", (None, ' fr,en'))
def test_create_country_names(temp_db_with_extensions, temp_db_conn, temp_db_cursor, def test_create_country_names(temp_db_with_extensions, temp_db_conn, temp_db_cursor,
table_factory, tokenizer_mock, languages, def_config): table_factory, tokenizer_mock, languages, loaded_country):
read_config(def_config)
table_factory('country_name', 'country_code varchar(2), name hstore', table_factory('country_name', 'country_code varchar(2), name hstore',
content=(('us', '"name"=>"us1","name:af"=>"us2"'), content=(('us', '"name"=>"us1","name:af"=>"us2"'),
@@ -60,62 +69,103 @@ def test_create_country_names(temp_db_with_extensions, temp_db_conn, temp_db_cur
for k, v in tokenizer.analyser_cache['countries']} for k, v in tokenizer.analyser_cache['countries']}
if languages: if languages:
assert result_set == {'us': set(('us', 'us1', 'United States')), assert result_set == {'us': set(('us', 'us1')),
'fr': set(('fr', 'Fra', 'Fren'))} 'fr': set(('fr', 'Fra', 'Fren'))}
else: else:
assert result_set == {'us': set(('us', 'us1', 'us2', 'United States')), assert result_set == {'us': set(('us', 'us1', 'us2')),
'fr': set(('fr', 'Fra', 'Fren'))} 'fr': set(('fr', 'Fra', 'Fren'))}
def test_setup_country_config_languages_not_loaded(project_env): def test_setup_country_names_prefixes(env_with_country_config):
(project_env.project_dir / 'country_settings.yaml').write_text(""" config = env_with_country_config("""\
de: es:
partition: 3 names:
names: name:
name: en: Spain
default: Deutschland de: Spanien
""") default: Espagñe
country_info._COUNTRY_INFO._info = None us:
country_info.setup_country_config(project_env) names:
assert country_info._COUNTRY_INFO._info == {'de': {'partition': 3, short_name:
'languages': [], 'names': {'name': {'default': 'Deutschland'}}}} default: USA
name:
default: United States
en: United States
""")
info = country_info._CountryInfo()
info.load(config)
assert info.get('es')['names'] == {"name": "Espagñe",
"name:en": "Spain",
"name:de": "Spanien"}
assert info.get('us')['names'] == {"name": "United States",
"name:en": "United States",
"short_name": "USA"}
assert 'names' not in info.get('xx')
def test_setup_country_config_name_not_loaded(project_env): def test_setup_country_config_languages_not_loaded(env_with_country_config):
(project_env.project_dir / 'country_settings.yaml').write_text(""" config = env_with_country_config("""\
de: de:
partition: 3 partition: 3
languages: de names:
names: name:
""") default: Deutschland
country_info._COUNTRY_INFO._info = None """)
country_info.setup_country_config(project_env) info = country_info._CountryInfo()
assert country_info._COUNTRY_INFO._info == {'de': {'partition': 3, info.load(config)
'languages': ['de'], 'names': {'name': {}}}} assert dict(info.items()) == {'de': {'partition': 3,
'languages': [],
'names': {'name': 'Deutschland'}}}
def test_setup_country_config_names_not_loaded(project_env): def test_setup_country_config_name_not_loaded(env_with_country_config):
(project_env.project_dir / 'country_settings.yaml').write_text(""" config = env_with_country_config("""\
de: de:
partition: 3 partition: 3
languages: de languages: de
""") names:
country_info._COUNTRY_INFO._info = None """)
country_info.setup_country_config(project_env)
assert country_info._COUNTRY_INFO._info == {'de': {'partition': 3, info = country_info._CountryInfo()
'languages': ['de'], 'names': {'name': {}}}} info.load(config)
assert dict(info.items()) == {'de': {'partition': 3,
'languages': ['de'],
'names': {}
}}
def test_setup_country_config_special_character(project_env): def test_setup_country_config_names_not_loaded(env_with_country_config):
(project_env.project_dir / 'country_settings.yaml').write_text(""" config = env_with_country_config("""
bq: de:
partition: 250 partition: 3
languages: nl languages: de
names: """)
name:
default: "\\N" info = country_info._CountryInfo()
""") info.load(config)
country_info._COUNTRY_INFO._info = None
country_info.setup_country_config(project_env) assert dict(info.items()) == {'de': {'partition': 3,
assert country_info._COUNTRY_INFO._info == {'bq': {'partition': 250, 'languages': ['de'],
'languages': ['nl'], 'names': {'name': {'default': '\x85'}}}} 'names': {}
}}
def test_setup_country_config_special_character(env_with_country_config):
config = env_with_country_config("""
bq:
partition: 250
languages: nl
names:
name:
default: "\\N"
""")
info = country_info._CountryInfo()
info.load(config)
assert dict(info.items()) == {'bq': {'partition': 250,
'languages': ['nl'],
'names': {'name': '\x85'}
}}