Merge pull request #2629 from tareqpi/country-names-yaml-configuration

Move default country names into yaml configuration
This commit is contained in:
Sarah Hoffmann
2022-04-04 09:04:25 +02:00
committed by GitHub
7 changed files with 26887 additions and 295 deletions

View File

@@ -226,8 +226,7 @@ if (BUILD_IMPORTER)
PATTERN __pycache__ EXCLUDE) PATTERN __pycache__ EXCLUDE)
install(DIRECTORY lib-sql DESTINATION ${NOMINATIM_LIBDIR}) install(DIRECTORY lib-sql DESTINATION ${NOMINATIM_LIBDIR})
install(FILES data/country_name.sql install(FILES ${COUNTRY_GRID_FILE}
${COUNTRY_GRID_FILE}
data/words.sql data/words.sql
DESTINATION ${NOMINATIM_DATADIR}) DESTINATION ${NOMINATIM_DATADIR})
endif() endif()

File diff suppressed because one or more lines are too long

View File

@@ -43,8 +43,8 @@ BEGIN
END IF; END IF;
END LOOP; END LOOP;
-- anything will do as a fallback - just take the first name type thing there is -- as a fallback - take the last element since it is the default name
RETURN trim((avals(name))[1]); RETURN trim((avals(name))[array_length(avals(name), 1)]);
END; END;
$$ $$
LANGUAGE plpgsql IMMUTABLE; LANGUAGE plpgsql IMMUTABLE;

View File

@@ -12,13 +12,14 @@ import psycopg2.extras
from nominatim.db import utils as db_utils from nominatim.db import utils as db_utils
from nominatim.db.connection import connect from nominatim.db.connection import connect
class _CountryInfo: class _CountryInfo:
""" Caches country-specific properties from the configuration file. """ Caches country-specific properties from the configuration file.
""" """
def __init__(self): def __init__(self):
self._info = {} self._info = {}
self._key_prefix = 'name'
def load(self, config): def load(self, config):
""" Load the country properties from the configuration files, """ Load the country properties from the configuration files,
@@ -33,16 +34,24 @@ class _CountryInfo:
elif not isinstance(prop['languages'], list): elif not isinstance(prop['languages'], list):
prop['languages'] = [x.strip() prop['languages'] = [x.strip()
for x in prop['languages'].split(',')] for x in prop['languages'].split(',')]
if 'names' not in prop or prop['names'] is None:
prop['names'] = {self._key_prefix: {}}
def items(self): def items(self):
""" Return tuples of (country_code, property dict) as iterable. """ Return tuples of (country_code, property dict) as iterable.
""" """
return self._info.items() return self._info.items()
def key_prefix(self):
""" Return the prefix that will be attached to the keys of the country
names values when storing them in the database
"""
return self._key_prefix
_COUNTRY_INFO = _CountryInfo() _COUNTRY_INFO = _CountryInfo()
def setup_country_config(config): def setup_country_config(config):
""" Load country properties from the configuration file. """ Load country properties from the configuration file.
Needs to be called before using any other functions in this Needs to be called before using any other functions in this
@@ -61,9 +70,11 @@ def setup_country_tables(dsn, sql_dir, ignore_partitions=False):
""" Create and populate the tables with basic static data that provides """ Create and populate the tables with basic static data that provides
the background for geocoding. Data is assumed to not yet exist. the background for geocoding. Data is assumed to not yet exist.
""" """
db_utils.execute_file(dsn, sql_dir / 'country_name.sql')
db_utils.execute_file(dsn, sql_dir / 'country_osm_grid.sql.gz') db_utils.execute_file(dsn, sql_dir / 'country_osm_grid.sql.gz')
def add_prefix_to_keys(names, prefix):
return {prefix+':'+k: v for k, v in names.items()}
params = [] params = []
for ccode, props in _COUNTRY_INFO.items(): for ccode, props in _COUNTRY_INFO.items():
if ccode is not None and props is not None: if ccode is not None and props is not None:
@@ -71,16 +82,26 @@ def setup_country_tables(dsn, sql_dir, ignore_partitions=False):
partition = 0 partition = 0
else: else:
partition = props.get('partition') partition = props.get('partition')
lang = props['languages'][0] if len(props['languages']) == 1 else None lang = props['languages'][0] if len(
params.append((ccode, partition, lang)) props['languages']) == 1 else None
name = add_prefix_to_keys(props.get('names').get(
_COUNTRY_INFO.key_prefix()), _COUNTRY_INFO.key_prefix())
params.append((ccode, name, lang, partition))
with connect(dsn) as conn: with connect(dsn) as conn:
with conn.cursor() as cur: with conn.cursor() as cur:
psycopg2.extras.register_hstore(cur)
cur.execute(
""" CREATE TABLE public.country_name (
country_code character varying(2),
name public.hstore,
derived_name public.hstore,
country_default_language_code text,
partition integer
); """)
cur.execute_values( cur.execute_values(
""" UPDATE country_name """ INSERT INTO public.country_name
SET partition = part, country_default_language_code = lang (country_code, name, country_default_language_code, partition) VALUES %s
FROM (VALUES %s) AS v (cc, part, lang) """, params)
WHERE country_code = v.cc""", params)
conn.commit() conn.commit()
@@ -94,8 +115,9 @@ def create_country_names(conn, tokenizer, languages=None):
languages = languages.split(',') languages = languages.split(',')
def _include_key(key): def _include_key(key):
return key == 'name' or \ return key == _COUNTRY_INFO.key_prefix() or \
(key.startswith('name:') and (not languages or key[5:] in languages)) (key.startswith(_COUNTRY_INFO.key_prefix()+':') and
(not languages or key[len(_COUNTRY_INFO.key_prefix())+1:] in languages))
with conn.cursor() as cur: with conn.cursor() as cur:
psycopg2.extras.register_hstore(cur) psycopg2.extras.register_hstore(cur)
@@ -112,7 +134,8 @@ def create_country_names(conn, tokenizer, languages=None):
# country names (only in languages as provided) # country names (only in languages as provided)
if name: if name:
names.update(((k, v) for k, v in name.items() if _include_key(k))) names.update(((k, v)
for k, v in name.items() if _include_key(k)))
analyzer.add_country_names(code, names) analyzer.add_country_names(code, names)

File diff suppressed because it is too large Load Diff

View File

@@ -12,20 +12,24 @@ import pytest
from nominatim.tools import country_info from nominatim.tools import country_info
@pytest.fixture(autouse=True)
def read_config(def_config): def read_config(def_config):
country_info.setup_country_config(def_config) country_info.setup_country_config(def_config)
@pytest.mark.parametrize("no_partitions", (True, False)) @pytest.mark.parametrize("no_partitions", (True, False))
def test_setup_country_tables(src_dir, temp_db_with_extensions, dsn, temp_db_cursor, def test_setup_country_tables(src_dir, temp_db_with_extensions, dsn, temp_db_cursor,
def_config, no_partitions): def_config, no_partitions):
read_config(def_config)
country_info.setup_country_tables(dsn, src_dir / 'data', no_partitions) country_info.setup_country_tables(dsn, src_dir / 'data', no_partitions)
assert temp_db_cursor.table_exists('country_name') assert temp_db_cursor.table_exists('country_name')
assert temp_db_cursor.table_rows('country_name') == \ assert temp_db_cursor.table_rows('country_name') == \
temp_db_cursor.scalar('SELECT count(DISTINCT country_code) FROM country_name') temp_db_cursor.scalar(
'SELECT count(DISTINCT country_code) FROM country_name')
partitions = temp_db_cursor.row_set("SELECT DISTINCT partition FROM country_name") partitions = temp_db_cursor.row_set(
"SELECT DISTINCT partition FROM country_name")
if no_partitions: if no_partitions:
assert partitions == {(0, )} assert partitions == {(0, )}
else: else:
@@ -37,7 +41,8 @@ def test_setup_country_tables(src_dir, temp_db_with_extensions, dsn, temp_db_cur
@pytest.mark.parametrize("languages", (None, ' fr,en')) @pytest.mark.parametrize("languages", (None, ' fr,en'))
def test_create_country_names(temp_db_with_extensions, temp_db_conn, temp_db_cursor, def test_create_country_names(temp_db_with_extensions, temp_db_conn, temp_db_cursor,
table_factory, tokenizer_mock, languages): table_factory, tokenizer_mock, languages, def_config):
read_config(def_config)
table_factory('country_name', 'country_code varchar(2), name hstore', table_factory('country_name', 'country_code varchar(2), name hstore',
content=(('us', '"name"=>"us1","name:af"=>"us2"'), content=(('us', '"name"=>"us1","name:af"=>"us2"'),
@@ -51,11 +56,66 @@ def test_create_country_names(temp_db_with_extensions, temp_db_conn, temp_db_cur
assert len(tokenizer.analyser_cache['countries']) == 2 assert len(tokenizer.analyser_cache['countries']) == 2
result_set = {k: set(v.values()) for k, v in tokenizer.analyser_cache['countries']} result_set = {k: set(v.values())
for k, v in tokenizer.analyser_cache['countries']}
if languages: if languages:
assert result_set == {'us' : set(('us', 'us1', 'United States')), assert result_set == {'us': set(('us', 'us1', 'United States')),
'fr' : set(('fr', 'Fra', 'Fren'))} 'fr': set(('fr', 'Fra', 'Fren'))}
else: else:
assert result_set == {'us' : set(('us', 'us1', 'us2', 'United States')), assert result_set == {'us': set(('us', 'us1', 'us2', 'United States')),
'fr' : set(('fr', 'Fra', 'Fren'))} 'fr': set(('fr', 'Fra', 'Fren'))}
def test_setup_country_config_languages_not_loaded(project_env):
(project_env.project_dir / 'country_settings.yaml').write_text("""
de:
partition: 3
names:
name:
default: Deutschland
""")
country_info._COUNTRY_INFO._info = None
country_info.setup_country_config(project_env)
assert country_info._COUNTRY_INFO._info == {'de': {'partition': 3,
'languages': [], 'names': {'name': {'default': 'Deutschland'}}}}
def test_setup_country_config_name_not_loaded(project_env):
(project_env.project_dir / 'country_settings.yaml').write_text("""
de:
partition: 3
languages: de
names:
""")
country_info._COUNTRY_INFO._info = None
country_info.setup_country_config(project_env)
assert country_info._COUNTRY_INFO._info == {'de': {'partition': 3,
'languages': ['de'], 'names': {'name': {}}}}
def test_setup_country_config_names_not_loaded(project_env):
(project_env.project_dir / 'country_settings.yaml').write_text("""
de:
partition: 3
languages: de
""")
country_info._COUNTRY_INFO._info = None
country_info.setup_country_config(project_env)
assert country_info._COUNTRY_INFO._info == {'de': {'partition': 3,
'languages': ['de'], 'names': {'name': {}}}}
def test_setup_country_config_special_character(project_env):
(project_env.project_dir / 'country_settings.yaml').write_text("""
bq:
partition: 250
languages: nl
names:
name:
default: "\\N"
""")
country_info._COUNTRY_INFO._info = None
country_info.setup_country_config(project_env)
assert country_info._COUNTRY_INFO._info == {'bq': {'partition': 250,
'languages': ['nl'], 'names': {'name': {'default': '\x85'}}}}