forked from hans/Nominatim
move default country name creation to tokenizer
The new function is also used, when a country us updated. All SQL function related to country names have been removed.
This commit is contained in:
@@ -202,29 +202,6 @@ $$
|
|||||||
LANGUAGE plpgsql;
|
LANGUAGE plpgsql;
|
||||||
|
|
||||||
|
|
||||||
CREATE OR REPLACE FUNCTION getorcreate_country(lookup_word TEXT,
|
|
||||||
lookup_country_code varchar(2))
|
|
||||||
RETURNS INTEGER
|
|
||||||
AS $$
|
|
||||||
DECLARE
|
|
||||||
lookup_token TEXT;
|
|
||||||
return_word_id INTEGER;
|
|
||||||
BEGIN
|
|
||||||
lookup_token := ' '||trim(lookup_word);
|
|
||||||
SELECT min(word_id) FROM word
|
|
||||||
WHERE word_token = lookup_token and country_code=lookup_country_code
|
|
||||||
INTO return_word_id;
|
|
||||||
IF return_word_id IS NULL THEN
|
|
||||||
return_word_id := nextval('seq_word');
|
|
||||||
INSERT INTO word VALUES (return_word_id, lookup_token, null,
|
|
||||||
null, null, lookup_country_code, 0);
|
|
||||||
END IF;
|
|
||||||
RETURN return_word_id;
|
|
||||||
END;
|
|
||||||
$$
|
|
||||||
LANGUAGE plpgsql;
|
|
||||||
|
|
||||||
|
|
||||||
CREATE OR REPLACE FUNCTION getorcreate_amenity(lookup_word TEXT, normalized_word TEXT,
|
CREATE OR REPLACE FUNCTION getorcreate_amenity(lookup_word TEXT, normalized_word TEXT,
|
||||||
lookup_class text, lookup_type text)
|
lookup_class text, lookup_type text)
|
||||||
RETURNS INTEGER
|
RETURNS INTEGER
|
||||||
@@ -363,36 +340,6 @@ $$
|
|||||||
LANGUAGE plpgsql STABLE STRICT;
|
LANGUAGE plpgsql STABLE STRICT;
|
||||||
|
|
||||||
|
|
||||||
CREATE OR REPLACE FUNCTION create_country(src HSTORE, country_code varchar(2))
|
|
||||||
RETURNS VOID
|
|
||||||
AS $$
|
|
||||||
DECLARE
|
|
||||||
s TEXT;
|
|
||||||
w INTEGER;
|
|
||||||
words TEXT[];
|
|
||||||
item RECORD;
|
|
||||||
j INTEGER;
|
|
||||||
BEGIN
|
|
||||||
FOR item IN SELECT (each(src)).* LOOP
|
|
||||||
|
|
||||||
s := make_standard_name(item.value);
|
|
||||||
w := getorcreate_country(s, country_code);
|
|
||||||
|
|
||||||
words := regexp_split_to_array(item.value, E'[,;()]');
|
|
||||||
IF array_upper(words, 1) != 1 THEN
|
|
||||||
FOR j IN 1..array_upper(words, 1) LOOP
|
|
||||||
s := make_standard_name(words[j]);
|
|
||||||
IF s != '' THEN
|
|
||||||
w := getorcreate_country(s, country_code);
|
|
||||||
END IF;
|
|
||||||
END LOOP;
|
|
||||||
END IF;
|
|
||||||
END LOOP;
|
|
||||||
END;
|
|
||||||
$$
|
|
||||||
LANGUAGE plpgsql;
|
|
||||||
|
|
||||||
|
|
||||||
CREATE OR REPLACE FUNCTION make_keywords(src HSTORE)
|
CREATE OR REPLACE FUNCTION make_keywords(src HSTORE)
|
||||||
RETURNS INTEGER[]
|
RETURNS INTEGER[]
|
||||||
AS $$
|
AS $$
|
||||||
|
|||||||
@@ -133,7 +133,8 @@ class SetupAll:
|
|||||||
database_import.create_search_indices(conn, args.config,
|
database_import.create_search_indices(conn, args.config,
|
||||||
drop=args.no_updates)
|
drop=args.no_updates)
|
||||||
LOG.warning('Create search index for default country names.')
|
LOG.warning('Create search index for default country names.')
|
||||||
database_import.create_country_names(conn, args.config)
|
database_import.create_country_names(conn, tokenizer,
|
||||||
|
args.config.LANGUAGES)
|
||||||
|
|
||||||
webdir = args.project_dir / 'website'
|
webdir = args.project_dir / 'website'
|
||||||
LOG.warning('Setup website at %s', webdir)
|
LOG.warning('Setup website at %s', webdir)
|
||||||
|
|||||||
@@ -223,6 +223,21 @@ class LegacyNameAnalyzer:
|
|||||||
FROM (SELECT distinct(postcode) as pc
|
FROM (SELECT distinct(postcode) as pc
|
||||||
FROM location_postcode) x""")
|
FROM location_postcode) x""")
|
||||||
|
|
||||||
|
|
||||||
|
def add_country_names(self, country_code, names):
|
||||||
|
""" Add names for the given country to the search index.
|
||||||
|
"""
|
||||||
|
with self.conn.cursor() as cur:
|
||||||
|
cur.execute(
|
||||||
|
"""INSERT INTO word (word_id, word_token, country_code)
|
||||||
|
(SELECT nextval('seq_word'), lookup_token, %s
|
||||||
|
FROM (SELECT ' ' || make_standard_name(n) as lookup_token
|
||||||
|
FROM unnest(%s)n) y
|
||||||
|
WHERE NOT EXISTS(SELECT * FROM word
|
||||||
|
WHERE word_token = lookup_token and country_code = %s))
|
||||||
|
""", (country_code, names, country_code))
|
||||||
|
|
||||||
|
|
||||||
def process_place(self, place):
|
def process_place(self, place):
|
||||||
""" Determine tokenizer information about the given place.
|
""" Determine tokenizer information about the given place.
|
||||||
|
|
||||||
@@ -231,7 +246,14 @@ class LegacyNameAnalyzer:
|
|||||||
"""
|
"""
|
||||||
token_info = _TokenInfo(self._cache)
|
token_info = _TokenInfo(self._cache)
|
||||||
|
|
||||||
token_info.add_names(self.conn, place.get('name'), place.get('country_feature'))
|
names = place.get('name')
|
||||||
|
|
||||||
|
if names:
|
||||||
|
token_info.add_names(self.conn, names)
|
||||||
|
|
||||||
|
country_feature = place.get('country_feature')
|
||||||
|
if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
|
||||||
|
self.add_country_names(country_feature.lower(), list(names.values()))
|
||||||
|
|
||||||
address = place.get('address')
|
address = place.get('address')
|
||||||
|
|
||||||
@@ -279,22 +301,14 @@ class _TokenInfo:
|
|||||||
self.data = {}
|
self.data = {}
|
||||||
|
|
||||||
|
|
||||||
def add_names(self, conn, names, country_feature):
|
def add_names(self, conn, names):
|
||||||
""" Add token information for the names of the place.
|
""" Add token information for the names of the place.
|
||||||
"""
|
"""
|
||||||
if not names:
|
|
||||||
return
|
|
||||||
|
|
||||||
with conn.cursor() as cur:
|
with conn.cursor() as cur:
|
||||||
# Create the token IDs for all names.
|
# Create the token IDs for all names.
|
||||||
self.data['names'] = cur.scalar("SELECT make_keywords(%s)::text",
|
self.data['names'] = cur.scalar("SELECT make_keywords(%s)::text",
|
||||||
(names, ))
|
(names, ))
|
||||||
|
|
||||||
# Add country tokens to word table if necessary.
|
|
||||||
if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
|
|
||||||
cur.execute("SELECT create_country(%s, %s)",
|
|
||||||
(names, country_feature.lower()))
|
|
||||||
|
|
||||||
|
|
||||||
def add_housenumbers(self, conn, hnrs):
|
def add_housenumbers(self, conn, hnrs):
|
||||||
""" Extract housenumber information from the address.
|
""" Extract housenumber information from the address.
|
||||||
@@ -334,7 +348,8 @@ class _TokenInfo:
|
|||||||
"""
|
"""
|
||||||
def _get_place(name):
|
def _get_place(name):
|
||||||
with conn.cursor() as cur:
|
with conn.cursor() as cur:
|
||||||
cur.execute("""SELECT (addr_ids_from_name(%s) || getorcreate_name_id(make_standard_name(%s), ''))::text,
|
cur.execute("""SELECT (addr_ids_from_name(%s)
|
||||||
|
|| getorcreate_name_id(make_standard_name(%s), ''))::text,
|
||||||
word_ids_from_name(%s)::text""",
|
word_ids_from_name(%s)::text""",
|
||||||
(name, name, name))
|
(name, name, name))
|
||||||
return cur.fetchone()
|
return cur.fetchone()
|
||||||
|
|||||||
@@ -8,6 +8,7 @@ import subprocess
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
import psutil
|
import psutil
|
||||||
|
import psycopg2.extras
|
||||||
|
|
||||||
from nominatim.db.connection import connect, get_pg_env
|
from nominatim.db.connection import connect, get_pg_env
|
||||||
from nominatim.db import utils as db_utils
|
from nominatim.db import utils as db_utils
|
||||||
@@ -250,34 +251,37 @@ def create_search_indices(conn, config, drop=False):
|
|||||||
|
|
||||||
sql.run_sql_file(conn, 'indices.sql', drop=drop)
|
sql.run_sql_file(conn, 'indices.sql', drop=drop)
|
||||||
|
|
||||||
def create_country_names(conn, config):
|
def create_country_names(conn, tokenizer, languages=None):
|
||||||
""" Create search index for default country names.
|
""" Add default country names to search index. `languages` is a comma-
|
||||||
|
separated list of language codes as used in OSM. If `languages` is not
|
||||||
|
empty then only name translations for the given languages are added
|
||||||
|
to the index.
|
||||||
"""
|
"""
|
||||||
|
if languages:
|
||||||
|
languages = languages.split(',')
|
||||||
|
|
||||||
|
def _include_key(key):
|
||||||
|
return key == 'name' or \
|
||||||
|
(key.startswith('name:') \
|
||||||
|
and (not languages or key[5:] in languages))
|
||||||
|
|
||||||
with conn.cursor() as cur:
|
with conn.cursor() as cur:
|
||||||
cur.execute("""SELECT getorcreate_country(make_standard_name('uk'), 'gb')""")
|
psycopg2.extras.register_hstore(cur)
|
||||||
cur.execute("""SELECT getorcreate_country(make_standard_name('united states'), 'us')""")
|
cur.execute("""SELECT country_code, name FROM country_name
|
||||||
cur.execute("""SELECT COUNT(*) FROM
|
WHERE country_code is not null""")
|
||||||
(SELECT getorcreate_country(make_standard_name(country_code),
|
|
||||||
country_code) FROM country_name WHERE country_code is not null) AS x""")
|
|
||||||
cur.execute("""SELECT COUNT(*) FROM
|
|
||||||
(SELECT getorcreate_country(make_standard_name(name->'name'), country_code)
|
|
||||||
FROM country_name WHERE name ? 'name') AS x""")
|
|
||||||
sql_statement = """SELECT COUNT(*) FROM (SELECT getorcreate_country(make_standard_name(v),
|
|
||||||
country_code) FROM (SELECT country_code, skeys(name)
|
|
||||||
AS k, svals(name) AS v FROM country_name) x WHERE k"""
|
|
||||||
|
|
||||||
languages = config.LANGUAGES
|
with tokenizer.name_analyzer() as analyzer:
|
||||||
|
for code, name in cur:
|
||||||
|
names = [code]
|
||||||
|
if code == 'gb':
|
||||||
|
names.append('UK')
|
||||||
|
if code == 'us':
|
||||||
|
names.append('United States')
|
||||||
|
|
||||||
|
# country names (only in languages as provided)
|
||||||
|
if name:
|
||||||
|
names.extend((v for k, v in name.items() if _include_key(k)))
|
||||||
|
|
||||||
|
analyzer.add_country_names(code, names)
|
||||||
|
|
||||||
if languages:
|
|
||||||
sql_statement = "{} IN (".format(sql_statement)
|
|
||||||
delim = ''
|
|
||||||
for language in languages.split(','):
|
|
||||||
sql_statement = "{}{}'name:{}'".format(sql_statement, delim, language)
|
|
||||||
delim = ', '
|
|
||||||
sql_statement = '{})'.format(sql_statement)
|
|
||||||
else:
|
|
||||||
sql_statement = "{} LIKE 'name:%'".format(sql_statement)
|
|
||||||
sql_statement = "{}) v".format(sql_statement)
|
|
||||||
cur.execute(sql_statement)
|
|
||||||
conn.commit()
|
conn.commit()
|
||||||
|
|||||||
@@ -121,9 +121,8 @@ def table_factory(temp_db_cursor):
|
|||||||
def mk_table(name, definition='id INT', content=None):
|
def mk_table(name, definition='id INT', content=None):
|
||||||
temp_db_cursor.execute('CREATE TABLE {} ({})'.format(name, definition))
|
temp_db_cursor.execute('CREATE TABLE {} ({})'.format(name, definition))
|
||||||
if content is not None:
|
if content is not None:
|
||||||
if not isinstance(content, str):
|
psycopg2.extras.execute_values(
|
||||||
content = '),('.join([str(x) for x in content])
|
temp_db_cursor, "INSERT INTO {} VALUES %s".format(name), content)
|
||||||
temp_db_cursor.execute("INSERT INTO {} VALUES ({})".format(name, content))
|
|
||||||
|
|
||||||
return mk_table
|
return mk_table
|
||||||
|
|
||||||
@@ -290,7 +289,7 @@ def osm2pgsql_options(temp_db):
|
|||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def sql_preprocessor(temp_db_conn, tmp_path, monkeypatch, table_factory):
|
def sql_preprocessor(temp_db_conn, tmp_path, monkeypatch, table_factory):
|
||||||
table_factory('country_name', 'partition INT', (0, 1, 2))
|
table_factory('country_name', 'partition INT', ((0, ), (1, ), (2, )))
|
||||||
cfg = Configuration(None, SRC_DIR.resolve() / 'settings')
|
cfg = Configuration(None, SRC_DIR.resolve() / 'settings')
|
||||||
cfg.set_libdirs(module='.', osm2pgsql='.', php=SRC_DIR / 'lib-php',
|
cfg.set_libdirs(module='.', osm2pgsql='.', php=SRC_DIR / 'lib-php',
|
||||||
sql=tmp_path, data=SRC_DIR / 'data')
|
sql=tmp_path, data=SRC_DIR / 'data')
|
||||||
@@ -299,9 +298,10 @@ def sql_preprocessor(temp_db_conn, tmp_path, monkeypatch, table_factory):
|
|||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def tokenizer_mock(monkeypatch, property_table, temp_db_conn, dsn):
|
def tokenizer_mock(monkeypatch, property_table, temp_db_conn, tmp_path):
|
||||||
""" Sets up the configuration so that the test dummy tokenizer will be
|
""" Sets up the configuration so that the test dummy tokenizer will be
|
||||||
loaded.
|
loaded when the tokenizer factory is used. Also returns a factory
|
||||||
|
with which a new dummy tokenizer may be created.
|
||||||
"""
|
"""
|
||||||
monkeypatch.setenv('NOMINATIM_TOKENIZER', 'dummy')
|
monkeypatch.setenv('NOMINATIM_TOKENIZER', 'dummy')
|
||||||
|
|
||||||
@@ -310,3 +310,8 @@ def tokenizer_mock(monkeypatch, property_table, temp_db_conn, dsn):
|
|||||||
|
|
||||||
monkeypatch.setattr(importlib, "import_module", _import_dummy)
|
monkeypatch.setattr(importlib, "import_module", _import_dummy)
|
||||||
properties.set_property(temp_db_conn, 'tokenizer', 'dummy')
|
properties.set_property(temp_db_conn, 'tokenizer', 'dummy')
|
||||||
|
|
||||||
|
def _create_tokenizer():
|
||||||
|
return dummy_tokenizer.DummyTokenizer(None, None)
|
||||||
|
|
||||||
|
return _create_tokenizer
|
||||||
|
|||||||
@@ -13,6 +13,7 @@ class DummyTokenizer:
|
|||||||
self.dsn = dsn
|
self.dsn = dsn
|
||||||
self.data_dir = data_dir
|
self.data_dir = data_dir
|
||||||
self.init_state = None
|
self.init_state = None
|
||||||
|
self.analyser_cache = {}
|
||||||
|
|
||||||
|
|
||||||
def init_new_db(self, config):
|
def init_new_db(self, config):
|
||||||
@@ -26,7 +27,7 @@ class DummyTokenizer:
|
|||||||
|
|
||||||
|
|
||||||
def name_analyzer(self):
|
def name_analyzer(self):
|
||||||
return DummyNameAnalyzer()
|
return DummyNameAnalyzer(self.analyser_cache)
|
||||||
|
|
||||||
|
|
||||||
class DummyNameAnalyzer:
|
class DummyNameAnalyzer:
|
||||||
@@ -38,18 +39,20 @@ class DummyNameAnalyzer:
|
|||||||
self.close()
|
self.close()
|
||||||
|
|
||||||
|
|
||||||
|
def __init__(self, cache):
|
||||||
|
self.analyser_cache = cache
|
||||||
|
cache['countries'] = []
|
||||||
|
|
||||||
|
|
||||||
def close(self):
|
def close(self):
|
||||||
""" Free all resources used by the analyzer.
|
|
||||||
"""
|
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def add_postcodes_from_db(self):
|
def add_postcodes_from_db(self):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def process_place(self, place):
|
|
||||||
""" Determine tokenizer information about the given place.
|
|
||||||
|
|
||||||
Returns a JSON-serialisable structure that will be handed into
|
def add_country_names(self, code, names):
|
||||||
the database via the token_info field.
|
self.analyser_cache['countries'].append((code, names))
|
||||||
"""
|
|
||||||
|
def process_place(self, place):
|
||||||
return {}
|
return {}
|
||||||
|
|||||||
@@ -143,7 +143,8 @@ def test_truncate_database_tables(temp_db_conn, temp_db_cursor, table_factory):
|
|||||||
'location_property_tiger', 'location_property_osmline',
|
'location_property_tiger', 'location_property_osmline',
|
||||||
'location_postcode', 'search_name', 'location_road_23')
|
'location_postcode', 'search_name', 'location_road_23')
|
||||||
for table in tables:
|
for table in tables:
|
||||||
table_factory(table, content=(1, 2, 3))
|
table_factory(table, content=((1, ), (2, ), (3, )))
|
||||||
|
assert temp_db_cursor.table_rows(table) == 3
|
||||||
|
|
||||||
database_import.truncate_data_tables(temp_db_conn)
|
database_import.truncate_data_tables(temp_db_conn)
|
||||||
|
|
||||||
@@ -168,31 +169,28 @@ def test_load_data(dsn, src_dir, place_row, placex_table, osmline_table, word_ta
|
|||||||
assert temp_db_cursor.table_rows('placex') == 30
|
assert temp_db_cursor.table_rows('placex') == 30
|
||||||
assert temp_db_cursor.table_rows('location_property_osmline') == 1
|
assert temp_db_cursor.table_rows('location_property_osmline') == 1
|
||||||
|
|
||||||
@pytest.mark.parametrize("languages", (False, True))
|
|
||||||
def test_create_country_names(temp_db_conn, temp_db_cursor, def_config,
|
@pytest.mark.parametrize("languages", (None, ' fr,en'))
|
||||||
temp_db_with_extensions, monkeypatch, languages):
|
def test_create_country_names(temp_db_with_extensions, temp_db_conn, temp_db_cursor,
|
||||||
|
table_factory, tokenizer_mock, languages):
|
||||||
|
|
||||||
|
table_factory('country_name', 'country_code varchar(2), name hstore',
|
||||||
|
content=(('us', '"name"=>"us1","name:af"=>"us2"'),
|
||||||
|
('fr', '"name"=>"Fra", "name:en"=>"Fren"')))
|
||||||
|
|
||||||
|
assert temp_db_cursor.scalar("SELECT count(*) FROM country_name") == 2
|
||||||
|
|
||||||
|
tokenizer = tokenizer_mock()
|
||||||
|
|
||||||
|
database_import.create_country_names(temp_db_conn, tokenizer, languages)
|
||||||
|
|
||||||
|
assert len(tokenizer.analyser_cache['countries']) == 2
|
||||||
|
|
||||||
|
result_set = {k: set(v) for k, v in tokenizer.analyser_cache['countries']}
|
||||||
|
|
||||||
if languages:
|
if languages:
|
||||||
monkeypatch.setenv('NOMINATIM_LANGUAGES', 'fr,en')
|
assert result_set == {'us' : set(('us', 'us1', 'United States')),
|
||||||
temp_db_cursor.execute("""CREATE FUNCTION make_standard_name (name TEXT)
|
'fr' : set(('fr', 'Fra', 'Fren'))}
|
||||||
RETURNS TEXT AS $$ SELECT 'a'::TEXT $$ LANGUAGE SQL
|
|
||||||
""")
|
|
||||||
temp_db_cursor.execute('CREATE TABLE country_name (country_code varchar(2), name hstore)')
|
|
||||||
temp_db_cursor.execute('CREATE TABLE word (code varchar(2))')
|
|
||||||
temp_db_cursor.execute("""INSERT INTO country_name VALUES ('us',
|
|
||||||
'"name"=>"us","name:af"=>"us"')""")
|
|
||||||
temp_db_cursor.execute("""CREATE OR REPLACE FUNCTION getorcreate_country(lookup_word TEXT,
|
|
||||||
lookup_country_code varchar(2))
|
|
||||||
RETURNS INTEGER
|
|
||||||
AS $$
|
|
||||||
BEGIN
|
|
||||||
INSERT INTO word VALUES (lookup_country_code);
|
|
||||||
RETURN 5;
|
|
||||||
END;
|
|
||||||
$$
|
|
||||||
LANGUAGE plpgsql;
|
|
||||||
""")
|
|
||||||
database_import.create_country_names(temp_db_conn, def_config)
|
|
||||||
if languages:
|
|
||||||
assert temp_db_cursor.table_rows('word') == 4
|
|
||||||
else:
|
else:
|
||||||
assert temp_db_cursor.table_rows('word') == 5
|
assert result_set == {'us' : set(('us', 'us1', 'us2', 'United States')),
|
||||||
|
'fr' : set(('fr', 'Fra', 'Fren'))}
|
||||||
|
|||||||
@@ -11,9 +11,7 @@ def sql_tmp_path(tmp_path, def_config):
|
|||||||
return tmp_path
|
return tmp_path
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def conn(temp_db_conn, table_factory, monkeypatch):
|
def conn(sql_preprocessor, temp_db_conn):
|
||||||
monkeypatch.setenv('NOMINATIM_DATABASE_MODULE_PATH', '.')
|
|
||||||
table_factory('country_name', 'partition INT', (0, 1, 2))
|
|
||||||
return temp_db_conn
|
return temp_db_conn
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user