move default country name creation to tokenizer

The new function is also used when a country is updated. All SQL
functions related to country names have been removed.
This commit is contained in:
Sarah Hoffmann
2021-04-27 11:37:18 +02:00
parent dc700c25b6
commit bef300305e
8 changed files with 105 additions and 134 deletions

View File

@@ -202,29 +202,6 @@ $$
LANGUAGE plpgsql; LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION getorcreate_country(lookup_word TEXT,
lookup_country_code varchar(2))
RETURNS INTEGER
AS $$
DECLARE
lookup_token TEXT;
return_word_id INTEGER;
BEGIN
lookup_token := ' '||trim(lookup_word);
SELECT min(word_id) FROM word
WHERE word_token = lookup_token and country_code=lookup_country_code
INTO return_word_id;
IF return_word_id IS NULL THEN
return_word_id := nextval('seq_word');
INSERT INTO word VALUES (return_word_id, lookup_token, null,
null, null, lookup_country_code, 0);
END IF;
RETURN return_word_id;
END;
$$
LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION getorcreate_amenity(lookup_word TEXT, normalized_word TEXT, CREATE OR REPLACE FUNCTION getorcreate_amenity(lookup_word TEXT, normalized_word TEXT,
lookup_class text, lookup_type text) lookup_class text, lookup_type text)
RETURNS INTEGER RETURNS INTEGER
@@ -363,36 +340,6 @@ $$
LANGUAGE plpgsql STABLE STRICT; LANGUAGE plpgsql STABLE STRICT;
CREATE OR REPLACE FUNCTION create_country(src HSTORE, country_code varchar(2))
RETURNS VOID
AS $$
DECLARE
s TEXT;
w INTEGER;
words TEXT[];
item RECORD;
j INTEGER;
BEGIN
FOR item IN SELECT (each(src)).* LOOP
s := make_standard_name(item.value);
w := getorcreate_country(s, country_code);
words := regexp_split_to_array(item.value, E'[,;()]');
IF array_upper(words, 1) != 1 THEN
FOR j IN 1..array_upper(words, 1) LOOP
s := make_standard_name(words[j]);
IF s != '' THEN
w := getorcreate_country(s, country_code);
END IF;
END LOOP;
END IF;
END LOOP;
END;
$$
LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION make_keywords(src HSTORE) CREATE OR REPLACE FUNCTION make_keywords(src HSTORE)
RETURNS INTEGER[] RETURNS INTEGER[]
AS $$ AS $$

View File

@@ -133,7 +133,8 @@ class SetupAll:
database_import.create_search_indices(conn, args.config, database_import.create_search_indices(conn, args.config,
drop=args.no_updates) drop=args.no_updates)
LOG.warning('Create search index for default country names.') LOG.warning('Create search index for default country names.')
database_import.create_country_names(conn, args.config) database_import.create_country_names(conn, tokenizer,
args.config.LANGUAGES)
webdir = args.project_dir / 'website' webdir = args.project_dir / 'website'
LOG.warning('Setup website at %s', webdir) LOG.warning('Setup website at %s', webdir)

View File

@@ -223,6 +223,21 @@ class LegacyNameAnalyzer:
FROM (SELECT distinct(postcode) as pc FROM (SELECT distinct(postcode) as pc
FROM location_postcode) x""") FROM location_postcode) x""")
def add_country_names(self, country_code, names):
""" Add names for the given country to the search index.
"""
with self.conn.cursor() as cur:
cur.execute(
"""INSERT INTO word (word_id, word_token, country_code)
(SELECT nextval('seq_word'), lookup_token, %s
FROM (SELECT ' ' || make_standard_name(n) as lookup_token
FROM unnest(%s)n) y
WHERE NOT EXISTS(SELECT * FROM word
WHERE word_token = lookup_token and country_code = %s))
""", (country_code, names, country_code))
def process_place(self, place): def process_place(self, place):
""" Determine tokenizer information about the given place. """ Determine tokenizer information about the given place.
@@ -231,7 +246,14 @@ class LegacyNameAnalyzer:
""" """
token_info = _TokenInfo(self._cache) token_info = _TokenInfo(self._cache)
token_info.add_names(self.conn, place.get('name'), place.get('country_feature')) names = place.get('name')
if names:
token_info.add_names(self.conn, names)
country_feature = place.get('country_feature')
if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
self.add_country_names(country_feature.lower(), list(names.values()))
address = place.get('address') address = place.get('address')
@@ -279,22 +301,14 @@ class _TokenInfo:
self.data = {} self.data = {}
def add_names(self, conn, names, country_feature): def add_names(self, conn, names):
""" Add token information for the names of the place. """ Add token information for the names of the place.
""" """
if not names:
return
with conn.cursor() as cur: with conn.cursor() as cur:
# Create the token IDs for all names. # Create the token IDs for all names.
self.data['names'] = cur.scalar("SELECT make_keywords(%s)::text", self.data['names'] = cur.scalar("SELECT make_keywords(%s)::text",
(names, )) (names, ))
# Add country tokens to word table if necessary.
if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
cur.execute("SELECT create_country(%s, %s)",
(names, country_feature.lower()))
def add_housenumbers(self, conn, hnrs): def add_housenumbers(self, conn, hnrs):
""" Extract housenumber information from the address. """ Extract housenumber information from the address.
@@ -334,7 +348,8 @@ class _TokenInfo:
""" """
def _get_place(name): def _get_place(name):
with conn.cursor() as cur: with conn.cursor() as cur:
cur.execute("""SELECT (addr_ids_from_name(%s) || getorcreate_name_id(make_standard_name(%s), ''))::text, cur.execute("""SELECT (addr_ids_from_name(%s)
|| getorcreate_name_id(make_standard_name(%s), ''))::text,
word_ids_from_name(%s)::text""", word_ids_from_name(%s)::text""",
(name, name, name)) (name, name, name))
return cur.fetchone() return cur.fetchone()

View File

@@ -8,6 +8,7 @@ import subprocess
from pathlib import Path from pathlib import Path
import psutil import psutil
import psycopg2.extras
from nominatim.db.connection import connect, get_pg_env from nominatim.db.connection import connect, get_pg_env
from nominatim.db import utils as db_utils from nominatim.db import utils as db_utils
@@ -250,34 +251,37 @@ def create_search_indices(conn, config, drop=False):
sql.run_sql_file(conn, 'indices.sql', drop=drop) sql.run_sql_file(conn, 'indices.sql', drop=drop)
def create_country_names(conn, config): def create_country_names(conn, tokenizer, languages=None):
""" Create search index for default country names. """ Add default country names to search index. `languages` is a comma-
separated list of language codes as used in OSM. If `languages` is not
empty then only name translations for the given languages are added
to the index.
""" """
if languages:
languages = languages.split(',')
def _include_key(key):
return key == 'name' or \
(key.startswith('name:') \
and (not languages or key[5:] in languages))
with conn.cursor() as cur: with conn.cursor() as cur:
cur.execute("""SELECT getorcreate_country(make_standard_name('uk'), 'gb')""") psycopg2.extras.register_hstore(cur)
cur.execute("""SELECT getorcreate_country(make_standard_name('united states'), 'us')""") cur.execute("""SELECT country_code, name FROM country_name
cur.execute("""SELECT COUNT(*) FROM WHERE country_code is not null""")
(SELECT getorcreate_country(make_standard_name(country_code),
country_code) FROM country_name WHERE country_code is not null) AS x""")
cur.execute("""SELECT COUNT(*) FROM
(SELECT getorcreate_country(make_standard_name(name->'name'), country_code)
FROM country_name WHERE name ? 'name') AS x""")
sql_statement = """SELECT COUNT(*) FROM (SELECT getorcreate_country(make_standard_name(v),
country_code) FROM (SELECT country_code, skeys(name)
AS k, svals(name) AS v FROM country_name) x WHERE k"""
languages = config.LANGUAGES with tokenizer.name_analyzer() as analyzer:
for code, name in cur:
names = [code]
if code == 'gb':
names.append('UK')
if code == 'us':
names.append('United States')
# country names (only in languages as provided)
if name:
names.extend((v for k, v in name.items() if _include_key(k)))
analyzer.add_country_names(code, names)
if languages:
sql_statement = "{} IN (".format(sql_statement)
delim = ''
for language in languages.split(','):
sql_statement = "{}{}'name:{}'".format(sql_statement, delim, language)
delim = ', '
sql_statement = '{})'.format(sql_statement)
else:
sql_statement = "{} LIKE 'name:%'".format(sql_statement)
sql_statement = "{}) v".format(sql_statement)
cur.execute(sql_statement)
conn.commit() conn.commit()

View File

@@ -121,9 +121,8 @@ def table_factory(temp_db_cursor):
def mk_table(name, definition='id INT', content=None): def mk_table(name, definition='id INT', content=None):
temp_db_cursor.execute('CREATE TABLE {} ({})'.format(name, definition)) temp_db_cursor.execute('CREATE TABLE {} ({})'.format(name, definition))
if content is not None: if content is not None:
if not isinstance(content, str): psycopg2.extras.execute_values(
content = '),('.join([str(x) for x in content]) temp_db_cursor, "INSERT INTO {} VALUES %s".format(name), content)
temp_db_cursor.execute("INSERT INTO {} VALUES ({})".format(name, content))
return mk_table return mk_table
@@ -290,7 +289,7 @@ def osm2pgsql_options(temp_db):
@pytest.fixture @pytest.fixture
def sql_preprocessor(temp_db_conn, tmp_path, monkeypatch, table_factory): def sql_preprocessor(temp_db_conn, tmp_path, monkeypatch, table_factory):
table_factory('country_name', 'partition INT', (0, 1, 2)) table_factory('country_name', 'partition INT', ((0, ), (1, ), (2, )))
cfg = Configuration(None, SRC_DIR.resolve() / 'settings') cfg = Configuration(None, SRC_DIR.resolve() / 'settings')
cfg.set_libdirs(module='.', osm2pgsql='.', php=SRC_DIR / 'lib-php', cfg.set_libdirs(module='.', osm2pgsql='.', php=SRC_DIR / 'lib-php',
sql=tmp_path, data=SRC_DIR / 'data') sql=tmp_path, data=SRC_DIR / 'data')
@@ -299,9 +298,10 @@ def sql_preprocessor(temp_db_conn, tmp_path, monkeypatch, table_factory):
@pytest.fixture @pytest.fixture
def tokenizer_mock(monkeypatch, property_table, temp_db_conn, dsn): def tokenizer_mock(monkeypatch, property_table, temp_db_conn, tmp_path):
""" Sets up the configuration so that the test dummy tokenizer will be """ Sets up the configuration so that the test dummy tokenizer will be
loaded. loaded when the tokenizer factory is used. Also returns a factory
with which a new dummy tokenizer may be created.
""" """
monkeypatch.setenv('NOMINATIM_TOKENIZER', 'dummy') monkeypatch.setenv('NOMINATIM_TOKENIZER', 'dummy')
@@ -310,3 +310,8 @@ def tokenizer_mock(monkeypatch, property_table, temp_db_conn, dsn):
monkeypatch.setattr(importlib, "import_module", _import_dummy) monkeypatch.setattr(importlib, "import_module", _import_dummy)
properties.set_property(temp_db_conn, 'tokenizer', 'dummy') properties.set_property(temp_db_conn, 'tokenizer', 'dummy')
def _create_tokenizer():
return dummy_tokenizer.DummyTokenizer(None, None)
return _create_tokenizer

View File

@@ -13,6 +13,7 @@ class DummyTokenizer:
self.dsn = dsn self.dsn = dsn
self.data_dir = data_dir self.data_dir = data_dir
self.init_state = None self.init_state = None
self.analyser_cache = {}
def init_new_db(self, config): def init_new_db(self, config):
@@ -26,7 +27,7 @@ class DummyTokenizer:
def name_analyzer(self): def name_analyzer(self):
return DummyNameAnalyzer() return DummyNameAnalyzer(self.analyser_cache)
class DummyNameAnalyzer: class DummyNameAnalyzer:
@@ -38,18 +39,20 @@ class DummyNameAnalyzer:
self.close() self.close()
def __init__(self, cache):
self.analyser_cache = cache
cache['countries'] = []
def close(self): def close(self):
""" Free all resources used by the analyzer.
"""
pass pass
def add_postcodes_from_db(self): def add_postcodes_from_db(self):
pass pass
def process_place(self, place):
""" Determine tokenizer information about the given place.
Returns a JSON-serialisable structure that will be handed into def add_country_names(self, code, names):
the database via the token_info field. self.analyser_cache['countries'].append((code, names))
"""
def process_place(self, place):
return {} return {}

View File

@@ -143,7 +143,8 @@ def test_truncate_database_tables(temp_db_conn, temp_db_cursor, table_factory):
'location_property_tiger', 'location_property_osmline', 'location_property_tiger', 'location_property_osmline',
'location_postcode', 'search_name', 'location_road_23') 'location_postcode', 'search_name', 'location_road_23')
for table in tables: for table in tables:
table_factory(table, content=(1, 2, 3)) table_factory(table, content=((1, ), (2, ), (3, )))
assert temp_db_cursor.table_rows(table) == 3
database_import.truncate_data_tables(temp_db_conn) database_import.truncate_data_tables(temp_db_conn)
@@ -168,31 +169,28 @@ def test_load_data(dsn, src_dir, place_row, placex_table, osmline_table, word_ta
assert temp_db_cursor.table_rows('placex') == 30 assert temp_db_cursor.table_rows('placex') == 30
assert temp_db_cursor.table_rows('location_property_osmline') == 1 assert temp_db_cursor.table_rows('location_property_osmline') == 1
@pytest.mark.parametrize("languages", (False, True))
def test_create_country_names(temp_db_conn, temp_db_cursor, def_config, @pytest.mark.parametrize("languages", (None, ' fr,en'))
temp_db_with_extensions, monkeypatch, languages): def test_create_country_names(temp_db_with_extensions, temp_db_conn, temp_db_cursor,
table_factory, tokenizer_mock, languages):
table_factory('country_name', 'country_code varchar(2), name hstore',
content=(('us', '"name"=>"us1","name:af"=>"us2"'),
('fr', '"name"=>"Fra", "name:en"=>"Fren"')))
assert temp_db_cursor.scalar("SELECT count(*) FROM country_name") == 2
tokenizer = tokenizer_mock()
database_import.create_country_names(temp_db_conn, tokenizer, languages)
assert len(tokenizer.analyser_cache['countries']) == 2
result_set = {k: set(v) for k, v in tokenizer.analyser_cache['countries']}
if languages: if languages:
monkeypatch.setenv('NOMINATIM_LANGUAGES', 'fr,en') assert result_set == {'us' : set(('us', 'us1', 'United States')),
temp_db_cursor.execute("""CREATE FUNCTION make_standard_name (name TEXT) 'fr' : set(('fr', 'Fra', 'Fren'))}
RETURNS TEXT AS $$ SELECT 'a'::TEXT $$ LANGUAGE SQL
""")
temp_db_cursor.execute('CREATE TABLE country_name (country_code varchar(2), name hstore)')
temp_db_cursor.execute('CREATE TABLE word (code varchar(2))')
temp_db_cursor.execute("""INSERT INTO country_name VALUES ('us',
'"name"=>"us","name:af"=>"us"')""")
temp_db_cursor.execute("""CREATE OR REPLACE FUNCTION getorcreate_country(lookup_word TEXT,
lookup_country_code varchar(2))
RETURNS INTEGER
AS $$
BEGIN
INSERT INTO word VALUES (lookup_country_code);
RETURN 5;
END;
$$
LANGUAGE plpgsql;
""")
database_import.create_country_names(temp_db_conn, def_config)
if languages:
assert temp_db_cursor.table_rows('word') == 4
else: else:
assert temp_db_cursor.table_rows('word') == 5 assert result_set == {'us' : set(('us', 'us1', 'us2', 'United States')),
'fr' : set(('fr', 'Fra', 'Fren'))}

View File

@@ -11,9 +11,7 @@ def sql_tmp_path(tmp_path, def_config):
return tmp_path return tmp_path
@pytest.fixture @pytest.fixture
def conn(temp_db_conn, table_factory, monkeypatch): def conn(sql_preprocessor, temp_db_conn):
monkeypatch.setenv('NOMINATIM_DATABASE_MODULE_PATH', '.')
table_factory('country_name', 'partition INT', (0, 1, 2))
return temp_db_conn return temp_db_conn