mirror of
https://github.com/osm-search/Nominatim.git
synced 2026-03-13 06:14:07 +00:00
refactor import_wiki* script for readability
This commit is contained in:
@@ -1,81 +1,213 @@
|
||||
#!/bin/bash
|
||||
|
||||
psqlcmd() {
|
||||
psql wikiprocessingdb
|
||||
psql --quiet wikiprocessingdb
|
||||
}
|
||||
|
||||
mysql2pgsqlcmd() {
|
||||
./mysql2pgsql.perl /dev/stdin /dev/stdout
|
||||
}
|
||||
|
||||
|
||||
# list the languages to process (refer to List of Wikipedias here: https://en.wikipedia.org/wiki/List_of_Wikipedias)
|
||||
|
||||
language=( "ar" "bg" "ca" "cs" "da" "de" "en" "es" "eo" "eu" "fa" "fr" "ko" "hi" "hr" "id" "it" "he" "lt" "hu" "ms" "nl" "ja" "no" "pl" "pt" "kk" "ro" "ru" "sk" "sl" "sr" "fi" "sv" "tr" "uk" "vi" "vo" "war" "zh" )
|
||||
download() {
|
||||
echo "Downloading $1"
|
||||
wget --quiet --no-clobber --tries 3
|
||||
}
|
||||
|
||||
|
||||
# create wikipedia calculation tables
|
||||
|
||||
echo "CREATE TABLE linkcounts (language text, title text, count integer, sumcount integer, lat double precision, lon double precision);" | psqlcmd
|
||||
echo "CREATE TABLE wikipedia_article (language text NOT NULL, title text NOT NULL, langcount integer, othercount integer, totalcount integer, lat double precision, lon double precision, importance double precision, title_en text, osm_type character(1), osm_id bigint );" | psqlcmd
|
||||
echo "CREATE TABLE wikipedia_redirect (language text, from_title text, to_title text );" | psqlcmd
|
||||
# languages to process (refer to List of Wikipedias here: https://en.wikipedia.org/wiki/List_of_Wikipedias)
|
||||
# requires Bash 4.0
|
||||
readarray -t LANGUAGES < languages.txt
|
||||
|
||||
|
||||
# download individual wikipedia language tables
|
||||
|
||||
for i in "${language[@]}"
|
||||
echo "====================================================================="
|
||||
echo "Create wikipedia calculation tables"
|
||||
echo "====================================================================="
|
||||
|
||||
echo "CREATE TABLE linkcounts (
|
||||
language text,
|
||||
title text,
|
||||
count integer,
|
||||
sumcount integer,
|
||||
lat double precision,
|
||||
lon double precision
|
||||
);" | psqlcmd
|
||||
|
||||
echo "CREATE TABLE wikipedia_article (
|
||||
language text NOT NULL,
|
||||
title text NOT NULL,
|
||||
langcount integer,
|
||||
othercount integer,
|
||||
totalcount integer,
|
||||
lat double precision,
|
||||
lon double precision,
|
||||
importance double precision,
|
||||
title_en text,
|
||||
osm_type character(1),
|
||||
osm_id bigint
|
||||
);" | psqlcmd
|
||||
|
||||
echo "CREATE TABLE wikipedia_redirect (
|
||||
language text,
|
||||
from_title text,
|
||||
to_title text
|
||||
);" | psqlcmd
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
echo "====================================================================="
|
||||
echo "Download individual wikipedia language tables"
|
||||
echo "====================================================================="
|
||||
|
||||
|
||||
for i in "${LANGUAGES[@]}"
|
||||
do
|
||||
wget https://dumps.wikimedia.org/${i}wiki/latest/${i}wiki-latest-page.sql.gz
|
||||
wget https://dumps.wikimedia.org/${i}wiki/latest/${i}wiki-latest-pagelinks.sql.gz
|
||||
wget https://dumps.wikimedia.org/${i}wiki/latest/${i}wiki-latest-langlinks.sql.gz
|
||||
wget https://dumps.wikimedia.org/${i}wiki/latest/${i}wiki-latest-redirect.sql.gz
|
||||
echo "Language: $i"
|
||||
|
||||
# english is the largest
|
||||
# 1.7G enwiki-latest-page.sql.gz
|
||||
# 6.2G enwiki-latest-pagelinks.sql.gz
|
||||
# 355M enwiki-latest-langlinks.sql.gz
|
||||
# 128M enwiki-latest-redirect.sql.gz
|
||||
|
||||
# example of smaller languge turkish
|
||||
# 53M trwiki-latest-page.sql.gz
|
||||
# 176M trwiki-latest-pagelinks.sql.gz
|
||||
# 106M trwiki-latest-langlinks.sql.gz
|
||||
# 3.2M trwiki-latest-redirect.sql.gz
|
||||
|
||||
download https://dumps.wikimedia.org/${i}wiki/latest/${i}wiki-latest-page.sql.gz
|
||||
download https://dumps.wikimedia.org/${i}wiki/latest/${i}wiki-latest-pagelinks.sql.gz
|
||||
download https://dumps.wikimedia.org/${i}wiki/latest/${i}wiki-latest-langlinks.sql.gz
|
||||
download https://dumps.wikimedia.org/${i}wiki/latest/${i}wiki-latest-redirect.sql.gz
|
||||
done
|
||||
|
||||
|
||||
# import individual wikipedia language tables
|
||||
|
||||
for i in "${language[@]}"
|
||||
|
||||
|
||||
echo "====================================================================="
|
||||
echo "Import individual wikipedia language tables"
|
||||
echo "====================================================================="
|
||||
|
||||
for i in "${LANGUAGES[@]}"
|
||||
do
|
||||
echo "Language: $i"
|
||||
|
||||
echo "Importing ${i}wiki-latest-pagelinks"
|
||||
gzip -dc ${i}wiki-latest-pagelinks.sql.gz | sed "s/\`pagelinks\`/\`${i}pagelinks\`/g" | mysql2pgsqlcmd | psqlcmd
|
||||
gzip -dc ${i}wiki-latest-page.sql.gz | sed "s/\`page\`/\`${i}page\`/g" | mysql2pgsqlcmd | psqlcmd
|
||||
|
||||
echo "Importing ${i}wiki-latest-page"
|
||||
gzip -dc ${i}wiki-latest-page.sql.gz | sed "s/\`page\`/\`${i}page\`/g" | mysql2pgsqlcmd | psqlcmd
|
||||
|
||||
echo "Importing ${i}wiki-latest-langlinks"
|
||||
gzip -dc ${i}wiki-latest-langlinks.sql.gz | sed "s/\`langlinks\`/\`${i}langlinks\`/g" | mysql2pgsqlcmd | psqlcmd
|
||||
gzip -dc ${i}wiki-latest-redirect.sql.gz | sed "s/\`redirect\`/\`${i}redirect\`/g" | mysql2pgsqlcmd | psqlcmd
|
||||
|
||||
echo "Importing ${i}wiki-latest-redirect"
|
||||
gzip -dc ${i}wiki-latest-redirect.sql.gz | sed "s/\`redirect\`/\`${i}redirect\`/g" | mysql2pgsqlcmd | psqlcmd
|
||||
done
|
||||
|
||||
|
||||
# process language tables and associated pagelink counts
|
||||
|
||||
for i in "${language[@]}"
|
||||
do
|
||||
echo "create table ${i}pagelinkcount as select pl_title as title,count(*) as count from ${i}pagelinks where pl_namespace = 0 group by pl_title;" | psqlcmd
|
||||
echo "insert into linkcounts select '${i}',pl_title,count(*) from ${i}pagelinks where pl_namespace = 0 group by pl_title;" | psqlcmd
|
||||
echo "insert into wikipedia_redirect select '${i}',page_title,rd_title from ${i}redirect join ${i}page on (rd_from = page_id) where page_namespace = 0 and rd_namespace = 0;" | psqlcmd
|
||||
echo "alter table ${i}pagelinkcount add column othercount integer;" | psqlcmd
|
||||
echo "update ${i}pagelinkcount set othercount = 0;" | psqlcmd
|
||||
done
|
||||
|
||||
for i in "${language[@]}"
|
||||
|
||||
echo "====================================================================="
|
||||
echo "Process language tables and associated pagelink counts"
|
||||
echo "====================================================================="
|
||||
|
||||
|
||||
for i in "${LANGUAGES[@]}"
|
||||
do
|
||||
for j in "${language[@]}"
|
||||
echo "Language: $i"
|
||||
|
||||
echo "CREATE TABLE ${i}pagelinkcount
|
||||
AS
|
||||
SELECT pl_title AS title,
|
||||
COUNT(*) AS count
|
||||
FROM ${i}pagelinks
|
||||
WHERE pl_namespace = 0
|
||||
GROUP BY pl_title
|
||||
;" | psqlcmd
|
||||
|
||||
echo "INSERT INTO linkcounts
|
||||
SELECT '${i}',
|
||||
pl_title,
|
||||
COUNT(*)
|
||||
FROM ${i}pagelinks
|
||||
WHERE pl_namespace = 0
|
||||
GROUP BY pl_title
|
||||
;" | psqlcmd
|
||||
|
||||
echo "INSERT INTO wikipedia_redirect
|
||||
SELECT '${i}',
|
||||
page_title,
|
||||
rd_title
|
||||
FROM ${i}redirect
|
||||
JOIN ${i}page ON (rd_from = page_id)
|
||||
WHERE page_namespace = 0
|
||||
AND rd_namespace = 0
|
||||
;" | psqlcmd
|
||||
|
||||
echo "ALTER TABLE ${i}pagelinkcount
|
||||
ADD COLUMN othercount integer
|
||||
;" | psqlcmd
|
||||
|
||||
echo "UPDATE ${i}pagelinkcount
|
||||
SET othercount = 0
|
||||
;" | psqlcmd
|
||||
|
||||
for j in "${LANGUAGES[@]}"
|
||||
do
|
||||
echo "update ${i}pagelinkcount set othercount = ${i}pagelinkcount.othercount + x.count from (select page_title as title,count from ${i}langlinks join ${i}page on (ll_from = page_id) join ${j}pagelinkcount on (ll_lang = '${j}' and ll_title = title)) as x where x.title = ${i}pagelinkcount.title;" | psqlcmd
|
||||
echo "UPDATE ${i}pagelinkcount
|
||||
SET othercount = ${i}pagelinkcount.othercount + x.count
|
||||
FROM (
|
||||
SELECT page_title AS title,
|
||||
count
|
||||
FROM ${i}langlinks
|
||||
JOIN ${i}page ON (ll_from = page_id)
|
||||
JOIN ${j}pagelinkcount ON (ll_lang = '${j}' AND ll_title = title)
|
||||
) AS x
|
||||
WHERE x.title = ${i}pagelinkcount.title
|
||||
;" | psqlcmd
|
||||
done
|
||||
echo "insert into wikipedia_article select '${i}', title, count, othercount, count+othercount from ${i}pagelinkcount;" | psqlcmd
|
||||
|
||||
echo "INSERT INTO wikipedia_article
|
||||
SELECT '${i}',
|
||||
title,
|
||||
count,
|
||||
othercount,
|
||||
count + othercount
|
||||
FROM ${i}pagelinkcount
|
||||
;" | psqlcmd
|
||||
done
|
||||
|
||||
|
||||
# calculate importance score for each wikipedia page
|
||||
|
||||
echo "update wikipedia_article set importance = log(totalcount)/log((select max(totalcount) from wikipedia_article))" | psqlcmd
|
||||
|
||||
|
||||
# clean up intermediate tables to conserve space
|
||||
|
||||
for i in "${language[@]}"
|
||||
echo "====================================================================="
|
||||
echo "Calculate importance score for each wikipedia page"
|
||||
echo "====================================================================="
|
||||
|
||||
echo "UPDATE wikipedia_article
|
||||
SET importance = LOG(totalcount)/LOG((SELECT MAX(totalcount) FROM wikipedia_article))
|
||||
;" | psqlcmd
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
echo "====================================================================="
|
||||
echo "Clean up intermediate tables to conserve space"
|
||||
echo "====================================================================="
|
||||
|
||||
for i in "${LANGUAGES[@]}"
|
||||
do
|
||||
echo "DROP TABLE ${i}pagelinks;" | psqlcmd
|
||||
echo "DROP TABLE ${i}page;" | psqlcmd
|
||||
echo "DROP TABLE ${i}langlinks;" | psqlcmd
|
||||
echo "DROP TABLE ${i}redirect;" | psqlcmd
|
||||
echo "DROP TABLE ${i}pagelinks;" | psqlcmd
|
||||
echo "DROP TABLE ${i}page;" | psqlcmd
|
||||
echo "DROP TABLE ${i}langlinks;" | psqlcmd
|
||||
echo "DROP TABLE ${i}redirect;" | psqlcmd
|
||||
echo "DROP TABLE ${i}pagelinkcount;" | psqlcmd
|
||||
done
|
||||
|
||||
Reference in New Issue
Block a user