#!/usr/bin/perl

## Convert data from a MySQL MediaWiki database into a Postgres MediaWiki database
## svn: $Id$

## NOTE: It is probably easier to dump your wiki using maintenance/dumpBackup.php
## and then import it with maintenance/importDump.php
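## For example (the paths and file name here are illustrative):
##   php maintenance/dumpBackup.php --full > wikidump.xml
##   php maintenance/importDump.php < wikidump.xml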

## If you run into UTF-8 problems, there are reports that adding --compatible=postgresql
## may help.

use strict;
use warnings;
use Data::Dumper;
use Getopt::Long;

use vars qw(%table %tz %special @torder $COM);
my $VERSION = '1.2';

## The following options can be changed via command line arguments:
my $MYSQLDB       = '';
my $MYSQLUSER     = '';

## If the following are zero-length, we omit their arguments entirely:
my $MYSQLHOST     = '';
my $MYSQLPASSWORD = '';
my $MYSQLSOCKET   = '';

## Name of the dump file created
my $MYSQLDUMPFILE = 'mediawiki_upgrade.pg';

## How verbose should this script be (0, 1, 2, or 3)
my $verbose = 0;

my $help = 0;

my $USAGE = "
Usage: $0 --db=<dbname> --user=<user> [OPTION]...
Example: $0 --db=wikidb --user=wikiuser --pass=sushi

Converts a MediaWiki schema from MySQL to Postgres
Options:
  db       Name of the MySQL database
  user     MySQL database username
  pass     MySQL database password
  host     MySQL database host
  socket   MySQL database socket
  verbose  Verbosity, increases with multiple uses
";

GetOptions
	(
	 'db=s'     => \$MYSQLDB,
	 'user=s'   => \$MYSQLUSER,
	 'pass=s'   => \$MYSQLPASSWORD,
	 'host=s'   => \$MYSQLHOST,
	 'socket=s' => \$MYSQLSOCKET,
	 'verbose+' => \$verbose,
	 'help'     => \$help,
 );

die $USAGE
	if ! length $MYSQLDB
	or ! length $MYSQLUSER
	or $help;

## The Postgres schema file: should not be changed
my $PG_SCHEMA = 'tables.sql';

## What version we default to when we can't parse the old schema
my $MW_DEFAULT_VERSION = 110;

## Try to find a working version of mysqldump
$verbose and warn "Locating the mysqldump executable\n";
my @MYSQLDUMP = ('/usr/local/bin/mysqldump', '/usr/bin/mysqldump');
my $MYSQLDUMP;
for my $mytry (@MYSQLDUMP) {
	next if ! -e $mytry;
	-x $mytry or die qq{Not an executable file: "$mytry"\n};
	my $version = qx{$mytry -V};
	$version =~ /^mysqldump\s+Ver\s+\d+/ or die qq{Program at "$mytry" does not act like mysqldump\n};
	$MYSQLDUMP = $mytry;
}
$MYSQLDUMP or die qq{Could not find the mysqldump program\n};

## Flags we use for mysqldump
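## (one INSERT per row with explicit column names, and none of the MySQL-specific
## statements Postgres cannot parse, such as LOCK TABLES, backtick-quoted names,
## or SET NAMES)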
my @MYSQLDUMPARGS = qw(
--skip-lock-tables
--complete-insert
--skip-extended-insert
--skip-add-drop-table
--skip-add-locks
--skip-disable-keys
--skip-set-charset
--skip-comments
--skip-quote-names
);


$verbose and warn "Checking that mysqldump can handle our flags\n";
## Make sure this version can handle all the flags we want.
## Combine with user dump below
my $MYSQLDUMPARGS = join ' ' => @MYSQLDUMPARGS;
## Argh. Any way to make this work on Win32?
my $version = qx{$MYSQLDUMP $MYSQLDUMPARGS 2>&1};
if ($version =~ /unknown option/) {
	die qq{Sorry, you need to use a newer version of the mysqldump program than the one at "$MYSQLDUMP"\n};
}

push @MYSQLDUMPARGS, "--user=$MYSQLUSER";
length $MYSQLPASSWORD and push @MYSQLDUMPARGS, "--password=$MYSQLPASSWORD";
length $MYSQLHOST and push @MYSQLDUMPARGS, "--host=$MYSQLHOST";

## Open the dump file to hold the mysqldump output
open my $mdump, '+>', $MYSQLDUMPFILE or die qq{Could not open "$MYSQLDUMPFILE": $!\n};
print qq{Writing file "$MYSQLDUMPFILE"\n};

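## Fork a child running mysqldump with --no-data (schema only); the parent reads
## the child's output, and select() makes the dump file the default print target.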
open my $mfork2, '-|' or exec $MYSQLDUMP, @MYSQLDUMPARGS, '--no-data', $MYSQLDB;
my $oldselect = select $mdump;

print while <$mfork2>;

## Slurp in the current schema
my $current_schema;
seek $mdump, 0, 0;
{
	local $/;
	$current_schema = <$mdump>;
}
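## Rewind and empty the file so the same handle can be reused for the real dump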
seek $mdump, 0, 0;
truncate $mdump, 0;

warn qq{Trying to determine database version...\n} if $verbose;

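## Guess the schema's vintage from which tables and columns exist: each test
## below corresponds to a structural change between MediaWiki releases.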
my $current_version = 0;
if ($current_schema =~ /CREATE TABLE \S+cur /) {
	$current_version = 103;
}
elsif ($current_schema =~ /CREATE TABLE \S+brokenlinks /) {
	$current_version = 104;
}
elsif ($current_schema !~ /CREATE TABLE \S+templatelinks /) {
	$current_version = 105;
}
elsif ($current_schema !~ /CREATE TABLE \S+validate /) {
	$current_version = 106;
}
elsif ($current_schema !~ /ipb_auto tinyint/) {
	$current_version = 107;
}
elsif ($current_schema !~ /CREATE TABLE \S+profiling /) {
	$current_version = 108;
}
elsif ($current_schema !~ /CREATE TABLE \S+querycachetwo /) {
	$current_version = 109;
}
else {
	$current_version = $MW_DEFAULT_VERSION;
}

if (!$current_version) {
	warn qq{WARNING! Could not figure out the old version, assuming MediaWiki $MW_DEFAULT_VERSION\n};
	$current_version = $MW_DEFAULT_VERSION;
}

## Check for a table prefix:
my $table_prefix = '';
if ($current_schema =~ /CREATE TABLE (\S+)querycache /) {
	$table_prefix = $1;
}

warn qq{Old schema is from MediaWiki version $current_version\n} if $verbose;
warn qq{Table prefix is "$table_prefix"\n} if $verbose and length $table_prefix;

$verbose and warn qq{Writing file "$MYSQLDUMPFILE"\n};
my $now = scalar localtime;
my $conninfo = '';
$MYSQLHOST and $conninfo .= "\n--   host      $MYSQLHOST";
$MYSQLSOCKET and $conninfo .= "\n--   socket    $MYSQLSOCKET";

print qq{
-- Dump of MySQL MediaWiki tables for import into a Postgres MediaWiki schema
-- Performed by the program: $0
-- Version: $VERSION (subversion }.q{$LastChangedRevision$}.qq{)
-- Author: Greg Sabino Mullane <greg\@turnstep.com> Comments welcome
--
-- This file was created: $now
-- Executable used: $MYSQLDUMP
-- Connection information:
--   database:  $MYSQLDB
--   user:      $MYSQLUSER$conninfo

-- This file can be imported manually with psql like so:
-- psql -p port# -h hostname -U username -f $MYSQLDUMPFILE databasename
-- This will overwrite any existing MediaWiki information, so be careful

};

## psql specific stuff
print q{
\\set ON_ERROR_STOP
BEGIN;
SET client_min_messages = 'WARNING';
SET timezone = 'GMT';
SET DateStyle = 'ISO, YMD';
};

warn qq{Reading in the Postgres schema information\n} if $verbose;
open my $schema, '<', $PG_SCHEMA
	or die qq{Could not open "$PG_SCHEMA": make sure this script is run from maintenance/postgres/\n};
my $t;
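## Collect each table name, its TIMESTAMP columns (%tz), and the tables it
## REFERENCES (%table), so we can order the dump and fix timestamps afterwards.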
while (<$schema>) {
	if (/CREATE TABLE\s+(\S+)/) {
		$t = $1;
		$table{$t}={};
		$verbose > 1 and warn qq{  Found table $t\n};
	}
	elsif (/^ +(\w+)\s+TIMESTAMP/) {
		$tz{$t}{$1}++;
		$verbose > 1 and warn qq{    Got a timestamp for column $1\n};
	}
	elsif (/REFERENCES\s*([^( ]+)/) {
		my $ref = $1;
		exists $table{$ref} or die qq{No parent table $ref found for $t\n};
		$table{$t}{$ref}++;
	}
}
close $schema or die qq{Could not close "$PG_SCHEMA": $!\n};

## Read in special cases and table/version information
$verbose and warn qq{Reading in schema exception information\n};
my %version_tables;
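## DATA entries are either "VERSION x.y: table ..." lines (version-dependent
## tables) or table name remappings between the Postgres and MySQL schemas.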
while (<DATA>) {
	if (/^VERSION\s+(\d+\.\d+):\s+(.+)/) {
		my $list = join '|' => split /\s+/ => $2;
		$version_tables{$1} = qr{\b$list\b};
		next;
	}
	next unless /^(\w+)\s*(.*)/;
	$special{$1} = $2||'';
	$special{$2} = $1 if length $2;
}

## Determine the order of tables based on foreign key constraints
$verbose and warn qq{Figuring out order of tables to dump\n};
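## Repeated sweeps over the table list: a table is queued only after every table
## it references was queued in an earlier sweep. A '---' marker separates sweeps
## so the dump below can be written out in foreign-key-safe chunks.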
my %dumped;
my $bail = 0;
{
	my $found=0;
	T: for my $t (sort keys %table) {
		next if exists $dumped{$t} and $dumped{$t} >= 1;
		$found=1;
		for my $dep (sort keys %{$table{$t}}) {
			next T if ! exists $dumped{$dep} or $dumped{$dep} < 0;
		}
		$dumped{$t} = -1 if ! exists $dumped{$t};
		## Skip certain tables that are not imported
		next if exists $special{$t} and !$special{$t};
		push @torder, $special{$t} || $t;
	}
	last if !$found;
	push @torder, '---';
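	## Promote everything queued this sweep (-1) so it counts as done next time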
	for (values %dumped) { $_+=2; }
	die "Too many loops!\n" if $bail++ > 1000;
	redo;
}

## Prepare the Postgres database for the move
$verbose and warn qq{Writing Postgres transformation information\n};

print "\n-- Empty out all existing tables\n";
$verbose and warn qq{Writing truncates to empty existing tables\n};


for my $t (@torder, 'objectcache', 'querycache') {
	next if $t eq '---';
	my $tname = $special{$t}||$t;
	printf qq{TRUNCATE TABLE %-20s CASCADE;\n}, qq{"$tname"};
}
print "\n\n";

print qq{-- Temporarily rename pagecontent to "${table_prefix}text"\n};
print qq{ALTER TABLE pagecontent RENAME TO "${table_prefix}text";\n\n};

print qq{-- Allow rc_ip to contain an empty string; it will be converted back at the end\n};
print qq{ALTER TABLE recentchanges ALTER rc_ip TYPE text USING host(rc_ip);\n\n};

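## MediaWiki stores its timestamps in MySQL as 14-character YYYYMMDDHHMMSS
## strings, so widen the Postgres columns to TEXT for the load; they are
## converted back to timestamptz near the end of this dump.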
print "-- Changing all timestamp fields to handle raw integers\n";
for my $t (sort keys %tz) {
	next if $t eq 'archive2';
	for my $c (sort keys %{$tz{$t}}) {
		printf "ALTER TABLE %-18s ALTER %-25s TYPE TEXT;\n", $t, $c;
	}
}
print "\n";

print q{
INSERT INTO page VALUES (0,-1,'Dummy Page','',0,0,0,default,now(),0,10);
};

## If we have a table prefix, we need to temporarily rename all of our Postgres
## tables for the import. Perhaps consider making this an auto-schema
## thing in the future.
if (length $table_prefix) {
	print qq{\n\n-- Temporarily renaming tables to accommodate the table_prefix "$table_prefix"\n\n};
	for my $t (@torder) {
		next if $t eq '---' or $t eq 'text' or $t eq 'user';
		my $tname = $special{$t}||$t;
		printf qq{ALTER TABLE %-18s RENAME TO "${table_prefix}$tname";\n}, qq{"$tname"};
	}
}


## Try to dump the ill-named "user" table:
## We do this table alone because "user" is a reserved word.
print q{

SET escape_string_warning TO 'off';
\\o /dev/null

-- Postgres uses a table name of "mwuser" instead of "user"

-- Create a dummy user to satisfy foreign key constraints, especially with revisions
SELECT setval('user_user_id_seq',0,'false');
INSERT INTO mwuser
  VALUES (DEFAULT,'Anonymous','',NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,now(),now());

};

push @MYSQLDUMPARGS, '--no-create-info';

$verbose and warn qq{Dumping "user" table\n};
$verbose > 2 and warn Dumper \@MYSQLDUMPARGS;
my $usertable = "${table_prefix}user";
open my $mfork, '-|' or exec $MYSQLDUMP, @MYSQLDUMPARGS, $MYSQLDB, $usertable;
## Unfortunately, there is no easy way to catch errors
my $numusers = 0;
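## Rewrite each INSERT to target "mwuser", counting rows as they stream past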
while (<$mfork>) {
	++$numusers and print if s/INSERT INTO $usertable/INSERT INTO mwuser/;
}
close $mfork;
if ($numusers < 1) {
	warn qq{No users found, probably a connection error.\n};
	print qq{ERROR: No users found, connection failed, or table "$usertable" does not exist. Dump aborted.\n};
	close $mdump or die qq{Could not close "$MYSQLDUMPFILE": $!\n};
	exit;
}
print "\n-- Users loaded: $numusers\n\n-- Loading rest of the mediawiki schema:\n";

warn qq{Dumping all other tables from the MySQL schema\n} if $verbose;

## Dump the rest of the tables, in chunks based on constraints
## We do not need the user table:
my @dumplist = grep { $_ ne 'user'} @torder;
my @alist;
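## Collect names from @torder until a '---' chunk marker is hit, skipping tables
## listed in the VERSION lines of the DATA section, then dump that chunk with
## mysqldump and repeat until the list is empty.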
{
	undef @alist;
	PICKATABLE: {
		my $tname = shift @dumplist;
		## XXX Make this dynamic below
		for my $ver (sort {$b <=> $a } keys %version_tables) {
			redo PICKATABLE if $tname =~ $version_tables{$ver};
		}
		$tname = "${table_prefix}$tname" if length $table_prefix;
		next if $tname !~ /^\w/;
		push @alist, $tname;
		$verbose and warn "  $tname...\n";
		pop @alist and last if index($alist[-1],'---') >= 0;
		redo if @dumplist;
	}

	## Dump everything else
	open my $mfork2, '-|' or exec $MYSQLDUMP, @MYSQLDUMPARGS, $MYSQLDB, @alist;
	print while <$mfork2>;
	close $mfork2;
	warn qq{Finished dumping from MySQL\n} if $verbose;

	redo if @dumplist;
}

warn qq{Writing information to return Postgres database to normal\n} if $verbose;
print qq{ALTER TABLE "${table_prefix}text" RENAME TO pagecontent;\n};
print qq{ALTER TABLE ${table_prefix}recentchanges ALTER rc_ip TYPE cidr USING\n};
print qq{  CASE WHEN rc_ip = '' THEN NULL ELSE rc_ip::cidr END;\n};

## Return tables to their original names if a table prefix was used.
if (length $table_prefix) {
	print qq{\n\n-- Renaming tables by removing table prefix "$table_prefix"\n\n};
	my $maxsize = 18;
	for (@torder) {
		$maxsize = length "$_$table_prefix" if length "$_$table_prefix" > $maxsize;
	}
	for my $t (@torder) {
		next if $t eq '---' or $t eq 'text' or $t eq 'user';
		my $tname = $special{$t}||$t;
		printf qq{ALTER TABLE %*s RENAME TO "$tname";\n}, $maxsize+1, qq{"${table_prefix}$tname"};
	}
}

print qq{\n\n-- Returning timestamps to normal\n};
for my $t (sort keys %tz) {
	next if $t eq 'archive2';
	for my $c (sort keys %{$tz{$t}}) {
		printf "ALTER TABLE %-18s ALTER %-25s TYPE timestamptz\n".
				"  USING TO_TIMESTAMP($c,'YYYYMMDDHHMISS');\n", $t, $c;
	}
}

## Reset sequences
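## setval(seq, 1+max, false) means the next nextval() call will return max+1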
print q{
SELECT setval('filearchive_fa_id_seq', 1+coalesce(max(fa_id)  ,0),false) FROM filearchive;
SELECT setval('ipblocks_ipb_id_seq',   1+coalesce(max(ipb_id) ,0),false) FROM ipblocks;
SELECT setval('job_job_id_seq',        1+coalesce(max(job_id) ,0),false) FROM job;
SELECT setval('logging_log_id_seq',    1+coalesce(max(log_id) ,0),false) FROM logging;
SELECT setval('page_page_id_seq',      1+coalesce(max(page_id),0),false) FROM page;
SELECT setval('page_restrictions_pr_id_seq', 1+coalesce(max(pr_id)  ,0),false) FROM page_restrictions;
SELECT setval('recentchanges_rc_id_seq',     1+coalesce(max(rc_id)  ,0),false) FROM recentchanges;
SELECT setval('revision_rev_id_seq',         1+coalesce(max(rev_id) ,0),false) FROM revision;
SELECT setval('text_old_id_seq',       1+coalesce(max(old_id) ,0),false) FROM pagecontent;
SELECT setval('user_user_id_seq',      1+coalesce(max(user_id),0),false) FROM mwuser;
};

print "COMMIT;\n\\o\n\n-- End of dump\n\n";
select $oldselect;
close $mdump or die qq{Could not close "$MYSQLDUMPFILE": $!\n};
exit;


__DATA__
## Known remappings: either indicate the MySQL name,
## or leave blank if it should be skipped
pagecontent text
mwuser user
archive2
profiling
objectcache

## Which tables to ignore depending on the version
VERSION 1.6: externallinks job templatelinks transcache
VERSION 1.7: filearchive langlinks querycache_info
VERSION 1.9: querycachetwo page_restrictions redirect