summaryrefslogtreecommitdiff
path: root/maintenance/importUseModWiki.php
diff options
context:
space:
mode:
Diffstat (limited to 'maintenance/importUseModWiki.php')
-rw-r--r--maintenance/importUseModWiki.php533
1 files changed, 272 insertions, 261 deletions
diff --git a/maintenance/importUseModWiki.php b/maintenance/importUseModWiki.php
index bff4cd02..a28d57a5 100644
--- a/maintenance/importUseModWiki.php
+++ b/maintenance/importUseModWiki.php
@@ -1,5 +1,4 @@
<?php
-
/**
* Import data from a UseModWiki into a MediaWiki wiki
* 2003-02-09 Brion VIBBER <brion@pobox.com>
@@ -21,45 +20,69 @@
* schema changes.
* 2005-03-14
*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ * http://www.gnu.org/copyleft/gpl.html
+ *
* @todo document
* @file
* @ingroup Maintenance
*/
-if ( php_sapi_name() != 'cli' ) {
- echo "Please customize the settings and run me from the command line.";
- die( -1 );
-}
-
-/** Set these correctly! */
-$wgImportEncoding = "CP1252"; /* We convert all to UTF-8 */
-$wgRootDirectory = "/kalman/Projects/wiki2002/wiki/lib-http/db/wiki";
+require_once( "Maintenance.php" );
-/* On a large wiki, you might run out of memory */
-@ini_set( 'memory_limit', '40M' );
+class ImportUseModWiki extends Maintenance {
-/* globals */
-$wgFieldSeparator = "\xb3"; # Some wikis may use different char
- $FS = $wgFieldSeparator ;
- $FS1 = $FS . "1" ;
- $FS2 = $FS . "2" ;
- $FS3 = $FS . "3" ;
+ private $encoding, $rootDirectory = '';
-# Unicode sanitization tools
-require_once( dirname( dirname( __FILE__ ) ) . '/includes/normal/UtfNormal.php' );
-
-$usercache = array();
+ /**
+ * Field separators
+ * @var String
+ */
+ private $FS1, $FS2, $FS3 = '';
-importPages();
+ /**
+ * @var Array
+ */
+ private $usercache, $nowiki = array();
-# ------------------------------------------------------------------------------
+ public function __construct() {
+ parent::__construct();
+ $this->mDescription = "Import pages from UseMod wikis";
+ $this->addOption( 'encoding', 'Encoding of the imported text, default CP1252', false, true );
+ /**
+ * If UseModWiki's New File System is used:
+ * $NewFS = 1; # 1 = new multibyte $FS, 0 = old $FS
+ * Use "\xb3"; for the Old File System
+ * Changed with UTF-8 UseModWiki
+ * http://www.usemod.com/cgi-bin/wiki.pl?SupportForUtf8
+ * http://www.usemod.com/cgi-bin/wiki.pl?WikiBugs/NewFieldSeparatorWronglyTreated
+ * http://www.meatballwiki.org/wiki/WikiEngine#Q_amp_A
+ */
+ $this->addOption( 'separator', 'Field separator to use, default \x1E\xFF\xFE\x1E', false, true );
+ $this->addArg( 'path', 'Path to your UseMod wiki' );
+ }
-function importPages()
-{
- global $wgRootDirectory;
+ public function execute() {
+ $this->rootDirectory = $this->getArg();
+ $this->encoding = $this->getOption( 'encoding', 'CP1252' );
+ $sep = $this->getOption( 'separator', "\x1E\xFF\xFE\x1E" );
+ $this->FS1 = "{$sep}1";
+ $this->FS2 = "{$sep}2";
+ $this->FS3 = "{$sep}3";
- $gt = '>';
- echo <<<XML
+ echo <<<XML
<?xml version="1.0" encoding="UTF-8" ?>
<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.1/"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
@@ -75,290 +98,278 @@ XML;
'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R',
'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'other' );
foreach ( $letters as $letter ) {
- $dir = "$wgRootDirectory/page/$letter";
+ $dir = "{$this->rootDirectory}/page/$letter";
if ( is_dir( $dir ) )
- importPageDirectory( $dir );
+ $this->importPageDirectory( $dir );
}
echo <<<XML
</mediawiki>
XML;
-}
+ }
-function importPageDirectory( $dir, $prefix = "" )
-{
- echo "\n<!-- Checking page directory " . xmlCommentSafe( $dir ) . " -->\n";
- $mydir = opendir( $dir );
- while ( $entry = readdir( $mydir ) ) {
- $m = array();
- if ( preg_match( '/^(.+)\.db$/', $entry, $m ) ) {
- echo importPage( $prefix . $m[1] );
- } else {
- if ( is_dir( "$dir/$entry" ) ) {
- if ( $entry != '.' && $entry != '..' ) {
- importPageDirectory( "$dir/$entry", "$entry/" );
- }
+ private function importPageDirectory( $dir, $prefix = "" ) {
+ echo "\n<!-- Checking page directory " . $this->xmlCommentSafe( $dir ) . " -->\n";
+ $mydir = opendir( $dir );
+ while ( $entry = readdir( $mydir ) ) {
+ $m = array();
+ if ( preg_match( '/^(.+)\.db$/', $entry, $m ) ) {
+ echo $this->importPage( $prefix . $m[1] );
} else {
- echo "<!-- File '" . xmlCommentSafe( $entry ) . "' doesn't seem to contain an article. Skipping. -->\n";
+ if ( is_dir( "$dir/$entry" ) ) {
+ if ( $entry != '.' && $entry != '..' ) {
+ $this->importPageDirectory( "$dir/$entry", "$entry/" );
+ }
+ } else {
+ echo "<!-- File '" . $this->xmlCommentSafe( $entry ) . "' doesn't seem to contain an article. Skipping. -->\n";
+ }
}
}
}
-}
-
-# ------------------------------------------------------------------------------
-
-/* fetch_ functions
- Grab a given item from the database
- */
-
-function useModFilename( $title ) {
- $c = substr( $title, 0, 1 );
- if ( preg_match( '/[A-Z]/i', $c ) ) {
- return strtoupper( $c ) . "/$title";
- }
- return "other/$title";
-}
-
-function fetchPage( $title )
-{
- global $FS1, $FS2, $FS3, $wgRootDirectory;
-
- $fname = $wgRootDirectory . "/page/" . useModFilename( $title ) . ".db";
- if ( !file_exists( $fname ) ) {
- echo "Couldn't open file '$fname' for page '$title'.\n";
- die( -1 );
+ private function useModFilename( $title ) {
+ $c = substr( $title, 0, 1 );
+ if ( preg_match( '/[A-Z]/i', $c ) ) {
+ return strtoupper( $c ) . "/$title";
+ }
+ return "other/$title";
}
- $page = splitHash( $FS1, file_get_contents( $fname ) );
- $section = splitHash( $FS2, $page["text_default"] );
- $text = splitHash( $FS3, $section["data"] );
-
- return array2object( array( "text" => $text["text"] , "summary" => $text["summary"] ,
- "minor" => $text["minor"] , "ts" => $section["ts"] ,
- "username" => $section["username"] , "host" => $section["host"] ) );
-}
-
-function fetchKeptPages( $title )
-{
- global $FS1, $FS2, $FS3, $wgRootDirectory;
-
- $fname = $wgRootDirectory . "/keep/" . useModFilename( $title ) . ".kp";
- if ( !file_exists( $fname ) ) return array();
+ private function fetchPage( $title ) {
+ $fname = $this->rootDirectory . "/page/" . $this->useModFilename( $title ) . ".db";
+ if ( !file_exists( $fname ) ) {
+ echo "Couldn't open file '$fname' for page '$title'.\n";
+ die( -1 );
+ }
- $keptlist = explode( $FS1, file_get_contents( $fname ) );
- array_shift( $keptlist ); # Drop the junk at beginning of file
+ $page = $this->splitHash( $this->FS1, file_get_contents( $fname ) );
+ $section = $this->splitHash( $this->FS2, $page["text_default"] );
+ $text = $this->splitHash( $this->FS3, $section["data"] );
- $revisions = array();
- foreach ( $keptlist as $rev ) {
- $section = splitHash( $FS2, $rev );
- $text = splitHash( $FS3, $section["data"] );
- if ( $text["text"] && $text["minor"] != "" && ( $section["ts"] * 1 > 0 ) ) {
- array_push( $revisions, array2object( array ( "text" => $text["text"] , "summary" => $text["summary"] ,
- "minor" => $text["minor"] , "ts" => $section["ts"] ,
- "username" => $section["username"] , "host" => $section["host"] ) ) );
- } else {
- echo "<!-- skipped a bad old revision -->\n";
- }
+ return $this->array2object( array( "text" => $text["text"] , "summary" => $text["summary"] ,
+ "minor" => $text["minor"] , "ts" => $section["ts"] ,
+ "username" => $section["username"] , "host" => $section["host"] ) );
}
- return $revisions;
-}
-function splitHash ( $sep , $str ) {
- $temp = explode ( $sep , $str ) ;
- $ret = array () ;
- for ( $i = 0; $i + 1 < count ( $temp ) ; $i++ ) {
- $ret[$temp[$i]] = $temp[++$i] ;
+ private function fetchKeptPages( $title ) {
+ $fname = $this->rootDirectory . "/keep/" . $this->useModFilename( $title ) . ".kp";
+ if ( !file_exists( $fname ) ) return array();
+
+ $keptlist = explode( $this->FS1, file_get_contents( $fname ) );
+ array_shift( $keptlist ); # Drop the junk at beginning of file
+
+ $revisions = array();
+ foreach ( $keptlist as $rev ) {
+ $section = $this->splitHash( $this->FS2, $rev );
+ $text = $this->splitHash( $this->FS3, $section["data"] );
+ if ( $text["text"] && $text["minor"] != "" && ( $section["ts"] * 1 > 0 ) ) {
+ array_push( $revisions, $this->array2object( array ( "text" => $text["text"] , "summary" => $text["summary"] ,
+ "minor" => $text["minor"] , "ts" => $section["ts"] ,
+ "username" => $section["username"] , "host" => $section["host"] ) ) );
+ } else {
+ echo "<!-- skipped a bad old revision -->\n";
+ }
}
- return $ret ;
+ return $revisions;
}
+ private function splitHash( $sep , $str ) {
+ $temp = explode ( $sep , $str ) ;
+ $ret = array () ;
+ for ( $i = 0; $i + 1 < count ( $temp ) ; $i++ ) {
+ $ret[$temp[$i]] = $temp[++$i] ;
+ }
+ return $ret ;
+ }
-/* import_ functions
- Take a fetched item and produce SQL
- */
-
-function checkUserCache( $name, $host )
-{
- global $usercache;
-
- if ( $name ) {
- if ( in_array( $name, $usercache ) ) {
- $userid = $usercache[$name];
+ private function checkUserCache( $name, $host ) {
+ if ( $name ) {
+ if ( in_array( $name, $this->usercache ) ) {
+ $userid = $this->usercache[$name];
+ } else {
+ # If we haven't imported user accounts
+ $userid = 0;
+ }
+ $username = str_replace( '_', ' ', $name );
} else {
- # If we haven't imported user accounts
$userid = 0;
+ $username = $host;
}
- $username = str_replace( '_', ' ', $name );
- } else {
- $userid = 0;
- $username = $host;
+ return array( $userid, $username );
}
- return array( $userid, $username );
-}
-
-function importPage( $title )
-{
- echo "\n<!-- Importing page " . xmlCommentSafe( $title ) . " -->\n";
- $page = fetchPage( $title );
-
- $newtitle = xmlsafe( str_replace( '_', ' ', recodeText( $title ) ) );
- $munged = mungeFormat( $page->text );
- if ( $munged != $page->text ) {
- /**
- * Save a *new* revision with the conversion, and put the
- * previous last version into the history.
- */
- $next = array2object( array(
- 'text' => $munged,
- 'minor' => 1,
- 'username' => 'Conversion script',
- 'host' => '127.0.0.1',
- 'ts' => time(),
- 'summary' => 'link fix',
- ) );
- $revisions = array( $page, $next );
- } else {
- /**
- * Current revision:
- */
- $revisions = array( $page );
- }
- $xml = <<<XML
- <page>
- <title>$newtitle</title>
+ private function importPage( $title ) {
+ echo "\n<!-- Importing page " . $this->xmlCommentSafe( $title ) . " -->\n";
+ $page = $this->fetchPage( $title );
+
+ $newtitle = $this->xmlsafe( str_replace( '_', ' ', $this->recodeText( $title ) ) );
+
+ $munged = $this->mungeFormat( $page->text );
+ if ( $munged != $page->text ) {
+ /**
+ * Save a *new* revision with the conversion, and put the
+ * previous last version into the history.
+ */
+ $next = $this->array2object( array(
+ 'text' => $munged,
+ 'minor' => 1,
+ 'username' => 'Conversion script',
+ 'host' => '127.0.0.1',
+ 'ts' => time(),
+ 'summary' => 'link fix',
+ ) );
+ $revisions = array( $page, $next );
+ } else {
+ /**
+ * Current revision:
+ */
+ $revisions = array( $page );
+ }
+ $xml = <<<XML
+ <page>
+ <title>$newtitle</title>
XML;
- # History
- $revisions = array_merge( $revisions, fetchKeptPages( $title ) );
- if ( count( $revisions ) == 0 ) {
- return NULL; // Was "$sql", which does not appear to be defined.
- }
+ # History
+ $revisions = array_merge( $revisions, $this->fetchKeptPages( $title ) );
+ if ( count( $revisions ) == 0 ) {
+ return NULL; // Was "$sql", which does not appear to be defined.
+ }
- foreach ( $revisions as $rev ) {
- $text = xmlsafe( recodeText( $rev->text ) );
- $minor = ( $rev->minor ? '<minor/>' : '' );
- list( /* $userid */ , $username ) = checkUserCache( $rev->username, $rev->host );
- $username = xmlsafe( recodeText( $username ) );
- $timestamp = xmlsafe( timestamp2ISO8601( $rev->ts ) );
- $comment = xmlsafe( recodeText( $rev->summary ) );
-
- $xml .= <<<XML
- <revision>
- <timestamp>$timestamp</timestamp>
- <contributor><username>$username</username></contributor>
- $minor
- <comment>$comment</comment>
- <text>$text</text>
- </revision>
+ foreach ( $revisions as $rev ) {
+ $text = $this->xmlsafe( $this->recodeText( $rev->text ) );
+ $minor = ( $rev->minor ? '<minor/>' : '' );
+ list( /* $userid */ , $username ) = $this->checkUserCache( $rev->username, $rev->host );
+ $username = $this->xmlsafe( $this->recodeText( $username ) );
+ $timestamp = $this->xmlsafe( $this->timestamp2ISO8601( $rev->ts ) );
+ $comment = $this->xmlsafe( $this->recodeText( $rev->summary ) );
+
+ $xml .= <<<XML
+ <revision>
+ <timestamp>$timestamp</timestamp>
+ <contributor><username>$username</username></contributor>
+ $minor
+ <comment>$comment</comment>
+ <text>$text</text>
+ </revision>
XML;
+ }
+ $xml .= "</page>\n\n";
+ return $xml;
}
- $xml .= "</page>\n\n";
- return $xml;
-}
-# Whee!
-function recodeText( $string ) {
- global $wgImportEncoding;
- # For currently latin-1 wikis
- $string = str_replace( "\r\n", "\n", $string );
- $string = @iconv( $wgImportEncoding, "UTF-8", $string );
- $string = wfMungeToUtf8( $string ); # Any old &#1234; stuff
- return $string;
-}
-
-function wfUtf8Sequence( $codepoint ) {
- if ( $codepoint < 0x80 ) return chr( $codepoint );
- if ( $codepoint < 0x800 ) return chr( $codepoint >> 6 & 0x3f | 0xc0 ) .
- chr( $codepoint & 0x3f | 0x80 );
- if ( $codepoint < 0x10000 ) return chr( $codepoint >> 12 & 0x0f | 0xe0 ) .
- chr( $codepoint >> 6 & 0x3f | 0x80 ) .
- chr( $codepoint & 0x3f | 0x80 );
- if ( $codepoint < 0x100000 ) return chr( $codepoint >> 18 & 0x07 | 0xf0 ) . # Double-check this
- chr( $codepoint >> 12 & 0x3f | 0x80 ) .
- chr( $codepoint >> 6 & 0x3f | 0x80 ) .
- chr( $codepoint & 0x3f | 0x80 );
- # Doesn't yet handle outside the BMP
- return "&#$codepoint;";
-}
+ private function recodeText( $string ) {
+ # For currently latin-1 wikis
+ $string = str_replace( "\r\n", "\n", $string );
+ $string = @iconv( $this->encoding, "UTF-8", $string );
+ $string = $this->mungeToUtf8( $string ); # Any old &#1234; stuff
+ return $string;
+ }
-function wfMungeToUtf8( $string ) {
- $string = preg_replace ( '/&#([0-9]+);/e', 'wfUtf8Sequence($1)', $string );
- $string = preg_replace ( '/&#x([0-9a-f]+);/ie', 'wfUtf8Sequence(0x$1)', $string );
- # Should also do named entities here
- return $string;
-}
+ /**
+ * @todo FIXME: Don't use /e
+ */
+ private function mungeToUtf8( $string ) {
+ $string = preg_replace ( '/&#([0-9]+);/e', 'wfUtf8Sequence($1)', $string );
+ $string = preg_replace ( '/&#x([0-9a-f]+);/ie', 'wfUtf8Sequence(0x$1)', $string );
+ # Should also do named entities here
+ return $string;
+ }
-function timestamp2ISO8601( $ts ) {
- # 2003-08-05T18:30:02Z
- return gmdate( 'Y-m-d', $ts ) . 'T' . gmdate( 'H:i:s', $ts ) . 'Z';
-}
+ private function timestamp2ISO8601( $ts ) {
+ # 2003-08-05T18:30:02Z
+ return gmdate( 'Y-m-d', $ts ) . 'T' . gmdate( 'H:i:s', $ts ) . 'Z';
+ }
-function xmlsafe( $string ) {
/**
* The page may contain old data which has not been properly normalized.
* Invalid UTF-8 sequences or forbidden control characters will make our
* XML output invalid, so be sure to strip them out.
+ * @param String $string Text to clean up
+ * @return String
*/
- $string = UtfNormal::cleanUp( $string );
-
- $string = htmlspecialchars( $string );
- return $string;
-}
-
-function xmlCommentSafe( $text ) {
- return str_replace( '--', '\\-\\-', xmlsafe( recodeText( $text ) ) );
-}
+ private function xmlsafe( $string ) {
+ $string = UtfNormal::cleanUp( $string );
+ $string = htmlspecialchars( $string );
+ return $string;
+ }
+ private function xmlCommentSafe( $text ) {
+ return str_replace( '--', '\\-\\-', $this->xmlsafe( $this->recodeText( $text ) ) );
+ }
-function array2object( $arr ) {
- $o = (object)0;
- foreach ( $arr as $x => $y ) {
- $o->$x = $y;
+ private function array2object( $arr ) {
+ $o = (object)0;
+ foreach ( $arr as $x => $y ) {
+ $o->$x = $y;
+ }
+ return $o;
}
- return $o;
-}
+ /**
+ * Make CamelCase and /Talk links work
+ */
+ private function mungeFormat( $text ) {
+ $this->nowiki = array();
+ $staged = preg_replace_callback(
+ '/(<nowiki>.*?<\\/nowiki>|(?:http|https|ftp):\\S+|\[\[[^]\\n]+]])/s',
+ array( $this, 'nowikiPlaceholder' ), $text );
+
+ # This is probably not 100% correct, I'm just
+ # glancing at the UseModWiki code.
+ $upper = "[A-Z]";
+ $lower = "[a-z_0-9]";
+ $any = "[A-Za-z_0-9]";
+ $camel = "(?:$upper+$lower+$upper+$any*)";
+ $subpage = "(?:\\/$any+)";
+ $substart = "(?:\\/$upper$any*)";
+
+ $munged = preg_replace( "/(?!\\[\\[)($camel$subpage*|$substart$subpage*)\\b(?!\\]\\]|>)/",
+ '[[$1]]', $staged );
+
+ $final = preg_replace( '/' . preg_quote( $this->placeholder() ) . '/s',
+ array( $this, 'nowikiShift' ), $munged );
+ return $final;
+ }
-/**
- * Make CamelCase and /Talk links work
- */
-function mungeFormat( $text ) {
- global $nowiki;
- $nowiki = array();
- $staged = preg_replace_callback(
- '/(<nowiki>.*?<\\/nowiki>|(?:http|https|ftp):\\S+|\[\[[^]\\n]+]])/s',
- 'nowikiPlaceholder', $text );
-
- # This is probably not 100% correct, I'm just
- # glancing at the UseModWiki code.
- $upper = "[A-Z]";
- $lower = "[a-z_0-9]";
- $any = "[A-Za-z_0-9]";
- $camel = "(?:$upper+$lower+$upper+$any*)";
- $subpage = "(?:\\/$any+)";
- $substart = "(?:\\/$upper$any*)";
-
- $munged = preg_replace( "/(?!\\[\\[)($camel$subpage*|$substart$subpage*)\\b(?!\\]\\]|>)/",
- '[[$1]]', $staged );
-
- $final = preg_replace( '/' . preg_quote( placeholder() ) . '/es',
- 'array_shift( $nowiki )', $munged );
- return $final;
-}
+ private function placeholder( $x = null ) {
+ return '\xffplaceholder\xff';
+ }
+ public function nowikiPlaceholder( $matches ) {
+ $this->nowiki[] = $matches[1];
+ return $this->placeholder();
+ }
-function placeholder( $x = null ) {
- return '\xffplaceholder\xff';
+ public function nowikiShift() {
+ return array_shift( $this->nowiki );
+ }
}
-function nowikiPlaceholder( $matches ) {
- global $nowiki;
- $nowiki[] = $matches[1];
- return placeholder();
+function wfUtf8Sequence( $codepoint ) {
+ if ( $codepoint < 0x80 ) {
+ return chr( $codepoint );
+ }
+ if ( $codepoint < 0x800 ) {
+ return chr( $codepoint >> 6 & 0x3f | 0xc0 ) .
+ chr( $codepoint & 0x3f | 0x80 );
+ }
+ if ( $codepoint < 0x10000 ) {
+ return chr( $codepoint >> 12 & 0x0f | 0xe0 ) .
+ chr( $codepoint >> 6 & 0x3f | 0x80 ) .
+ chr( $codepoint & 0x3f | 0x80 );
+ }
+ if ( $codepoint < 0x100000 ) {
+ return chr( $codepoint >> 18 & 0x07 | 0xf0 ) . # Double-check this
+ chr( $codepoint >> 12 & 0x3f | 0x80 ) .
+ chr( $codepoint >> 6 & 0x3f | 0x80 ) .
+ chr( $codepoint & 0x3f | 0x80 );
+ }
+ # Doesn't yet handle outside the BMP
+ return "&#$codepoint;";
}
-
+$maintClass = 'ImportUseModWiki';
+require_once( RUN_MAINTENANCE_IF_MAIN );