* Based loosely on Magnus's code from 2001-2002 * * Updated limited version to get something working temporarily * 2003-10-09 * Be sure to run the link & index rebuilding scripts! * * Some more munging for charsets etc * 2003-11-28 * * Partial fix for pages starting with lowercase letters (??) * and CamelCase and /Subpage link conversion * 2004-11-17 * * Rewrite output to create Special:Export format for import * instead of raw SQL. Should be 'future-proof' against future * schema changes. * 2005-03-14 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License along * with this program; if not, write to the Free Software Foundation, Inc., * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
* http://www.gnu.org/copyleft/gpl.html
 *
 * @todo document
 * @file
 * @ingroup Maintenance
 */

require_once( "Maintenance.php" );

/**
 * Maintenance script that reads the page database of a UseMod wiki from disk
 * and writes every page, together with its kept history, to standard output
 * as XML.
 */
class ImportUseModWiki extends Maintenance {
	/** Source encoding name handed to iconv() */
	private $encoding = '';

	/** Path given on the command line; "/page" and "/keep" are read below it */
	private $rootDirectory = '';

	/**
	 * Field separators
	 * @var String
	 */
	private $FS1 = '';
	private $FS2 = '';
	private $FS3 = '';

	/**
	 * Name => user id map consulted by checkUserCache(). Nothing in this
	 * script fills it, so every lookup currently falls back to id 0.
	 * @var Array
	 */
	private $usercache = array();

	/**
	 * Queue of text fragments temporarily taken out by mungeFormat().
	 * @var Array
	 */
	private $nowiki = array();

	public function __construct() {
		parent::__construct();
		$this->mDescription = "Import pages from UseMod wikis";
		$this->addOption( 'encoding', 'Encoding of the imported text, default CP1252', false, true );
		/**
		 * If UseModWiki's New File System is used:
		 * $NewFS = 1; # 1 = new multibyte $FS, 0 = old $FS
		 * Use "\xb3"; for the Old File System
		 * Changed with UTF-8 UseModWiki
		 * http://www.usemod.com/cgi-bin/wiki.pl?SupportForUtf8
		 * http://www.usemod.com/cgi-bin/wiki.pl?WikiBugs/NewFieldSeparatorWronglyTreated
		 * http://www.meatballwiki.org/wiki/WikiEngine#Q_amp_A
		 */
		$this->addOption( 'separator', 'Field separator to use, default \x1E\xFF\xFE\x1E', false, true );
		$this->addArg( 'path', 'Path to your UseMod wiki' );
	}

	/**
	 * Entry point: read the options, then walk page/A ... page/Z and
	 * page/other below the root directory and print the resulting document.
	 */
	public function execute() {
		$this->rootDirectory = $this->getArg();
		$this->encoding = $this->getOption( 'encoding', 'CP1252' );
		$sep = $this->getOption( 'separator', "\x1E\xFF\xFE\x1E" );
		$this->FS1 = "{$sep}1";
		$this->FS2 = "{$sep}2";
		$this->FS3 = "{$sep}3";

		# NOTE(review): the XML markup of this heredoc was missing from the
		# copy of the file this was restored from; it has been rebuilt to
		# follow the MediaWiki Special:Export format. Confirm the namespace
		# and version against upstream before relying on it.
		echo <<<XML
<?xml version="1.0" encoding="UTF-8" ?>
<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.1/"
           xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
           xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.1/
                               http://www.mediawiki.org/xml/export-0.1.xsd"
           version="0.1"
           xml:lang="en">
<!-- generated by importUseModWiki.php -->

XML;

		$letters = array_merge( range( 'A', 'Z' ), array( 'other' ) );
		foreach ( $letters as $letter ) {
			$dir = "{$this->rootDirectory}/page/$letter";
			if ( is_dir( $dir ) ) {
				$this->importPageDirectory( $dir );
			}
		}

		# NOTE(review): closing tag rebuilt, see the note on the header above.
		echo <<<XML
</mediawiki>

XML;
	}

	/**
	 * Import every "*.db" file in a directory and recurse into its
	 * subdirectories.
	 *
	 * @param $dir String Directory to scan
	 * @param $prefix String Prepended to the page names found in $dir
	 */
	private function importPageDirectory( $dir, $prefix = "" ) {
		echo "\n\n";
		$handle = opendir( $dir );
		if ( $handle === false ) {
			# Unreadable directory: nothing to iterate over
			return;
		}
		# Compare against false explicitly; an entry named "0" is falsy and
		# would otherwise end the loop early.
		while ( ( $entry = readdir( $handle ) ) !== false ) {
			$m = array();
			if ( preg_match( '/^(.+)\.db$/', $entry, $m ) ) {
				echo $this->importPage( $prefix . $m[1] );
			} elseif ( is_dir( "$dir/$entry" ) ) {
				if ( $entry != '.' && $entry != '..' ) {
					$this->importPageDirectory( "$dir/$entry", "$entry/" );
				}
			} else {
				echo "\n";
			}
		}
		closedir( $handle );
	}

	/**
	 * Map a page title to its path relative to the page/ or keep/ directory:
	 * "<uppercased first letter>/<title>" when the title starts with an
	 * ASCII letter, "other/<title>" otherwise.
	 *
	 * @param $title String
	 * @return String
	 */
	private function useModFilename( $title ) {
		$first = substr( $title, 0, 1 );
		if ( preg_match( '/[A-Z]/i', $first ) ) {
			return strtoupper( $first ) . "/$title";
		}
		return "other/$title";
	}

	/**
	 * Load the current revision of a page from its .db file.
	 * Prints a message and terminates the script if the file does not exist.
	 *
	 * @param $title String
	 * @return Object with text, summary, minor, ts, username and host
	 */
	private function fetchPage( $title ) {
		$fname = $this->rootDirectory . "/page/" . $this->useModFilename( $title ) . ".db";
		if ( !file_exists( $fname ) ) {
			echo "Couldn't open file '$fname' for page '$title'.\n";
			die( -1 );
		}

		# The file is three nested key/value lists, one per separator level
		$page = $this->splitHash( $this->FS1, file_get_contents( $fname ) );
		$section = $this->splitHash( $this->FS2, $page["text_default"] );
		$text = $this->splitHash( $this->FS3, $section["data"] );

		return $this->array2object( array(
			"text" => $text["text"],
			"summary" => $text["summary"],
			"minor" => $text["minor"],
			"ts" => $section["ts"],
			"username" => $section["username"],
			"host" => $section["host"],
		) );
	}

	/**
	 * Load the old revisions of a page from its .kp file.
	 *
	 * @param $title String
	 * @return Array of revision objects; empty if there is no .kp file
	 */
	private function fetchKeptPages( $title ) {
		$fname = $this->rootDirectory . "/keep/" . $this->useModFilename( $title ) . ".kp";
		if ( !file_exists( $fname ) ) {
			return array();
		}

		$keptlist = explode( $this->FS1, file_get_contents( $fname ) );
		array_shift( $keptlist ); # Drop the junk at beginning of file

		$revisions = array();
		foreach ( $keptlist as $rev ) {
			$section = $this->splitHash( $this->FS2, $rev );
			$text = $this->splitHash( $this->FS3, $section["data"] );
			# Only keep records with text, a minor flag and a positive timestamp
			if ( $text["text"] && $text["minor"] != "" && ( $section["ts"] * 1 > 0 ) ) {
				$revisions[] = $this->array2object( array(
					"text" => $text["text"],
					"summary" => $text["summary"],
					"minor" => $text["minor"],
					"ts" => $section["ts"],
					"username" => $section["username"],
					"host" => $section["host"],
				) );
			} else {
				echo "\n";
			}
		}
		return $revisions;
	}

	/**
	 * Split "key SEP value SEP key SEP value ..." into an associative array.
	 * A trailing key without a value is dropped.
	 *
	 * @param $sep String
	 * @param $str String
	 * @return Array
	 */
	private function splitHash( $sep, $str ) {
		$parts = explode( $sep, $str );
		$total = count( $parts );
		$ret = array();
		for ( $i = 0; $i + 1 < $total; $i += 2 ) {
			$ret[$parts[$i]] = $parts[$i + 1];
		}
		return $ret;
	}

	/**
	 * Work out the contributor of a revision.
	 *
	 * @param $name String UseMod user name; may be empty
	 * @param $host String Host recorded for the edit, used when $name is empty
	 * @return Array ( user id, user name )
	 */
	private function checkUserCache( $name, $host ) {
		if ( !$name ) {
			return array( 0, $host );
		}
		# The cache is keyed by name, so test the key rather than searching
		# the values. Id 0 is used if we haven't imported user accounts.
		$userid = isset( $this->usercache[$name] ) ? $this->usercache[$name] : 0;
		$username = str_replace( '_', ' ', $name );
		return array( $userid, $username );
	}

	/**
	 * Build the XML for one page with all of its revisions.
	 *
	 * @param $title String UseMod page name
	 * @return String
	 */
	private function importPage( $title ) {
		echo "\n\n";
		$page = $this->fetchPage( $title );

		$newtitle = $this->xmlsafe( str_replace( '_', ' ', $this->recodeText( $title ) ) );

		$munged = $this->mungeFormat( $page->text );
		if ( $munged != $page->text ) {
			/**
			 * Save a *new* revision with the conversion, and put the
			 * previous last version into the history.
			 */
			$next = $this->array2object( array(
				'text' => $munged,
				'minor' => 1,
				'username' => 'Conversion script',
				'host' => '127.0.0.1',
				'ts' => time(),
				'summary' => 'link fix',
			) );
			$revisions = array( $page, $next );
		} else {
			/**
			 * Current revision:
			 */
			$revisions = array( $page );
		}

		# NOTE(review): the element markup in this method (<page>, <title>,
		# <revision> and its children, <minor/>, </page>) was missing from
		# the copy of the file this was restored from and has been rebuilt
		# to follow the Special:Export format; confirm against upstream.
		$xml = <<<XML
	<page>
		<title>$newtitle</title>

XML;

		# History
		$revisions = array_merge( $revisions, $this->fetchKeptPages( $title ) );

		foreach ( $revisions as $rev ) {
			$text = $this->xmlsafe( $this->recodeText( $rev->text ) );
			$minor = ( $rev->minor ? '<minor/>' : '' );
			list( /* $userid */ , $username ) = $this->checkUserCache( $rev->username, $rev->host );
			$username = $this->xmlsafe( $this->recodeText( $username ) );
			$timestamp = $this->xmlsafe( $this->timestamp2ISO8601( $rev->ts ) );
			$comment = $this->xmlsafe( $this->recodeText( $rev->summary ) );

			$xml .= <<<XML
		<revision>
			<timestamp>$timestamp</timestamp>
			<contributor><username>$username</username></contributor>
			$minor
			<comment>$comment</comment>
			<text>$text</text>
		</revision>

XML;
		}
		$xml .= "</page>\n\n";
		return $xml;
	}

	/**
	 * Normalise line endings, convert from the configured source encoding
	 * to UTF-8 and expand numeric character references.
	 *
	 * @param $string String
	 * @return String
	 */
	private function recodeText( $string ) {
		# For currently latin-1 wikis
		$string = str_replace( "\r\n", "\n", $string );
		$converted = @iconv( $this->encoding, "UTF-8", $string );
		if ( $converted !== false ) {
			$string = $converted;
		}
		# else: keep the unconverted text instead of replacing it with
		# false; every caller passes the result through xmlsafe() afterwards.
		$string = $this->mungeToUtf8( $string );

		# Any old &#1234; stuff
		return $string;
	}

	/**
	 * Replace decimal (&#1234;) and hexadecimal (&#x4d2;) character
	 * references with the corresponding UTF-8 byte sequence.
	 *
	 * @param $string String
	 * @return String
	 */
	private function mungeToUtf8( $string ) {
		$string = preg_replace_callback( '/&#([0-9]+);/',
			array( $this, 'decimalEntityCallback' ), $string );
		$string = preg_replace_callback( '/&#x([0-9a-f]+);/i',
			array( $this, 'hexEntityCallback' ), $string );
		# Should also do named entities here
		return $string;
	}

	/**
	 * preg_replace_callback() handler for decimal character references.
	 * Public only so that it can be used as a callback.
	 *
	 * @param $matches Array
	 * @return String
	 */
	public function decimalEntityCallback( $matches ) {
		$digits = ltrim( $matches[1], '0' );
		if ( strlen( $digits ) > 7 ) {
			# More than 7 significant digits is far beyond U+10FFFF and
			# may not fit in an integer: leave the reference untouched.
			return $matches[0];
		}
		# Parse as base 10 explicitly so leading zeros are not read as octal
		return wfUtf8Sequence( intval( $digits, 10 ) );
	}

	/**
	 * preg_replace_callback() handler for hexadecimal character references.
	 * Public only so that it can be used as a callback.
	 *
	 * @param $matches Array
	 * @return String
	 */
	public function hexEntityCallback( $matches ) {
		$digits = ltrim( $matches[1], '0' );
		if ( strlen( $digits ) > 6 ) {
			# More than 6 significant hex digits is beyond U+10FFFF
			return $matches[0];
		}
		return wfUtf8Sequence( hexdec( $digits ) );
	}

	/**
	 * Format a Unix timestamp in UTC, e.g. 2003-08-05T18:30:02Z
	 *
	 * @param $ts Integer or numeric string
	 * @return String
	 */
	private function timestamp2ISO8601( $ts ) {
		return gmdate( 'Y-m-d\TH:i:s\Z', (int)$ts );
	}

	/**
	 * The page may contain old data which has not been properly normalized.
	 * Invalid UTF-8 sequences or forbidden control characters will make our
	 * XML output invalid, so be sure to strip them out.
	 *
	 * @param String $string Text to clean up
	 * @return String
	 */
	private function xmlsafe( $string ) {
		return htmlspecialchars( UtfNormal::cleanUp( $string ) );
	}

	/**
	 * Recode and escape a text, then replace each "--" with "\-\-".
	 *
	 * @param $text String
	 * @return String
	 */
	private function xmlCommentSafe( $text ) {
		return str_replace( '--', '\\-\\-', $this->xmlsafe( $this->recodeText( $text ) ) );
	}

	/**
	 * Turn an associative array into an object with one property per key.
	 *
	 * @param $arr Array
	 * @return Object
	 */
	private function array2object( $arr ) {
		return (object)$arr;
	}

	/**
	 * Make CamelCase and /Talk links work
	 *
	 * @param $text String UseMod wiki text
	 * @return String Text with bare CamelCase and /Subpage words wrapped
	 *   in [[ ]]
	 */
	private function mungeFormat( $text ) {
		# Take nowiki sections, URLs and existing [[links]] out of the text
		# so that the link conversion below cannot touch them.
		# NOTE(review): the leading "<nowiki>" of the first alternative was
		# missing from the copy of the file this was restored from; without
		# it the pattern matches from the start of the text up to the first
		# "</nowiki>". Confirm against upstream.
		$this->nowiki = array();
		$staged = preg_replace_callback(
			'/(<nowiki>.*?<\\/nowiki>|(?:http|https|ftp):\\S+|\[\[[^]\\n]+]])/s',
			array( $this, 'nowikiPlaceholder' ), $text );

		# This is probably not 100% correct, I'm just
		# glancing at the UseModWiki code.
		$upper = "[A-Z]";
		$lower = "[a-z_0-9]";
		$any = "[A-Za-z_0-9]";
		$camel = "(?:$upper+$lower+$upper+$any*)";
		$subpage = "(?:\\/$any+)";
		$substart = "(?:\\/$upper$any*)";

		$munged = preg_replace( "/(?!\\[\\[)($camel$subpage*|$substart$subpage*)\\b(?!\\]\\]|>)/",
			'[[$1]]', $staged );

		# Put the protected fragments back, in the order they were removed.
		# This needs preg_replace_callback(): preg_replace() does not accept
		# a callback as its replacement argument.
		$final = preg_replace_callback( '/' . preg_quote( $this->placeholder(), '/' ) . '/s',
			array( $this, 'nowikiShift' ), $munged );
		return $final;
	}

	/**
	 * Marker text substituted for protected fragments. The string is
	 * single-quoted, so it contains the literal characters "\xff" rather
	 * than 0xFF bytes.
	 *
	 * @param $x Unused
	 * @return String
	 */
	private function placeholder( $x = null ) {
		return '\xffplaceholder\xff';
	}

	/**
	 * preg_replace_callback() handler: remember the matched fragment and
	 * return the placeholder that takes its place.
	 *
	 * @param $matches Array
	 * @return String
	 */
	public function nowikiPlaceholder( $matches ) {
		$this->nowiki[] = $matches[1];
		return $this->placeholder();
	}

	/**
	 * preg_replace_callback() handler: return the oldest remembered fragment.
	 *
	 * @param $matches Array Unused; supplied by preg_replace_callback()
	 * @return String
	 */
	public function nowikiShift( $matches = null ) {
		return array_shift( $this->nowiki );
	}
}

/**
 * Encode a Unicode code point as a UTF-8 byte sequence.
 *
 * @param $codepoint Integer
 * @return String The encoded character, or the decimal character reference
 *   "&#N;" if the value is above U+10FFFF
 */
function wfUtf8Sequence( $codepoint ) {
	if ( $codepoint < 0x80 ) {
		return chr( $codepoint );
	}
	if ( $codepoint < 0x800 ) {
		return chr( ( $codepoint >> 6 & 0x3f ) | 0xc0 ) .
			chr( ( $codepoint & 0x3f ) | 0x80 );
	}
	if ( $codepoint < 0x10000 ) {
		return chr( ( $codepoint >> 12 & 0x0f ) | 0xe0 ) .
			chr( ( $codepoint >> 6 & 0x3f ) | 0x80 ) .
			chr( ( $codepoint & 0x3f ) | 0x80 );
	}
	# Four-byte sequences cover U+10000 up to and including U+10FFFF
	if ( $codepoint < 0x110000 ) {
		return chr( ( $codepoint >> 18 & 0x07 ) | 0xf0 ) .
			chr( ( $codepoint >> 12 & 0x3f ) | 0x80 ) .
			chr( ( $codepoint >> 6 & 0x3f ) | 0x80 ) .
			chr( ( $codepoint & 0x3f ) | 0x80 );
	}
	# Not a valid Unicode code point: keep it as a character reference
	return "&#$codepoint;";
}

$maintClass = 'ImportUseModWiki';
require_once( RUN_MAINTENANCE_IF_MAIN );