summaryrefslogtreecommitdiff
path: root/extensions/SpamBlacklist/cleanup.php
diff options
context:
space:
mode:
authorPierre Schmitz <pierre@archlinux.de>2013-08-12 09:28:15 +0200
committerPierre Schmitz <pierre@archlinux.de>2013-08-12 09:28:15 +0200
commit08aa4418c30cfc18ccc69a0f0f9cb9e17be6c196 (patch)
tree577a29fb579188d16003a209ce2a2e9c5b0aa2bd /extensions/SpamBlacklist/cleanup.php
parentcacc939b34e315b85e2d72997811eb6677996cc1 (diff)
Update to MediaWiki 1.21.1
Diffstat (limited to 'extensions/SpamBlacklist/cleanup.php')
-rw-r--r--extensions/SpamBlacklist/cleanup.php125
1 file changed, 125 insertions, 0 deletions
diff --git a/extensions/SpamBlacklist/cleanup.php b/extensions/SpamBlacklist/cleanup.php
new file mode 100644
index 00000000..5a040265
--- /dev/null
+++ b/extensions/SpamBlacklist/cleanup.php
@@ -0,0 +1,125 @@
+<?php
+
+/**
+ * An aggressive spam cleanup script.
+ * Searches the database for matching pages, and reverts them to the last non-spammed revision.
+ * If all revisions contain spam, deletes the page
+ */
+
+require_once( '../../maintenance/commandLine.inc' );
+require_once( 'SpamBlacklist_body.php' );
+
/**
 * Revert a page to its most recent revision that matches none of the
 * blacklist regexes. If every revision in the history matches, the page
 * is blanked instead of deleted.
 *
 * @param Revision $rev Head revision to start scanning back from
 * @param array $regexes Blacklist regexes to test each revision against
 * @param string $match Matched spam fragment, used in the edit summary
 */
function cleanupArticle( Revision $rev, $regexes, $match ) {
	$title = $rev->getTitle();
	$revId = $rev->getId();

	// Walk backwards through the page history until a clean revision turns up
	while ( $rev ) {
		$isSpam = false;
		foreach ( $regexes as $regex ) {
			if ( preg_match( $regex, $rev->getText() ) ) {
				$isSpam = true;
				break;
			}
		}
		if ( !$isSpam ) {
			// This revision is clean; revert to it below
			break;
		}
		# Revision::getPrevious can't be used in this way before MW 1.6 (Revision.php 1.26)
		$revId = $title->getPreviousRevisionID( $revId );
		$rev = $revId ? Revision::newFromTitle( $title, $revId ) : false;
	}

	$dbw = wfGetDB( DB_MASTER );
	$dbw->begin();
	if ( $rev ) {
		// Revert to the last revision that carried no blacklisted link
		$text = $rev->getText();
		$comment = "Cleaning up links to $match";
	} else {
		// No clean revision exists. Deleting the page outright was judged
		// too scary, so blank it instead.
		print "All revisions are spam, blanking...\n";
		$text = '';
		$comment = "All revisions matched the spam blacklist ($match), blanking";
	}
	$wikiPage = new WikiPage( $title );
	$wikiPage->doEdit( $text, $comment );
	$dbw->commit();
}
+
+//------------------------------------------------------------------------------
+
// Ensure the maintenance account exists so the cleanup edits can be
// attributed to it, creating it on first run.
$username = 'Spam cleanup script';
$wgUser = User::newFromName( $username );
if ( $wgUser->idForName() == 0 ) {
	// Create the user
	$status = $wgUser->addToDatabase();
	if ( $status === null || $status->isOK() ) {
		$dbw = wfGetDB( DB_MASTER );
		// Give the account an invalid password hash so nobody can log in as it.
		// Bug fix: the fourth argument of Database::update() is the caller
		// name used for query profiling, not a condition — the original
		// mistakenly passed $username there.
		$dbw->update( 'user', array( 'user_password' => 'nologin' ),
			array( 'user_name' => $username ), 'SpamBlacklist/cleanup.php' );
	}
}
+
// The -n command-line option requests a dry run: report matches only,
// make no edits.
$dryRun = isset( $options['n'] );
+
// Build the blacklist matcher from the configured settings and files,
// then compile the combined regexes; bail out if compilation fails.
$sb = new SpamBlacklist( $wgSpamBlacklistSettings );
if ( $wgSpamBlacklistFiles ) {
	$sb->files = $wgSpamBlacklistFiles;
}
$regexes = $sb->getBlacklists();
if ( !$regexes ) {
	print "Invalid regex, can't clean up spam\n";
	exit( 1 );
}

$dbr = wfGetDB( DB_SLAVE );
// Highest page_id; page IDs are scanned 1..$maxID (gaps tolerated below)
$maxID = $dbr->selectField( 'page', 'MAX(page_id)' );
$reportingInterval = 100;

// Bug fix: count() on a string always returns 1, so the original printed
// "1, 1, ..." instead of sizes; strlen() gives the intended per-regex
// byte length.
print "Regexes are " . implode( ', ', array_map( 'strlen', $regexes ) ) . " bytes\n";
print "Searching for spam in $maxID pages...\n";
if ( $dryRun ) {
	print "Dry run only\n";
}
+
// Scan every page ID up to the current maximum, testing each page's
// latest revision text against the blacklist and cleaning up matches.
for ( $id = 1; $id <= $maxID; $id++ ) {
	if ( $id % $reportingInterval == 0 ) {
		// In-place progress indicator (overwritten via \r)
		printf( "%-8d %-5.2f%%\r", $id, $id / $maxID * 100 );
	}
	// May be null for deleted/missing page IDs; just skip those
	$revision = Revision::loadFromPageId( $dbr, $id );
	if ( $revision ) {
		$text = $revision->getText();
		if ( $text ) {
			foreach ( $regexes as $regex ) {
				if ( preg_match( $regex, $text, $matches ) ) {
					$title = $revision->getTitle();
					$titleText = $title->getPrefixedText();
					if ( $dryRun ) {
						print "\nFound spam in [[$titleText]]\n";
					} else {
						print "\nCleaning up links to {$matches[0]} in [[$titleText]]\n";
						$match = str_replace( 'http://', '', $matches[0] );
						cleanupArticle( $revision, $regexes, $match );
					}
					// Bug fix: stop after the first matching regex.
					// cleanupArticle() already reverts past ALL regexes, so
					// continuing the loop on the now-stale $text would only
					// repeat the same revert (or duplicate the dry-run report).
					break;
				}
			}
		}
	}
}
// Just for satisfaction
printf( "%-8d %-5.2f%%\n", $id - 1, ( $id - 1 ) / $maxID * 100 );
+