path: root/extensions/SpamBlacklist/cleanup.php
blob: 5a0402650f7cf9bb9763e110c7078d02e524098d
<?php

/**
 * An aggressive spam cleanup script.
 * Searches the database for pages matching the spam blacklist and reverts each
 * one to the most recent revision that contains no blacklisted links.
 * If every revision contains spam, the page is blanked instead (outright
 * deletion is disabled as too destructive).
 */
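// Typical invocation, run from this extension's directory (assumes the standard
// maintenance environment; see ../../maintenance/commandLine.inc). The -n flag
// performs a dry run:
//   php cleanup.php       # revert or blank spammed pages
//   php cleanup.php -n    # only report pages that contain spam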

require_once( '../../maintenance/commandLine.inc' );
require_once( 'SpamBlacklist_body.php' );

/**
 * Find the latest revision of the article that does not contain spam and revert
 * to it; if every revision matches the blacklist, blank the page instead.
 */
function cleanupArticle( Revision $rev, $regexes, $match ) {
	$title = $rev->getTitle();
	$revId = $rev->getId();
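	// Walk backwards through the page history until we reach a revision that
	// matches none of the blacklist regexes, or run out of revisions entirely.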
	while ( $rev ) {
		$matches = false;
		foreach ( $regexes as $regex ) {
			$matches = $matches || preg_match( $regex, $rev->getText() );
		}
		if ( !$matches ) {
			// Didn't find any spam
			break;
		}
		# Revision::getPrevious can't be used in this way before MW 1.6 (Revision.php 1.26)
		#$rev = $rev->getPrevious();
		$revId = $title->getPreviousRevisionID( $revId );
		if ( $revId ) {
			$rev = Revision::newFromTitle( $title, $revId );
		} else {
			$rev = false;
		}
	}
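	// Apply the fix inside a transaction: either blank the page or restore the
	// text of the last clean revision found above.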
	$dbw = wfGetDB( DB_MASTER );
	$dbw->begin();
	if ( !$rev ) {
		// Didn't find a non-spammy revision, delete the page
		/*
		print "All revisions are spam, deleting...\n";
		$article = new Article( $title );
		$article->doDeleteArticle( "All revisions matched the spam blacklist" );
		*/
		// Too scary, blank instead
		print "All revisions are spam, blanking...\n";
		$text = '';
		$comment = "All revisions matched the spam blacklist ($match), blanking";
	} else {
		// Revert to this revision
		$text = $rev->getText();
		$comment = "Cleaning up links to $match";
	}
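	// Save the replacement text as an ordinary edit; it is attributed to the
	// global $wgUser, i.e. the 'Spam cleanup script' account set up below.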
	$wikiPage = new WikiPage( $title );
	$wikiPage->doEdit( $text, $comment );
	$dbw->commit();
}

//------------------------------------------------------------------------------

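// Attribute cleanup edits to a dedicated maintenance account. If the account
// does not exist yet, create it with an unusable password so it cannot be
// logged into.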
$username = 'Spam cleanup script';
$wgUser = User::newFromName( $username );
if ( $wgUser->idForName() == 0 ) {
	// Create the user
	$status = $wgUser->addToDatabase();
	if ( $status === null || $status->isOK() ) {
		$dbw = wfGetDB( DB_MASTER );
		$dbw->update( 'user', array( 'user_password' => 'nologin' ),
			array( 'user_name' => $username ), 'cleanup.php' );
	}
}

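// -n: dry run. Pages containing spam are reported but nothing is edited.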
if ( isset( $options['n'] ) ) {
	$dryRun = true;
} else {
	$dryRun = false;
}

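// Load the configured blacklist sources and compile them into regexes.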
$sb = new SpamBlacklist( $wgSpamBlacklistSettings );
if ( $wgSpamBlacklistFiles ) {
	$sb->files = $wgSpamBlacklistFiles;
}
$regexes = $sb->getBlacklists();
if ( !$regexes ) {
	print "Invalid regex, can't clean up spam\n";
	exit( 1 );
}

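// Read revisions from a replica connection; writes happen through DB_MASTER
// inside cleanupArticle().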
$dbr = wfGetDB( DB_SLAVE );
$maxID = $dbr->selectField( 'page', 'MAX(page_id)' );
$reportingInterval = 100;

print "Regexes are " . implode( ', ', array_map( 'strlen', $regexes ) ) . " bytes\n";
print "Searching for spam in $maxID pages...\n";
if ( $dryRun ) {
	print "Dry run only\n";
}

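// Scan every page ID and test the text of its latest revision against each
// blacklist regex.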
for ( $id = 1; $id <= $maxID; $id++ ) {
	if ( $id % $reportingInterval == 0 ) {
		printf( "%-8d  %-5.2f%%\r", $id, $id / $maxID * 100 );
	}
	$revision = Revision::loadFromPageId( $dbr, $id );
	if ( $revision ) {
		$text = $revision->getText();
		if ( $text ) {
			foreach ( $regexes as $regex ) {
				if ( preg_match( $regex, $text, $matches ) ) {
					$title = $revision->getTitle();
					$titleText = $title->getPrefixedText();
					if ( $dryRun ) {
						print "\nFound spam in [[$titleText]]\n";
					} else {
						print "\nCleaning up links to {$matches[0]} in [[$titleText]]\n";
						$match = str_replace( 'http://', '', $matches[0] );
						cleanupArticle( $revision, $regexes, $match );
					}
				}
			}
		}
	}
}
// Just for satisfaction
printf( "%-8d  %-5.2f%%\n", $id - 1, ( $id - 1 ) / $maxID * 100 );