summaryrefslogtreecommitdiff
path: root/extensions/SpamBlacklist/SpamRegexBatch.php
blob: b14d671e78391117c149582e8d99a5d555a75ba1 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
<?php

/**
 * Utility class for working with blacklists
 */
class SpamRegexBatch {
	/**
	 * Build a set of regular expressions from a list of regex fragments.
	 *
	 * Fragments are joined with "|" into batches, and every batch is wrapped
	 * in the prefix/suffix supplied by the blacklist. Fragments that end in a
	 * backslash are skipped. Returns an empty list if there is nothing to build.
	 *
	 * @param array $lines list of regex fragments (already stripped of
	 *                     comments and blanks, see stripLines())
	 * @param BaseBlacklist $blacklist supplies getRegexStart() and getRegexEnd()
	 * @param int $batchSize approximate limit, in bytes, on the combined length
	 *                       of the fragments in one batch (separators and the
	 *                       wrapper are not counted); if 0, every non-empty
	 *                       fragment gets a regex of its own
	 * @return array list of complete regular expressions
	 */
	public static function buildRegexes( $lines, BaseBlacklist $blacklist, $batchSize = 4096 ) {
		$regexes = array();
		$regexStart = $blacklist->getRegexStart();
		$regexEnd = $blacklist->getRegexEnd( $batchSize );

		// Alternation collected so far; false means "no batch open yet".
		$build = false;
		foreach ( $lines as $line ) {
			if ( self::endsWithBackslash( $line ) ) {
				// A final \ would silently break the batched regex by
				// swallowing whatever follows it. Skip it here to avoid
				// breaking the next line; getBadLines() still reports it,
				// so new ones get flagged when the list is edited.
				continue;
			}
			// FIXME: not a very robust size check (it ignores the "|"
			// separators and the wrapper), but it keeps batches bounded.
			if ( $build === false ) {
				$build = $line;
			} elseif ( strlen( $build ) + strlen( $line ) > $batchSize ) {
				// Adding this fragment would overflow: flush and start anew.
				$regexes[] = self::wrapBatch( $build, $regexStart, $regexEnd );
				$build = $line;
			} else {
				$build .= '|' . $line;
			}
		}
		if ( $build !== false ) {
			$regexes[] = self::wrapBatch( $build, $regexStart, $regexEnd );
		}
		return $regexes;
	}

	/**
	 * Turn one "|"-joined batch of fragments into a complete regex.
	 *
	 * Any run of backslashes directly in front of a slash is collapsed, then
	 * every slash is escaped exactly once, so "a/b" and "a\/b" both come out
	 * as "a\/b" (presumably because the wrapper uses "/" as its delimiter).
	 *
	 * @param string $build "|"-joined fragments
	 * @param string $regexStart prefix to put in front of the batch
	 * @param string $regexEnd suffix to put behind the batch
	 * @return string
	 */
	private static function wrapBatch( $build, $regexStart, $regexEnd ) {
		// Deliberately no /u modifier: the pattern is pure ASCII, so byte-wise
		// matching gives the same result on valid UTF-8, whereas /u makes
		// preg_replace() return null on invalid UTF-8, which would silently
		// turn the whole batch into an empty alternation.
		$unescaped = preg_replace( '|\\\*/|', '/', $build );
		return $regexStart . str_replace( '/', '\/', $unescaped ) . $regexEnd;
	}

	/**
	 * Check whether a fragment ends in a backslash.
	 *
	 * @param string $line
	 * @return bool
	 */
	private static function endsWithBackslash( $line ) {
		return substr( $line, -1 ) === '\\';
	}

	/**
	 * Confirm that a set of regexes is either empty or valid.
	 *
	 * Each regex is test-run against the empty string with warnings
	 * suppressed; preg_match() returning false marks it as invalid.
	 *
	 * @param array $regexes set of regexes
	 * @return bool true if ok, false if at least one regex is invalid
	 */
	public static function validateRegexes( $regexes ) {
		foreach ( $regexes as $regex ) {
			wfSuppressWarnings();
			$ok = preg_match( $regex, '' );
			wfRestoreWarnings();

			if ( $ok === false ) {
				return false;
			}
		}
		return true;
	}

	/**
	 * Strip comments and whitespace, then remove blanks.
	 *
	 * Everything from the first "#" to the end of a line is treated as a
	 * comment. Original array keys are preserved, so the result may have gaps.
	 *
	 * @param array $lines
	 * @return array the non-empty, trimmed lines
	 */
	public static function stripLines( $lines ) {
		$trimmed = array_map( 'trim', preg_replace( '/#.*$/', '', $lines ) );
		// Filter on length rather than truthiness, so that a line consisting
		// of a literal "0" is kept instead of being mistaken for a blank.
		return array_filter( $trimmed, 'strlen' );
	}

	/**
	 * Build regexes from raw input lines, with a sanity check on the result.
	 *
	 * The lines are first built into batched regexes. If any batch turns out
	 * to be invalid, everything is rebuilt with one regex per line; the
	 * regexes of the broken lines are still part of the returned list, but
	 * they no longer invalidate the lines they were batched with.
	 *
	 * @param array $lines unsanitized input lines
	 * @param BaseBlacklist $blacklist
	 * @param bool|string $fileName optional, for debug reporting
	 * @return array of regexes
	 */
	public static function buildSafeRegexes( $lines, BaseBlacklist $blacklist, $fileName = false ) {
		$lines = self::stripLines( $lines );
		$regexes = self::buildRegexes( $lines, $blacklist );
		if ( self::validateRegexes( $regexes ) ) {
			return $regexes;
		}

		// _Something_ broke... rebuild line-by-line; it'll be
		// slower if there's a lot of blacklist lines, but one
		// broken line won't take out hundreds of its brothers.
		if ( $fileName ) {
			wfDebugLog( 'SpamBlacklist', "Spam blacklist warning: bogus line in $fileName\n" );
		}
		return self::buildRegexes( $lines, $blacklist, 0 );
	}

	/**
	 * Returns an array of invalid lines.
	 *
	 * A line is bad if it ends in a backslash, or if the regex built from it
	 * alone fails validation. Lines are reported in their stripped form
	 * (see stripLines()).
	 *
	 * @param array $lines unsanitized input lines
	 * @param BaseBlacklist $blacklist
	 * @return array of lines which produce invalid regexes, or empty array if no problems
	 */
	public static function getBadLines( $lines, BaseBlacklist $blacklist ) {
		$lines = self::stripLines( $lines );

		$badLines = array();
		foreach ( $lines as $line ) {
			if ( self::endsWithBackslash( $line ) ) {
				// Final \ will break silently on the batched regexes.
				$badLines[] = $line;
			}
		}

		$regexes = self::buildRegexes( $lines, $blacklist );
		if ( self::validateRegexes( $regexes ) ) {
			// No other problems!
			return $badLines;
		}

		// Something failed in the batch, so check them one by one.
		// Lines ending in a backslash produce no regex at all here, so they
		// are not reported a second time.
		foreach ( $lines as $line ) {
			$regexes = self::buildRegexes( array( $line ), $blacklist );
			if ( !self::validateRegexes( $regexes ) ) {
				$badLines[] = $line;
			}
		}
		return $badLines;
	}

	/**
	 * Build a set of regular expressions from the given multiline input text,
	 * with empty lines and comments stripped.
	 *
	 * @param string $source text with one fragment per line
	 * @param BaseBlacklist $blacklist
	 * @param bool|string $fileName optional, for reporting of bad files
	 * @return array of regular expressions, potentially empty
	 */
	public static function regexesFromText( $source, BaseBlacklist $blacklist, $fileName = false ) {
		$lines = explode( "\n", $source );
		return self::buildSafeRegexes( $lines, $blacklist, $fileName );
	}

	/**
	 * Build a set of regular expressions from a MediaWiki message.
	 *
	 * The message is read in the content language; if it is reported as
	 * disabled, the result is an empty list.
	 *
	 * @param string $message message key
	 * @param BaseBlacklist $blacklist
	 * @return array of regular expressions, potentially empty
	 */
	public static function regexesFromMessage( $message, BaseBlacklist $blacklist ) {
		$source = wfMessage( $message )->inContentLanguage();
		if ( $source->isDisabled() ) {
			return array();
		}
		return self::regexesFromText( $source->plain(), $blacklist );
	}
}