summary refs log tree commit diff
path: root/extensions/SpamBlacklist/SpamRegexBatch.php
diff options
context:
space:
mode:
Diffstat (limited to 'extensions/SpamBlacklist/SpamRegexBatch.php')
-rw-r--r-- extensions/SpamBlacklist/SpamRegexBatch.php 175
1 files changed, 175 insertions, 0 deletions
diff --git a/extensions/SpamBlacklist/SpamRegexBatch.php b/extensions/SpamBlacklist/SpamRegexBatch.php
new file mode 100644
index 00000000..b14d671e
--- /dev/null
+++ b/extensions/SpamBlacklist/SpamRegexBatch.php
@@ -0,0 +1,175 @@
+<?php
+
+/**
+ * Utility class for working with blacklists
+ */
+class SpamRegexBatch {
+ /**
+  * Join URL-matching regex fragments into batched regular expressions.
+  * Fragments are glued together with '|' until adding another one would push
+  * the combined fragment length past $batchSize; every finished batch is then
+  * wrapped in the start and end strings supplied by the blacklist.
+  * An empty input list gives an empty result.
+  *
+  * @param array $lines list of fragments which will match in URLs
+  * @param BaseBlacklist $blacklist provides getRegexStart() and getRegexEnd()
+  * @param int $batchSize largest allowed batch regex;
+  *  if 0, will produce one regex per line
+  * @return array list of complete regex strings
+  */
+ static function buildRegexes( $lines, BaseBlacklist $blacklist, $batchSize=4096 ) {
+     $prefix = $blacklist->getRegexStart();
+     $suffix = $blacklist->getRegexEnd( $batchSize );
+
+     // First pass: group the usable fragments into '|'-joined batches.
+     $batches = array();
+     $current = false; // false means "no batch started yet"
+     foreach ( $lines as $fragment ) {
+         if ( substr( $fragment, -1, 1 ) == "\\" ) {
+             // A trailing backslash would escape whatever is appended after
+             // this fragment and silently break the batch, so leave it out.
+             // getBadLines() still reports such lines.
+             continue;
+         }
+         if ( $current === false ) {
+             $current = $fragment;
+             continue;
+         }
+         // Rough size check: only the fragment lengths are counted.
+         if ( strlen( $current ) + strlen( $fragment ) > $batchSize ) {
+             $batches[] = $current;
+             $current = $fragment;
+             continue;
+         }
+         $current .= '|' . $fragment;
+     }
+     if ( $current !== false ) {
+         $batches[] = $current;
+     }
+
+     // Second pass: turn every batch into a full regex. Any run of
+     // backslashes directly before a slash is collapsed to a bare slash,
+     // then each slash is escaped exactly once.
+     $regexes = array();
+     foreach ( $batches as $batch ) {
+         $slashesNormalized = preg_replace( '|\\\*/|u', '/', $batch );
+         $regexes[] = $prefix . str_replace( '/', '\/', $slashesNormalized ) . $suffix;
+     }
+     return $regexes;
+ }
+
+ /**
+  * Check that every regex in the set can be run by preg_match().
+  * An empty set counts as valid.
+  *
+  * @param array $regexes set of regexes
+  * @return bool true if ok, false if contains invalid lines
+  */
+ static function validateRegexes( $regexes ) {
+     foreach ( $regexes as $pattern ) {
+         // Run the pattern against an empty subject with warnings muted;
+         // preg_match() returns false when the pattern cannot be used.
+         wfSuppressWarnings();
+         $isBroken = ( preg_match( $pattern, '' ) === false );
+         wfRestoreWarnings();
+
+         if ( $isBroken ) {
+             return false;
+         }
+     }
+     return true;
+ }
+
+ /**
+  * Strip comments and surrounding whitespace, then remove blank lines.
+  *
+  * Everything from the first '#' to the end of a line is treated as a
+  * comment. Array keys of the surviving lines are preserved.
+  *
+  * @param array $lines raw input lines
+  * @return array cleaned, non-empty lines
+  */
+ static function stripLines( $lines ) {
+     $withoutComments = preg_replace( '/#.*$/', '', $lines );
+     $trimmed = array_map( 'trim', $withoutComments );
+     // Filter on length rather than truthiness: array_filter() without a
+     // callback would also discard a line consisting of just "0".
+     return array_filter( $trimmed, 'strlen' );
+ }
+
+ /**
+  * Build batched regexes from raw lines, falling back to one regex per line
+  * when the batched form does not validate.
+  *
+  * @param array $lines unsanitized input lines
+  * @param BaseBlacklist $blacklist
+  * @param bool|string $fileName optional for debug reporting
+  * @return array of regexes
+  */
+ static function buildSafeRegexes( $lines, BaseBlacklist $blacklist, $fileName=false ) {
+     $cleanLines = self::stripLines( $lines );
+     $batched = self::buildRegexes( $cleanLines, $blacklist );
+     if ( self::validateRegexes( $batched ) ) {
+         return $batched;
+     }
+
+     // Something in a batch is invalid. Rebuild with a batch size of 0 so
+     // each line becomes its own regex: slower with many lines, but a
+     // single bad line no longer breaks the lines batched alongside it.
+     if ( $fileName ) {
+         wfDebugLog( 'SpamBlacklist', "Spam blacklist warning: bogus line in $fileName\n" );
+     }
+     return self::buildRegexes( $cleanLines, $blacklist, 0 );
+ }
+
+ /**
+  * Collect the input lines that cannot be used as regex fragments.
+  *
+  * Lines ending in a backslash are always reported. The remaining lines are
+  * validated as one batch first; only if that fails is each line checked
+  * individually.
+  *
+  * @param array $lines
+  * @param BaseBlacklist $blacklist
+  * @return array of input lines which produce invalid input, or empty array if no problems
+  */
+ static function getBadLines( $lines, BaseBlacklist $blacklist ) {
+     $cleanLines = self::stripLines( $lines );
+
+     // Final \ will break silently on the batched regexes, so flag it here.
+     $rejected = array();
+     foreach ( $cleanLines as $candidate ) {
+         if ( substr( $candidate, -1, 1 ) == "\\" ) {
+             $rejected[] = $candidate;
+         }
+     }
+
+     $batched = self::buildRegexes( $cleanLines, $blacklist );
+     if ( self::validateRegexes( $batched ) ) {
+         // The batch is fine, so there is nothing more to report.
+         return $rejected;
+     }
+
+     // The batch failed; test every line on its own to find the culprits.
+     foreach ( $cleanLines as $candidate ) {
+         $single = self::buildRegexes( array( $candidate ), $blacklist );
+         if ( !self::validateRegexes( $single ) ) {
+             $rejected[] = $candidate;
+         }
+     }
+     return $rejected;
+ }
+
+ /**
+  * Split multiline text on "\n" and build regexes from the resulting lines;
+  * comments and empty lines are stripped by buildSafeRegexes().
+  *
+  * @param string $source
+  * @param BaseBlacklist $blacklist
+  * @param bool|string $fileName optional, for reporting of bad files
+  * @return array of regular expressions, potentially empty
+  */
+ static function regexesFromText( $source, BaseBlacklist $blacklist, $fileName=false ) {
+     return self::buildSafeRegexes( explode( "\n", $source ), $blacklist, $fileName );
+ }
+
+ /**
+  * Build a set of regular expressions from a MediaWiki message, read in the
+  * content language. A disabled message yields an empty array.
+  *
+  * @param string $message message key
+  * @param BaseBlacklist $blacklist
+  * @return array of regular expressions, potentially empty
+  */
+ static function regexesFromMessage( $message, BaseBlacklist $blacklist ) {
+     $msg = wfMessage( $message )->inContentLanguage();
+     if ( $msg->isDisabled() ) {
+         return array();
+     }
+     return self::regexesFromText( $msg->plain(), $blacklist );
+ }
+}