summary | refs | log | tree | commit | diff
path: root/includes/Sanitizer.php
diff options
context:
space:
mode:
Diffstat (limited to 'includes/Sanitizer.php')
-rw-r--r-- includes/Sanitizer.php | 111
1 files changed, 69 insertions, 42 deletions
diff --git a/includes/Sanitizer.php b/includes/Sanitizer.php
index 96193a74..de63af79 100644
--- a/includes/Sanitizer.php
+++ b/includes/Sanitizer.php
@@ -346,12 +346,9 @@ class Sanitizer {
($space*=$space*
(?:
# The attribute value: quoted or alone
- \"([^<\"]*)\"
- | '([^<']*)'
+ \"([^<\"]*)(?:\"|\$)
+ | '([^<']*)(?:'|\$)
| ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
- | (\#[0-9a-fA-F]+) # Technically wrong, but lots of
- # colors are specified like this.
- # We'll be normalizing it.
)
)?(?=$space|\$)/sx";
}
@@ -359,20 +356,13 @@ class Sanitizer {
}
/**
- * Cleans up HTML, removes dangerous tags and attributes, and
- * removes HTML comments
- * @param string $text
- * @param callable $processCallback Callback to do any variable or parameter
- * replacements in HTML attribute values
- * @param array|bool $args Arguments for the processing callback
+ * Return the various lists of recognized tags
* @param array $extratags For any extra tags to include
* @param array $removetags For any tags (default or extra) to exclude
- * @return string
+ * @return array
*/
- public static function removeHTMLtags( $text, $processCallback = null,
- $args = array(), $extratags = array(), $removetags = array()
- ) {
- global $wgUseTidy, $wgAllowMicrodataAttributes, $wgAllowImageTag;
+ public static function getRecognizedTagData( $extratags = array(), $removetags = array() ) {
+ global $wgAllowMicrodataAttributes, $wgAllowImageTag;
static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
$htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic, $staticInitialised;
@@ -381,7 +371,6 @@ class Sanitizer {
// are changed (like in the screwed up test system) we will re-initialise the settings.
$globalContext = implode( '-', compact( 'wgAllowMicrodataAttributes', 'wgAllowImageTag' ) );
if ( !$staticInitialised || $staticInitialised != $globalContext ) {
-
$htmlpairsStatic = array( # Tags that must be closed
'b', 'bdi', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
@@ -431,17 +420,47 @@ class Sanitizer {
}
$staticInitialised = $globalContext;
}
+
# Populate $htmlpairs and $htmlelements with the $extratags and $removetags arrays
$extratags = array_flip( $extratags );
$removetags = array_flip( $removetags );
$htmlpairs = array_merge( $extratags, $htmlpairsStatic );
$htmlelements = array_diff_key( array_merge( $extratags, $htmlelementsStatic ), $removetags );
+ return array(
+ 'htmlpairs' => $htmlpairs,
+ 'htmlsingle' => $htmlsingle,
+ 'htmlsingleonly' => $htmlsingleonly,
+ 'htmlnest' => $htmlnest,
+ 'tabletags' => $tabletags,
+ 'htmllist' => $htmllist,
+ 'listtags' => $listtags,
+ 'htmlsingleallowed' => $htmlsingleallowed,
+ 'htmlelements' => $htmlelements,
+ );
+ }
+
+ /**
+ * Cleans up HTML, removes dangerous tags and attributes, and
+ * removes HTML comments
+ * @param string $text
+ * @param callable $processCallback Callback to do any variable or parameter
+ * replacements in HTML attribute values
+ * @param array|bool $args Arguments for the processing callback
+ * @param array $extratags For any extra tags to include
+ * @param array $removetags For any tags (default or extra) to exclude
+ * @return string
+ */
+ public static function removeHTMLtags( $text, $processCallback = null,
+ $args = array(), $extratags = array(), $removetags = array()
+ ) {
+ extract( self::getRecognizedTagData( $extratags, $removetags ) );
+
# Remove HTML comments
$text = Sanitizer::removeHTMLcomments( $text );
$bits = explode( '<', $text );
$text = str_replace( '>', '&gt;', array_shift( $bits ) );
- if ( !$wgUseTidy ) {
+ if ( !MWTidy::isEnabled() ) {
$tagstack = $tablestack = array();
foreach ( $bits as $x ) {
$regs = array();
@@ -463,9 +482,9 @@ class Sanitizer {
$badtag = true;
} elseif ( $slash ) {
# Closing a tag... is it the one we just opened?
- wfSuppressWarnings();
+ MediaWiki\suppressWarnings();
$ot = array_pop( $tagstack );
- wfRestoreWarnings();
+ MediaWiki\restoreWarnings();
if ( $ot != $t ) {
if ( isset( $htmlsingleallowed[$ot] ) ) {
@@ -473,32 +492,32 @@ class Sanitizer {
# and see if we find a match below them
$optstack = array();
array_push( $optstack, $ot );
- wfSuppressWarnings();
+ MediaWiki\suppressWarnings();
$ot = array_pop( $tagstack );
- wfRestoreWarnings();
+ MediaWiki\restoreWarnings();
while ( $ot != $t && isset( $htmlsingleallowed[$ot] ) ) {
array_push( $optstack, $ot );
- wfSuppressWarnings();
+ MediaWiki\suppressWarnings();
$ot = array_pop( $tagstack );
- wfRestoreWarnings();
+ MediaWiki\restoreWarnings();
}
if ( $t != $ot ) {
# No match. Push the optional elements back again
$badtag = true;
- wfSuppressWarnings();
+ MediaWiki\suppressWarnings();
$ot = array_pop( $optstack );
- wfRestoreWarnings();
+ MediaWiki\restoreWarnings();
while ( $ot ) {
array_push( $tagstack, $ot );
- wfSuppressWarnings();
+ MediaWiki\suppressWarnings();
$ot = array_pop( $optstack );
- wfRestoreWarnings();
+ MediaWiki\restoreWarnings();
}
}
} else {
- wfSuppressWarnings();
+ MediaWiki\suppressWarnings();
array_push( $tagstack, $ot );
- wfRestoreWarnings();
+ MediaWiki\restoreWarnings();
# <li> can be nested in <ul> or <ol>, skip those cases:
if ( !isset( $htmllist[$ot] ) || !isset( $listtags[$t] ) ) {
@@ -729,7 +748,7 @@ class Sanitizer {
}
# Allow any attribute beginning with "data-"
- if ( !preg_match( '/^data-/i', $attribute ) && !isset( $whitelist[$attribute] ) ) {
+ if ( !preg_match( '/^data-(?!ooui)/i', $attribute ) && !isset( $whitelist[$attribute] ) ) {
continue;
}
@@ -942,7 +961,8 @@ class Sanitizer {
$value = self::normalizeCss( $value );
// Reject problematic keywords and control characters
- if ( preg_match( '/[\000-\010\013\016-\037\177]/', $value ) ) {
+ if ( preg_match( '/[\000-\010\013\016-\037\177]/', $value ) ||
+ strpos( $value, UtfNormal\Constants::UTF8_REPLACEMENT ) !== false ) {
return '/* invalid control char */';
} elseif ( preg_match(
'! expression
@@ -1239,10 +1259,7 @@ class Sanitizer {
* @return string
*/
private static function getTagAttributeCallback( $set ) {
- if ( isset( $set[6] ) ) {
- # Illegal #XXXXXX color with no quotes.
- return $set[6];
- } elseif ( isset( $set[5] ) ) {
+ if ( isset( $set[5] ) ) {
# No quotes.
return $set[5];
} elseif ( isset( $set[4] ) ) {
@@ -1252,9 +1269,10 @@ class Sanitizer {
# Double-quoted
return $set[3];
} elseif ( !isset( $set[2] ) ) {
- # In XHTML, attributes must have a value.
- # For 'reduced' form, return explicitly the attribute name here.
- return $set[1];
+ # In XHTML, attributes must have a value so return an empty string.
+ # See "Empty attribute syntax",
+ # http://www.w3.org/TR/html5/syntax.html#syntax-attribute-name
+ return "";
} else {
throw new MWException( "Tag conditions not met. This should never happen and is a bug." );
}
@@ -1374,15 +1392,19 @@ class Sanitizer {
}
/**
- * Returns true if a given Unicode codepoint is a valid character in XML.
+ * Returns true if a given Unicode codepoint is a valid character in
+ * both HTML5 and XML.
* @param int $codepoint
* @return bool
*/
private static function validateCodepoint( $codepoint ) {
+ # U+000C is valid in HTML5 but not allowed in XML.
+ # U+000D is valid in XML but not allowed in HTML5.
+ # U+007F - U+009F are disallowed in HTML5 (control characters).
return $codepoint == 0x09
|| $codepoint == 0x0a
- || $codepoint == 0x0d
- || ( $codepoint >= 0x20 && $codepoint <= 0xd7ff )
+ || ( $codepoint >= 0x20 && $codepoint <= 0x7e )
+ || ( $codepoint >= 0xa0 && $codepoint <= 0xd7ff )
|| ( $codepoint >= 0xe000 && $codepoint <= 0xfffd )
|| ( $codepoint >= 0x10000 && $codepoint <= 0x10ffff );
}
@@ -1784,6 +1806,11 @@ class Sanitizer {
$host = preg_replace( $strip, '', $host );
+ // IPv6 host names are bracketed with []. Url-decode these.
+ if ( substr_compare( "//%5B", $host, 0, 5 ) === 0 && preg_match( '!^//%5B([0-9A-Fa-f:.]+)%5D((:\d+)?)$!', $host, $matches ) ) {
+ $host = '//[' . $matches[1] . ']' . $matches[2];
+ }
+
// @todo FIXME: Validate hostnames here
return $protocol . $host . $rest;