summaryrefslogtreecommitdiff
path: root/includes/Sanitizer.php
diff options
context:
space:
mode:
Diffstat (limited to 'includes/Sanitizer.php')
-rw-r--r--includes/Sanitizer.php207
1 files changed, 156 insertions, 51 deletions
diff --git a/includes/Sanitizer.php b/includes/Sanitizer.php
index b443ce14..2dff081d 100644
--- a/includes/Sanitizer.php
+++ b/includes/Sanitizer.php
@@ -358,38 +358,48 @@ class Sanitizer {
* @private
* @param $text String
* @param $processCallback Callback to do any variable or parameter replacements in HTML attribute values
- * @param $args Array for the processing callback
- * @param $extratags Array for any extra tags to include
- * @param $removetags Array for any tags (default or extra) to exclude
+ * @param array $args for the processing callback
+ * @param array $extratags for any extra tags to include
+ * @param array $removetags for any tags (default or extra) to exclude
* @return string
*/
static function removeHTMLtags( $text, $processCallback = null, $args = array(), $extratags = array(), $removetags = array() ) {
- global $wgUseTidy;
+ global $wgUseTidy, $wgHtml5, $wgAllowMicrodataAttributes, $wgAllowImageTag;
static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
$htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic, $staticInitialised;
wfProfileIn( __METHOD__ );
- if ( !$staticInitialised ) {
+ // Base our staticInitialised variable off of the global config state so that if the globals
+ // are changed (like in the screwed up test system) we will re-initialise the settings.
+ $globalContext = implode( '-', compact( 'wgHtml5', 'wgAllowMicrodataAttributes', 'wgAllowImageTag' ) );
+ if ( !$staticInitialised || $staticInitialised != $globalContext ) {
$htmlpairsStatic = array( # Tags that must be closed
'b', 'bdi', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
'strike', 'strong', 'tt', 'var', 'div', 'center',
'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
- 'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'abbr', 'dfn',
+ 'ruby', 'rt', 'rb', 'rp', 'p', 'span', 'abbr', 'dfn',
'kbd', 'samp'
);
+ if ( $wgHtml5 ) {
+ $htmlpairsStatic = array_merge( $htmlpairsStatic, array( 'data', 'time', 'mark' ) );
+ }
$htmlsingle = array(
'br', 'hr', 'li', 'dt', 'dd'
);
$htmlsingleonly = array( # Elements that cannot have close tags
'br', 'hr'
);
+ if ( $wgHtml5 && $wgAllowMicrodataAttributes ) {
+ $htmlsingle[] = $htmlsingleonly[] = 'meta';
+ $htmlsingle[] = $htmlsingleonly[] = 'link';
+ }
$htmlnest = array( # Tags that can be nested--??
'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
- 'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
+ 'li', 'dl', 'dt', 'dd', 'font', 'big', 'small', 'sub', 'sup', 'span'
);
$tabletags = array( # Can only appear inside table, we will close them
'td', 'th', 'tr',
@@ -401,7 +411,6 @@ class Sanitizer {
'li',
);
- global $wgAllowImageTag;
if ( $wgAllowImageTag ) {
$htmlsingle[] = 'img';
$htmlsingleonly[] = 'img';
@@ -416,13 +425,13 @@ class Sanitizer {
foreach ( $vars as $var ) {
$$var = array_flip( $$var );
}
- $staticInitialised = true;
+ $staticInitialised = $globalContext;
}
# Populate $htmlpairs and $htmlelements with the $extratags and $removetags arrays
$extratags = array_flip( $extratags );
$removetags = array_flip( $removetags );
$htmlpairs = array_merge( $extratags, $htmlpairsStatic );
- $htmlelements = array_diff_key( array_merge( $extratags, $htmlelementsStatic ) , $removetags );
+ $htmlelements = array_diff_key( array_merge( $extratags, $htmlelementsStatic ), $removetags );
# Remove HTML comments
$text = Sanitizer::removeHTMLcomments( $text );
@@ -505,11 +514,15 @@ class Sanitizer {
isset( $htmlpairs[$t] ) ) {
$badtag = true;
} elseif ( isset( $htmlsingleonly[$t] ) ) {
- # Hack to force empty tag for uncloseable elements
+ # Hack to force empty tag for unclosable elements
$brace = '/>';
} elseif ( isset( $htmlsingle[$t] ) ) {
# Hack to not close $htmlsingle tags
$brace = null;
+ # Still need to push this optionally-closed tag to
+ # the tag stack so that we can match end tags
+ # instead of marking them as bad.
+ array_push( $tagstack, $t );
} elseif ( isset( $tabletags[$t] )
&& in_array( $t, $tagstack ) ) {
// New table tag but forgot to close the previous one
@@ -528,6 +541,10 @@ class Sanitizer {
call_user_func_array( $processCallback, array( &$params, $args ) );
}
+ if ( !Sanitizer::validateTag( $params, $t ) ) {
+ $badtag = true;
+ }
+
# Strip non-approved attributes from the tag
$newparams = Sanitizer::fixTagAttributes( $params, $t );
}
@@ -551,16 +568,24 @@ class Sanitizer {
preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
$x, $regs );
@list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
+ $badtag = false;
if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
if( is_callable( $processCallback ) ) {
call_user_func_array( $processCallback, array( &$params, $args ) );
}
+
+ if ( !Sanitizer::validateTag( $params, $t ) ) {
+ $badtag = true;
+ }
+
$newparams = Sanitizer::fixTagAttributes( $params, $t );
- $rest = str_replace( '>', '&gt;', $rest );
- $text .= "<$slash$t$newparams$brace$rest";
- } else {
- $text .= '&lt;' . str_replace( '>', '&gt;', $x);
+ if ( !$badtag ) {
+ $rest = str_replace( '>', '&gt;', $rest );
+ $text .= "<$slash$t$newparams$brace$rest";
+ continue;
+ }
}
+ $text .= '&lt;' . str_replace( '>', '&gt;', $x);
}
}
wfProfileOut( __METHOD__ );
@@ -579,9 +604,9 @@ class Sanitizer {
*/
static function removeHTMLcomments( $text ) {
wfProfileIn( __METHOD__ );
- while (($start = strpos($text, '<!--')) !== false) {
- $end = strpos($text, '-->', $start + 4);
- if ($end === false) {
+ while ( ($start = strpos( $text, '<!--' ) ) !== false ) {
+ $end = strpos( $text, '-->', $start + 4 );
+ if ( $end === false ) {
# Unterminated comment; bail out
break;
}
@@ -590,22 +615,22 @@ class Sanitizer {
# Trim space and newline if the comment is both
# preceded and followed by a newline
- $spaceStart = max($start - 1, 0);
+ $spaceStart = max( $start - 1, 0 );
$spaceLen = $end - $spaceStart;
- while (substr($text, $spaceStart, 1) === ' ' && $spaceStart > 0) {
+ while ( substr( $text, $spaceStart, 1 ) === ' ' && $spaceStart > 0 ) {
$spaceStart--;
$spaceLen++;
}
- while (substr($text, $spaceStart + $spaceLen, 1) === ' ')
+ while ( substr( $text, $spaceStart + $spaceLen, 1 ) === ' ' )
$spaceLen++;
- if (substr($text, $spaceStart, 1) === "\n" and substr($text, $spaceStart + $spaceLen, 1) === "\n") {
+ if ( substr( $text, $spaceStart, 1 ) === "\n" and substr( $text, $spaceStart + $spaceLen, 1 ) === "\n" ) {
# Remove the comment, leading and trailing
# spaces, and leave only one newline.
- $text = substr_replace($text, "\n", $spaceStart, $spaceLen + 1);
+ $text = substr_replace( $text, "\n", $spaceStart, $spaceLen + 1 );
}
else {
# Remove just the comment.
- $text = substr_replace($text, '', $start, $end - $start);
+ $text = substr_replace( $text, '', $start, $end - $start );
}
}
wfProfileOut( __METHOD__ );
@@ -613,12 +638,45 @@ class Sanitizer {
}
/**
+ * Takes attribute names and values for a tag and the tag name and
+ * validates that the tag is allowed to be present.
+ * This DOES NOT validate the attributes, nor does it validate the
+ * tags themselves. This method only handles the special circumstances
+ * where we may want to allow a tag within content but ONLY when it has
+ * specific attributes set.
+ *
+ * @param $params
+ * @param $element
+ * @return bool
+ */
+ static function validateTag( $params, $element ) {
+ $params = Sanitizer::decodeTagAttributes( $params );
+
+ if ( $element == 'meta' || $element == 'link' ) {
+ if ( !isset( $params['itemprop'] ) ) {
+ // <meta> and <link> must have an itemprop="" otherwise they are not valid or safe in content
+ return false;
+ }
+ if ( $element == 'meta' && !isset( $params['content'] ) ) {
+ // <meta> must have a content="" for the itemprop
+ return false;
+ }
+ if ( $element == 'link' && !isset( $params['href'] ) ) {
+ // <link> must have an associated href=""
+ return false;
+ }
+ }
+
+ return true;
+ }
+
+ /**
* Take an array of attribute names and values and normalize or discard
* illegal values for the given element type.
*
* - Discards attributes not on a whitelist for the given element
* - Unsafe style attributes are discarded
- * - Invalid id attributes are reencoded
+ * - Invalid id attributes are re-encoded
*
* @param $attribs Array
* @param $element String
@@ -638,10 +696,10 @@ class Sanitizer {
*
* - Discards attributes not the given whitelist
* - Unsafe style attributes are discarded
- * - Invalid id attributes are reencoded
+ * - Invalid id attributes are re-encoded
*
* @param $attribs Array
- * @param $whitelist Array: list of allowed attribute names
+ * @param array $whitelist list of allowed attribute names
* @return Array
*
* @todo Check for legal values where the DTD limits things.
@@ -679,6 +737,16 @@ class Sanitizer {
$value = Sanitizer::escapeId( $value, 'noninitial' );
}
+ # WAI-ARIA
+ # http://www.w3.org/TR/wai-aria/
+ # http://www.whatwg.org/specs/web-apps/current-work/multipage/elements.html#wai-aria
+ # For now we only support role="presentation" until we work out what roles should be
+ # usable by content and we ensure that our code explicitly rejects patterns that
+ # violate HTML5's ARIA restrictions.
+ if ( $attribute === 'role' && $value !== 'presentation' ) {
+ continue;
+ }
+
//RDFa and microdata properties allow URLs, URIs and/or CURIs. check them for sanity
if ( $attribute === 'rel' || $attribute === 'rev' ||
$attribute === 'about' || $attribute === 'property' || $attribute === 'resource' || #RDFa
@@ -713,7 +781,7 @@ class Sanitizer {
unset( $out['itemid'] );
unset( $out['itemref'] );
}
- # TODO: Strip itemprop if we aren't descendants of an itemscope.
+ # TODO: Strip itemprop if we aren't descendants of an itemscope or pointed to by an itemref.
}
return $out;
}
@@ -804,7 +872,7 @@ class Sanitizer {
// Reject problematic keywords and control characters
if ( preg_match( '/[\000-\010\016-\037\177]/', $value ) ) {
return '/* invalid control char */';
- } elseif ( preg_match( '! expression | filter\s*: | accelerator\s*: | url\s*\( !ix', $value ) ) {
+ } elseif ( preg_match( '! expression | filter\s*: | accelerator\s*: | url\s*\( | image\s*\( | image-set\s*\( !ix', $value ) ) {
return '/* insecure input */';
}
return $value;
@@ -945,7 +1013,7 @@ class Sanitizer {
* @see http://www.whatwg.org/specs/web-apps/current-work/multipage/elements.html#the-id-attribute
* HTML5 definition of id attribute
*
- * @param $id String: id to escape
+ * @param string $id id to escape
* @param $options Mixed: string or array of strings (default is array()):
* 'noninitial': This is a non-initial fragment of an id, not a full id,
* so don't pay attention if the first character isn't valid at the
@@ -982,7 +1050,7 @@ class Sanitizer {
$id = str_replace( array_keys( $replace ), array_values( $replace ), $id );
if ( !preg_match( '/^[a-zA-Z]/', $id )
- && !in_array( 'noninitial', $options ) ) {
+ && !in_array( 'noninitial', $options ) ) {
// Initial character must be a letter!
$id = "x$id";
}
@@ -1002,17 +1070,17 @@ class Sanitizer {
*/
static function escapeClass( $class ) {
// Convert ugly stuff to underscores and kill underscores in ugly places
- return rtrim(preg_replace(
- array('/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/','/_+/'),
+ return rtrim( preg_replace(
+ array( '/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/', '/_+/' ),
'_',
- $class ), '_');
+ $class ), '_' );
}
/**
- * Given HTML input, escape with htmlspecialchars but un-escape entites.
+ * Given HTML input, escape with htmlspecialchars but un-escape entities.
* This allows (generally harmless) entities like &#160; to survive.
*
- * @param $html String to escape
+ * @param string $html to escape
* @return String: escaped input
*/
static function escapeHtmlAllowEntities( $html ) {
@@ -1074,6 +1142,7 @@ class Sanitizer {
* attribs regex matches.
*
* @param $set Array
+ * @throws MWException
* @return String
*/
private static function getTagAttributeCallback( $set ) {
@@ -1169,7 +1238,7 @@ class Sanitizer {
$ret = Sanitizer::normalizeEntity( $matches[1] );
} elseif( $matches[2] != '' ) {
$ret = Sanitizer::decCharReference( $matches[2] );
- } elseif( $matches[3] != '' ) {
+ } elseif( $matches[3] != '' ) {
$ret = Sanitizer::hexCharReference( $matches[3] );
}
if( is_null( $ret ) ) {
@@ -1263,7 +1332,7 @@ class Sanitizer {
* This is useful for page titles, not for text to be displayed,
* MediaWiki allows HTML entities to escape normalization as a feature.
*
- * @param $text String (already normalized, containing entities)
+ * @param string $text (already normalized, containing entities)
* @return String (still normalized, without entities)
*/
public static function decodeCharReferencesAndNormalize( $text ) {
@@ -1289,7 +1358,7 @@ class Sanitizer {
return Sanitizer::decodeEntity( $matches[1] );
} elseif( $matches[2] != '' ) {
return Sanitizer::decodeChar( intval( $matches[2] ) );
- } elseif( $matches[3] != '' ) {
+ } elseif( $matches[3] != '' ) {
return Sanitizer::decodeChar( hexdec( $matches[3] ) );
}
# Last case should be an ampersand by itself
@@ -1337,10 +1406,7 @@ class Sanitizer {
* @return Array
*/
static function attributeWhitelist( $element ) {
- static $list;
- if( !isset( $list ) ) {
- $list = Sanitizer::setupAttributeWhitelist();
- }
+ $list = Sanitizer::setupAttributeWhitelist();
return isset( $list[$element] )
? $list[$element]
: array();
@@ -1354,7 +1420,25 @@ class Sanitizer {
static function setupAttributeWhitelist() {
global $wgAllowRdfaAttributes, $wgHtml5, $wgAllowMicrodataAttributes;
- $common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );
+ static $whitelist, $staticInitialised;
+ $globalContext = implode( '-', compact( 'wgAllowRdfaAttributes', 'wgHtml5', 'wgAllowMicrodataAttributes' ) );
+
+ if ( isset( $whitelist ) && $staticInitialised == $globalContext ) {
+ return $whitelist;
+ }
+
+ $common = array(
+ # HTML
+ 'id',
+ 'class',
+ 'style',
+ 'lang',
+ 'dir',
+ 'title',
+
+ # WAI-ARIA
+ 'role',
+ );
if ( $wgAllowRdfaAttributes ) {
#RDFa attributes as specified in section 9 of http://www.w3.org/TR/2008/REC-rdfa-syntax-20081014
@@ -1364,7 +1448,7 @@ class Sanitizer {
}
if ( $wgHtml5 && $wgAllowMicrodataAttributes ) {
- # add HTML5 microdata tages as pecified by http://www.whatwg.org/specs/web-apps/current-work/multipage/microdata.html#the-microdata-model
+ # add HTML5 microdata tags as specified by http://www.whatwg.org/specs/web-apps/current-work/multipage/microdata.html#the-microdata-model
$common = array_merge( $common, array(
'itemid', 'itemprop', 'itemref', 'itemscope', 'itemtype'
) );
@@ -1386,7 +1470,7 @@ class Sanitizer {
# Numbers refer to sections in HTML 4.01 standard describing the element.
# See: http://www.w3.org/TR/html4/
- $whitelist = array (
+ $whitelist = array(
# 7.5.4
'div' => $block,
'center' => $common, # deprecated
@@ -1518,7 +1602,28 @@ class Sanitizer {
# HTML 5 section 4.6
'bdi' => $common,
+ );
+
+ if ( $wgHtml5 ) {
+ # HTML5 elements, defined by:
+ # http://www.whatwg.org/specs/web-apps/current-work/multipage/
+ $whitelist += array(
+ 'data' => array_merge( $common, array( 'value' ) ),
+ 'time' => array_merge( $common, array( 'datetime' ) ),
+ 'mark' => $common,
+
+ // meta and link are only permitted by removeHTMLtags when Microdata
+ // is enabled so we don't bother adding a conditional to hide these
+ // Also meta and link are only valid in WikiText as Microdata elements
+ // (ie: validateTag rejects tags missing the attributes needed for Microdata)
+ // So we don't bother including $common attributes that have no purpose.
+ 'meta' => array( 'itemprop', 'content' ),
+ 'link' => array( 'itemprop', 'href' ),
);
+ }
+
+ $staticInitialised = $globalContext;
+
return $whitelist;
}
@@ -1529,7 +1634,7 @@ class Sanitizer {
* Warning: this return value must be further escaped for literal
* inclusion in HTML output as of 1.10!
*
- * @param $text String: HTML fragment
+ * @param string $text HTML fragment
* @return String
*/
static function stripAllTags( $text ) {
@@ -1641,7 +1746,7 @@ class Sanitizer {
*
* @since 1.18
*
- * @param $addr String E-mail address
+ * @param string $addr E-mail address
* @return Bool
*/
public static function validateEmail( $addr ) {
@@ -1653,8 +1758,8 @@ class Sanitizer {
// Please note strings below are enclosed in brackets [], this make the
// hyphen "-" a range indicator. Hence it is double backslashed below.
// See bug 26948
- $rfc5322_atext = "a-z0-9!#$%&'*+\\-\/=?^_`{|}~" ;
- $rfc1034_ldh_str = "a-z0-9\\-" ;
+ $rfc5322_atext = "a-z0-9!#$%&'*+\\-\/=?^_`{|}~";
+ $rfc1034_ldh_str = "a-z0-9\\-";
$HTML5_email_regexp = "/
^ # start of string
@@ -1663,7 +1768,7 @@ class Sanitizer {
[$rfc1034_ldh_str]+ # First domain part
(\\.[$rfc1034_ldh_str]+)* # Following part prefixed with a dot
$ # End of string
- /ix" ; // case Insensitive, eXtended
+ /ix"; // case Insensitive, eXtended
return (bool) preg_match( $HTML5_email_regexp, $addr );
}