From 4ac9fa081a7c045f6a9f1cfc529d82423f485b2e Mon Sep 17 00:00:00 2001 From: Pierre Schmitz Date: Sun, 8 Dec 2013 09:55:49 +0100 Subject: Update to MediaWiki 1.22.0 --- includes/Sanitizer.php | 361 ++++++++++++++++++++++++------------------------- 1 file changed, 177 insertions(+), 184 deletions(-) (limited to 'includes/Sanitizer.php') diff --git a/includes/Sanitizer.php b/includes/Sanitizer.php index 849e4d66..499d8218 100644 --- a/includes/Sanitizer.php +++ b/includes/Sanitizer.php @@ -1,6 +1,6 @@ et al * http://www.mediawiki.org/ @@ -25,7 +25,7 @@ */ /** - * XHTML sanitizer for MediaWiki + * HTML sanitizer for MediaWiki * @ingroup Parser */ class Sanitizer { @@ -54,9 +54,8 @@ class Sanitizer { * List of all named character entities defined in HTML 4.01 * http://www.w3.org/TR/html4/sgml/entities.html * As well as ' which is only defined starting in XHTML1. - * @private */ - static $htmlEntities = array( + private static $htmlEntities = array( 'Aacute' => 193, 'aacute' => 225, 'Acirc' => 194, @@ -315,7 +314,7 @@ class Sanitizer { /** * Character entity aliases accepted by MediaWiki */ - static $htmlEntityAliases = array( + private static $htmlEntityAliases = array( 'רלמ' => 'rlm', 'رلم' => 'rlm', ); @@ -323,7 +322,7 @@ class Sanitizer { /** * Lazy-initialised attributes regex, see getAttribsRegex() */ - static $attribsRegex; + private static $attribsRegex; /** * Regular expression to match HTML/XML attribute pairs within a tag. @@ -357,14 +356,17 @@ class Sanitizer { * removes HTML comments * @private * @param $text String - * @param $processCallback Callback to do any variable or parameter replacements in HTML attribute values + * @param $processCallback Callback to do any variable or parameter + * replacements in HTML attribute values * @param array $args for the processing callback * @param array $extratags for any extra tags to include * @param array $removetags for any tags (default or extra) to exclude * @return string */ - static function removeHTMLtags( $text, $processCallback = null, $args = array(), $extratags = array(), $removetags = array() ) { - global $wgUseTidy, $wgHtml5, $wgAllowMicrodataAttributes, $wgAllowImageTag; + static function removeHTMLtags( $text, $processCallback = null, + $args = array(), $extratags = array(), $removetags = array() + ) { + global $wgUseTidy, $wgAllowMicrodataAttributes, $wgAllowImageTag; static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags, $htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic, $staticInitialised; @@ -373,7 +375,7 @@ class Sanitizer { // Base our staticInitialised variable off of the global config state so that if the globals // are changed (like in the screwed up test system) we will re-initialise the settings. - $globalContext = implode( '-', compact( 'wgHtml5', 'wgAllowMicrodataAttributes', 'wgAllowImageTag' ) ); + $globalContext = implode( '-', compact( 'wgAllowMicrodataAttributes', 'wgAllowImageTag' ) ); if ( !$staticInitialised || $staticInitialised != $globalContext ) { $htmlpairsStatic = array( # Tags that must be closed @@ -382,30 +384,28 @@ class Sanitizer { 'strike', 'strong', 'tt', 'var', 'div', 'center', 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre', 'ruby', 'rt', 'rb', 'rp', 'p', 'span', 'abbr', 'dfn', - 'kbd', 'samp' + 'kbd', 'samp', 'data', 'time', 'mark' ); - if ( $wgHtml5 ) { - $htmlpairsStatic = array_merge( $htmlpairsStatic, array( 'data', 'time', 'mark' ) ); - } $htmlsingle = array( - 'br', 'hr', 'li', 'dt', 'dd' + 'br', 'wbr', 'hr', 'li', 'dt', 'dd' ); $htmlsingleonly = array( # Elements that cannot have close tags - 'br', 'hr' + 'br', 'wbr', 'hr' ); - if ( $wgHtml5 && $wgAllowMicrodataAttributes ) { + if ( $wgAllowMicrodataAttributes ) { $htmlsingle[] = $htmlsingleonly[] = 'meta'; $htmlsingle[] = $htmlsingleonly[] = 'link'; } $htmlnest = array( # Tags that can be nested--?? 'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul', - 'li', 'dl', 'dt', 'dd', 'font', 'big', 'small', 'sub', 'sup', 'span' + 'li', 'dl', 'dt', 'dd', 'font', 'big', 'small', 'sub', 'sup', 'span', + 'var', 'kbd', 'samp', 'em', 'strong', 'q', 'ruby', 'bdo' ); $tabletags = array( # Can only appear inside table, we will close them 'td', 'th', 'tr', ); $htmllist = array( # Tags used by list - 'ul','ol', + 'ul', 'ol', ); $listtags = array( # Tags that can appear in a list 'li', @@ -446,7 +446,7 @@ class Sanitizer { # $params: String between element name and > # $brace: Ending '>' or '/>' # $rest: Everything until the next element of $bits - if( preg_match( '!^(/?)(\\w+)([^>]*?)(/{0,1}>)([^<]*)$!', $x, $regs ) ) { + if ( preg_match( '!^(/?)([^\\s/>]+)([^>]*?)(/{0,1}>)([^<]*)$!', $x, $regs ) ) { list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs; } else { $slash = $t = $params = $brace = $rest = null; @@ -507,7 +507,7 @@ class Sanitizer { !in_array( 'table', $tagstack ) ) { $badtag = true; } elseif ( in_array( $t, $tagstack ) && - !isset( $htmlnest [$t ] ) ) { + !isset( $htmlnest[$t] ) ) { $badtag = true; # Is it a self closed htmlpair ? (bug 5487) } elseif ( $brace == '/>' && @@ -537,7 +537,7 @@ class Sanitizer { # Replace any variables or template parameters with # plaintext results. - if( is_callable( $processCallback ) ) { + if ( is_callable( $processCallback ) ) { call_user_func_array( $processCallback, array( &$params, $args ) ); } @@ -555,12 +555,14 @@ class Sanitizer { continue; } } - $text .= '<' . str_replace( '>', '>', $x); + $text .= '<' . str_replace( '>', '>', $x ); } # Close off any remaining tags - while ( is_array( $tagstack ) && ($t = array_pop( $tagstack )) ) { + while ( is_array( $tagstack ) && ( $t = array_pop( $tagstack ) ) ) { $text .= "\n"; - if ( $t == 'table' ) { $tagstack = array_pop( $tablestack ); } + if ( $t == 'table' ) { + $tagstack = array_pop( $tablestack ); + } } } else { # this might be possible using tidy itself @@ -570,7 +572,7 @@ class Sanitizer { @list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs; $badtag = false; if ( isset( $htmlelements[$t = strtolower( $t )] ) ) { - if( is_callable( $processCallback ) ) { + if ( is_callable( $processCallback ) ) { call_user_func_array( $processCallback, array( &$params, $args ) ); } @@ -585,7 +587,7 @@ class Sanitizer { continue; } } - $text .= '<' . str_replace( '>', '>', $x); + $text .= '<' . str_replace( '>', '>', $x ); } } wfProfileOut( __METHOD__ ); @@ -604,7 +606,7 @@ class Sanitizer { */ static function removeHTMLcomments( $text ) { wfProfileIn( __METHOD__ ); - while ( ($start = strpos( $text, '', $start + 4 ); if ( $end === false ) { # Unterminated comment; bail out @@ -621,9 +623,11 @@ class Sanitizer { $spaceStart--; $spaceLen++; } - while ( substr( $text, $spaceStart + $spaceLen, 1 ) === ' ' ) + while ( substr( $text, $spaceStart + $spaceLen, 1 ) === ' ' ) { $spaceLen++; - if ( substr( $text, $spaceStart, 1 ) === "\n" and substr( $text, $spaceStart + $spaceLen, 1 ) === "\n" ) { + } + if ( substr( $text, $spaceStart, 1 ) === "\n" + && substr( $text, $spaceStart + $spaceLen, 1 ) === "\n" ) { # Remove the comment, leading and trailing # spaces, and leave only one newline. $text = substr_replace( $text, "\n", $spaceStart, $spaceLen + 1 ); @@ -706,13 +710,13 @@ class Sanitizer { * @todo Check for unique id attribute :P */ static function validateAttributes( $attribs, $whitelist ) { - global $wgAllowRdfaAttributes, $wgAllowMicrodataAttributes, $wgHtml5; + global $wgAllowRdfaAttributes, $wgAllowMicrodataAttributes; $whitelist = array_flip( $whitelist ); $hrefExp = '/^(' . wfUrlProtocols() . ')[^\s]+$/'; $out = array(); - foreach( $attribs as $attribute => $value ) { + foreach ( $attribs as $attribute => $value ) { #allow XML namespace declaration if RDFa is enabled if ( $wgAllowRdfaAttributes && preg_match( self::XMLNS_ATTRIBUTE_PATTERN, $attribute ) ) { if ( !preg_match( self::EVIL_URI_PATTERN, $value ) ) { @@ -722,14 +726,14 @@ class Sanitizer { continue; } - # Allow any attribute beginning with "data-", if in HTML5 mode - if ( !($wgHtml5 && preg_match( '/^data-/i', $attribute )) && !isset( $whitelist[$attribute] ) ) { + # Allow any attribute beginning with "data-" + if ( !preg_match( '/^data-/i', $attribute ) && !isset( $whitelist[$attribute] ) ) { continue; } # Strip javascript "expression" from stylesheets. # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp - if( $attribute == 'style' ) { + if ( $attribute == 'style' ) { $value = Sanitizer::checkCss( $value ); } @@ -739,7 +743,7 @@ class Sanitizer { # WAI-ARIA # http://www.w3.org/TR/wai-aria/ - # http://www.whatwg.org/specs/web-apps/current-work/multipage/elements.html#wai-aria + # http://www.whatwg.org/html/elements.html#wai-aria # For now we only support role="presentation" until we work out what roles should be # usable by content and we ensure that our code explicitly rejects patterns that # violate HTML5's ARIA restrictions. @@ -747,13 +751,18 @@ class Sanitizer { continue; } - //RDFa and microdata properties allow URLs, URIs and/or CURIs. check them for sanity - if ( $attribute === 'rel' || $attribute === 'rev' || - $attribute === 'about' || $attribute === 'property' || $attribute === 'resource' || #RDFa - $attribute === 'datatype' || $attribute === 'typeof' || #RDFa - $attribute === 'itemid' || $attribute === 'itemprop' || $attribute === 'itemref' || #HTML5 microdata - $attribute === 'itemscope' || $attribute === 'itemtype' ) { #HTML5 microdata - + // RDFa and microdata properties allow URLs, URIs and/or CURIs. + // Check them for sanity. + if ( $attribute === 'rel' || $attribute === 'rev' + # RDFa + || $attribute === 'about' || $attribute === 'property' + || $attribute === 'resource' || $attribute === 'datatype' + || $attribute === 'typeof' + # HTML5 microdata + || $attribute === 'itemid' || $attribute === 'itemprop' + || $attribute === 'itemref' || $attribute === 'itemscope' + || $attribute === 'itemtype' + ) { //Paranoia. Allow "simple" values but suppress javascript if ( preg_match( self::EVIL_URI_PATTERN, $value ) ) { continue; @@ -765,7 +774,7 @@ class Sanitizer { if ( $attribute === 'href' || $attribute === 'src' ) { if ( !preg_match( $hrefExp, $value ) ) { continue; //drop any href or src attributes not using an allowed protocol. - //NOTE: this also drops all relative URLs + // NOTE: this also drops all relative URLs } } @@ -798,9 +807,10 @@ class Sanitizer { */ static function mergeAttributes( $a, $b ) { $out = array_merge( $a, $b ); - if( isset( $a['class'] ) && isset( $b['class'] ) - && is_string( $a['class'] ) && is_string( $b['class'] ) - && $a['class'] !== $b['class'] ) { + if ( isset( $a['class'] ) && isset( $b['class'] ) + && is_string( $a['class'] ) && is_string( $b['class'] ) + && $a['class'] !== $b['class'] + ) { $classes = preg_split( '/\s+/', "{$a['class']} {$b['class']}", -1, PREG_SPLIT_NO_EMPTY ); $out['class'] = implode( ' ', array_unique( $classes ) ); @@ -811,9 +821,10 @@ class Sanitizer { /** * Pick apart some CSS and check it for forbidden or unsafe structures. * Returns a sanitized string. This sanitized string will have - * character references and escape sequences decoded, and comments - * stripped. If the input is just too evil, only a comment complaining - * about evilness will be returned. + * character references and escape sequences decoded and comments + * stripped (unless it is itself one valid comment, in which case the value + * will be passed through). If the input is just too evil, only a comment + * complaining about evilness will be returned. * * Currently URL references, 'expression', 'tps' are forbidden. * @@ -854,60 +865,28 @@ class Sanitizer { $value = preg_replace_callback( $decodeRegex, array( __CLASS__, 'cssDecodeCallback' ), $value ); - // Normalize Halfwidth and Fullwidth Unicode block that IE6 might treat as ascii - $value = preg_replace_callback( - '/[!-z]/u', // U+FF01 to U+FF5A - function ( $matches ) { - $cp = utf8ToCodepoint( $matches[0] ); - if ( $cp === false ) { - return ''; - } - return chr( $cp - 65248 ); // ASCII range \x21-\x7A - }, - $value - ); - - // Convert more characters IE6 might treat as ascii - // U+0280, U+0274, U+207F, U+029F, U+026A, U+207D, U+208D - $value = str_replace( - array( 'ʀ', 'ɴ', 'ⁿ', 'ʟ', 'ɪ', '⁽', '₍' ), - array( 'r', 'n', 'n', 'l', 'i', '(', '(' ), - $value - ); - - // Remove any comments; IE gets token splitting wrong - // This must be done AFTER decoding character references and - // escape sequences, because those steps can introduce comments - // This step cannot introduce character references or escape - // sequences, because it replaces comments with spaces rather - // than removing them completely. - $value = StringUtils::delimiterReplace( '/*', '*/', ' ', $value ); - - // Remove anything after a comment-start token, to guard against - // incorrect client implementations. - $commentPos = strpos( $value, '/*' ); - if ( $commentPos !== false ) { - $value = substr( $value, 0, $commentPos ); + // Let the value through if it's nothing but a single comment, to + // allow other functions which may reject it to pass some error + // message through. + if ( !preg_match( '! ^ \s* /\* [^*\\/]* \*/ \s* $ !x', $value ) ) { + // Remove any comments; IE gets token splitting wrong + // This must be done AFTER decoding character references and + // escape sequences, because those steps can introduce comments + // This step cannot introduce character references or escape + // sequences, because it replaces comments with spaces rather + // than removing them completely. + $value = StringUtils::delimiterReplace( '/*', '*/', ' ', $value ); + + // Remove anything after a comment-start token, to guard against + // incorrect client implementations. + $commentPos = strpos( $value, '/*' ); + if ( $commentPos !== false ) { + $value = substr( $value, 0, $commentPos ); + } } - // S followed by repeat, iteration, or prolonged sound marks, - // which IE will treat as "ss" - $value = preg_replace( - '/s(?: - \xE3\x80\xB1 | # U+3031 - \xE3\x82\x9D | # U+309D - \xE3\x83\xBC | # U+30FC - \xE3\x83\xBD | # U+30FD - \xEF\xB9\xBC | # U+FE7C - \xEF\xB9\xBD | # U+FE7D - \xEF\xBD\xB0 # U+FF70 - )/ix', - 'ss', - $value - ); - // Reject problematic keywords and control characters - if ( preg_match( '/[\000-\010\013\016-\037\177]/', $value ) ) { + if ( preg_match( '/[\000-\010\016-\037\177]/', $value ) ) { return '/* invalid control char */'; } elseif ( preg_match( '! expression | filter\s*: | accelerator\s*: | url\s*\( | image\s*\( | image-set\s*\( !ix', $value ) ) { return '/* insecure input */'; @@ -960,21 +939,14 @@ class Sanitizer { * @return String */ static function fixTagAttributes( $text, $element ) { - if( trim( $text ) == '' ) { + if ( trim( $text ) == '' ) { return ''; } $decoded = Sanitizer::decodeTagAttributes( $text ); $stripped = Sanitizer::validateTagAttributes( $decoded, $element ); - $attribs = array(); - foreach( $stripped as $attribute => $value ) { - $encAttribute = htmlspecialchars( $attribute ); - $encValue = Sanitizer::safeEncodeAttribute( $value ); - - $attribs[] = "$encAttribute=\"$encValue\""; - } - return count( $attribs ) ? ' ' . implode( ' ', $attribs ) : ''; + return Sanitizer::safeEncodeTagAttributes( $stripped ); } /** @@ -1047,7 +1019,7 @@ class Sanitizer { * in the id and * name attributes * @see http://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with the id attribute - * @see http://www.whatwg.org/specs/web-apps/current-work/multipage/elements.html#the-id-attribute + * @see http://www.whatwg.org/html/elements.html#the-id-attribute * HTML5 definition of id attribute * * @param string $id id to escape @@ -1062,10 +1034,10 @@ class Sanitizer { * @return String */ static function escapeId( $id, $options = array() ) { - global $wgHtml5, $wgExperimentalHtmlIds; + global $wgExperimentalHtmlIds; $options = (array)$options; - if ( $wgHtml5 && $wgExperimentalHtmlIds && !in_array( 'legacy', $options ) ) { + if ( $wgExperimentalHtmlIds && !in_array( 'legacy', $options ) ) { $id = Sanitizer::decodeCharReferences( $id ); $id = preg_replace( '/[ \t\n\r\f_\'"&#%]+/', '_', $id ); $id = trim( $id, '_' ); @@ -1146,13 +1118,13 @@ class Sanitizer { * @return Array */ public static function decodeTagAttributes( $text ) { - if( trim( $text ) == '' ) { + if ( trim( $text ) == '' ) { return array(); } $attribs = array(); $pairs = array(); - if( !preg_match_all( + if ( !preg_match_all( self::getAttribsRegex(), $text, $pairs, @@ -1160,7 +1132,7 @@ class Sanitizer { return $attribs; } - foreach( $pairs as $set ) { + foreach ( $pairs as $set ) { $attribute = strtolower( $set[1] ); $value = Sanitizer::getTagAttributeCallback( $set ); @@ -1174,6 +1146,24 @@ class Sanitizer { return $attribs; } + /** + * Build a partial tag string from an associative array of attribute + * names and values as returned by decodeTagAttributes. + * + * @param $assoc_array Array + * @return String + */ + public static function safeEncodeTagAttributes( $assoc_array ) { + $attribs = array(); + foreach ( $assoc_array as $attribute => $value ) { + $encAttribute = htmlspecialchars( $attribute ); + $encValue = Sanitizer::safeEncodeAttribute( $value ); + + $attribs[] = "$encAttribute=\"$encValue\""; + } + return count( $attribs ) ? ' ' . implode( ' ', $attribs ) : ''; + } + /** * Pick the appropriate attribute value from a match set from the * attribs regex matches. @@ -1183,19 +1173,19 @@ class Sanitizer { * @return String */ private static function getTagAttributeCallback( $set ) { - if( isset( $set[6] ) ) { + if ( isset( $set[6] ) ) { # Illegal #XXXXXX color with no quotes. return $set[6]; - } elseif( isset( $set[5] ) ) { + } elseif ( isset( $set[5] ) ) { # No quotes. return $set[5]; - } elseif( isset( $set[4] ) ) { + } elseif ( isset( $set[4] ) ) { # Single-quoted return $set[4]; - } elseif( isset( $set[3] ) ) { + } elseif ( isset( $set[3] ) ) { # Double-quoted return $set[3]; - } elseif( !isset( $set[2] ) ) { + } elseif ( !isset( $set[2] ) ) { # In XHTML, attributes must have a value. # For 'reduced' form, return explicitly the attribute name here. return $set[1]; @@ -1271,14 +1261,14 @@ class Sanitizer { */ static function normalizeCharReferencesCallback( $matches ) { $ret = null; - if( $matches[1] != '' ) { + if ( $matches[1] != '' ) { $ret = Sanitizer::normalizeEntity( $matches[1] ); - } elseif( $matches[2] != '' ) { + } elseif ( $matches[2] != '' ) { $ret = Sanitizer::decCharReference( $matches[2] ); - } elseif( $matches[3] != '' ) { + } elseif ( $matches[3] != '' ) { $ret = Sanitizer::hexCharReference( $matches[3] ); } - if( is_null( $ret ) ) { + if ( is_null( $ret ) ) { return htmlspecialchars( $matches[0] ); } else { return $ret; @@ -1314,7 +1304,7 @@ class Sanitizer { */ static function decCharReference( $codepoint ) { $point = intval( $codepoint ); - if( Sanitizer::validateCodepoint( $point ) ) { + if ( Sanitizer::validateCodepoint( $point ) ) { return sprintf( '&#%d;', $point ); } else { return null; @@ -1327,7 +1317,7 @@ class Sanitizer { */ static function hexCharReference( $codepoint ) { $point = hexdec( $codepoint ); - if( Sanitizer::validateCodepoint( $point ) ) { + if ( Sanitizer::validateCodepoint( $point ) ) { return sprintf( '&#x%x;', $point ); } else { return null; @@ -1340,12 +1330,12 @@ class Sanitizer { * @return Boolean */ private static function validateCodepoint( $codepoint ) { - return ($codepoint == 0x09) - || ($codepoint == 0x0a) - || ($codepoint == 0x0d) - || ($codepoint >= 0x20 && $codepoint <= 0xd7ff) - || ($codepoint >= 0xe000 && $codepoint <= 0xfffd) - || ($codepoint >= 0x10000 && $codepoint <= 0x10ffff); + return $codepoint == 0x09 + || $codepoint == 0x0a + || $codepoint == 0x0d + || ( $codepoint >= 0x20 && $codepoint <= 0xd7ff ) + || ( $codepoint >= 0xe000 && $codepoint <= 0xfffd ) + || ( $codepoint >= 0x10000 && $codepoint <= 0x10ffff ); } /** @@ -1391,12 +1381,12 @@ class Sanitizer { * @return String */ static function decodeCharReferencesCallback( $matches ) { - if( $matches[1] != '' ) { + if ( $matches[1] != '' ) { return Sanitizer::decodeEntity( $matches[1] ); - } elseif( $matches[2] != '' ) { - return Sanitizer::decodeChar( intval( $matches[2] ) ); - } elseif( $matches[3] != '' ) { - return Sanitizer::decodeChar( hexdec( $matches[3] ) ); + } elseif ( $matches[2] != '' ) { + return Sanitizer::decodeChar( intval( $matches[2] ) ); + } elseif ( $matches[3] != '' ) { + return Sanitizer::decodeChar( hexdec( $matches[3] ) ); } # Last case should be an ampersand by itself return $matches[0]; @@ -1410,7 +1400,7 @@ class Sanitizer { * @private */ static function decodeChar( $codepoint ) { - if( Sanitizer::validateCodepoint( $codepoint ) ) { + if ( Sanitizer::validateCodepoint( $codepoint ) ) { return codepointToUtf8( $codepoint ); } else { return UTF8_REPLACEMENT; @@ -1429,7 +1419,7 @@ class Sanitizer { if ( isset( self::$htmlEntityAliases[$name] ) ) { $name = self::$htmlEntityAliases[$name]; } - if( isset( self::$htmlEntities[$name] ) ) { + if ( isset( self::$htmlEntities[$name] ) ) { return codepointToUtf8( self::$htmlEntities[$name] ); } else { return "&$name;"; @@ -1455,10 +1445,10 @@ class Sanitizer { * @return Array */ static function setupAttributeWhitelist() { - global $wgAllowRdfaAttributes, $wgHtml5, $wgAllowMicrodataAttributes; + global $wgAllowRdfaAttributes, $wgAllowMicrodataAttributes; static $whitelist, $staticInitialised; - $globalContext = implode( '-', compact( 'wgAllowRdfaAttributes', 'wgHtml5', 'wgAllowMicrodataAttributes' ) ); + $globalContext = implode( '-', compact( 'wgAllowRdfaAttributes', 'wgAllowMicrodataAttributes' ) ); if ( isset( $whitelist ) && $staticInitialised == $globalContext ) { return $whitelist; @@ -1478,32 +1468,35 @@ class Sanitizer { ); if ( $wgAllowRdfaAttributes ) { - #RDFa attributes as specified in section 9 of http://www.w3.org/TR/2008/REC-rdfa-syntax-20081014 + # RDFa attributes as specified in section 9 of + # http://www.w3.org/TR/2008/REC-rdfa-syntax-20081014 $common = array_merge( $common, array( - 'about', 'property', 'resource', 'datatype', 'typeof', + 'about', 'property', 'resource', 'datatype', 'typeof', ) ); } - if ( $wgHtml5 && $wgAllowMicrodataAttributes ) { - # add HTML5 microdata tags as specified by http://www.whatwg.org/specs/web-apps/current-work/multipage/microdata.html#the-microdata-model + if ( $wgAllowMicrodataAttributes ) { + # add HTML5 microdata tags as specified by + # http://www.whatwg.org/html/microdata.html#the-microdata-model $common = array_merge( $common, array( - 'itemid', 'itemprop', 'itemref', 'itemscope', 'itemtype' + 'itemid', 'itemprop', 'itemref', 'itemscope', 'itemtype' ) ); } $block = array_merge( $common, array( 'align' ) ); $tablealign = array( 'align', 'char', 'charoff', 'valign' ); - $tablecell = array( 'abbr', - 'axis', - 'headers', - 'scope', - 'rowspan', - 'colspan', - 'nowrap', # deprecated - 'width', # deprecated - 'height', # deprecated - 'bgcolor' # deprecated - ); + $tablecell = array( + 'abbr', + 'axis', + 'headers', + 'scope', + 'rowspan', + 'colspan', + 'nowrap', # deprecated + 'width', # deprecated + 'height', # deprecated + 'bgcolor', # deprecated + ); # Numbers refer to sections in HTML 4.01 standard describing the element. # See: http://www.w3.org/TR/html4/ @@ -1553,6 +1546,9 @@ class Sanitizer { # 9.3.2 'br' => array( 'id', 'class', 'title', 'style', 'clear' ), + # http://www.whatwg.org/html/text-level-semantics.html#the-wbr-element + 'wbr' => array( 'id', 'class', 'title', 'style' ), + # 9.3.4 'pre' => array_merge( $common, array( 'width' ) ), @@ -1596,7 +1592,9 @@ class Sanitizer { 'td' => array_merge( $common, $tablecell, $tablealign ), 'th' => array_merge( $common, $tablecell, $tablealign ), - # 12.2 # NOTE: is not allowed directly, but the attrib whitelist is used from the Parser object + # 12.2 + # NOTE: is not allowed directly, but the attrib + # whitelist is used from the Parser object 'a' => array_merge( $common, array( 'href', 'rel', 'rev' ) ), # rel/rev esp. for RDFa # 13.2 @@ -1622,8 +1620,8 @@ class Sanitizer { # 15.3 'hr' => array_merge( $common, array( 'noshade', 'size', 'width' ) ), - # XHTML Ruby annotation text module, simple ruby only. - # http://www.w3c.org/TR/ruby/ + # HTML Ruby annotation text module, simple ruby only. + # http://www.whatwg.org/html/text-level-semantics.html#the-ruby-element 'ruby' => $common, # rbc # rtc @@ -1639,25 +1637,20 @@ class Sanitizer { # HTML 5 section 4.6 'bdi' => $common, - ); - - if ( $wgHtml5 ) { # HTML5 elements, defined by: - # http://www.whatwg.org/specs/web-apps/current-work/multipage/ - $whitelist += array( - 'data' => array_merge( $common, array( 'value' ) ), - 'time' => array_merge( $common, array( 'datetime' ) ), - 'mark' => $common, - - // meta and link are only permitted by removeHTMLtags when Microdata - // is enabled so we don't bother adding a conditional to hide these - // Also meta and link are only valid in WikiText as Microdata elements - // (ie: validateTag rejects tags missing the attributes needed for Microdata) - // So we don't bother including $common attributes that have no purpose. - 'meta' => array( 'itemprop', 'content' ), - 'link' => array( 'itemprop', 'href' ), - ); - } + # http://www.whatwg.org/html/ + 'data' => array_merge( $common, array( 'value' ) ), + 'time' => array_merge( $common, array( 'datetime' ) ), + 'mark' => $common, + + // meta and link are only permitted by removeHTMLtags when Microdata + // is enabled so we don't bother adding a conditional to hide these + // Also meta and link are only valid in WikiText as Microdata elements + // (ie: validateTag rejects tags missing the attributes needed for Microdata) + // So we don't bother including $common attributes that have no purpose. + 'meta' => array( 'itemprop', 'content' ), + 'link' => array( 'itemprop', 'href' ), + ); $staticInitialised = $globalContext; @@ -1696,7 +1689,7 @@ class Sanitizer { */ static function hackDocType() { $out = " $codepoint ) { + foreach ( self::$htmlEntities as $entity => $codepoint ) { $out .= ""; } $out .= "]>\n"; @@ -1718,7 +1711,7 @@ class Sanitizer { # Validate hostname portion $matches = array(); - if( preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $matches ) ) { + if ( preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $matches ) ) { list( /* $whole */, $protocol, $host, $rest ) = $matches; // Characters that will be ignored in IDNs. @@ -1762,7 +1755,7 @@ class Sanitizer { * Does a string look like an e-mail address? * * This validates an email address using an HTML5 specification found at: - * http://www.whatwg.org/specs/web-apps/current-work/multipage/states-of-the-type-attribute.html#valid-e-mail-address + * http://www.whatwg.org/html/states-of-the-type-attribute.html#valid-e-mail-address * Which as of 2011-01-24 says: * * A valid e-mail address is a string that matches the ABNF production @@ -1788,7 +1781,7 @@ class Sanitizer { */ public static function validateEmail( $addr ) { $result = null; - if( !wfRunHooks( 'isValidEmailAddr', array( $addr, &$result ) ) ) { + if ( !wfRunHooks( 'isValidEmailAddr', array( $addr, &$result ) ) ) { return $result; } @@ -1807,6 +1800,6 @@ class Sanitizer { $ # End of string /ix"; // case Insensitive, eXtended - return (bool) preg_match( $HTML5_email_regexp, $addr ); + return (bool)preg_match( $HTML5_email_regexp, $addr ); } } -- cgit v1.2.2