summaryrefslogtreecommitdiff
path: root/includes/Sanitizer.php
diff options
context:
space:
mode:
authorPierre Schmitz <pierre@archlinux.de>2013-12-08 09:55:49 +0100
committerPierre Schmitz <pierre@archlinux.de>2013-12-08 09:55:49 +0100
commit4ac9fa081a7c045f6a9f1cfc529d82423f485b2e (patch)
treeaf68743f2f4a47d13f2b0eb05f5c4aaf86d8ea37 /includes/Sanitizer.php
parentaf4da56f1ad4d3ef7b06557bae365da2ea27a897 (diff)
Update to MediaWiki 1.22.0
Diffstat (limited to 'includes/Sanitizer.php')
-rw-r--r--includes/Sanitizer.php361
1 files changed, 177 insertions, 184 deletions
diff --git a/includes/Sanitizer.php b/includes/Sanitizer.php
index 849e4d66..499d8218 100644
--- a/includes/Sanitizer.php
+++ b/includes/Sanitizer.php
@@ -1,6 +1,6 @@
<?php
/**
- * XHTML sanitizer for %MediaWiki.
+ * HTML sanitizer for %MediaWiki.
*
* Copyright © 2002-2005 Brion Vibber <brion@pobox.com> et al
* http://www.mediawiki.org/
@@ -25,7 +25,7 @@
*/
/**
- * XHTML sanitizer for MediaWiki
+ * HTML sanitizer for MediaWiki
* @ingroup Parser
*/
class Sanitizer {
@@ -54,9 +54,8 @@ class Sanitizer {
* List of all named character entities defined in HTML 4.01
* http://www.w3.org/TR/html4/sgml/entities.html
* As well as &apos; which is only defined starting in XHTML1.
- * @private
*/
- static $htmlEntities = array(
+ private static $htmlEntities = array(
'Aacute' => 193,
'aacute' => 225,
'Acirc' => 194,
@@ -315,7 +314,7 @@ class Sanitizer {
/**
* Character entity aliases accepted by MediaWiki
*/
- static $htmlEntityAliases = array(
+ private static $htmlEntityAliases = array(
'רלמ' => 'rlm',
'رلم' => 'rlm',
);
@@ -323,7 +322,7 @@ class Sanitizer {
/**
* Lazy-initialised attributes regex, see getAttribsRegex()
*/
- static $attribsRegex;
+ private static $attribsRegex;
/**
* Regular expression to match HTML/XML attribute pairs within a tag.
@@ -357,14 +356,17 @@ class Sanitizer {
* removes HTML comments
* @private
* @param $text String
- * @param $processCallback Callback to do any variable or parameter replacements in HTML attribute values
+ * @param $processCallback Callback to do any variable or parameter
+ * replacements in HTML attribute values
* @param array $args for the processing callback
* @param array $extratags for any extra tags to include
* @param array $removetags for any tags (default or extra) to exclude
* @return string
*/
- static function removeHTMLtags( $text, $processCallback = null, $args = array(), $extratags = array(), $removetags = array() ) {
- global $wgUseTidy, $wgHtml5, $wgAllowMicrodataAttributes, $wgAllowImageTag;
+ static function removeHTMLtags( $text, $processCallback = null,
+ $args = array(), $extratags = array(), $removetags = array()
+ ) {
+ global $wgUseTidy, $wgAllowMicrodataAttributes, $wgAllowImageTag;
static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
$htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic, $staticInitialised;
@@ -373,7 +375,7 @@ class Sanitizer {
// Base our staticInitialised variable off of the global config state so that if the globals
// are changed (like in the screwed up test system) we will re-initialise the settings.
- $globalContext = implode( '-', compact( 'wgHtml5', 'wgAllowMicrodataAttributes', 'wgAllowImageTag' ) );
+ $globalContext = implode( '-', compact( 'wgAllowMicrodataAttributes', 'wgAllowImageTag' ) );
if ( !$staticInitialised || $staticInitialised != $globalContext ) {
$htmlpairsStatic = array( # Tags that must be closed
@@ -382,30 +384,28 @@ class Sanitizer {
'strike', 'strong', 'tt', 'var', 'div', 'center',
'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
'ruby', 'rt', 'rb', 'rp', 'p', 'span', 'abbr', 'dfn',
- 'kbd', 'samp'
+ 'kbd', 'samp', 'data', 'time', 'mark'
);
- if ( $wgHtml5 ) {
- $htmlpairsStatic = array_merge( $htmlpairsStatic, array( 'data', 'time', 'mark' ) );
- }
$htmlsingle = array(
- 'br', 'hr', 'li', 'dt', 'dd'
+ 'br', 'wbr', 'hr', 'li', 'dt', 'dd'
);
$htmlsingleonly = array( # Elements that cannot have close tags
- 'br', 'hr'
+ 'br', 'wbr', 'hr'
);
- if ( $wgHtml5 && $wgAllowMicrodataAttributes ) {
+ if ( $wgAllowMicrodataAttributes ) {
$htmlsingle[] = $htmlsingleonly[] = 'meta';
$htmlsingle[] = $htmlsingleonly[] = 'link';
}
$htmlnest = array( # Tags that can be nested--??
'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
- 'li', 'dl', 'dt', 'dd', 'font', 'big', 'small', 'sub', 'sup', 'span'
+ 'li', 'dl', 'dt', 'dd', 'font', 'big', 'small', 'sub', 'sup', 'span',
+ 'var', 'kbd', 'samp', 'em', 'strong', 'q', 'ruby', 'bdo'
);
$tabletags = array( # Can only appear inside table, we will close them
'td', 'th', 'tr',
);
$htmllist = array( # Tags used by list
- 'ul','ol',
+ 'ul', 'ol',
);
$listtags = array( # Tags that can appear in a list
'li',
@@ -446,7 +446,7 @@ class Sanitizer {
# $params: String between element name and >
# $brace: Ending '>' or '/>'
# $rest: Everything until the next element of $bits
- if( preg_match( '!^(/?)(\\w+)([^>]*?)(/{0,1}>)([^<]*)$!', $x, $regs ) ) {
+ if ( preg_match( '!^(/?)([^\\s/>]+)([^>]*?)(/{0,1}>)([^<]*)$!', $x, $regs ) ) {
list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
} else {
$slash = $t = $params = $brace = $rest = null;
@@ -507,7 +507,7 @@ class Sanitizer {
!in_array( 'table', $tagstack ) ) {
$badtag = true;
} elseif ( in_array( $t, $tagstack ) &&
- !isset( $htmlnest [$t ] ) ) {
+ !isset( $htmlnest[$t] ) ) {
$badtag = true;
# Is it a self closed htmlpair ? (bug 5487)
} elseif ( $brace == '/>' &&
@@ -537,7 +537,7 @@ class Sanitizer {
# Replace any variables or template parameters with
# plaintext results.
- if( is_callable( $processCallback ) ) {
+ if ( is_callable( $processCallback ) ) {
call_user_func_array( $processCallback, array( &$params, $args ) );
}
@@ -555,12 +555,14 @@ class Sanitizer {
continue;
}
}
- $text .= '&lt;' . str_replace( '>', '&gt;', $x);
+ $text .= '&lt;' . str_replace( '>', '&gt;', $x );
}
# Close off any remaining tags
- while ( is_array( $tagstack ) && ($t = array_pop( $tagstack )) ) {
+ while ( is_array( $tagstack ) && ( $t = array_pop( $tagstack ) ) ) {
$text .= "</$t>\n";
- if ( $t == 'table' ) { $tagstack = array_pop( $tablestack ); }
+ if ( $t == 'table' ) {
+ $tagstack = array_pop( $tablestack );
+ }
}
} else {
# this might be possible using tidy itself
@@ -570,7 +572,7 @@ class Sanitizer {
@list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
$badtag = false;
if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
- if( is_callable( $processCallback ) ) {
+ if ( is_callable( $processCallback ) ) {
call_user_func_array( $processCallback, array( &$params, $args ) );
}
@@ -585,7 +587,7 @@ class Sanitizer {
continue;
}
}
- $text .= '&lt;' . str_replace( '>', '&gt;', $x);
+ $text .= '&lt;' . str_replace( '>', '&gt;', $x );
}
}
wfProfileOut( __METHOD__ );
@@ -604,7 +606,7 @@ class Sanitizer {
*/
static function removeHTMLcomments( $text ) {
wfProfileIn( __METHOD__ );
- while ( ($start = strpos( $text, '<!--' ) ) !== false ) {
+ while ( ( $start = strpos( $text, '<!--' ) ) !== false ) {
$end = strpos( $text, '-->', $start + 4 );
if ( $end === false ) {
# Unterminated comment; bail out
@@ -621,9 +623,11 @@ class Sanitizer {
$spaceStart--;
$spaceLen++;
}
- while ( substr( $text, $spaceStart + $spaceLen, 1 ) === ' ' )
+ while ( substr( $text, $spaceStart + $spaceLen, 1 ) === ' ' ) {
$spaceLen++;
- if ( substr( $text, $spaceStart, 1 ) === "\n" and substr( $text, $spaceStart + $spaceLen, 1 ) === "\n" ) {
+ }
+ if ( substr( $text, $spaceStart, 1 ) === "\n"
+ && substr( $text, $spaceStart + $spaceLen, 1 ) === "\n" ) {
# Remove the comment, leading and trailing
# spaces, and leave only one newline.
$text = substr_replace( $text, "\n", $spaceStart, $spaceLen + 1 );
@@ -706,13 +710,13 @@ class Sanitizer {
* @todo Check for unique id attribute :P
*/
static function validateAttributes( $attribs, $whitelist ) {
- global $wgAllowRdfaAttributes, $wgAllowMicrodataAttributes, $wgHtml5;
+ global $wgAllowRdfaAttributes, $wgAllowMicrodataAttributes;
$whitelist = array_flip( $whitelist );
$hrefExp = '/^(' . wfUrlProtocols() . ')[^\s]+$/';
$out = array();
- foreach( $attribs as $attribute => $value ) {
+ foreach ( $attribs as $attribute => $value ) {
#allow XML namespace declaration if RDFa is enabled
if ( $wgAllowRdfaAttributes && preg_match( self::XMLNS_ATTRIBUTE_PATTERN, $attribute ) ) {
if ( !preg_match( self::EVIL_URI_PATTERN, $value ) ) {
@@ -722,14 +726,14 @@ class Sanitizer {
continue;
}
- # Allow any attribute beginning with "data-", if in HTML5 mode
- if ( !($wgHtml5 && preg_match( '/^data-/i', $attribute )) && !isset( $whitelist[$attribute] ) ) {
+ # Allow any attribute beginning with "data-"
+ if ( !preg_match( '/^data-/i', $attribute ) && !isset( $whitelist[$attribute] ) ) {
continue;
}
# Strip javascript "expression" from stylesheets.
# http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
- if( $attribute == 'style' ) {
+ if ( $attribute == 'style' ) {
$value = Sanitizer::checkCss( $value );
}
@@ -739,7 +743,7 @@ class Sanitizer {
# WAI-ARIA
# http://www.w3.org/TR/wai-aria/
- # http://www.whatwg.org/specs/web-apps/current-work/multipage/elements.html#wai-aria
+ # http://www.whatwg.org/html/elements.html#wai-aria
# For now we only support role="presentation" until we work out what roles should be
# usable by content and we ensure that our code explicitly rejects patterns that
# violate HTML5's ARIA restrictions.
@@ -747,13 +751,18 @@ class Sanitizer {
continue;
}
- //RDFa and microdata properties allow URLs, URIs and/or CURIs. check them for sanity
- if ( $attribute === 'rel' || $attribute === 'rev' ||
- $attribute === 'about' || $attribute === 'property' || $attribute === 'resource' || #RDFa
- $attribute === 'datatype' || $attribute === 'typeof' || #RDFa
- $attribute === 'itemid' || $attribute === 'itemprop' || $attribute === 'itemref' || #HTML5 microdata
- $attribute === 'itemscope' || $attribute === 'itemtype' ) { #HTML5 microdata
-
+ // RDFa and microdata properties allow URLs, URIs and/or CURIs.
+ // Check them for sanity.
+ if ( $attribute === 'rel' || $attribute === 'rev'
+ # RDFa
+ || $attribute === 'about' || $attribute === 'property'
+ || $attribute === 'resource' || $attribute === 'datatype'
+ || $attribute === 'typeof'
+ # HTML5 microdata
+ || $attribute === 'itemid' || $attribute === 'itemprop'
+ || $attribute === 'itemref' || $attribute === 'itemscope'
+ || $attribute === 'itemtype'
+ ) {
//Paranoia. Allow "simple" values but suppress javascript
if ( preg_match( self::EVIL_URI_PATTERN, $value ) ) {
continue;
@@ -765,7 +774,7 @@ class Sanitizer {
if ( $attribute === 'href' || $attribute === 'src' ) {
if ( !preg_match( $hrefExp, $value ) ) {
continue; //drop any href or src attributes not using an allowed protocol.
- //NOTE: this also drops all relative URLs
+ // NOTE: this also drops all relative URLs
}
}
@@ -798,9 +807,10 @@ class Sanitizer {
*/
static function mergeAttributes( $a, $b ) {
$out = array_merge( $a, $b );
- if( isset( $a['class'] ) && isset( $b['class'] )
- && is_string( $a['class'] ) && is_string( $b['class'] )
- && $a['class'] !== $b['class'] ) {
+ if ( isset( $a['class'] ) && isset( $b['class'] )
+ && is_string( $a['class'] ) && is_string( $b['class'] )
+ && $a['class'] !== $b['class']
+ ) {
$classes = preg_split( '/\s+/', "{$a['class']} {$b['class']}",
-1, PREG_SPLIT_NO_EMPTY );
$out['class'] = implode( ' ', array_unique( $classes ) );
@@ -811,9 +821,10 @@ class Sanitizer {
/**
* Pick apart some CSS and check it for forbidden or unsafe structures.
* Returns a sanitized string. This sanitized string will have
- * character references and escape sequences decoded, and comments
- * stripped. If the input is just too evil, only a comment complaining
- * about evilness will be returned.
+ * character references and escape sequences decoded and comments
+ * stripped (unless it is itself one valid comment, in which case the value
+ * will be passed through). If the input is just too evil, only a comment
+ * complaining about evilness will be returned.
*
* Currently URL references, 'expression', 'tps' are forbidden.
*
@@ -854,60 +865,28 @@ class Sanitizer {
$value = preg_replace_callback( $decodeRegex,
array( __CLASS__, 'cssDecodeCallback' ), $value );
- // Normalize Halfwidth and Fullwidth Unicode block that IE6 might treat as ascii
- $value = preg_replace_callback(
- '/[!-z]/u', // U+FF01 to U+FF5A
- function ( $matches ) {
- $cp = utf8ToCodepoint( $matches[0] );
- if ( $cp === false ) {
- return '';
- }
- return chr( $cp - 65248 ); // ASCII range \x21-\x7A
- },
- $value
- );
-
- // Convert more characters IE6 might treat as ascii
- // U+0280, U+0274, U+207F, U+029F, U+026A, U+207D, U+208D
- $value = str_replace(
- array( 'ʀ', 'ɴ', 'ⁿ', 'ʟ', 'ɪ', '⁽', '₍' ),
- array( 'r', 'n', 'n', 'l', 'i', '(', '(' ),
- $value
- );
-
- // Remove any comments; IE gets token splitting wrong
- // This must be done AFTER decoding character references and
- // escape sequences, because those steps can introduce comments
- // This step cannot introduce character references or escape
- // sequences, because it replaces comments with spaces rather
- // than removing them completely.
- $value = StringUtils::delimiterReplace( '/*', '*/', ' ', $value );
-
- // Remove anything after a comment-start token, to guard against
- // incorrect client implementations.
- $commentPos = strpos( $value, '/*' );
- if ( $commentPos !== false ) {
- $value = substr( $value, 0, $commentPos );
+ // Let the value through if it's nothing but a single comment, to
+ // allow other functions which may reject it to pass some error
+ // message through.
+ if ( !preg_match( '! ^ \s* /\* [^*\\/]* \*/ \s* $ !x', $value ) ) {
+ // Remove any comments; IE gets token splitting wrong
+ // This must be done AFTER decoding character references and
+ // escape sequences, because those steps can introduce comments
+ // This step cannot introduce character references or escape
+ // sequences, because it replaces comments with spaces rather
+ // than removing them completely.
+ $value = StringUtils::delimiterReplace( '/*', '*/', ' ', $value );
+
+ // Remove anything after a comment-start token, to guard against
+ // incorrect client implementations.
+ $commentPos = strpos( $value, '/*' );
+ if ( $commentPos !== false ) {
+ $value = substr( $value, 0, $commentPos );
+ }
}
- // S followed by repeat, iteration, or prolonged sound marks,
- // which IE will treat as "ss"
- $value = preg_replace(
- '/s(?:
- \xE3\x80\xB1 | # U+3031
- \xE3\x82\x9D | # U+309D
- \xE3\x83\xBC | # U+30FC
- \xE3\x83\xBD | # U+30FD
- \xEF\xB9\xBC | # U+FE7C
- \xEF\xB9\xBD | # U+FE7D
- \xEF\xBD\xB0 # U+FF70
- )/ix',
- 'ss',
- $value
- );
-
// Reject problematic keywords and control characters
- if ( preg_match( '/[\000-\010\013\016-\037\177]/', $value ) ) {
+ if ( preg_match( '/[\000-\010\016-\037\177]/', $value ) ) {
return '/* invalid control char */';
} elseif ( preg_match( '! expression | filter\s*: | accelerator\s*: | url\s*\( | image\s*\( | image-set\s*\( !ix', $value ) ) {
return '/* insecure input */';
@@ -960,21 +939,14 @@ class Sanitizer {
* @return String
*/
static function fixTagAttributes( $text, $element ) {
- if( trim( $text ) == '' ) {
+ if ( trim( $text ) == '' ) {
return '';
}
$decoded = Sanitizer::decodeTagAttributes( $text );
$stripped = Sanitizer::validateTagAttributes( $decoded, $element );
- $attribs = array();
- foreach( $stripped as $attribute => $value ) {
- $encAttribute = htmlspecialchars( $attribute );
- $encValue = Sanitizer::safeEncodeAttribute( $value );
-
- $attribs[] = "$encAttribute=\"$encValue\"";
- }
- return count( $attribs ) ? ' ' . implode( ' ', $attribs ) : '';
+ return Sanitizer::safeEncodeTagAttributes( $stripped );
}
/**
@@ -1047,7 +1019,7 @@ class Sanitizer {
* in the id and
* name attributes
* @see http://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with the id attribute
- * @see http://www.whatwg.org/specs/web-apps/current-work/multipage/elements.html#the-id-attribute
+ * @see http://www.whatwg.org/html/elements.html#the-id-attribute
* HTML5 definition of id attribute
*
* @param string $id id to escape
@@ -1062,10 +1034,10 @@ class Sanitizer {
* @return String
*/
static function escapeId( $id, $options = array() ) {
- global $wgHtml5, $wgExperimentalHtmlIds;
+ global $wgExperimentalHtmlIds;
$options = (array)$options;
- if ( $wgHtml5 && $wgExperimentalHtmlIds && !in_array( 'legacy', $options ) ) {
+ if ( $wgExperimentalHtmlIds && !in_array( 'legacy', $options ) ) {
$id = Sanitizer::decodeCharReferences( $id );
$id = preg_replace( '/[ \t\n\r\f_\'"&#%]+/', '_', $id );
$id = trim( $id, '_' );
@@ -1146,13 +1118,13 @@ class Sanitizer {
* @return Array
*/
public static function decodeTagAttributes( $text ) {
- if( trim( $text ) == '' ) {
+ if ( trim( $text ) == '' ) {
return array();
}
$attribs = array();
$pairs = array();
- if( !preg_match_all(
+ if ( !preg_match_all(
self::getAttribsRegex(),
$text,
$pairs,
@@ -1160,7 +1132,7 @@ class Sanitizer {
return $attribs;
}
- foreach( $pairs as $set ) {
+ foreach ( $pairs as $set ) {
$attribute = strtolower( $set[1] );
$value = Sanitizer::getTagAttributeCallback( $set );
@@ -1175,6 +1147,24 @@ class Sanitizer {
}
/**
+ * Build a partial tag string from an associative array of attribute
+ * names and values as returned by decodeTagAttributes.
+ *
+ * @param $assoc_array Array
+ * @return String
+ */
+ public static function safeEncodeTagAttributes( $assoc_array ) {
+ $attribs = array();
+ foreach ( $assoc_array as $attribute => $value ) {
+ $encAttribute = htmlspecialchars( $attribute );
+ $encValue = Sanitizer::safeEncodeAttribute( $value );
+
+ $attribs[] = "$encAttribute=\"$encValue\"";
+ }
+ return count( $attribs ) ? ' ' . implode( ' ', $attribs ) : '';
+ }
+
+ /**
* Pick the appropriate attribute value from a match set from the
* attribs regex matches.
*
@@ -1183,19 +1173,19 @@ class Sanitizer {
* @return String
*/
private static function getTagAttributeCallback( $set ) {
- if( isset( $set[6] ) ) {
+ if ( isset( $set[6] ) ) {
# Illegal #XXXXXX color with no quotes.
return $set[6];
- } elseif( isset( $set[5] ) ) {
+ } elseif ( isset( $set[5] ) ) {
# No quotes.
return $set[5];
- } elseif( isset( $set[4] ) ) {
+ } elseif ( isset( $set[4] ) ) {
# Single-quoted
return $set[4];
- } elseif( isset( $set[3] ) ) {
+ } elseif ( isset( $set[3] ) ) {
# Double-quoted
return $set[3];
- } elseif( !isset( $set[2] ) ) {
+ } elseif ( !isset( $set[2] ) ) {
# In XHTML, attributes must have a value.
# For 'reduced' form, return explicitly the attribute name here.
return $set[1];
@@ -1271,14 +1261,14 @@ class Sanitizer {
*/
static function normalizeCharReferencesCallback( $matches ) {
$ret = null;
- if( $matches[1] != '' ) {
+ if ( $matches[1] != '' ) {
$ret = Sanitizer::normalizeEntity( $matches[1] );
- } elseif( $matches[2] != '' ) {
+ } elseif ( $matches[2] != '' ) {
$ret = Sanitizer::decCharReference( $matches[2] );
- } elseif( $matches[3] != '' ) {
+ } elseif ( $matches[3] != '' ) {
$ret = Sanitizer::hexCharReference( $matches[3] );
}
- if( is_null( $ret ) ) {
+ if ( is_null( $ret ) ) {
return htmlspecialchars( $matches[0] );
} else {
return $ret;
@@ -1314,7 +1304,7 @@ class Sanitizer {
*/
static function decCharReference( $codepoint ) {
$point = intval( $codepoint );
- if( Sanitizer::validateCodepoint( $point ) ) {
+ if ( Sanitizer::validateCodepoint( $point ) ) {
return sprintf( '&#%d;', $point );
} else {
return null;
@@ -1327,7 +1317,7 @@ class Sanitizer {
*/
static function hexCharReference( $codepoint ) {
$point = hexdec( $codepoint );
- if( Sanitizer::validateCodepoint( $point ) ) {
+ if ( Sanitizer::validateCodepoint( $point ) ) {
return sprintf( '&#x%x;', $point );
} else {
return null;
@@ -1340,12 +1330,12 @@ class Sanitizer {
* @return Boolean
*/
private static function validateCodepoint( $codepoint ) {
- return ($codepoint == 0x09)
- || ($codepoint == 0x0a)
- || ($codepoint == 0x0d)
- || ($codepoint >= 0x20 && $codepoint <= 0xd7ff)
- || ($codepoint >= 0xe000 && $codepoint <= 0xfffd)
- || ($codepoint >= 0x10000 && $codepoint <= 0x10ffff);
+ return $codepoint == 0x09
+ || $codepoint == 0x0a
+ || $codepoint == 0x0d
+ || ( $codepoint >= 0x20 && $codepoint <= 0xd7ff )
+ || ( $codepoint >= 0xe000 && $codepoint <= 0xfffd )
+ || ( $codepoint >= 0x10000 && $codepoint <= 0x10ffff );
}
/**
@@ -1391,12 +1381,12 @@ class Sanitizer {
* @return String
*/
static function decodeCharReferencesCallback( $matches ) {
- if( $matches[1] != '' ) {
+ if ( $matches[1] != '' ) {
return Sanitizer::decodeEntity( $matches[1] );
- } elseif( $matches[2] != '' ) {
- return Sanitizer::decodeChar( intval( $matches[2] ) );
- } elseif( $matches[3] != '' ) {
- return Sanitizer::decodeChar( hexdec( $matches[3] ) );
+ } elseif ( $matches[2] != '' ) {
+ return Sanitizer::decodeChar( intval( $matches[2] ) );
+ } elseif ( $matches[3] != '' ) {
+ return Sanitizer::decodeChar( hexdec( $matches[3] ) );
}
# Last case should be an ampersand by itself
return $matches[0];
@@ -1410,7 +1400,7 @@ class Sanitizer {
* @private
*/
static function decodeChar( $codepoint ) {
- if( Sanitizer::validateCodepoint( $codepoint ) ) {
+ if ( Sanitizer::validateCodepoint( $codepoint ) ) {
return codepointToUtf8( $codepoint );
} else {
return UTF8_REPLACEMENT;
@@ -1429,7 +1419,7 @@ class Sanitizer {
if ( isset( self::$htmlEntityAliases[$name] ) ) {
$name = self::$htmlEntityAliases[$name];
}
- if( isset( self::$htmlEntities[$name] ) ) {
+ if ( isset( self::$htmlEntities[$name] ) ) {
return codepointToUtf8( self::$htmlEntities[$name] );
} else {
return "&$name;";
@@ -1455,10 +1445,10 @@ class Sanitizer {
* @return Array
*/
static function setupAttributeWhitelist() {
- global $wgAllowRdfaAttributes, $wgHtml5, $wgAllowMicrodataAttributes;
+ global $wgAllowRdfaAttributes, $wgAllowMicrodataAttributes;
static $whitelist, $staticInitialised;
- $globalContext = implode( '-', compact( 'wgAllowRdfaAttributes', 'wgHtml5', 'wgAllowMicrodataAttributes' ) );
+ $globalContext = implode( '-', compact( 'wgAllowRdfaAttributes', 'wgAllowMicrodataAttributes' ) );
if ( isset( $whitelist ) && $staticInitialised == $globalContext ) {
return $whitelist;
@@ -1478,32 +1468,35 @@ class Sanitizer {
);
if ( $wgAllowRdfaAttributes ) {
- #RDFa attributes as specified in section 9 of http://www.w3.org/TR/2008/REC-rdfa-syntax-20081014
+ # RDFa attributes as specified in section 9 of
+ # http://www.w3.org/TR/2008/REC-rdfa-syntax-20081014
$common = array_merge( $common, array(
- 'about', 'property', 'resource', 'datatype', 'typeof',
+ 'about', 'property', 'resource', 'datatype', 'typeof',
) );
}
- if ( $wgHtml5 && $wgAllowMicrodataAttributes ) {
- # add HTML5 microdata tags as specified by http://www.whatwg.org/specs/web-apps/current-work/multipage/microdata.html#the-microdata-model
+ if ( $wgAllowMicrodataAttributes ) {
+ # add HTML5 microdata tags as specified by
+ # http://www.whatwg.org/html/microdata.html#the-microdata-model
$common = array_merge( $common, array(
- 'itemid', 'itemprop', 'itemref', 'itemscope', 'itemtype'
+ 'itemid', 'itemprop', 'itemref', 'itemscope', 'itemtype'
) );
}
$block = array_merge( $common, array( 'align' ) );
$tablealign = array( 'align', 'char', 'charoff', 'valign' );
- $tablecell = array( 'abbr',
- 'axis',
- 'headers',
- 'scope',
- 'rowspan',
- 'colspan',
- 'nowrap', # deprecated
- 'width', # deprecated
- 'height', # deprecated
- 'bgcolor' # deprecated
- );
+ $tablecell = array(
+ 'abbr',
+ 'axis',
+ 'headers',
+ 'scope',
+ 'rowspan',
+ 'colspan',
+ 'nowrap', # deprecated
+ 'width', # deprecated
+ 'height', # deprecated
+ 'bgcolor', # deprecated
+ );
# Numbers refer to sections in HTML 4.01 standard describing the element.
# See: http://www.w3.org/TR/html4/
@@ -1553,6 +1546,9 @@ class Sanitizer {
# 9.3.2
'br' => array( 'id', 'class', 'title', 'style', 'clear' ),
+ # http://www.whatwg.org/html/text-level-semantics.html#the-wbr-element
+ 'wbr' => array( 'id', 'class', 'title', 'style' ),
+
# 9.3.4
'pre' => array_merge( $common, array( 'width' ) ),
@@ -1596,7 +1592,9 @@ class Sanitizer {
'td' => array_merge( $common, $tablecell, $tablealign ),
'th' => array_merge( $common, $tablecell, $tablealign ),
- # 12.2 # NOTE: <a> is not allowed directly, but the attrib whitelist is used from the Parser object
+ # 12.2
+ # NOTE: <a> is not allowed directly, but the attrib
+ # whitelist is used from the Parser object
'a' => array_merge( $common, array( 'href', 'rel', 'rev' ) ), # rel/rev esp. for RDFa
# 13.2
@@ -1622,8 +1620,8 @@ class Sanitizer {
# 15.3
'hr' => array_merge( $common, array( 'noshade', 'size', 'width' ) ),
- # XHTML Ruby annotation text module, simple ruby only.
- # http://www.w3c.org/TR/ruby/
+ # HTML Ruby annotation text module, simple ruby only.
+ # http://www.whatwg.org/html/text-level-semantics.html#the-ruby-element
'ruby' => $common,
# rbc
# rtc
@@ -1639,25 +1637,20 @@ class Sanitizer {
# HTML 5 section 4.6
'bdi' => $common,
- );
-
- if ( $wgHtml5 ) {
# HTML5 elements, defined by:
- # http://www.whatwg.org/specs/web-apps/current-work/multipage/
- $whitelist += array(
- 'data' => array_merge( $common, array( 'value' ) ),
- 'time' => array_merge( $common, array( 'datetime' ) ),
- 'mark' => $common,
-
- // meta and link are only permitted by removeHTMLtags when Microdata
- // is enabled so we don't bother adding a conditional to hide these
- // Also meta and link are only valid in WikiText as Microdata elements
- // (ie: validateTag rejects tags missing the attributes needed for Microdata)
- // So we don't bother including $common attributes that have no purpose.
- 'meta' => array( 'itemprop', 'content' ),
- 'link' => array( 'itemprop', 'href' ),
- );
- }
+ # http://www.whatwg.org/html/
+ 'data' => array_merge( $common, array( 'value' ) ),
+ 'time' => array_merge( $common, array( 'datetime' ) ),
+ 'mark' => $common,
+
+ // meta and link are only permitted by removeHTMLtags when Microdata
+ // is enabled so we don't bother adding a conditional to hide these
+ // Also meta and link are only valid in WikiText as Microdata elements
+ // (ie: validateTag rejects tags missing the attributes needed for Microdata)
+ // So we don't bother including $common attributes that have no purpose.
+ 'meta' => array( 'itemprop', 'content' ),
+ 'link' => array( 'itemprop', 'href' ),
+ );
$staticInitialised = $globalContext;
@@ -1696,7 +1689,7 @@ class Sanitizer {
*/
static function hackDocType() {
$out = "<!DOCTYPE html [\n";
- foreach( self::$htmlEntities as $entity => $codepoint ) {
+ foreach ( self::$htmlEntities as $entity => $codepoint ) {
$out .= "<!ENTITY $entity \"&#$codepoint;\">";
}
$out .= "]>\n";
@@ -1718,7 +1711,7 @@ class Sanitizer {
# Validate hostname portion
$matches = array();
- if( preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $matches ) ) {
+ if ( preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $matches ) ) {
list( /* $whole */, $protocol, $host, $rest ) = $matches;
// Characters that will be ignored in IDNs.
@@ -1762,7 +1755,7 @@ class Sanitizer {
* Does a string look like an e-mail address?
*
* This validates an email address using an HTML5 specification found at:
- * http://www.whatwg.org/specs/web-apps/current-work/multipage/states-of-the-type-attribute.html#valid-e-mail-address
+ * http://www.whatwg.org/html/states-of-the-type-attribute.html#valid-e-mail-address
* Which as of 2011-01-24 says:
*
* A valid e-mail address is a string that matches the ABNF production
@@ -1788,7 +1781,7 @@ class Sanitizer {
*/
public static function validateEmail( $addr ) {
$result = null;
- if( !wfRunHooks( 'isValidEmailAddr', array( $addr, &$result ) ) ) {
+ if ( !wfRunHooks( 'isValidEmailAddr', array( $addr, &$result ) ) ) {
return $result;
}
@@ -1807,6 +1800,6 @@ class Sanitizer {
$ # End of string
/ix"; // case Insensitive, eXtended
- return (bool) preg_match( $HTML5_email_regexp, $addr );
+ return (bool)preg_match( $HTML5_email_regexp, $addr );
}
}