From 4ac9fa081a7c045f6a9f1cfc529d82423f485b2e Mon Sep 17 00:00:00 2001
From: Pierre Schmitz <pierre@archlinux.de>
Date: Sun, 8 Dec 2013 09:55:49 +0100
Subject: Update to MediaWiki 1.22.0

---
 includes/Sanitizer.php | 361 ++++++++++++++++++++++++-------------------------
 1 file changed, 177 insertions(+), 184 deletions(-)

(limited to 'includes/Sanitizer.php')

diff --git a/includes/Sanitizer.php b/includes/Sanitizer.php
index 849e4d66..499d8218 100644
--- a/includes/Sanitizer.php
+++ b/includes/Sanitizer.php
@@ -1,6 +1,6 @@
 <?php
 /**
- * XHTML sanitizer for %MediaWiki.
+ * HTML sanitizer for %MediaWiki.
  *
  * Copyright © 2002-2005 Brion Vibber <brion@pobox.com> et al
  * http://www.mediawiki.org/
@@ -25,7 +25,7 @@
  */
 
 /**
- * XHTML sanitizer for MediaWiki
+ * HTML sanitizer for MediaWiki
  * @ingroup Parser
  */
 class Sanitizer {
@@ -54,9 +54,8 @@ class Sanitizer {
 	 * List of all named character entities defined in HTML 4.01
 	 * http://www.w3.org/TR/html4/sgml/entities.html
 	 * As well as &apos; which is only defined starting in XHTML1.
-	 * @private
 	 */
-	static $htmlEntities = array(
+	private static $htmlEntities = array(
 		'Aacute'   => 193,
 		'aacute'   => 225,
 		'Acirc'    => 194,
@@ -315,7 +314,7 @@ class Sanitizer {
 	/**
 	 * Character entity aliases accepted by MediaWiki
 	 */
-	static $htmlEntityAliases = array(
+	private static $htmlEntityAliases = array(
 		'רלמ' => 'rlm',
 		'رلم' => 'rlm',
 	);
@@ -323,7 +322,7 @@ class Sanitizer {
 	/**
 	 * Lazy-initialised attributes regex, see getAttribsRegex()
 	 */
-	static $attribsRegex;
+	private static $attribsRegex;
 
 	/**
 	 * Regular expression to match HTML/XML attribute pairs within a tag.
@@ -357,14 +356,17 @@ class Sanitizer {
 	 * removes HTML comments
 	 * @private
 	 * @param $text String
-	 * @param $processCallback Callback to do any variable or parameter replacements in HTML attribute values
+	 * @param $processCallback Callback to do any variable or parameter
+	 *        replacements in HTML attribute values
 	 * @param array $args for the processing callback
 	 * @param array $extratags for any extra tags to include
 	 * @param array $removetags for any tags (default or extra) to exclude
 	 * @return string
 	 */
-	static function removeHTMLtags( $text, $processCallback = null, $args = array(), $extratags = array(), $removetags = array() ) {
-		global $wgUseTidy, $wgHtml5, $wgAllowMicrodataAttributes, $wgAllowImageTag;
+	static function removeHTMLtags( $text, $processCallback = null,
+		$args = array(), $extratags = array(), $removetags = array()
+	) {
+		global $wgUseTidy, $wgAllowMicrodataAttributes, $wgAllowImageTag;
 
 		static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
 			$htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic, $staticInitialised;
@@ -373,7 +375,7 @@ class Sanitizer {
 
 		// Base our staticInitialised variable off of the global config state so that if the globals
 		// are changed (like in the screwed up test system) we will re-initialise the settings.
-		$globalContext = implode( '-', compact( 'wgHtml5', 'wgAllowMicrodataAttributes', 'wgAllowImageTag' ) );
+		$globalContext = implode( '-', compact( 'wgAllowMicrodataAttributes', 'wgAllowImageTag' ) );
 		if ( !$staticInitialised || $staticInitialised != $globalContext ) {
 
 			$htmlpairsStatic = array( # Tags that must be closed
@@ -382,30 +384,28 @@ class Sanitizer {
 				'strike', 'strong', 'tt', 'var', 'div', 'center',
 				'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
 				'ruby', 'rt', 'rb', 'rp', 'p', 'span', 'abbr', 'dfn',
-				'kbd', 'samp'
+				'kbd', 'samp', 'data', 'time', 'mark'
 			);
-			if ( $wgHtml5 ) {
-				$htmlpairsStatic = array_merge( $htmlpairsStatic, array( 'data', 'time', 'mark' ) );
-			}
 			$htmlsingle = array(
-				'br', 'hr', 'li', 'dt', 'dd'
+				'br', 'wbr', 'hr', 'li', 'dt', 'dd'
 			);
 			$htmlsingleonly = array( # Elements that cannot have close tags
-				'br', 'hr'
+				'br', 'wbr', 'hr'
 			);
-			if ( $wgHtml5 && $wgAllowMicrodataAttributes ) {
+			if ( $wgAllowMicrodataAttributes ) {
 				$htmlsingle[] = $htmlsingleonly[] = 'meta';
 				$htmlsingle[] = $htmlsingleonly[] = 'link';
 			}
 			$htmlnest = array( # Tags that can be nested--??
 				'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
-				'li', 'dl', 'dt', 'dd', 'font', 'big', 'small', 'sub', 'sup', 'span'
+				'li', 'dl', 'dt', 'dd', 'font', 'big', 'small', 'sub', 'sup', 'span',
+				'var', 'kbd', 'samp', 'em', 'strong', 'q', 'ruby', 'bdo'
 			);
 			$tabletags = array( # Can only appear inside table, we will close them
 				'td', 'th', 'tr',
 			);
 			$htmllist = array( # Tags used by list
-				'ul','ol',
+				'ul', 'ol',
 			);
 			$listtags = array( # Tags that can appear in a list
 				'li',
@@ -446,7 +446,7 @@ class Sanitizer {
 				# $params: String between element name and >
 				# $brace: Ending '>' or '/>'
 				# $rest: Everything until the next element of $bits
-				if( preg_match( '!^(/?)(\\w+)([^>]*?)(/{0,1}>)([^<]*)$!', $x, $regs ) ) {
+				if ( preg_match( '!^(/?)([^\\s/>]+)([^>]*?)(/{0,1}>)([^<]*)$!', $x, $regs ) ) {
 					list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
 				} else {
 					$slash = $t = $params = $brace = $rest = null;
@@ -507,7 +507,7 @@ class Sanitizer {
 						!in_array( 'table', $tagstack ) ) {
 							$badtag = true;
 						} elseif ( in_array( $t, $tagstack ) &&
-						!isset( $htmlnest [$t ] ) ) {
+						!isset( $htmlnest[$t] ) ) {
 							$badtag = true;
 						# Is it a self closed htmlpair ? (bug 5487)
 						} elseif ( $brace == '/>' &&
@@ -537,7 +537,7 @@ class Sanitizer {
 
 						# Replace any variables or template parameters with
 						# plaintext results.
-						if( is_callable( $processCallback ) ) {
+						if ( is_callable( $processCallback ) ) {
 							call_user_func_array( $processCallback, array( &$params, $args ) );
 						}
 
@@ -555,12 +555,14 @@ class Sanitizer {
 						continue;
 					}
 				}
-				$text .= '&lt;' . str_replace( '>', '&gt;', $x);
+				$text .= '&lt;' . str_replace( '>', '&gt;', $x );
 			}
 			# Close off any remaining tags
-			while ( is_array( $tagstack ) && ($t = array_pop( $tagstack )) ) {
+			while ( is_array( $tagstack ) && ( $t = array_pop( $tagstack ) ) ) {
 				$text .= "</$t>\n";
-				if ( $t == 'table' ) { $tagstack = array_pop( $tablestack ); }
+				if ( $t == 'table' ) {
+					$tagstack = array_pop( $tablestack );
+				}
 			}
 		} else {
 			# this might be possible using tidy itself
@@ -570,7 +572,7 @@ class Sanitizer {
 				@list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
 				$badtag = false;
 				if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
-					if( is_callable( $processCallback ) ) {
+					if ( is_callable( $processCallback ) ) {
 						call_user_func_array( $processCallback, array( &$params, $args ) );
 					}
 
@@ -585,7 +587,7 @@ class Sanitizer {
 						continue;
 					}
 				}
-				$text .= '&lt;' . str_replace( '>', '&gt;', $x);
+				$text .= '&lt;' . str_replace( '>', '&gt;', $x );
 			}
 		}
 		wfProfileOut( __METHOD__ );
@@ -604,7 +606,7 @@ class Sanitizer {
 	 */
 	static function removeHTMLcomments( $text ) {
 		wfProfileIn( __METHOD__ );
-		while ( ($start = strpos( $text, '<!--' ) ) !== false ) {
+		while ( ( $start = strpos( $text, '<!--' ) ) !== false ) {
 			$end = strpos( $text, '-->', $start + 4 );
 			if ( $end === false ) {
 				# Unterminated comment; bail out
@@ -621,9 +623,11 @@ class Sanitizer {
 				$spaceStart--;
 				$spaceLen++;
 			}
-			while ( substr( $text, $spaceStart + $spaceLen, 1 ) === ' ' )
+			while ( substr( $text, $spaceStart + $spaceLen, 1 ) === ' ' ) {
 				$spaceLen++;
-			if ( substr( $text, $spaceStart, 1 ) === "\n" and substr( $text, $spaceStart + $spaceLen, 1 ) === "\n" ) {
+			}
+			if ( substr( $text, $spaceStart, 1 ) === "\n"
+				&& substr( $text, $spaceStart + $spaceLen, 1 ) === "\n" ) {
 				# Remove the comment, leading and trailing
 				# spaces, and leave only one newline.
 				$text = substr_replace( $text, "\n", $spaceStart, $spaceLen + 1 );
@@ -706,13 +710,13 @@ class Sanitizer {
 	 * @todo Check for unique id attribute :P
 	 */
 	static function validateAttributes( $attribs, $whitelist ) {
-		global $wgAllowRdfaAttributes, $wgAllowMicrodataAttributes, $wgHtml5;
+		global $wgAllowRdfaAttributes, $wgAllowMicrodataAttributes;
 
 		$whitelist = array_flip( $whitelist );
 		$hrefExp = '/^(' . wfUrlProtocols() . ')[^\s]+$/';
 
 		$out = array();
-		foreach( $attribs as $attribute => $value ) {
+		foreach ( $attribs as $attribute => $value ) {
 			#allow XML namespace declaration if RDFa is enabled
 			if ( $wgAllowRdfaAttributes && preg_match( self::XMLNS_ATTRIBUTE_PATTERN, $attribute ) ) {
 				if ( !preg_match( self::EVIL_URI_PATTERN, $value ) ) {
@@ -722,14 +726,14 @@ class Sanitizer {
 				continue;
 			}
 
-			# Allow any attribute beginning with "data-", if in HTML5 mode
-			if ( !($wgHtml5 && preg_match( '/^data-/i', $attribute )) && !isset( $whitelist[$attribute] ) ) {
+			# Allow any attribute beginning with "data-"
+			if ( !preg_match( '/^data-/i', $attribute ) && !isset( $whitelist[$attribute] ) ) {
 				continue;
 			}
 
 			# Strip javascript "expression" from stylesheets.
 			# http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
-			if( $attribute == 'style' ) {
+			if ( $attribute == 'style' ) {
 				$value = Sanitizer::checkCss( $value );
 			}
 
@@ -739,7 +743,7 @@ class Sanitizer {
 
 			# WAI-ARIA
 			# http://www.w3.org/TR/wai-aria/
-			# http://www.whatwg.org/specs/web-apps/current-work/multipage/elements.html#wai-aria
+			# http://www.whatwg.org/html/elements.html#wai-aria
 			# For now we only support role="presentation" until we work out what roles should be
 			# usable by content and we ensure that our code explicitly rejects patterns that
 			# violate HTML5's ARIA restrictions.
@@ -747,13 +751,18 @@ class Sanitizer {
 				continue;
 			}
 
-			//RDFa and microdata properties allow URLs, URIs and/or CURIs. check them for sanity
-			if ( $attribute === 'rel' || $attribute === 'rev' ||
-				$attribute === 'about' || $attribute === 'property' || $attribute === 'resource' || #RDFa
-				$attribute === 'datatype' || $attribute === 'typeof' ||                             #RDFa
-				$attribute === 'itemid' || $attribute === 'itemprop' || $attribute === 'itemref' || #HTML5 microdata
-				$attribute === 'itemscope' || $attribute === 'itemtype' ) {                         #HTML5 microdata
-
+			// RDFa and microdata properties allow URLs, URIs and/or CURIEs.
+			// Check them for sanity.
+			if ( $attribute === 'rel' || $attribute === 'rev'
+				# RDFa
+				|| $attribute === 'about' || $attribute === 'property'
+				|| $attribute === 'resource' || $attribute === 'datatype'
+				|| $attribute === 'typeof'
+				# HTML5 microdata
+				|| $attribute === 'itemid' || $attribute === 'itemprop'
+				|| $attribute === 'itemref' || $attribute === 'itemscope'
+				|| $attribute === 'itemtype'
+			) {
 				//Paranoia. Allow "simple" values but suppress javascript
 				if ( preg_match( self::EVIL_URI_PATTERN, $value ) ) {
 					continue;
@@ -765,7 +774,7 @@ class Sanitizer {
 			if ( $attribute === 'href' || $attribute === 'src' ) {
 				if ( !preg_match( $hrefExp, $value ) ) {
 					continue; //drop any href or src attributes not using an allowed protocol.
-						  //NOTE: this also drops all relative URLs
+					// NOTE: this also drops all relative URLs
 				}
 			}
 
@@ -798,9 +807,10 @@ class Sanitizer {
 	 */
 	static function mergeAttributes( $a, $b ) {
 		$out = array_merge( $a, $b );
-		if( isset( $a['class'] ) && isset( $b['class'] )
-		&& is_string( $a['class'] ) && is_string( $b['class'] )
-		&& $a['class'] !== $b['class'] ) {
+		if ( isset( $a['class'] ) && isset( $b['class'] )
+			&& is_string( $a['class'] ) && is_string( $b['class'] )
+			&& $a['class'] !== $b['class']
+		) {
 			$classes = preg_split( '/\s+/', "{$a['class']} {$b['class']}",
 				-1, PREG_SPLIT_NO_EMPTY );
 			$out['class'] = implode( ' ', array_unique( $classes ) );
@@ -811,9 +821,10 @@ class Sanitizer {
 	/**
 	 * Pick apart some CSS and check it for forbidden or unsafe structures.
 	 * Returns a sanitized string. This sanitized string will have
-	 * character references and escape sequences decoded, and comments
-	 * stripped. If the input is just too evil, only a comment complaining
-	 * about evilness will be returned.
+	 * character references and escape sequences decoded and comments
+	 * stripped (unless it is itself one valid comment, in which case the value
+	 * will be passed through). If the input is just too evil, only a comment
+	 * complaining about evilness will be returned.
 	 *
 	 * Currently URL references, 'expression', 'tps' are forbidden.
 	 *
@@ -854,60 +865,28 @@ class Sanitizer {
 		$value = preg_replace_callback( $decodeRegex,
 			array( __CLASS__, 'cssDecodeCallback' ), $value );
 
-		// Normalize Halfwidth and Fullwidth Unicode block that IE6 might treat as ascii
-		$value = preg_replace_callback(
-			'/[！-ｚ]/u', // U+FF01 to U+FF5A
-			function ( $matches ) {
-				$cp = utf8ToCodepoint( $matches[0] );
-				if ( $cp === false ) {
-					return '';
-				}
-				return chr( $cp - 65248 ); // ASCII range \x21-\x7A
-			},
-			$value
-		);
-
-		// Convert more characters IE6 might treat as ascii
-		// U+0280, U+0274, U+207F, U+029F, U+026A, U+207D, U+208D
-		$value = str_replace(
-			array( 'ʀ', 'ɴ', 'ⁿ', 'ʟ', 'ɪ', '⁽', '₍' ),
-			array( 'r', 'n', 'n', 'l', 'i', '(', '(' ),
-			$value
-		);
-
-		// Remove any comments; IE gets token splitting wrong
-		// This must be done AFTER decoding character references and
-		// escape sequences, because those steps can introduce comments
-		// This step cannot introduce character references or escape
-		// sequences, because it replaces comments with spaces rather
-		// than removing them completely.
-		$value = StringUtils::delimiterReplace( '/*', '*/', ' ', $value );
-
-		// Remove anything after a comment-start token, to guard against
-		// incorrect client implementations.
-		$commentPos = strpos( $value, '/*' );
-		if ( $commentPos !== false ) {
-			$value = substr( $value, 0, $commentPos );
+		// Let the value through if it's nothing but a single comment, to
+		// allow other functions which may reject it to pass some error
+		// message through.
+		if ( !preg_match( '! ^ \s* /\* [^*\\/]* \*/ \s* $ !x', $value ) ) {
+			// Remove any comments; IE gets token splitting wrong
+			// This must be done AFTER decoding character references and
+			// escape sequences, because those steps can introduce comments
+			// This step cannot introduce character references or escape
+			// sequences, because it replaces comments with spaces rather
+			// than removing them completely.
+			$value = StringUtils::delimiterReplace( '/*', '*/', ' ', $value );
+
+			// Remove anything after a comment-start token, to guard against
+			// incorrect client implementations.
+			$commentPos = strpos( $value, '/*' );
+			if ( $commentPos !== false ) {
+				$value = substr( $value, 0, $commentPos );
+			}
 		}
 
-		// S followed by repeat, iteration, or prolonged sound marks,
-		// which IE will treat as "ss"
-		$value = preg_replace(
-			'/s(?:
-				\xE3\x80\xB1 | # U+3031
-				\xE3\x82\x9D | # U+309D
-				\xE3\x83\xBC | # U+30FC
-				\xE3\x83\xBD | # U+30FD
-				\xEF\xB9\xBC | # U+FE7C
-				\xEF\xB9\xBD | # U+FE7D
-				\xEF\xBD\xB0   # U+FF70
-			)/ix',
-			'ss',
-			$value
-		);
-
 		// Reject problematic keywords and control characters
-		if ( preg_match( '/[\000-\010\013\016-\037\177]/', $value ) ) {
+		if ( preg_match( '/[\000-\010\016-\037\177]/', $value ) ) {
 			return '/* invalid control char */';
 		} elseif ( preg_match( '! expression | filter\s*: | accelerator\s*: | url\s*\( | image\s*\( | image-set\s*\( !ix', $value ) ) {
 			return '/* insecure input */';
@@ -960,21 +939,14 @@ class Sanitizer {
 	 * @return String
 	 */
 	static function fixTagAttributes( $text, $element ) {
-		if( trim( $text ) == '' ) {
+		if ( trim( $text ) == '' ) {
 			return '';
 		}
 
 		$decoded = Sanitizer::decodeTagAttributes( $text );
 		$stripped = Sanitizer::validateTagAttributes( $decoded, $element );
 
-		$attribs = array();
-		foreach( $stripped as $attribute => $value ) {
-			$encAttribute = htmlspecialchars( $attribute );
-			$encValue = Sanitizer::safeEncodeAttribute( $value );
-
-			$attribs[] = "$encAttribute=\"$encValue\"";
-		}
-		return count( $attribs ) ? ' ' . implode( ' ', $attribs ) : '';
+		return Sanitizer::safeEncodeTagAttributes( $stripped );
 	}
 
 	/**
@@ -1047,7 +1019,7 @@ class Sanitizer {
 	 *                                                          in the id and
 	 *                                                          name attributes
 	 * @see http://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with the id attribute
-	 * @see http://www.whatwg.org/specs/web-apps/current-work/multipage/elements.html#the-id-attribute
+	 * @see http://www.whatwg.org/html/elements.html#the-id-attribute
 	 *   HTML5 definition of id attribute
 	 *
 	 * @param string $id id to escape
@@ -1062,10 +1034,10 @@ class Sanitizer {
 	 * @return String
 	 */
 	static function escapeId( $id, $options = array() ) {
-		global $wgHtml5, $wgExperimentalHtmlIds;
+		global $wgExperimentalHtmlIds;
 		$options = (array)$options;
 
-		if ( $wgHtml5 && $wgExperimentalHtmlIds && !in_array( 'legacy', $options ) ) {
+		if ( $wgExperimentalHtmlIds && !in_array( 'legacy', $options ) ) {
 			$id = Sanitizer::decodeCharReferences( $id );
 			$id = preg_replace( '/[ \t\n\r\f_\'"&#%]+/', '_', $id );
 			$id = trim( $id, '_' );
@@ -1146,13 +1118,13 @@ class Sanitizer {
 	 * @return Array
 	 */
 	public static function decodeTagAttributes( $text ) {
-		if( trim( $text ) == '' ) {
+		if ( trim( $text ) == '' ) {
 			return array();
 		}
 
 		$attribs = array();
 		$pairs = array();
-		if( !preg_match_all(
+		if ( !preg_match_all(
 			self::getAttribsRegex(),
 			$text,
 			$pairs,
@@ -1160,7 +1132,7 @@ class Sanitizer {
 			return $attribs;
 		}
 
-		foreach( $pairs as $set ) {
+		foreach ( $pairs as $set ) {
 			$attribute = strtolower( $set[1] );
 			$value = Sanitizer::getTagAttributeCallback( $set );
 
@@ -1174,6 +1146,24 @@ class Sanitizer {
 		return $attribs;
 	}
 
+	/**
+	 * Build a partial tag string from an associative array of attribute
+	 * names and values as returned by decodeTagAttributes.
+	 *
+	 * @param array $assoc_array Map of attribute name => value
+	 * @return string Space-prefixed name="value" pairs, or '' if the array is empty
+	 */
+	public static function safeEncodeTagAttributes( $assoc_array ) {
+		$attribs = array();
+		foreach ( $assoc_array as $attribute => $value ) {
+			$encAttribute = htmlspecialchars( $attribute );
+			$encValue = Sanitizer::safeEncodeAttribute( $value );
+
+			$attribs[] = "$encAttribute=\"$encValue\"";
+		}
+		return count( $attribs ) ? ' ' . implode( ' ', $attribs ) : '';
+	}
+
 	/**
 	 * Pick the appropriate attribute value from a match set from the
 	 * attribs regex matches.
@@ -1183,19 +1173,19 @@ class Sanitizer {
 	 * @return String
 	 */
 	private static function getTagAttributeCallback( $set ) {
-		if( isset( $set[6] ) ) {
+		if ( isset( $set[6] ) ) {
 			# Illegal #XXXXXX color with no quotes.
 			return $set[6];
-		} elseif( isset( $set[5] ) ) {
+		} elseif ( isset( $set[5] ) ) {
 			# No quotes.
 			return $set[5];
-		} elseif( isset( $set[4] ) ) {
+		} elseif ( isset( $set[4] ) ) {
 			# Single-quoted
 			return $set[4];
-		} elseif( isset( $set[3] ) ) {
+		} elseif ( isset( $set[3] ) ) {
 			# Double-quoted
 			return $set[3];
-		} elseif( !isset( $set[2] ) ) {
+		} elseif ( !isset( $set[2] ) ) {
 			# In XHTML, attributes must have a value.
 			# For 'reduced' form, return explicitly the attribute name here.
 			return $set[1];
@@ -1271,14 +1261,14 @@ class Sanitizer {
 	 */
 	static function normalizeCharReferencesCallback( $matches ) {
 		$ret = null;
-		if( $matches[1] != '' ) {
+		if ( $matches[1] != '' ) {
 			$ret = Sanitizer::normalizeEntity( $matches[1] );
-		} elseif( $matches[2] != '' ) {
+		} elseif ( $matches[2] != '' ) {
 			$ret = Sanitizer::decCharReference( $matches[2] );
-		} elseif( $matches[3] != '' ) {
+		} elseif ( $matches[3] != '' ) {
 			$ret = Sanitizer::hexCharReference( $matches[3] );
 		}
-		if( is_null( $ret ) ) {
+		if ( is_null( $ret ) ) {
 			return htmlspecialchars( $matches[0] );
 		} else {
 			return $ret;
@@ -1314,7 +1304,7 @@ class Sanitizer {
 	 */
 	static function decCharReference( $codepoint ) {
 		$point = intval( $codepoint );
-		if( Sanitizer::validateCodepoint( $point ) ) {
+		if ( Sanitizer::validateCodepoint( $point ) ) {
 			return sprintf( '&#%d;', $point );
 		} else {
 			return null;
@@ -1327,7 +1317,7 @@ class Sanitizer {
 	 */
 	static function hexCharReference( $codepoint ) {
 		$point = hexdec( $codepoint );
-		if( Sanitizer::validateCodepoint( $point ) ) {
+		if ( Sanitizer::validateCodepoint( $point ) ) {
 			return sprintf( '&#x%x;', $point );
 		} else {
 			return null;
@@ -1340,12 +1330,12 @@ class Sanitizer {
 	 * @return Boolean
 	 */
 	private static function validateCodepoint( $codepoint ) {
-		return ($codepoint ==    0x09)
-			|| ($codepoint ==    0x0a)
-			|| ($codepoint ==    0x0d)
-			|| ($codepoint >=    0x20 && $codepoint <=   0xd7ff)
-			|| ($codepoint >=  0xe000 && $codepoint <=   0xfffd)
-			|| ($codepoint >= 0x10000 && $codepoint <= 0x10ffff);
+		return $codepoint == 0x09
+			|| $codepoint == 0x0a
+			|| $codepoint == 0x0d
+			|| ( $codepoint >= 0x20 && $codepoint <= 0xd7ff )
+			|| ( $codepoint >= 0xe000 && $codepoint <= 0xfffd )
+			|| ( $codepoint >= 0x10000 && $codepoint <= 0x10ffff );
 	}
 
 	/**
@@ -1391,12 +1381,12 @@ class Sanitizer {
 	 * @return String
 	 */
 	static function decodeCharReferencesCallback( $matches ) {
-		if( $matches[1] != '' ) {
+		if ( $matches[1] != '' ) {
 			return Sanitizer::decodeEntity( $matches[1] );
-		} elseif( $matches[2] != '' ) {
-			return  Sanitizer::decodeChar( intval( $matches[2] ) );
-		} elseif( $matches[3] != '' ) {
-			return  Sanitizer::decodeChar( hexdec( $matches[3] ) );
+		} elseif ( $matches[2] != '' ) {
+			return Sanitizer::decodeChar( intval( $matches[2] ) );
+		} elseif ( $matches[3] != '' ) {
+			return Sanitizer::decodeChar( hexdec( $matches[3] ) );
 		}
 		# Last case should be an ampersand by itself
 		return $matches[0];
@@ -1410,7 +1400,7 @@ class Sanitizer {
 	 * @private
 	 */
 	static function decodeChar( $codepoint ) {
-		if( Sanitizer::validateCodepoint( $codepoint ) ) {
+		if ( Sanitizer::validateCodepoint( $codepoint ) ) {
 			return codepointToUtf8( $codepoint );
 		} else {
 			return UTF8_REPLACEMENT;
@@ -1429,7 +1419,7 @@ class Sanitizer {
 		if ( isset( self::$htmlEntityAliases[$name] ) ) {
 			$name = self::$htmlEntityAliases[$name];
 		}
-		if( isset( self::$htmlEntities[$name] ) ) {
+		if ( isset( self::$htmlEntities[$name] ) ) {
 			return codepointToUtf8( self::$htmlEntities[$name] );
 		} else {
 			return "&$name;";
@@ -1455,10 +1445,10 @@ class Sanitizer {
 	 * @return Array
 	 */
 	static function setupAttributeWhitelist() {
-		global $wgAllowRdfaAttributes, $wgHtml5, $wgAllowMicrodataAttributes;
+		global $wgAllowRdfaAttributes, $wgAllowMicrodataAttributes;
 
 		static $whitelist, $staticInitialised;
-		$globalContext = implode( '-', compact( 'wgAllowRdfaAttributes', 'wgHtml5', 'wgAllowMicrodataAttributes' ) );
+		$globalContext = implode( '-', compact( 'wgAllowRdfaAttributes', 'wgAllowMicrodataAttributes' ) );
 
 		if ( isset( $whitelist ) && $staticInitialised == $globalContext ) {
 			return $whitelist;
@@ -1478,32 +1468,35 @@ class Sanitizer {
 		);
 
 		if ( $wgAllowRdfaAttributes ) {
-			#RDFa attributes as specified in section 9 of http://www.w3.org/TR/2008/REC-rdfa-syntax-20081014
+			# RDFa attributes as specified in section 9 of
+			# http://www.w3.org/TR/2008/REC-rdfa-syntax-20081014
 			$common = array_merge( $common, array(
-			    'about', 'property', 'resource', 'datatype', 'typeof',
+				'about', 'property', 'resource', 'datatype', 'typeof',
 			) );
 		}
 
-		if ( $wgHtml5 && $wgAllowMicrodataAttributes ) {
-			# add HTML5 microdata tags as specified by http://www.whatwg.org/specs/web-apps/current-work/multipage/microdata.html#the-microdata-model
+		if ( $wgAllowMicrodataAttributes ) {
+			# add HTML5 microdata attributes as specified by
+			# http://www.whatwg.org/html/microdata.html#the-microdata-model
 			$common = array_merge( $common, array(
-			    'itemid', 'itemprop', 'itemref', 'itemscope', 'itemtype'
+				'itemid', 'itemprop', 'itemref', 'itemscope', 'itemtype'
 			) );
 		}
 
 		$block = array_merge( $common, array( 'align' ) );
 		$tablealign = array( 'align', 'char', 'charoff', 'valign' );
-		$tablecell = array( 'abbr',
-		                    'axis',
-		                    'headers',
-		                    'scope',
-		                    'rowspan',
-		                    'colspan',
-		                    'nowrap', # deprecated
-		                    'width',  # deprecated
-		                    'height', # deprecated
-		                    'bgcolor' # deprecated
-		                    );
+		$tablecell = array(
+			'abbr',
+			'axis',
+			'headers',
+			'scope',
+			'rowspan',
+			'colspan',
+			'nowrap', # deprecated
+			'width', # deprecated
+			'height', # deprecated
+			'bgcolor', # deprecated
+		);
 
 		# Numbers refer to sections in HTML 4.01 standard describing the element.
 		# See: http://www.w3.org/TR/html4/
@@ -1553,6 +1546,9 @@ class Sanitizer {
 			# 9.3.2
 			'br'         => array( 'id', 'class', 'title', 'style', 'clear' ),
 
+			# http://www.whatwg.org/html/text-level-semantics.html#the-wbr-element
+			'wbr'        => array( 'id', 'class', 'title', 'style' ),
+
 			# 9.3.4
 			'pre'        => array_merge( $common, array( 'width' ) ),
 
@@ -1596,7 +1592,9 @@ class Sanitizer {
 			'td'         => array_merge( $common, $tablecell, $tablealign ),
 			'th'         => array_merge( $common, $tablecell, $tablealign ),
 
-			# 12.2 # NOTE: <a> is not allowed directly, but the attrib whitelist is used from the Parser object
+			# 12.2
+			# NOTE: <a> is not allowed directly, but the attrib
+			# whitelist is used from the Parser object
 			'a'          => array_merge( $common, array( 'href', 'rel', 'rev' ) ), # rel/rev esp. for RDFa
 
 			# 13.2
@@ -1622,8 +1620,8 @@ class Sanitizer {
 			# 15.3
 			'hr'         => array_merge( $common, array( 'noshade', 'size', 'width' ) ),
 
-			# XHTML Ruby annotation text module, simple ruby only.
-			# http://www.w3c.org/TR/ruby/
+			# HTML Ruby annotation text module, simple ruby only.
+			# http://www.whatwg.org/html/text-level-semantics.html#the-ruby-element
 			'ruby'       => $common,
 			# rbc
 			# rtc
@@ -1639,25 +1637,20 @@ class Sanitizer {
 			# HTML 5 section 4.6
 			'bdi' => $common,
 
-		);
-
-		if ( $wgHtml5 ) {
 			# HTML5 elements, defined by:
-			# http://www.whatwg.org/specs/web-apps/current-work/multipage/
-			$whitelist += array(
-				'data' => array_merge( $common, array( 'value' ) ),
-				'time' => array_merge( $common, array( 'datetime' ) ),
-				'mark' => $common,
-
-				// meta and link are only permitted by removeHTMLtags when Microdata
-				// is enabled so we don't bother adding a conditional to hide these
-				// Also meta and link are only valid in WikiText as Microdata elements
-				// (ie: validateTag rejects tags missing the attributes needed for Microdata)
-				// So we don't bother including $common attributes that have no purpose.
-				'meta' => array( 'itemprop', 'content' ),
-				'link' => array( 'itemprop', 'href' ),
-			);
-		}
+			# http://www.whatwg.org/html/
+			'data' => array_merge( $common, array( 'value' ) ),
+			'time' => array_merge( $common, array( 'datetime' ) ),
+			'mark' => $common,
+
+			// meta and link are only permitted by removeHTMLtags when Microdata
+			// is enabled so we don't bother adding a conditional to hide these
+			// Also meta and link are only valid in WikiText as Microdata elements
+			// (ie: validateTag rejects tags missing the attributes needed for Microdata)
+			// So we don't bother including $common attributes that have no purpose.
+			'meta' => array( 'itemprop', 'content' ),
+			'link' => array( 'itemprop', 'href' ),
+		);
 
 		$staticInitialised = $globalContext;
 
@@ -1696,7 +1689,7 @@ class Sanitizer {
 	 */
 	static function hackDocType() {
 		$out = "<!DOCTYPE html [\n";
-		foreach( self::$htmlEntities as $entity => $codepoint ) {
+		foreach ( self::$htmlEntities as $entity => $codepoint ) {
 			$out .= "<!ENTITY $entity \"&#$codepoint;\">";
 		}
 		$out .= "]>\n";
@@ -1718,7 +1711,7 @@ class Sanitizer {
 
 		# Validate hostname portion
 		$matches = array();
-		if( preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $matches ) ) {
+		if ( preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $matches ) ) {
 			list( /* $whole */, $protocol, $host, $rest ) = $matches;
 
 			// Characters that will be ignored in IDNs.
@@ -1762,7 +1755,7 @@ class Sanitizer {
 	 * Does a string look like an e-mail address?
 	 *
 	 * This validates an email address using an HTML5 specification found at:
-	 * http://www.whatwg.org/specs/web-apps/current-work/multipage/states-of-the-type-attribute.html#valid-e-mail-address
+	 * http://www.whatwg.org/html/states-of-the-type-attribute.html#valid-e-mail-address
 	 * Which as of 2011-01-24 says:
 	 *
 	 *   A valid e-mail address is a string that matches the ABNF production
@@ -1788,7 +1781,7 @@ class Sanitizer {
 	 */
 	public static function validateEmail( $addr ) {
 		$result = null;
-		if( !wfRunHooks( 'isValidEmailAddr', array( $addr, &$result ) ) ) {
+		if ( !wfRunHooks( 'isValidEmailAddr', array( $addr, &$result ) ) ) {
 			return $result;
 		}
 
@@ -1807,6 +1800,6 @@ class Sanitizer {
 		$                      # End of string
 		/ix"; // case Insensitive, eXtended
 
-		return (bool) preg_match( $HTML5_email_regexp, $addr );
+		return (bool)preg_match( $HTML5_email_regexp, $addr );
 	}
 }
-- 
cgit v1.2.2