1 files changed, 410 insertions, 350 deletions
diff --git a/includes/Sanitizer.php b/includes/Sanitizer.php
index a6c64264..d0e46f91 100644
--- a/includes/Sanitizer.php
+++ b/includes/Sanitizer.php
@@ -25,323 +25,325 @@
  */
 
 /**
- * Regular expression to match various types of character references in
- * Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences
- */
-define( 'MW_CHAR_REFS_REGEX',
-	'/&([A-Za-z0-9\x80-\xff]+);
-	 |&\#([0-9]+);
-	 |&\#x([0-9A-Za-z]+);
-	 |&\#X([0-9A-Za-z]+);
-	 |(&)/x' );
-
-/**
- * Regular expression to match HTML/XML attribute pairs within a tag.
- * Allows some... latitude.
- * Used in Sanitizer::fixTagAttributes and Sanitizer::decodeTagAttributes
+ * XHTML sanitizer for MediaWiki
+ * @ingroup Parser
  */
-$attribFirst = '[:A-Z_a-z0-9]';
-$attrib = '[:A-Z_a-z-.0-9]';
-$space = '[\x09\x0a\x0d\x20]';
-define( 'MW_ATTRIBS_REGEX',
-	"/(?:^|$space)({$attribFirst}{$attrib}*)
-	  ($space*=$space*
-		(?:
-		 # The attribute value: quoted or alone
-		  \"([^<\"]*)\"
-		 | '([^<']*)'
-		 |  ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
-		 |  (\#[0-9a-fA-F]+) # Technically wrong, but lots of
-							 # colors are specified like this.
-							 # We'll be normalizing it.
-		)
-	   )?(?=$space|\$)/sx" );
+class Sanitizer {
+	/**
+	 * Regular expression to match various types of character references in
+	 * Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences
+	 */
+	const CHAR_REFS_REGEX = 
+		'/&([A-Za-z0-9\x80-\xff]+);
+		 |&\#([0-9]+);
+		 |&\#[xX]([0-9A-Fa-f]+);
+		 |(&)/x';
 
-/**
- * Regular expression to match URIs that could trigger script execution
- */
-define( 'MW_EVIL_URI_PATTERN', '!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i' );
+	const EVIL_URI_PATTERN = '!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i';
+	const XMLNS_ATTRIBUTE_PATTERN = "/^xmlns:[:A-Z_a-z-.0-9]+$/";
 
-/**
- * Regular expression to match namespace attributes
- */
-define( 'MW_XMLNS_ATTRIBUTE_PATTRN', "/^xmlns:$attrib+$/" );
+	/**
+	 * List of all named character entities defined in HTML 4.01
+	 * http://www.w3.org/TR/html4/sgml/entities.html
+	 * As well as &apos; which is only defined starting in XHTML1.
+	 * @private
+	 */
+	static $htmlEntities = array(
+		'Aacute'   => 193,
+		'aacute'   => 225,
+		'Acirc'    => 194,
+		'acirc'    => 226,
+		'acute'    => 180,
+		'AElig'    => 198,
+		'aelig'    => 230,
+		'Agrave'   => 192,
+		'agrave'   => 224,
+		'alefsym'  => 8501,
+		'Alpha'    => 913,
+		'alpha'    => 945,
+		'amp'      => 38,
+		'and'      => 8743,
+		'ang'      => 8736,
+		'apos'     => 39, // New in XHTML & HTML 5; avoid in output for compatibility with IE.
+		'Aring'    => 197,
+		'aring'    => 229,
+		'asymp'    => 8776,
+		'Atilde'   => 195,
+		'atilde'   => 227,
+		'Auml'     => 196,
+		'auml'     => 228,
+		'bdquo'    => 8222,
+		'Beta'     => 914,
+		'beta'     => 946,
+		'brvbar'   => 166,
+		'bull'     => 8226,
+		'cap'      => 8745,
+		'Ccedil'   => 199,
+		'ccedil'   => 231,
+		'cedil'    => 184,
+		'cent'     => 162,
+		'Chi'      => 935,
+		'chi'      => 967,
+		'circ'     => 710,
+		'clubs'    => 9827,
+		'cong'     => 8773,
+		'copy'     => 169,
+		'crarr'    => 8629,
+		'cup'      => 8746,
+		'curren'   => 164,
+		'dagger'   => 8224,
+		'Dagger'   => 8225,
+		'darr'     => 8595,
+		'dArr'     => 8659,
+		'deg'      => 176,
+		'Delta'    => 916,
+		'delta'    => 948,
+		'diams'    => 9830,
+		'divide'   => 247,
+		'Eacute'   => 201,
+		'eacute'   => 233,
+		'Ecirc'    => 202,
+		'ecirc'    => 234,
+		'Egrave'   => 200,
+		'egrave'   => 232,
+		'empty'    => 8709,
+		'emsp'     => 8195,
+		'ensp'     => 8194,
+		'Epsilon'  => 917,
+		'epsilon'  => 949,
+		'equiv'    => 8801,
+		'Eta'      => 919,
+		'eta'      => 951,
+		'ETH'      => 208,
+		'eth'      => 240,
+		'Euml'     => 203,
+		'euml'     => 235,
+		'euro'     => 8364,
+		'exist'    => 8707,
+		'fnof'     => 402,
+		'forall'   => 8704,
+		'frac12'   => 189,
+		'frac14'   => 188,
+		'frac34'   => 190,
+		'frasl'    => 8260,
+		'Gamma'    => 915,
+		'gamma'    => 947,
+		'ge'       => 8805,
+		'gt'       => 62,
+		'harr'     => 8596,
+		'hArr'     => 8660,
+		'hearts'   => 9829,
+		'hellip'   => 8230,
+		'Iacute'   => 205,
+		'iacute'   => 237,
+		'Icirc'    => 206,
+		'icirc'    => 238,
+		'iexcl'    => 161,
+		'Igrave'   => 204,
+		'igrave'   => 236,
+		'image'    => 8465,
+		'infin'    => 8734,
+		'int'      => 8747,
+		'Iota'     => 921,
+		'iota'     => 953,
+		'iquest'   => 191,
+		'isin'     => 8712,
+		'Iuml'     => 207,
+		'iuml'     => 239,
+		'Kappa'    => 922,
+		'kappa'    => 954,
+		'Lambda'   => 923,
+		'lambda'   => 955,
+		'lang'     => 9001,
+		'laquo'    => 171,
+		'larr'     => 8592,
+		'lArr'     => 8656,
+		'lceil'    => 8968,
+		'ldquo'    => 8220,
+		'le'       => 8804,
+		'lfloor'   => 8970,
+		'lowast'   => 8727,
+		'loz'      => 9674,
+		'lrm'      => 8206,
+		'lsaquo'   => 8249,
+		'lsquo'    => 8216,
+		'lt'       => 60,
+		'macr'     => 175,
+		'mdash'    => 8212,
+		'micro'    => 181,
+		'middot'   => 183,
+		'minus'    => 8722,
+		'Mu'       => 924,
+		'mu'       => 956,
+		'nabla'    => 8711,
+		'nbsp'     => 160,
+		'ndash'    => 8211,
+		'ne'       => 8800,
+		'ni'       => 8715,
+		'not'      => 172,
+		'notin'    => 8713,
+		'nsub'     => 8836,
+		'Ntilde'   => 209,
+		'ntilde'   => 241,
+		'Nu'       => 925,
+		'nu'       => 957,
+		'Oacute'   => 211,
+		'oacute'   => 243,
+		'Ocirc'    => 212,
+		'ocirc'    => 244,
+		'OElig'    => 338,
+		'oelig'    => 339,
+		'Ograve'   => 210,
+		'ograve'   => 242,
+		'oline'    => 8254,
+		'Omega'    => 937,
+		'omega'    => 969,
+		'Omicron'  => 927,
+		'omicron'  => 959,
+		'oplus'    => 8853,
+		'or'       => 8744,
+		'ordf'     => 170,
+		'ordm'     => 186,
+		'Oslash'   => 216,
+		'oslash'   => 248,
+		'Otilde'   => 213,
+		'otilde'   => 245,
+		'otimes'   => 8855,
+		'Ouml'     => 214,
+		'ouml'     => 246,
+		'para'     => 182,
+		'part'     => 8706,
+		'permil'   => 8240,
+		'perp'     => 8869,
+		'Phi'      => 934,
+		'phi'      => 966,
+		'Pi'       => 928,
+		'pi'       => 960,
+		'piv'      => 982,
+		'plusmn'   => 177,
+		'pound'    => 163,
+		'prime'    => 8242,
+		'Prime'    => 8243,
+		'prod'     => 8719,
+		'prop'     => 8733,
+		'Psi'      => 936,
+		'psi'      => 968,
+		'quot'     => 34,
+		'radic'    => 8730,
+		'rang'     => 9002,
+		'raquo'    => 187,
+		'rarr'     => 8594,
+		'rArr'     => 8658,
+		'rceil'    => 8969,
+		'rdquo'    => 8221,
+		'real'     => 8476,
+		'reg'      => 174,
+		'rfloor'   => 8971,
+		'Rho'      => 929,
+		'rho'      => 961,
+		'rlm'      => 8207,
+		'rsaquo'   => 8250,
+		'rsquo'    => 8217,
+		'sbquo'    => 8218,
+		'Scaron'   => 352,
+		'scaron'   => 353,
+		'sdot'     => 8901,
+		'sect'     => 167,
+		'shy'      => 173,
+		'Sigma'    => 931,
+		'sigma'    => 963,
+		'sigmaf'   => 962,
+		'sim'      => 8764,
+		'spades'   => 9824,
+		'sub'      => 8834,
+		'sube'     => 8838,
+		'sum'      => 8721,
+		'sup'      => 8835,
+		'sup1'     => 185,
+		'sup2'     => 178,
+		'sup3'     => 179,
+		'supe'     => 8839,
+		'szlig'    => 223,
+		'Tau'      => 932,
+		'tau'      => 964,
+		'there4'   => 8756,
+		'Theta'    => 920,
+		'theta'    => 952,
+		'thetasym' => 977,
+		'thinsp'   => 8201,
+		'THORN'    => 222,
+		'thorn'    => 254,
+		'tilde'    => 732,
+		'times'    => 215,
+		'trade'    => 8482,
+		'Uacute'   => 218,
+		'uacute'   => 250,
+		'uarr'     => 8593,
+		'uArr'     => 8657,
+		'Ucirc'    => 219,
+		'ucirc'    => 251,
+		'Ugrave'   => 217,
+		'ugrave'   => 249,
+		'uml'      => 168,
+		'upsih'    => 978,
+		'Upsilon'  => 933,
+		'upsilon'  => 965,
+		'Uuml'     => 220,
+		'uuml'     => 252,
+		'weierp'   => 8472,
+		'Xi'       => 926,
+		'xi'       => 958,
+		'Yacute'   => 221,
+		'yacute'   => 253,
+		'yen'      => 165,
+		'Yuml'     => 376,
+		'yuml'     => 255,
+		'Zeta'     => 918,
+		'zeta'     => 950,
+		'zwj'      => 8205,
+		'zwnj'     => 8204
+	);
 
-/**
- * List of all named character entities defined in HTML 4.01
- * http://www.w3.org/TR/html4/sgml/entities.html
- * @private
- */
-global $wgHtmlEntities;
-$wgHtmlEntities = array(
-	'Aacute'   => 193,
-	'aacute'   => 225,
-	'Acirc'    => 194,
-	'acirc'    => 226,
-	'acute'    => 180,
-	'AElig'    => 198,
-	'aelig'    => 230,
-	'Agrave'   => 192,
-	'agrave'   => 224,
-	'alefsym'  => 8501,
-	'Alpha'    => 913,
-	'alpha'    => 945,
-	'amp'      => 38,
-	'and'      => 8743,
-	'ang'      => 8736,
-	'Aring'    => 197,
-	'aring'    => 229,
-	'asymp'    => 8776,
-	'Atilde'   => 195,
-	'atilde'   => 227,
-	'Auml'     => 196,
-	'auml'     => 228,
-	'bdquo'    => 8222,
-	'Beta'     => 914,
-	'beta'     => 946,
-	'brvbar'   => 166,
-	'bull'     => 8226,
-	'cap'      => 8745,
-	'Ccedil'   => 199,
-	'ccedil'   => 231,
-	'cedil'    => 184,
-	'cent'     => 162,
-	'Chi'      => 935,
-	'chi'      => 967,
-	'circ'     => 710,
-	'clubs'    => 9827,
-	'cong'     => 8773,
-	'copy'     => 169,
-	'crarr'    => 8629,
-	'cup'      => 8746,
-	'curren'   => 164,
-	'dagger'   => 8224,
-	'Dagger'   => 8225,
-	'darr'     => 8595,
-	'dArr'     => 8659,
-	'deg'      => 176,
-	'Delta'    => 916,
-	'delta'    => 948,
-	'diams'    => 9830,
-	'divide'   => 247,
-	'Eacute'   => 201,
-	'eacute'   => 233,
-	'Ecirc'    => 202,
-	'ecirc'    => 234,
-	'Egrave'   => 200,
-	'egrave'   => 232,
-	'empty'    => 8709,
-	'emsp'     => 8195,
-	'ensp'     => 8194,
-	'Epsilon'  => 917,
-	'epsilon'  => 949,
-	'equiv'    => 8801,
-	'Eta'      => 919,
-	'eta'      => 951,
-	'ETH'      => 208,
-	'eth'      => 240,
-	'Euml'     => 203,
-	'euml'     => 235,
-	'euro'     => 8364,
-	'exist'    => 8707,
-	'fnof'     => 402,
-	'forall'   => 8704,
-	'frac12'   => 189,
-	'frac14'   => 188,
-	'frac34'   => 190,
-	'frasl'    => 8260,
-	'Gamma'    => 915,
-	'gamma'    => 947,
-	'ge'       => 8805,
-	'gt'       => 62,
-	'harr'     => 8596,
-	'hArr'     => 8660,
-	'hearts'   => 9829,
-	'hellip'   => 8230,
-	'Iacute'   => 205,
-	'iacute'   => 237,
-	'Icirc'    => 206,
-	'icirc'    => 238,
-	'iexcl'    => 161,
-	'Igrave'   => 204,
-	'igrave'   => 236,
-	'image'    => 8465,
-	'infin'    => 8734,
-	'int'      => 8747,
-	'Iota'     => 921,
-	'iota'     => 953,
-	'iquest'   => 191,
-	'isin'     => 8712,
-	'Iuml'     => 207,
-	'iuml'     => 239,
-	'Kappa'    => 922,
-	'kappa'    => 954,
-	'Lambda'   => 923,
-	'lambda'   => 955,
-	'lang'     => 9001,
-	'laquo'    => 171,
-	'larr'     => 8592,
-	'lArr'     => 8656,
-	'lceil'    => 8968,
-	'ldquo'    => 8220,
-	'le'       => 8804,
-	'lfloor'   => 8970,
-	'lowast'   => 8727,
-	'loz'      => 9674,
-	'lrm'      => 8206,
-	'lsaquo'   => 8249,
-	'lsquo'    => 8216,
-	'lt'       => 60,
-	'macr'     => 175,
-	'mdash'    => 8212,
-	'micro'    => 181,
-	'middot'   => 183,
-	'minus'    => 8722,
-	'Mu'       => 924,
-	'mu'       => 956,
-	'nabla'    => 8711,
-	'nbsp'     => 160,
-	'ndash'    => 8211,
-	'ne'       => 8800,
-	'ni'       => 8715,
-	'not'      => 172,
-	'notin'    => 8713,
-	'nsub'     => 8836,
-	'Ntilde'   => 209,
-	'ntilde'   => 241,
-	'Nu'       => 925,
-	'nu'       => 957,
-	'Oacute'   => 211,
-	'oacute'   => 243,
-	'Ocirc'    => 212,
-	'ocirc'    => 244,
-	'OElig'    => 338,
-	'oelig'    => 339,
-	'Ograve'   => 210,
-	'ograve'   => 242,
-	'oline'    => 8254,
-	'Omega'    => 937,
-	'omega'    => 969,
-	'Omicron'  => 927,
-	'omicron'  => 959,
-	'oplus'    => 8853,
-	'or'       => 8744,
-	'ordf'     => 170,
-	'ordm'     => 186,
-	'Oslash'   => 216,
-	'oslash'   => 248,
-	'Otilde'   => 213,
-	'otilde'   => 245,
-	'otimes'   => 8855,
-	'Ouml'     => 214,
-	'ouml'     => 246,
-	'para'     => 182,
-	'part'     => 8706,
-	'permil'   => 8240,
-	'perp'     => 8869,
-	'Phi'      => 934,
-	'phi'      => 966,
-	'Pi'       => 928,
-	'pi'       => 960,
-	'piv'      => 982,
-	'plusmn'   => 177,
-	'pound'    => 163,
-	'prime'    => 8242,
-	'Prime'    => 8243,
-	'prod'     => 8719,
-	'prop'     => 8733,
-	'Psi'      => 936,
-	'psi'      => 968,
-	'quot'     => 34,
-	'radic'    => 8730,
-	'rang'     => 9002,
-	'raquo'    => 187,
-	'rarr'     => 8594,
-	'rArr'     => 8658,
-	'rceil'    => 8969,
-	'rdquo'    => 8221,
-	'real'     => 8476,
-	'reg'      => 174,
-	'rfloor'   => 8971,
-	'Rho'      => 929,
-	'rho'      => 961,
-	'rlm'      => 8207,
-	'rsaquo'   => 8250,
-	'rsquo'    => 8217,
-	'sbquo'    => 8218,
-	'Scaron'   => 352,
-	'scaron'   => 353,
-	'sdot'     => 8901,
-	'sect'     => 167,
-	'shy'      => 173,
-	'Sigma'    => 931,
-	'sigma'    => 963,
-	'sigmaf'   => 962,
-	'sim'      => 8764,
-	'spades'   => 9824,
-	'sub'      => 8834,
-	'sube'     => 8838,
-	'sum'      => 8721,
-	'sup'      => 8835,
-	'sup1'     => 185,
-	'sup2'     => 178,
-	'sup3'     => 179,
-	'supe'     => 8839,
-	'szlig'    => 223,
-	'Tau'      => 932,
-	'tau'      => 964,
-	'there4'   => 8756,
-	'Theta'    => 920,
-	'theta'    => 952,
-	'thetasym' => 977,
-	'thinsp'   => 8201,
-	'THORN'    => 222,
-	'thorn'    => 254,
-	'tilde'    => 732,
-	'times'    => 215,
-	'trade'    => 8482,
-	'Uacute'   => 218,
-	'uacute'   => 250,
-	'uarr'     => 8593,
-	'uArr'     => 8657,
-	'Ucirc'    => 219,
-	'ucirc'    => 251,
-	'Ugrave'   => 217,
-	'ugrave'   => 249,
-	'uml'      => 168,
-	'upsih'    => 978,
-	'Upsilon'  => 933,
-	'upsilon'  => 965,
-	'Uuml'     => 220,
-	'uuml'     => 252,
-	'weierp'   => 8472,
-	'Xi'       => 926,
-	'xi'       => 958,
-	'Yacute'   => 221,
-	'yacute'   => 253,
-	'yen'      => 165,
-	'Yuml'     => 376,
-	'yuml'     => 255,
-	'Zeta'     => 918,
-	'zeta'     => 950,
-	'zwj'      => 8205,
-	'zwnj'     => 8204 );
+	/**
+	 * Character entity aliases accepted by MediaWiki
+	 */
+	static $htmlEntityAliases = array(
+		'רלמ' => 'rlm',
+		'رلم' => 'rlm',
+	);
 
-/**
- * Character entity aliases accepted by MediaWiki
- */
-global $wgHtmlEntityAliases;
-$wgHtmlEntityAliases = array(
-	'רלמ' => 'rlm',
-	'رلم' => 'rlm',
-);
+	/**
+	 * Lazy-initialised attributes regex, see getAttribsRegex()
+	 */
+	static $attribsRegex;
 
+	/**
+	 * Regular expression to match HTML/XML attribute pairs within a tag.
+	 * Allows some... latitude.
+	 * Used in Sanitizer::fixTagAttributes and Sanitizer::decodeTagAttributes
+	 */
+	static function getAttribsRegex() {
+		if ( self::$attribsRegex === null ) { // first call: build the pattern once and cache it
+			$attribFirst = '[:A-Z_a-z0-9]'; // characters allowed at the start of an attribute name
+			$attrib = '[:A-Z_a-z-.0-9]'; // characters allowed in the rest of the name
+			$space = '[\x09\x0a\x0d\x20]'; // tab, LF, CR, space
+			self::$attribsRegex = 
+				"/(?:^|$space)({$attribFirst}{$attrib}*)
+				  ($space*=$space*
+					(?:
+					 # The attribute value: quoted or alone
+					  \"([^<\"]*)\"
+					 | '([^<']*)'
+					 |  ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
+					 |  (\#[0-9a-fA-F]+) # Technically wrong, but lots of
+										 # colors are specified like this.
+										 # We'll be normalizing it.
+					)
+				)?(?=$space|\$)/sx";
+		}
+		return self::$attribsRegex; // cached pattern string (built above if it was still null)
+	}
 
-/**
- * XHTML sanitizer for MediaWiki
- * @ingroup Parser
- */
-class Sanitizer {
 	/**
 	 * Cleans up HTML, removes dangerous tags and attributes, and
 	 * removes HTML comments
@@ -636,8 +638,8 @@ class Sanitizer {
 		$out = array();
 		foreach( $attribs as $attribute => $value ) {
 			#allow XML namespace declaration if RDFa is enabled
-			if ( $wgAllowRdfaAttributes && preg_match( MW_XMLNS_ATTRIBUTE_PATTRN, $attribute ) ) {
-				if ( !preg_match( MW_EVIL_URI_PATTERN, $value ) ) {
+			if ( $wgAllowRdfaAttributes && preg_match( self::XMLNS_ATTRIBUTE_PATTERN, $attribute ) ) {
+				if ( !preg_match( self::EVIL_URI_PATTERN, $value ) ) {
 					$out[$attribute] = $value;
 				}
 
@@ -667,7 +669,7 @@ class Sanitizer {
 				$attribute === 'itemscope' || $attribute === 'itemtype' ) {                         #HTML5 microdata
 
 				//Paranoia. Allow "simple" values but suppress javascript
-				if ( preg_match( MW_EVIL_URI_PATTERN, $value ) ) {
+				if ( preg_match( self::EVIL_URI_PATTERN, $value ) ) {
 					continue; 
 				}
 			}
@@ -687,19 +689,6 @@ class Sanitizer {
 		}
 
 		if ( $wgAllowMicrodataAttributes ) {
-			# There are some complicated validity constraints we need to
-			# enforce here.  First of all, we don't want to allow non-standard
-			# itemtypes.
-			$allowedTypes = array(
-				'http://microformats.org/profile/hcard',
-				'http://microformats.org/profile/hcalendar#vevent',
-				'http://n.whatwg.org/work',
-			);
-			if ( isset( $out['itemtype'] ) && !in_array( $out['itemtype'],
-			$allowedTypes ) ) {
-				# Kill everything
-				unset( $out['itemscope'] );
-			}
 			# itemtype, itemid, itemref don't make sense without itemscope
 			if ( !array_key_exists( 'itemscope', $out ) ) {
 				unset( $out['itemtype'] );
@@ -803,6 +792,10 @@ class Sanitizer {
 		return $value;
 	}
 
+	/**
+	 * @param $matches array
+	 * @return String
+	 */
 	static function cssDecodeCallback( $matches ) {
 		if ( $matches[1] !== '' ) {
 			// Line continuation
@@ -1037,7 +1030,7 @@ class Sanitizer {
 		$attribs = array();
 		$pairs = array();
 		if( !preg_match_all(
-			MW_ATTRIBS_REGEX,
+			self::getAttribsRegex(),
 			$text,
 			$pairs,
 			PREG_SET_ORDER ) ) {
@@ -1060,7 +1053,7 @@ class Sanitizer {
 
 	/**
 	 * Pick the appropriate attribute value from a match set from the
-	 * MW_ATTRIBS_REGEX matches.
+	 * attribs regex matches.
 	 *
 	 * @param $set Array
 	 * @return String
@@ -1104,6 +1097,10 @@ class Sanitizer {
 				Sanitizer::normalizeCharReferences( $text ) ) );
 	}
 
+	/**
+	 * @param $text string
+	 * @return mixed
+	 */
 	private static function normalizeWhitespace( $text ) {
 		return preg_replace(
 			'/\r\n|[\x20\x0d\x0a\x09]/',
@@ -1140,7 +1137,7 @@ class Sanitizer {
 	 */
 	static function normalizeCharReferences( $text ) {
 		return preg_replace_callback(
-			MW_CHAR_REFS_REGEX,
+			self::CHAR_REFS_REGEX,
 			array( 'Sanitizer', 'normalizeCharReferencesCallback' ),
 			$text );
 	}
@@ -1156,8 +1153,6 @@ class Sanitizer {
 			$ret = Sanitizer::decCharReference( $matches[2] );
 		} elseif( $matches[3] != ''  ) {
 			$ret = Sanitizer::hexCharReference( $matches[3] );
-		} elseif( $matches[4] != '' ) {
-			$ret = Sanitizer::hexCharReference( $matches[4] );
 		}
 		if( is_null( $ret ) ) {
 			return htmlspecialchars( $matches[0] );
@@ -1177,19 +1172,22 @@ class Sanitizer {
 	 * @return String
 	 */
 	static function normalizeEntity( $name ) {
-		global $wgHtmlEntities, $wgHtmlEntityAliases;
-		if ( isset( $wgHtmlEntityAliases[$name] ) ) {
-			return "&{$wgHtmlEntityAliases[$name]};";
+		if ( isset( self::$htmlEntityAliases[$name] ) ) {
+			return '&' . self::$htmlEntityAliases[$name] . ';';
 		} elseif ( in_array( $name,
 		array( 'lt', 'gt', 'amp', 'quot' ) ) ) {
 			return "&$name;";
-		} elseif ( isset( $wgHtmlEntities[$name] ) ) {
-			return "&#{$wgHtmlEntities[$name]};";
+		} elseif ( isset( self::$htmlEntities[$name] ) ) {
+			return '&#' . self::$htmlEntities[$name] . ';';
 		} else {
 			return "&amp;$name;";
 		}
 	}
 
+	/**
+	 * @param $codepoint
+	 * @return null|string
+	 */
 	static function decCharReference( $codepoint ) {
 		$point = intval( $codepoint );
 		if( Sanitizer::validateCodepoint( $point ) ) {
@@ -1199,6 +1197,10 @@ class Sanitizer {
 		}
 	}
 
+	/**
+	 * @param $codepoint
+	 * @return null|string
+	 */
 	static function hexCharReference( $codepoint ) {
 		$point = hexdec( $codepoint );
 		if( Sanitizer::validateCodepoint( $point ) ) {
@@ -1231,7 +1233,7 @@ class Sanitizer {
 	 */
 	public static function decodeCharReferences( $text ) {
 		return preg_replace_callback(
-			MW_CHAR_REFS_REGEX,
+			self::CHAR_REFS_REGEX,
 			array( 'Sanitizer', 'decodeCharReferencesCallback' ),
 			$text );
 	}
@@ -1249,7 +1251,7 @@ class Sanitizer {
 	public static function decodeCharReferencesAndNormalize( $text ) {
 		global $wgContLang;
 		$text = preg_replace_callback(
-			MW_CHAR_REFS_REGEX,
+			self::CHAR_REFS_REGEX,
 			array( 'Sanitizer', 'decodeCharReferencesCallback' ),
 			$text, /* limit */ -1, $count );
 
@@ -1271,8 +1273,6 @@ class Sanitizer {
 			return  Sanitizer::decodeChar( intval( $matches[2] ) );
 		} elseif( $matches[3] != ''  ) {
 			return  Sanitizer::decodeChar( hexdec( $matches[3] ) );
-		} elseif( $matches[4] != '' ) {
-			return  Sanitizer::decodeChar( hexdec( $matches[4] ) );
 		}
 		# Last case should be an ampersand by itself
 		return $matches[0];
@@ -1298,16 +1298,15 @@ class Sanitizer {
 	 * return the UTF-8 encoding of that character. Otherwise, returns
 	 * pseudo-entity source (eg &foo;)
 	 *
-	 * @param $name Strings
+	 * @param $name String
 	 * @return String
 	 */
 	static function decodeEntity( $name ) {
-		global $wgHtmlEntities, $wgHtmlEntityAliases;
-		if ( isset( $wgHtmlEntityAliases[$name] ) ) {
-			$name = $wgHtmlEntityAliases[$name];
+		if ( isset( self::$htmlEntityAliases[$name] ) ) {
+			$name = self::$htmlEntityAliases[$name];
 		}
-		if( isset( $wgHtmlEntities[$name] ) ) {
-			return codepointToUtf8( $wgHtmlEntities[$name] );
+		if( isset( self::$htmlEntities[$name] ) ) {
+			return codepointToUtf8( self::$htmlEntities[$name] );
 		} else {
 			return "&$name;";
 		}
@@ -1532,22 +1531,26 @@ class Sanitizer {
 	 * @return String
 	 */
 	static function hackDocType() {
-		global $wgHtmlEntities;
 		$out = "<!DOCTYPE html [\n";
-		foreach( $wgHtmlEntities as $entity => $codepoint ) {
+		foreach( self::$htmlEntities as $entity => $codepoint ) {
 			$out .= "<!ENTITY $entity \"&#$codepoint;\">";
 		}
 		$out .= "]>\n";
 		return $out;
 	}
 
+	/**
+	 * @param $url string
+	 * @return mixed|string
+	 */
 	static function cleanUrl( $url ) {
 		# Normalize any HTML entities in input. They will be
 		# re-escaped by makeExternalLink().
 		$url = Sanitizer::decodeCharReferences( $url );
 
 		# Escape any control characters introduced by the above step
-		$url = preg_replace( '/[\][<>"\\x00-\\x20\\x7F\|]/e', "urlencode('\\0')", $url );
+		$url = preg_replace_callback( '/[\][<>"\\x00-\\x20\\x7F\|]/', 
+			array( __CLASS__, 'cleanUrlCallback' ), $url );
 
 		# Validate hostname portion
 		$matches = array();
@@ -1575,7 +1578,7 @@ class Sanitizer {
 
 			$host = preg_replace( $strip, '', $host );
 
-			// @todo Fixme: validate hostnames here
+			// @todo FIXME: Validate hostnames here
 
 			return $protocol . $host . $rest;
 		} else {
@@ -1583,4 +1586,61 @@ class Sanitizer {
 		}
 	}
 
+	/**
+	 * @param $matches array Match set from preg_replace_callback() in cleanUrl(); only [0] is read
+	 * @return string The matched text passed through urlencode()
+	 */
+	static function cleanUrlCallback( $matches ) {
+		return urlencode( $matches[0] ); // [0] is the full match
+	}
+
+	/**
+	 * Does a string look like an e-mail address?
+	 *
+	 * This validates an email address using an HTML5 specification found at:
+	 * http://www.whatwg.org/specs/web-apps/current-work/multipage/states-of-the-type-attribute.html#valid-e-mail-address
+	 * Which as of 2011-01-24 says:
+	 *
+	 *   A valid e-mail address is a string that matches the ABNF production
+	 *   1*( atext / "." ) "@" ldh-str *( "." ldh-str ) where atext is defined
+	 *   in RFC 5322 section 3.2.3, and ldh-str is defined in RFC 1034 section
+	 *   3.5.
+	 *
+	 * This function is an implementation of the specification as requested in
+	 * bug 22449.
+	 *
+	 * Client-side forms will use the same standard validation rules via JS or
+	 * HTML 5 validation; additional restrictions can be enforced server-side
+	 * by extensions via the 'isValidEmailAddr' hook.
+	 *
+	 * Note that this validation doesn't 100% match RFC 2822, but is believed
+	 * to be liberal enough for wide use. Some invalid addresses will still
+	 * pass validation here.
+	 *
+	 * @param $addr String E-mail address
+	 * @return Bool
+	 */
+	public static function validateEmail( $addr ) {
+		$result = null;
+		if( !wfRunHooks( 'isValidEmailAddr', array( $addr, &$result ) ) ) {
+			return $result; // wfRunHooks() returned false: use whatever the hook left in $result
+		}
+
+		// Please note strings below are enclosed in brackets [], this makes the
+		// hyphen "-" a range indicator. Hence it is double backslashed below.
+		// See bug 26948
+		$rfc5322_atext   = "a-z0-9!#$%&'*+\\-\/=?^_`{|}~" ;
+		$rfc1034_ldh_str = "a-z0-9\\-" ;
+
+		$HTML5_email_regexp = "/
+		^                      # start of string
+		[$rfc5322_atext\\.]+    # user part which is liberal :p
+		@                      # 'at' sign
+		[$rfc1034_ldh_str]+       # First domain part
+		(\\.[$rfc1034_ldh_str]+)*  # Following part prefixed with a dot
+		$                      # End of string
+		/ixD" ; // case Insensitive, eXtended, Dollar-end-only (reject a trailing newline)
+
+		return (bool) preg_match( $HTML5_email_regexp, $addr );
+	}
 }