From 9db190c7e736ec8d063187d4241b59feaf7dc2d1 Mon Sep 17 00:00:00 2001
From: Pierre Schmitz <pierre@archlinux.de>
Date: Wed, 22 Jun 2011 11:28:20 +0200
Subject: update to MediaWiki 1.17.0

---
 includes/Sanitizer.php | 96 ++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 74 insertions(+), 22 deletions(-)

(limited to 'includes/Sanitizer.php')

diff --git a/includes/Sanitizer.php b/includes/Sanitizer.php
index 26837b3c..a6c64264 100644
--- a/includes/Sanitizer.php
+++ b/includes/Sanitizer.php
@@ -2,7 +2,7 @@
 /**
  * XHTML sanitizer for MediaWiki
  *
- * Copyright (C) 2002-2005 Brion Vibber <brion@pobox.com> et al
+ * Copyright © 2002-2005 Brion Vibber <brion@pobox.com> et al
  * http://www.mediawiki.org/
  *
  * This program is free software; you can redistribute it and/or modify
@@ -40,10 +40,11 @@ define( 'MW_CHAR_REFS_REGEX',
  * Allows some... latitude.
  * Used in Sanitizer::fixTagAttributes and Sanitizer::decodeTagAttributes
  */
-$attrib = '[A-Za-z0-9]';
+$attribFirst = '[:A-Z_a-z0-9]';
+$attrib = '[:A-Z_a-z-.0-9]';
 $space = '[\x09\x0a\x0d\x20]';
 define( 'MW_ATTRIBS_REGEX',
-	"/(?:^|$space)((?:xml:|xmlns:)?$attrib+)
+	"/(?:^|$space)({$attribFirst}{$attrib}*)
 	  ($space*=$space*
 		(?:
 		 # The attribute value: quoted or alone
@@ -367,7 +368,8 @@ class Sanitizer {
 				'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
 				'strike', 'strong', 'tt', 'var', 'div', 'center',
 				'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
-				'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'u', 'abbr'
+				'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'abbr', 'dfn',
+				'kbd', 'samp'
 			);
 			$htmlsingle = array(
 				'br', 'hr', 'li', 'dt', 'dd'
@@ -389,6 +391,12 @@ class Sanitizer {
 				'li',
 			);
 
+			global $wgAllowImageTag;
+			if ( $wgAllowImageTag ) {
+				$htmlsingle[] = 'img';
+				$htmlsingleonly[] = 'img';
+			}
+
 			$htmlsingleallowed = array_unique( array_merge( $htmlsingle, $tabletags ) );
 			$htmlelementsStatic = array_unique( array_merge( $htmlsingle, $htmlpairsStatic, $htmlnest ) );
 
@@ -620,7 +628,7 @@ class Sanitizer {
 	 * @todo Check for unique id attribute :P
 	 */
 	static function validateAttributes( $attribs, $whitelist ) {
-		global $wgAllowRdfaAttributes, $wgAllowMicrodataAttributes;
+		global $wgAllowRdfaAttributes, $wgAllowMicrodataAttributes, $wgHtml5;
 
 		$whitelist = array_flip( $whitelist );
 		$hrefExp = '/^(' . wfUrlProtocols() . ')[^\s]+$/';
@@ -636,7 +644,8 @@ class Sanitizer {
 				continue;
 			}
 
-			if( !isset( $whitelist[$attribute] ) ) {
+			# Allow any attribute beginning with "data-", if in HTML5 mode
+			if ( !($wgHtml5 && preg_match( '/^data-/i', $attribute )) && !isset( $whitelist[$attribute] ) ) {
 				continue;
 			}
 
@@ -914,7 +923,9 @@ class Sanitizer {
 	 *
 	 * To ensure we don't have to bother escaping anything, we also strip ', ",
 	 * & even if $wgExperimentalIds is true.  TODO: Is this the best tactic?
-	 * We also strip # because it upsets IE6.
+	 * We also strip # because it upsets IE, and % because it could be
+	 * ambiguous if it's part of something that looks like a percent escape
+	 * (percent escapes don't work reliably in fragments cross-browser).
 	 *
 	 * @see http://www.w3.org/TR/html401/types.html#type-name Valid characters
 	 *                                                          in the id and
@@ -940,7 +951,7 @@ class Sanitizer {
 
 		if ( $wgHtml5 && $wgExperimentalHtmlIds && !in_array( 'legacy', $options ) ) {
 			$id = Sanitizer::decodeCharReferences( $id );
-			$id = preg_replace( '/[ \t\n\r\f_\'"&#]+/', '_', $id );
+			$id = preg_replace( '/[ \t\n\r\f_\'"&#%]+/', '_', $id );
 			$id = trim( $id, '_' );
 			if ( $id === '' ) {
 				# Must have been all whitespace to start with.
@@ -988,17 +999,16 @@ class Sanitizer {
 
 	/**
 	 * Given HTML input, escape with htmlspecialchars but un-escape entites.
-	 * This allows (generally harmless) entities like &nbsp; to survive.
+	 * This allows (generally harmless) entities like &#160; to survive.
 	 *
 	 * @param $html String to escape
 	 * @return String: escaped input
 	 */
 	static function escapeHtmlAllowEntities( $html ) {
+		$html = Sanitizer::decodeCharReferences( $html );
 		# It seems wise to escape ' as well as ", as a matter of course.  Can't
 		# hurt.
 		$html = htmlspecialchars( $html, ENT_QUOTES );
-		$html = str_replace( '&amp;', '&', $html );
-		$html = Sanitizer::normalizeCharReferences( $html );
 		return $html;
 	}
 
@@ -1101,12 +1111,25 @@ class Sanitizer {
 			$text );
 	}
 
+	/**
+	 * Normalizes whitespace in a section name, such as might be returned
+	 * by Parser::stripSectionName(), for use in the ids that are used for
+	 * section links.
+	 *
+	 * @param $section String
+	 * @return String
+	 */
+	static function normalizeSectionNameWhitespace( $section ) {
+		return trim( preg_replace( '/[ _]+/', ' ', $section ) );
+	}
+
 	/**
 	 * Ensure that any entities and character references are legal
 	 * for XML and XHTML specifically. Any stray bits will be
 	 * &amp;-escaped to result in a valid text fragment.
 	 *
-	 * a. any named char refs must be known in XHTML
+	 * a. named char refs can only be &lt; &gt; &amp; &quot;, others are
+	 *   numericized (this way we're well-formed even without a DTD)
 	 * b. any numeric char refs must be legal chars, not invalid or forbidden
 	 * c. use &#x, not &#X
 	 * d. fix or reject non-valid attributes
@@ -1145,9 +1168,10 @@ class Sanitizer {
 
 	/**
 	 * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
-	 * return the named entity reference as is. If the entity is a
-	 * MediaWiki-specific alias, returns the HTML equivalent. Otherwise,
-	 * returns HTML-escaped text of pseudo-entity source (eg &amp;foo;)
+	 * return the equivalent numeric entity reference (except for the core &lt;
+	 * &gt; &amp; &quot;). If the entity is a MediaWiki-specific alias, returns
+	 * the HTML equivalent. Otherwise, returns HTML-escaped text of
+	 * pseudo-entity source (eg &amp;foo;)
 	 *
 	 * @param $name String
 	 * @return String
@@ -1156,8 +1180,11 @@ class Sanitizer {
 		global $wgHtmlEntities, $wgHtmlEntityAliases;
 		if ( isset( $wgHtmlEntityAliases[$name] ) ) {
 			return "&{$wgHtmlEntityAliases[$name]};";
-		} elseif( isset( $wgHtmlEntities[$name] ) ) {
+		} elseif ( in_array( $name,
+		array( 'lt', 'gt', 'amp', 'quot' ) ) ) {
 			return "&$name;";
+		} elseif ( isset( $wgHtmlEntities[$name] ) ) {
+			return "&#{$wgHtmlEntities[$name]};";
 		} else {
 			return "&amp;$name;";
 		}
@@ -1209,6 +1236,30 @@ class Sanitizer {
 			$text );
 	}
 
+	/**
+	 * Decode any character references, numeric or named entities,
+	 * in the text and normalize the resulting string. (bug 14952)
+	 *
+	 * This is useful for page titles, but not for text to be displayed:
+	 * MediaWiki allows HTML entities to escape normalization as a feature.
+	 *
+	 * @param $text String (already normalized, containing entities)
+	 * @return String (still normalized, without entities)
+	 */
+	public static function decodeCharReferencesAndNormalize( $text ) {
+		global $wgContLang;
+		$text = preg_replace_callback(
+			MW_CHAR_REFS_REGEX,
+			array( 'Sanitizer', 'decodeCharReferencesCallback' ),
+			$text, /* limit */ -1, $count );
+
+		if ( $count ) {
+			return $wgContLang->normalize( $text );
+		} else {
+			return $text;
+		}
+	}
+
 	/**
 	 * @param $matches String
 	 * @return String
@@ -1342,10 +1393,10 @@ class Sanitizer {
 			'em'         => $common,
 			'strong'     => $common,
 			'cite'       => $common,
-			# dfn
+			'dfn'        => $common,
 			'code'       => $common,
-			# samp
-			# kbd
+			'samp'       => $common,
+			'kbd'        => $common,
 			'var'        => $common,
 			'abbr'       => $common,
 			# acronym
@@ -1412,8 +1463,9 @@ class Sanitizer {
 
 			# 13.2
 			# Not usually allowed, but may be used for extension-style hooks
-			# such as <math> when it is rasterized
-			'img'        => array_merge( $common, array( 'alt' ) ),
+			# such as <math> when it is rasterized, or if $wgAllowImageTag is
+			# true
+			'img'        => array_merge( $common, array( 'alt', 'src', 'width', 'height' ) ),
 
 			# 15.2.1
 			'tt'         => $common,
@@ -1495,7 +1547,7 @@ class Sanitizer {
 		$url = Sanitizer::decodeCharReferences( $url );
 
 		# Escape any control characters introduced by the above step
-		$url = preg_replace( '/[\][<>"\\x00-\\x20\\x7F]/e', "urlencode('\\0')", $url );
+		$url = preg_replace( '/[\][<>"\\x00-\\x20\\x7F\|]/e', "urlencode('\\0')", $url );
 
 		# Validate hostname portion
 		$matches = array();
-- 
cgit v1.2.2