summary | refs | log | tree | commit | diff
path: root/includes/Sanitizer.php
diff options
context:
space:
mode:
author: Pierre Schmitz <pierre@archlinux.de> 2011-06-22 11:28:20 +0200
committer: Pierre Schmitz <pierre@archlinux.de> 2011-06-22 11:28:20 +0200
commit: 9db190c7e736ec8d063187d4241b59feaf7dc2d1 (patch)
tree: 46d1a0dee7febef5c2d57a9f7b972be16a163b3d /includes/Sanitizer.php
parent: 78677c7bbdcc9739f6c10c75935898a20e1acd9e (diff)
update to MediaWiki 1.17.0
Diffstat (limited to 'includes/Sanitizer.php')
-rw-r--r-- includes/Sanitizer.php 96
1 files changed, 74 insertions, 22 deletions
diff --git a/includes/Sanitizer.php b/includes/Sanitizer.php
index 26837b3c..a6c64264 100644
--- a/includes/Sanitizer.php
+++ b/includes/Sanitizer.php
@@ -2,7 +2,7 @@
/**
* XHTML sanitizer for MediaWiki
*
- * Copyright (C) 2002-2005 Brion Vibber <brion@pobox.com> et al
+ * Copyright © 2002-2005 Brion Vibber <brion@pobox.com> et al
* http://www.mediawiki.org/
*
* This program is free software; you can redistribute it and/or modify
@@ -40,10 +40,11 @@ define( 'MW_CHAR_REFS_REGEX',
* Allows some... latitude.
* Used in Sanitizer::fixTagAttributes and Sanitizer::decodeTagAttributes
*/
-$attrib = '[A-Za-z0-9]';
+$attribFirst = '[:A-Z_a-z0-9]';
+$attrib = '[:A-Z_a-z-.0-9]';
$space = '[\x09\x0a\x0d\x20]';
define( 'MW_ATTRIBS_REGEX',
- "/(?:^|$space)((?:xml:|xmlns:)?$attrib+)
+ "/(?:^|$space)({$attribFirst}{$attrib}*)
($space*=$space*
(?:
# The attribute value: quoted or alone
@@ -367,7 +368,8 @@ class Sanitizer {
'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
'strike', 'strong', 'tt', 'var', 'div', 'center',
'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
- 'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'u', 'abbr'
+ 'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'abbr', 'dfn',
+ 'kbd', 'samp'
);
$htmlsingle = array(
'br', 'hr', 'li', 'dt', 'dd'
@@ -389,6 +391,12 @@ class Sanitizer {
'li',
);
+ global $wgAllowImageTag;
+ if ( $wgAllowImageTag ) {
+ $htmlsingle[] = 'img';
+ $htmlsingleonly[] = 'img';
+ }
+
$htmlsingleallowed = array_unique( array_merge( $htmlsingle, $tabletags ) );
$htmlelementsStatic = array_unique( array_merge( $htmlsingle, $htmlpairsStatic, $htmlnest ) );
@@ -620,7 +628,7 @@ class Sanitizer {
* @todo Check for unique id attribute :P
*/
static function validateAttributes( $attribs, $whitelist ) {
- global $wgAllowRdfaAttributes, $wgAllowMicrodataAttributes;
+ global $wgAllowRdfaAttributes, $wgAllowMicrodataAttributes, $wgHtml5;
$whitelist = array_flip( $whitelist );
$hrefExp = '/^(' . wfUrlProtocols() . ')[^\s]+$/';
@@ -636,7 +644,8 @@ class Sanitizer {
continue;
}
- if( !isset( $whitelist[$attribute] ) ) {
+ # Allow any attribute beginning with "data-", if in HTML5 mode
+ if ( !($wgHtml5 && preg_match( '/^data-/i', $attribute )) && !isset( $whitelist[$attribute] ) ) {
continue;
}
@@ -914,7 +923,9 @@ class Sanitizer {
*
* To ensure we don't have to bother escaping anything, we also strip ', ",
* & even if $wgExperimentalIds is true. TODO: Is this the best tactic?
- * We also strip # because it upsets IE6.
+ * We also strip # because it upsets IE, and % because it could be
+ * ambiguous if it's part of something that looks like a percent escape
+ * (percent escapes don't work reliably in fragments cross-browser).
*
* @see http://www.w3.org/TR/html401/types.html#type-name Valid characters
* in the id and
@@ -940,7 +951,7 @@ class Sanitizer {
if ( $wgHtml5 && $wgExperimentalHtmlIds && !in_array( 'legacy', $options ) ) {
$id = Sanitizer::decodeCharReferences( $id );
- $id = preg_replace( '/[ \t\n\r\f_\'"&#]+/', '_', $id );
+ $id = preg_replace( '/[ \t\n\r\f_\'"&#%]+/', '_', $id );
$id = trim( $id, '_' );
if ( $id === '' ) {
# Must have been all whitespace to start with.
@@ -988,17 +999,16 @@ class Sanitizer {
/**
* Given HTML input, escape with htmlspecialchars but un-escape entities.
- * This allows (generally harmless) entities like &nbsp; to survive.
+ * This allows (generally harmless) entities like &#160; to survive.
*
* @param $html String to escape
* @return String: escaped input
*/
static function escapeHtmlAllowEntities( $html ) {
+ $html = Sanitizer::decodeCharReferences( $html );
# It seems wise to escape ' as well as ", as a matter of course. Can't
# hurt.
$html = htmlspecialchars( $html, ENT_QUOTES );
- $html = str_replace( '&amp;', '&', $html );
- $html = Sanitizer::normalizeCharReferences( $html );
return $html;
}
@@ -1102,11 +1112,24 @@ class Sanitizer {
}
/**
+ * Normalizes whitespace in a section name, such as might be returned
+ * by Parser::stripSectionName(), for use in the ids that are used for
+ * section links.
+ *
+ * @param $section String
+ * @return String
+ */
+ static function normalizeSectionNameWhitespace( $section ) {
+ return trim( preg_replace( '/[ _]+/', ' ', $section ) ); # collapse each run of spaces/underscores into one space, then trim both ends
+ }
+
+ /**
* Ensure that any entities and character references are legal
* for XML and XHTML specifically. Any stray bits will be
* &amp;-escaped to result in a valid text fragment.
*
- * a. any named char refs must be known in XHTML
+ * a. named char refs can only be &lt; &gt; &amp; &quot;, others are
+ * numericized (this way we're well-formed even without a DTD)
* b. any numeric char refs must be legal chars, not invalid or forbidden
* c. use &#x, not &#X
* d. fix or reject non-valid attributes
@@ -1145,9 +1168,10 @@ class Sanitizer {
/**
* If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
- * return the named entity reference as is. If the entity is a
- * MediaWiki-specific alias, returns the HTML equivalent. Otherwise,
- * returns HTML-escaped text of pseudo-entity source (eg &amp;foo;)
+ * return the equivalent numeric entity reference (except for the core &lt;
+ * &gt; &amp; &quot;). If the entity is a MediaWiki-specific alias, returns
+ * the HTML equivalent. Otherwise, returns HTML-escaped text of
+ * pseudo-entity source (eg &amp;foo;)
*
* @param $name String
* @return String
@@ -1156,8 +1180,11 @@ class Sanitizer {
global $wgHtmlEntities, $wgHtmlEntityAliases;
if ( isset( $wgHtmlEntityAliases[$name] ) ) {
return "&{$wgHtmlEntityAliases[$name]};";
- } elseif( isset( $wgHtmlEntities[$name] ) ) {
+ } elseif ( in_array( $name,
+ array( 'lt', 'gt', 'amp', 'quot' ) ) ) {
return "&$name;";
+ } elseif ( isset( $wgHtmlEntities[$name] ) ) {
+ return "&#{$wgHtmlEntities[$name]};";
} else {
return "&amp;$name;";
}
@@ -1210,6 +1237,30 @@ class Sanitizer {
}
/**
+ * Decode any character references, numeric or named entities,
+ * in the text and normalize the resulting string. (bug 14952)
+ *
+ * This is useful for page titles, not for text to be displayed:
+ * MediaWiki allows HTML entities to escape normalization as a feature.
+ *
+ * @param $text String (already normalized, containing entities)
+ * @return String (still normalized, without entities)
+ */
+ public static function decodeCharReferencesAndNormalize( $text ) {
+ global $wgContLang;
+ $text = preg_replace_callback( # run the callback on every match of MW_CHAR_REFS_REGEX
+ MW_CHAR_REFS_REGEX,
+ array( 'Sanitizer', 'decodeCharReferencesCallback' ),
+ $text, /* limit */ -1, $count ); # $count receives the number of replacements performed
+
+ if ( $count ) { # at least one reference was replaced, so pass the result through normalize()
+ return $wgContLang->normalize( $text );
+ } else {
+ return $text; # no replacements were made; skip the normalize() call
+ }
+ }
+
+ /**
* @param $matches String
* @return String
*/
@@ -1342,10 +1393,10 @@ class Sanitizer {
'em' => $common,
'strong' => $common,
'cite' => $common,
- # dfn
+ 'dfn' => $common,
'code' => $common,
- # samp
- # kbd
+ 'samp' => $common,
+ 'kbd' => $common,
'var' => $common,
'abbr' => $common,
# acronym
@@ -1412,8 +1463,9 @@ class Sanitizer {
# 13.2
# Not usually allowed, but may be used for extension-style hooks
- # such as <math> when it is rasterized
- 'img' => array_merge( $common, array( 'alt' ) ),
+ # such as <math> when it is rasterized, or if $wgAllowImageTag is
+ # true
+ 'img' => array_merge( $common, array( 'alt', 'src', 'width', 'height' ) ),
# 15.2.1
'tt' => $common,
@@ -1495,7 +1547,7 @@ class Sanitizer {
$url = Sanitizer::decodeCharReferences( $url );
# Escape any control characters introduced by the above step
- $url = preg_replace( '/[\][<>"\\x00-\\x20\\x7F]/e', "urlencode('\\0')", $url );
+ $url = preg_replace( '/[\][<>"\\x00-\\x20\\x7F\|]/e', "urlencode('\\0')", $url );
# Validate hostname portion
$matches = array();