summaryrefslogtreecommitdiff
path: root/includes/Sanitizer.php
diff options
context:
space:
mode:
Diffstat (limited to 'includes/Sanitizer.php')
-rw-r--r--includes/Sanitizer.php224
1 files changed, 140 insertions, 84 deletions
diff --git a/includes/Sanitizer.php b/includes/Sanitizer.php
index f5a24dfa..185679f6 100644
--- a/includes/Sanitizer.php
+++ b/includes/Sanitizer.php
@@ -327,77 +327,90 @@ class Sanitizer {
* @param array $args for the processing callback
* @return string
*/
- function removeHTMLtags( $text, $processCallback = null, $args = array() ) {
+ static function removeHTMLtags( $text, $processCallback = null, $args = array() ) {
global $wgUseTidy, $wgUserHtml;
- $fname = 'Parser::removeHTMLtags';
- wfProfileIn( $fname );
-
- if( $wgUserHtml ) {
- $htmlpairs = array( # Tags that must be closed
- 'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
- 'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
- 'strike', 'strong', 'tt', 'var', 'div', 'center',
- 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
- 'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'u'
- );
- $htmlsingle = array(
- 'br', 'hr', 'li', 'dt', 'dd'
- );
- $htmlsingleonly = array( # Elements that cannot have close tags
- 'br', 'hr'
- );
- $htmlnest = array( # Tags that can be nested--??
- 'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
- 'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
- );
- $tabletags = array( # Can only appear inside table
- 'td', 'th', 'tr',
- );
- $htmllist = array( # Tags used by list
- 'ul','ol',
- );
- $listtags = array( # Tags that can appear in a list
- 'li',
- );
- } else {
- $htmlpairs = array();
- $htmlsingle = array();
- $htmlnest = array();
- $tabletags = array();
- }
+ static $htmlpairs, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
+ $htmllist, $listtags, $htmlsingleallowed, $htmlelements, $staticInitialised;
+
+ wfProfileIn( __METHOD__ );
+
+ if ( !$staticInitialised ) {
+ if( $wgUserHtml ) {
+ $htmlpairs = array( # Tags that must be closed
+ 'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
+ 'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
+ 'strike', 'strong', 'tt', 'var', 'div', 'center',
+ 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
+ 'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'u'
+ );
+ $htmlsingle = array(
+ 'br', 'hr', 'li', 'dt', 'dd'
+ );
+ $htmlsingleonly = array( # Elements that cannot have close tags
+ 'br', 'hr'
+ );
+ $htmlnest = array( # Tags that can be nested--??
+ 'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
+ 'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
+ );
+ $tabletags = array( # Can only appear inside table
+ 'td', 'th', 'tr',
+ );
+ $htmllist = array( # Tags used by list
+ 'ul','ol',
+ );
+ $listtags = array( # Tags that can appear in a list
+ 'li',
+ );
+
+ } else {
+ $htmlpairs = array();
+ $htmlsingle = array();
+ $htmlnest = array();
+ $tabletags = array();
+ }
- $htmlsingleallowed = array_merge( $htmlsingle, $tabletags );
- $htmlelements = array_merge( $htmlsingle, $htmlpairs, $htmlnest );
+ $htmlsingleallowed = array_merge( $htmlsingle, $tabletags );
+ $htmlelements = array_merge( $htmlsingle, $htmlpairs, $htmlnest );
+
+ # Convert them all to hashtables for faster lookup
+ $vars = array( 'htmlpairs', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags',
+ 'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelements' );
+ foreach ( $vars as $var ) {
+ $$var = array_flip( $$var );
+ }
+ $staticInitialised = true;
+ }
# Remove HTML comments
$text = Sanitizer::removeHTMLcomments( $text );
$bits = explode( '<', $text );
$text = array_shift( $bits );
if(!$wgUseTidy) {
- $tagstack = array(); $tablestack = array();
+ $tagstack = $tablestack = array();
foreach ( $bits as $x ) {
$prev = error_reporting( E_ALL & ~( E_NOTICE | E_WARNING ) );
- preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
- $x, $regs );
+ preg_match( '!^(/?)(\\w+)([^>]*?)(/{0,1}>)([^<]*)$!', $x, $regs );
list( $qbar, $slash, $t, $params, $brace, $rest ) = $regs;
error_reporting( $prev );
$badtag = 0 ;
- if ( in_array( $t = strtolower( $t ), $htmlelements ) ) {
+ if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
# Check our stack
if ( $slash ) {
# Closing a tag...
- if( in_array( $t, $htmlsingleonly ) ) {
+ if( isset( $htmlsingleonly[$t] ) ) {
$badtag = 1;
} elseif ( ( $ot = @array_pop( $tagstack ) ) != $t ) {
- if ( in_array($ot, $htmlsingleallowed) ) {
+ if ( isset( $htmlsingleallowed[$ot] ) ) {
# Pop all elements with an optional close tag
# and see if we find a match below them
$optstack = array();
array_push ($optstack, $ot);
while ( ( ( $ot = @array_pop( $tagstack ) ) != $t ) &&
- in_array($ot, $htmlsingleallowed) ) {
+ isset( $htmlsingleallowed[$ot] ) )
+ {
array_push ($optstack, $ot);
}
if ( $t != $ot ) {
@@ -410,7 +423,7 @@ class Sanitizer {
} else {
@array_push( $tagstack, $ot );
# <li> can be nested in <ul> or <ol>, skip those cases:
- if(!(in_array($ot, $htmllist) && in_array($t, $listtags) )) {
+ if(!(isset( $htmllist[$ot] ) && isset( $listtags[$t] ) )) {
$badtag = 1;
}
}
@@ -422,20 +435,20 @@ class Sanitizer {
$newparams = '';
} else {
# Keep track for later
- if ( in_array( $t, $tabletags ) &&
+ if ( isset( $tabletags[$t] ) &&
! in_array( 'table', $tagstack ) ) {
$badtag = 1;
} else if ( in_array( $t, $tagstack ) &&
- ! in_array ( $t , $htmlnest ) ) {
+ ! isset( $htmlnest [$t ] ) ) {
$badtag = 1 ;
# Is it a self closed htmlpair ? (bug 5487)
} else if( $brace == '/>' &&
- in_array($t, $htmlpairs) ) {
+ isset( $htmlpairs[$t] ) ) {
$badtag = 1;
- } elseif( in_array( $t, $htmlsingleonly ) ) {
+ } elseif( isset( $htmlsingleonly[$t] ) ) {
# Hack to force empty tag for uncloseable elements
$brace = '/>';
- } else if( in_array( $t, $htmlsingle ) ) {
+ } else if( isset( $htmlsingle[$t] ) ) {
# Hack to not close $htmlsingle tags
$brace = NULL;
} else {
@@ -475,7 +488,7 @@ class Sanitizer {
preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
$x, $regs );
@list( $qbar, $slash, $t, $params, $brace, $rest ) = $regs;
- if ( in_array( $t = strtolower( $t ), $htmlelements ) ) {
+ if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
if( is_callable( $processCallback ) ) {
call_user_func_array( $processCallback, array( &$params, $args ) );
}
@@ -487,7 +500,7 @@ class Sanitizer {
}
}
}
- wfProfileOut( $fname );
+ wfProfileOut( __METHOD__ );
return $text;
}
@@ -501,9 +514,8 @@ class Sanitizer {
* @param string $text
* @return string
*/
- function removeHTMLcomments( $text ) {
- $fname='Parser::removeHTMLcomments';
- wfProfileIn( $fname );
+ static function removeHTMLcomments( $text ) {
+ wfProfileIn( __METHOD__ );
while (($start = strpos($text, '<!--')) !== false) {
$end = strpos($text, '-->', $start + 4);
if ($end === false) {
@@ -533,7 +545,7 @@ class Sanitizer {
$text = substr_replace($text, '', $start, $end - $start);
}
}
- wfProfileOut( $fname );
+ wfProfileOut( __METHOD__ );
return $text;
}
@@ -551,7 +563,7 @@ class Sanitizer {
* @todo Check for legal values where the DTD limits things.
* @todo Check for unique id attribute :P
*/
- function validateTagAttributes( $attribs, $element ) {
+ static function validateTagAttributes( $attribs, $element ) {
$whitelist = array_flip( Sanitizer::attributeWhitelist( $element ) );
$out = array();
foreach( $attribs as $attribute => $value ) {
@@ -626,7 +638,7 @@ class Sanitizer {
* @param string $element
* @return string
*/
- function fixTagAttributes( $text, $element ) {
+ static function fixTagAttributes( $text, $element ) {
if( trim( $text ) == '' ) {
return '';
}
@@ -649,7 +661,7 @@ class Sanitizer {
* @param $text
* @return HTML-encoded text fragment
*/
- function encodeAttribute( $text ) {
+ static function encodeAttribute( $text ) {
$encValue = htmlspecialchars( $text );
// Whitespace is normalized during attribute decoding,
@@ -670,7 +682,7 @@ class Sanitizer {
* @param $text
* @return HTML-encoded text fragment
*/
- function safeEncodeAttribute( $text ) {
+ static function safeEncodeAttribute( $text ) {
$encValue = Sanitizer::encodeAttribute( $text );
# Templates and links may be expanded in later parsing,
@@ -713,7 +725,7 @@ class Sanitizer {
* @param string $id
* @return string
*/
- function escapeId( $id ) {
+ static function escapeId( $id ) {
static $replace = array(
'%3A' => ':',
'%' => '.'
@@ -730,7 +742,7 @@ class Sanitizer {
* @return string
* @private
*/
- function armorLinksCallback( $matches ) {
+ private static function armorLinksCallback( $matches ) {
return str_replace( ':', '&#58;', $matches[1] );
}
@@ -742,7 +754,7 @@ class Sanitizer {
* @param string
* @return array
*/
- function decodeTagAttributes( $text ) {
+ static function decodeTagAttributes( $text ) {
$attribs = array();
if( trim( $text ) == '' ) {
@@ -780,7 +792,7 @@ class Sanitizer {
* @return string
* @private
*/
- function getTagAttributeCallback( $set ) {
+ private static function getTagAttributeCallback( $set ) {
if( isset( $set[6] ) ) {
# Illegal #XXXXXX color with no quotes.
return $set[6];
@@ -814,7 +826,7 @@ class Sanitizer {
* @return string
* @private
*/
- function normalizeAttributeValue( $text ) {
+ private static function normalizeAttributeValue( $text ) {
return str_replace( '"', '&quot;',
preg_replace(
'/\r\n|[\x20\x0d\x0a\x09]/',
@@ -836,7 +848,7 @@ class Sanitizer {
* @return string
* @private
*/
- function normalizeCharReferences( $text ) {
+ static function normalizeCharReferences( $text ) {
return preg_replace_callback(
MW_CHAR_REFS_REGEX,
array( 'Sanitizer', 'normalizeCharReferencesCallback' ),
@@ -846,7 +858,7 @@ class Sanitizer {
* @param string $matches
* @return string
*/
- function normalizeCharReferencesCallback( $matches ) {
+ static function normalizeCharReferencesCallback( $matches ) {
$ret = null;
if( $matches[1] != '' ) {
$ret = Sanitizer::normalizeEntity( $matches[1] );
@@ -871,8 +883,9 @@ class Sanitizer {
*
* @param string $name
* @return string
+ * @static
*/
- function normalizeEntity( $name ) {
+ static function normalizeEntity( $name ) {
global $wgHtmlEntities;
if( isset( $wgHtmlEntities[$name] ) ) {
return "&$name;";
@@ -881,7 +894,7 @@ class Sanitizer {
}
}
- function decCharReference( $codepoint ) {
+ static function decCharReference( $codepoint ) {
$point = intval( $codepoint );
if( Sanitizer::validateCodepoint( $point ) ) {
return sprintf( '&#%d;', $point );
@@ -890,7 +903,7 @@ class Sanitizer {
}
}
- function hexCharReference( $codepoint ) {
+ static function hexCharReference( $codepoint ) {
$point = hexdec( $codepoint );
if( Sanitizer::validateCodepoint( $point ) ) {
return sprintf( '&#x%x;', $point );
@@ -904,7 +917,7 @@ class Sanitizer {
* @param int $codepoint
* @return bool
*/
- function validateCodepoint( $codepoint ) {
+ private static function validateCodepoint( $codepoint ) {
return ($codepoint == 0x09)
|| ($codepoint == 0x0a)
|| ($codepoint == 0x0d)
@@ -920,8 +933,9 @@ class Sanitizer {
* @param string $text
* @return string
* @public
+ * @static
*/
- function decodeCharReferences( $text ) {
+ public static function decodeCharReferences( $text ) {
return preg_replace_callback(
MW_CHAR_REFS_REGEX,
array( 'Sanitizer', 'decodeCharReferencesCallback' ),
@@ -932,7 +946,7 @@ class Sanitizer {
* @param string $matches
* @return string
*/
- function decodeCharReferencesCallback( $matches ) {
+ static function decodeCharReferencesCallback( $matches ) {
if( $matches[1] != '' ) {
return Sanitizer::decodeEntity( $matches[1] );
} elseif( $matches[2] != '' ) {
@@ -953,7 +967,7 @@ class Sanitizer {
* @return string
* @private
*/
- function decodeChar( $codepoint ) {
+ static function decodeChar( $codepoint ) {
if( Sanitizer::validateCodepoint( $codepoint ) ) {
return codepointToUtf8( $codepoint );
} else {
@@ -969,7 +983,7 @@ class Sanitizer {
* @param string $name
* @return string
*/
- function decodeEntity( $name ) {
+ static function decodeEntity( $name ) {
global $wgHtmlEntities;
if( isset( $wgHtmlEntities[$name] ) ) {
return codepointToUtf8( $wgHtmlEntities[$name] );
@@ -985,7 +999,7 @@ class Sanitizer {
* @param string $element
* @return array
*/
- function attributeWhitelist( $element ) {
+ static function attributeWhitelist( $element ) {
static $list;
if( !isset( $list ) ) {
$list = Sanitizer::setupAttributeWhitelist();
@@ -996,9 +1010,10 @@ class Sanitizer {
}
/**
+ * @todo Document it a bit
* @return array
*/
- function setupAttributeWhitelist() {
+ static function setupAttributeWhitelist() {
$common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );
$block = array_merge( $common, array( 'align' ) );
$tablealign = array( 'align', 'char', 'charoff', 'valign' );
@@ -1082,9 +1097,9 @@ class Sanitizer {
# 11.2.1
'table' => array_merge( $common,
array( 'summary', 'width', 'border', 'frame',
- 'rules', 'cellspacing', 'cellpadding',
- 'align', 'bgcolor', 'frame', 'rules',
- 'border' ) ),
+ 'rules', 'cellspacing', 'cellpadding',
+ 'align', 'bgcolor',
+ ) ),
# 11.2.2
'caption' => array_merge( $common, array( 'align' ) ),
@@ -1142,7 +1157,7 @@ class Sanitizer {
* @param string $text HTML fragment
* @return string
*/
- function stripAllTags( $text ) {
+ static function stripAllTags( $text ) {
# Actual <tags>
$text = preg_replace( '/ < .*? > /x', '', $text );
@@ -1169,7 +1184,7 @@ class Sanitizer {
* @return string
* @static
*/
- function hackDocType() {
+ static function hackDocType() {
global $wgHtmlEntities;
$out = "<!DOCTYPE html [\n";
foreach( $wgHtmlEntities as $entity => $codepoint ) {
@@ -1178,6 +1193,47 @@ class Sanitizer {
$out .= "]>\n";
return $out;
}
+
+ static function cleanUrl( $url, $hostname=true ) {
+ # Normalize any HTML entities in input. They will be
+ # re-escaped by makeExternalLink().
+ $url = Sanitizer::decodeCharReferences( $url );
+
+ # Escape any control characters introduced by the above step
+ $url = preg_replace( '/[\][<>"\\x00-\\x20\\x7F]/e', "urlencode('\\0')", $url );
+
+ # Validate hostname portion
+ if( preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $matches ) ) {
+ list( $whole, $protocol, $host, $rest ) = $matches;
+
+ // Characters that will be ignored in IDNs.
+ // http://tools.ietf.org/html/3454#section-3.1
+ // Strip them before further processing so blacklists and such work.
+ $strip = "/
+ \\s| # general whitespace
+ \xc2\xad| # 00ad SOFT HYPHEN
+ \xe1\xa0\x86| # 1806 MONGOLIAN TODO SOFT HYPHEN
+ \xe2\x80\x8b| # 200b ZERO WIDTH SPACE
+ \xe2\x81\xa0| # 2060 WORD JOINER
+ \xef\xbb\xbf| # feff ZERO WIDTH NO-BREAK SPACE
+ \xcd\x8f| # 034f COMBINING GRAPHEME JOINER
+ \xe1\xa0\x8b| # 180b MONGOLIAN FREE VARIATION SELECTOR ONE
+ \xe1\xa0\x8c| # 180c MONGOLIAN FREE VARIATION SELECTOR TWO
+ \xe1\xa0\x8d| # 180d MONGOLIAN FREE VARIATION SELECTOR THREE
+ \xe2\x80\x8c| # 200c ZERO WIDTH NON-JOINER
+ \xe2\x80\x8d| # 200d ZERO WIDTH JOINER
+ [\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe00f VARIATION SELECTOR-1-16
+ /xuD";
+
+ $host = preg_replace( $strip, '', $host );
+
+ // @fixme: validate hostnames here
+
+ return $protocol . $host . $rest;
+ } else {
+ return $url;
+ }
+ }
}