et al * https://www.mediawiki.org/ * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License along * with this program; if not, write to the Free Software Foundation, Inc., * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. * http://www.gnu.org/copyleft/gpl.html * * @file * @ingroup Parser */ /** * HTML sanitizer for MediaWiki * @ingroup Parser */ class Sanitizer { /** * Regular expression to match various types of character references in * Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences */ const CHAR_REFS_REGEX = '/&([A-Za-z0-9\x80-\xff]+); |&\#([0-9]+); |&\#[xX]([0-9A-Fa-f]+); |(&)/x'; /** * Acceptable tag name charset from HTML5 parsing spec * http://www.w3.org/TR/html5/syntax.html#tag-open-state */ const ELEMENT_BITS_REGEX = '!^(/?)([A-Za-z][^\t\n\v />\0]*+)([^>]*?)(/?>)([^<]*)$!'; /** * Blacklist for evil uris like javascript: * WARNING: DO NOT use this in any place that actually requires blacklisting * for security reasons. There are NUMEROUS[1] ways to bypass blacklisting, the * only way to be secure from javascript: uri based xss vectors is to whitelist * things that you know are safe and deny everything else. 
* [1]: http://ha.ckers.org/xss.html */ const EVIL_URI_PATTERN = '!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i'; const XMLNS_ATTRIBUTE_PATTERN = "/^xmlns:[:A-Z_a-z-.0-9]+$/"; /** * List of all named character entities defined in HTML 4.01 * http://www.w3.org/TR/html4/sgml/entities.html * As well as ' which is only defined starting in XHTML1. */ private static $htmlEntities = array( 'Aacute' => 193, 'aacute' => 225, 'Acirc' => 194, 'acirc' => 226, 'acute' => 180, 'AElig' => 198, 'aelig' => 230, 'Agrave' => 192, 'agrave' => 224, 'alefsym' => 8501, 'Alpha' => 913, 'alpha' => 945, 'amp' => 38, 'and' => 8743, 'ang' => 8736, 'apos' => 39, // New in XHTML & HTML 5; avoid in output for compatibility with IE. 'Aring' => 197, 'aring' => 229, 'asymp' => 8776, 'Atilde' => 195, 'atilde' => 227, 'Auml' => 196, 'auml' => 228, 'bdquo' => 8222, 'Beta' => 914, 'beta' => 946, 'brvbar' => 166, 'bull' => 8226, 'cap' => 8745, 'Ccedil' => 199, 'ccedil' => 231, 'cedil' => 184, 'cent' => 162, 'Chi' => 935, 'chi' => 967, 'circ' => 710, 'clubs' => 9827, 'cong' => 8773, 'copy' => 169, 'crarr' => 8629, 'cup' => 8746, 'curren' => 164, 'dagger' => 8224, 'Dagger' => 8225, 'darr' => 8595, 'dArr' => 8659, 'deg' => 176, 'Delta' => 916, 'delta' => 948, 'diams' => 9830, 'divide' => 247, 'Eacute' => 201, 'eacute' => 233, 'Ecirc' => 202, 'ecirc' => 234, 'Egrave' => 200, 'egrave' => 232, 'empty' => 8709, 'emsp' => 8195, 'ensp' => 8194, 'Epsilon' => 917, 'epsilon' => 949, 'equiv' => 8801, 'Eta' => 919, 'eta' => 951, 'ETH' => 208, 'eth' => 240, 'Euml' => 203, 'euml' => 235, 'euro' => 8364, 'exist' => 8707, 'fnof' => 402, 'forall' => 8704, 'frac12' => 189, 'frac14' => 188, 'frac34' => 190, 'frasl' => 8260, 'Gamma' => 915, 'gamma' => 947, 'ge' => 8805, 'gt' => 62, 'harr' => 8596, 'hArr' => 8660, 'hearts' => 9829, 'hellip' => 8230, 'Iacute' => 205, 'iacute' => 237, 'Icirc' => 206, 'icirc' => 238, 'iexcl' => 161, 'Igrave' => 204, 'igrave' => 236, 'image' => 8465, 'infin' => 8734, 'int' => 8747, 'Iota' => 921, 
'iota' => 953, 'iquest' => 191, 'isin' => 8712, 'Iuml' => 207, 'iuml' => 239, 'Kappa' => 922, 'kappa' => 954, 'Lambda' => 923, 'lambda' => 955, 'lang' => 9001, 'laquo' => 171, 'larr' => 8592, 'lArr' => 8656, 'lceil' => 8968, 'ldquo' => 8220, 'le' => 8804, 'lfloor' => 8970, 'lowast' => 8727, 'loz' => 9674, 'lrm' => 8206, 'lsaquo' => 8249, 'lsquo' => 8216, 'lt' => 60, 'macr' => 175, 'mdash' => 8212, 'micro' => 181, 'middot' => 183, 'minus' => 8722, 'Mu' => 924, 'mu' => 956, 'nabla' => 8711, 'nbsp' => 160, 'ndash' => 8211, 'ne' => 8800, 'ni' => 8715, 'not' => 172, 'notin' => 8713, 'nsub' => 8836, 'Ntilde' => 209, 'ntilde' => 241, 'Nu' => 925, 'nu' => 957, 'Oacute' => 211, 'oacute' => 243, 'Ocirc' => 212, 'ocirc' => 244, 'OElig' => 338, 'oelig' => 339, 'Ograve' => 210, 'ograve' => 242, 'oline' => 8254, 'Omega' => 937, 'omega' => 969, 'Omicron' => 927, 'omicron' => 959, 'oplus' => 8853, 'or' => 8744, 'ordf' => 170, 'ordm' => 186, 'Oslash' => 216, 'oslash' => 248, 'Otilde' => 213, 'otilde' => 245, 'otimes' => 8855, 'Ouml' => 214, 'ouml' => 246, 'para' => 182, 'part' => 8706, 'permil' => 8240, 'perp' => 8869, 'Phi' => 934, 'phi' => 966, 'Pi' => 928, 'pi' => 960, 'piv' => 982, 'plusmn' => 177, 'pound' => 163, 'prime' => 8242, 'Prime' => 8243, 'prod' => 8719, 'prop' => 8733, 'Psi' => 936, 'psi' => 968, 'quot' => 34, 'radic' => 8730, 'rang' => 9002, 'raquo' => 187, 'rarr' => 8594, 'rArr' => 8658, 'rceil' => 8969, 'rdquo' => 8221, 'real' => 8476, 'reg' => 174, 'rfloor' => 8971, 'Rho' => 929, 'rho' => 961, 'rlm' => 8207, 'rsaquo' => 8250, 'rsquo' => 8217, 'sbquo' => 8218, 'Scaron' => 352, 'scaron' => 353, 'sdot' => 8901, 'sect' => 167, 'shy' => 173, 'Sigma' => 931, 'sigma' => 963, 'sigmaf' => 962, 'sim' => 8764, 'spades' => 9824, 'sub' => 8834, 'sube' => 8838, 'sum' => 8721, 'sup' => 8835, 'sup1' => 185, 'sup2' => 178, 'sup3' => 179, 'supe' => 8839, 'szlig' => 223, 'Tau' => 932, 'tau' => 964, 'there4' => 8756, 'Theta' => 920, 'theta' => 952, 'thetasym' => 977, 'thinsp' => 
8201, 'THORN' => 222, 'thorn' => 254, 'tilde' => 732, 'times' => 215, 'trade' => 8482, 'Uacute' => 218, 'uacute' => 250, 'uarr' => 8593, 'uArr' => 8657, 'Ucirc' => 219, 'ucirc' => 251, 'Ugrave' => 217, 'ugrave' => 249, 'uml' => 168, 'upsih' => 978, 'Upsilon' => 933, 'upsilon' => 965, 'Uuml' => 220, 'uuml' => 252, 'weierp' => 8472, 'Xi' => 926, 'xi' => 958, 'Yacute' => 221, 'yacute' => 253, 'yen' => 165, 'Yuml' => 376, 'yuml' => 255, 'Zeta' => 918, 'zeta' => 950, 'zwj' => 8205, 'zwnj' => 8204 );

	/**
	 * Character entity aliases accepted by MediaWiki
	 */
	private static $htmlEntityAliases = array(
		'רלמ' => 'rlm',
		'رلم' => 'rlm',
	);

	/**
	 * Lazy-initialised attributes regex, see getAttribsRegex()
	 */
	private static $attribsRegex;

	/**
	 * Regular expression to match HTML/XML attribute pairs within a tag.
	 * Allows some... latitude.
	 * Used in Sanitizer::fixTagAttributes and Sanitizer::decodeTagAttributes
	 *
	 * The pattern is built on first use and cached in self::$attribsRegex.
	 *
	 * Capture groups of the returned pattern:
	 *   1 - attribute name
	 *   2 - the whole "= value" part (absent for a valueless attribute)
	 *   3 - value written in double quotes (without the quotes)
	 *   4 - value written in single quotes (without the quotes)
	 *   5 - unquoted value
	 *   6 - unquoted "#hex" colour value
	 *
	 * @return string PCRE pattern, including delimiters and the "sx" modifiers
	 */
	public static function getAttribsRegex() {
		if ( self::$attribsRegex === null ) {
			$attribFirst = '[:A-Z_a-z0-9]';
			$attrib = '[:A-Z_a-z-.0-9]';
			$space = '[\x09\x0a\x0d\x20]';

			// The attribute value: quoted or alone.
			// The pieces are documented here, in PHP, rather than with '#'
			// comments inside the pattern: under the "x" modifier such a
			// comment runs to the next newline, so the alternatives below
			// would be swallowed if the line breaks inside the string
			// literal were ever collapsed.
			$doubleQuoted = '"([^<"]*)"';
			$singleQuoted = "'([^<']*)'";
			$unquoted = '([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)';
			// Technically wrong, but lots of colors are specified like this.
			// We'll be normalizing it.
			// NOTE(review): $unquoted already accepts '#' and hex digits, so
			// this branch looks unreachable; it is kept so that the capture
			// group numbering stays exactly as before.
			$hashColor = '(\\#[0-9a-fA-F]+)';

			$value = implode(
				'|',
				array( $doubleQuoted, $singleQuoted, $unquoted, $hashColor )
			);

			self::$attribsRegex =
				"/(?:^|{$space})({$attribFirst}{$attrib}*)"
				. "({$space}*={$space}*(?:{$value}))?"
				. "(?={$space}|\$)/sx";
		}

		return self::$attribsRegex;
	}

/** * Cleans up HTML, removes dangerous tags and attributes, and * removes HTML comments * @param string $text * @param callable $processCallback Callback to do any variable or parameter * replacements in HTML attribute values * @param array|bool $args Arguments for the processing callback * @param array $extratags For any extra tags to include * @param array $removetags For any tags (default or extra) to exclude * @return string */ public static function removeHTMLtags( $text, $processCallback = null, $args = array(), $extratags = array(), $removetags = array() ) { global $wgUseTidy, $wgAllowMicrodataAttributes, $wgAllowImageTag; static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags, $htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic, $staticInitialised; // Base our staticInitialised variable off of the global config state so that if the globals // are changed (like in the screwed up test system) we will re-initialise the settings. $globalContext = implode( '-', compact( 'wgAllowMicrodataAttributes', 'wgAllowImageTag' ) ); if ( !$staticInitialised || $staticInitialised != $globalContext ) { $htmlpairsStatic = array( # Tags that must be closed 'b', 'bdi', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's', 'strike', 'strong', 'tt', 'var', 'div', 'center', 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre', 'ruby', 'rb', 'rp', 'rt', 'rtc', 'p', 'span', 'abbr', 'dfn', 'kbd', 'samp', 'data', 'time', 'mark' ); $htmlsingle = array( 'br', 'wbr', 'hr', 'li', 'dt', 'dd' ); $htmlsingleonly = array( # Elements that cannot have close tags 'br', 'wbr', 'hr' ); if ( $wgAllowMicrodataAttributes ) { $htmlsingle[] = $htmlsingleonly[] = 'meta'; $htmlsingle[] = $htmlsingleonly[] = 'link'; } $htmlnest = array( # Tags that can be nested--??
'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul', 'li', 'dl', 'dt', 'dd', 'font', 'big', 'small', 'sub', 'sup', 'span', 'var', 'kbd', 'samp', 'em', 'strong', 'q', 'ruby', 'bdo' ); $tabletags = array( # Can only appear inside table, we will close them 'td', 'th', 'tr', ); $htmllist = array( # Tags used by list 'ul', 'ol', ); $listtags = array( # Tags that can appear in a list 'li', ); if ( $wgAllowImageTag ) { $htmlsingle[] = 'img'; $htmlsingleonly[] = 'img'; } $htmlsingleallowed = array_unique( array_merge( $htmlsingle, $tabletags ) ); $htmlelementsStatic = array_unique( array_merge( $htmlsingle, $htmlpairsStatic, $htmlnest ) ); # Convert them all to hashtables for faster lookup $vars = array( 'htmlpairsStatic', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags', 'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelementsStatic' ); foreach ( $vars as $var ) { $$var = array_flip( $$var ); } $staticInitialised = $globalContext; } # Populate $htmlpairs and $htmlelements with the $extratags and $removetags arrays $extratags = array_flip( $extratags ); $removetags = array_flip( $removetags ); $htmlpairs = array_merge( $extratags, $htmlpairsStatic ); $htmlelements = array_diff_key( array_merge( $extratags, $htmlelementsStatic ), $removetags ); # Remove HTML comments $text = Sanitizer::removeHTMLcomments( $text ); $bits = explode( '<', $text ); $text = str_replace( '>', '>', array_shift( $bits ) ); if ( !$wgUseTidy ) { $tagstack = $tablestack = array(); foreach ( $bits as $x ) { $regs = array(); # $slash: Does the current element start with a '/'? 
# $t: Current element name # $params: String between element name and > # $brace: Ending '>' or '/>' # $rest: Everything until the next element of $bits if ( preg_match( self::ELEMENT_BITS_REGEX, $x, $regs ) ) { list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs; } else { $slash = $t = $params = $brace = $rest = null; } $badtag = false; if ( isset( $htmlelements[$t = strtolower( $t )] ) ) { # Check our stack if ( $slash && isset( $htmlsingleonly[$t] ) ) { $badtag = true; } elseif ( $slash ) { # Closing a tag... is it the one we just opened? wfSuppressWarnings(); $ot = array_pop( $tagstack ); wfRestoreWarnings(); if ( $ot != $t ) { if ( isset( $htmlsingleallowed[$ot] ) ) { # Pop all elements with an optional close tag # and see if we find a match below them $optstack = array(); array_push( $optstack, $ot ); wfSuppressWarnings(); $ot = array_pop( $tagstack ); wfRestoreWarnings(); while ( $ot != $t && isset( $htmlsingleallowed[$ot] ) ) { array_push( $optstack, $ot ); wfSuppressWarnings(); $ot = array_pop( $tagstack ); wfRestoreWarnings(); } if ( $t != $ot ) { # No match. Push the optional elements back again $badtag = true; wfSuppressWarnings(); $ot = array_pop( $optstack ); wfRestoreWarnings(); while ( $ot ) { array_push( $tagstack, $ot ); wfSuppressWarnings(); $ot = array_pop( $optstack ); wfRestoreWarnings(); } } } else { wfSuppressWarnings(); array_push( $tagstack, $ot ); wfRestoreWarnings(); #
  • can be nested in