From c1f9b1f7b1b77776192048005dcc66dcf3df2bfb Mon Sep 17 00:00:00 2001 From: Pierre Schmitz Date: Sat, 27 Dec 2014 15:41:37 +0100 Subject: Update to MediaWiki 1.24.1 --- includes/HtmlFormatter.php | 97 ++++++++++++++++++++++++++++------------------ 1 file changed, 59 insertions(+), 38 deletions(-) (limited to 'includes/HtmlFormatter.php') diff --git a/includes/HtmlFormatter.php b/includes/HtmlFormatter.php index 248a76fe..ccbfba82 100644 --- a/includes/HtmlFormatter.php +++ b/includes/HtmlFormatter.php @@ -34,7 +34,7 @@ class HtmlFormatter { /** * Constructor * - * @param string $html: Text to process + * @param string $html Text to process */ public function __construct( $html ) { $this->html = $html; @@ -51,15 +51,15 @@ class HtmlFormatter { /** * Override this in descendant class to modify HTML after it has been converted from DOM tree - * @param string $html: HTML to process - * @return string: Processed HTML + * @param string $html HTML to process + * @return string Processed HTML */ protected function onHtmlReady( $html ) { return $html; } /** - * @return DOMDocument: DOM to manipulate + * @return DOMDocument DOM to manipulate */ public function getDoc() { if ( !$this->doc ) { @@ -101,7 +101,7 @@ class HtmlFormatter { * . * # * - * @param Array|string $selectors: Selector(s) of stuff to remove + * @param array|string $selectors Selector(s) of stuff to remove */ public function remove( $selectors ) { $this->itemsToRemove = array_merge( $this->itemsToRemove, (array)$selectors ); @@ -114,7 +114,7 @@ class HtmlFormatter { * Note this interface may fail in surprising unexpected ways due to usage of regexes, * so should not be relied on for HTML markup security measures. * - * @param Array|string $elements: Name(s) of tag(s) to flatten + * @param array|string $elements Name(s) of tag(s) to flatten */ public function flatten( $elements ) { $this->elementsToFlatten = array_merge( $this->elementsToFlatten, (array)$elements ); @@ -128,15 +128,23 @@ class HtmlFormatter { } /** - * Removes content we've chosen to remove + * Removes content we've chosen to remove. The text of the removed elements can be + * extracted with the getText method. + * @return array Array of removed DOMElements */ public function filterContent() { wfProfileIn( __METHOD__ ); $removals = $this->parseItemsToRemove(); - if ( !$removals ) { + // Bail out early if nothing to do + if ( array_reduce( $removals, + function ( $carry, $item ) { + return $carry && !$item; + }, + true + ) ) { wfProfileOut( __METHOD__ ); - return; + return array(); } $doc = $this->getDoc(); @@ -156,8 +164,7 @@ class HtmlFormatter { } } } - - $this->removeElements( $domElemsToRemove ); + $removed = $this->removeElements( $domElemsToRemove ); // Elements with named IDs $domElemsToRemove = array(); @@ -167,7 +174,7 @@ class HtmlFormatter { $domElemsToRemove[] = $itemToRemoveNode; } } - $this->removeElements( $domElemsToRemove ); + $removed = array_merge( $removed, $this->removeElements( $domElemsToRemove ) ); // CSS Classes $domElemsToRemove = array(); @@ -183,7 +190,7 @@ class HtmlFormatter { } } } - $this->removeElements( $domElemsToRemove ); + $removed = array_merge( $removed, $this->removeElements( $domElemsToRemove ) ); // Tags with CSS Classes foreach ( $removals['TAG_CLASS'] as $classToRemove ) { @@ -192,16 +199,17 @@ class HtmlFormatter { $elements = $xpath->query( '//' . $parts[0] . '[@class="' . $parts[1] . '"]' ); - - $this->removeElements( $elements ); + $removed = array_merge( $removed, $this->removeElements( $elements ) ); } wfProfileOut( __METHOD__ ); + return $removed; } /** * Removes a list of elelments from DOMDocument * @param array|DOMNodeList $elements + * @return array Array of removed elements */ private function removeElements( $elements ) { $list = $elements; @@ -217,6 +225,7 @@ class HtmlFormatter { $element->parentNode->removeChild( $element ); } } + return $list; } /** @@ -228,7 +237,7 @@ class HtmlFormatter { private function fixLibXML( $html ) { wfProfileIn( __METHOD__ ); static $replacements; - if ( ! $replacements ) { + if ( !$replacements ) { // We don't include rules like '"' => '&quot;' because entities had already been // normalized by libxml. Using this function with input not sanitized by libxml is UNSAFE! $replacements = new ReplacementArray( array( @@ -245,15 +254,20 @@ class HtmlFormatter { } /** - * Performs final transformations and returns resulting HTML + * Performs final transformations and returns resulting HTML. Note that if you want to call this + * both without an element and with an element you should call it without an element first. If you + * specify the $element in the method it'll change the underlying dom and you won't be able to get + * it back. * - * @param DOMElement|string|null $element: ID of element to get HTML from or false to get it from the whole tree - * @return string: Processed HTML + * @param DOMElement|string|null $element ID of element to get HTML from or + * false to get it from the whole tree + * @return string Processed HTML */ public function getText( $element = null ) { wfProfileIn( __METHOD__ ); if ( $this->doc ) { + wfProfileIn( __METHOD__ . '-dom' ); if ( $element !== null && !( $element instanceof DOMElement ) ) { $element = $this->doc->getElementById( $element ); } @@ -269,35 +283,45 @@ class HtmlFormatter { $body->appendChild( $element ); } $html = $this->doc->saveHTML(); + wfProfileOut( __METHOD__ . '-dom' ); + + wfProfileIn( __METHOD__ . '-fixes' ); $html = $this->fixLibXml( $html ); + if ( wfIsWindows() ) { + // Cleanup for CRLF misprocessing of unknown origin on Windows. + // + // If this error continues in the future, please track it down in the + // XML code paths if possible and fix there. + $html = str_replace( ' ', '', $html ); + } + wfProfileOut( __METHOD__ . '-fixes' ); } else { $html = $this->html; } - if ( wfIsWindows() ) { - // Appears to be cleanup for CRLF misprocessing of unknown origin - // when running server on Windows platform. - // - // If this error continues in the future, please track it down in the - // XML code paths if possible and fix there. - $html = str_replace( ' ', '', $html ); - } + // Remove stuff added by wrapHTML() $html = preg_replace( '/|^.*?|<\/body>.*$/s', '', $html ); $html = $this->onHtmlReady( $html ); + wfProfileIn( __METHOD__ . '-flatten' ); if ( $this->elementsToFlatten ) { $elements = implode( '|', $this->elementsToFlatten ); $html = preg_replace( "#]*>#is", '', $html ); } + wfProfileOut( __METHOD__ . '-flatten' ); wfProfileOut( __METHOD__ ); return $html; } /** - * @param $selector: CSS selector to parse - * @param $type - * @param $rawName - * @return bool: Whether the selector was successfully recognised + * Helper function for parseItemsToRemove(). This function extracts the selector type + * and the raw name of a selector from a CSS-style selector string and assigns those + * values to parameters passed by reference. For example, if given '#toc' as the + * $selector parameter, it will assign 'ID' as the $type and 'toc' as the $rawName. + * @param string $selector CSS selector to parse + * @param string $type The type of selector (ID, CLASS, TAG_CLASS, or TAG) + * @param string $rawName The raw name of the selector + * @return bool Whether the selector was successfully recognised */ protected function parseSelector( $selector, &$type, &$rawName ) { if ( strpos( $selector, '.' ) === 0 ) { @@ -306,14 +330,10 @@ class HtmlFormatter { } elseif ( strpos( $selector, '#' ) === 0 ) { $type = 'ID'; $rawName = substr( $selector, 1 ); - } elseif ( strpos( $selector, '.' ) !== 0 && - strpos( $selector, '.' ) !== false ) - { + } elseif ( strpos( $selector, '.' ) !== 0 && strpos( $selector, '.' ) !== false ) { $type = 'TAG_CLASS'; $rawName = $selector; - } elseif ( strpos( $selector, '[' ) === false - && strpos( $selector, ']' ) === false ) - { + } elseif ( strpos( $selector, '[' ) === false && strpos( $selector, ']' ) === false ) { $type = 'TAG'; $rawName = $selector; } else { @@ -324,7 +344,8 @@ class HtmlFormatter { } /** - * Transforms CSS selectors into an internal representation suitable for processing + * Transforms CSS-style selectors into an internal representation suitable for + * processing by filterContent() * @return array */ protected function parseItemsToRemove() { -- cgit v1.2.2