summary | refs | log | tree | commit | diff
path: root/includes/HtmlFormatter.php
diff options
context:
space:
mode:
author Pierre Schmitz <pierre@archlinux.de> 2014-12-27 15:41:37 +0100
committer Pierre Schmitz <pierre@archlinux.de> 2014-12-31 11:43:28 +0100
commit c1f9b1f7b1b77776192048005dcc66dcf3df2bfb (patch)
tree 2b38796e738dd74cb42ecd9bfd151803108386bc /includes/HtmlFormatter.php
parent b88ab0086858470dd1f644e64cb4e4f62bb2be9b (diff)
Update to MediaWiki 1.24.1
Diffstat (limited to 'includes/HtmlFormatter.php')
-rw-r--r-- includes/HtmlFormatter.php 97
1 files changed, 59 insertions, 38 deletions
diff --git a/includes/HtmlFormatter.php b/includes/HtmlFormatter.php
index 248a76fe..ccbfba82 100644
--- a/includes/HtmlFormatter.php
+++ b/includes/HtmlFormatter.php
@@ -34,7 +34,7 @@ class HtmlFormatter {
/**
* Constructor
*
- * @param string $html: Text to process
+ * @param string $html Text to process
*/
public function __construct( $html ) {
$this->html = $html;
@@ -51,15 +51,15 @@ class HtmlFormatter {
/**
* Override this in descendant class to modify HTML after it has been converted from DOM tree
- * @param string $html: HTML to process
- * @return string: Processed HTML
+ * @param string $html HTML to process
+ * @return string Processed HTML
*/
protected function onHtmlReady( $html ) {
return $html;
}
/**
- * @return DOMDocument: DOM to manipulate
+ * @return DOMDocument DOM to manipulate
*/
public function getDoc() {
if ( !$this->doc ) {
@@ -101,7 +101,7 @@ class HtmlFormatter {
* .<class>
* #<id>
*
- * @param Array|string $selectors: Selector(s) of stuff to remove
+ * @param array|string $selectors Selector(s) of stuff to remove
*/
public function remove( $selectors ) {
$this->itemsToRemove = array_merge( $this->itemsToRemove, (array)$selectors );
@@ -114,7 +114,7 @@ class HtmlFormatter {
* Note this interface may fail in surprising unexpected ways due to usage of regexes,
* so should not be relied on for HTML markup security measures.
*
- * @param Array|string $elements: Name(s) of tag(s) to flatten
+ * @param array|string $elements Name(s) of tag(s) to flatten
*/
public function flatten( $elements ) {
$this->elementsToFlatten = array_merge( $this->elementsToFlatten, (array)$elements );
@@ -128,15 +128,23 @@ class HtmlFormatter {
}
/**
- * Removes content we've chosen to remove
+ * Removes content we've chosen to remove. The text of the removed elements can be
+ * extracted with the getText method.
+ * @return array Array of removed DOMElements
*/
public function filterContent() {
wfProfileIn( __METHOD__ );
$removals = $this->parseItemsToRemove();
- if ( !$removals ) {
+ // Bail out early if nothing to do
+ if ( array_reduce( $removals,
+ function ( $carry, $item ) {
+ return $carry && !$item;
+ },
+ true
+ ) ) {
wfProfileOut( __METHOD__ );
- return;
+ return array();
}
$doc = $this->getDoc();
@@ -156,8 +164,7 @@ class HtmlFormatter {
}
}
}
-
- $this->removeElements( $domElemsToRemove );
+ $removed = $this->removeElements( $domElemsToRemove );
// Elements with named IDs
$domElemsToRemove = array();
@@ -167,7 +174,7 @@ class HtmlFormatter {
$domElemsToRemove[] = $itemToRemoveNode;
}
}
- $this->removeElements( $domElemsToRemove );
+ $removed = array_merge( $removed, $this->removeElements( $domElemsToRemove ) );
// CSS Classes
$domElemsToRemove = array();
@@ -183,7 +190,7 @@ class HtmlFormatter {
}
}
}
- $this->removeElements( $domElemsToRemove );
+ $removed = array_merge( $removed, $this->removeElements( $domElemsToRemove ) );
// Tags with CSS Classes
foreach ( $removals['TAG_CLASS'] as $classToRemove ) {
@@ -192,16 +199,17 @@ class HtmlFormatter {
$elements = $xpath->query(
'//' . $parts[0] . '[@class="' . $parts[1] . '"]'
);
-
- $this->removeElements( $elements );
+ $removed = array_merge( $removed, $this->removeElements( $elements ) );
}
wfProfileOut( __METHOD__ );
+ return $removed;
}
/**
* Removes a list of elelments from DOMDocument
* @param array|DOMNodeList $elements
+ * @return array Array of removed elements
*/
private function removeElements( $elements ) {
$list = $elements;
@@ -217,6 +225,7 @@ class HtmlFormatter {
$element->parentNode->removeChild( $element );
}
}
+ return $list;
}
/**
@@ -228,7 +237,7 @@ class HtmlFormatter {
private function fixLibXML( $html ) {
wfProfileIn( __METHOD__ );
static $replacements;
- if ( ! $replacements ) {
+ if ( !$replacements ) {
// We don't include rules like '&#34;' => '&amp;quot;' because entities had already been
// normalized by libxml. Using this function with input not sanitized by libxml is UNSAFE!
$replacements = new ReplacementArray( array(
@@ -245,15 +254,20 @@ class HtmlFormatter {
}
/**
- * Performs final transformations and returns resulting HTML
+ * Performs final transformations and returns resulting HTML. Note that if you want to call this
+ * both without an element and with an element you should call it without an element first. If you
+ * specify the $element in the method it'll change the underlying dom and you won't be able to get
+ * it back.
*
- * @param DOMElement|string|null $element: ID of element to get HTML from or false to get it from the whole tree
- * @return string: Processed HTML
+ * @param DOMElement|string|null $element ID of element to get HTML from or
+ * false to get it from the whole tree
+ * @return string Processed HTML
*/
public function getText( $element = null ) {
wfProfileIn( __METHOD__ );
if ( $this->doc ) {
+ wfProfileIn( __METHOD__ . '-dom' );
if ( $element !== null && !( $element instanceof DOMElement ) ) {
$element = $this->doc->getElementById( $element );
}
@@ -269,35 +283,45 @@ class HtmlFormatter {
$body->appendChild( $element );
}
$html = $this->doc->saveHTML();
+ wfProfileOut( __METHOD__ . '-dom' );
+
+ wfProfileIn( __METHOD__ . '-fixes' );
$html = $this->fixLibXml( $html );
+ if ( wfIsWindows() ) {
+ // Cleanup for CRLF misprocessing of unknown origin on Windows.
+ //
+ // If this error continues in the future, please track it down in the
+ // XML code paths if possible and fix there.
+ $html = str_replace( '&#13;', '', $html );
+ }
+ wfProfileOut( __METHOD__ . '-fixes' );
} else {
$html = $this->html;
}
- if ( wfIsWindows() ) {
- // Appears to be cleanup for CRLF misprocessing of unknown origin
- // when running server on Windows platform.
- //
- // If this error continues in the future, please track it down in the
- // XML code paths if possible and fix there.
- $html = str_replace( '&#13;', '', $html );
- }
+ // Remove stuff added by wrapHTML()
$html = preg_replace( '/<!--.*?-->|^.*?<body>|<\/body>.*$/s', '', $html );
$html = $this->onHtmlReady( $html );
+ wfProfileIn( __METHOD__ . '-flatten' );
if ( $this->elementsToFlatten ) {
$elements = implode( '|', $this->elementsToFlatten );
$html = preg_replace( "#</?($elements)\\b[^>]*>#is", '', $html );
}
+ wfProfileOut( __METHOD__ . '-flatten' );
wfProfileOut( __METHOD__ );
return $html;
}
/**
- * @param $selector: CSS selector to parse
- * @param $type
- * @param $rawName
- * @return bool: Whether the selector was successfully recognised
+ * Helper function for parseItemsToRemove(). This function extracts the selector type
+ * and the raw name of a selector from a CSS-style selector string and assigns those
+ * values to parameters passed by reference. For example, if given '#toc' as the
+ * $selector parameter, it will assign 'ID' as the $type and 'toc' as the $rawName.
+ * @param string $selector CSS selector to parse
+ * @param string $type The type of selector (ID, CLASS, TAG_CLASS, or TAG)
+ * @param string $rawName The raw name of the selector
+ * @return bool Whether the selector was successfully recognised
*/
protected function parseSelector( $selector, &$type, &$rawName ) {
if ( strpos( $selector, '.' ) === 0 ) {
@@ -306,14 +330,10 @@ class HtmlFormatter {
} elseif ( strpos( $selector, '#' ) === 0 ) {
$type = 'ID';
$rawName = substr( $selector, 1 );
- } elseif ( strpos( $selector, '.' ) !== 0 &&
- strpos( $selector, '.' ) !== false )
- {
+ } elseif ( strpos( $selector, '.' ) !== 0 && strpos( $selector, '.' ) !== false ) {
$type = 'TAG_CLASS';
$rawName = $selector;
- } elseif ( strpos( $selector, '[' ) === false
- && strpos( $selector, ']' ) === false )
- {
+ } elseif ( strpos( $selector, '[' ) === false && strpos( $selector, ']' ) === false ) {
$type = 'TAG';
$rawName = $selector;
} else {
@@ -324,7 +344,8 @@ class HtmlFormatter {
}
/**
- * Transforms CSS selectors into an internal representation suitable for processing
+ * Transforms CSS-style selectors into an internal representation suitable for
+ * processing by filterContent()
* @return array
*/
protected function parseItemsToRemove() {