summaryrefslogtreecommitdiff
path: root/includes/HtmlFormatter.php
diff options
context:
space:
mode:
Diffstat (limited to 'includes/HtmlFormatter.php')
-rw-r--r--includes/HtmlFormatter.php19
1 files changed, 17 insertions, 2 deletions
diff --git a/includes/HtmlFormatter.php b/includes/HtmlFormatter.php
index b2926d17..221cefbb 100644
--- a/includes/HtmlFormatter.php
+++ b/includes/HtmlFormatter.php
@@ -63,7 +63,15 @@ class HtmlFormatter {
*/
public function getDoc() {
if ( !$this->doc ) {
- $html = mb_convert_encoding( $this->html, 'HTML-ENTITIES', 'UTF-8' );
+ // DOMDocument::loadHTML apparently isn't very good with encodings, so
+ // convert input to ASCII by encoding everything above 128 as entities.
+ if ( function_exists( 'mb_convert_encoding' ) ) {
+ $html = mb_convert_encoding( $this->html, 'HTML-ENTITIES', 'UTF-8' );
+ } else {
+ $html = preg_replace_callback( '/[\x{80}-\x{10ffff}]/u', function ( $m ) {
+ return '&#' . UtfNormal\Utils::utf8ToCodepoint( $m[0] ) . ';';
+ }, $this->html );
+ }
// Workaround for bug that caused spaces before references
// to disappear during processing:
@@ -244,7 +252,14 @@ class HtmlFormatter {
) );
}
$html = $replacements->replace( $html );
- $html = mb_convert_encoding( $html, 'UTF-8', 'HTML-ENTITIES' );
+
+ if ( function_exists( 'mb_convert_encoding' ) ) {
+ // Just in case the conversion in getDoc() above used named
+ // entities that aren't known to html_entity_decode().
+ $html = mb_convert_encoding( $html, 'UTF-8', 'HTML-ENTITIES' );
+ } else {
+ $html = html_entity_decode( $html, ENT_COMPAT, 'utf-8' );
+ }
return $html;
}