summaryrefslogtreecommitdiff
path: root/includes/parser/Tidy.php
blob: 2b98f01df7f7687ea5beef0a1ceffc1341d17edc (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
<?php
/**
 * HTML validation and correction
 *
 * @file
 */

/**
 * Class used to hide mw:editsection tokens from Tidy so that it doesn't break them
 * or break on them. This is a bit of a hack for now, but hopefully in the future
 * we may create a real postprocessor or something that will replace this.
 * It's called wrapper because for now it basically takes over MWTidy::tidy's task
 * of wrapping the text in an XHTML block.
 *
 * This re-uses some of the parser's UNIQ tricks, though some of it is private so it's
 * duplicated. Perhaps we should create an abstract marker hiding class.
 */
class MWTidyWrapper {

	/**
	 * Marker => original text pairs recorded by the last getWrapped() call;
	 * null until getWrapped() has been called.
	 *
	 * @var ReplacementArray|null
	 */
	protected $mTokens;

	/**
	 * Random prefix shared by all markers of the last getWrapped() call;
	 * null until getWrapped() has been called.
	 *
	 * @var string|null
	 */
	protected $mUniqPrefix;

	/**
	 * Index used to build the next marker.
	 *
	 * @var int
	 */
	protected $mMarkerIndex;

	public function __construct() {
		$this->mTokens = null;
		$this->mUniqPrefix = null;
		$this->mMarkerIndex = 0;
	}

	/**
	 * Replace every match of ParserOutput::EDITSECTION_REGEX with a unique
	 * marker and wrap the result in a minimal XHTML document.
	 *
	 * Each call starts from a fresh token table and a fresh random prefix.
	 *
	 * @param $text string HTML fragment to wrap
	 * @return string Complete XHTML document containing the fragment
	 */
	public function getWrapped( $text ) {
		$this->mTokens = new ReplacementArray;
		$this->mUniqPrefix = "\x7fUNIQ" .
			dechex( mt_rand( 0, 0x7fffffff ) ) . dechex( mt_rand( 0, 0x7fffffff ) );
		$this->mMarkerIndex = 0;

		// Objects are passed by handle, so the callback needs no reference to $this.
		$wrappedtext = preg_replace_callback( ParserOutput::EDITSECTION_REGEX,
			array( $this, 'replaceEditSectionLinksCallback' ), $text );

		if ( $wrappedtext === null ) {
			// preg_replace_callback() returns null on a PCRE error. Keep the
			// original text rather than wrapping an empty body.
			$wrappedtext = $text;
		}

		$wrappedtext = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"'.
			' "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"><html>'.
			'<head><title>test</title></head><body>'.$wrappedtext.'</body></html>';

		return $wrappedtext;
	}

	/**
	 * preg_replace_callback() callback: record the matched text under a new
	 * marker and return that marker as the replacement.
	 *
	 * @param $m array Match array; only the full match $m[0] is used
	 *
	 * @return string The marker standing in for the matched text
	 */
	public function replaceEditSectionLinksCallback( $m ) {
		$marker = "{$this->mUniqPrefix}-item-{$this->mMarkerIndex}" . Parser::MARKER_SUFFIX;
		$this->mMarkerIndex++;
		$this->mTokens->setPair( $marker, $m[0] );
		return $marker;
	}

	/**
	 * Put the texts recorded by getWrapped() back in place of their markers.
	 *
	 * @param $text string Text that may contain markers
	 * @return string Text with the recorded markers replaced; returned
	 *   unchanged if getWrapped() has not been called yet
	 */
	public function postprocess( $text ) {
		if ( $this->mTokens === null ) {
			// Nothing was ever hidden, so there is nothing to restore.
			return $text;
		}

		return $this->mTokens->replace( $text );
	}

}

/**
 * Class to interact with HTML tidy
 *
 * Either the external tidy program or the in-process tidy extension
 * will be used depending on availability. Override the default
 * $wgTidyInternal setting to disable the internal extension if it's not working.
 *
 * @ingroup Parser
 */
class MWTidy {
	/**
	 * Interface with html tidy, used if $wgUseTidy = true.
	 * If tidy isn't able to correct the markup, the original will be
	 * returned in all its glory with a warning comment appended.
	 *
	 * The text is first passed through MWTidyWrapper, which hides edit
	 * section tokens and wraps the fragment in a document; the hidden tokens
	 * are restored in the corrected output.
	 *
	 * @param $text String: hideous HTML input
	 * @return String: corrected HTML output
	 */
	public static function tidy( $text ) {
		global $wgTidyInternal;

		$wrapper = new MWTidyWrapper;
		$wrappedtext = $wrapper->getWrapped( $text );

		// $wgTidyInternal selects the in-process extension over the external binary.
		$retVal = null;
		if ( $wgTidyInternal ) {
			$correctedtext = self::execInternalTidy( $wrappedtext, false, $retVal );
		} else {
			$correctedtext = self::execExternalTidy( $wrappedtext, false, $retVal );
		}

		if ( $retVal < 0 ) {
			// A negative status means tidy could not be run at all.
			wfDebug( "Possible tidy configuration error!\n" );
			return $text . "\n<!-- Tidy was unable to run -->\n";
		} elseif ( is_null( $correctedtext ) ) {
			// Tidy ran but produced no usable output.
			wfDebug( "Tidy error detected!\n" );
			return $text . "\n<!-- Tidy found serious XHTML errors -->\n";
		}

		$correctedtext = $wrapper->postprocess( $correctedtext ); // restore any hidden tokens

		return $correctedtext;
	}

	/**
	 * Check HTML for errors, used if $wgValidateAllHtml = true.
	 *
	 * The text is treated as valid when tidy reports status 0, or when the
	 * status is negative and no error text was produced.
	 *
	 * @param $text String
	 * @param &$errorStr String: return the error string
	 * @return Boolean: whether the HTML is valid
	 */
	public static function checkErrors( $text, &$errorStr = null ) {
		global $wgTidyInternal;

		$retval = 0;
		if( $wgTidyInternal ) {
			$errorStr = self::execInternalTidy( $text, true, $retval );
		} else {
			$errorStr = self::execExternalTidy( $text, true, $retval );
		}

		return ( $retval < 0 && $errorStr == '' ) || $retval == 0;
	}

	/**
	 * Spawn an external HTML tidy process and get corrected markup back from it.
	 * Also called in OutputHandler.php for full page validation
	 *
	 * The stream that is not being read (STDOUT when $stderr is true, STDERR
	 * otherwise) is redirected to the null device.
	 *
	 * @param $text String: HTML to check
	 * @param $stderr Boolean: Whether to read result from STDERR rather than STDOUT
	 * @param &$retval Exit code (-1 on internal error)
	 * @return mixed String or null; null is returned only when reading STDOUT
	 *   yielded nothing for a non-empty input
	 */
	private static function execExternalTidy( $text, $stderr = false, &$retval = null ) {
		global $wgTidyConf, $wgTidyBin, $wgTidyOpts;
		wfProfileIn( __METHOD__ );

		$cleansource = '';
		$opts = ' -utf8';

		if ( $stderr ) {
			$descriptorspec = array(
				0 => array( 'pipe', 'r' ),
				1 => array( 'file', wfGetNull(), 'a' ),
				2 => array( 'pipe', 'w' )
			);
		} else {
			$descriptorspec = array(
				0 => array( 'pipe', 'r' ),
				1 => array( 'pipe', 'w' ),
				2 => array( 'file', wfGetNull(), 'a' )
			);
		}

		$readpipe = $stderr ? 2 : 1;
		$pipes = array();

		$process = proc_open(
			"$wgTidyBin -config $wgTidyConf $wgTidyOpts$opts", $descriptorspec, $pipes );

		if ( is_resource( $process ) ) {
			// Theoretically, this style of communication could cause a deadlock
			// here. If the stdout buffer fills up, then writes to stdin could
			// block. This doesn't appear to happen with tidy, because tidy only
			// writes to stdout after it's finished reading from stdin. Search
			// for tidyParseStdin and tidySaveStdout in console/tidy.c
			fwrite( $pipes[0], $text );
			fclose( $pipes[0] );

			// Read to EOF in one call. A feof()/fgets() loop would keep
			// spinning if a read failed without the stream reaching EOF.
			$output = stream_get_contents( $pipes[$readpipe] );
			if ( $output !== false ) {
				$cleansource = $output;
			}

			fclose( $pipes[$readpipe] );
			$retval = proc_close( $process );
		} else {
			wfWarn( "Unable to start external tidy process" );
			$retval = -1;
		}

		if ( !$stderr && $cleansource == '' && $text != '' ) {
			// Some kind of error happened, so we couldn't get the corrected text.
			// Just give up; we'll use the source text and append a warning.
			$cleansource = null;
		}

		wfProfileOut( __METHOD__ );
		return $cleansource;
	}

	/**
	 * Use the HTML tidy extension to use the tidy library in-process,
	 * saving the overhead of spawning a new process.
	 *
	 * @param $text String: HTML to check
	 * @param $stderr Boolean: Whether to read result from error status instead of output
	 * @param &$retval Exit code (-1 on internal error)
	 * @return mixed String or null; null when the tidy class is unavailable
	 *   or, in output mode, when tidy reports status 2
	 */
	private static function execInternalTidy( $text, $stderr = false, &$retval = null ) {
		global $wgTidyConf, $wgDebugTidy;
		wfProfileIn( __METHOD__ );

		if ( !MWInit::classExists( 'tidy' ) ) {
			wfWarn( "Unable to load internal tidy class." );
			$retval = -1;

			wfProfileOut( __METHOD__ );
			return null;
		}

		$tidy = new tidy;
		$tidy->parseString( $text, $wgTidyConf, 'utf8' );

		if ( $stderr ) {
			// Validation mode: report the parse status and the error text only.
			$retval = $tidy->getStatus();
			$result = $tidy->errorBuffer;
		} else {
			$tidy->cleanRepair();
			$retval = $tidy->getStatus();
			if ( $retval == 2 ) {
				// 2 is magic number for fatal error
				// http://www.php.net/manual/en/function.tidy-get-status.php
				$result = null;
			} else {
				$result = tidy_get_output( $tidy );
				if ( $wgDebugTidy && $retval > 0 ) {
					// Append tidy's report as an HTML comment; '-->' is escaped
					// so the report cannot close the comment early.
					$result .= "<!--\nTidy reports:\n" .
						str_replace( '-->', '--&gt;', $tidy->errorBuffer ) .
						"\n-->";
				}
			}
		}

		wfProfileOut( __METHOD__ );
		return $result;
	}
}