summaryrefslogtreecommitdiff
path: root/includes/parser/Tidy.php
blob: 38f22fd8ace529ad8c204d9bb3bb72989e7ba192 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
<?php
/**
 * HTML validation and correction
 *
 * @file
 */

/**
 * Class to interact with HTML tidy
 *
 * Either the external tidy program or the in-process tidy extension
 * will be used depending on availability. Override the default
 * $wgTidyInternal setting to disable the internal if it's not working.
 *
 * @ingroup Parser
 */
class MWTidy {

	/**
	 * Interface with html tidy, used if $wgUseTidy = true.
	 * If tidy isn't able to correct the markup, the original will be
	 * returned in all its glory with a warning comment appended.
	 *
	 * @param $text String: hideous HTML input
	 * @return String: corrected HTML output
	 */
	public static function tidy( $text ) {
		global $wgTidyInternal;

		// Wrap the fragment in a minimal XHTML document before handing it to tidy.
		$wrappedtext = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"'.
' "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"><html>'.
'<head><title>test</title></head><body>'.$text.'</body></html>';

		if( $wgTidyInternal ) {
			$correctedtext = self::execInternalTidy( $wrappedtext );
		} else {
			$correctedtext = self::execExternalTidy( $wrappedtext );
		}
		if( is_null( $correctedtext ) ) {
			// Both back ends signal "could not produce output" with null;
			// fall back to the untouched input plus a marker comment.
			wfDebug( "Tidy error detected!\n" );
			return $text . "\n<!-- Tidy found serious XHTML errors -->\n";
		}

		return $correctedtext;
	}

	/**
	 * Check HTML for errors, used if $wgValidateAllHtml = true.
	 *
	 * The text is passed to tidy as-is (it is not wrapped in a document
	 * skeleton the way tidy() does).
	 *
	 * @param $text String
	 * @param &$errorStr String: return the error string
	 * @return Boolean: whether the HTML is valid. True when the status is 0,
	 *  or when the status is negative (tidy could not be run) and no error
	 *  text was collected.
	 */
	public static function checkErrors( $text, &$errorStr = null ) {
		global $wgTidyInternal;

		$retval = 0;
		if( $wgTidyInternal ) {
			$errorStr = self::execInternalTidy( $text, true, $retval );
		} else {
			$errorStr = self::execExternalTidy( $text, true, $retval );
		}
		// Loose comparison on purpose: $errorStr may be null rather than ''
		// (execInternalTidy() returns null when the tidy class is missing).
		return ( $retval < 0 && $errorStr == '' ) || $retval == 0;
	}

	/**
	 * Spawn an external HTML tidy process and get corrected markup back from it.
	 * Also called in OutputHandler.php for full page validation
	 *
	 * @param $text String: HTML to check
	 * @param $stderr Boolean: Whether to read from STDERR rather than STDOUT
	 * @param &$retval Exit code (-1 on internal error)
	 * @return mixed String or null. Null is returned only when reading
	 *  STDOUT, nothing was read, and $text was not empty.
	 */
	private static function execExternalTidy( $text, $stderr = false, &$retval = null ) {
		global $wgTidyConf, $wgTidyBin, $wgTidyOpts;
		wfProfileIn( __METHOD__ );

		$cleansource = '';
		$opts = ' -utf8';

		// Capture exactly one of stdout/stderr through a pipe and send the
		// other one to the null device.
		if( $stderr ) {
			$descriptorspec = array(
				0 => array( 'pipe', 'r' ),
				1 => array( 'file', wfGetNull(), 'a' ),
				2 => array( 'pipe', 'w' )
			);
		} else {
			$descriptorspec = array(
				0 => array( 'pipe', 'r' ),
				1 => array( 'pipe', 'w' ),
				2 => array( 'file', wfGetNull(), 'a' )
			);
		}

		$readpipe = $stderr ? 2 : 1;
		$pipes = array();

		if( function_exists( 'proc_open' ) ) {
			// NOTE(review): the command line is assembled from configuration
			// globals without shell escaping; they must hold trusted values.
			$process = proc_open( "$wgTidyBin -config $wgTidyConf $wgTidyOpts$opts", $descriptorspec, $pipes );
			if ( is_resource( $process ) ) {
				// Theoretically, this style of communication could cause a deadlock
				// here. If the stdout buffer fills up, then writes to stdin could
				// block. This doesn't appear to happen with tidy, because tidy only
				// writes to stdout after it's finished reading from stdin. Search
				// for tidyParseStdin and tidySaveStdout in console/tidy.c
				fwrite( $pipes[0], $text );
				fclose( $pipes[0] );
				while ( !feof( $pipes[$readpipe] ) ) {
					$cleansource .= fgets( $pipes[$readpipe], 1024 );
				}
				fclose( $pipes[$readpipe] );
				$retval = proc_close( $process );
			} else {
				// The process could not be started.
				$retval = -1;
			}
		} else {
			// proc_open() is not available in this PHP build/configuration.
			$retval = -1;
		}

		if( !$stderr && $cleansource == '' && $text != '' ) {
			// Some kind of error happened, so we couldn't get the corrected text.
			// Just give up; we'll use the source text and append a warning.
			$cleansource = null;
		}
		wfProfileOut( __METHOD__ );
		return $cleansource;
	}

	/**
	 * Use the HTML tidy PECL extension to use the tidy library in-process,
	 * saving the overhead of spawning a new process.
	 *
	 * 'pecl install tidy' should be able to compile the extension module.
	 *
	 * @param $text String: HTML to check
	 * @param $stderr Boolean: Whether to return tidy's error buffer rather
	 *  than the repaired markup
	 * @param &$retval Status reported by tidy (-1 if the tidy class is not
	 *  available)
	 * @return mixed String or null. Null is returned when the tidy class is
	 *  unavailable or, in repair mode, when tidy reports a fatal error, so
	 *  that callers can detect the failure with is_null().
	 */
	private static function execInternalTidy( $text, $stderr = false, &$retval = null ) {
		global $wgTidyConf, $wgDebugTidy;
		wfProfileIn( __METHOD__ );

		if ( !class_exists( 'tidy' ) ) {
			// Without the extension "new tidy" would be a fatal PHP error;
			// report an internal failure the same way the external path does.
			wfDebug( "Unable to load internal tidy class.\n" );
			$retval = -1;
			wfProfileOut( __METHOD__ );
			return null;
		}

		$tidy = new tidy;
		$tidy->parseString( $text, $wgTidyConf, 'utf8' );

		if( $stderr ) {
			$retval = $tidy->getStatus();
			wfProfileOut( __METHOD__ );
			return $tidy->errorBuffer;
		}

		$tidy->cleanRepair();
		$retval = $tidy->getStatus();
		if( $retval == 2 ) {
			// 2 is magic number for fatal error
			// http://www.php.net/manual/en/function.tidy-get-status.php
			$cleansource = null;
		} else {
			$cleansource = tidy_get_output( $tidy );
			// Only decorate real output with the debug report. Appending to
			// the null above would turn it into a string and hide the fatal
			// error from callers that test the result with is_null().
			if ( $wgDebugTidy && $retval > 0 ) {
				$cleansource .= "<!--\nTidy reports:\n" .
					str_replace( '-->', '--&gt;', $tidy->errorBuffer ) .
					"\n-->";
			}
		}

		wfProfileOut( __METHOD__ );
		return $cleansource;
	}

}