summaryrefslogtreecommitdiff
path: root/includes/ParserXML.php
diff options
context:
space:
mode:
Diffstat (limited to 'includes/ParserXML.php')
-rw-r--r--includes/ParserXML.php643
1 files changed, 643 insertions, 0 deletions
diff --git a/includes/ParserXML.php b/includes/ParserXML.php
new file mode 100644
index 00000000..e7b64f6e
--- /dev/null
+++ b/includes/ParserXML.php
@@ -0,0 +1,643 @@
+<?php
+/**
+ *
+ * @package MediaWiki
+ * @subpackage Experimental
+ */
+
+/** */
+require_once ('Parser.php');
+
+/**
+ * This should one day become the XML->(X)HTML parser
+ * Based on work by Jan Hidders and Magnus Manske
+ * To use, set
+ * $wgUseXMLparser = true ;
+ * $wgEnableParserCache = false ;
+ * $wgWiki2xml to the path and executable of the command line version (cli)
+ * in LocalSettings.php
+ * @package MediaWiki
+ * @subpackage Experimental
+ */
+
+/**
+ * the base class for an element
+ * @package MediaWiki
+ * @subpackage Experimental
+ */
+class element {
+ var $name = '';
+ var $attrs = array ();
+ var $children = array ();
+
+ /**
+ * This finds the ATTRS element and returns the ATTR sub-children as a single string
+ * @todo FIXME $parser always empty when calling makeXHTML()
+ */
+ function getSourceAttrs() {
+ $ret = '';
+ foreach ($this->children as $child) {
+ if (!is_string($child) AND $child->name == 'ATTRS') {
+ $ret = $child->makeXHTML($parser);
+ }
+ }
+ return $ret;
+ }
+
+ /**
+ * This collects the ATTR thingies for getSourceAttrs()
+ */
+ function getTheseAttrs() {
+ $ret = array ();
+ foreach ($this->children as $child) {
+ if (!is_string($child) AND $child->name == 'ATTR') {
+ $ret[] = $child->attrs["NAME"]."='".$child->children[0]."'";
+ }
+ }
+ return implode(' ', $ret);
+ }
+
+ function fixLinkTails(& $parser, $key) {
+ $k2 = $key +1;
+ if (!isset ($this->children[$k2]))
+ return;
+ if (!is_string($this->children[$k2]))
+ return;
+ if (is_string($this->children[$key]))
+ return;
+ if ($this->children[$key]->name != "LINK")
+ return;
+
+ $n = $this->children[$k2];
+ $s = '';
+ while ($n != '' AND (($n[0] >= 'a' AND $n[0] <= 'z') OR $n[0] == 'ä' OR $n[0] == 'ö' OR $n[0] == 'ü' OR $n[0] == 'ß')) {
+ $s .= $n[0];
+ $n = substr($n, 1);
+ }
+ $this->children[$k2] = $n;
+
+ if (count($this->children[$key]->children) > 1) {
+ $kl = array_keys($this->children[$key]->children);
+ $kl = array_pop($kl);
+ $this->children[$key]->children[$kl]->children[] = $s;
+ } else {
+ $e = new element;
+ $e->name = "LINKOPTION";
+ $t = $this->children[$key]->sub_makeXHTML($parser);
+ $e->children[] = trim($t).$s;
+ $this->children[$key]->children[] = $e;
+ }
+ }
+
+ /**
+ * This function generates the XHTML for the entire subtree
+ */
+ function sub_makeXHTML(& $parser, $tag = '', $attr = '') {
+ $ret = '';
+
+ $attr2 = $this->getSourceAttrs();
+ if ($attr != '' AND $attr2 != '')
+ $attr .= ' ';
+ $attr .= $attr2;
+
+ if ($tag != '') {
+ $ret .= '<'.$tag;
+ if ($attr != '')
+ $ret .= ' '.$attr;
+ $ret .= '>';
+ }
+
+ # THIS SHOULD BE DONE IN THE WIKI2XML-PARSER INSTEAD
+ # foreach ( array_keys ( $this->children ) AS $x )
+ # $this->fixLinkTails ( $parser , $x ) ;
+
+ foreach ($this->children as $child) {
+ if (is_string($child)) {
+ $ret .= $child;
+ } elseif ($child->name != 'ATTRS') {
+ $ret .= $child->makeXHTML($parser);
+ }
+ }
+ if ($tag != '')
+ $ret .= '</'.$tag.">\n";
+ return $ret;
+ }
+
+ /**
+ * Link functions
+ */
+ function createInternalLink(& $parser, $target, $display_title, $options) {
+ global $wgUser;
+ $skin = $wgUser->getSkin();
+ $tp = explode(':', $target); # tp = target parts
+ $title = ''; # The plain title
+ $language = ''; # The language/meta/etc. part
+ $namespace = ''; # The namespace, if any
+ $subtarget = ''; # The '#' thingy
+
+ $nt = Title :: newFromText($target);
+ $fl = strtoupper($this->attrs['FORCEDLINK']) == 'YES';
+
+ if ($fl || count($tp) == 1) {
+ # Plain and simple case
+ $title = $target;
+ } else {
+ # There's stuff missing here...
+ if ($nt->getNamespace() == NS_IMAGE) {
+ $options[] = $display_title;
+ return $parser->makeImage($nt, implode('|', $options));
+ } else {
+ # Default
+ $title = $target;
+ }
+ }
+
+ if ($language != '') {
+ # External link within the WikiMedia project
+ return "{language link}";
+ } else {
+ if ($namespace != '') {
+ # Link to another namespace, check for image/media stuff
+ return "{namespace link}";
+ } else {
+ return $skin->makeLink($target, $display_title);
+ }
+ }
+ }
+
+ /** @todo document */
+ function makeInternalLink(& $parser) {
+ $target = '';
+ $option = array ();
+ foreach ($this->children as $child) {
+ if (is_string($child)) {
+ # This shouldn't be the case!
+ } else {
+ if ($child->name == 'LINKTARGET') {
+ $target = trim($child->makeXHTML($parser));
+ } else {
+ $option[] = trim($child->makeXHTML($parser));
+ }
+ }
+ }
+
+ if (count($option) == 0)
+ $option[] = $target; # Create dummy display title
+ $display_title = array_pop($option);
+ return $this->createInternalLink($parser, $target, $display_title, $option);
+ }
+
+ /** @todo document */
+ function getTemplateXHTML($title, $parts, & $parser) {
+ global $wgLang, $wgUser;
+ $skin = $wgUser->getSkin();
+ $ot = $title; # Original title
+ if (count(explode(':', $title)) == 1)
+ $title = $wgLang->getNsText(NS_TEMPLATE).":".$title;
+ $nt = Title :: newFromText($title);
+ $id = $nt->getArticleID();
+ if ($id == 0) {
+ # No/non-existing page
+ return $skin->makeBrokenLink($title, $ot);
+ }
+
+ $a = 0;
+ $tv = array (); # Template variables
+ foreach ($parts AS $part) {
+ $a ++;
+ $x = explode('=', $part, 2);
+ if (count($x) == 1)
+ $key = "{$a}";
+ else
+ $key = $x[0];
+ $value = array_pop($x);
+ $tv[$key] = $value;
+ }
+ $art = new Article($nt);
+ $text = $art->getContent(false);
+ $parser->plain_parse($text, true, $tv);
+
+ return $text;
+ }
+
+ /**
+ * This function actually converts wikiXML into XHTML tags
+ * @todo use switch() !
+ */
+ function makeXHTML(& $parser) {
+ $ret = '';
+ $n = $this->name; # Shortcut
+
+ if ($n == 'EXTENSION') {
+ # Fix allowed HTML
+ $old_n = $n;
+ $ext = strtoupper($this->attrs['NAME']);
+
+ switch($ext) {
+ case 'B':
+ case 'STRONG':
+ $n = 'BOLD';
+ break;
+ case 'I':
+ case 'EM':
+ $n = 'ITALICS';
+ break;
+ case 'U':
+ $n = 'UNDERLINED'; # Hey, virtual wiki tag! ;-)
+ break;
+ case 'S':
+ $n = 'STRIKE';
+ break;
+ case 'P':
+ $n = 'PARAGRAPH';
+ break;
+ case 'TABLE':
+ $n = 'TABLE';
+ break;
+ case 'TR':
+ $n = 'TABLEROW';
+ break;
+ case 'TD':
+ $n = 'TABLECELL';
+ break;
+ case 'TH':
+ $n = 'TABLEHEAD';
+ break;
+ case 'CAPTION':
+ $n = 'CAPTION';
+ break;
+ case 'NOWIKI':
+ $n = 'NOWIKI';
+ break;
+ }
+ if ($n != $old_n) {
+ unset ($this->attrs['NAME']); # Cleanup
+ } elseif ($parser->nowiki > 0) {
+ # No 'real' wiki tags allowed in nowiki section
+ $n = '';
+ }
+ } // $n = 'EXTENSION'
+
+ switch($n) {
+ case 'ARTICLE':
+ $ret .= $this->sub_makeXHTML($parser);
+ break;
+ case 'HEADING':
+ $ret .= $this->sub_makeXHTML($parser, 'h'.$this->attrs['LEVEL']);
+ break;
+ case 'PARAGRAPH':
+ $ret .= $this->sub_makeXHTML($parser, 'p');
+ break;
+ case 'BOLD':
+ $ret .= $this->sub_makeXHTML($parser, 'strong');
+ break;
+ case 'ITALICS':
+ $ret .= $this->sub_makeXHTML($parser, 'em');
+ break;
+
+ # These don't exist as wiki markup
+ case 'UNDERLINED':
+ $ret .= $this->sub_makeXHTML($parser, 'u');
+ break;
+ case 'STRIKE':
+ $ret .= $this->sub_makeXHTML($parser, 'strike');
+ break;
+
+ # HTML comment
+ case 'COMMENT':
+ # Comments are parsed out
+ $ret .= '';
+ break;
+
+
+ # Links
+ case 'LINK':
+ $ret .= $this->makeInternalLink($parser);
+ break;
+ case 'LINKTARGET':
+ case 'LINKOPTION':
+ $ret .= $this->sub_makeXHTML($parser);
+ break;
+
+ case 'TEMPLATE':
+ $parts = $this->sub_makeXHTML($parser);
+ $parts = explode('|', $parts);
+ $title = array_shift($parts);
+ $ret .= $this->getTemplateXHTML($title, $parts, & $parser);
+ break;
+
+ case 'TEMPLATEVAR':
+ $x = $this->sub_makeXHTML($parser);
+ if (isset ($parser->mCurrentTemplateOptions["{$x}"]))
+ $ret .= $parser->mCurrentTemplateOptions["{$x}"];
+ break;
+
+ # Internal use, not generated by wiki2xml parser
+ case 'IGNORE':
+ $ret .= $this->sub_makeXHTML($parser);
+
+ case 'NOWIKI':
+ $parser->nowiki++;
+ $ret .= $this->sub_makeXHTML($parser, '');
+ $parser->nowiki--;
+
+
+ # Unknown HTML extension
+ case 'EXTENSION': # This is currently a dummy!!!
+ $ext = $this->attrs['NAME'];
+
+ $ret .= '&lt;'.$ext.'&gt;';
+ $ret .= $this->sub_makeXHTML($parser);
+ $ret .= '&lt;/'.$ext.'&gt; ';
+ break;
+
+
+ # Table stuff
+
+ case 'TABLE':
+ $ret .= $this->sub_makeXHTML($parser, 'table');
+ break;
+ case 'TABLEROW':
+ $ret .= $this->sub_makeXHTML($parser, 'tr');
+ break;
+ case 'TABLECELL':
+ $ret .= $this->sub_makeXHTML($parser, 'td');
+ break;
+ case 'TABLEHEAD':
+ $ret .= $this->sub_makeXHTML($parser, 'th');
+ break;
+ case 'CAPTION':
+ $ret .= $this->sub_makeXHTML($parser, 'caption');
+ break;
+ case 'ATTRS': # SPECIAL CASE : returning attributes
+ return $this->getTheseAttrs();
+
+
+ # Lists stuff
+ case 'LISTITEM':
+ if ($parser->mListType == 'dl')
+ $ret .= $this->sub_makeXHTML($parser, 'dd');
+ else
+ $ret .= $this->sub_makeXHTML($parser, 'li');
+ break;
+ case 'LIST':
+ $type = 'ol'; # Default
+ if ($this->attrs['TYPE'] == 'bullet')
+ $type = 'ul';
+ else
+ if ($this->attrs['TYPE'] == 'indent')
+ $type = 'dl';
+ $oldtype = $parser->mListType;
+ $parser->mListType = $type;
+ $ret .= $this->sub_makeXHTML($parser, $type);
+ $parser->mListType = $oldtype;
+ break;
+
+ # Something else entirely
+ default:
+ $ret .= '&lt;'.$n.'&gt;';
+ $ret .= $this->sub_makeXHTML($parser);
+ $ret .= '&lt;/'.$n.'&gt; ';
+ } // switch($n)
+
+ $ret = "\n{$ret}\n";
+ $ret = str_replace("\n\n", "\n", $ret);
+ return $ret;
+ }
+
+ /**
+ * A function for additional debugging output
+ */
+ function myPrint() {
+ $ret = "<ul>\n";
+ $ret .= "<li> <b> Name: </b> $this->name </li>\n";
+ // print attributes
+ $ret .= '<li> <b> Attributes: </b>';
+ foreach ($this->attrs as $name => $value) {
+ $ret .= "$name => $value; ";
+ }
+ $ret .= " </li>\n";
+ // print children
+ foreach ($this->children as $child) {
+ if (is_string($child)) {
+ $ret .= "<li> $child </li>\n";
+ } else {
+ $ret .= $child->myPrint();
+ }
+ }
+ $ret .= "</ul>\n";
+ return $ret;
+ }
+}
+
+$ancStack = array (); // the stack with ancestral elements
+
+// START Three global functions needed for parsing, sorry guys
+/** @todo document */
+function wgXMLstartElement($parser, $name, $attrs) {
+ global $ancStack;
+
+ $newElem = new element;
+ $newElem->name = $name;
+ $newElem->attrs = $attrs;
+
+ array_push($ancStack, $newElem);
+}
+
+/** @todo document */
+function wgXMLendElement($parser, $name) {
+ global $ancStack, $rootElem;
+ // pop element off stack
+ $elem = array_pop($ancStack);
+ if (count($ancStack) == 0)
+ $rootElem = $elem;
+ else
+ // add it to its parent
+ array_push($ancStack[count($ancStack) - 1]->children, $elem);
+}
+
+/** @todo document */
+function wgXMLcharacterData($parser, $data) {
+ global $ancStack;
+ $data = trim($data); // Don't add blank lines, they're no use...
+ // add to parent if parent exists
+ if ($ancStack && $data != "") {
+ array_push($ancStack[count($ancStack) - 1]->children, $data);
+ }
+}
+// END Three global functions needed for parsing, sorry guys
+
+/**
+ * Here's the class that generates a nice tree
+ * @package MediaWiki
+ * @subpackage Experimental
+ */
+class xml2php {
+
+ /** @todo document */
+ function & scanFile($filename) {
+ global $ancStack, $rootElem;
+ $ancStack = array ();
+
+ $xml_parser = xml_parser_create();
+ xml_set_element_handler($xml_parser, 'wgXMLstartElement', 'wgXMLendElement');
+ xml_set_character_data_handler($xml_parser, 'wgXMLcharacterData');
+ if (!($fp = fopen($filename, 'r'))) {
+ die('could not open XML input');
+ }
+ while ($data = fread($fp, 4096)) {
+ if (!xml_parse($xml_parser, $data, feof($fp))) {
+ die(sprintf("XML error: %s at line %d", xml_error_string(xml_get_error_code($xml_parser)), xml_get_current_line_number($xml_parser)));
+ }
+ }
+ xml_parser_free($xml_parser);
+
+ // return the remaining root element we copied in the beginning
+ return $rootElem;
+ }
+
+ /** @todo document */
+ function scanString($input) {
+ global $ancStack, $rootElem;
+ $ancStack = array ();
+
+ $xml_parser = xml_parser_create();
+ xml_set_element_handler($xml_parser, 'wgXMLstartElement', 'wgXMLendElement');
+ xml_set_character_data_handler($xml_parser, 'wgXMLcharacterData');
+
+ if (!xml_parse($xml_parser, $input, true)) {
+ die(sprintf("XML error: %s at line %d", xml_error_string(xml_get_error_code($xml_parser)), xml_get_current_line_number($xml_parser)));
+ }
+ xml_parser_free($xml_parser);
+
+ // return the remaining root element we copied in the beginning
+ return $rootElem;
+ }
+
+}
+
+/**
+ * @todo document
+ * @package MediaWiki
+ * @subpackage Experimental
+ */
+class ParserXML extends Parser {
+ /**#@+
+ * @private
+ */
+ # Persistent:
+ var $mTagHooks, $mListType;
+
+ # Cleared with clearState():
+ var $mOutput, $mAutonumber, $mDTopen, $mStripState = array ();
+ var $mVariables, $mIncludeCount, $mArgStack, $mLastSection, $mInPre;
+
+ # Temporary:
+ var $mOptions, $mTitle, $mOutputType, $mTemplates, // cache of already loaded templates, avoids
+ // multiple SQL queries for the same string
+ $mTemplatePath; // stores an unsorted hash of all the templates already loaded
+ // in this path. Used for loop detection.
+
+ var $nowikicount, $mCurrentTemplateOptions;
+
+ /**#@-*/
+
+ /**
+ * Constructor
+ *
+ * @public
+ */
+ function ParserXML() {
+ $this->mTemplates = array ();
+ $this->mTemplatePath = array ();
+ $this->mTagHooks = array ();
+ $this->clearState();
+ }
+
+ /**
+ * Clear Parser state
+ *
+ * @private
+ */
+ function clearState() {
+ $this->mOutput = new ParserOutput;
+ $this->mAutonumber = 0;
+ $this->mLastSection = "";
+ $this->mDTopen = false;
+ $this->mVariables = false;
+ $this->mIncludeCount = array ();
+ $this->mStripState = array ();
+ $this->mArgStack = array ();
+ $this->mInPre = false;
+ }
+
+ /**
+ * Turns the wikitext into XML by calling the external parser
+ *
+ */
+ function html2xml(& $text) {
+ global $wgWiki2xml;
+
+ # generating html2xml command path
+ $a = $wgWiki2xml;
+ $a = explode('/', $a);
+ array_pop($a);
+ $a[] = 'html2xml';
+ $html2xml = implode('/', $a);
+ $a = array ();
+
+ $tmpfname = tempnam( wfTempDir(), 'FOO' );
+ $handle = fopen($tmpfname, 'w');
+ fwrite($handle, utf8_encode($text));
+ fclose($handle);
+ exec($html2xml.' < '.$tmpfname, $a);
+ $text = utf8_decode(implode("\n", $a));
+ unlink($tmpfname);
+ }
+
+ /** @todo document */
+ function runXMLparser(& $text) {
+ global $wgWiki2xml;
+
+ $this->html2xml($text);
+
+ $tmpfname = tempnam( wfTempDir(), 'FOO');
+ $handle = fopen($tmpfname, 'w');
+ fwrite($handle, $text);
+ fclose($handle);
+ exec($wgWiki2xml.' < '.$tmpfname, $a);
+ $text = utf8_decode(implode("\n", $a));
+ unlink($tmpfname);
+ }
+
+ /** @todo document */
+ function plain_parse(& $text, $inline = false, $templateOptions = array ()) {
+ $this->runXMLparser($text);
+ $nowikicount = 0;
+ $w = new xml2php;
+ $result = $w->scanString($text);
+
+ $oldTemplateOptions = $this->mCurrentTemplateOptions;
+ $this->mCurrentTemplateOptions = $templateOptions;
+
+ if ($inline) { # Inline rendering off for templates
+ if (count($result->children) == 1)
+ $result->children[0]->name = 'IGNORE';
+ }
+
+ if (1)
+ $text = $result->makeXHTML($this); # No debugging info
+ else
+ $text = $result->makeXHTML($this).'<hr>'.$text.'<hr>'.$result->myPrint();
+ $this->mCurrentTemplateOptions = $oldTemplateOptions;
+ }
+
+ /** @todo document */
+ function parse($text, & $title, $options, $linestart = true, $clearState = true) {
+ $this->plain_parse($text);
+ $this->mOutput->setText($text);
+ return $this->mOutput;
+ }
+
+}
+?>