summary refs log tree commit diff
path: root/maintenance/backupTextPass.inc
diff options
context:
space:
mode:
Diffstat (limited to 'maintenance/backupTextPass.inc')
-rw-r--r-- maintenance/backupTextPass.inc 113
1 files changed, 93 insertions, 20 deletions
diff --git a/maintenance/backupTextPass.inc b/maintenance/backupTextPass.inc
index 5f776373..d83f1fcc 100644
--- a/maintenance/backupTextPass.inc
+++ b/maintenance/backupTextPass.inc
@@ -48,6 +48,8 @@ class TextPassDumper extends BackupDumper {
protected $maxConsecutiveFailedTextRetrievals = 200;
protected $failureTimeout = 5; // Seconds to sleep after db failure
+ protected $bufferSize = 524288; // In bytes (512 KiB). Maximum size to read from the stub in one go.
+
protected $php = "php";
protected $spawn = false;
@@ -186,6 +188,10 @@ class TextPassDumper extends BackupDumper {
$url = $this->processFileOpt( $val, $param );
switch ( $opt ) {
+ case 'buffersize':
+ // Lower bound for xml reading buffer size is 4 KB
+ $this->bufferSize = max( intval( $val ), 4 * 1024 );
+ break;
case 'prefetch':
require_once "$IP/maintenance/backupPrefetch.inc";
$this->prefetch = new BaseDump( $url );
@@ -354,6 +360,8 @@ class TextPassDumper extends BackupDumper {
$this->lastName = "";
$this->thisPage = 0;
$this->thisRev = 0;
+ $this->thisRevModel = null;
+ $this->thisRevFormat = null;
$parser = xml_parser_create( "UTF-8" );
xml_parser_set_option( $parser, XML_OPTION_CASE_FOLDING, false );
@@ -366,12 +374,11 @@ class TextPassDumper extends BackupDumper {
xml_set_character_data_handler( $parser, array( &$this, 'characterData' ) );
$offset = 0; // for context extraction on error reporting
- $bufferSize = 512 * 1024;
do {
if ( $this->checkIfTimeExceeded() ) {
$this->setTimeExceeded();
}
- $chunk = fread( $input, $bufferSize );
+ $chunk = fread( $input, $this->bufferSize );
if ( !xml_parse( $parser, $chunk, feof( $input ) ) ) {
wfDebug( "TextDumpPass::readDump encountered XML parsing error\n" );
@@ -422,7 +429,33 @@ class TextPassDumper extends BackupDumper {
}
/**
+ * Applies the export transformation of the content handler for $model to $text.
+ *
+ * @param string $text Text to transform
+ * @param string $model Content model ID, resolved via ContentHandler::getForModelID()
+ * @param string|null $format Content format passed through to the handler's exportTransform()
+ *
+ * @return string Transformed text, or $text as passed in if an MWException was caught
+ */
+ private function exportTransform( $text, $model, $format = null ) {
+ try {
+ $handler = ContentHandler::getForModelID( $model );
+ $text = $handler->exportTransform( $text, $format );
+ }
+ catch ( MWException $ex ) { // Best effort: report the problem and keep $text as it was
+ $this->progress(
+ "Unable to apply export transformation for content model '$model': " .
+ $ex->getMessage()
+ );
+ }
+
+ return $text;
+ }
+
+ /**
* Tries to get the revision text for a revision id.
+ * Export transformations are applied if the content model is given or can be
+ * determined from the database.
*
* Upon errors, retries (Up to $this->maxFailures tries each call).
* If still no good revision get could be found even after this retrying, "" is returned.
@@ -431,11 +464,14 @@ class TextPassDumper extends BackupDumper {
* is thrown.
*
* @param string $id The revision id to get the text for
+ * @param string|bool|null $model The content model used to determine applicable export transformations.
+ * If $model is null, it will be determined from the database.
+ * @param string|null $format The content format used when applying export transformations.
*
- * @return string The revision text for $id, or ""
* @throws MWException
+ * @return string The revision text for $id, or ""
*/
- function getText( $id ) {
+ function getText( $id, $model = null, $format = null ) {
global $wgContentHandlerUseDB;
$prefetchNotTried = true; // Whether or not we already tried to get the text via prefetch.
@@ -453,6 +489,24 @@ class TextPassDumper extends BackupDumper {
$oldConsecutiveFailedTextRetrievals = $consecutiveFailedTextRetrievals;
$consecutiveFailedTextRetrievals = 0;
+ if ( $model === null && $wgContentHandlerUseDB ) {
+ $row = $this->db->selectRow(
+ 'revision',
+ array( 'rev_content_model', 'rev_content_format' ),
+ array( 'rev_id' => $this->thisRev ),
+ __METHOD__
+ );
+
+ if ( $row ) {
+ $model = $row->rev_content_model;
+ $format = $row->rev_content_format;
+ }
+ }
+
+ if ( $model === null || $model === '' ) {
+ $model = false;
+ }
+
while ( $failures < $this->maxFailures ) {
// As soon as we found a good text for the $id, we will return immediately.
@@ -469,9 +523,19 @@ class TextPassDumper extends BackupDumper {
$tryIsPrefetch = true;
$text = $this->prefetch->prefetch( intval( $this->thisPage ),
intval( $this->thisRev ) );
+
if ( $text === null ) {
$text = false;
}
+
+ if ( is_string( $text ) && $model !== false ) {
+ // Apply export transformation to text coming from an old dump.
+ // The purpose of this transformation is to convert up from legacy
+ // formats, which may still be used in the older dump that is used
+ // for pre-fetching. Applying the transformation again should not
+ // interfere with content that is already in the correct form.
+ $text = $this->exportTransform( $text, $model, $format );
+ }
}
if ( $text === false ) {
@@ -483,6 +547,12 @@ class TextPassDumper extends BackupDumper {
$text = $this->getTextDb( $id );
}
+ if ( $text !== false && $model !== false ) {
+ // Apply export transformation to text coming from the database.
+ // Prefetched text should already have transformations applied.
+ $text = $this->exportTransform( $text, $model, $format );
+ }
+
// No more checks for texts from DB for now.
// If we received something that is not false,
// We treat it as good text, regardless of whether it actually is or is not
@@ -504,21 +574,8 @@ class TextPassDumper extends BackupDumper {
throw new MWException( "No database available" );
}
- $revLength = strlen( $text );
- if ( $wgContentHandlerUseDB ) {
- $row = $this->db->selectRow(
- 'revision',
- array( 'rev_len', 'rev_content_model' ),
- array( 'rev_id' => $revID ),
- __METHOD__
- );
- if ( $row ) {
- // only check the length for the wikitext content handler,
- // it's a wasted (and failed) check otherwise
- if ( $row->rev_content_model == CONTENT_MODEL_WIKITEXT ) {
- $revLength = $row->rev_len;
- }
- }
+ if ( $model !== CONTENT_MODEL_WIKITEXT ) {
+ $revLength = strlen( $text );
} else {
$revLength = $this->db->selectField( 'revision', 'rev_len', array( 'rev_id' => $revID ) );
}
@@ -757,7 +814,14 @@ class TextPassDumper extends BackupDumper {
}
if ( $name == "text" && isset( $attribs['id'] ) ) {
- $text = $this->getText( $attribs['id'] );
+ $id = $attribs['id'];
+ $model = trim( $this->thisRevModel );
+ $format = trim( $this->thisRevFormat );
+
+ $model = $model === '' ? null : $model;
+ $format = $format === '' ? null : $format;
+
+ $text = $this->getText( $id, $model, $format );
$this->openElement = array( $name, array( 'xml:space' => 'preserve' ) );
if ( strlen( $text ) > 0 ) {
$this->characterData( $parser, $text );
@@ -780,6 +844,8 @@ class TextPassDumper extends BackupDumper {
$this->egress->writeRevision( null, $this->buffer );
$this->buffer = "";
$this->thisRev = "";
+ $this->thisRevModel = null;
+ $this->thisRevFormat = null;
} elseif ( $name == 'page' ) {
if ( !$this->firstPageWritten ) {
$this->firstPageWritten = trim( $this->thisPage );
@@ -834,6 +900,13 @@ class TextPassDumper extends BackupDumper {
$this->thisPage .= $data;
}
}
+ elseif ( $this->lastName == "model" ) {
+ $this->thisRevModel .= $data;
+ }
+ elseif ( $this->lastName == "format" ) {
+ $this->thisRevFormat .= $data;
+ }
+
// have to skip the newline left over from closepagetag line of
// end of checkpoint files. nasty hack!!
if ( $this->checkpointJustWritten ) {