summaryrefslogtreecommitdiff
path: root/includes/HistoryBlob.php
diff options
context:
space:
mode:
authorPierre Schmitz <pierre@archlinux.de>2009-02-22 13:37:51 +0100
committerPierre Schmitz <pierre@archlinux.de>2009-02-22 13:37:51 +0100
commitb9b85843572bf283f48285001e276ba7e61b63f6 (patch)
tree4c6f4571552ada9ccfb4030481dcf77308f8b254 /includes/HistoryBlob.php
parentd9a20acc4e789cca747ad360d87ee3f3e7aa58c1 (diff)
updated to MediaWiki 1.14.0
Diffstat (limited to 'includes/HistoryBlob.php')
-rw-r--r--includes/HistoryBlob.php444
1 files changed, 358 insertions, 86 deletions
diff --git a/includes/HistoryBlob.php b/includes/HistoryBlob.php
index 3772926d..664ceb4f 100644
--- a/includes/HistoryBlob.php
+++ b/includes/HistoryBlob.php
@@ -1,41 +1,33 @@
<?php
/**
- * Pure virtual parent
- * @todo document (needs a one-sentence top-level class description, that answers the question: "what is a HistoryBlob?")
+ * Base class for general text storage via the "object" flag in old_flags, or
+ * two-part external storage URLs. Used for represent efficient concatenated
+ * storage, and migration-related pointer objects.
*/
interface HistoryBlob
{
/**
- * setMeta and getMeta currently aren't used for anything, I just thought
- * they might be useful in the future.
- * @param $meta String: a single string.
- */
- public function setMeta( $meta );
-
- /**
- * setMeta and getMeta currently aren't used for anything, I just thought
- * they might be useful in the future.
- * Gets the meta-value
- */
- public function getMeta();
-
- /**
* Adds an item of text, returns a stub object which points to the item.
* You must call setLocation() on the stub object before storing it to the
* database
+ * Returns the key for getItem()
*/
public function addItem( $text );
/**
- * Get item by hash
+ * Get item by key, or false if the key is not present
*/
- public function getItem( $hash );
+ public function getItem( $key );
- # Set the "default text"
- # This concept is an odd property of the current DB schema, whereby each text item has a revision
- # associated with it. The default text is the text of the associated revision. There may, however,
- # be other revisions in the same object
+ /**
+ * Set the "default text"
+ * This concept is an odd property of the current DB schema, whereby each text item has a revision
+ * associated with it. The default text is the text of the associated revision. There may, however,
+ * be other revisions in the same object.
+ *
+ * Default text is not required for two-part external storage URLs.
+ */
public function setText( $text );
/**
@@ -45,13 +37,15 @@ interface HistoryBlob
}
/**
- * The real object
- * @todo document (needs one-sentence top-level class description + function descriptions).
+ * Concatenated gzip (CGZ) storage
+ * Improves compression ratio by concatenating like objects before gzipping
*/
class ConcatenatedGzipHistoryBlob implements HistoryBlob
{
public $mVersion = 0, $mCompressed = false, $mItems = array(), $mDefaultHash = '';
- public $mFast = 0, $mSize = 0;
+ public $mSize = 0;
+ public $mMaxSize = 10000000;
+ public $mMaxCount = 100;
/** Constructor */
public function ConcatenatedGzipHistoryBlob() {
@@ -60,34 +54,16 @@ class ConcatenatedGzipHistoryBlob implements HistoryBlob
}
}
- #
- # HistoryBlob implementation:
- #
-
- /** @todo document */
- public function setMeta( $metaData ) {
- $this->uncompress();
- $this->mItems['meta'] = $metaData;
- }
-
- /** @todo document */
- public function getMeta() {
- $this->uncompress();
- return $this->mItems['meta'];
- }
-
- /** @todo document */
public function addItem( $text ) {
$this->uncompress();
$hash = md5( $text );
- $this->mItems[$hash] = $text;
- $this->mSize += strlen( $text );
-
- $stub = new HistoryBlobStub( $hash );
- return $stub;
+ if ( !isset( $this->mItems[$hash] ) ) {
+ $this->mItems[$hash] = $text;
+ $this->mSize += strlen( $text );
+ }
+ return $hash;
}
- /** @todo document */
public function getItem( $hash ) {
$this->uncompress();
if ( array_key_exists( $hash, $this->mItems ) ) {
@@ -97,29 +73,27 @@ class ConcatenatedGzipHistoryBlob implements HistoryBlob
}
}
- /** @todo document */
public function setText( $text ) {
$this->uncompress();
- $stub = $this->addItem( $text );
- $this->mDefaultHash = $stub->mHash;
+ $this->mDefaultHash = $this->addItem( $text );
}
- /** @todo document */
public function getText() {
$this->uncompress();
return $this->getItem( $this->mDefaultHash );
}
- # HistoryBlob implemented.
-
-
- /** @todo document */
+ /**
+ * Remove an item
+ */
public function removeItem( $hash ) {
$this->mSize -= strlen( $this->mItems[$hash] );
unset( $this->mItems[$hash] );
}
- /** @todo document */
+ /**
+ * Compress the bulk data in the object
+ */
public function compress() {
if ( !$this->mCompressed ) {
$this->mItems = gzdeflate( serialize( $this->mItems ) );
@@ -127,7 +101,9 @@ class ConcatenatedGzipHistoryBlob implements HistoryBlob
}
}
- /** @todo document */
+ /**
+ * Uncompress bulk data
+ */
public function uncompress() {
if ( $this->mCompressed ) {
$this->mItems = unserialize( gzinflate( $this->mItems ) );
@@ -136,39 +112,22 @@ class ConcatenatedGzipHistoryBlob implements HistoryBlob
}
- /** @todo document */
function __sleep() {
$this->compress();
return array( 'mVersion', 'mCompressed', 'mItems', 'mDefaultHash' );
}
- /** @todo document */
function __wakeup() {
$this->uncompress();
}
/**
- * Determines if this object is happy
+ * Helper function for compression jobs
+ * Returns true until the object is "full" and ready to be committed
*/
- public function isHappy( $maxFactor, $factorThreshold ) {
- if ( count( $this->mItems ) == 0 ) {
- return true;
- }
- if ( !$this->mFast ) {
- $this->uncompress();
- $record = serialize( $this->mItems );
- $size = strlen( $record );
- $avgUncompressed = $size / count( $this->mItems );
- $compressed = strlen( gzdeflate( $record ) );
-
- if ( $compressed < $factorThreshold * 1024 ) {
- return true;
- } else {
- return $avgUncompressed * $maxFactor < $compressed;
- }
- } else {
- return count( $this->mItems ) <= 10;
- }
+ public function isHappy() {
+ return $this->mSize < $this->mMaxSize
+ && count( $this->mItems ) < $this->mMaxCount;
}
}
@@ -184,12 +143,15 @@ $wgBlobCache = array();
/**
- * @todo document (needs one-sentence top-level class description + some function descriptions).
+ * Pointer object for an item within a CGZ blob stored in the text table.
*/
class HistoryBlobStub {
var $mOldId, $mHash, $mRef;
- /** @todo document */
+ /**
+ * @param string $hash The content hash of the text
+ * @param integer $oldid The old_id for the CGZ object
+ */
function HistoryBlobStub( $hash = '', $oldid = 0 ) {
$this->mHash = $hash;
}
@@ -216,7 +178,6 @@ class HistoryBlobStub {
return $this->mRef;
}
- /** @todo document */
function getText() {
$fname = 'HistoryBlobStub::getText';
global $wgBlobCache;
@@ -264,7 +225,9 @@ class HistoryBlobStub {
return $obj->getItem( $this->mHash );
}
- /** @todo document */
+ /**
+ * Get the content hash
+ */
function getHash() {
return $this->mHash;
}
@@ -282,7 +245,9 @@ class HistoryBlobStub {
class HistoryBlobCurStub {
var $mCurId;
- /** @todo document */
+ /**
+ * @param integer $curid The cur_id pointed to
+ */
function HistoryBlobCurStub( $curid = 0 ) {
$this->mCurId = $curid;
}
@@ -295,7 +260,6 @@ class HistoryBlobCurStub {
$this->mCurId = $id;
}
- /** @todo document */
function getText() {
$dbr = wfGetDB( DB_SLAVE );
$row = $dbr->selectRow( 'cur', array( 'cur_text' ), array( 'cur_id' => $this->mCurId ) );
@@ -305,3 +269,311 @@ class HistoryBlobCurStub {
return $row->cur_text;
}
}
+
+/**
+ * Diff-based history compression
+ * Requires xdiff 1.5+ and zlib
+ */
+class DiffHistoryBlob implements HistoryBlob {
+ /** Uncompressed item cache */
+ var $mItems = array();
+
+ /** Total uncompressed size */
+ var $mSize = 0;
+
+ /**
+ * Array of diffs. If a diff D from A to B is notated D = B - A, and Z is
+ * an empty string:
+ *
+ * { item[map[i]] - item[map[i-1]] where i > 0
+ * diff[i] = {
+ * { item[map[i]] - Z where i = 0
+ */
+ var $mDiffs;
+
+ /** The diff map, see above */
+ var $mDiffMap;
+
+ /**
+ * The key for getText()
+ */
+ var $mDefaultKey;
+
+ /**
+ * Compressed storage
+ */
+ var $mCompressed;
+
+ /**
+ * True if the object is locked against further writes
+ */
+ var $mFrozen = false;
+
+ /**
+ * The maximum uncompressed size before the object becomes sad
+ * Should be less than max_allowed_packet
+ */
+ var $mMaxSize = 10000000;
+
+ /**
+ * The maximum number of text items before the object becomes sad
+ */
+ var $mMaxCount = 100;
+
+ /** Constants from xdiff.h */
+ const XDL_BDOP_INS = 1;
+ const XDL_BDOP_CPY = 2;
+ const XDL_BDOP_INSB = 3;
+
+ function __construct() {
+ if ( !function_exists( 'gzdeflate' ) ) {
+ throw new MWException( "Need zlib support to read or write DiffHistoryBlob\n" );
+ }
+ }
+
+ function addItem( $text ) {
+ if ( $this->mFrozen ) {
+ throw new MWException( __METHOD__.": Cannot add more items after sleep/wakeup" );
+ }
+
+ $this->mItems[] = $text;
+ $this->mSize += strlen( $text );
+ $this->mDiffs = null; // later
+ return count( $this->mItems ) - 1;
+ }
+
+ function getItem( $key ) {
+ return $this->mItems[$key];
+ }
+
+ function setText( $text ) {
+ $this->mDefaultKey = $this->addItem( $text );
+ }
+
+ function getText() {
+ return $this->getItem( $this->mDefaultKey );
+ }
+
+ function compress() {
+ if ( !function_exists( 'xdiff_string_rabdiff' ) ){
+ throw new MWException( "Need xdiff 1.5+ support to write DiffHistoryBlob\n" );
+ }
+ if ( isset( $this->mDiffs ) ) {
+ // Already compressed
+ return;
+ }
+ if ( !count( $this->mItems ) ) {
+ // Empty
+ return;
+ }
+
+ // Create two diff sequences: one for main text and one for small text
+ $sequences = array(
+ 'small' => array(
+ 'tail' => '',
+ 'diffs' => array(),
+ 'map' => array(),
+ ),
+ 'main' => array(
+ 'tail' => '',
+ 'diffs' => array(),
+ 'map' => array(),
+ ),
+ );
+ $smallFactor = 0.5;
+
+ for ( $i = 0; $i < count( $this->mItems ); $i++ ) {
+ $text = $this->mItems[$i];
+ if ( $i == 0 ) {
+ $seqName = 'main';
+ } else {
+ $mainTail = $sequences['main']['tail'];
+ if ( strlen( $text ) < strlen( $mainTail ) * $smallFactor ) {
+ $seqName = 'small';
+ } else {
+ $seqName = 'main';
+ }
+ }
+ $seq =& $sequences[$seqName];
+ $tail = $seq['tail'];
+ $diff = $this->diff( $tail, $text );
+ $seq['diffs'][] = $diff;
+ $seq['map'][] = $i;
+ $seq['tail'] = $text;
+ }
+ unset( $seq ); // unlink dangerous alias
+
+ // Knit the sequences together
+ $tail = '';
+ $this->mDiffs = array();
+ $this->mDiffMap = array();
+ foreach ( $sequences as $seq ) {
+ if ( !count( $seq['diffs'] ) ) {
+ continue;
+ }
+ if ( $tail === '' ) {
+ $this->mDiffs[] = $seq['diffs'][0];
+ } else {
+ $head = $this->patch( '', $seq['diffs'][0] );
+ $this->mDiffs[] = $this->diff( $tail, $head );
+ }
+ $this->mDiffMap[] = $seq['map'][0];
+ for ( $i = 1; $i < count( $seq['diffs'] ); $i++ ) {
+ $this->mDiffs[] = $seq['diffs'][$i];
+ $this->mDiffMap[] = $seq['map'][$i];
+ }
+ $tail = $seq['tail'];
+ }
+ }
+
+ function diff( $t1, $t2 ) {
+ # Need to do a null concatenation with warnings off, due to bugs in the current version of xdiff
+ # "String is not zero-terminated"
+ wfSuppressWarnings();
+ $diff = xdiff_string_rabdiff( $t1, $t2 ) . '';
+ wfRestoreWarnings();
+ return $diff;
+ }
+
+ function patch( $base, $diff ) {
+ if ( function_exists( 'xdiff_string_bpatch' ) ) {
+ wfSuppressWarnings();
+ $text = xdiff_string_bpatch( $base, $diff ) . '';
+ wfRestoreWarnings();
+ return $text;
+ }
+
+ # Pure PHP implementation
+
+ $header = unpack( 'Vofp/Vcsize', substr( $diff, 0, 8 ) );
+
+ # Check the checksum if mhash is available
+ if ( extension_loaded( 'mhash' ) ) {
+ $ofp = mhash( MHASH_ADLER32, $base );
+ if ( $ofp !== substr( $diff, 0, 4 ) ) {
+ wfDebug( __METHOD__. ": incorrect base checksum\n" );
+ return false;
+ }
+ }
+ if ( $header['csize'] != strlen( $base ) ) {
+ wfDebug( __METHOD__. ": incorrect base length\n" );
+ return false;
+ }
+
+ $p = 8;
+ $out = '';
+ while ( $p < strlen( $diff ) ) {
+ $x = unpack( 'Cop', substr( $diff, $p, 1 ) );
+ $op = $x['op'];
+ ++$p;
+ switch ( $op ) {
+ case self::XDL_BDOP_INS:
+ $x = unpack( 'Csize', substr( $diff, $p, 1 ) );
+ $p++;
+ $out .= substr( $diff, $p, $x['size'] );
+ $p += $x['size'];
+ break;
+ case self::XDL_BDOP_INSB:
+ $x = unpack( 'Vcsize', substr( $diff, $p, 4 ) );
+ $p += 4;
+ $out .= substr( $diff, $p, $x['csize'] );
+ $p += $x['csize'];
+ break;
+ case self::XDL_BDOP_CPY:
+ $x = unpack( 'Voff/Vcsize', substr( $diff, $p, 8 ) );
+ $p += 8;
+ $out .= substr( $base, $x['off'], $x['csize'] );
+ break;
+ default:
+ wfDebug( __METHOD__.": invalid op\n" );
+ return false;
+ }
+ }
+ return $out;
+ }
+
+ function uncompress() {
+ if ( !$this->mDiffs ) {
+ return;
+ }
+ $tail = '';
+ for ( $diffKey = 0; $diffKey < count( $this->mDiffs ); $diffKey++ ) {
+ $textKey = $this->mDiffMap[$diffKey];
+ $text = $this->patch( $tail, $this->mDiffs[$diffKey] );
+ $this->mItems[$textKey] = $text;
+ $tail = $text;
+ }
+ }
+
+ function __sleep() {
+ $this->compress();
+ if ( !count( $this->mItems ) ) {
+ // Empty object
+ $info = false;
+ } else {
+ // Take forward differences to improve the compression ratio for sequences
+ $map = '';
+ $prev = 0;
+ foreach ( $this->mDiffMap as $i ) {
+ if ( $map !== '' ) {
+ $map .= ',';
+ }
+ $map .= $i - $prev;
+ $prev = $i;
+ }
+ $info = array(
+ 'diffs' => $this->mDiffs,
+ 'map' => $map
+ );
+ }
+ if ( isset( $this->mDefaultKey ) ) {
+ $info['default'] = $this->mDefaultKey;
+ }
+ $this->mCompressed = gzdeflate( serialize( $info ) );
+ return array( 'mCompressed' );
+ }
+
+ function __wakeup() {
+ // addItem() doesn't work if mItems is partially filled from mDiffs
+ $this->mFrozen = true;
+ $info = unserialize( gzinflate( $this->mCompressed ) );
+ unset( $this->mCompressed );
+
+ if ( !$info ) {
+ // Empty object
+ return;
+ }
+
+ if ( isset( $info['default'] ) ) {
+ $this->mDefaultKey = $info['default'];
+ }
+ $this->mDiffs = $info['diffs'];
+ if ( isset( $info['base'] ) ) {
+ // Old format
+ $this->mDiffMap = range( 0, count( $this->mDiffs ) - 1 );
+ array_unshift( $this->mDiffs,
+ pack( 'VVCV', 0, 0, self::XDL_BDOP_INSB, strlen( $info['base'] ) ) .
+ $info['base'] );
+ } else {
+ // New format
+ $map = explode( ',', $info['map'] );
+ $cur = 0;
+ $this->mDiffMap = array();
+ foreach ( $map as $i ) {
+ $cur += $i;
+ $this->mDiffMap[] = $cur;
+ }
+ }
+ $this->uncompress();
+ }
+
+ /**
+ * Helper function for compression jobs
+ * Returns true until the object is "full" and ready to be committed
+ */
+ function isHappy() {
+ return $this->mSize < $this->mMaxSize
+ && count( $this->mItems ) < $this->mMaxCount;
+ }
+
+}