summary | refs | log | tree | commit | diff
path: root/maintenance/refreshLinks.php
diff options
context:
space:
mode:
Diffstat (limited to 'maintenance/refreshLinks.php')
-rw-r--r-- maintenance/refreshLinks.php 174
1 files changed, 119 insertions, 55 deletions
diff --git a/maintenance/refreshLinks.php b/maintenance/refreshLinks.php
index 0c2f722c..e1b6ac68 100644
--- a/maintenance/refreshLinks.php
+++ b/maintenance/refreshLinks.php
@@ -36,42 +36,51 @@ class RefreshLinks extends Maintenance {
$this->addOption( 'new-only', 'Only affect articles with just a single edit' );
$this->addOption( 'redirects-only', 'Only fix redirects, not all links' );
$this->addOption( 'old-redirects-only', 'Only fix redirects with no redirect table entry' );
- $this->addOption( 'm', 'Maximum replication lag', false, true );
$this->addOption( 'e', 'Last page id to refresh', false, true );
+ $this->addOption( 'dfn-chunk-size', 'Maximum number of existent IDs to check per ' .
+ 'query, default 100000', false, true );
$this->addArg( 'start', 'Page_id to start from, default 1', false );
$this->setBatchSize( 100 );
}
public function execute() {
- $max = $this->getOption( 'm', 0 );
+ // Note that there is a difference between not specifying the start
+ // and end IDs and using the minimum and maximum values from the page
+ // table. In the latter case, deleteLinksFromNonexistent() will not
+ // delete entries for nonexistent IDs that fall outside the range.
+ $start = (int)$this->getArg( 0 ) ?: null;
+ $end = (int)$this->getOption( 'e' ) ?: null;
+ $dfnChunkSize = (int)$this->getOption( 'dfn-chunk-size', 100000 );
if ( !$this->hasOption( 'dfn-only' ) ) {
- $start = $this->getArg( 0, 1 );
$new = $this->getOption( 'new-only', false );
- $end = $this->getOption( 'e', 0 );
$redir = $this->getOption( 'redirects-only', false );
$oldRedir = $this->getOption( 'old-redirects-only', false );
- $this->doRefreshLinks( $start, $new, $max, $end, $redir, $oldRedir );
+ $this->doRefreshLinks( $start, $new, $end, $redir, $oldRedir );
+ $this->deleteLinksFromNonexistent( null, null, $this->mBatchSize, $dfnChunkSize );
+ } else {
+ $this->deleteLinksFromNonexistent( $start, $end, $this->mBatchSize, $dfnChunkSize );
}
- $this->deleteLinksFromNonexistent( $max, $this->mBatchSize );
}
/**
* Do the actual link refreshing.
- * @param int $start Page_id to start from
+ * @param int|null $start Page_id to start from
* @param bool $newOnly Only do pages with 1 edit
- * @param int $maxLag Max DB replication lag
- * @param int $end Page_id to stop at
+ * @param int|null $end Page_id to stop at
* @param bool $redirectsOnly Only fix redirects
* @param bool $oldRedirectsOnly Only fix redirects without redirect entries
*/
- private function doRefreshLinks( $start, $newOnly = false, $maxLag = false,
- $end = 0, $redirectsOnly = false, $oldRedirectsOnly = false
+ private function doRefreshLinks( $start, $newOnly = false,
+ $end = null, $redirectsOnly = false, $oldRedirectsOnly = false
) {
global $wgParser, $wgUseTidy;
$reportingInterval = 100;
$dbr = wfGetDB( DB_SLAVE );
- $start = intval( $start );
+
+ if ( $start === null ) {
+ $start = 1;
+ }
// Give extensions a chance to optimize settings
wfRunHooks( 'MaintenanceRefreshLinksInit', array( $this ) );
@@ -89,15 +98,10 @@ class RefreshLinks extends Maintenance {
$conds = array(
"page_is_redirect=1",
- "rd_from IS NULL"
+ "rd_from IS NULL",
+ self::intervalCond( $dbr, 'page_id', $start, $end ),
);
- if ( $end == 0 ) {
- $conds[] = "page_id >= $start";
- } else {
- $conds[] = "page_id BETWEEN $start AND $end";
- }
-
$res = $dbr->select(
array( 'page', 'redirect' ),
'page_id',
@@ -124,7 +128,8 @@ class RefreshLinks extends Maintenance {
array( 'page_id' ),
array(
'page_is_new' => 1,
- "page_id >= $start" ),
+ self::intervalCond( $dbr, 'page_id', $start, $end ),
+ ),
__METHOD__
);
$num = $res->numRows();
@@ -253,19 +258,60 @@ class RefreshLinks extends Maintenance {
* Removes non-existing links from pages from pagelinks, imagelinks,
* categorylinks, templatelinks, externallinks, interwikilinks, langlinks and redirect tables.
*
- * @param int $maxLag
+ * @param int|null $start Page_id to start from
+ * @param int|null $end Page_id to stop at
* @param int $batchSize The size of deletion batches
+ * @param int $chunkSize Maximum number of existent IDs to check per query
*
* @author Merlijn van Deen <valhallasw@arctus.nl>
*/
- private function deleteLinksFromNonexistent( $maxLag = 0, $batchSize = 100 ) {
+ private function deleteLinksFromNonexistent( $start = null, $end = null, $batchSize = 100,
+ $chunkSize = 100000
+ ) {
wfWaitForSlaves();
+ $this->output( "Deleting illegal entries from the links tables...\n" );
+ $dbr = wfGetDB( DB_SLAVE );
+ do {
+ // Find the start of the next chunk. This is based only
+ // on existent page_ids.
+ $nextStart = $dbr->selectField(
+ 'page',
+ 'page_id',
+ self::intervalCond( $dbr, 'page_id', $start, $end ),
+ __METHOD__,
+ array( 'ORDER BY' => 'page_id', 'OFFSET' => $chunkSize )
+ );
- $dbw = wfGetDB( DB_MASTER );
+ if ( $nextStart !== false ) {
+ // To find the end of the current chunk, subtract one.
+ // This will serve to limit the number of rows scanned in
+ // dfnCheckInterval(), per query, to at most the sum of
+ // the chunk size and deletion batch size.
+ $chunkEnd = $nextStart - 1;
+ } else {
+ // This is the last chunk. Check all page_ids up to $end.
+ $chunkEnd = $end;
+ }
+
+ $fmtStart = $start !== null ? "[$start" : '(-INF';
+ $fmtChunkEnd = $chunkEnd !== null ? "$chunkEnd]" : 'INF)';
+ $this->output( " Checking interval $fmtStart, $fmtChunkEnd\n" );
+ $this->dfnCheckInterval( $start, $chunkEnd, $batchSize );
+
+ $start = $nextStart;
- $lb = wfGetLBFactory()->newMainLB();
- $dbr = $lb->getConnection( DB_SLAVE );
- $dbr->bufferResults( false );
+ } while ( $nextStart !== false );
+ }
+
+ /**
+ * @see RefreshLinks::deleteLinksFromNonexistent()
+ * @param int|null $start Page_id to start from
+ * @param int|null $end Page_id to stop at
+ * @param int $batchSize The size of deletion batches
+ */
+ private function dfnCheckInterval( $start = null, $end = null, $batchSize = 100 ) {
+ $dbw = wfGetDB( DB_MASTER );
+ $dbr = wfGetDB( DB_SLAVE );
$linksTables = array( // table name => page_id field
'pagelinks' => 'pl_from',
@@ -280,40 +326,58 @@ class RefreshLinks extends Maintenance {
);
foreach ( $linksTables as $table => $field ) {
- $this->output( "Retrieving illegal entries from $table... " );
-
- // SELECT DISTINCT( $field ) FROM $table LEFT JOIN page ON $field=page_id WHERE page_id IS NULL;
- $results = $dbr->select(
- array( $table, 'page' ),
- $field,
- array( 'page_id' => null ),
- __METHOD__,
- 'DISTINCT',
- array( 'page' => array( 'LEFT JOIN', "$field=page_id" ) )
- );
-
+ $this->output( " $table: 0" );
$counter = 0;
- $list = array();
- $this->output( "0.." );
- foreach ( $results as $row ) {
- $counter++;
- $list[] = $row->$field;
- if ( ( $counter % $batchSize ) == 0 ) {
+ do {
+ $ids = $dbr->selectFieldValues(
+ $table,
+ $field,
+ array(
+ self::intervalCond( $dbr, $field, $start, $end ),
+ "$field NOT IN ({$dbr->selectSQLText( 'page', 'page_id' )})",
+ ),
+ __METHOD__,
+ array( 'DISTINCT', 'ORDER BY' => $field, 'LIMIT' => $batchSize )
+ );
+
+ $numIds = count( $ids );
+ if ( $numIds ) {
+ $counter += $numIds;
wfWaitForSlaves();
- $dbw->delete( $table, array( $field => $list ), __METHOD__ );
-
- $this->output( $counter . ".." );
- $list = array();
+ $dbw->delete( $table, array( $field => $ids ), __METHOD__ );
+ $this->output( ", $counter" );
+ $start = $ids[$numIds - 1] + 1;
}
- }
- $this->output( $counter );
- if ( count( $list ) > 0 ) {
- $dbw->delete( $table, array( $field => $list ), __METHOD__ );
- }
- $this->output( "\n" );
+
+ } while ( $numIds >= $batchSize && ( $end === null || $start <= $end ) );
+
+ $this->output( " deleted.\n" );
+
wfWaitForSlaves();
}
- $lb->closeAll();
+ }
+
+ /**
+ * Build a SQL expression for a closed interval (i.e. BETWEEN).
+ *
+ * By specifying a null $start or $end, it is also possible to create
+ * half-bounded or unbounded intervals using this function.
+ *
+ * @param IDatabase $db Database connection
+ * @param string $var Field name
+ * @param mixed $start First value to include or null
+ * @param mixed $end Last value to include or null
+ */
+ private static function intervalCond( IDatabase $db, $var, $start, $end ) {
+ if ( $start === null && $end === null ) {
+ return "$var IS NOT NULL";
+ } elseif ( $end === null ) {
+ return "$var >= {$db->addQuotes( $start )}";
+ } elseif ( $start === null ) {
+ return "$var <= {$db->addQuotes( $end )}";
+ } else {
+ return "$var BETWEEN {$db->addQuotes( $start )} AND {$db->addQuotes( $end )}";
+ }
}
}