This patch moves write (reconstruct and read-modify) operations to a work
queue. Note that the next patch in this series fixes some incorrect
assumptions around having multiple operations in flight (i.e. ignore this
version of 'queue_raid_work').

Signed-off-by: Dan Williams <dan.j.williams@xxxxxxxxx>

 drivers/md/raid5.c         |  314 +++++++++++++++++++++++++++++++++++++++++----
 include/linux/raid/raid5.h |   67 +++++++++
 2 files changed, 357 insertions(+), 24 deletions(-)

Index: linux-2.6-raid/drivers/md/raid5.c
===================================================================
--- linux-2.6-raid.orig/drivers/md/raid5.c	2006-06-28 08:44:11.000000000 -0700
+++ linux-2.6-raid/drivers/md/raid5.c	2006-06-28 09:52:07.000000000 -0700
@@ -305,6 +305,7 @@
 	memset(sh, 0, sizeof(*sh) + (conf->raid_disks-1)*sizeof(struct r5dev));
 	sh->raid_conf = conf;
 	spin_lock_init(&sh->lock);
+	INIT_WORK(&sh->ops.work, conf->do_block_ops, sh);
 
 	if (grow_buffers(sh, conf->raid_disks)) {
 		shrink_buffers(sh, conf->raid_disks);
@@ -1224,6 +1225,80 @@
 	}
 }
 
+static int handle_write_operations5(struct stripe_head *sh, int rcw, int locked)
+{
+	int i, pd_idx = sh->pd_idx, disks = sh->disks;
+	int complete=0;
+
+	if (test_bit(STRIPE_OP_RCW, &sh->state) &&
+		test_bit(STRIPE_OP_RCW_Done, &sh->ops.state)) {
+		clear_bit(STRIPE_OP_RCW, &sh->state);
+		clear_bit(STRIPE_OP_RCW_Done, &sh->ops.state);
+		complete++;
+	}
+
+	if (test_bit(STRIPE_OP_RMW, &sh->state) &&
+		test_bit(STRIPE_OP_RMW_Done, &sh->ops.state)) {
+		clear_bit(STRIPE_OP_RMW, &sh->state);
+		clear_bit(STRIPE_OP_RMW_Done, &sh->ops.state);
+		BUG_ON(++complete == 2);
+	}
+
+
+	/* If no operation is currently in process then use the rcw flag to
+	 * select an operation
+	 */
+	if (locked == 0) {
+		if (rcw == 0) {
+			/* enter stage 1 of reconstruct write operation */
+			set_bit(STRIPE_OP_RCW, &sh->state);
+			set_bit(STRIPE_OP_RCW_Drain, &sh->ops.state);
+			for (i=disks ; i-- ;) {
+				struct r5dev *dev = &sh->dev[i];
+
+				if (i!=pd_idx && dev->towrite) {
+					set_bit(R5_LOCKED, &dev->flags);
+					locked++;
+				}
+			}
+		} else {
+			/* enter stage 1 of read modify write operation */
+			BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags));
+			set_bit(STRIPE_OP_RMW, &sh->state);
+			set_bit(STRIPE_OP_RMW_ParityPre, &sh->ops.state);
+			for (i=disks ; i-- ;) {
+				struct r5dev *dev = &sh->dev[i];
+				if (i==pd_idx)
+					continue;
+
+				if (dev->towrite &&
+					test_bit(R5_UPTODATE, &dev->flags)) {
+					set_bit(R5_LOCKED, &dev->flags);
+					locked++;
+				}
+			}
+		}
+	} else if (locked && complete == 0) /* the queue has an operation in flight */
+		locked = -EBUSY;
+	else if (complete)
+		locked = 0;
+
+	/* keep the parity disk locked while asynchronous operations
+	 * are in flight
+	 */
+	if (locked > 0) {
+		set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
+		clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
+		sh->ops.queue_count++;
+	} else if (locked == 0)
+		set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
+
+	PRINTK("%s: stripe %llu locked: %d complete: %d op_state: %lx\n",
+		__FUNCTION__, (unsigned long long)sh->sector,
+		locked, complete, sh->ops.state);
+
+	return locked;
+}
 
 /*
@@ -1320,6 +1395,174 @@
 	return pd_idx;
 }
 
+static inline void drain_bio(struct bio *wbi, sector_t sector, struct page *page)
+{
+	while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
+		copy_data(1, wbi, page, sector);
+		wbi = r5_next_bio(wbi, sector);
+	}
+}
+
+/* must be called under the stripe lock */
+static void queue_raid_work(struct stripe_head *sh)
+{
+	if (--sh->ops.queue_count == 0) {
+		atomic_inc(&sh->count);
+		queue_work(sh->raid_conf->block_ops_queue, &sh->ops.work);
+	} else if (sh->ops.queue_count < 0)
+		sh->ops.queue_count = 0;
+}
+
+/*
+ * raid5_do_soft_block_ops - perform block memory operations on stripe data
+ * outside the spin lock.
+ */
+static void raid5_do_soft_block_ops(void *stripe_head_ref)
+{
+	struct stripe_head *sh = stripe_head_ref;
+	int i, pd_idx = sh->pd_idx, disks = sh->disks, count = 1;
+	void *ptr[MAX_XOR_BLOCKS];
+	struct bio *chosen;
+	int overlap=0, new_work=0, written=0;
+	unsigned long state, ops_state;
+
+	/* take a snapshot of what needs to be done at this point in time */
+	spin_lock(&sh->lock);
+	state = sh->state;
+	ops_state = sh->ops.state;
+	spin_unlock(&sh->lock);
+
+	if (test_bit(STRIPE_OP_RMW, &state)) {
+		PRINTK("%s: stripe %llu STRIPE_OP_RMW op_state: %lx\n",
+			__FUNCTION__, (unsigned long long)sh->sector,
+			ops_state);
+
+		ptr[0] = page_address(sh->dev[pd_idx].page);
+
+		if (test_and_clear_bit(STRIPE_OP_RMW_ParityPre, &ops_state)) {
+			for (i=disks ; i-- ;) {
+				struct r5dev *dev = &sh->dev[i];
+				/* if it is locked then servicing
+				 * has been requested
+				 */
+				if (dev->towrite && test_bit(R5_LOCKED, &dev->flags)) {
+					ptr[count++] = page_address(dev->page);
+					/* ? is the device_lock necessary here, compute_parity
+					 * does not lock for this operation ?
+					 */
+					chosen = dev->towrite;
+					dev->towrite = NULL;
+
+					overlap++;
+
+					BUG_ON(dev->written);
+					dev->written = chosen;
+					check_xor();
+				}
+			}
+			if (count != 1)
+				xor_block(count, STRIPE_SIZE, ptr);
+			set_bit(STRIPE_OP_RMW_Drain, &ops_state);
+		}
+		if (test_and_clear_bit(STRIPE_OP_RMW_Drain, &ops_state)) {
+			for (i=disks ; i-- ;) {
+				struct r5dev *dev = &sh->dev[i];
+				written++;
+				drain_bio(dev->written, dev->sector, dev->page);
+			}
+			set_bit(STRIPE_OP_RMW_ParityPost, &ops_state);
+		}
+		if (test_and_clear_bit(STRIPE_OP_RMW_ParityPost, &ops_state)) {
+			for (i=disks ; i-- ;) {
+				struct r5dev *dev = &sh->dev[i];
+				if (dev->written) {
+					ptr[count++] = page_address(dev->page);
+					check_xor();
+				}
+			}
+			if (count != 1)
+				xor_block(count, STRIPE_SIZE, ptr);
+
+			/* signal completion and acknowledge the last state seen
+			 * by sh->ops.state
+			 */
+			set_bit(STRIPE_OP_RMW_Done, &ops_state);
+			set_bit(STRIPE_OP_RMW_ParityPre, &ops_state);
+		}
+
+	} else if (test_bit(STRIPE_OP_RCW, &state)) {
+		PRINTK("%s: stripe %llu STRIPE_OP_RCW op_state: %lx\n",
+			__FUNCTION__, (unsigned long long)sh->sector,
+			ops_state);
+
+		ptr[0] = page_address(sh->dev[pd_idx].page);
+
+		if (test_and_clear_bit(STRIPE_OP_RCW_Drain, &ops_state)) {
+			for (i=disks ; i-- ;) {
+				struct r5dev *dev = &sh->dev[i];
+				if (i!=pd_idx && dev->towrite &&
+					test_bit(R5_LOCKED, &dev->flags)) {
+					chosen = dev->towrite;
+					dev->towrite = NULL;
+
+					BUG_ON(dev->written);
+					dev->written = chosen;
+
+					overlap++;
+					written++;
+
+					drain_bio(dev->written, dev->sector,
+						dev->page);
+				} else if (i==pd_idx)
+					memset(ptr[0], 0, STRIPE_SIZE);
+			}
+			set_bit(STRIPE_OP_RCW_Parity, &ops_state);
+		}
+		if (test_and_clear_bit(STRIPE_OP_RCW_Parity, &ops_state)) {
+			for (i=disks; i--;)
+				if (i != pd_idx) {
+					ptr[count++] = page_address(sh->dev[i].page);
+					check_xor();
+				}
+			if (count != 1)
+				xor_block(count, STRIPE_SIZE, ptr);
+
+			/* signal completion and acknowledge the last state seen
+			 * by sh->ops.state
+			 */
+			set_bit(STRIPE_OP_RCW_Done, &ops_state);
+			set_bit(STRIPE_OP_RCW_Drain, &ops_state);
+
+		}
+	}
+
+	spin_lock(&sh->lock);
+	/* Update the state of operations, by XORing we clear the stage 1 requests
+	 * while preserving new requests.
+	 */
+	sh->ops.state ^= ops_state;
+
+	if (written)
+		for (i=disks ; i-- ;) {
+			struct r5dev *dev = &sh->dev[i];
+			if (dev->written)
+				set_bit(R5_UPTODATE, &dev->flags);
+		}
+
+	if (overlap)
+		for (i= disks; i-- ;) {
+			struct r5dev *dev = &sh->dev[i];
+			if (test_and_clear_bit(R5_Overlap, &dev->flags))
+				wake_up(&sh->raid_conf->wait_for_overlap);
+		}
+
+	sh->ops.queue_count += new_work;
+	set_bit(STRIPE_HANDLE, &sh->state);
+	queue_raid_work(sh);
+	spin_unlock(&sh->lock);
+
+	release_stripe(sh);
+}
 
 /*
  * handle_stripe - do things to a stripe.
@@ -1333,7 +1576,6 @@
  * schedule a write of some buffers
  * return confirmation of parity correctness
  *
- * Parity calculations are done inside the stripe lock
  * buffers are taken off read_list or write_list, and bh_cache buffers
 * get BH_Lock set before the stripe lock is released.
 *
@@ -1352,9 +1594,9 @@
 	int failed_num=0;
 	struct r5dev *dev;
 
-	PRINTK("handling stripe %llu, cnt=%d, pd_idx=%d\n",
-		(unsigned long long)sh->sector, atomic_read(&sh->count),
-		sh->pd_idx);
+	PRINTK("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d\n",
+		(unsigned long long)sh->sector, sh->state, atomic_read(&sh->count),
+		sh->pd_idx);
 
 	spin_lock(&sh->lock);
 	clear_bit(STRIPE_HANDLE, &sh->state);
@@ -1596,7 +1838,8 @@
 	}
 
 	/* now to consider writing and what else, if anything should be read */
-	if (to_write) {
+	if (to_write || test_bit(STRIPE_OP_RCW, &sh->state) ||
+		test_bit(STRIPE_OP_RMW, &sh->state)) {
 		int rmw=0, rcw=0;
 		for (i=disks ; i--;) {
 			/* would I have to read this buffer for read_modify_write */
@@ -1668,25 +1911,29 @@
 			}
 		}
 		/* now if nothing is locked, and if we have enough data, we can start a write request */
-		if (locked == 0 && (rcw == 0 ||rmw == 0) &&
-		    !test_bit(STRIPE_BIT_DELAY, &sh->state)) {
-			PRINTK("Computing parity...\n");
-			compute_parity5(sh, rcw==0 ?
-				RECONSTRUCT_WRITE : READ_MODIFY_WRITE);
-			/* now every locked buffer is ready to be written */
-			for (i=disks; i--;)
-				if (test_bit(R5_LOCKED, &sh->dev[i].flags)) {
-					PRINTK("Writing block %d\n", i);
-					locked++;
-					set_bit(R5_Wantwrite, &sh->dev[i].flags);
-					if (!test_bit(R5_Insync, &sh->dev[i].flags)
-					    || (i==sh->pd_idx && failed == 0))
-						set_bit(STRIPE_INSYNC, &sh->state);
+		/* ...or, if we have previously started write operations we can
+		 * now advance the state
+		 */
+		if ((locked == 0 && (rcw == 0 ||rmw == 0) &&
+		     !test_bit(STRIPE_BIT_DELAY, &sh->state)) ||
+		    test_bit(STRIPE_OP_RCW, &sh->state) ||
+		    test_bit(STRIPE_OP_RMW, &sh->state)) {
+			int work_queued = handle_write_operations5(sh, rcw, locked);
+			if (work_queued == 0) {
+				/* now every locked buffer is ready to be written */
+				for (i=disks; i--;)
+					if (test_bit(R5_LOCKED, &sh->dev[i].flags)) {
+						PRINTK("Writing block %d\n", i);
+						locked++;
+						set_bit(R5_Wantwrite, &sh->dev[i].flags);
+						if (!test_bit(R5_Insync, &sh->dev[i].flags)
+						    || (i==sh->pd_idx && failed == 0))
+							set_bit(STRIPE_INSYNC, &sh->state);
+					}
+				if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
+					atomic_dec(&conf->preread_active_stripes);
+					if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
+						md_wakeup_thread(conf->mddev->thread);
+				}
-			if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
-				atomic_dec(&conf->preread_active_stripes);
-				if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
-					md_wakeup_thread(conf->mddev->thread);
-			}
+			} else if (work_queued > 0)
+				locked += work_queued;
 		}
 	}
 
@@ -1819,6 +2066,8 @@
 		}
 	}
 
+	queue_raid_work(sh);
+
 	spin_unlock(&sh->lock);
 
 	while ((bi=return_bi)) {
@@ -1829,6 +2078,7 @@
 		bi->bi_size = 0;
 		bi->bi_end_io(bi, bytes, 0);
 	}
+
 	for (i=disks; i-- ;) {
 		int rw;
 		struct bio *bi;
@@ -3117,6 +3367,21 @@
 		if (!conf->spare_page)
 			goto abort;
 	}
+
+	sprintf(conf->workqueue_name, "%s_raid5_ops",
+		mddev->gendisk->disk_name);
+
+	if ((conf->block_ops_queue = create_workqueue(conf->workqueue_name))
+	     == NULL)
+		goto abort;
+
+	/* To Do:
+	 * 1/ Offload to asynchronous copy / xor engines
+	 * 2/ Automated selection of optimal do_block_ops
+	 *    routine similar to the xor template selection
+	 */
+	conf->do_block_ops = raid5_do_soft_block_ops;
+
 	spin_lock_init(&conf->device_lock);
 	init_waitqueue_head(&conf->wait_for_stripe);
 	init_waitqueue_head(&conf->wait_for_overlap);
@@ -3279,6 +3544,8 @@
 		safe_put_page(conf->spare_page);
 		kfree(conf->disks);
 		kfree(conf->stripe_hashtbl);
+		if (conf->do_block_ops)
+			destroy_workqueue(conf->block_ops_queue);
 		kfree(conf);
 	}
 	mddev->private = NULL;
@@ -3299,6 +3566,7 @@
 	blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
 	sysfs_remove_group(&mddev->kobj, &raid5_attrs_group);
 	kfree(conf->disks);
+	destroy_workqueue(conf->block_ops_queue);
 	kfree(conf);
 	mddev->private = NULL;
 	return 0;
Index: linux-2.6-raid/include/linux/raid/raid5.h
===================================================================
--- linux-2.6-raid.orig/include/linux/raid/raid5.h	2006-06-28 08:44:11.000000000 -0700
+++ linux-2.6-raid/include/linux/raid/raid5.h	2006-06-28 10:34:54.000000000 -0700
@@ -3,6 +3,8 @@
 
 #include <linux/raid/md.h>
 #include <linux/raid/xor.h>
+#include <linux/workqueue.h>
+#include <linux/dmaengine.h>
 
 /*
 *
@@ -123,6 +125,14 @@
 * The refcount counts each thread that have activated the stripe,
 * plus raid5d if it is handling it, plus one for each active request
 * on a cached buffer.
+ *
+ * Block operations (copy, xor, block fill, and block compare) are executed
+ * outside the spin lock.
+ * A stripe can have multiple operations in flight provided that the
+ * operations do not have data dependencies. For most cases data
+ * dependencies will be avoided by the 'overlap' protection logic in
+ * add_stripe_bio. One case that violates this rule is a compute block
+ * operation, where the work queue must guarantee that blocks are up to
+ * date before proceeding with a write or check parity operation.
 */

 struct stripe_head {
@@ -136,6 +146,16 @@
 	spinlock_t		lock;
 	int			bm_seq;	/* sequence number for bitmap flushes */
 	int			disks;	/* disks in stripe */
+	struct stripe_operations {
+		int		queue_count;	/* if == 0 places stripe in the workqueue */
+		unsigned long	state;		/* state of block operations */
+		struct work_struct work;	/* work queue descriptor */
+		#ifdef CONFIG_DMA_ENGINE
+		u32		dma_result;	/* storage for dma engine results */
+		dma_cookie_t	dma_cookie;	/* last issued dma operation */
+		struct dma_chan	*dma_chan;	/* dma channel for ops offload */
+		#endif
+	} ops;
 	struct r5dev {
 		struct bio	req;
 		struct bio_vec	vec;
@@ -145,6 +165,7 @@
 		unsigned long	flags;
 	} dev[1]; /* allocated with extra space depending of RAID geometry */
 };
+
 /* Flags */
 #define	R5_UPTODATE	0	/* page contains current data */
 #define	R5_LOCKED	1	/* IO has been submitted on "req" */
@@ -156,8 +177,9 @@
 #define	R5_Overlap	7	/* There is a pending overlapping request on this block */
 #define	R5_ReadError	8	/* seen a read error here recently */
 #define	R5_ReWrite	9	/* have tried to over-write the readerror */
-
 #define	R5_Expanded	10	/* This block now has post-expand data */
+#define	R5_Consistent	11	/* Block is HW DMA-able without a cache flush */
+
 /*
 * Write method
 */
@@ -179,6 +201,41 @@
 #define	STRIPE_EXPANDING	9
 #define	STRIPE_EXPAND_SOURCE	10
 #define	STRIPE_EXPAND_READY	11
+#define	STRIPE_OP_RCW		12
+#define	STRIPE_OP_RMW		13	/* RAID-5 only */
+#define	STRIPE_OP_UPDATE	14	/* RAID-6 only */
+#define	STRIPE_OP_CHECK		15
+#define	STRIPE_OP_COMPUTE	16
+#define	STRIPE_OP_COMPUTE2	17	/* RAID-6 only */
+#define	STRIPE_OP_BIOFILL	18
+
+/*
+ * These flags are communication markers between the handle_stripe[5|6]
+ * routine and the block operations work queue
+ * - The _End definitions are a signal from handle_stripe to the work
+ *   queue to ensure the completion of the operation so the results can
+ *   be committed to disk
+ * - The _Done definitions signal completion from work queue to handle_stripe
+ * - All other definitions are service requests for the work queue
+ */
+#define	STRIPE_OP_RCW_Drain		0
+#define	STRIPE_OP_RCW_Parity		1
+#define	STRIPE_OP_RCW_End		2
+#define	STRIPE_OP_RCW_Done		3
+#define	STRIPE_OP_RMW_ParityPre		4
+#define	STRIPE_OP_RMW_Drain		5
+#define	STRIPE_OP_RMW_ParityPost	6
+#define	STRIPE_OP_RMW_End		7
+#define	STRIPE_OP_RMW_Done		8
+#define	STRIPE_OP_CHECK_Gen		9
+#define	STRIPE_OP_CHECK_Verify		10
+#define	STRIPE_OP_CHECK_End		11
+#define	STRIPE_OP_CHECK_Done		12
+#define	STRIPE_OP_COMPUTE_Prep		13
+#define	STRIPE_OP_COMPUTE_Parity	14
+#define	STRIPE_OP_COMPUTE_End		15
+#define	STRIPE_OP_COMPUTE_Done		16
+
 /*
 * Plugging:
 *
@@ -229,11 +286,16 @@
 	atomic_t		preread_active_stripes; /* stripes with scheduled io */
 
 	atomic_t		reshape_stripes; /* stripes with pending writes for reshape */
+
+	struct workqueue_struct	*block_ops_queue;
+	void (*do_block_ops)(void *);
+
 	/* unfortunately we need two cache names as we temporarily have
 	 * two caches.
 	 */
 	int			active_name;
 	char			cache_name[2][20];
+	char			workqueue_name[20];
 	kmem_cache_t		*slab_cache; /* for allocating stripes */
 
 	int			seq_flush, seq_write;
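For readers who want the workqueue mechanics in isolation: below is a minimal
sketch of the pattern the patch applies per stripe, written against the
2.6.17-era API it targets; the three-argument INIT_WORK(work, func, data)
form used here predates the 2.6.20 workqueue rework. All example_* names are
hypothetical stand-ins, not identifiers from the patch.

#include <linux/workqueue.h>
#include <linux/errno.h>

/* hypothetical stand-in for stripe_head and its 'ops' member */
struct example_obj {
	struct work_struct work;	/* per-object work descriptor */
	int data;
};

static struct workqueue_struct *example_wq;

/* handler matches the pre-2.6.20 INIT_WORK(work, func, data) convention:
 * the third INIT_WORK argument comes back as a void pointer */
static void example_block_ops(void *ref)
{
	struct example_obj *obj = ref;

	/* heavy memory operations (xor, copy) run here in process context,
	 * outside any spin lock, as raid5_do_soft_block_ops does above */
	obj->data++;
}

static int example_setup(struct example_obj *obj)
{
	/* one dedicated queue per array, cf. conf->block_ops_queue */
	example_wq = create_workqueue("example_raid5_ops");
	if (!example_wq)
		return -ENOMEM;

	/* bind handler and context once; the same work item is requeued
	 * each time another pass over the object is needed */
	INIT_WORK(&obj->work, example_block_ops, obj);
	return 0;
}

static void example_kick(struct example_obj *obj)
{
	/* queue_work() is a no-op while the item is still pending, cf. the
	 * queue_count gating in queue_raid_work() above */
	queue_work(example_wq, &obj->work);
}

A usage note: because INIT_WORK() stores both the handler and its argument in
the work item itself, the patch can bind conf->do_block_ops to each stripe
once at allocation time (the first raid5.c hunk) and afterwards simply
queue_work() the same item whenever queue_raid_work() decides a pass is due.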