[PATCH 1/2] block: Add support for atomic writes

This allows filesystems and O_DIRECT callers to send down a list of
bios flagged for atomic completion.  If the hardware supports atomic
I/O, it is given the whole list in a single make_request_fn call.
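
As a sketch, submission could look like the following (a hypothetical
helper, not part of this patch; it assumes REQ_ATOMIC is defined
elsewhere in this series, and that every bio in the list is a write
aimed at the same queue):

	/*
	 * Hypothetical example: chain bios on bi_next and flag each
	 * one REQ_ATOMIC so the whole list is handed to the driver in
	 * one make_request_fn call.  For simplicity this treats one
	 * bio as one atomic segment.
	 */
	static int submit_atomic_list(struct block_device *bdev,
				      struct bio **bios, int nr)
	{
		int i;

		/* the driver advertised how many segments it can take */
		if (nr > bdev_atomic_write_segments(bdev))
			return -EOPNOTSUPP;

		for (i = 0; i < nr; i++) {
			bios[i]->bi_rw |= REQ_ATOMIC;
			bios[i]->bi_next = (i < nr - 1) ? bios[i + 1] : NULL;
		}

		/* submit_bio walks the whole chain for accounting */
		submit_bio(WRITE | REQ_ATOMIC, bios[0]);
		return 0;
	}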

In order to limit corner cases, there are a few restrictions in the
current code:

* Every bio in the list must be for the same queue

* Every bio must be a simple write.  No trims or reads may be mixed in.

A new blk_queue_set_atomic_write() sets the number of atomic segments a
given driver can accept.

Any number greater than one is allowed, but the driver is expected to
do final checks on the bio list to make sure a given list fits inside
its atomic capabilities.
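
On the driver side, something like the sketch below would advertise the
limit at setup time and re-check each list in its make_request_fn (the
foo_* names are hypothetical):

	/* during queue setup: advertise up to 16 atomic segments */
	blk_queue_set_atomic_write(q, 16);

	static void foo_make_request(struct request_queue *q, struct bio *bio)
	{
		if (bio->bi_rw & REQ_ATOMIC) {
			struct bio *walk = bio;
			unsigned int segs = 0;

			for_each_bio(walk)
				segs += bio_segments(walk);

			/* final check: the whole list must fit atomically */
			if (segs > 16) {
				foo_end_bio_list(bio, -EINVAL);	/* hypothetical */
				return;
			}
			foo_queue_atomic_list(bio);	/* hypothetical */
			return;
		}
		foo_queue_single(bio);	/* hypothetical */
	}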

Signed-off-by: Chris Mason <chris.mason@xxxxxxxxxxxx>
---
 block/blk-core.c       | 217 +++++++++++++++++++++++++++++++------------------
 block/blk-settings.c   |  17 ++++
 include/linux/blkdev.h |  14 ++++
 3 files changed, 170 insertions(+), 78 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 39d1261..6a5c292 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1664,95 +1664,131 @@ static inline int bio_check_eod(struct bio *bio, unsigned int nr_sectors)
 	return 0;
 }
 
+static void end_linked_bio(struct bio *bio, int err)
+{
+	struct bio *next;
+	do {
+		next = bio->bi_next;
+		bio->bi_next = NULL;
+		bio_endio(bio, err);
+		bio = next;
+	} while (bio);
+}
+
 static noinline_for_stack bool
-generic_make_request_checks(struct bio *bio)
+generic_make_request_checks(struct bio *first_bio)
 {
-	struct request_queue *q;
-	int nr_sectors = bio_sectors(bio);
+	struct request_queue *q = NULL;
+	int nr_sectors;
 	int err = -EIO;
 	char b[BDEVNAME_SIZE];
 	struct hd_struct *part;
+	struct bio *bio;
+	int linked_bio = first_bio->bi_next ? 1 : 0;
 
 	might_sleep();
 
-	if (bio_check_eod(bio, nr_sectors))
-		goto end_io;
+	bio = first_bio;
+	for_each_bio(bio) {
+		nr_sectors = bio_sectors(bio);
+		if (bio_check_eod(bio, nr_sectors))
+			goto end_io;
 
-	q = bdev_get_queue(bio->bi_bdev);
-	if (unlikely(!q)) {
-		printk(KERN_ERR
-		       "generic_make_request: Trying to access "
-			"nonexistent block-device %s (%Lu)\n",
-			bdevname(bio->bi_bdev, b),
-			(long long) bio->bi_iter.bi_sector);
-		goto end_io;
-	}
+		if (!q) {
+			q = bdev_get_queue(bio->bi_bdev);
+			if (unlikely(!q)) {
+				printk(KERN_ERR
+				       "generic_make_request: Trying to access "
+					"nonexistent block-device %s (%Lu)\n",
+					bdevname(bio->bi_bdev, b),
+					(long long) bio->bi_iter.bi_sector);
+				goto end_io;
+			}
+		} else if (q != bdev_get_queue(bio->bi_bdev)) {
+			printk(KERN_ERR "generic_make_request: linked bio queue mismatch\n");
+			goto end_io;
+		}
 
-	if (likely(bio_is_rw(bio) &&
-		   nr_sectors > queue_max_hw_sectors(q))) {
-		printk(KERN_ERR "bio too big device %s (%u > %u)\n",
-		       bdevname(bio->bi_bdev, b),
-		       bio_sectors(bio),
-		       queue_max_hw_sectors(q));
-		goto end_io;
-	}
+		if (likely(bio_is_rw(bio) &&
+			   nr_sectors > queue_max_hw_sectors(q))) {
+			printk(KERN_ERR "bio too big device %s (%u > %u)\n",
+			       bdevname(bio->bi_bdev, b),
+			       bio_sectors(bio),
+			       queue_max_hw_sectors(q));
+			goto end_io;
+		}
 
-	part = bio->bi_bdev->bd_part;
-	if (should_fail_request(part, bio->bi_iter.bi_size) ||
-	    should_fail_request(&part_to_disk(part)->part0,
-				bio->bi_iter.bi_size))
-		goto end_io;
+		part = bio->bi_bdev->bd_part;
+		if (should_fail_request(part, bio->bi_iter.bi_size) ||
+		    should_fail_request(&part_to_disk(part)->part0,
+					bio->bi_iter.bi_size))
+			goto end_io;
 
-	/*
-	 * If this device has partitions, remap block n
-	 * of partition p to block n+start(p) of the disk.
-	 */
-	blk_partition_remap(bio);
+		/*
+		 * If this device has partitions, remap block n
+		 * of partition p to block n+start(p) of the disk.
+		 */
+		blk_partition_remap(bio);
 
-	if (bio_check_eod(bio, nr_sectors))
-		goto end_io;
+		if (bio_check_eod(bio, nr_sectors))
+			goto end_io;
 
-	/*
-	 * Filter flush bio's early so that make_request based
-	 * drivers without flush support don't have to worry
-	 * about them.
-	 */
-	if ((bio->bi_rw & (REQ_FLUSH | REQ_FUA)) && !q->flush_flags) {
-		bio->bi_rw &= ~(REQ_FLUSH | REQ_FUA);
-		if (!nr_sectors) {
-			err = 0;
+		/*
+		 * Filter flush bio's early so that make_request based
+		 * drivers without flush support don't have to worry
+		 * about them.
+		 */
+		if ((bio->bi_rw & (REQ_FLUSH | REQ_FUA)) && !q->flush_flags) {
+			bio->bi_rw &= ~(REQ_FLUSH | REQ_FUA);
+			if (!nr_sectors) {
+				/*
+				 * we don't know how to mix empty flush bios
+				 * with a list of non-flush bios on devices
+				 * that don't support flushing
+				 */
+				if (linked_bio)
+					err = -EINVAL;
+				else
+					err = 0;
+				goto end_io;
+			}
+		}
+
+		if ((bio->bi_rw & REQ_DISCARD) &&
+		    (!blk_queue_discard(q) ||
+		     ((bio->bi_rw & REQ_SECURE) && !blk_queue_secdiscard(q)))) {
+			err = -EOPNOTSUPP;
 			goto end_io;
 		}
-	}
 
-	if ((bio->bi_rw & REQ_DISCARD) &&
-	    (!blk_queue_discard(q) ||
-	     ((bio->bi_rw & REQ_SECURE) && !blk_queue_secdiscard(q)))) {
-		err = -EOPNOTSUPP;
-		goto end_io;
-	}
+		if (bio->bi_rw & REQ_WRITE_SAME && !bdev_write_same(bio->bi_bdev)) {
+			err = -EOPNOTSUPP;
+			goto end_io;
+		}
 
-	if (bio->bi_rw & REQ_WRITE_SAME && !bdev_write_same(bio->bi_bdev)) {
-		err = -EOPNOTSUPP;
-		goto end_io;
-	}
+		if ((bio->bi_rw & REQ_ATOMIC) &&
+		    !q->limits.atomic_write_segments) {
+			err = -EOPNOTSUPP;
+			goto end_io;
+		}
 
-	/*
-	 * Various block parts want %current->io_context and lazy ioc
-	 * allocation ends up trading a lot of pain for a small amount of
-	 * memory.  Just allocate it upfront.  This may fail and block
-	 * layer knows how to live with it.
-	 */
-	create_io_context(GFP_ATOMIC, q->node);
+		/*
+		 * Various block parts want %current->io_context and lazy ioc
+		 * allocation ends up trading a lot of pain for a small amount of
+		 * memory.  Just allocate it upfront.  This may fail and block
+		 * layer knows how to live with it.
+		 */
+		create_io_context(GFP_ATOMIC, q->node);
 
-	if (blk_throtl_bio(q, bio))
-		return false;	/* throttled, will be resubmitted later */
+		if (blk_throtl_bio(q, bio))
+			return false;	/* throttled, will be resubmitted later */
 
-	trace_block_bio_queue(q, bio);
+		trace_block_bio_queue(q, bio);
+	}
 	return true;
 
 end_io:
-	bio_endio(bio, err);
+	end_linked_bio(first_bio, err);
 	return false;
 }
 
@@ -1788,6 +1824,17 @@ void generic_make_request(struct bio *bio)
 		return;
 
 	/*
+	 * generic_make_request_checks() has already verified atomic write
+	 * support; we would have failed above if the queue lacks it
+	 */
+	if (bio->bi_rw & REQ_ATOMIC) {
+		struct request_queue *q = bdev_get_queue(bio->bi_bdev);
+
+		q->make_request_fn(q, bio);
+		return;
+	}
+
+	/*
 	 * We only want one ->make_request_fn to be active at a time, else
 	 * stack usage with stacked devices could be a problem.  So use
 	 * current->bio_list to keep a list of requests submitted by a
@@ -1815,6 +1862,10 @@ void generic_make_request(struct bio *bio)
 	 * from the top.  In this case we really did just take the bio
 	 * of the top of the list (no pretending) and so remove it from
 	 * bio_list, and call into ->make_request() again.
+	 *
+	 * REQ_ATOMIC bios may have been chained on bi_next, but we
+	 * should have caught them all above.  This BUG_ON(bi_next)
+	 * will catch any lists of bios that were not flagged as atomic.
 	 */
 	BUG_ON(bio->bi_next);
 	bio_list_init(&bio_list_on_stack);
@@ -1849,28 +1900,38 @@ void submit_bio(int rw, struct bio *bio)
 	 * go through the normal accounting stuff before submission.
 	 */
 	if (bio_has_data(bio)) {
-		unsigned int count;
-
-		if (unlikely(rw & REQ_WRITE_SAME))
-			count = bdev_logical_block_size(bio->bi_bdev) >> 9;
-		else
-			count = bio_sectors(bio);
+		unsigned int count = 0;
+		unsigned int size = 0;
+		struct bio *walk;
+
+		walk = bio;
+		for_each_bio(walk) {
+			if (unlikely(rw & REQ_WRITE_SAME))
+				count += bdev_logical_block_size(walk->bi_bdev) >> 9;
+			else
+				count += bio_sectors(walk);
+			size += walk->bi_iter.bi_size;
+		}
 
 		if (rw & WRITE) {
 			count_vm_events(PGPGOUT, count);
 		} else {
-			task_io_account_read(bio->bi_iter.bi_size);
+			task_io_account_read(size);
 			count_vm_events(PGPGIN, count);
 		}
 
 		if (unlikely(block_dump)) {
 			char b[BDEVNAME_SIZE];
-			printk(KERN_DEBUG "%s(%d): %s block %Lu on %s (%u sectors)\n",
-			current->comm, task_pid_nr(current),
-				(rw & WRITE) ? "WRITE" : "READ",
-				(unsigned long long)bio->bi_iter.bi_sector,
-				bdevname(bio->bi_bdev, b),
-				count);
+
+			walk = bio;
+			for_each_bio(walk) {
+				printk(KERN_DEBUG "%s(%d): %s block %Lu on %s (%u sectors)\n",
+				current->comm, task_pid_nr(current),
+					(rw & WRITE) ? "WRITE" : "READ",
+					(unsigned long long)walk->bi_iter.bi_sector,
+					bdevname(walk->bi_bdev, b),
+					bio_sectors(walk));
+			}
 		}
 	}
 
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 5330933..17a6d23 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -119,6 +119,7 @@ void blk_set_default_limits(struct queue_limits *lim)
 	lim->discard_alignment = 0;
 	lim->discard_misaligned = 0;
 	lim->discard_zeroes_data = 0;
+	lim->atomic_write_segments = 0;
 	lim->logical_block_size = lim->physical_block_size = lim->io_min = 512;
 	lim->bounce_pfn = (unsigned long)(BLK_BOUNCE_ANY >> PAGE_SHIFT);
 	lim->alignment_offset = 0;
@@ -804,6 +805,22 @@ void blk_queue_update_dma_alignment(struct request_queue *q, int mask)
 EXPORT_SYMBOL(blk_queue_update_dma_alignment);
 
 /**
+ * blk_queue_set_atomic_write - number of segments supported for atomic writes
+ * @q:     the request queue for the device
+ * @segments: number of segments supported
+ *
+ * Description:
+ *    If the device supports atomic (or transactional) writes, it can pass
+ *    the maximum number of segments it supports in here.  An atomic write
+ *    either completes in its entirety or not at all.
+ **/
+void blk_queue_set_atomic_write(struct request_queue *q, unsigned int segments)
+{
+	q->limits.atomic_write_segments = segments;
+}
+EXPORT_SYMBOL(blk_queue_set_atomic_write);
+
+/**
  * blk_queue_flush - configure queue's cache flush capability
  * @q:		the request queue for the device
  * @flush:	0, REQ_FLUSH or REQ_FLUSH | REQ_FUA
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index ca0119d..40238bf 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -283,6 +283,8 @@ struct queue_limits {
 	unsigned int		discard_granularity;
 	unsigned int		discard_alignment;
 
+	unsigned int		atomic_write_segments;
+
 	unsigned short		logical_block_size;
 	unsigned short		max_segments;
 	unsigned short		max_integrity_segments;
@@ -968,6 +970,8 @@ extern void blk_queue_logical_block_size(struct request_queue *, unsigned short)
 extern void blk_queue_physical_block_size(struct request_queue *, unsigned int);
 extern void blk_queue_alignment_offset(struct request_queue *q,
 				       unsigned int alignment);
+extern void blk_queue_set_atomic_write(struct request_queue *q,
+				       unsigned int segments);
 extern void blk_limits_io_min(struct queue_limits *limits, unsigned int min);
 extern void blk_queue_io_min(struct request_queue *q, unsigned int min);
 extern void blk_limits_io_opt(struct queue_limits *limits, unsigned int opt);
@@ -1190,6 +1194,16 @@ static inline unsigned short queue_logical_block_size(struct request_queue *q)
 	return retval;
 }
 
+static inline unsigned int bdev_atomic_write_segments(struct block_device *bdev)
+{
+	struct request_queue *q = bdev_get_queue(bdev);
+
+	if (q)
+		return q->limits.atomic_write_segments;
+
+	return 0;
+}
+
 static inline unsigned short bdev_logical_block_size(struct block_device *bdev)
 {
 	return queue_logical_block_size(bdev_get_queue(bdev));
-- 
1.8.2
