Re: [PATCH 1/2] block: Add support for atomic writes

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Fri, Nov 01, 2013 at 05:28:54PM -0400, Chris Mason wrote:
> This allows filesystems and O_DIRECT to send down a list of bios
> flagged for atomic completion.  If the hardware supports atomic
> IO, it is given the whole list in a single make_request_fn
> call.
> 
> In order to limit corner cases, there are a few restrictions in the
> current code:
> 
> * Every bio in the list must be for the same queue
> 
> * Every bio must be a simple write.  No trims or reads may be mixed in
> 
> A new blk_queue_set_atomic_write() sets the number of atomic segments a
> given driver can accept.

If the queue is request-based, I think we have a problem with linked bios. For
example, request initialization will be wrong with linked bios.

> Any number greater than one is allowed, but the driver is expected to
> do final checks on the bio list to make sure a given list fits inside
> its atomic capabilities.
> 
> Signed-off-by: Chris Mason <chris.mason@xxxxxxxxxxxx>
> ---
>  block/blk-core.c       | 217 +++++++++++++++++++++++++++++++------------------
>  block/blk-settings.c   |  17 ++++
>  include/linux/blkdev.h |  14 ++++
>  3 files changed, 170 insertions(+), 78 deletions(-)
> 
> diff --git a/block/blk-core.c b/block/blk-core.c
> index 39d1261..6a5c292 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -1664,95 +1664,131 @@ static inline int bio_check_eod(struct bio *bio, unsigned int nr_sectors)
>  	return 0;
>  }
>  
> +static void end_linked_bio(struct bio *bio, int err)
> +{
> +	struct bio *next;
> +	do {
> +		next = bio->bi_next;
> +		bio->bi_next = NULL;
> +		bio_endio(bio, err);
> +		bio = next;
> +	} while (bio);
> +}
> +
>  static noinline_for_stack bool
> -generic_make_request_checks(struct bio *bio)
> +generic_make_request_checks(struct bio *first_bio)
>  {
> -	struct request_queue *q;
> -	int nr_sectors = bio_sectors(bio);
> +	struct request_queue *q = NULL;
> +	int nr_sectors;
>  	int err = -EIO;
>  	char b[BDEVNAME_SIZE];
>  	struct hd_struct *part;
> +	struct bio *bio;
> +	int linked_bio = first_bio->bi_next ? 1 : 0;
>  
>  	might_sleep();
>  
> -	if (bio_check_eod(bio, nr_sectors))
> -		goto end_io;
> +	bio = first_bio;
> +	for_each_bio(bio) {
> +		nr_sectors = bio_sectors(bio);
> +		if (bio_check_eod(bio, nr_sectors))
> +			goto end_io;
>  
> -	q = bdev_get_queue(bio->bi_bdev);
> -	if (unlikely(!q)) {
> -		printk(KERN_ERR
> -		       "generic_make_request: Trying to access "
> -			"nonexistent block-device %s (%Lu)\n",
> -			bdevname(bio->bi_bdev, b),
> -			(long long) bio->bi_iter.bi_sector);
> -		goto end_io;
> -	}
> +		if (!q) {
> +			q = bdev_get_queue(bio->bi_bdev);
> +			if (unlikely(!q)) {
> +				printk(KERN_ERR
> +				       "generic_make_request: Trying to access "
> +					"nonexistent block-device %s (%Lu)\n",
> +					bdevname(bio->bi_bdev, b),
> +					(long long) bio->bi_iter.bi_sector);
> +				goto end_io;
> +			}
> +		} else if (q != bdev_get_queue(bio->bi_bdev)) {
> +			printk(KERN_ERR "generic_make_request: linked bio queue mismatch\n");
> +			goto end_io;
> +		}
>  
> -	if (likely(bio_is_rw(bio) &&
> -		   nr_sectors > queue_max_hw_sectors(q))) {
> -		printk(KERN_ERR "bio too big device %s (%u > %u)\n",
> -		       bdevname(bio->bi_bdev, b),
> -		       bio_sectors(bio),
> -		       queue_max_hw_sectors(q));
> -		goto end_io;
> -	}
> +		if (likely(bio_is_rw(bio) &&
> +			   nr_sectors > queue_max_hw_sectors(q))) {
> +			printk(KERN_ERR "bio too big device %s (%u > %u)\n",
> +			       bdevname(bio->bi_bdev, b),
> +			       bio_sectors(bio),
> +			       queue_max_hw_sectors(q));
> +			goto end_io;
> +		}
>  
> -	part = bio->bi_bdev->bd_part;
> -	if (should_fail_request(part, bio->bi_iter.bi_size) ||
> -	    should_fail_request(&part_to_disk(part)->part0,
> -				bio->bi_iter.bi_size))
> -		goto end_io;
> +		part = bio->bi_bdev->bd_part;
> +		if (should_fail_request(part, bio->bi_iter.bi_size) ||
> +		    should_fail_request(&part_to_disk(part)->part0,
> +					bio->bi_iter.bi_size))
> +			goto end_io;
>  
> -	/*
> -	 * If this device has partitions, remap block n
> -	 * of partition p to block n+start(p) of the disk.
> -	 */
> -	blk_partition_remap(bio);
> +		/*
> +		 * If this device has partitions, remap block n
> +		 * of partition p to block n+start(p) of the disk.
> +		 */
> +		blk_partition_remap(bio);
>  
> -	if (bio_check_eod(bio, nr_sectors))
> -		goto end_io;
> +		if (bio_check_eod(bio, nr_sectors))
> +			goto end_io;
>  
> -	/*
> -	 * Filter flush bio's early so that make_request based
> -	 * drivers without flush support don't have to worry
> -	 * about them.
> -	 */
> -	if ((bio->bi_rw & (REQ_FLUSH | REQ_FUA)) && !q->flush_flags) {
> -		bio->bi_rw &= ~(REQ_FLUSH | REQ_FUA);
> -		if (!nr_sectors) {
> -			err = 0;
> +		/*
> +		 * Filter flush bio's early so that make_request based
> +		 * drivers without flush support don't have to worry
> +		 * about them.
> +		 */
> +		if ((bio->bi_rw & (REQ_FLUSH | REQ_FUA)) && !q->flush_flags) {
> +			bio->bi_rw &= ~(REQ_FLUSH | REQ_FUA);
> +			if (!nr_sectors) {
> +				/*
> +				 * we don't know how to mix empty flush bios
> +				 * with a list of non-flush bios on devices
> +				 * that don't support flushing
> +				 */
> +				if (linked_bio)
> +					err = -EINVAL;
> +				else
> +					err = 0;
> +				goto end_io;
> +			}
> +		}
> +
> +		if ((bio->bi_rw & REQ_DISCARD) &&
> +		    (!blk_queue_discard(q) ||
> +		     ((bio->bi_rw & REQ_SECURE) && !blk_queue_secdiscard(q)))) {
> +			err = -EOPNOTSUPP;
>  			goto end_io;
>  		}
> -	}
>  
> -	if ((bio->bi_rw & REQ_DISCARD) &&
> -	    (!blk_queue_discard(q) ||
> -	     ((bio->bi_rw & REQ_SECURE) && !blk_queue_secdiscard(q)))) {
> -		err = -EOPNOTSUPP;
> -		goto end_io;
> -	}
> +		if (bio->bi_rw & REQ_WRITE_SAME && !bdev_write_same(bio->bi_bdev)) {
> +			err = -EOPNOTSUPP;
> +			goto end_io;
> +		}
>  
> -	if (bio->bi_rw & REQ_WRITE_SAME && !bdev_write_same(bio->bi_bdev)) {
> -		err = -EOPNOTSUPP;
> -		goto end_io;
> -	}
> +		if ((bio->bi_rw & REQ_ATOMIC) &&
> +		    !q->limits.atomic_write_segments) {
> +			err = -EOPNOTSUPP;
> +			goto end_io;
> +		}

It looks like we don't check whether the queue is a stacked queue for MD/DM. And I
didn't see a check that the number of atomic bios doesn't exceed atomic_write_segments.
  
> -	/*
> -	 * Various block parts want %current->io_context and lazy ioc
> -	 * allocation ends up trading a lot of pain for a small amount of
> -	 * memory.  Just allocate it upfront.  This may fail and block
> -	 * layer knows how to live with it.
> -	 */
> -	create_io_context(GFP_ATOMIC, q->node);
> +		/*
> +		 * Various block parts want %current->io_context and lazy ioc
> +		 * allocation ends up trading a lot of pain for a small amount of
> +		 * memory.  Just allocate it upfront.  This may fail and block
> +		 * layer knows how to live with it.
> +		 */
> +		create_io_context(GFP_ATOMIC, q->node);
>  
> -	if (blk_throtl_bio(q, bio))
> -		return false;	/* throttled, will be resubmitted later */
> +		if (blk_throtl_bio(q, bio))
> +			return false;	/* throttled, will be resubmitted later */
>  
> -	trace_block_bio_queue(q, bio);
> +		trace_block_bio_queue(q, bio);
> +	}
>  	return true;
>  
>  end_io:
> -	bio_endio(bio, err);
> +	end_linked_bio(first_bio, err);
>  	return false;
>  }
>  
> @@ -1788,6 +1824,17 @@ void generic_make_request(struct bio *bio)
>  		return;
>  
>  	/*
> +	 * generic_make_request checks for atomic write support, we'll have
> +	 * failed already if the queue doesn't support it
> +	 */
> +	if (bio->bi_rw & REQ_ATOMIC) {
> +		struct request_queue *q = bdev_get_queue(bio->bi_bdev);
> +
> +		q->make_request_fn(q, bio);

Directly sending requests out is a problem for MD/DM. But we should fail early
with an MD/DM check, so this isn't a problem if we detect that this is a stacked queue.

Thanks,
Shaohua
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html




[Index of Archives]     [Linux Ext4 Filesystem]     [Union Filesystem]     [Filesystem Testing]     [Ceph Users]     [Ecryptfs]     [AutoFS]     [Kernel Newbies]     [Share Photos]     [Security]     [Netfilter]     [Bugtraq]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux Cachefs]     [Reiser Filesystem]     [Linux RAID]     [Samba]     [Device Mapper]     [CEPH Development]
  Powered by Linux