On Fri, Nov 01, 2013 at 05:28:54PM -0400, Chris Mason wrote:
> This allows filesystems and O_DIRECT to send down a list of bios
> flagged for atomic completion. If the hardware supports atomic
> IO, it is given the whole list in a single make_request_fn
> call.
>
> In order to limit corner cases, there are a few restrictions in the
> current code:
>
> * Every bio in the list must be for the same queue
>
> * Every bio must be a simple write. No trims or reads may be mixed in
>
> A new blk_queue_set_atomic_write() sets the number of atomic segments a
> given driver can accept.

If the queue is request based, I think we have a problem with linked
bios. For example, request initialization will be wrong with a linked
bio list.
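Untested, but a check along these lines in generic_make_request_checks()
is roughly what I have in mind to fail such lists early (it assumes the
REQ_ATOMIC flag from this patch, and simply treats any queue with a
request_fn as request based):

        /*
         * Rough sketch only: request based queues chain the bios of a
         * request through bi_next themselves, so a pre-linked atomic
         * list would break request initialization; refuse it up front.
         */
        if ((first_bio->bi_rw & REQ_ATOMIC) &&
            first_bio->bi_next && q->request_fn) {
                err = -EOPNOTSUPP;
                goto end_io;
        }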
> Any number greater than one is allowed, but the driver is expected to
> do final checks on the bio list to make sure a given list fits inside
> its atomic capabilities.
>
> Signed-off-by: Chris Mason <chris.mason@xxxxxxxxxxxx>
> ---
>  block/blk-core.c       | 217 +++++++++++++++++++++++++++++++------------------
>  block/blk-settings.c   |  17 ++++
>  include/linux/blkdev.h |  14 ++++
>  3 files changed, 170 insertions(+), 78 deletions(-)
>
> diff --git a/block/blk-core.c b/block/blk-core.c
> index 39d1261..6a5c292 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -1664,95 +1664,131 @@ static inline int bio_check_eod(struct bio *bio, unsigned int nr_sectors)
>          return 0;
>  }
>
> +static void end_linked_bio(struct bio *bio, int err)
> +{
> +        struct bio *next;
> +        do {
> +                next = bio->bi_next;
> +                bio->bi_next = NULL;
> +                bio_endio(bio, err);
> +                bio = next;
> +        } while (bio);
> +}
> +
>  static noinline_for_stack bool
> -generic_make_request_checks(struct bio *bio)
> +generic_make_request_checks(struct bio *first_bio)
>  {
> -        struct request_queue *q;
> -        int nr_sectors = bio_sectors(bio);
> +        struct request_queue *q = NULL;
> +        int nr_sectors;
>          int err = -EIO;
>          char b[BDEVNAME_SIZE];
>          struct hd_struct *part;
> +        struct bio *bio;
> +        int linked_bio = first_bio->bi_next ? 1 : 0;
>
>          might_sleep();
>
> -        if (bio_check_eod(bio, nr_sectors))
> -                goto end_io;
> +        bio = first_bio;
> +        for_each_bio(bio) {
> +                nr_sectors = bio_sectors(bio);
> +                if (bio_check_eod(bio, nr_sectors))
> +                        goto end_io;
>
> -        q = bdev_get_queue(bio->bi_bdev);
> -        if (unlikely(!q)) {
> -                printk(KERN_ERR
> -                       "generic_make_request: Trying to access "
> -                       "nonexistent block-device %s (%Lu)\n",
> -                       bdevname(bio->bi_bdev, b),
> -                       (long long) bio->bi_iter.bi_sector);
> -                goto end_io;
> -        }
> +                if (!q) {
> +                        q = bdev_get_queue(bio->bi_bdev);
> +                        if (unlikely(!q)) {
> +                                printk(KERN_ERR
> +                                       "generic_make_request: Trying to access "
> +                                       "nonexistent block-device %s (%Lu)\n",
> +                                       bdevname(bio->bi_bdev, b),
> +                                       (long long) bio->bi_iter.bi_sector);
> +                                goto end_io;
> +                        }
> +                } else if (q != bdev_get_queue(bio->bi_bdev)) {
> +                        printk(KERN_ERR "generic_make_request: linked bio queue mismatch\n");
> +                        goto end_io;
> +                }
>
> -        if (likely(bio_is_rw(bio) &&
> -                   nr_sectors > queue_max_hw_sectors(q))) {
> -                printk(KERN_ERR "bio too big device %s (%u > %u)\n",
> -                       bdevname(bio->bi_bdev, b),
> -                       bio_sectors(bio),
> -                       queue_max_hw_sectors(q));
> -                goto end_io;
> -        }
> +                if (likely(bio_is_rw(bio) &&
> +                           nr_sectors > queue_max_hw_sectors(q))) {
> +                        printk(KERN_ERR "bio too big device %s (%u > %u)\n",
> +                               bdevname(bio->bi_bdev, b),
> +                               bio_sectors(bio),
> +                               queue_max_hw_sectors(q));
> +                        goto end_io;
> +                }
>
> -        part = bio->bi_bdev->bd_part;
> -        if (should_fail_request(part, bio->bi_iter.bi_size) ||
> -            should_fail_request(&part_to_disk(part)->part0,
> -                                bio->bi_iter.bi_size))
> -                goto end_io;
> +                part = bio->bi_bdev->bd_part;
> +                if (should_fail_request(part, bio->bi_iter.bi_size) ||
> +                    should_fail_request(&part_to_disk(part)->part0,
> +                                        bio->bi_iter.bi_size))
> +                        goto end_io;
>
> -        /*
> -         * If this device has partitions, remap block n
> -         * of partition p to block n+start(p) of the disk.
> -         */
> -        blk_partition_remap(bio);
> +                /*
> +                 * If this device has partitions, remap block n
> +                 * of partition p to block n+start(p) of the disk.
> +                 */
> +                blk_partition_remap(bio);
>
> -        if (bio_check_eod(bio, nr_sectors))
> -                goto end_io;
> +                if (bio_check_eod(bio, nr_sectors))
> +                        goto end_io;
>
> -        /*
> -         * Filter flush bio's early so that make_request based
> -         * drivers without flush support don't have to worry
> -         * about them.
> -         */
> -        if ((bio->bi_rw & (REQ_FLUSH | REQ_FUA)) && !q->flush_flags) {
> -                bio->bi_rw &= ~(REQ_FLUSH | REQ_FUA);
> -                if (!nr_sectors) {
> -                        err = 0;
> +                /*
> +                 * Filter flush bio's early so that make_request based
> +                 * drivers without flush support don't have to worry
> +                 * about them.
> +                 */
> +                if ((bio->bi_rw & (REQ_FLUSH | REQ_FUA)) && !q->flush_flags) {
> +                        bio->bi_rw &= ~(REQ_FLUSH | REQ_FUA);
> +                        if (!nr_sectors) {
> +                                /*
> +                                 * we don't know how to mix empty flush bios
> +                                 * with a list of non-flush bios on devices
> +                                 * that don't support flushing
> +                                 */
> +                                if (linked_bio)
> +                                        err = -EINVAL;
> +                                else
> +                                        err = 0;
> +                                goto end_io;
> +                        }
> +                }
> +
> +                if ((bio->bi_rw & REQ_DISCARD) &&
> +                    (!blk_queue_discard(q) ||
> +                     ((bio->bi_rw & REQ_SECURE) && !blk_queue_secdiscard(q)))) {
> +                        err = -EOPNOTSUPP;
>                          goto end_io;
>                  }
> -        }
>
> -        if ((bio->bi_rw & REQ_DISCARD) &&
> -            (!blk_queue_discard(q) ||
> -             ((bio->bi_rw & REQ_SECURE) && !blk_queue_secdiscard(q)))) {
> -                err = -EOPNOTSUPP;
> -                goto end_io;
> -        }
> +                if (bio->bi_rw & REQ_WRITE_SAME && !bdev_write_same(bio->bi_bdev)) {
> +                        err = -EOPNOTSUPP;
> +                        goto end_io;
> +                }
>
> -        if (bio->bi_rw & REQ_WRITE_SAME && !bdev_write_same(bio->bi_bdev)) {
> -                err = -EOPNOTSUPP;
> -                goto end_io;
> -        }
> +                if ((bio->bi_rw & REQ_ATOMIC) &&
> +                    !q->limits.atomic_write_segments) {
> +                        err = -EOPNOTSUPP;
> +                        goto end_io;
> +                }

It looks like we don't check whether the queue is a stacked queue
(MD/DM). And I didn't see a check that the number of atomic bios in the
list doesn't exceed atomic_write_segments.
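To illustrate what I mean (completely untested, and the stacked-queue
helper name is made up as a placeholder), I'd expect the REQ_ATOMIC
branch above to also refuse stacked bio based queues and to count the
list against the advertised limit, something like:

                if (bio->bi_rw & REQ_ATOMIC) {
                        struct bio *b;
                        unsigned int nr_bios = 0;

                        /*
                         * bio_queue_is_stacked() is a placeholder for
                         * whatever test we settle on for "this is a
                         * stacked MD/DM style queue"; such queues never
                         * see the linked list as one unit.
                         */
                        if (bio_queue_is_stacked(q)) {
                                err = -EOPNOTSUPP;
                                goto end_io;
                        }

                        /* the driver only promised atomic_write_segments bios */
                        for (b = first_bio; b; b = b->bi_next)
                                nr_bios++;

                        if (!q->limits.atomic_write_segments ||
                            nr_bios > q->limits.atomic_write_segments) {
                                err = -EOPNOTSUPP;
                                goto end_io;
                        }
                }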
> -        /*
> -         * Various block parts want %current->io_context and lazy ioc
> -         * allocation ends up trading a lot of pain for a small amount of
> -         * memory. Just allocate it upfront. This may fail and block
> -         * layer knows how to live with it.
> -         */
> -        create_io_context(GFP_ATOMIC, q->node);
> +                /*
> +                 * Various block parts want %current->io_context and lazy ioc
> +                 * allocation ends up trading a lot of pain for a small amount of
> +                 * memory. Just allocate it upfront. This may fail and block
> +                 * layer knows how to live with it.
> +                 */
> +                create_io_context(GFP_ATOMIC, q->node);
>
> -        if (blk_throtl_bio(q, bio))
> -                return false;        /* throttled, will be resubmitted later */
> +                if (blk_throtl_bio(q, bio))
> +                        return false;        /* throttled, will be resubmitted later */
>
> -        trace_block_bio_queue(q, bio);
> +                trace_block_bio_queue(q, bio);
> +        }
>          return true;
>
>  end_io:
> -        bio_endio(bio, err);
> +        end_linked_bio(first_bio, err);
>          return false;
>  }
>
> @@ -1788,6 +1824,17 @@ void generic_make_request(struct bio *bio)
>                  return;
>
>          /*
> +         * generic_make_request checks for atomic write support, we'll have
> +         * failed already if the queue doesn't support it
> +         */
> +        if (bio->bi_rw & REQ_ATOMIC) {
> +                struct request_queue *q = bdev_get_queue(bio->bi_bdev);
> +
> +                q->make_request_fn(q, bio);

Directly sending the bios out like this is a problem for MD/DM. But we
should fail early with the stacked queue (MD/DM) check, so this isn't a
problem once we detect that the queue is stacked.

Thanks,
Shaohua
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html