This allows filesystems and O_DIRECT to send down a list of bios flagged
for atomic completion.  If the hardware supports atomic IO, it is given
the whole list in a single make_request_fn call.

In order to limit corner cases, there are a few restrictions in the
current code:

* Every bio in the list must be for the same queue

* Every bio must be a simple write.  No trims or reads may be mixed in

A new blk_queue_set_atomic_write() sets the number of atomic segments a
given driver can accept.  Any number greater than one is allowed, but
the driver is expected to do final checks on the bio list to make sure
a given list fits inside its atomic capabilities.

Signed-off-by: Chris Mason <chris.mason@xxxxxxxxxxxx>
---
 block/blk-core.c       | 217 +++++++++++++++++++++++++++++++------------------
 block/blk-settings.c   |  17 ++++
 include/linux/blkdev.h |  14 ++++
 3 files changed, 170 insertions(+), 78 deletions(-)
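An illustrative sketch of the intended usage, for reviewers only (it sits
below the ---, so it is not part of the commit).  The helper, my_end_io()
and the sector/page arguments are made up for the example, the segment
count of 16 is arbitrary, REQ_ATOMIC itself is defined elsewhere in this
series, and error handling is omitted:

/* driver side: advertise how many segments one atomic write may carry */
blk_queue_set_atomic_write(q, 16);

/* submitter side: chain simple write bios on bi_next, send the list down */
static int submit_atomic_list(struct block_device *bdev, struct page **pages,
                              sector_t *sectors, int nr)
{
        struct bio *first = NULL;
        struct bio *prev = NULL;
        int i;

        /* the queue must accept at least this many atomic segments */
        if (bdev_atomic_write_segments(bdev) < nr)
                return -EOPNOTSUPP;

        for (i = 0; i < nr; i++) {
                struct bio *bio = bio_alloc(GFP_NOFS, 1);

                bio->bi_bdev = bdev;
                bio->bi_iter.bi_sector = sectors[i];
                bio->bi_rw = WRITE | REQ_ATOMIC;
                bio->bi_end_io = my_end_io;
                bio_add_page(bio, pages[i], PAGE_SIZE, 0);

                /* link the bios on bi_next; every bio is a plain write */
                if (prev)
                        prev->bi_next = bio;
                else
                        first = bio;
                prev = bio;
        }

        /* the driver sees the whole list in a single make_request_fn call */
        generic_make_request(first);
        return 0;
}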
diff --git a/block/blk-core.c b/block/blk-core.c
index 39d1261..6a5c292 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1664,95 +1664,131 @@ static inline int bio_check_eod(struct bio *bio, unsigned int nr_sectors)
         return 0;
 }
 
+static void end_linked_bio(struct bio *bio, int err)
+{
+        struct bio *next;
+        do {
+                next = bio->bi_next;
+                bio->bi_next = NULL;
+                bio_endio(bio, err);
+                bio = next;
+        } while (bio);
+}
+
 static noinline_for_stack bool
-generic_make_request_checks(struct bio *bio)
+generic_make_request_checks(struct bio *first_bio)
 {
-        struct request_queue *q;
-        int nr_sectors = bio_sectors(bio);
+        struct request_queue *q = NULL;
+        int nr_sectors;
         int err = -EIO;
         char b[BDEVNAME_SIZE];
         struct hd_struct *part;
+        struct bio *bio;
+        int linked_bio = first_bio->bi_next ? 1 : 0;
 
         might_sleep();
 
-        if (bio_check_eod(bio, nr_sectors))
-                goto end_io;
+        bio = first_bio;
+        for_each_bio(bio) {
+                nr_sectors = bio_sectors(bio);
+                if (bio_check_eod(bio, nr_sectors))
+                        goto end_io;
 
-        q = bdev_get_queue(bio->bi_bdev);
-        if (unlikely(!q)) {
-                printk(KERN_ERR
-                       "generic_make_request: Trying to access "
-                        "nonexistent block-device %s (%Lu)\n",
-                        bdevname(bio->bi_bdev, b),
-                        (long long) bio->bi_iter.bi_sector);
-                goto end_io;
-        }
+                if (!q) {
+                        q = bdev_get_queue(bio->bi_bdev);
+                        if (unlikely(!q)) {
+                                printk(KERN_ERR
+                                       "generic_make_request: Trying to access "
+                                       "nonexistent block-device %s (%Lu)\n",
+                                       bdevname(bio->bi_bdev, b),
+                                       (long long) bio->bi_iter.bi_sector);
+                                goto end_io;
+                        }
+                } else if (q != bdev_get_queue(bio->bi_bdev)) {
+                        printk(KERN_ERR "generic_make_request: linked bio queue mismatch\n");
+                        goto end_io;
+                }
 
-        if (likely(bio_is_rw(bio) &&
-                   nr_sectors > queue_max_hw_sectors(q))) {
-                printk(KERN_ERR "bio too big device %s (%u > %u)\n",
-                       bdevname(bio->bi_bdev, b),
-                       bio_sectors(bio),
-                       queue_max_hw_sectors(q));
-                goto end_io;
-        }
+                if (likely(bio_is_rw(bio) &&
+                           nr_sectors > queue_max_hw_sectors(q))) {
+                        printk(KERN_ERR "bio too big device %s (%u > %u)\n",
+                               bdevname(bio->bi_bdev, b),
+                               bio_sectors(bio),
+                               queue_max_hw_sectors(q));
+                        goto end_io;
+                }
 
-        part = bio->bi_bdev->bd_part;
-        if (should_fail_request(part, bio->bi_iter.bi_size) ||
-            should_fail_request(&part_to_disk(part)->part0,
-                                bio->bi_iter.bi_size))
-                goto end_io;
+                part = bio->bi_bdev->bd_part;
+                if (should_fail_request(part, bio->bi_iter.bi_size) ||
+                    should_fail_request(&part_to_disk(part)->part0,
+                                        bio->bi_iter.bi_size))
+                        goto end_io;
 
-        /*
-         * If this device has partitions, remap block n
-         * of partition p to block n+start(p) of the disk.
-         */
-        blk_partition_remap(bio);
+                /*
+                 * If this device has partitions, remap block n
+                 * of partition p to block n+start(p) of the disk.
+                 */
+                blk_partition_remap(bio);
 
-        if (bio_check_eod(bio, nr_sectors))
-                goto end_io;
+                if (bio_check_eod(bio, nr_sectors))
+                        goto end_io;
 
-        /*
-         * Filter flush bio's early so that make_request based
-         * drivers without flush support don't have to worry
-         * about them.
-         */
-        if ((bio->bi_rw & (REQ_FLUSH | REQ_FUA)) && !q->flush_flags) {
-                bio->bi_rw &= ~(REQ_FLUSH | REQ_FUA);
-                if (!nr_sectors) {
-                        err = 0;
+                /*
+                 * Filter flush bio's early so that make_request based
+                 * drivers without flush support don't have to worry
+                 * about them.
+                 */
+                if ((bio->bi_rw & (REQ_FLUSH | REQ_FUA)) && !q->flush_flags) {
+                        bio->bi_rw &= ~(REQ_FLUSH | REQ_FUA);
+                        if (!nr_sectors) {
+                                /*
+                                 * we don't know how to mix empty flush bios
+                                 * with a list of non-flush bios on devices
+                                 * that don't support flushing
+                                 */
+                                if (linked_bio)
+                                        err = -EINVAL;
+                                else
+                                        err = 0;
+                                goto end_io;
+                        }
+                }
+
+                if ((bio->bi_rw & REQ_DISCARD) &&
+                    (!blk_queue_discard(q) ||
+                     ((bio->bi_rw & REQ_SECURE) && !blk_queue_secdiscard(q)))) {
+                        err = -EOPNOTSUPP;
                         goto end_io;
                 }
-        }
 
-        if ((bio->bi_rw & REQ_DISCARD) &&
-            (!blk_queue_discard(q) ||
-             ((bio->bi_rw & REQ_SECURE) && !blk_queue_secdiscard(q)))) {
-                err = -EOPNOTSUPP;
-                goto end_io;
-        }
+                if (bio->bi_rw & REQ_WRITE_SAME && !bdev_write_same(bio->bi_bdev)) {
+                        err = -EOPNOTSUPP;
+                        goto end_io;
+                }
 
-        if (bio->bi_rw & REQ_WRITE_SAME && !bdev_write_same(bio->bi_bdev)) {
-                err = -EOPNOTSUPP;
-                goto end_io;
-        }
+                if ((bio->bi_rw & REQ_ATOMIC) &&
+                    !q->limits.atomic_write_segments) {
+                        err = -EOPNOTSUPP;
+                        goto end_io;
+                }
 
-        /*
-         * Various block parts want %current->io_context and lazy ioc
-         * allocation ends up trading a lot of pain for a small amount of
-         * memory. Just allocate it upfront. This may fail and block
-         * layer knows how to live with it.
-         */
-        create_io_context(GFP_ATOMIC, q->node);
+                /*
+                 * Various block parts want %current->io_context and lazy ioc
+                 * allocation ends up trading a lot of pain for a small amount of
+                 * memory. Just allocate it upfront. This may fail and block
+                 * layer knows how to live with it.
+                 */
+                create_io_context(GFP_ATOMIC, q->node);
 
-        if (blk_throtl_bio(q, bio))
-                return false;   /* throttled, will be resubmitted later */
+                if (blk_throtl_bio(q, bio))
+                        return false;   /* throttled, will be resubmitted later */
 
-        trace_block_bio_queue(q, bio);
+                trace_block_bio_queue(q, bio);
+        }
 
         return true;
 
 end_io:
-        bio_endio(bio, err);
+        end_linked_bio(first_bio, err);
         return false;
 }
 
@@ -1788,6 +1824,17 @@ void generic_make_request(struct bio *bio)
                 return;
 
         /*
+         * generic_make_request checks for atomic write support, we'll have
+         * failed already if the queue doesn't support it
+         */
+        if (bio->bi_rw & REQ_ATOMIC) {
+                struct request_queue *q = bdev_get_queue(bio->bi_bdev);
+
+                q->make_request_fn(q, bio);
+                return;
+        }
+
+        /*
          * We only want one ->make_request_fn to be active at a time, else
          * stack usage with stacked devices could be a problem.  So use
          * current->bio_list to keep a list of requests submited by a
@@ -1815,6 +1862,10 @@ void generic_make_request(struct bio *bio)
          * from the top.  In this case we really did just take the bio
          * of the top of the list (no pretending) and so remove it from
          * bio_list, and call into ->make_request() again.
+         *
+         * REQ_ATOMIC bios may have been chained on bi_next, but we
+         * should have caught them all above.  This BUG_ON(bi_next)
+         * will catch any lists of bios that were not flagged as atomic
          */
         BUG_ON(bio->bi_next);
         bio_list_init(&bio_list_on_stack);
@@ -1849,28 +1900,38 @@ void submit_bio(int rw, struct bio *bio)
          * go through the normal accounting stuff before submission.
          */
         if (bio_has_data(bio)) {
-                unsigned int count;
-
-                if (unlikely(rw & REQ_WRITE_SAME))
-                        count = bdev_logical_block_size(bio->bi_bdev) >> 9;
-                else
-                        count = bio_sectors(bio);
+                unsigned int count = 0;
+                unsigned int size = 0;
+                struct bio *walk;
+
+                walk = bio;
+                for_each_bio(walk) {
+                        if (unlikely(rw & REQ_WRITE_SAME))
+                                count += bdev_logical_block_size(walk->bi_bdev) >> 9;
+                        else
+                                count += bio_sectors(walk);
+                        size += walk->bi_iter.bi_size;
+                }
 
                 if (rw & WRITE) {
                         count_vm_events(PGPGOUT, count);
                 } else {
-                        task_io_account_read(bio->bi_iter.bi_size);
+                        task_io_account_read(size);
                         count_vm_events(PGPGIN, count);
                 }
 
                 if (unlikely(block_dump)) {
                         char b[BDEVNAME_SIZE];
-                        printk(KERN_DEBUG "%s(%d): %s block %Lu on %s (%u sectors)\n",
-                        current->comm, task_pid_nr(current),
-                                (rw & WRITE) ? "WRITE" : "READ",
-                                (unsigned long long)bio->bi_iter.bi_sector,
-                                bdevname(bio->bi_bdev, b),
-                                count);
+
+                        walk = bio;
+                        for_each_bio(walk) {
+                                printk(KERN_DEBUG "%s(%d): %s block %Lu on %s (%u sectors)\n",
+                                current->comm, task_pid_nr(current),
+                                        (rw & WRITE) ? "WRITE" : "READ",
+                                        (unsigned long long)walk->bi_iter.bi_sector,
+                                        bdevname(walk->bi_bdev, b),
+                                        bio_sectors(walk));
+                        }
                 }
         }
 
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 5330933..17a6d23 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -119,6 +119,7 @@ void blk_set_default_limits(struct queue_limits *lim)
         lim->discard_alignment = 0;
         lim->discard_misaligned = 0;
         lim->discard_zeroes_data = 0;
+        lim->atomic_write_segments = 0;
         lim->logical_block_size = lim->physical_block_size = lim->io_min = 512;
         lim->bounce_pfn = (unsigned long)(BLK_BOUNCE_ANY >> PAGE_SHIFT);
         lim->alignment_offset = 0;
@@ -804,6 +805,22 @@ void blk_queue_update_dma_alignment(struct request_queue *q, int mask)
 EXPORT_SYMBOL(blk_queue_update_dma_alignment);
 
 /**
+ * blk_queue_set_atomic_write - number of segments supported for atomic writes
+ * @q:  the request queue for the device
+ * @segments:  number of segments supported
+ *
+ * description:
+ *    If the device supports atomic (or transactional) writes, then it can pass
+ *    the maximum number of segments it supports in here.  Atomic writes are
+ *    either completed as a whole, or none of it gets written.
+ **/
+void blk_queue_set_atomic_write(struct request_queue *q, unsigned int segments)
+{
+        q->limits.atomic_write_segments = segments;
+}
+EXPORT_SYMBOL(blk_queue_set_atomic_write);
+
+/**
  * blk_queue_flush - configure queue's cache flush capability
  * @q:          the request queue for the device
  * @flush:      0, REQ_FLUSH or REQ_FLUSH | REQ_FUA
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index ca0119d..40238bf 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -283,6 +283,8 @@ struct queue_limits {
         unsigned int            discard_granularity;
         unsigned int            discard_alignment;
 
+        unsigned int            atomic_write_segments;
+
         unsigned short          logical_block_size;
         unsigned short          max_segments;
         unsigned short          max_integrity_segments;
@@ -968,6 +970,8 @@ extern void blk_queue_logical_block_size(struct request_queue *, unsigned short)
 extern void blk_queue_physical_block_size(struct request_queue *, unsigned int);
 extern void blk_queue_alignment_offset(struct request_queue *q,
                                        unsigned int alignment);
+extern void blk_queue_set_atomic_write(struct request_queue *q,
+                                       unsigned int segments);
 extern void blk_limits_io_min(struct queue_limits *limits, unsigned int min);
 extern void blk_queue_io_min(struct request_queue *q, unsigned int min);
 extern void blk_limits_io_opt(struct queue_limits *limits, unsigned int opt);
@@ -1190,6 +1194,16 @@ static inline unsigned short queue_logical_block_size(struct request_queue *q)
         return retval;
 }
 
+static inline unsigned short bdev_atomic_write_segments(struct block_device *bdev)
+{
+        struct request_queue *q = bdev_get_queue(bdev);
+
+        if (q)
+                return q->limits.atomic_write_segments;
+
+        return 0;
+}
+
 static inline unsigned short bdev_logical_block_size(struct block_device *bdev)
 {
         return queue_logical_block_size(bdev_get_queue(bdev));
-- 
1.8.2