On 11/29/18 3:55 PM, Jens Axboe wrote: > The bio referencing has a trick that doesn't do any actual atomic > inc/dec on the reference count until we have to elevator to > 1. For the > async IO O_DIRECT case, we can't use the simple DIO variants, so we use > __blkdev_direct_IO(). It always grabs an extra reference to the bio > after allocation, which means we then enter the slower path of actually > having to do atomic_inc/dec on the count. > > We don't need to do that for the async case, unless we end up going > multi-bio, in which case we're already doing huge amounts of IO. For the > smaller IO case (< BIO_MAX_PAGES), we can do without the extra ref. I accidentally generated this against the aio-poll branch, here's a version that is not. diff --git a/fs/block_dev.c b/fs/block_dev.c index d233a59ea364..5c0a224bf9cb 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -281,10 +281,25 @@ struct blkdev_dio { static struct bio_set blkdev_dio_pool; +static void blkdev_bio_dirty_release(struct bio *bio, bool should_dirty) +{ + if (should_dirty) { + bio_check_pages_dirty(bio); + } else { + struct bio_vec *bvec; + int i; + + bio_for_each_segment_all(bvec, bio, i) + put_page(bvec->bv_page); + bio_put(bio); + } +} + static void blkdev_bio_end_io(struct bio *bio) { struct blkdev_dio *dio = bio->bi_private; bool should_dirty = dio->should_dirty; + bool should_free = false; if (dio->multi_bio && !atomic_dec_and_test(&dio->ref)) { if (bio->bi_status && !dio->bio.bi_status) @@ -302,25 +317,20 @@ static void blkdev_bio_end_io(struct bio *bio) } dio->iocb->ki_complete(iocb, ret, 0); - bio_put(&dio->bio); } else { struct task_struct *waiter = dio->waiter; WRITE_ONCE(dio->waiter, NULL); blk_wake_io_task(waiter); } + /* last bio is done, now we can free the parent holding dio */ + should_free = dio->multi_bio; } - if (should_dirty) { - bio_check_pages_dirty(bio); - } else { - struct bio_vec *bvec; - int i; - - bio_for_each_segment_all(bvec, bio, i) - put_page(bvec->bv_page); - bio_put(bio); - } + if (!dio->is_sync) + blkdev_bio_dirty_release(bio, should_dirty); + if (should_free) + bio_put(&dio->bio); } static ssize_t @@ -343,14 +353,15 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) return -EINVAL; bio = bio_alloc_bioset(GFP_KERNEL, nr_pages, &blkdev_dio_pool); - bio_get(bio); /* extra ref for the completion handler */ dio = container_of(bio, struct blkdev_dio, bio); dio->is_sync = is_sync = is_sync_kiocb(iocb); - if (dio->is_sync) + if (dio->is_sync) { dio->waiter = current; - else + bio_get(bio); + } else { dio->iocb = iocb; + } dio->size = 0; dio->multi_bio = false; @@ -400,6 +411,13 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) } if (!dio->multi_bio) { + /* + * async needs an extra reference if we are goign + * multi-bio only, so we can ensure that 'dio' + * sticks around. + */ + if (!is_sync) + bio_get(bio); dio->multi_bio = true; atomic_set(&dio->ref, 2); } else { @@ -433,6 +451,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) if (likely(!ret)) ret = dio->size; + blkdev_bio_dirty_release(bio, dio->should_dirty); bio_put(&dio->bio); return ret; } -- Jens Axboe