To give others context for why I'm caring about this issue again, this recent BZ against 4.3-rc served as a reminder that we _need_ a fix: https://bugzilla.redhat.com/show_bug.cgi?id=1267650 FYI, I cleaned up the plug-based approach a bit further, here is the incremental patch: http://git.kernel.org/cgit/linux/kernel/git/snitzer/linux.git/commit/?h=wip&id=f73d001ec692125308accbb5ca26f892f949c1b6 And here is a new version of the overall combined patch (sharing now before I transition to looking at alternatives, though my gut is the use of a plug in generic_make_request really wouldn't hurt us.. famous last words): block/bio.c | 82 +++++++++++++------------------------------------- block/blk-core.c | 21 ++++++++----- drivers/md/dm-bufio.c | 2 +- drivers/md/raid1.c | 6 ++-- drivers/md/raid10.c | 6 ++-- include/linux/blkdev.h | 11 +++++-- include/linux/sched.h | 4 --- 7 files changed, 51 insertions(+), 81 deletions(-) diff --git a/block/bio.c b/block/bio.c index ad3f276..3d03668 100644 --- a/block/bio.c +++ b/block/bio.c @@ -354,35 +354,31 @@ static void bio_alloc_rescue(struct work_struct *work) } } -static void punt_bios_to_rescuer(struct bio_set *bs) +/** + * blk_flush_bio_list + * @plug: the blk_plug that may have collected bios + * + * Pop bios queued on plug->bio_list and submit each of them to + * their rescue workqueue. + * + * If the bio doesn't have a bio_set, we use the default fs_bio_set. + * However, stacking drivers should use bio_set, so this shouldn't be + * an issue. + */ +void blk_flush_bio_list(struct blk_plug *plug) { - struct bio_list punt, nopunt; struct bio *bio; - /* - * In order to guarantee forward progress we must punt only bios that - * were allocated from this bio_set; otherwise, if there was a bio on - * there for a stacking driver higher up in the stack, processing it - * could require allocating bios from this bio_set, and doing that from - * our own rescuer would be bad. - * - * Since bio lists are singly linked, pop them all instead of trying to - * remove from the middle of the list: - */ - - bio_list_init(&punt); - bio_list_init(&nopunt); - - while ((bio = bio_list_pop(current->bio_list))) - bio_list_add(bio->bi_pool == bs ? &punt : &nopunt, bio); - - *current->bio_list = nopunt; - - spin_lock(&bs->rescue_lock); - bio_list_merge(&bs->rescue_list, &punt); - spin_unlock(&bs->rescue_lock); + while ((bio = bio_list_pop(&plug->bio_list))) { + struct bio_set *bs = bio->bi_pool; + if (!bs) + bs = fs_bio_set; - queue_work(bs->rescue_workqueue, &bs->rescue_work); + spin_lock(&bs->rescue_lock); + bio_list_add(&bs->rescue_list, bio); + queue_work(bs->rescue_workqueue, &bs->rescue_work); + spin_unlock(&bs->rescue_lock); + } } /** @@ -422,7 +418,6 @@ static void punt_bios_to_rescuer(struct bio_set *bs) */ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) { - gfp_t saved_gfp = gfp_mask; unsigned front_pad; unsigned inline_vecs; unsigned long idx = BIO_POOL_NONE; @@ -443,37 +438,8 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) /* should not use nobvec bioset for nr_iovecs > 0 */ if (WARN_ON_ONCE(!bs->bvec_pool && nr_iovecs > 0)) return NULL; - /* - * generic_make_request() converts recursion to iteration; this - * means if we're running beneath it, any bios we allocate and - * submit will not be submitted (and thus freed) until after we - * return. - * - * This exposes us to a potential deadlock if we allocate - * multiple bios from the same bio_set() while running - * underneath generic_make_request(). If we were to allocate - * multiple bios (say a stacking block driver that was splitting - * bios), we would deadlock if we exhausted the mempool's - * reserve. - * - * We solve this, and guarantee forward progress, with a rescuer - * workqueue per bio_set. If we go to allocate and there are - * bios on current->bio_list, we first try the allocation - * without __GFP_WAIT; if that fails, we punt those bios we - * would be blocking to the rescuer workqueue before we retry - * with the original gfp_flags. - */ - - if (current->bio_list && !bio_list_empty(current->bio_list)) - gfp_mask &= ~__GFP_WAIT; p = mempool_alloc(bs->bio_pool, gfp_mask); - if (!p && gfp_mask != saved_gfp) { - punt_bios_to_rescuer(bs); - gfp_mask = saved_gfp; - p = mempool_alloc(bs->bio_pool, gfp_mask); - } - front_pad = bs->front_pad; inline_vecs = BIO_INLINE_VECS; } @@ -486,12 +452,6 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) if (nr_iovecs > inline_vecs) { bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, bs->bvec_pool); - if (!bvl && gfp_mask != saved_gfp) { - punt_bios_to_rescuer(bs); - gfp_mask = saved_gfp; - bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, bs->bvec_pool); - } - if (unlikely(!bvl)) goto err_free; diff --git a/block/blk-core.c b/block/blk-core.c index 2eb722d..cf0706a 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1927,6 +1927,7 @@ end_io: void generic_make_request(struct bio *bio) { struct bio_list bio_list_on_stack; + struct blk_plug plug; if (!generic_make_request_checks(bio)) return; @@ -1934,15 +1935,15 @@ void generic_make_request(struct bio *bio) /* * We only want one ->make_request_fn to be active at a time, else * stack usage with stacked devices could be a problem. So use - * current->bio_list to keep a list of requests submited by a - * make_request_fn function. current->bio_list is also used as a + * current->plug->bio_list to keep a list of requests submitted by a + * make_request_fn function. current->plug->bio_list is also used as a * flag to say if generic_make_request is currently active in this * task or not. If it is NULL, then no make_request is active. If * it is non-NULL, then a make_request is active, and new requests * should be added at the tail */ - if (current->bio_list) { - bio_list_add(current->bio_list, bio); + if (current->plug && current->plug->bio_list) { + bio_list_add(¤t->plug->bio_list, bio); return; } @@ -1962,15 +1963,17 @@ void generic_make_request(struct bio *bio) */ BUG_ON(bio->bi_next); bio_list_init(&bio_list_on_stack); - current->bio_list = &bio_list_on_stack; + blk_start_plug(&plug); + current->plug->bio_list = &bio_list_on_stack; do { struct request_queue *q = bdev_get_queue(bio->bi_bdev); q->make_request_fn(q, bio); - bio = bio_list_pop(current->bio_list); + bio = bio_list_pop(current->plug->bio_list); } while (bio); - current->bio_list = NULL; /* deactivate */ + current->plug->bio_list = NULL; /* deactivate */ + blk_finish_plug(&plug); } EXPORT_SYMBOL(generic_make_request); @@ -3065,6 +3068,8 @@ void blk_start_plug(struct blk_plug *plug) INIT_LIST_HEAD(&plug->list); INIT_LIST_HEAD(&plug->mq_list); INIT_LIST_HEAD(&plug->cb_list); + plug->bio_list = NULL; + /* * Store ordering should not be needed here, since a potential * preempt will imply a full memory barrier @@ -3151,6 +3156,8 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) LIST_HEAD(list); unsigned int depth; + blk_flush_bio_list(plug); + flush_plug_callbacks(plug, from_schedule); if (!list_empty(&plug->mq_list)) diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 2dd3308..c2bff16 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -168,7 +168,7 @@ static inline int dm_bufio_cache_index(struct dm_bufio_client *c) #define DM_BUFIO_CACHE(c) (dm_bufio_caches[dm_bufio_cache_index(c)]) #define DM_BUFIO_CACHE_NAME(c) (dm_bufio_cache_names[dm_bufio_cache_index(c)]) -#define dm_bufio_in_request() (!!current->bio_list) +#define dm_bufio_in_request() (current->plug && !!current->plug->bio_list) static void dm_bufio_lock(struct dm_bufio_client *c) { diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 4517f06..357782f 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -874,8 +874,8 @@ static sector_t wait_barrier(struct r1conf *conf, struct bio *bio) (!conf->barrier || ((conf->start_next_window < conf->next_resync + RESYNC_SECTORS) && - current->bio_list && - !bio_list_empty(current->bio_list))), + (current->plug && current->plug->bio_list && + !bio_list_empty(current->plug->bio_list)))), conf->resync_lock); conf->nr_waiting--; } @@ -1013,7 +1013,7 @@ static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule) struct r1conf *conf = mddev->private; struct bio *bio; - if (from_schedule || current->bio_list) { + if (from_schedule || (current->plug && current->plug->bio_list)) { spin_lock_irq(&conf->device_lock); bio_list_merge(&conf->pending_bio_list, &plug->pending); conf->pending_count += plug->pending_cnt; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 0fc33eb..780681f 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -944,8 +944,8 @@ static void wait_barrier(struct r10conf *conf) wait_event_lock_irq(conf->wait_barrier, !conf->barrier || (conf->nr_pending && - current->bio_list && - !bio_list_empty(current->bio_list)), + (current->plug && current->plug->bio_list && + !bio_list_empty(current->plug->bio_list))), conf->resync_lock); conf->nr_waiting--; } @@ -1021,7 +1021,7 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule) struct r10conf *conf = mddev->private; struct bio *bio; - if (from_schedule || current->bio_list) { + if (from_schedule || (current->plug && current->plug->bio_list)) { spin_lock_irq(&conf->device_lock); bio_list_merge(&conf->pending_bio_list, &plug->pending); conf->pending_count += plug->pending_cnt; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 99da9eb..9bdac70 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1040,6 +1040,7 @@ struct blk_plug { struct list_head list; /* requests */ struct list_head mq_list; /* blk-mq requests */ struct list_head cb_list; /* md requires an unplug callback */ + struct bio_list *bio_list; /* queued bios from stacked block device */ }; #define BLK_MAX_REQUEST_COUNT 16 @@ -1079,9 +1080,12 @@ static inline bool blk_needs_flush_plug(struct task_struct *tsk) return plug && (!list_empty(&plug->list) || !list_empty(&plug->mq_list) || - !list_empty(&plug->cb_list)); + !list_empty(&plug->cb_list) || + (plug->bio_list && !bio_list_empty(plug->bio_list))); } +extern void blk_flush_bio_list(struct blk_plug *plug); + /* * tag stuff */ @@ -1673,12 +1677,15 @@ static inline void blk_schedule_flush_plug(struct task_struct *task) { } - static inline bool blk_needs_flush_plug(struct task_struct *tsk) { return false; } +static inline void blk_flush_bio_list(void) +{ +} + static inline int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, sector_t *error_sector) { diff --git a/include/linux/sched.h b/include/linux/sched.h index b7b9501..ca304f1 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -128,7 +128,6 @@ struct sched_attr { struct futex_pi_state; struct robust_list_head; -struct bio_list; struct fs_struct; struct perf_event_context; struct blk_plug; @@ -1633,9 +1632,6 @@ struct task_struct { /* journalling filesystem info */ void *journal_info; -/* stacked block device info */ - struct bio_list *bio_list; - #ifdef CONFIG_BLOCK /* stack plugging */ struct blk_plug *plug; -- dm-devel mailing list dm-devel@xxxxxxxxxx https://www.redhat.com/mailman/listinfo/dm-devel