2887e41 ("blk-wbt: Avoid lock contention and thundering herd issue in
wbt_wait") introduces two cases that can miss a wakeup:

 - __wbt_done wakes up only one waiter at a time, even though there may
   be multiple waiters and (limit - inflight) > 1 at that moment.

 - A woken waiter is still on the wait queue and sets itself back to
   TASK_UNINTERRUPTIBLE immediately, so it can be woken up a second
   time. If another __wbt_done comes in and wakes it again, that wakeup
   is wasted on a waiter that is already running.

To fix both cases without introducing too much lock contention,
introduce our own wake function, wbt_wake_function, in __wbt_wait and
use wake_up_all in __wbt_done. wbt_wake_function first tries to get wbt
budget; if that succeeds, it wakes up the process, otherwise it returns
-1 to interrupt the wake-up loop.

Signed-off-by: Jianchao Wang <jianchao.w.wang@xxxxxxxxxx>
Fixes: 2887e41 ("blk-wbt: Avoid lock contention and thundering herd issue in wbt_wait")
Cc: Anchal Agarwal <anchalag@xxxxxxxxxx>
Cc: Frank van der Linden <fllinden@xxxxxxxxxx>
---
 block/blk-wbt.c | 78 +++++++++++++++++++++++++++++++++++++++------------------
 1 file changed, 54 insertions(+), 24 deletions(-)

diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index c9358f1..2667590 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -166,7 +166,7 @@ static void __wbt_done(struct rq_qos *rqos, enum wbt_flags wb_acct)
 		int diff = limit - inflight;
 
 		if (!inflight || diff >= rwb->wb_background / 2)
-			wake_up(&rqw->wait);
+			wake_up_all(&rqw->wait);
 	}
 }
 
@@ -481,6 +481,40 @@ static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw)
 	return limit;
 }
 
+struct wbt_wait_data {
+	struct task_struct *curr;
+	struct rq_wb *rwb;
+	struct rq_wait *rqw;
+	unsigned long rw;
+};
+
+static int wbt_wake_function(wait_queue_entry_t *curr, unsigned int mode,
+			     int wake_flags, void *key)
+{
+	struct wbt_wait_data *data = curr->private;
+
+	/*
+	 * If we fail to get budget, return -1 to interrupt the wake-up
+	 * loop in __wake_up_common.
+	 */
+	if (!rq_wait_inc_below(data->rqw, get_limit(data->rwb, data->rw)))
+		return -1;
+
+	wake_up_process(data->curr);
+
+	list_del_init(&curr->entry);
+	return 1;
+}
+
+static inline void wbt_init_wait(struct wait_queue_entry *wait,
+				 struct wbt_wait_data *data)
+{
+	INIT_LIST_HEAD(&wait->entry);
+	wait->flags = 0;
+	wait->func = wbt_wake_function;
+	wait->private = data;
+}
+
 /*
  * Block if we will exceed our limit, or if we are currently waiting for
  * the timer to kick off queuing again.
@@ -491,31 +525,27 @@ static void __wbt_wait(struct rq_wb *rwb, enum wbt_flags wb_acct,
 	__acquires(lock)
 {
 	struct rq_wait *rqw = get_rq_wait(rwb, wb_acct);
-	DECLARE_WAITQUEUE(wait, current);
-	bool has_sleeper;
-
-	has_sleeper = wq_has_sleeper(&rqw->wait);
-	if (!has_sleeper && rq_wait_inc_below(rqw, get_limit(rwb, rw)))
+	struct wait_queue_entry wait;
+	struct wbt_wait_data data = {
+		.curr = current,
+		.rwb = rwb,
+		.rqw = rqw,
+		.rw = rw,
+	};
+
+	if (!wq_has_sleeper(&rqw->wait) &&
+	    rq_wait_inc_below(rqw, get_limit(rwb, rw)))
 		return;
 
-	add_wait_queue_exclusive(&rqw->wait, &wait);
-	do {
-		set_current_state(TASK_UNINTERRUPTIBLE);
-
-		if (!has_sleeper && rq_wait_inc_below(rqw, get_limit(rwb, rw)))
-			break;
-
-		if (lock) {
-			spin_unlock_irq(lock);
-			io_schedule();
-			spin_lock_irq(lock);
-		} else
-			io_schedule();
-		has_sleeper = false;
-	} while (1);
-
-	__set_current_state(TASK_RUNNING);
-	remove_wait_queue(&rqw->wait, &wait);
+	wbt_init_wait(&wait, &data);
+	prepare_to_wait_exclusive(&rqw->wait, &wait,
+				  TASK_UNINTERRUPTIBLE);
+	if (lock) {
+		spin_unlock_irq(lock);
+		io_schedule();
+		spin_lock_irq(lock);
+	} else
+		io_schedule();
 }
 
 static inline bool wbt_should_throttle(struct rq_wb *rwb, struct bio *bio)
-- 
2.7.4
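
Note for reviewers unfamiliar with custom wake functions: below is a
simplified, non-authoritative sketch of the waiter scan inside
__wake_up_common() (kernel/sched/wait.c), with bookmark handling
elided; the _sketch suffix marks it as illustrative rather than the
real function. It shows why returning -1 from wbt_wake_function stops
the scan:

	static int __wake_up_common_sketch(struct wait_queue_head *wq_head,
					   unsigned int mode, int nr_exclusive,
					   int wake_flags, void *key)
	{
		wait_queue_entry_t *curr, *next;

		/* Walk the waiters in queue order, oldest first. */
		list_for_each_entry_safe(curr, next, &wq_head->head, entry) {
			unsigned int flags = curr->flags;
			/* wbt_wake_function runs here, under the waitqueue lock. */
			int ret = curr->func(curr, mode, wake_flags, key);

			if (ret < 0)	/* e.g. wbt_wake_function found no budget */
				break;
			if (ret && (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
				break;
		}
		return nr_exclusive;
	}

wake_up_all() passes nr_exclusive == 0, so the scan stops only when the
queue is drained or a wake function returns a negative value. Because
wbt_wake_function claims budget via rq_wait_inc_below() before waking
each task, each wakeup issued from __wbt_done corresponds to one unit
of freed budget, which is what closes the missed- and wasted-wakeup
windows described above.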