Adding Frank and Anchal, as this part of the thread did not copy them. Just a
minor comment at the bottom.

On Fri, Aug 24, 2018 at 08:58:09AM -0600, Jens Axboe wrote:
> On 8/24/18 8:40 AM, Jens Axboe wrote:
> > On 8/23/18 8:06 PM, jianchao.wang wrote:
> >> Hi Jens
> >>
> >> On 08/23/2018 11:42 PM, Jens Axboe wrote:
> >>>> -
> >>>> -        __set_current_state(TASK_RUNNING);
> >>>> -        remove_wait_queue(&rqw->wait, &wait);
> >>>> +        wbt_init_wait(&wait, &data);
> >>>> +        prepare_to_wait_exclusive(&rqw->wait, &wait,
> >>>> +                                  TASK_UNINTERRUPTIBLE);
> >>>> +        if (lock) {
> >>>> +                spin_unlock_irq(lock);
> >>>> +                io_schedule();
> >>>> +                spin_lock_irq(lock);
> >>>> +        } else
> >>>> +                io_schedule();
> >>> Aren't we still missing a get-token attempt after adding to the
> >>> waitqueue? For the case where someone frees the token after your
> >>> initial check, but before you add yourself to the waitqueue.
> >>
> >> I did think about this. However, there is a very tricky scenario here:
> >> we try to get the wbt budget in wbt_wake_function. Once a task has been
> >> added to the wait queue, wbt_wake_function can already be invoked for
> >> it. If we also try to get the wbt budget after prepare_to_wait_exclusive,
> >> we may get the wbt budget twice.
> >
> > Ah yes, good point. But without it, you've got another race that will
> > potentially put you to sleep forever.
> >
> > How about something like the below? That should take care of both
> > situations. Totally untested.
>
> Slightly better/cleaner one below. Still totally untested.
>
>
> diff --git a/block/blk-wbt.c b/block/blk-wbt.c
> index 84507d3e9a98..bc13544943ff 100644
> --- a/block/blk-wbt.c
> +++ b/block/blk-wbt.c
> @@ -123,16 +123,11 @@ static void rwb_wake_all(struct rq_wb *rwb)
>          }
>  }
>
> -static void __wbt_done(struct rq_qos *rqos, enum wbt_flags wb_acct)
> +static void wbt_rqw_done(struct rq_wb *rwb, struct rq_wait *rqw,
> +                         enum wbt_flags wb_acct)
>  {
> -        struct rq_wb *rwb = RQWB(rqos);
> -        struct rq_wait *rqw;
>          int inflight, limit;
>
> -        if (!(wb_acct & WBT_TRACKED))
> -                return;
> -
> -        rqw = get_rq_wait(rwb, wb_acct);
>          inflight = atomic_dec_return(&rqw->inflight);
>
>          /*
> @@ -166,8 +161,21 @@ static void __wbt_done(struct rq_qos *rqos, enum wbt_flags wb_acct)
>                  int diff = limit - inflight;
>
>                  if (!inflight || diff >= rwb->wb_background / 2)
> -                        wake_up(&rqw->wait);
> +                        wake_up_all(&rqw->wait);
>          }
> +
> +}
> +
> +static void __wbt_done(struct rq_qos *rqos, enum wbt_flags wb_acct)
> +{
> +        struct rq_wb *rwb = RQWB(rqos);
> +        struct rq_wait *rqw;
> +
> +        if (!(wb_acct & WBT_TRACKED))
> +                return;
> +
> +        rqw = get_rq_wait(rwb, wb_acct);
> +        wbt_rqw_done(rwb, rqw, wb_acct);
>  }
>
>  /*
> @@ -481,6 +489,32 @@ static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw)
>          return limit;
>  }
>
> +struct wbt_wait_data {
> +        struct task_struct *curr;
> +        struct rq_wb *rwb;
> +        struct rq_wait *rqw;
> +        unsigned long rw;
> +        bool got_token;
> +};
> +
> +static int wbt_wake_function(wait_queue_entry_t *curr, unsigned int mode,
> +                             int wake_flags, void *key)
> +{
> +        struct wbt_wait_data *data = curr->private;
> +
> +        /*
> +         * If we fail to get a budget, return -1 to interrupt the wake up
> +         * loop in __wake_up_common.
> +         */
> +        if (!rq_wait_inc_below(data->rqw, get_limit(data->rwb, data->rw)))
> +                return -1;
> +
> +        data->got_token = true;
> +        wake_up_process(data->curr);
> +        list_del_init(&curr->entry);
> +        return 1;
> +}
> +
>  /*
>   * Block if we will exceed our limit, or if we are currently waiting for
>   * the timer to kick off queuing again.
> @@ -491,31 +525,44 @@ static void __wbt_wait(struct rq_wb *rwb, enum wbt_flags wb_acct,
>          __acquires(lock)
>  {
>          struct rq_wait *rqw = get_rq_wait(rwb, wb_acct);
> -        DECLARE_WAITQUEUE(wait, current);
> +        struct wbt_wait_data data = {
> +                .curr = current,
> +                .rwb = rwb,
> +                .rqw = rqw,
> +                .rw = rw,
> +        };
> +        struct wait_queue_entry wait = {
> +                .func = wbt_wake_function,
> +                .private = &data,
> +                .entry = LIST_HEAD_INIT(wait.entry),
> +        };
>          bool has_sleeper;
>
>          has_sleeper = wq_has_sleeper(&rqw->wait);
>          if (!has_sleeper && rq_wait_inc_below(rqw, get_limit(rwb, rw)))
>                  return;
>
> -        add_wait_queue_exclusive(&rqw->wait, &wait);
> -        do {
> -                set_current_state(TASK_UNINTERRUPTIBLE);
> +        prepare_to_wait_exclusive(&rqw->wait, &wait, TASK_UNINTERRUPTIBLE);
>
> -                if (!has_sleeper && rq_wait_inc_below(rqw, get_limit(rwb, rw)))
> -                        break;
> +        if (!has_sleeper && rq_wait_inc_below(rqw, get_limit(rwb, rw))) {
> +                finish_wait(&rqw->wait, &wait);
>
> -                if (lock) {
> -                        spin_unlock_irq(lock);
> -                        io_schedule();
> -                        spin_lock_irq(lock);
> -                } else
> -                        io_schedule();
> -                has_sleeper = false;
> -        } while (1);
> -
> -        __set_current_state(TASK_RUNNING);
> -        remove_wait_queue(&rqw->wait, &wait);
> +        /*
> +         * We raced with wbt_wake_function() getting a token, which
> +         * means we now have two. Put ours and wake anyone else
> +         * potentially waiting for one.
> +         */
> +        if (data.got_token)
> +                wbt_rqw_done(rwb, rqw, wb_acct);
> +        return;
> +        }
> +
> +        if (lock) {
> +                spin_unlock_irq(lock);
> +                io_schedule();
> +                spin_lock_irq(lock);
> +        } else
> +                io_schedule();

Nitpick, but shouldn't this look like:

+        if (lock) {
+                spin_unlock_irq(lock);
+                io_schedule();
+                spin_lock_irq(lock);
+        } else {
+                io_schedule();
+        }

And another random thought: it would be good to have some sort of tracing of
this path (a rough sketch of what that could look like follows below).

>  }
>
>  static inline bool wbt_should_throttle(struct rq_wb *rwb, struct bio *bio)
>
> --
> Jens Axboe
>

--
All the best,
Eduardo Valentin
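To make the tracing suggestion above concrete, here is a minimal sketch of one
way it could look: a trace_printk() dropped into the race-resolution branch of
__wbt_wait() from the patch above, so the "raced with the wake function and
handed the extra token back" case shows up in the trace buffer. This is purely
an illustration and not part of the patch; the call site, message text, and
fields are assumptions, and a real change would more likely add a proper
TRACE_EVENT instead of trace_printk().

        if (!has_sleeper && rq_wait_inc_below(rqw, get_limit(rwb, rw))) {
                finish_wait(&rqw->wait, &wait);

                if (data.got_token) {
                        /*
                         * Hypothetical trace hook: record that this waiter
                         * raced with wbt_wake_function() and is returning
                         * its extra token via wbt_rqw_done().
                         */
                        trace_printk("wbt: raced with wake_function, returning extra token, inflight=%d\n",
                                     atomic_read(&rqw->inflight));
                        wbt_rqw_done(rwb, rqw, wb_acct);
                }
                return;
        }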