On 8/24/18 2:41 PM, Jens Axboe wrote: > On 8/24/18 2:33 PM, Anchal Agarwal wrote: >> On Fri, Aug 24, 2018 at 12:50:44PM -0600, Jens Axboe wrote: >>> On 8/24/18 12:12 PM, Anchal Agarwal wrote: >>>> That's totally fair. As compared to before the patch it was way too high >>>> and my test case wasn't even running due to the thundering herd issues and >>>> queue re-ordering. Anyways as I also mentioned before 10 times >>>> contention is not too bad since it's not really affecting much the number of >>>> files read in my application. Also, you are right waking up N tasks seems >>>> plausible. >>> >>> OK, I'm going to take that as a positive response. I'm going to propose >>> the last patch as the final addition in this round, since it does fix a >>> gap in the previous. And I do think that we need to wake as many tasks >>> as can make progress, otherwise we're deliberately running the device at >>> a lower load than we should. >>> >>>> My application is somewhat similar to database workload. It does use fsync >>>> internally also. So what it does is it creates files of random sizes with >>>> random contents. It stores the hash of those files in memory. During the >>>> test it reads those files back from storage and checks their hashes. >>> >>> How many tasks are running for your test? >>> >>> -- >>> Jens Axboe >>> >>> >> >> So there are 128 Concurrent reads/writes happening. Max files written before >> reads start is 16384 and each file is of size 512KB. Does that answer your >> question? > > Yes it does, thanks. That's not a crazy amount of tasks or threads. > >> BTW, I still have to test the last patch you sent but by looking at the patch >> I assumed it will work anyways! > > Thanks for the vote of confidence, I'd appreciate if you would give it a > whirl. Your workload seems nastier than what I test with, so would be > great to have someone else test it too. This is what I have come up with, this actually survives some torture testing. 
We do need to have the wait as a loop, since we can't rely on just being woken from the ->func handler we set. It also handles the prep token get race. diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 84507d3e9a98..2442b2b141b8 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -123,16 +123,11 @@ static void rwb_wake_all(struct rq_wb *rwb) } } -static void __wbt_done(struct rq_qos *rqos, enum wbt_flags wb_acct) +static void wbt_rqw_done(struct rq_wb *rwb, struct rq_wait *rqw, + enum wbt_flags wb_acct) { - struct rq_wb *rwb = RQWB(rqos); - struct rq_wait *rqw; int inflight, limit; - if (!(wb_acct & WBT_TRACKED)) - return; - - rqw = get_rq_wait(rwb, wb_acct); inflight = atomic_dec_return(&rqw->inflight); /* @@ -166,8 +161,21 @@ static void __wbt_done(struct rq_qos *rqos, enum wbt_flags wb_acct) int diff = limit - inflight; if (!inflight || diff >= rwb->wb_background / 2) - wake_up(&rqw->wait); + wake_up_all(&rqw->wait); } + +} + +static void __wbt_done(struct rq_qos *rqos, enum wbt_flags wb_acct) +{ + struct rq_wb *rwb = RQWB(rqos); + struct rq_wait *rqw; + + if (!(wb_acct & WBT_TRACKED)) + return; + + rqw = get_rq_wait(rwb, wb_acct); + wbt_rqw_done(rwb, rqw, wb_acct); } /* @@ -481,6 +489,33 @@ static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw) return limit; } +struct wbt_wait_data { + struct wait_queue_entry wq; + struct task_struct *curr; + struct rq_wb *rwb; + struct rq_wait *rqw; + unsigned long rw; + unsigned long flags; +}; + +static int wbt_wake_function(struct wait_queue_entry *curr, unsigned int mode, + int wake_flags, void *key) +{ + struct wbt_wait_data *data = container_of(curr, struct wbt_wait_data, wq); + + /* + * If we fail to get a budget, return -1 to interrupt the wake up + * loop in __wake_up_common. 
+ */ + if (!rq_wait_inc_below(data->rqw, get_limit(data->rwb, data->rw))) + return -1; + + set_bit(0, &data->flags); + wake_up_process(data->curr); + list_del_init(&curr->entry); + return 1; +} + /* * Block if we will exceed our limit, or if we are currently waiting for * the timer to kick off queuing again. @@ -491,19 +526,42 @@ static void __wbt_wait(struct rq_wb *rwb, enum wbt_flags wb_acct, __acquires(lock) { struct rq_wait *rqw = get_rq_wait(rwb, wb_acct); - DECLARE_WAITQUEUE(wait, current); + struct wbt_wait_data data = { + .wq = { + .func = wbt_wake_function, + .entry = LIST_HEAD_INIT(data.wq.entry), + }, + .curr = current, + .rwb = rwb, + .rqw = rqw, + .rw = rw, + }; bool has_sleeper; has_sleeper = wq_has_sleeper(&rqw->wait); if (!has_sleeper && rq_wait_inc_below(rqw, get_limit(rwb, rw))) return; - add_wait_queue_exclusive(&rqw->wait, &wait); + prepare_to_wait_exclusive(&rqw->wait, &data.wq, TASK_UNINTERRUPTIBLE); do { - set_current_state(TASK_UNINTERRUPTIBLE); + if (test_bit(0, &data.flags)) + break; - if (!has_sleeper && rq_wait_inc_below(rqw, get_limit(rwb, rw))) + WARN_ON_ONCE(list_empty(&data.wq.entry)); + + if (!has_sleeper && + rq_wait_inc_below(rqw, get_limit(rwb, rw))) { + finish_wait(&rqw->wait, &data.wq); + + /* + * We raced with wbt_wake_function() getting a token, + * which means we now have two. Put ours and wake + * anyone else potentially waiting for one. + */ + if (test_bit(0, &data.flags)) + wbt_rqw_done(rwb, rqw, wb_acct); break; + } if (lock) { spin_unlock_irq(lock); @@ -511,11 +569,11 @@ static void __wbt_wait(struct rq_wb *rwb, enum wbt_flags wb_acct, spin_lock_irq(lock); } else io_schedule(); + has_sleeper = false; } while (1); - __set_current_state(TASK_RUNNING); - remove_wait_queue(&rqw->wait, &wait); + finish_wait(&rqw->wait, &data.wq); } static inline bool wbt_should_throttle(struct rq_wb *rwb, struct bio *bio) -- Jens Axboe