[PATCH V2 22/22] block, bfq: handle bursts of queue activations

Paolo Valente <paolo.valente@xxxxxxxxxx> · Mon, 8 Aug 2016 13:15:17 +0200

From: Arianna Avanzini <avanzini.arianna@xxxxxxxxx>

Many popular I/O-intensive services or applications spawn or
reactivate many parallel threads/processes during short time
intervals. Examples are systemd during boot or git grep.  These
services or applications benefit mostly from a high throughput: the
quicker the I/O generated by their processes is cumulatively served,
the sooner the target job of these services or applications gets
completed. As a consequence, it is almost always counterproductive to
weight-raise any of the queues associated to the processes of these
services or applications: in most cases it would just lower the
throughput, mainly because weight-raising also implies device idling.

To address this issue, an I/O scheduler needs, first, to detect which
queues are associated with these services or applications. In this
respect, we have that, from the I/O-scheduler standpoint, these
services or applications cause bursts of activations, i.e.,
activations of different queues occurring shortly after each
other. However, a shorter burst of activations may be caused also by
the start of an application that does not consist in a lot of parallel
I/O-bound threads (see the comments on the function bfq_handle_burst
for details).

In view of these facts, this commit introduces:
1) an heuristic to detect (only) bursts of queue activations caused by
   services or applications consisting in many parallel I/O-bound
   threads;
2) the prevention of device idling and weight-raising for the queues
   belonging to these bursts.

Signed-off-by: Arianna Avanzini <avanzini.arianna@xxxxxxxxx>
Signed-off-by: Paolo Valente <paolo.valente@xxxxxxxxxx>
---
 block/cfq-iosched.c | 401 ++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 389 insertions(+), 12 deletions(-)

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 50dab96..19d202b 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -40,7 +40,9 @@
  * real-time. This feature enables BFQ to provide applications in
  * these classes with a very low latency. Finally, BFQ also features
  * additional heuristics for preserving both a low latency and a high
- * throughput on NCQ-capable, rotational or flash-based devices.
+ * throughput on NCQ-capable, rotational or flash-based devices, and
+ * to get the job done quickly for applications consisting in many
+ * I/O-bound processes.
  *
  * With respect to the version of BFQ presented in [1], and in the
  * papers cited therein, this implementation adds a hierarchical
@@ -309,6 +311,10 @@ struct bfq_queue {
 
 	/* bit vector: a 1 for each seeky requests in history */
 	u32 seek_history;
+
+	/* node for the device's burst list */
+	struct hlist_node burst_list_node;
+
 	/* position of the last request enqueued */
 	sector_t last_request_pos;
 
@@ -400,6 +406,17 @@ struct bfq_io_cq {
 	 * classification of a queue.
 	 */
 	bool saved_IO_bound;
+
+	/*
+	 * Same purpose as the previous fields for the value of the
+	 * field keeping the queue's belonging to a large burst
+	 */
+	bool saved_in_large_burst;
+	/*
+	 * True if the queue belonged to a burst list before its merge
+	 * with another cooperating queue.
+	 */
+	bool was_in_burst_list;
 };
 
 enum bfq_device_speed {
@@ -538,6 +555,36 @@ struct bfq_data {
 	 */
 	bool strict_guarantees;
 
+	/*
+	 * Last time at which a queue entered the current burst of
+	 * queues being activated shortly after each other; for more
+	 * details about this and the following parameters related to
+	 * a burst of activations, see the comments on the function
+	 * bfq_handle_burst.
+	 */
+	unsigned long last_ins_in_burst;
+	/*
+	 * Reference time interval used to decide whether a queue has
+	 * been activated shortly after @last_ins_in_burst.
+	 */
+	unsigned long bfq_burst_interval;
+	/* number of queues in the current burst of queue activations */
+	int burst_size;
+
+	/* common parent entity for the queues in the burst */
+	struct bfq_entity *burst_parent_entity;
+	/* Maximum burst size above which the current queue-activation
+	 * burst is deemed as 'large'.
+	 */
+	unsigned long bfq_large_burst_thresh;
+	/* true if a large queue-activation burst is in progress */
+	bool large_burst;
+	/*
+	 * Head of the burst list (as for the above fields, more
+	 * details in the comments on the function bfq_handle_burst).
+	 */
+	struct hlist_head burst_list;
+
 	/* if set to true, low-latency heuristics are enabled */
 	bool low_latency;
 	/*
@@ -577,7 +624,8 @@ struct bfq_data {
 };
 
 enum bfqq_state_flags {
-	BFQ_BFQQ_FLAG_busy = 0,		/* has requests or is in service */
+	BFQ_BFQQ_FLAG_just_created = 0,	/* queue just allocated */
+	BFQ_BFQQ_FLAG_busy,		/* has requests or is in service */
 	BFQ_BFQQ_FLAG_wait_request,	/* waiting for a request */
 	BFQ_BFQQ_FLAG_non_blocking_wait_rq, /*
 					     * waiting for a request
@@ -592,6 +640,10 @@ enum bfqq_state_flags {
 					 * having consumed at most 2/10 of
 					 * its budget
 					 */
+	BFQ_BFQQ_FLAG_in_large_burst,	/*
+					 * bfqq activated in a large burst,
+					 * see comments to bfq_handle_burst.
+					 */
 	BFQ_BFQQ_FLAG_softrt_update,	/*
 					 * may need softrt-next-start
 					 * update
@@ -614,6 +666,7 @@ static int bfq_bfqq_##name(const struct bfq_queue *bfqq)		\
 	return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0;	\
 }
 
+BFQ_BFQQ_FNS(just_created);
 BFQ_BFQQ_FNS(busy);
 BFQ_BFQQ_FNS(wait_request);
 BFQ_BFQQ_FNS(non_blocking_wait_rq);
@@ -622,6 +675,7 @@ BFQ_BFQQ_FNS(fifo_expire);
 BFQ_BFQQ_FNS(idle_window);
 BFQ_BFQQ_FNS(sync);
 BFQ_BFQQ_FNS(IO_bound);
+BFQ_BFQQ_FNS(in_large_burst);
 BFQ_BFQQ_FNS(coop);
 BFQ_BFQQ_FNS(split_coop);
 BFQ_BFQQ_FNS(softrt_update);
@@ -3754,6 +3808,232 @@ static int bfqq_process_refs(struct bfq_queue *bfqq)
 	return process_refs;
 }
 
+/* Empty burst list and add just bfqq (see comments on bfq_handle_burst) */
+static void bfq_reset_burst_list(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+	struct bfq_queue *item;
+	struct hlist_node *n;
+
+	hlist_for_each_entry_safe(item, n, &bfqd->burst_list, burst_list_node)
+		hlist_del_init(&item->burst_list_node);
+	hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list);
+	bfqd->burst_size = 1;
+	bfqd->burst_parent_entity = bfqq->entity.parent;
+}
+
+/* Add bfqq to the list of queues in current burst (see bfq_handle_burst) */
+static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+	/* Increment burst size to take into account also bfqq */
+	bfqd->burst_size++;
+
+	if (bfqd->burst_size == bfqd->bfq_large_burst_thresh) {
+		struct bfq_queue *pos, *bfqq_item;
+		struct hlist_node *n;
+
+		/*
+		 * Enough queues have been activated shortly after each
+		 * other to consider this burst as large.
+		 */
+		bfqd->large_burst = true;
+
+		/*
+		 * We can now mark all queues in the burst list as
+		 * belonging to a large burst.
+		 */
+		hlist_for_each_entry(bfqq_item, &bfqd->burst_list,
+				     burst_list_node)
+			bfq_mark_bfqq_in_large_burst(bfqq_item);
+		bfq_mark_bfqq_in_large_burst(bfqq);
+
+		/*
+		 * From now on, and until the current burst finishes, any
+		 * new queue being activated shortly after the last queue
+		 * was inserted in the burst can be immediately marked as
+		 * belonging to a large burst. So the burst list is not
+		 * needed any more. Remove it.
+		 */
+		hlist_for_each_entry_safe(pos, n, &bfqd->burst_list,
+					  burst_list_node)
+			hlist_del_init(&pos->burst_list_node);
+	} else /*
+		* Burst not yet large: add bfqq to the burst list. Do
+		* not increment the ref counter for bfqq, because bfqq
+		* is removed from the burst list before freeing bfqq
+		* in put_queue.
+		*/
+		hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list);
+}
+
+/*
+ * If many queues belonging to the same group happen to be created
+ * shortly after each other, then the processes associated with these
+ * queues have typically a common goal. In particular, bursts of queue
+ * creations are usually caused by services or applications that spawn
+ * many parallel threads/processes. Examples are systemd during boot,
+ * or git grep. To help these processes get their job done as soon as
+ * possible, it is usually better to not grant either weight-raising
+ * or device idling to their queues.
+ *
+ * In this comment we describe, firstly, the reasons why this fact
+ * holds, and, secondly, the next function, which implements the main
+ * steps needed to properly mark these queues so that they can then be
+ * treated in a different way.
+ *
+ * The above services or applications benefit mostly from a high
+ * throughput: the quicker the requests of the activated queues are
+ * cumulatively served, the sooner the target job of these queues gets
+ * completed. As a consequence, weight-raising any of these queues,
+ * which also implies idling the device for it, is almost always
+ * counterproductive. In most cases it just lowers throughput.
+ *
+ * On the other hand, a burst of queue creations may be caused also by
+ * the start of an application that does not consist of a lot of
+ * parallel I/O-bound threads. In fact, with a complex application,
+ * several short processes may need to be executed to start-up the
+ * application. In this respect, to start an application as quickly as
+ * possible, the best thing to do is in any case to privilege the I/O
+ * related to the application with respect to all other
+ * I/O. Therefore, the best strategy to start as quickly as possible
+ * an application that causes a burst of queue creations is to
+ * weight-raise all the queues created during the burst. This is the
+ * exact opposite of the best strategy for the other type of bursts.
+ *
+ * In the end, to take the best action for each of the two cases, the
+ * two types of bursts need to be distinguished. Fortunately, this
+ * seems relatively easy, by looking at the sizes of the bursts. In
+ * particular, we found a threshold such that only bursts with a
+ * larger size than that threshold are apparently caused by
+ * services or commands such as systemd or git grep. For brevity,
+ * hereafter we call just 'large' these bursts. BFQ *does not*
+ * weight-raise queues whose creation occurs in a large burst. In
+ * addition, for each of these queues BFQ performs or does not perform
+ * idling depending on which choice boosts the throughput more. The
+ * exact choice depends on the device and request pattern at
+ * hand.
+ *
+ * Unfortunately, false positives may occur while an interactive task
+ * is starting (e.g., an application is being started). The
+ * consequence is that the queues associated with the task do not
+ * enjoy weight raising as expected. Fortunately these false positives
+ * are very rare. They typically occur if some service happens to
+ * start doing I/O exactly when the interactive task starts.
+ *
+ * Turning back to the next function, it implements all the steps
+ * needed to detect the occurrence of a large burst and to properly
+ * mark all the queues belonging to it (so that they can then be
+ * treated in a different way). This goal is achieved by maintaining a
+ * "burst list" that holds, temporarily, the queues that belong to the
+ * burst in progress. The list is then used to mark these queues as
+ * belonging to a large burst if the burst does become large. The main
+ * steps are the following.
+ *
+ * . when the very first queue is created, the queue is inserted into the
+ *   list (as it could be the first queue in a possible burst)
+ *
+ * . if the current burst has not yet become large, and a queue Q that does
+ *   not yet belong to the burst is activated shortly after the last time
+ *   at which a new queue entered the burst list, then the function appends
+ *   Q to the burst list
+ *
+ * . if, as a consequence of the previous step, the burst size reaches
+ *   the large-burst threshold, then
+ *
+ *     . all the queues in the burst list are marked as belonging to a
+ *       large burst
+ *
+ *     . the burst list is deleted; in fact, the burst list already served
+ *       its purpose (keeping temporarily track of the queues in a burst,
+ *       so as to be able to mark them as belonging to a large burst in the
+ *       previous sub-step), and now is not needed any more
+ *
+ *     . the device enters a large-burst mode
+ *
+ * . if a queue Q that does not belong to the burst is created while
+ *   the device is in large-burst mode and shortly after the last time
+ *   at which a queue either entered the burst list or was marked as
+ *   belonging to the current large burst, then Q is immediately marked
+ *   as belonging to a large burst.
+ *
+ * . if a queue Q that does not belong to the burst is created a while
+ *   later, i.e., not shortly after, than the last time at which a queue
+ *   either entered the burst list or was marked as belonging to the
+ *   current large burst, then the current burst is deemed as finished and:
+ *
+ *        . the large-burst mode is reset if set
+ *
+ *        . the burst list is emptied
+ *
+ *        . Q is inserted in the burst list, as Q may be the first queue
+ *          in a possible new burst (then the burst list contains just Q
+ *          after this step).
+ */
+static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+	/*
+	 * If bfqq is already in the burst list or is part of a large
+	 * burst, or finally has just been split, then there is
+	 * nothing else to do.
+	 */
+	if (!hlist_unhashed(&bfqq->burst_list_node) ||
+	    bfq_bfqq_in_large_burst(bfqq) ||
+	    time_is_after_eq_jiffies(bfqq->split_time +
+				     msecs_to_jiffies(10)))
+		return;
+
+	/*
+	 * If bfqq's creation happens late enough, or bfqq belongs to
+	 * a different group than the burst group, then the current
+	 * burst is finished, and related data structures must be
+	 * reset.
+	 *
+	 * In this respect, consider the special case where bfqq is
+	 * the very first queue created after BFQ is selected for this
+	 * device. In this case, last_ins_in_burst and
+	 * burst_parent_entity are not yet significant when we get
+	 * here. But it is easy to verify that, whether or not the
+	 * following condition is true, bfqq will end up being
+	 * inserted into the burst list. In particular the list will
+	 * happen to contain only bfqq. And this is exactly what has
+	 * to happen, as bfqq may be the first queue of the first
+	 * burst.
+	 */
+	if (time_is_before_jiffies(bfqd->last_ins_in_burst +
+	    bfqd->bfq_burst_interval) ||
+	    bfqq->entity.parent != bfqd->burst_parent_entity) {
+		bfqd->large_burst = false;
+		bfq_reset_burst_list(bfqd, bfqq);
+		goto end;
+	}
+
+	/*
+	 * If we get here, then bfqq is being activated shortly after the
+	 * last queue. So, if the current burst is also large, we can mark
+	 * bfqq as belonging to this large burst immediately.
+	 */
+	if (bfqd->large_burst) {
+		bfq_mark_bfqq_in_large_burst(bfqq);
+		goto end;
+	}
+
+	/*
+	 * If we get here, then a large-burst state has not yet been
+	 * reached, but bfqq is being activated shortly after the last
+	 * queue. Then we add bfqq to the burst.
+	 */
+	bfq_add_to_burst(bfqd, bfqq);
+end:
+	/*
+	 * At this point, bfqq either has been added to the current
+	 * burst or has caused the current burst to terminate and a
+	 * possible new burst to start. In particular, in the second
+	 * case, bfqq has become the first queue in the possible new
+	 * burst.  In both cases last_ins_in_burst needs to be moved
+	 * forward.
+	 */
+	bfqd->last_ins_in_burst = jiffies;
+}
+
 static int bfq_bfqq_budget_left(struct bfq_queue *bfqq)
 {
 	struct bfq_entity *entity = &bfqq->entity;
@@ -3967,6 +4247,7 @@ static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd,
 					     unsigned int old_wr_coeff,
 					     bool wr_or_deserves_wr,
 					     bool interactive,
+					     bool in_burst,
 					     bool soft_rt)
 {
 	if (old_wr_coeff == 1 && wr_or_deserves_wr) {
@@ -3997,7 +4278,9 @@ static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd,
 		if (interactive) { /* update wr coeff and duration */
 			bfqq->wr_coeff = bfqd->bfq_wr_coeff;
 			bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
-		} else if (time_before(
+		} else if (in_burst)
+			bfqq->wr_coeff = 1;
+		else if (time_before(
 				   bfqq->last_wr_start_finish +
 				   bfqq->wr_cur_max_time,
 				   jiffies +
@@ -4070,7 +4353,8 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
 					     struct request *rq,
 					     bool *interactive)
 {
-	bool soft_rt, wr_or_deserves_wr, bfqq_wants_to_preempt,
+	bool soft_rt, in_burst,	wr_or_deserves_wr,
+		bfqq_wants_to_preempt,
 		idle_for_long_time = bfq_bfqq_idle_for_long_time(bfqd, bfqq),
 		/*
 		 * See the comments on
@@ -4087,12 +4371,15 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
 	/*
 	 * bfqq deserves to be weight-raised if:
 	 * - it is sync,
+	 * - it does not belong to a large burst,
 	 * - it has been idle for enough time or is soft real-time,
 	 * - is linked to a bfq_io_cq (it is not shared in any sense).
 	 */
+	in_burst = bfq_bfqq_in_large_burst(bfqq);
 	soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 &&
+		!in_burst &&
 		time_is_before_jiffies(bfqq->soft_rt_next_start);
-	*interactive = idle_for_long_time;
+	*interactive = !in_burst && idle_for_long_time;
 	wr_or_deserves_wr = bfqd->low_latency &&
 		(bfqq->wr_coeff > 1 ||
 		 (bfq_bfqq_sync(bfqq) &&
@@ -4107,6 +4394,31 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
 						    arrived_in_time,
 						    wr_or_deserves_wr);
 
+	/*
+	 * If bfqq happened to be activated in a burst, but has been
+	 * idle for much more than an interactive queue, then we
+	 * assume that, in the overall I/O initiated in the burst, the
+	 * I/O associated with bfqq is finished. So bfqq does not need
+	 * to be treated as a queue belonging to a burst
+	 * anymore. Accordingly, we reset bfqq's in_large_burst flag
+	 * if set, and remove bfqq from the burst list if it's
+	 * there. We do not decrement burst_size, because the fact
+	 * that bfqq does not need to belong to the burst list any
+	 * more does not invalidate the fact that bfqq was created in
+	 * a burst.
+	 */
+	if (likely(!bfq_bfqq_just_created(bfqq)) &&
+	    idle_for_long_time &&
+	    time_is_before_jiffies(
+		    bfqq->budget_timeout +
+		    msecs_to_jiffies(10000))) {
+		hlist_del_init(&bfqq->burst_list_node);
+		bfq_clear_bfqq_in_large_burst(bfqq);
+	}
+
+	bfq_clear_bfqq_just_created(bfqq);
+
+
 	if (!bfq_bfqq_IO_bound(bfqq)) {
 		if (arrived_in_time) {
 			bfqq->requests_within_timer++;
@@ -4129,6 +4441,7 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
 							 old_wr_coeff,
 							 wr_or_deserves_wr,
 							 *interactive,
+							 in_burst,
 							 soft_rt);
 
 			if (old_wr_coeff != bfqq->wr_coeff)
@@ -4713,6 +5026,8 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq)
 
 	bfqq->bic->saved_idle_window = bfq_bfqq_idle_window(bfqq);
 	bfqq->bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq);
+	bfqq->bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq);
+	bfqq->bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node);
 }
 
 static void bfq_get_bic_reference(struct bfq_queue *bfqq)
@@ -4744,7 +5059,8 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
 	 * where bfqq has just been created, but has not yet made it
 	 * to be weight-raised (which may happen because EQM may merge
 	 * bfqq even before bfq_add_request is executed for the first
-	 * time for bfqq).
+	 * time for bfqq). Handling this case would however be very
+	 * easy, thanks to the flag just_created.
 	 */
 	if (new_bfqq->wr_coeff == 1 && bfqq->wr_coeff > 1) {
 		new_bfqq->wr_coeff = bfqq->wr_coeff;
@@ -5654,6 +5970,7 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq)
 {
 	struct bfq_data *bfqd = bfqq->bfqd;
 	bool idling_boosts_thr, idling_boosts_thr_without_issues,
+		idling_needed_for_service_guarantees,
 		asymmetric_scenario;
 
 	if (bfqd->strict_guarantees)
@@ -5834,6 +6151,23 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq)
 		!bfq_symmetric_scenario(bfqd);
 
 	/*
+	 * Finally, there is a case where maximizing throughput is the
+	 * best choice even if it may cause unfairness toward
+	 * bfqq. Such a case is when bfqq became active in a burst of
+	 * queue activations. Queues that became active during a large
+	 * burst benefit only from throughput, as discussed in the
+	 * comments on bfq_handle_burst. Thus, if bfqq became active
+	 * in a burst and not idling the device maximizes throughput,
+	 * then the device must no be idled, because not idling the
+	 * device provides bfqq and all other queues in the burst with
+	 * maximum benefit. Combining this and the above case, we can
+	 * now establish when idling is actually needed to preserve
+	 * service guarantees.
+	 */
+	idling_needed_for_service_guarantees =
+		asymmetric_scenario && !bfq_bfqq_in_large_burst(bfqq);
+
+	/*
 	 * We have now all the components we need to compute the return
 	 * value of the function, which is true only if both the following
 	 * conditions hold:
@@ -5842,7 +6176,8 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq)
 	 *    is necessary to preserve service guarantees.
 	 */
 	return bfq_bfqq_sync(bfqq) &&
-		(idling_boosts_thr_without_issues || asymmetric_scenario);
+		(idling_boosts_thr_without_issues ||
+		 idling_needed_for_service_guarantees);
 }
 
 /*
@@ -5961,10 +6296,12 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)
 			bfq_log_bfqq(bfqd, bfqq, "WARN: pending prio change");
 
 		/*
-		 * If too much time has elapsed from the beginning of
-		 * this weight-raising period, then end weight raising.
+		 * If the queue was activated in a burst, or too much
+		 * time has elapsed from the beginning of this
+		 * weight-raising period, then end weight raising.
 		 */
-		if (time_is_before_jiffies(bfqq->last_wr_start_finish +
+		if (bfq_bfqq_in_large_burst(bfqq) ||
+		    time_is_before_jiffies(bfqq->last_wr_start_finish +
 					   bfqq->wr_cur_max_time)) {
 			bfqq->last_wr_start_finish = jiffies;
 			bfq_log_bfqq(bfqd, bfqq,
@@ -6151,6 +6488,17 @@ static void bfq_put_queue(struct bfq_queue *bfqq)
 	if (bfqq->ref)
 		return;
 
+	if (bfq_bfqq_sync(bfqq))
+		/*
+		 * The fact that this queue is being destroyed does not
+		 * invalidate the fact that this queue may have been
+		 * activated during the current burst. As a consequence,
+		 * although the queue does not exist anymore, and hence
+		 * needs to be removed from the burst list if there,
+		 * the burst size has not to be decremented.
+		 */
+		hlist_del_init(&bfqq->burst_list_node);
+
 	kmem_cache_free(bfq_pool, bfqq);
 #ifdef CONFIG_CFQ_GROUP_IOSCHED
 	bfqg_put(bfqg);
@@ -6299,6 +6647,7 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 {
 	RB_CLEAR_NODE(&bfqq->entity.rb_node);
 	INIT_LIST_HEAD(&bfqq->fifo);
+	INIT_HLIST_NODE(&bfqq->burst_list_node);
 
 	bfqq->ref = 0;
 	bfqq->bfqd = bfqd;
@@ -6310,6 +6659,7 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 		if (!bfq_class_idle(bfqq))
 			bfq_mark_bfqq_idle_window(bfqq);
 		bfq_mark_bfqq_sync(bfqq);
+		bfq_mark_bfqq_just_created(bfqq);
 	} else
 		bfq_clear_bfqq_sync(bfqq);
 	bfq_mark_bfqq_IO_bound(bfqq);
@@ -6585,6 +6935,7 @@ static void bfq_insert_request(struct request_queue *q, struct request *rq)
 			new_bfqq->allocated[rq_data_dir(rq)]++;
 			bfqq->allocated[rq_data_dir(rq)]--;
 			new_bfqq->ref++;
+			bfq_clear_bfqq_just_created(bfqq);
 			bfq_put_queue(bfqq);
 			if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq)
 				bfq_merge_bfqqs(bfqd, RQ_BIC(rq),
@@ -6806,12 +7157,27 @@ new_queue:
 			bfq_put_queue(bfqq);
 		bfqq = bfq_get_queue(bfqd, bio, is_sync, bic);
 		bic_set_bfqq(bic, bfqq, is_sync);
-		if (split && is_sync)
+		if (split && is_sync) {
+			if ((bic->was_in_burst_list && bfqd->large_burst) ||
+			    bic->saved_in_large_burst)
+				bfq_mark_bfqq_in_large_burst(bfqq);
+			else {
+				bfq_clear_bfqq_in_large_burst(bfqq);
+				if (bic->was_in_burst_list)
+					hlist_add_head(&bfqq->burst_list_node,
+						       &bfqd->burst_list);
+			}
 			bfqq->split_time = jiffies;
+		}
 	} else {
 		/* If the queue was seeky for too long, break it apart. */
 		if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) {
 			bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq");
+
+			/* Update bic before losing reference to bfqq */
+			if (bfq_bfqq_in_large_burst(bfqq))
+				bic->saved_in_large_burst = true;
+
 			bfqq = bfq_split_bfqq(bic, bfqq);
 			split = true;
 			if (!bfqq)
@@ -6845,6 +7211,9 @@ new_queue:
 		}
 	}
 
+	if (unlikely(bfq_bfqq_just_created(bfqq)))
+		bfq_handle_burst(bfqd, bfqq);
+
 	spin_unlock_irqrestore(q->queue_lock, flags);
 
 	return 0;
@@ -7031,6 +7400,10 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
 	bfqd->oom_bfqq.new_ioprio_class = IOPRIO_CLASS_BE;
 	bfqd->oom_bfqq.entity.new_weight =
 		bfq_ioprio_to_weight(bfqd->oom_bfqq.new_ioprio);
+
+	/* oom_bfqq does not participate to bursts */
+	bfq_clear_bfqq_just_created(&bfqd->oom_bfqq);
+
 	/*
 	 * Trigger weight initialization, according to ioprio, at the
 	 * oom_bfqq's first activation. The oom_bfqq's ioprio and ioprio
@@ -7061,6 +7434,7 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
 
 	INIT_LIST_HEAD(&bfqd->active_list);
 	INIT_LIST_HEAD(&bfqd->idle_list);
+	INIT_HLIST_HEAD(&bfqd->burst_list);
 
 	bfqd->hw_tag = -1;
 
@@ -7076,6 +7450,9 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
 
 	bfqd->bfq_requests_within_timer = 120;
 
+	bfqd->bfq_large_burst_thresh = 8;
+	bfqd->bfq_burst_interval = msecs_to_jiffies(180);
+
 	bfqd->low_latency = true;
 
 	/*
@@ -7517,7 +7894,7 @@ static int __init bfq_init(void)
 	if (ret)
 		goto err_pol_unreg;
 
-	pr_info("BFQ I/O-scheduler: v7r3");
+	pr_info("BFQ I/O-scheduler: v8r2");
 
 	return 0;
 
-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html