[PATCH] bfq: Fix use-after-free with cgroups

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



BFQ started crashing with 5.15-based kernels like:

BUG: KASAN: use-after-free in rb_erase (lib/rbtree.c:262 lib/rbtr
Read of size 8 at addr ffff888008193098 by task bash/1472

CPU: 0 PID: 1472 Comm: bash Tainted: G            E     5.15.2-0.
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1
Call Trace:
rb_erase (lib/rbtree.c:262 lib/rbtree.c:445)
bfq_idle_extract (block/bfq-wf2q.c:356)
bfq_put_idle_entity (block/bfq-wf2q.c:660)
bfq_bfqq_served (block/bfq-wf2q.c:833)
bfq_dispatch_request (block/bfq-iosched.c:4870 block/bfq-iosched.
__blk_mq_do_dispatch_sched (block/blk-mq-sched.c:150)
__blk_mq_sched_dispatch_requests (block/blk-mq-sched.c:215 block/
blk_mq_sched_dispatch_requests (block/blk-mq-sched.c:360)
blk_mq_sched_insert_requests (include/linux/percpu-refcount.h:174
blk_mq_flush_plug_list (include/linux/list.h:282 block/blk-mq.c:1
blk_flush_plug_list (block/blk-core.c:1722)
blk_finish_plug (block/blk-core.c:1745 block/blk-core.c:1739)
read_pages (include/linux/list.h:282 mm/readahead.c:152)
page_cache_ra_unbounded (mm/readahead.c:212 (discriminator 2))
filemap_fault (mm/filemap.c:2982 mm/filemap.c:3074)
__do_fault (mm/memory.c:3858)
__handle_mm_fault (mm/memory.c:4182 mm/memory.c:4310 mm/memory.c:
handle_mm_fault (mm/memory.c:4801)

After some analysis we've found out that the culprit of the problem is
that some task is reparented from cgroup G to the root cgroup and G is
offlined. But a bfq_queue in task's IO context still points to G as its
parent and thus when task submits more IO, G is inserted into service
trees. Once the task exits and bfq_queue is destroyed, the last
reference to G is dropped as well and G is freed but it is still linked
from service trees causing use-after-free issues sometime later.

Fix the problem by tracking all bfq_queues that point to a particular
cgroup as their parent and reparent them when the cgroup is going
offline.

CC: stable@xxxxxxxxxxxxxxx
Fixes: e21b7a0b9887 ("block, bfq: add full hierarchical scheduling and cgroups support")
Tested-by: Fabian Vogt <fvogt@xxxxxxxx>
Signed-off-by: Jan Kara <jack@xxxxxxx>
---
 block/bfq-cgroup.c  | 100 ++++++--------------------------------------
 block/bfq-iosched.c |  54 ++++++++++++------------
 block/bfq-iosched.h |   6 +++
 3 files changed, 47 insertions(+), 113 deletions(-)

diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
index 24a5c5329bcd..519e6291e98e 100644
--- a/block/bfq-cgroup.c
+++ b/block/bfq-cgroup.c
@@ -666,6 +666,7 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 		bfq_deactivate_bfqq(bfqd, bfqq, false, false);
 	else if (entity->on_st_or_in_serv)
 		bfq_put_idle_entity(bfq_entity_service_tree(entity), entity);
+	hlist_del(&bfqq->children_node);
 	bfqg_and_blkg_put(bfqq_group(bfqq));
 
 	if (entity->parent &&
@@ -678,6 +679,7 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 	entity->sched_data = &bfqg->sched_data;
 	/* pin down bfqg and its associated blkg  */
 	bfqg_and_blkg_get(bfqg);
+	hlist_add_head(&bfqq->children_node, &bfqg->children);
 
 	if (bfq_bfqq_busy(bfqq)) {
 		if (unlikely(!bfqd->nonrot_with_queueing))
@@ -810,68 +812,13 @@ void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio)
 	rcu_read_unlock();
 }
 
-/**
- * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st.
- * @st: the service tree being flushed.
- */
-static void bfq_flush_idle_tree(struct bfq_service_tree *st)
-{
-	struct bfq_entity *entity = st->first_idle;
-
-	for (; entity ; entity = st->first_idle)
-		__bfq_deactivate_entity(entity, false);
-}
-
-/**
- * bfq_reparent_leaf_entity - move leaf entity to the root_group.
- * @bfqd: the device data structure with the root group.
- * @entity: the entity to move, if entity is a leaf; or the parent entity
- *	    of an active leaf entity to move, if entity is not a leaf.
- */
-static void bfq_reparent_leaf_entity(struct bfq_data *bfqd,
-				     struct bfq_entity *entity,
-				     int ioprio_class)
+static void bfq_reparent_children(struct bfq_data *bfqd, struct bfq_group *bfqg)
 {
 	struct bfq_queue *bfqq;
-	struct bfq_entity *child_entity = entity;
-
-	while (child_entity->my_sched_data) { /* leaf not reached yet */
-		struct bfq_sched_data *child_sd = child_entity->my_sched_data;
-		struct bfq_service_tree *child_st = child_sd->service_tree +
-			ioprio_class;
-		struct rb_root *child_active = &child_st->active;
-
-		child_entity = bfq_entity_of(rb_first(child_active));
-
-		if (!child_entity)
-			child_entity = child_sd->in_service_entity;
-	}
-
-	bfqq = bfq_entity_to_bfqq(child_entity);
-	bfq_bfqq_move(bfqd, bfqq, bfqd->root_group);
-}
-
-/**
- * bfq_reparent_active_queues - move to the root group all active queues.
- * @bfqd: the device data structure with the root group.
- * @bfqg: the group to move from.
- * @st: the service tree to start the search from.
- */
-static void bfq_reparent_active_queues(struct bfq_data *bfqd,
-				       struct bfq_group *bfqg,
-				       struct bfq_service_tree *st,
-				       int ioprio_class)
-{
-	struct rb_root *active = &st->active;
-	struct bfq_entity *entity;
-
-	while ((entity = bfq_entity_of(rb_first(active))))
-		bfq_reparent_leaf_entity(bfqd, entity, ioprio_class);
+	struct hlist_node *next;
 
-	if (bfqg->sched_data.in_service_entity)
-		bfq_reparent_leaf_entity(bfqd,
-					 bfqg->sched_data.in_service_entity,
-					 ioprio_class);
+	hlist_for_each_entry_safe(bfqq, next, &bfqg->children, children_node)
+		bfq_bfqq_move(bfqd, bfqq, bfqd->root_group);
 }
 
 /**
@@ -897,38 +844,17 @@ static void bfq_pd_offline(struct blkg_policy_data *pd)
 		goto put_async_queues;
 
 	/*
-	 * Empty all service_trees belonging to this group before
-	 * deactivating the group itself.
+	 * Reparent all bfqqs under this bfq group. This will also empty all
+	 * service_trees belonging to this group before deactivating the group
+	 * itself.
 	 */
+	bfq_reparent_children(bfqd, bfqg);
+
 	for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) {
 		st = bfqg->sched_data.service_tree + i;
 
-		/*
-		 * It may happen that some queues are still active
-		 * (busy) upon group destruction (if the corresponding
-		 * processes have been forced to terminate). We move
-		 * all the leaf entities corresponding to these queues
-		 * to the root_group.
-		 * Also, it may happen that the group has an entity
-		 * in service, which is disconnected from the active
-		 * tree: it must be moved, too.
-		 * There is no need to put the sync queues, as the
-		 * scheduler has taken no reference.
-		 */
-		bfq_reparent_active_queues(bfqd, bfqg, st, i);
-
-		/*
-		 * The idle tree may still contain bfq_queues
-		 * belonging to exited task because they never
-		 * migrated to a different cgroup from the one being
-		 * destroyed now. In addition, even
-		 * bfq_reparent_active_queues() may happen to add some
-		 * entities to the idle tree. It happens if, in some
-		 * of the calls to bfq_bfqq_move() performed by
-		 * bfq_reparent_active_queues(), the queue to move is
-		 * empty and gets expired.
-		 */
-		bfq_flush_idle_tree(st);
+		WARN_ON_ONCE(!RB_EMPTY_ROOT(&st->active));
+		WARN_ON_ONCE(!RB_EMPTY_ROOT(&st->idle));
 	}
 
 	__bfq_deactivate_entity(entity, false);
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index fec18118dc30..8f33f6b91d05 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -5163,6 +5163,7 @@ void bfq_put_queue(struct bfq_queue *bfqq)
 	if (bfqq->bfqd && bfqq->bfqd->last_completed_rq_bfqq == bfqq)
 		bfqq->bfqd->last_completed_rq_bfqq = NULL;
 
+	hlist_del(&bfqq->children_node);
 	kmem_cache_free(bfq_pool, bfqq);
 	bfqg_and_blkg_put(bfqg);
 }
@@ -5337,8 +5338,9 @@ static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio)
 		bfq_set_next_ioprio_data(bfqq, bic);
 }
 
-static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
-			  struct bfq_io_cq *bic, pid_t pid, int is_sync)
+static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_group *bfqg,
+			  struct bfq_queue *bfqq, struct bfq_io_cq *bic,
+			  pid_t pid, int is_sync)
 {
 	u64 now_ns = ktime_get_ns();
 
@@ -5347,6 +5349,7 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 	INIT_HLIST_NODE(&bfqq->burst_list_node);
 	INIT_HLIST_NODE(&bfqq->woken_list_node);
 	INIT_HLIST_HEAD(&bfqq->woken_list);
+	hlist_add_head(&bfqq->children_node, &bfqg->children);
 
 	bfqq->ref = 0;
 	bfqq->bfqd = bfqd;
@@ -5600,8 +5603,7 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
 				     bfqd->queue->node);
 
 	if (bfqq) {
-		bfq_init_bfqq(bfqd, bfqq, bic, current->pid,
-			      is_sync);
+		bfq_init_bfqq(bfqd, bfqg, bfqq, bic, current->pid, is_sync);
 		bfq_init_entity(&bfqq->entity, bfqg);
 		bfq_log_bfqq(bfqd, bfqq, "allocated");
 	} else {
@@ -6908,6 +6910,7 @@ static void bfq_exit_queue(struct elevator_queue *e)
 
 	hrtimer_cancel(&bfqd->idle_slice_timer);
 
+	hlist_del(&bfqd->oom_bfqq.children_node);
 	/* release oom-queue reference to root group */
 	bfqg_and_blkg_put(bfqd->root_group);
 
@@ -6959,28 +6962,6 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
 	q->elevator = eq;
 	spin_unlock_irq(&q->queue_lock);
 
-	/*
-	 * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues.
-	 * Grab a permanent reference to it, so that the normal code flow
-	 * will not attempt to free it.
-	 */
-	bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0);
-	bfqd->oom_bfqq.ref++;
-	bfqd->oom_bfqq.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO;
-	bfqd->oom_bfqq.new_ioprio_class = IOPRIO_CLASS_BE;
-	bfqd->oom_bfqq.entity.new_weight =
-		bfq_ioprio_to_weight(bfqd->oom_bfqq.new_ioprio);
-
-	/* oom_bfqq does not participate to bursts */
-	bfq_clear_bfqq_just_created(&bfqd->oom_bfqq);
-
-	/*
-	 * Trigger weight initialization, according to ioprio, at the
-	 * oom_bfqq's first activation. The oom_bfqq's ioprio and ioprio
-	 * class won't be changed any more.
-	 */
-	bfqd->oom_bfqq.entity.prio_changed = 1;
-
 	bfqd->queue = q;
 
 	INIT_LIST_HEAD(&bfqd->dispatch);
@@ -7059,6 +7040,27 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
 		goto out_free;
 	bfq_init_root_group(bfqd->root_group, bfqd);
 	bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group);
+	/*
+	 * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues.
+	 * Grab a permanent reference to it, so that the normal code flow
+	 * will not attempt to free it.
+	 */
+	bfq_init_bfqq(bfqd, bfqd->root_group, &bfqd->oom_bfqq, NULL, 1, 0);
+	bfqd->oom_bfqq.ref++;
+	bfqd->oom_bfqq.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO;
+	bfqd->oom_bfqq.new_ioprio_class = IOPRIO_CLASS_BE;
+	bfqd->oom_bfqq.entity.new_weight =
+		bfq_ioprio_to_weight(bfqd->oom_bfqq.new_ioprio);
+
+	/* oom_bfqq does not participate to bursts */
+	bfq_clear_bfqq_just_created(&bfqd->oom_bfqq);
+
+	/*
+	 * Trigger weight initialization, according to ioprio, at the
+	 * oom_bfqq's first activation. The oom_bfqq's ioprio and ioprio
+	 * class won't be changed any more.
+	 */
+	bfqd->oom_bfqq.entity.prio_changed = 1;
 
 	wbt_disable_default(q);
 	return 0;
diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h
index a73488eec8a4..a1984959d6be 100644
--- a/block/bfq-iosched.h
+++ b/block/bfq-iosched.h
@@ -292,6 +292,9 @@ struct bfq_queue {
 
 	/* node for active/idle bfqq list inside parent bfqd */
 	struct list_head bfqq_list;
+	/* Member of parent's bfqg children list */
+	struct hlist_node children_node;
+
 
 	/* associated @bfq_ttime struct */
 	struct bfq_ttime ttime;
@@ -929,6 +932,9 @@ struct bfq_group {
 	struct bfq_entity entity;
 	struct bfq_sched_data sched_data;
 
+	/* bfq_queues under this entity */
+	struct hlist_head children;
+
 	void *bfqd;
 
 	struct bfq_queue *async_bfqq[2][IOPRIO_NR_LEVELS];
-- 
2.26.2




[Index of Archives]     [Linux ARM Kernel]     [Linux ARM]     [Linux Omap]     [Fedora ARM]     [IETF Annouce]     [Security]     [Bugtraq]     [Linux OMAP]     [Linux MIPS]     [eCos]     [Asterisk Internet PBX]     [Linux API]     [Monitors]

  Powered by Linux