Re: v4.20-rc6: Sporadic use-after-free in bt_iter()

On 12/20/18 2:44 PM, Jens Axboe wrote:
> On 12/20/18 2:40 PM, Bart Van Assche wrote:
>> On Thu, 2018-12-20 at 14:34 -0700, Jens Axboe wrote:
>>> Yeah, I don't think it's bulletproof either; it just closes the gap.
>>> I'm fine with fiddling with the tag iteration. On top of what I sent, we
>>> could have tag iteration hold the RCU read lock, and then we just need
>>> to ensure that the tags are freed with RCU.
>>
>> Do you mean using call_rcu() to free tags? Would that require adding a
>> struct rcu_head to every request? Would it be acceptable to increase the
>> size of struct request with an rcu_head? Additionally, could that reduce
>> the queue depth if the time between grace periods is larger than the time
>> between I/O submissions?
> 
> No no, the requests are allocated out of full pages, so we just need one
> per blk_mq_tags. Something like the below... And then the tag iteration
> needs to grab the RCU read lock to prevent the page freeing from happening.
> 
> This should essentially be free, as rcu read lock for tag iteration is
> basically a no-op.
> 
> Need to handle the flush request on top of this as well.

Updated patch below, with the flush queue (FQ) handled as well.

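For reference, the shape of the fix is the classic RCU deferred-free pattern:
readers (the tag iterators) run under rcu_read_lock(), and the backing memory
is freed via queue_rcu_work(), so the free cannot complete until all such
read-side sections have ended. A minimal sketch of that pattern, using made-up
"foo" names rather than the actual blk-mq symbols:

#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

struct foo_tags {
	void *backing;			/* memory the iterators walk */
	struct rcu_work rcu_work;	/* one per container, not per request */
};

static void foo_rcu_free(struct work_struct *work)
{
	struct foo_tags *tags = container_of(to_rcu_work(work),
					     struct foo_tags, rcu_work);

	/* A grace period has elapsed; no iterator can still see this */
	kfree(tags->backing);
	kfree(tags);
}

static void foo_free(struct foo_tags *tags)
{
	/* Punt the free until after the next RCU grace period */
	INIT_RCU_WORK(&tags->rcu_work, foo_rcu_free);
	queue_rcu_work(system_wq, &tags->rcu_work);
}

static void foo_iterate(struct foo_tags *tags)
{
	rcu_read_lock();		/* pins tags->backing */
	/* ... walk tags->backing ... */
	rcu_read_unlock();
}

Because whole pages (not individual requests) are what get freed, a single
rcu_work embedded in blk_mq_tags is enough; no rcu_head is needed in struct
request, and queue_rcu_work() runs the actual freeing from process context.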

diff --git a/block/blk-flush.c b/block/blk-flush.c
index a3fc7191c694..23e1f5fb091f 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -491,12 +491,22 @@ struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q,
 	return NULL;
 }
 
+static void blk_fq_rcu_free(struct work_struct *work)
+{
+	struct blk_flush_queue *fq = container_of(to_rcu_work(work),
+							struct blk_flush_queue,
+							rcu_work);
+
+	kfree(fq->flush_rq);
+	kfree(fq);
+}
+
 void blk_free_flush_queue(struct blk_flush_queue *fq)
 {
 	/* bio based request queue hasn't flush queue */
 	if (!fq)
 		return;
 
-	kfree(fq->flush_rq);
-	kfree(fq);
+	INIT_RCU_WORK(&fq->rcu_work, blk_fq_rcu_free);
+	queue_rcu_work(system_wq, &fq->rcu_work);
 }
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 2089c6c62f44..c39b58391ae8 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -228,13 +228,15 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
 
 	if (!reserved)
 		bitnr += tags->nr_reserved_tags;
-	rq = tags->rqs[bitnr];
+	if (tags->rqs[bitnr].queue != hctx->queue)
+		return true;
 
 	/*
 	 * We can hit rq == NULL here, because the tagging functions
 	 * test and set the bit before assigning ->rqs[].
 	 */
-	if (rq && rq->q == hctx->queue)
+	rq = tags->rqs[bitnr].rq;
+	if (rq)
 		return iter_data->fn(hctx, rq, iter_data->data, reserved);
 	return true;
 }
@@ -263,7 +265,9 @@ static void bt_for_each(struct blk_mq_hw_ctx *hctx, struct sbitmap_queue *bt,
 		.reserved = reserved,
 	};
 
+	rcu_read_lock();
 	sbitmap_for_each_set(&bt->sb, bt_iter, &iter_data);
+	rcu_read_unlock();
 }
 
 struct bt_tags_iter_data {
@@ -287,7 +291,7 @@ static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
 	 * We can hit rq == NULL here, because the tagging functions
 	 * test and set the bit before assining ->rqs[].
 	 */
-	rq = tags->rqs[bitnr];
+	rq = tags->rqs[bitnr].rq;
 	if (rq && blk_mq_request_started(rq))
 		return iter_data->fn(rq, iter_data->data, reserved);
 
@@ -317,8 +321,11 @@ static void bt_tags_for_each(struct blk_mq_tags *tags, struct sbitmap_queue *bt,
 		.reserved = reserved,
 	};
 
-	if (tags->rqs)
+	if (tags->rqs) {
+		rcu_read_lock();
 		sbitmap_for_each_set(&bt->sb, bt_tags_iter, &iter_data);
+		rcu_read_unlock();
+	}
 }
 
 /**
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index 61deab0b5a5a..bdd1bfc08c21 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -4,6 +4,11 @@
 
 #include "blk-mq.h"
 
+struct rq_tag_entry {
+	struct request_queue *queue;
+	struct request *rq;
+};
+
 /*
  * Tag address space map.
  */
@@ -16,9 +21,11 @@ struct blk_mq_tags {
 	struct sbitmap_queue bitmap_tags;
 	struct sbitmap_queue breserved_tags;
 
-	struct request **rqs;
+	struct rq_tag_entry *rqs;
 	struct request **static_rqs;
 	struct list_head page_list;
+
+	struct rcu_work rcu_work;
 };
 
 
@@ -78,7 +85,8 @@ static inline void blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
 static inline void blk_mq_tag_set_rq(struct blk_mq_hw_ctx *hctx,
 		unsigned int tag, struct request *rq)
 {
-	hctx->tags->rqs[tag] = rq;
+	hctx->tags->rqs[tag].queue = hctx->queue;
+	hctx->tags->rqs[tag].rq = rq;
 }
 
 static inline bool blk_mq_tag_is_reserved(struct blk_mq_tags *tags,
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 2de972857496..4decd1e7d2d9 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -298,13 +298,16 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
 		rq->tag = -1;
 		rq->internal_tag = tag;
 	} else {
-		if (data->hctx->flags & BLK_MQ_F_TAG_SHARED) {
+		struct blk_mq_hw_ctx *hctx = data->hctx;
+
+		if (hctx->flags & BLK_MQ_F_TAG_SHARED) {
 			rq_flags = RQF_MQ_INFLIGHT;
-			atomic_inc(&data->hctx->nr_active);
+			atomic_inc(&hctx->nr_active);
 		}
 		rq->tag = tag;
 		rq->internal_tag = -1;
-		data->hctx->tags->rqs[rq->tag] = rq;
+		hctx->tags->rqs[rq->tag].queue = hctx->queue;
+		hctx->tags->rqs[rq->tag].rq = rq;
 	}
 
 	/* csd/requeue_work/fifo_time is initialized before use */
@@ -797,8 +800,8 @@ EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list);
 struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
 {
 	if (tag < tags->nr_tags) {
-		prefetch(tags->rqs[tag]);
-		return tags->rqs[tag];
+		prefetch(tags->rqs[tag].rq);
+		return tags->rqs[tag].rq;
 	}
 
 	return NULL;
@@ -809,10 +812,11 @@ static bool blk_mq_rq_inflight(struct blk_mq_hw_ctx *hctx, struct request *rq,
 			       void *priv, bool reserved)
 {
 	/*
-	 * If we find a request that is inflight and the queue matches,
-	 * we know the queue is busy. Return false to stop the iteration.
+	 * We're only called here if the queue matches. If the rq state is
+	 * inflight, we know the queue is busy. Return false to stop the
+	 * iteration.
 	 */
-	if (rq->state == MQ_RQ_IN_FLIGHT && rq->q == hctx->queue) {
+	if (rq->state == MQ_RQ_IN_FLIGHT) {
 		bool *busy = priv;
 
 		*busy = true;
@@ -1049,11 +1053,14 @@ bool blk_mq_get_driver_tag(struct request *rq)
 	shared = blk_mq_tag_busy(data.hctx);
 	rq->tag = blk_mq_get_tag(&data);
 	if (rq->tag >= 0) {
+		struct blk_mq_hw_ctx *hctx = data.hctx;
+
 		if (shared) {
 			rq->rq_flags |= RQF_MQ_INFLIGHT;
 			atomic_inc(&data.hctx->nr_active);
 		}
-		data.hctx->tags->rqs[rq->tag] = rq;
+		hctx->tags->rqs[rq->tag].queue = hctx->queue;
+		hctx->tags->rqs[rq->tag].rq = rq;
 	}
 
 done:
@@ -2020,11 +2027,27 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 	return cookie;
 }
 
-void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
-		     unsigned int hctx_idx)
+static void blk_mq_rcu_free_pages(struct work_struct *work)
 {
+	struct blk_mq_tags *tags = container_of(to_rcu_work(work),
+						struct blk_mq_tags, rcu_work);
 	struct page *page;
 
+	while (!list_empty(&tags->page_list)) {
+		page = list_first_entry(&tags->page_list, struct page, lru);
+		list_del_init(&page->lru);
+		/*
+		 * Remove kmemleak object previously allocated in
+		 * blk_mq_init_rq_map().
+		 */
+		kmemleak_free(page_address(page));
+		__free_pages(page, page->private);
+	}
+}
+
+void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
+		     unsigned int hctx_idx)
+{
 	if (tags->rqs && set->ops->exit_request) {
 		int i;
 
@@ -2038,16 +2061,9 @@ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
 		}
 	}
 
-	while (!list_empty(&tags->page_list)) {
-		page = list_first_entry(&tags->page_list, struct page, lru);
-		list_del_init(&page->lru);
-		/*
-		 * Remove kmemleak object previously allocated in
-		 * blk_mq_init_rq_map().
-		 */
-		kmemleak_free(page_address(page));
-		__free_pages(page, page->private);
-	}
+	/* Punt to RCU free, so we don't race with tag iteration */
+	INIT_RCU_WORK(&tags->rcu_work, blk_mq_rcu_free_pages);
+	queue_rcu_work(system_wq, &tags->rcu_work);
 }
 
 void blk_mq_free_rq_map(struct blk_mq_tags *tags)
@@ -2077,7 +2093,7 @@ struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
 	if (!tags)
 		return NULL;
 
-	tags->rqs = kcalloc_node(nr_tags, sizeof(struct request *),
+	tags->rqs = kcalloc_node(nr_tags, sizeof(struct rq_tag_entry),
 				 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
 				 node);
 	if (!tags->rqs) {
diff --git a/block/blk.h b/block/blk.h
index 848278c52030..785207fc8a30 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -29,6 +29,8 @@ struct blk_flush_queue {
 	 */
 	struct request		*orig_rq;
 	spinlock_t		mq_flush_lock;
+
+	struct rcu_work		rcu_work;
 };
 
 extern struct kmem_cache *blk_requestq_cachep;

-- 
Jens Axboe