Patch "writeback, cgroup: fix null-ptr-deref write in bdi_split_work_to_wbs" has been added to the 5.15-stable tree

This is a note to let you know that I've just added the patch titled

    writeback, cgroup: fix null-ptr-deref write in bdi_split_work_to_wbs

to the 5.15-stable tree which can be found at:
    http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=summary

The filename of the patch is:
     writeback-cgroup-fix-null-ptr-deref-write-in-bdi_spl.patch
and it can be found in the queue-5.15 subdirectory.

If you, or anyone else, feels it should not be added to the stable tree,
please let <stable@xxxxxxxxxxxxxxx> know about it.



commit 5d42748b4de13444317b872e60bd60d4f29cec99
Author: Baokun Li <libaokun1@xxxxxxxxxx>
Date:   Mon Apr 10 21:08:26 2023 +0800

    writeback, cgroup: fix null-ptr-deref write in bdi_split_work_to_wbs
    
    [ Upstream commit 1ba1199ec5747f475538c0d25a32804e5ba1dfde ]
    
    KASAN reports a null-ptr-deref:
    ==================================================================
    BUG: KASAN: null-ptr-deref in bdi_split_work_to_wbs+0x5c5/0x7b0
    Write of size 8 at addr 0000000000000000 by task sync/943
    CPU: 5 PID: 943 Comm: sync Tainted: 6.3.0-rc5-next-20230406-dirty #461
    Call Trace:
     <TASK>
     dump_stack_lvl+0x7f/0xc0
     print_report+0x2ba/0x340
     kasan_report+0xc4/0x120
     kasan_check_range+0x1b7/0x2e0
     __kasan_check_write+0x24/0x40
     bdi_split_work_to_wbs+0x5c5/0x7b0
     sync_inodes_sb+0x195/0x630
     sync_inodes_one_sb+0x3a/0x50
     iterate_supers+0x106/0x1b0
     ksys_sync+0x98/0x160
    [...]
    ==================================================================
    
    The race that causes the above issue is as follows:
    
               cpu1                     cpu2
    -------------------------|-------------------------
    inode_switch_wbs
     INIT_WORK(&isw->work, inode_switch_wbs_work_fn)
     queue_rcu_work(isw_wq, &isw->work)
     // queue_work async
      inode_switch_wbs_work_fn
       wb_put_many(old_wb, nr_switched)
        percpu_ref_put_many
         ref->data->release(ref)
         cgwb_release
          queue_work(cgwb_release_wq, &wb->release_work)
          // queue_work async
           &wb->release_work
           cgwb_release_workfn
                                ksys_sync
                                 iterate_supers
                                  sync_inodes_one_sb
                                   sync_inodes_sb
                                    bdi_split_work_to_wbs
                                     kmalloc(sizeof(*work), GFP_ATOMIC)
                                     // alloc memory failed
            percpu_ref_exit
             ref->data = NULL
             kfree(data)
                                     wb_get(wb)
                                      percpu_ref_get(&wb->refcnt)
                                       percpu_ref_get_many(ref, 1)
                                        atomic_long_add(nr, &ref->data->count)
                                         atomic64_add(i, v)
                                         // trigger null-ptr-deref
    
    bdi_split_work_to_wbs() traverses &bdi->wb_list to split work across all
    wbs.  If allocating a new work item fails, the on-stack fallback is used
    and the reference count of the current wb is only taken afterwards.  If a
    cgroup writeback membership switch occurs before that reference is taken
    and the current wb is released as old_wb, the subsequent wb_get() or
    wb_put() triggers the null pointer dereference above.
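
    The following is a minimal userspace sketch of the failing ordering.  The
    types are simplified stand-ins for struct percpu_ref and struct
    percpu_ref_data, not the real kernel definitions; the comments map each
    step to the call chain in the diagram above.

    #include <stdlib.h>

    struct fake_ref_data { long count; };
    struct fake_ref { struct fake_ref_data *data; };

    /* cpu1, cgwb_release_workfn(): percpu_ref_exit() frees and clears ->data */
    static void fake_ref_exit(struct fake_ref *ref)
    {
            free(ref->data);
            ref->data = NULL;
    }

    /* cpu2, bdi_split_work_to_wbs(): wb_get() -> percpu_ref_get_many() */
    static void fake_ref_get(struct fake_ref *ref)
    {
            ref->data->count++;     /* write through NULL once exit has run */
    }

    int main(void)
    {
            struct fake_ref ref = { .data = calloc(1, sizeof(*ref.data)) };

            fake_ref_exit(&ref);    /* release side wins the race */
            fake_ref_get(&ref);     /* reader then writes to address 0 */
            return 0;
    }

    Running the sketch dereferences the NULL ->data pointer, modelling the
    same write to address 0 that the KASAN report above shows in
    bdi_split_work_to_wbs().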
    
    This issue was introduced in v4.3-rc7 (see fix tag1).  Both the
    sync_inodes_sb() and __writeback_inodes_sb_nr() paths into
    bdi_split_work_to_wbs() can trigger it.  For the sync_inodes_sb() path,
    commit 7fc5854f8c6e ("writeback: synchronize sync(2) against cgroup
    writeback membership switches") originally reduced the likelihood of the
    issue by adding wb_switch_rwsem, but v5.14-rc1 (see fix tag2) removed
    "inode_io_list_del_locked(inode, old_wb)" from
    inode_switch_wbs_work_fn(), so wb->state still contains WB_has_dirty_io
    and old_wb is no longer skipped when traversing wbs in
    bdi_split_work_to_wbs(), making the issue easily reproducible again.
    
    To solve this problem, percpu_ref_exit() is deferred to an RCU callback so
    that it cannot race with the RCU-protected traversal in
    bdi_split_work_to_wbs().  Moreover, wb_get() is replaced with wb_tryget()
    in bdi_split_work_to_wbs(), and the current wb is skipped if wb_tryget()
    fails because the wb has already been shut down.
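
    A second, self-contained sketch of the protocol after the fix, again with
    simplified stand-ins rather than the real percpu_ref API: the reader takes
    its reference with a tryget that may fail, and ->data is freed only after
    readers can no longer touch the wb (call_rcu() in the real patch; modelled
    here only by the ordering in main()).

    #include <stdbool.h>
    #include <stdlib.h>

    struct fake_ref_data { long count; bool dead; };
    struct fake_ref { struct fake_ref_data *data; };

    /* cgwb_release(): the refcount has already hit zero, the wb is dying */
    static void fake_ref_mark_dying(struct fake_ref *ref)
    {
            ref->data->dead = true;
    }

    /* wb_tryget(): refuses to take a reference on a dying wb */
    static bool fake_ref_tryget(struct fake_ref *ref)
    {
            if (ref->data->dead)
                    return false;
            ref->data->count++;
            return true;
    }

    /* cgwb_free_rcu(): ->data is freed only after the grace period */
    static void fake_ref_exit(struct fake_ref *ref)
    {
            free(ref->data);
            ref->data = NULL;
    }

    int main(void)
    {
            struct fake_ref ref = { .data = calloc(1, sizeof(*ref.data)) };
            bool got;

            fake_ref_mark_dying(&ref);      /* release side has started     */
            got = fake_ref_tryget(&ref);    /* fails, so the wb is skipped  */
            (void)got;
            fake_ref_exit(&ref);            /* freeing happens strictly later */
            return 0;
    }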
    
    Link: https://lkml.kernel.org/r/20230410130826.1492525-1-libaokun1@xxxxxxxxxx
    Fixes: b817525a4a80 ("writeback: bdi_writeback iteration must not skip dying ones")
    Signed-off-by: Baokun Li <libaokun1@xxxxxxxxxx>
    Reviewed-by: Jan Kara <jack@xxxxxxx>
    Acked-by: Tejun Heo <tj@xxxxxxxxxx>
    Cc: Alexander Viro <viro@xxxxxxxxxxxxxxxxxx>
    Cc: Andreas Dilger <adilger.kernel@xxxxxxxxx>
    Cc: Christian Brauner <brauner@xxxxxxxxxx>
    Cc: Dennis Zhou <dennis@xxxxxxxxxx>
    Cc: Hou Tao <houtao1@xxxxxxxxxx>
    Cc: yangerkun <yangerkun@xxxxxxxxxx>
    Cc: Zhang Yi <yi.zhang@xxxxxxxxxx>
    Cc: Jens Axboe <axboe@xxxxxxxxx>
    Cc: <stable@xxxxxxxxxxxxxxx>
    Signed-off-by: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
    Signed-off-by: Sasha Levin <sashal@xxxxxxxxxx>

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index f4a5a0c2858a1..fbc3f0ef38c02 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -1009,6 +1009,16 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
 			continue;
 		}
 
+		/*
+		 * If wb_tryget fails, the wb has been shutdown, skip it.
+		 *
+		 * Pin @wb so that it stays on @bdi->wb_list.  This allows
+		 * continuing iteration from @wb after dropping and
+		 * regrabbing rcu read lock.
+		 */
+		if (!wb_tryget(wb))
+			continue;
+
 		/* alloc failed, execute synchronously using on-stack fallback */
 		work = &fallback_work;
 		*work = *base_work;
@@ -1017,13 +1027,6 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
 		work->done = &fallback_work_done;
 
 		wb_queue_work(wb, work);
-
-		/*
-		 * Pin @wb so that it stays on @bdi->wb_list.  This allows
-		 * continuing iteration from @wb after dropping and
-		 * regrabbing rcu read lock.
-		 */
-		wb_get(wb);
 		last_wb = wb;
 
 		rcu_read_unlock();
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 142e118ade87a..afdd132768455 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -385,6 +385,15 @@ static LIST_HEAD(offline_cgwbs);
 static void cleanup_offline_cgwbs_workfn(struct work_struct *work);
 static DECLARE_WORK(cleanup_offline_cgwbs_work, cleanup_offline_cgwbs_workfn);
 
+static void cgwb_free_rcu(struct rcu_head *rcu_head)
+{
+	struct bdi_writeback *wb = container_of(rcu_head,
+			struct bdi_writeback, rcu);
+
+	percpu_ref_exit(&wb->refcnt);
+	kfree(wb);
+}
+
 static void cgwb_release_workfn(struct work_struct *work)
 {
 	struct bdi_writeback *wb = container_of(work, struct bdi_writeback,
@@ -407,10 +416,9 @@ static void cgwb_release_workfn(struct work_struct *work)
 	list_del(&wb->offline_node);
 	spin_unlock_irq(&cgwb_lock);
 
-	percpu_ref_exit(&wb->refcnt);
 	wb_exit(wb);
 	WARN_ON_ONCE(!list_empty(&wb->b_attached));
-	kfree_rcu(wb, rcu);
+	call_rcu(&wb->rcu, cgwb_free_rcu);
 }
 
 static void cgwb_release(struct percpu_ref *refcnt)


