Re: [PATCH] nfsd: put dl_stid if fail to queue dl_recall

Li Lingfeng <lilingfeng3@xxxxxxxxxx> · Thu, 13 Feb 2025 20:28:02 +0800

在 2025/2/13 19:49, Jeff Layton 写道:
On Thu, 2025-02-13 at 15:25 +0800, Li Lingfeng wrote:
Before calling nfsd4_run_cb to queue dl_recall to the callback_wq, we
increment the reference count of dl_stid.
We expect that after the corresponding work_struct is processed, the
reference count of dl_stid will be decremented through the callback
function nfsd4_cb_recall_release.
However, if the call to nfsd4_run_cb fails, the incremented reference
count of dl_stid will not be decremented correspondingly, leading to the
following nfs4_stid leak:
unreferenced object 0xffff88812067b578 (size 344):
   comm "nfsd", pid 2761, jiffies 4295044002 (age 5541.241s)
   hex dump (first 32 bytes):
     01 00 00 00 6b 6b 6b 6b b8 02 c0 e2 81 88 ff ff  ....kkkk........
     00 6b 6b 6b 6b 6b 6b 6b 00 00 00 00 ad 4e ad de  .kkkkkkk.....N..
   backtrace:
     kmem_cache_alloc+0x4b9/0x700
     nfsd4_process_open1+0x34/0x300
     nfsd4_open+0x2d1/0x9d0
     nfsd4_proc_compound+0x7a2/0xe30
     nfsd_dispatch+0x241/0x3e0
     svc_process_common+0x5d3/0xcc0
     svc_process+0x2a3/0x320
     nfsd+0x180/0x2e0
     kthread+0x199/0x1d0
     ret_from_fork+0x30/0x50
     ret_from_fork_asm+0x1b/0x30
unreferenced object 0xffff8881499f4d28 (size 368):
   comm "nfsd", pid 2761, jiffies 4295044005 (age 5541.239s)
   hex dump (first 32 bytes):
     01 00 00 00 00 00 00 00 30 4d 9f 49 81 88 ff ff  ........0M.I....
     30 4d 9f 49 81 88 ff ff 20 00 00 00 01 00 00 00  0M.I.... .......
   backtrace:
     kmem_cache_alloc+0x4b9/0x700
     nfs4_alloc_stid+0x29/0x210
     alloc_init_deleg+0x92/0x2e0
     nfs4_set_delegation+0x284/0xc00
     nfs4_open_delegation+0x216/0x3f0
     nfsd4_process_open2+0x2b3/0xee0
     nfsd4_open+0x770/0x9d0
     nfsd4_proc_compound+0x7a2/0xe30
     nfsd_dispatch+0x241/0x3e0
     svc_process_common+0x5d3/0xcc0
     svc_process+0x2a3/0x320
     nfsd+0x180/0x2e0
     kthread+0x199/0x1d0
     ret_from_fork+0x30/0x50
     ret_from_fork_asm+0x1b/0x30
Fix it by checking the result of nfsd4_run_cb and call nfs4_put_stid if
fail to queue dl_recall.

Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Signed-off-by: Li Lingfeng <lilingfeng3@xxxxxxxxxx>
---
  fs/nfsd/nfs4state.c | 6 +++++-
  1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 153eeea2c7c9..0ccb87be47b7 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -5414,6 +5414,7 @@ static const struct nfsd4_callback_ops nfsd4_cb_recall_ops = {
  
  static void nfsd_break_one_deleg(struct nfs4_delegation *dp)
  {
+	bool queued;
  	/*
  	 * We're assuming the state code never drops its reference
  	 * without first removing the lease.  Since we're in this lease
@@ -5422,7 +5423,10 @@ static void nfsd_break_one_deleg(struct nfs4_delegation *dp)
  	 * we know it's safe to take a reference.
  	 */
  	refcount_inc(&dp->dl_stid.sc_count);
-	WARN_ON_ONCE(!nfsd4_run_cb(&dp->dl_recall));
+	queued = nfsd4_run_cb(&dp->dl_recall);
+	WARN_ON_ONCE(!queued);
+	if (!queued)
+		nfs4_put_stid(&dp->dl_stid);
  }
  
  /* Called from break_lease() with flc_lock held. */

Have you actually seen the WARN_ON_ONCE() pop under normal usage, or
was the problem you reproduced done via fault injection?

Hi,

I add mdelay for cb_work before processing it and add msleep in 
nfs_do_return_delegation to reproduce it.

--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1188,6 +1188,8 @@ alloc_init_deleg(struct nfs4_client *clp, struct 
nfs4_file *fp,
     dp->dl_recalled = false;
     nfsd4_init_cb(&dp->dl_recall, dp->dl_stid.sc_client,
               &nfsd4_cb_recall_ops, NFSPROC4_CLNT_CB_RECALL);
+    dp->dl_recall.cb_work.debug_flag = 0x1234;
+    printk("%s init work func %px\n", __func__, 
dp->dl_recall.cb_work.func);
     get_nfs4_file(fp);
     dp->dl_stid.sc_file = fp;
     return dp;

--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2708,6 +2708,12 @@ static void process_scheduled_works(struct worker 
*worker)
             worker->pool->watchdog_ts = jiffies;
             first = false;
         }
+        if (work->debug_flag == 0x1234) {
+            printk("%s func %px\n", __func__, work->func);
+            printk("%s delay before clear pending...\n", __func__);
+            mdelay(10 * 1000);
+            printk("%s delay done\n", __func__);
+        }
         process_one_work(worker, work);
     }
 }

--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -263,6 +263,15 @@ static int nfs_do_return_delegation(struct inode 
*inode, struct nfs_delegation *
     const struct cred *cred;
     int res = 0;

+    printk("sleep before deleg return...\n");
+    while (1) {
+        ifdebug(PROC)
+            msleep(10 * 1000);
+        else
+            break;
+    }
+    printk("sleep done\n");
+
     if (!test_bit(NFS_DELEGATION_REVOKED, &delegation->flags)) {
         spin_lock(&delegation->lock);
         cred = get_cred(delegation->cred);

Unfortunately, this won't work. nfsd_break_one_deleg() is called from
the ->break_lease callback with the flc->flc_lock held, and
nfs4_put_stid can sleep in the sc_free callbacks.

Are you referring to nfs4_free_deleg? I didn't quite see where it might 
sleep. Could you please explain it to me?

Thanks.