On Wed, 2023-01-11 at 09:01 -0500, Jeff Layton wrote:
> On Wed, 2023-01-11 at 07:33 -0500, Jeff Layton wrote:
> > On Wed, 2023-01-11 at 13:15 +0100, Mike Galbraith wrote:
> > > On Wed, 2023-01-11 at 12:19 +0100, Mike Galbraith wrote:
> > > > On Wed, 2023-01-11 at 05:55 -0500, Jeff Layton wrote:
> > > > > 
> > > > > It might be interesting to turn up KASAN if you're able.
> > > > 
> > > > I can try that.
> > > 
> > > KASAN did not squeak.
> > > 
> > > > > If you still have this vmcore, it might be interesting to do the pointer
> > > > > math and find the nfsd_net structure that contains the above
> > > > > delayed_work. Does the rest of it also seem to be corrupt?
> > > 
> > > Virgin source with workqueue.c WARN_ON_ONCE() landmine.
> > 
> > Thanks. Mixed bag here...
> > 
> > > crash> nfsd_net -x 0xFFFF8881114E9800
> > > struct nfsd_net {
> > >   cld_net = 0x0,
> > >   svc_expkey_cache = 0xffff8881420f8a00,
> > >   svc_export_cache = 0xffff8881420f8800,
> > >   idtoname_cache = 0xffff8881420f9a00,
> > >   nametoid_cache = 0xffff8881420f9c00,
> > >   nfsd4_manager = {
> > >     list = {
> > >       next = 0x0,
> > >       prev = 0x0
> > >     },
> > >     block_opens = 0x0
> > >   },
> > >   grace_ended = 0x0,
> > >   boot_time = 0x0,
> > >   nfsd_client_dir = 0x0,
> > >   reclaim_str_hashtbl = 0x0,
> > >   reclaim_str_hashtbl_size = 0x0,
> > >   conf_id_hashtbl = 0x0,
> > >   conf_name_tree = {
> > >     rb_node = 0x0
> > >   },
> > >   unconf_id_hashtbl = 0x0,
> > >   unconf_name_tree = {
> > >     rb_node = 0x0
> > >   },
> > >   sessionid_hashtbl = 0x0,
> > >   client_lru = {
> > >     next = 0x0,
> > >     prev = 0x0
> > >   },
> > >   close_lru = {
> > >     next = 0x0,
> > >     prev = 0x0
> > >   },
> > >   del_recall_lru = {
> > >     next = 0x0,
> > >     prev = 0x0
> > >   },
> > >   blocked_locks_lru = {
> > >     next = 0x0,
> > >     prev = 0x0
> > >   },
> > 
> > All of the above list_heads are zeroed out and they shouldn't be.
> > 
> > >   laundromat_work = {
> > >     work = {
> > >       data = {
> > >         counter = 0x0
> > >       },
> > >       entry = {
> > >         next = 0x0,
> > >         prev = 0x0
> > >       },
> > >       func = 0x0
> > >     },
> > >     timer = {
> > >       entry = {
> > >         next = 0x0,
> > >         pprev = 0x0
> > >       },
> > >       expires = 0x0,
> > >       function = 0x0,
> > >       flags = 0x0
> > >     },
> > >     wq = 0x0,
> > >     cpu = 0x0
> > >   },
> > >   client_lock = {
> > >     {
> > >       rlock = {
> > >         raw_lock = {
> > >           {
> > >             val = {
> > >               counter = 0x0
> > >             },
> > >             {
> > >               locked = 0x0,
> > >               pending = 0x0
> > >             },
> > >             {
> > >               locked_pending = 0x0,
> > >               tail = 0x0
> > >             }
> > >           }
> > >         }
> > >       }
> > >     }
> > >   },
> > >   blocked_locks_lock = {
> > >     {
> > >       rlock = {
> > >         raw_lock = {
> > >           {
> > >             val = {
> > >               counter = 0x0
> > >             },
> > >             {
> > >               locked = 0x0,
> > >               pending = 0x0
> > >             },
> > >             {
> > >               locked_pending = 0x0,
> > >               tail = 0x0
> > >             }
> > >           }
> > >         }
> > >       }
> > >     }
> > >   },
> > >   rec_file = 0x0,
> > >   in_grace = 0x0,
> > >   client_tracking_ops = 0x0,
> > >   nfsd4_lease = 0x5a,
> > >   nfsd4_grace = 0x5a,
> > 
> > The grace and lease times look ok, oddly enough.
> > 
> > >   somebody_reclaimed = 0x0,
> > >   track_reclaim_completes = 0x0,
> > >   nr_reclaim_complete = {
> > >     counter = 0x0
> > >   },
> > >   nfsd_net_up = 0x0,
> > 
> > nfsd_net_up is false, which means that this server isn't running (or
> > that the memory here was scribbled over).
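
Side note, in case the "pointer math" suggestion above isn't obvious to
everyone following along: it's just an open-coded container_of() on the
embedded delayed_work. A toy demonstration of the arithmetic, with
stand-in struct layouts rather than the real ones from fs/nfsd/netns.h
and include/linux/workqueue.h:

    #include <stddef.h>
    #include <stdio.h>

    /* Stand-in layouts, only here to make the example self-contained. */
    struct delayed_work { int dummy; };
    struct nfsd_net {
            long some_other_fields[32];
            struct delayed_work laundromat_work;
    };

    /* The same subtraction the kernel's container_of() does. */
    #define container_of(ptr, type, member) \
            ((type *)((char *)(ptr) - offsetof(type, member)))

    int main(void)
    {
            struct nfsd_net nn;
            struct delayed_work *dwork = &nn.laundromat_work;
            struct nfsd_net *recovered =
                    container_of(dwork, struct nfsd_net, laundromat_work);

            printf("recovered == &nn? %s\n", recovered == &nn ? "yes" : "no");
            return 0;
    }

In crash itself, "struct nfsd_net -o" prints the member offsets, and
subtracting the laundromat_work offset from the delayed_work address is
presumably how the 0xFFFF8881114E9800 above was found.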
> > >   lockd_up = 0x0,
> > >   writeverf_lock = {
> > >     seqcount = {
> > >       seqcount = {
> > >         sequence = 0x0
> > >       }
> > >     },
> > >     lock = {
> > >       {
> > >         rlock = {
> > >           raw_lock = {
> > >             {
> > >               val = {
> > >                 counter = 0x0
> > >               },
> > >               {
> > >                 locked = 0x0,
> > >                 pending = 0x0
> > >               },
> > >               {
> > >                 locked_pending = 0x0,
> > >                 tail = 0x0
> > >               }
> > >             }
> > >           }
> > >         }
> > >       }
> > >     }
> > >   },
> > >   writeverf = "\000\000\000\000\000\000\000",
> > >   max_connections = 0x0,
> > >   clientid_base = 0x37b4ca7b,
> > >   clientid_counter = 0x37b4ca7d,
> > >   clverifier_counter = 0xa8ee910d,
> > >   nfsd_serv = 0x0,
> > >   keep_active = 0x0,
> > >   s2s_cp_cl_id = 0x37b4ca7c,
> > >   s2s_cp_stateids = {
> > >     idr_rt = {
> > >       xa_lock = {
> > >         {
> > >           rlock = {
> > >             raw_lock = {
> > >               {
> > >                 val = {
> > >                   counter = 0x0
> > >                 },
> > >                 {
> > >                   locked = 0x0,
> > >                   pending = 0x0
> > >                 },
> > >                 {
> > >                   locked_pending = 0x0,
> > >                   tail = 0x0
> > >                 }
> > >               }
> > >             }
> > >           }
> > >         }
> > >       },
> > >       xa_flags = 0x0,
> > >       xa_head = 0x0
> > >     },
> > >     idr_base = 0x0,
> > >     idr_next = 0x0
> > >   },
> > >   s2s_cp_lock = {
> > >     {
> > >       rlock = {
> > >         raw_lock = {
> > >           {
> > >             val = {
> > >               counter = 0x0
> > >             },
> > >             {
> > >               locked = 0x0,
> > >               pending = 0x0
> > >             },
> > >             {
> > >               locked_pending = 0x0,
> > >               tail = 0x0
> > >             }
> > >           }
> > >         }
> > >       }
> > >     }
> > >   },
> > >   nfsd_versions = 0x0,
> > >   nfsd4_minorversions = 0x0,
> > >   drc_hashtbl = 0xffff88810a2f0000,
> > >   max_drc_entries = 0x14740,
> > >   maskbits = 0xb,
> > >   drc_hashsize = 0x800,
> > >   num_drc_entries = {
> > >     counter = 0x0
> > >   },
> > >   counter = {{
> > >     lock = {
> > >       raw_lock = {
> > >         {
> > >           val = {
> > >             counter = 0x0
> > >           },
> > >           {
> > >             locked = 0x0,
> > >             pending = 0x0
> > >           },
> > >           {
> > >             locked_pending = 0x0,
> > >             tail = 0x0
> > >           }
> > >         }
> > >       }
> > >     },
> > >     count = 0x0,
> > >     list = {
> > >       next = 0xffff888103f98dd0,
> > >       prev = 0xffff8881114e9a18
> > >     },
> > >     counters = 0x607dc8402e10
> > >   }, {
> > >     lock = {
> > >       raw_lock = {
> > >         {
> > >           val = {
> > >             counter = 0x0
> > >           },
> > >           {
> > >             locked = 0x0,
> > >             pending = 0x0
> > >           },
> > >           {
> > >             locked_pending = 0x0,
> > >             tail = 0x0
> > >           }
> > >         }
> > >       }
> > >     },
> > >     count = 0x0,
> > >     list = {
> > >       next = 0xffff8881114e99f0,
> > >       prev = 0xffff88810b5743e0
> > >     },
> > >     counters = 0x607dc8402e14
> > >   }},
> > >   longest_chain = 0x0,
> > >   longest_chain_cachesize = 0x0,
> > >   nfsd_reply_cache_shrinker = {
> > >     count_objects = 0xffffffffa0e4e9b0 <nfsd_reply_cache_count>,
> > >     scan_objects = 0xffffffffa0e4f020 <nfsd_reply_cache_scan>,
> > 
> > Shrinker pointers look ok, as does its list_head.
> 
> I think I might see what's going on here:
> 
> This struct is consistent with an nfsd_net structure that was allocated
> and had its ->init routine run on it, but that has not had nfsd started
> in its namespace.
> 
> pernet structs are kzalloc'ed. The shrinker registrations and the
> lease/grace times are set just after allocation in the ->init routine.
> The rest of the fields are not set until nfsd is started (specifically,
> when /proc/fs/nfsd/threads is written to).
> 
> My guess is that Mike is doing some activity that creates new net
> namespaces, and then once we start getting into OOM conditions, the
> shrinker ends up hitting uninitialized fields in the nfsd_net and
> kaboom.
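
To make that concrete, the pattern being described has roughly this
shape. This is a paraphrase for illustration, not the exact upstream
code, and the register_shrinker() signature varies across kernel
versions:

    static __net_init int nfsd_init_net(struct net *net)
    {
            /* pernet allocations are zeroed (kzalloc'ed) */
            struct nfsd_net *nn = net_generic(net, nfsd_net_id);

            /* Only a handful of fields get real values here... */
            nn->nfsd4_lease = 90;
            nn->nfsd4_grace = 90;
            register_shrinker(&nn->nfsd_reply_cache_shrinker);

            /* ...and the shrinker is now reachable from global reclaim
             * in this namespace, even though client_lru, laundromat_work
             * and friends stay all-zeroes until nfsd is started by a
             * write to /proc/fs/nfsd/threads. */
            return 0;
    }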
> 
> I haven't yet looked to see when this bug was introduced, but the right
> fix is probably to ensure that everything important for this job is
> initialized after the pernet ->init operation runs.

It might even be best to just move the shrinker registrations to occur
when nfsd is started rather than doing it at nfsd_net initialization
time. Obviously, if nfsd isn't running, then it'll have nothing to free,
and it's a waste of time to register the shrinker at all.
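
Something like the following is what I have in mind. This is only a
sketch of the idea (untested, function shapes from memory, error
handling elided), not a patch:

    /* Register the shrinker only once the server actually starts... */
    static int nfsd_startup_net(struct net *net, const struct cred *cred)
    {
            struct nfsd_net *nn = net_generic(net, nfsd_net_id);
            int ret;

            /* ...after client_lru, laundromat_work, etc. have been set
             * up, so reclaim can never walk a half-built nfsd_net. */
            ret = register_shrinker(&nn->nfsd_reply_cache_shrinker);
            if (ret)
                    return ret;

            nn->nfsd_net_up = true;
            return 0;
    }

    /* ...and drop it as soon as the server shuts down. */
    static void nfsd_shutdown_net(struct net *net)
    {
            struct nfsd_net *nn = net_generic(net, nfsd_net_id);

            nn->nfsd_net_up = false;
            unregister_shrinker(&nn->nfsd_reply_cache_shrinker);
    }

-- 
Jeff Layton <jlayton@xxxxxxxxxx>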