On Thu, 27 Feb 2025, zhangjian wrote: > When nfsd inits failed for detecting cld version in > nfsd4_client_tracking_init, kernel may assume nfsdcld support version 1 > message format and try to upcall with v1 message size to nfsdcld. > There exists one error case in the following process, causeing nfsd > hunging for nfsdcld replay: > > kernel write to pipe->msgs (v1 msg length) > |--------- first msg --------|-------- second message -------| > > nfsdcld read from pipe->msgs (v2 msg length) > |------------ first msg --------------|---second message-----| > | valid message | ignore | wrong message | > > When two nfsd kernel thread add two upcall messages to cld pipe with v1 > version cld_msg (size == 1034) concurrently,but nfsdcld reads with v2 > version size(size == 1067), 33 bytes of the second message will be read > and merged with first message. The 33 bytes in second message will be > ignored. Nfsdcld will then read 1001 bytes in second message, which cause > FATAL in cld_messaged_size checking. Nfsd kernel thread will hang for > it forever until nfs server restarts. > > Signed-off-by: zhangjian <zhangjian496@xxxxxxxxxx> > Reviewed-by: Scott Mayhew <smayhew@xxxxxxxxxx> > --- > utils/nfsdcld/nfsdcld.c | 65 ++++++++++++++++++++++++++++------------- > 1 file changed, 45 insertions(+), 20 deletions(-) > > diff --git a/utils/nfsdcld/nfsdcld.c b/utils/nfsdcld/nfsdcld.c > index dbc7a57..005d1ea 100644 > --- a/utils/nfsdcld/nfsdcld.c > +++ b/utils/nfsdcld/nfsdcld.c > @@ -716,35 +716,60 @@ reply: > } > } > > -static void > -cldcb(int UNUSED(fd), short which, void *data) > +static int > +cld_pipe_read_msg(struct cld_client *clnt) > { > - ssize_t len; > - struct cld_client *clnt = data; > -#if UPCALL_VERSION >= 2 > - struct cld_msg_v2 *cmsg = &clnt->cl_u.cl_msg_v2; > -#else > - struct cld_msg *cmsg = &clnt->cl_u.cl_msg; > -#endif > + ssize_t len, left_len; > + ssize_t hdr_len = sizeof(struct cld_msg_hdr); > + struct cld_msg_hdr *hdr = (struct cld_msg_hdr *)&clnt->cl_u; > > - if (which != EV_READ) > - goto out; > + len = atomicio(read, clnt->cl_fd, hdr, hdr_len); > > - len = atomicio(read, clnt->cl_fd, cmsg, sizeof(*cmsg)); > if (len <= 0) { > xlog(L_ERROR, "%s: pipe read failed: %m", __func__); > - cld_pipe_open(clnt); > - goto out; > + goto fail_read; > } We probably also want to fail if len != hdr_len. > > - if (cmsg->cm_vers > UPCALL_VERSION) { > + switch (hdr->cm_vers) { > + case 1: > + left_len = sizeof(struct cld_msg) - hdr_len; > + break; > + case 2: > + left_len = sizeof(struct cld_msg_v2) - hdr_len; > + break; > + default: > xlog(L_ERROR, "%s: unsupported upcall version: %hu", > - __func__, cmsg->cm_vers); > - cld_pipe_open(clnt); > - goto out; > + __func__, hdr->cm_vers); > + goto fail_read; > } > > - switch(cmsg->cm_cmd) { > + len = atomicio(read, clnt->cl_fd, hdr, left_len); This is reading into the beginning of the message and overwriting the header. In the original version of this patch you had hdr + 1 as the to read the data into. -Scott > + > + if (len <= 0) { > + xlog(L_ERROR, "%s: pipe read failed: %m", __func__); > + goto fail_read; > + } > + > + return 0; > + > +fail_read: > + cld_pipe_open(clnt); > + return -1; > +} > + > +static void > +cldcb(int UNUSED(fd), short which, void *data) > +{ > + struct cld_client *clnt = data; > + struct cld_msg_hdr *hdr = (struct cld_msg_hdr *)&clnt->cl_u; > + > + if (which != EV_READ) > + goto out; > + > + if (cld_pipe_read_msg(clnt) < 0) > + goto out; > + > + switch (hdr->cm_cmd) { > case Cld_Create: > cld_create(clnt); > break; > @@ -765,7 +790,7 @@ cldcb(int UNUSED(fd), short which, void *data) > break; > default: > xlog(L_WARNING, "%s: command %u is not yet implemented", > - __func__, cmsg->cm_cmd); > + __func__, hdr->cm_cmd); > cld_not_implemented(clnt); > } > out: > -- > 2.33.0 >