It's really that you say. I will fix it at once. On 2025/2/28 5:25, Scott Mayhew wrote: > On Thu, 27 Feb 2025, zhangjian wrote: > >> When nfsd inits failed for detecting cld version in >> nfsd4_client_tracking_init, kernel may assume nfsdcld support version 1 >> message format and try to upcall with v1 message size to nfsdcld. >> There exists one error case in the following process, causeing nfsd >> hunging for nfsdcld replay: >> >> kernel write to pipe->msgs (v1 msg length) >> |--------- first msg --------|-------- second message -------| >> >> nfsdcld read from pipe->msgs (v2 msg length) >> |------------ first msg --------------|---second message-----| >> | valid message | ignore | wrong message | >> >> When two nfsd kernel thread add two upcall messages to cld pipe with v1 >> version cld_msg (size == 1034) concurrently,but nfsdcld reads with v2 >> version size(size == 1067), 33 bytes of the second message will be read >> and merged with first message. The 33 bytes in second message will be >> ignored. Nfsdcld will then read 1001 bytes in second message, which cause >> FATAL in cld_messaged_size checking. Nfsd kernel thread will hang for >> it forever until nfs server restarts. >> >> Signed-off-by: zhangjian <zhangjian496@xxxxxxxxxx> >> Reviewed-by: Scott Mayhew <smayhew@xxxxxxxxxx> >> --- >> utils/nfsdcld/nfsdcld.c | 65 ++++++++++++++++++++++++++++------------- >> 1 file changed, 45 insertions(+), 20 deletions(-) >> >> diff --git a/utils/nfsdcld/nfsdcld.c b/utils/nfsdcld/nfsdcld.c >> index dbc7a57..005d1ea 100644 >> --- a/utils/nfsdcld/nfsdcld.c >> +++ b/utils/nfsdcld/nfsdcld.c >> @@ -716,35 +716,60 @@ reply: >> } >> } >> >> -static void >> -cldcb(int UNUSED(fd), short which, void *data) >> +static int >> +cld_pipe_read_msg(struct cld_client *clnt) >> { >> - ssize_t len; >> - struct cld_client *clnt = data; >> -#if UPCALL_VERSION >= 2 >> - struct cld_msg_v2 *cmsg = &clnt->cl_u.cl_msg_v2; >> -#else >> - struct cld_msg *cmsg = &clnt->cl_u.cl_msg; >> -#endif >> + ssize_t len, left_len; >> + ssize_t hdr_len = sizeof(struct cld_msg_hdr); >> + struct cld_msg_hdr *hdr = (struct cld_msg_hdr *)&clnt->cl_u; >> >> - if (which != EV_READ) >> - goto out; >> + len = atomicio(read, clnt->cl_fd, hdr, hdr_len); >> >> - len = atomicio(read, clnt->cl_fd, cmsg, sizeof(*cmsg)); >> if (len <= 0) { >> xlog(L_ERROR, "%s: pipe read failed: %m", __func__); >> - cld_pipe_open(clnt); >> - goto out; >> + goto fail_read; >> } > > We probably also want to fail if len != hdr_len. >> >> - if (cmsg->cm_vers > UPCALL_VERSION) { >> + switch (hdr->cm_vers) { >> + case 1: >> + left_len = sizeof(struct cld_msg) - hdr_len; >> + break; >> + case 2: >> + left_len = sizeof(struct cld_msg_v2) - hdr_len; >> + break; >> + default: >> xlog(L_ERROR, "%s: unsupported upcall version: %hu", >> - __func__, cmsg->cm_vers); >> - cld_pipe_open(clnt); >> - goto out; >> + __func__, hdr->cm_vers); >> + goto fail_read; >> } >> >> - switch(cmsg->cm_cmd) { >> + len = atomicio(read, clnt->cl_fd, hdr, left_len); > > This is reading into the beginning of the message and overwriting the > header. In the original version of this patch you had hdr + 1 as the > to read the data into. > > -Scott > >> + >> + if (len <= 0) { >> + xlog(L_ERROR, "%s: pipe read failed: %m", __func__); >> + goto fail_read; >> + } >> + >> + return 0; >> + >> +fail_read: >> + cld_pipe_open(clnt); >> + return -1; >> +} >> + >> +static void >> +cldcb(int UNUSED(fd), short which, void *data) >> +{ >> + struct cld_client *clnt = data; >> + struct cld_msg_hdr *hdr = (struct cld_msg_hdr *)&clnt->cl_u; >> + >> + if (which != EV_READ) >> + goto out; >> + >> + if (cld_pipe_read_msg(clnt) < 0) >> + goto out; >> + >> + switch (hdr->cm_cmd) { >> case Cld_Create: >> cld_create(clnt); >> break; >> @@ -765,7 +790,7 @@ cldcb(int UNUSED(fd), short which, void *data) >> break; >> default: >> xlog(L_WARNING, "%s: command %u is not yet implemented", >> - __func__, cmsg->cm_cmd); >> + __func__, hdr->cm_cmd); >> cld_not_implemented(clnt); >> } >> out: >> -- >> 2.33.0 >> > >