Re: Thread safe client destruction

Attila Kovacs <attila.kovacs@xxxxxxxxxxxxxxx> · Thu, 21 Jul 2022 18:00:16 -0400

Hi Steve,

Yes, the two patches affect the same code area, so if they are both 
relative to master (which is how I submitted them originally) then they 
conflict.

Here is the second patch as a clean incremental on the first.

-- A.

-----------------------------------------------------------------------

diff --git a/src/clnt_dg.c b/src/clnt_dg.c
index 7c5d22e..24c825b 100644
--- a/src/clnt_dg.c
+++ b/src/clnt_dg.c
@@ -101,6 +101,7 @@ extern mutex_t clnt_fd_lock;
 #define	release_fd_lock(fd_lock, mask) {	\
 	mutex_lock(&clnt_fd_lock);	\
 	fd_lock->active = FALSE;	\
+	fd_lock->pending--;		\
 	thr_sigsetmask(SIG_SETMASK, &(mask), NULL); \
 	cond_signal(&fd_lock->cv);	\
 	mutex_unlock(&clnt_fd_lock);    \
@@ -311,6 +312,7 @@ clnt_dg_call(cl, proc, xargs, argsp, xresults, 
resultsp, utimeout)
 	sigfillset(&newmask);
 	thr_sigsetmask(SIG_SETMASK, &newmask, &mask);
 	mutex_lock(&clnt_fd_lock);
+	cu->cu_fd_lock->pending++;
 	while (cu->cu_fd_lock->active)
 		cond_wait(&cu->cu_fd_lock->cv, &clnt_fd_lock);
 	cu->cu_fd_lock->active = TRUE;
@@ -571,11 +573,12 @@ clnt_dg_freeres(cl, xdr_res, res_ptr)
 	sigfillset(&newmask);
 	thr_sigsetmask(SIG_SETMASK, &newmask, &mask);
 	mutex_lock(&clnt_fd_lock);
+	cu->cu_fd_lock->pending++;
 	while (cu->cu_fd_lock->active)
 		cond_wait(&cu->cu_fd_lock->cv, &clnt_fd_lock);
-	cu->cu_fd_lock->active = TRUE;
 	xdrs->x_op = XDR_FREE;
 	dummy = (*xdr_res)(xdrs, res_ptr);
+	cu->cu_fd_lock->pending--;
 	thr_sigsetmask(SIG_SETMASK, &mask, NULL);
 	cond_signal(&cu->cu_fd_lock->cv);
 	mutex_unlock(&clnt_fd_lock);
@@ -603,6 +606,7 @@ clnt_dg_control(cl, request, info)
 	sigfillset(&newmask);
 	thr_sigsetmask(SIG_SETMASK, &newmask, &mask);
 	mutex_lock(&clnt_fd_lock);
+	cu->cu_fd_lock->pending++;
 	while (cu->cu_fd_lock->active)
 		cond_wait(&cu->cu_fd_lock->cv, &clnt_fd_lock);
 	cu->cu_fd_lock->active = TRUE;
@@ -743,7 +747,7 @@ clnt_dg_destroy(cl)
 	sigfillset(&newmask);
 	thr_sigsetmask(SIG_SETMASK, &newmask, &mask);
 	mutex_lock(&clnt_fd_lock);
-	while (cu_fd_lock->active)
+	while (cu_fd_lock->active || cu_fd_lock->pending > 0)
 		cond_wait(&cu_fd_lock->cv, &clnt_fd_lock);
 	if (cu->cu_closeit)
 		(void)close(cu_fd);
diff --git a/src/clnt_fd_locks.h b/src/clnt_fd_locks.h
index 359f995..6ba62cb 100644
--- a/src/clnt_fd_locks.h
+++ b/src/clnt_fd_locks.h
@@ -50,6 +50,7 @@ static unsigned int fd_locks_prealloc = 0;
 /* per-fd lock */
 struct fd_lock_t {
 	bool_t active;
+	int pending;        /* Number of pending operations on fd */
 	cond_t cv;
 };
 typedef struct fd_lock_t fd_lock_t;
@@ -180,6 +181,7 @@ fd_lock_t* fd_lock_create(int fd, fd_locks_t 
*fd_locks) {
 		item->fd = fd;
 		item->refs = 1;
 		item->fd_lock.active = FALSE;
+		item->fd_lock.pending = 0;
 		cond_init(&item->fd_lock.cv, 0, (void *) 0);
 		TAILQ_INSERT_HEAD(list, item, link);
 	} else {
diff --git a/src/clnt_vc.c b/src/clnt_vc.c
index 3c73e65..7610b4a 100644
--- a/src/clnt_vc.c
+++ b/src/clnt_vc.c
@@ -153,6 +153,7 @@ extern mutex_t  clnt_fd_lock;
 #define release_fd_lock(fd_lock, mask) {	\
 	mutex_lock(&clnt_fd_lock);	\
 	fd_lock->active = FALSE;	\
+	fd_lock->pending--;		\
 	thr_sigsetmask(SIG_SETMASK, &(mask), (sigset_t *) NULL);	\
 	cond_signal(&fd_lock->cv);	\
 	mutex_unlock(&clnt_fd_lock);    \
@@ -357,6 +358,7 @@ clnt_vc_call(cl, proc, xdr_args, args_ptr, 
xdr_results, results_ptr, timeout)
 	sigfillset(&newmask);
 	thr_sigsetmask(SIG_SETMASK, &newmask, &mask);
 	mutex_lock(&clnt_fd_lock);
+	ct->ct_fd_lock->pending++;
 	while (ct->ct_fd_lock->active)
 		cond_wait(&ct->ct_fd_lock->cv, &clnt_fd_lock);
 	ct->ct_fd_lock->active = TRUE;
@@ -495,10 +497,12 @@ clnt_vc_freeres(cl, xdr_res, res_ptr)
 	sigfillset(&newmask);
 	thr_sigsetmask(SIG_SETMASK, &newmask, &mask);
 	mutex_lock(&clnt_fd_lock);
+	ct->ct_fd_lock->pending++;
 	while (ct->ct_fd_lock->active)
 		cond_wait(&ct->ct_fd_lock->cv, &clnt_fd_lock);
 	xdrs->x_op = XDR_FREE;
 	dummy = (*xdr_res)(xdrs, res_ptr);
+	ct->ct_fd_lock->pending--;
 	thr_sigsetmask(SIG_SETMASK, &(mask), NULL);
 	cond_signal(&ct->ct_fd_lock->cv);
 	mutex_unlock(&clnt_fd_lock);
@@ -533,6 +537,7 @@ clnt_vc_control(cl, request, info)
 	sigfillset(&newmask);
 	thr_sigsetmask(SIG_SETMASK, &newmask, &mask);
 	mutex_lock(&clnt_fd_lock);
+	ct->ct_fd_lock->pending++;
 	while (ct->ct_fd_lock->active)
 		cond_wait(&ct->ct_fd_lock->cv, &clnt_fd_lock);
 	ct->ct_fd_lock->active = TRUE;
@@ -655,7 +660,7 @@ clnt_vc_destroy(cl)
 	sigfillset(&newmask);
 	thr_sigsetmask(SIG_SETMASK, &newmask, &mask);
 	mutex_lock(&clnt_fd_lock);
-	while (ct_fd_lock->active)
+	while (ct_fd_lock->active || ct_fd_lock->pending > 0)
 		cond_wait(&ct_fd_lock->cv, &clnt_fd_lock);
 	if (ct->ct_closeit && ct->ct_fd != -1) {
 		(void)close(ct->ct_fd);

-------------------------------------------------------------



On 7/21/22 17:38, Steve Dickson wrote:
Hello,

On 7/21/22 2:41 PM, Attila Kovacs wrote:
Hi again,


I found yet more potential MT flaws in clnt_dg.c and clnt_vg.c.

1. In clnt_dg.c in clnt_dg_freeres(), cu_fd_lock->active is set to 
TRUE, with no corresponding clearing when the operation (*xdr_res() 
call) is completed. This would leave other waiting operations blocked 
indefinitely, effectively deadlocked. For comparison, 
clnt_vd_freeres() in clnt_vc.c does not set the active state to TRUE. 
I believe the vc behavior is correct, while the dg behavior is a bug.

2. If clnt_dg_destroy() or clnt_vc_destroy() is awoken with other 
blocked operations pending (such as clnt_*_call(), clnt_*_control(), 
or clnt_*_freeres()) but no active operation currently being executed, 
then the client gets destroyed. Then, as the other blocked operations 
get subsequently awoken, they will try operate on an invalid client 
handle, potentially causing unpredictable behavior and stack corruption.

The proposed fix is to introduce a simple mutexed counting variable 
into the client lock structure, which keeps track of the number of 
pending blocking operations on the client. This way, clnt_*_destroy() 
can check if there are any operations pending on a client, and keep 
waiting until all pending operations are completed, before the client 
is destroyed safely and its resources are freed.

Attached is a patch with the above fixes.
There is a problem here... This patch does contain the first patch
you posted. So when I apply this patch it fails.

So Please apply your first patch, then apply the fixes from
your second patch... test... then when all is good...
Resent the second patch which will apply, cleanly,
with the first patch.

steved.


-- A.