On 11/06, Thomas Gleixner wrote:
>
> But even if we adapt that patch to the current code it won't solve the
> -ESRCH issue I described above.

Hmm. I must have missed something. Why? Please see the unfinished/untested
patch below.

I think that (with or without this fix) the handle_exit_race() logic needs
cleanups: there is no reason for get_futex_value_locked(), and we can drop
->pi_lock right after we see PF_EXITPIDONE. Let's discuss this later.

But why can't we rely on handle_exit_race() without the PF_EXITING check?

Yes, exit_robust_list() is called before exit_pi_state_list(), which (with
this patch) sets PF_EXITPIDONE. But isn't this fine? handle_futex_death()
doesn't wake up PI futexes; exit_pi_state_list() does that.

Could you explain?

Oleg.

diff --git a/kernel/exit.c b/kernel/exit.c
index a46a50d..a6c788c 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -761,17 +761,6 @@ void __noreturn do_exit(long code)
 	}
 
 	exit_signals(tsk);	/* sets PF_EXITING */
-	/*
-	 * Ensure that all new tsk->pi_lock acquisitions must observe
-	 * PF_EXITING. Serializes against futex.c:attach_to_pi_owner().
-	 */
-	smp_mb();
-	/*
-	 * Ensure that we must observe the pi_state in exit_mm() ->
-	 * mm_release() -> exit_pi_state_list().
-	 */
-	raw_spin_lock_irq(&tsk->pi_lock);
-	raw_spin_unlock_irq(&tsk->pi_lock);
 
 	if (unlikely(in_atomic())) {
 		pr_info("note: %s[%d] exited with preempt_count %d\n",
@@ -846,12 +835,6 @@ void __noreturn do_exit(long code)
 	 * Make sure we are holding no locks:
 	 */
 	debug_check_no_locks_held();
-	/*
-	 * We can do this unlocked here. The futex code uses this flag
-	 * just to verify whether the pi state cleanup has been done
-	 * or not. In the worst case it loops once more.
-	 */
-	tsk->flags |= PF_EXITPIDONE;
 
 	if (tsk->io_context)
 		exit_io_context(tsk);
diff --git a/kernel/fork.c b/kernel/fork.c
index bcdf531..0d8edcf 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1297,8 +1297,7 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
 		tsk->compat_robust_list = NULL;
 	}
 #endif
-	if (unlikely(!list_empty(&tsk->pi_state_list)))
-		exit_pi_state_list(tsk);
+	exit_pi_state_list(tsk);
 #endif
 
 	uprobe_free_utask(tsk);
diff --git a/kernel/futex.c b/kernel/futex.c
index bd18f60..c3b5d33 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -905,6 +905,7 @@ void exit_pi_state_list(struct task_struct *curr)
 	 * versus waiters unqueueing themselves:
 	 */
 	raw_spin_lock_irq(&curr->pi_lock);
+	curr->flags |= PF_EXITPIDONE;
 	while (!list_empty(head)) {
 		next = head->next;
 		pi_state = list_entry(next, struct futex_pi_state, list);
@@ -1169,18 +1170,11 @@ static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
 	return ret;
 }
 
-static int handle_exit_race(u32 __user *uaddr, u32 uval,
-			    struct task_struct *tsk)
+static int handle_exit_race(u32 __user *uaddr, u32 uval)
 {
 	u32 uval2;
 
 	/*
-	 * If PF_EXITPIDONE is not yet set, then try again.
-	 */
-	if (tsk && !(tsk->flags & PF_EXITPIDONE))
-		return -EAGAIN;
-
-	/*
 	 * Reread the user space value to handle the following situation:
 	 *
 	 * CPU0				CPU1
@@ -1245,7 +1239,7 @@ static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key,
 		return -EAGAIN;
 	p = find_get_task_by_vpid(pid);
 	if (!p)
-		return handle_exit_race(uaddr, uval, NULL);
+		return handle_exit_race(uaddr, uval);
 
 	if (unlikely(p->flags & PF_KTHREAD)) {
 		put_task_struct(p);
@@ -1259,13 +1253,13 @@ static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key,
 	 * p->pi_lock:
 	 */
 	raw_spin_lock_irq(&p->pi_lock);
-	if (unlikely(p->flags & PF_EXITING)) {
+	if (unlikely(p->flags & PF_EXITPIDONE)) {
		/*
		 * The task is on the way out. When PF_EXITPIDONE is
		 * set, we know that the task has finished the
		 * cleanup:
		 */
-		int ret = handle_exit_race(uaddr, uval, p);
+		int ret = handle_exit_race(uaddr, uval);
 
 		raw_spin_unlock_irq(&p->pi_lock);
 		put_task_struct(p);
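
To illustrate the ordering the patch relies on, here is a minimal userspace
sketch (plain pthreads, not kernel code; model_task, model_exit_pi_state_list()
and model_attach_to_pi_owner() are made-up names). The exiting side sets the
"exit done" flag and drains its pi_state list under the same lock that stands
in for ->pi_lock, so an attacher that takes that lock and observes the flag
also observes the cleanup already done and falls back to rereading the futex
word:

/*
 * Userspace model of the pi_lock ordering, for discussion only.
 * Build with: gcc -pthread model.c
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct model_task {
	pthread_mutex_t pi_lock;	/* stands in for ->pi_lock */
	bool exit_pi_done;		/* stands in for PF_EXITPIDONE */
	int pi_state_count;		/* stands in for ->pi_state_list */
};

/* Exiting task: mark completion and drain the list under pi_lock. */
static void model_exit_pi_state_list(struct model_task *t)
{
	pthread_mutex_lock(&t->pi_lock);
	t->exit_pi_done = true;		/* curr->flags |= PF_EXITPIDONE */
	while (t->pi_state_count > 0)	/* hand off each pi_state */
		t->pi_state_count--;
	pthread_mutex_unlock(&t->pi_lock);
}

/* Attacher: under pi_lock, the flag implies the cleanup already ran. */
static int model_attach_to_pi_owner(struct model_task *owner)
{
	int ret = 0;

	pthread_mutex_lock(&owner->pi_lock);
	if (owner->exit_pi_done) {
		/* The owner finished its exit_pi_state_list() equivalent;
		 * the caller must re-read the futex word (handle_exit_race())
		 * instead of attaching to a dead owner. */
		ret = -1;
	}
	pthread_mutex_unlock(&owner->pi_lock);
	return ret;
}

static void *exit_thread(void *arg)
{
	model_exit_pi_state_list(arg);
	return NULL;
}

int main(void)
{
	struct model_task owner = {
		.pi_lock = PTHREAD_MUTEX_INITIALIZER,
		.exit_pi_done = false,
		.pi_state_count = 3,
	};
	pthread_t tid;

	pthread_create(&tid, NULL, exit_thread, &owner);
	pthread_join(tid, NULL);

	/* Once the flag is visible under pi_lock, the list is empty. */
	printf("attach -> %d, remaining pi_state: %d\n",
	       model_attach_to_pi_owner(&owner), owner.pi_state_count);
	return 0;
}

The point is only that the flag and the list manipulation are covered by the
same lock, which is what lets attach_to_pi_owner() trust PF_EXITPIDONE without
the smp_mb()/lock-unlock dance removed from do_exit() above.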