On 11/06, Thomas Gleixner wrote:
>
> But even if we adapt that patch to the current code it won't solve the
> -ESRCH issue I described above.

Hmm. I must have missed something. Why? Please see the unfinished/untested
patch below.

I think that (with or without this fix) the handle_exit_race() logic needs
cleanups: there is no reason for get_futex_value_locked(), and we can drop
->pi_lock right after we see PF_EXITPIDONE. Let's discuss this later.

But why can't we rely on handle_exit_race() without the PF_EXITING check?

Yes, exit_robust_list() is called before exit_pi_state_list(), which (with
this patch) sets PF_EXITPIDONE. But isn't this fine? handle_futex_death()
doesn't wake up PI futexes; exit_pi_state_list() does that.

Could you explain?

Oleg.

diff --git a/kernel/exit.c b/kernel/exit.c
index a46a50d..a6c788c 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -761,17 +761,6 @@ void __noreturn do_exit(long code)
 	}
 
 	exit_signals(tsk);	/* sets PF_EXITING */
-	/*
-	 * Ensure that all new tsk->pi_lock acquisitions must observe
-	 * PF_EXITING. Serializes against futex.c:attach_to_pi_owner().
-	 */
-	smp_mb();
-	/*
-	 * Ensure that we must observe the pi_state in exit_mm() ->
-	 * mm_release() -> exit_pi_state_list().
-	 */
-	raw_spin_lock_irq(&tsk->pi_lock);
-	raw_spin_unlock_irq(&tsk->pi_lock);
 
 	if (unlikely(in_atomic())) {
 		pr_info("note: %s[%d] exited with preempt_count %d\n",
@@ -846,12 +835,6 @@ void __noreturn do_exit(long code)
 	 * Make sure we are holding no locks:
 	 */
 	debug_check_no_locks_held();
-	/*
-	 * We can do this unlocked here. The futex code uses this flag
-	 * just to verify whether the pi state cleanup has been done
-	 * or not. In the worst case it loops once more.
-	 */
-	tsk->flags |= PF_EXITPIDONE;
 
 	if (tsk->io_context)
 		exit_io_context(tsk);
diff --git a/kernel/fork.c b/kernel/fork.c
index bcdf531..0d8edcf 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1297,8 +1297,7 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
 		tsk->compat_robust_list = NULL;
 	}
 #endif
-	if (unlikely(!list_empty(&tsk->pi_state_list)))
-		exit_pi_state_list(tsk);
+	exit_pi_state_list(tsk);
 #endif
 
 	uprobe_free_utask(tsk);
diff --git a/kernel/futex.c b/kernel/futex.c
index bd18f60..c3b5d33 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -905,6 +905,7 @@ void exit_pi_state_list(struct task_struct *curr)
 	 * versus waiters unqueueing themselves:
 	 */
 	raw_spin_lock_irq(&curr->pi_lock);
+	curr->flags |= PF_EXITPIDONE;
 	while (!list_empty(head)) {
 		next = head->next;
 		pi_state = list_entry(next, struct futex_pi_state, list);
@@ -1169,18 +1170,11 @@ static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
 	return ret;
 }
 
-static int handle_exit_race(u32 __user *uaddr, u32 uval,
-			    struct task_struct *tsk)
+static int handle_exit_race(u32 __user *uaddr, u32 uval)
 {
 	u32 uval2;
 
 	/*
-	 * If PF_EXITPIDONE is not yet set, then try again.
-	 */
-	if (tsk && !(tsk->flags & PF_EXITPIDONE))
-		return -EAGAIN;
-
-	/*
 	 * Reread the user space value to handle the following situation:
 	 *
 	 * CPU0				CPU1
@@ -1245,7 +1239,7 @@ static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key,
 		return -EAGAIN;
 	p = find_get_task_by_vpid(pid);
 	if (!p)
-		return handle_exit_race(uaddr, uval, NULL);
+		return handle_exit_race(uaddr, uval);
 
 	if (unlikely(p->flags & PF_KTHREAD)) {
 		put_task_struct(p);
@@ -1259,13 +1253,13 @@ static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key,
 	 * p->pi_lock:
 	 */
 	raw_spin_lock_irq(&p->pi_lock);
-	if (unlikely(p->flags & PF_EXITING)) {
+	if (unlikely(p->flags & PF_EXITPIDONE)) {
		/*
		 * The task is on the way out. When PF_EXITPIDONE is
		 * set, we know that the task has finished the
		 * cleanup:
		 */
-		int ret = handle_exit_race(uaddr, uval, p);
+		int ret = handle_exit_race(uaddr, uval);
 
 		raw_spin_unlock_irq(&p->pi_lock);
 		put_task_struct(p);
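
To illustrate the ordering the patch relies on, here is a minimal userspace
sketch (plain pthreads, not kernel code; model_task, model_exit_pi_state_list()
and model_attach_to_pi_owner() are made-up names). The exiting side sets the
"exit done" flag and drains its pi_state list under the same lock that stands
in for ->pi_lock, so an attacher that takes that lock and observes the flag
also observes the cleanup already done and falls back to rereading the futex
word:

/*
 * Userspace model of the pi_lock ordering, for discussion only.
 * Build with: gcc -pthread model.c
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct model_task {
	pthread_mutex_t pi_lock;	/* stands in for ->pi_lock */
	bool exit_pi_done;		/* stands in for PF_EXITPIDONE */
	int pi_state_count;		/* stands in for ->pi_state_list */
};

/* Exiting task: mark completion and drain the list under pi_lock. */
static void model_exit_pi_state_list(struct model_task *t)
{
	pthread_mutex_lock(&t->pi_lock);
	t->exit_pi_done = true;		/* curr->flags |= PF_EXITPIDONE */
	while (t->pi_state_count > 0)	/* hand off each pi_state */
		t->pi_state_count--;
	pthread_mutex_unlock(&t->pi_lock);
}

/* Attacher: under pi_lock, the flag implies the cleanup already ran. */
static int model_attach_to_pi_owner(struct model_task *owner)
{
	int ret = 0;

	pthread_mutex_lock(&owner->pi_lock);
	if (owner->exit_pi_done) {
		/* The owner finished its exit_pi_state_list() equivalent;
		 * the caller must re-read the futex word (handle_exit_race())
		 * instead of attaching to a dead owner. */
		ret = -1;
	}
	pthread_mutex_unlock(&owner->pi_lock);
	return ret;
}

static void *exit_thread(void *arg)
{
	model_exit_pi_state_list(arg);
	return NULL;
}

int main(void)
{
	struct model_task owner = {
		.pi_lock = PTHREAD_MUTEX_INITIALIZER,
		.exit_pi_done = false,
		.pi_state_count = 3,
	};
	pthread_t tid;

	pthread_create(&tid, NULL, exit_thread, &owner);
	pthread_join(tid, NULL);

	/* Once the flag is visible under pi_lock, the list is empty. */
	printf("attach -> %d, remaining pi_state: %d\n",
	       model_attach_to_pi_owner(&owner), owner.pi_state_count);
	return 0;
}

The point is only that the flag and the list manipulation are covered by the
same lock, which is what lets attach_to_pi_owner() trust PF_EXITPIDONE without
the smp_mb()/lock-unlock dance removed from do_exit() above.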