On Fri, Jan 21, 2022 at 12:47:58PM +0100, Peter Zijlstra wrote: > On Thu, Jan 20, 2022 at 04:55:22PM +0100, Peter Zijlstra wrote: > > > +SYSCALL_DEFINE2(umcg_wait, u32, flags, u64, timo) > > +{ > > + struct task_struct *tsk = current; > > + struct umcg_task __user *self = READ_ONCE(tsk->umcg_task); > > + bool worker = tsk->flags & PF_UMCG_WORKER; > > + int ret; > > + > > + if (!self || flags) > > + return -EINVAL; > > + > > + if (worker) { > > + tsk->flags &= ~PF_UMCG_WORKER; > > + if (timo) > > + return -ERANGE; > > + } > > + > > + /* see umcg_sys_{enter,exit}() syscall exceptions */ > > + ret = umcg_pin_pages(); > > + if (ret) > > + goto unblock; > > + > > + /* > > + * Clear UMCG_TF_COND_WAIT *and* check state == RUNNABLE. > > + */ > > + ret = umcg_update_state(tsk, self, UMCG_TASK_RUNNABLE, UMCG_TASK_RUNNABLE); > > + if (ret) > > + goto unpin; > > + > > + ret = umcg_wake_next(tsk, self); > > + if (ret) > > + goto unpin; > > + > > + if (worker) { > > + /* > > + * If this fails it is possible ::next_tid is already running > > + * while this task is not going to block. This violates our > > + * constraints. > > + * > > + * That said, pretty much the only way to make this fail is by > > + * force munmap()'ing things. In which case one is most welcome > > + * to the pieces. > > + */ > > + ret = umcg_enqueue_and_wake(tsk); > > + if (ret) > > + goto unpin; > > + } > > + > > + umcg_unpin_pages(); > > + > > + ret = umcg_wait(timo); > > + switch (ret) { > > + case 0: /* all done */ > > + case -EINTR: /* umcg_notify_resume() will continue the wait */ > > So I was playing with the whole worker timeout thing last night and > realized this is broken. If we get a signal while we have a timeout, the > timeout gets lost. > > I think the easiest solution is to have umcg_notify_resume() also resume > the timeout, but the first pass of that was yuck, so I need to try > again. > > Related, by moving the whole enqueue-and-wake thing into the timeout, we > get more 'fun' failure cases :-( This is the best I can come up with,... but it's a hot mess :-( Still, let me go try this. --- --- a/include/uapi/linux/umcg.h +++ b/include/uapi/linux/umcg.h @@ -127,6 +127,14 @@ struct umcg_task { } __attribute__((packed, aligned(UMCG_TASK_ALIGN))); /** + * enum umcg_wait_flag - flags to pass to sys_umcg_wait + * @UMCG_WAIT_ENQUEUE: Enqueue the task on runnable_workers_ptr before waiting + */ +enum umcg_wait_flag { + UMCG_WAIT_ENQUEUE = 0x0001, +}; + +/** * enum umcg_ctl_flag - flags to pass to sys_umcg_ctl * @UMCG_CTL_REGISTER: register the current task as a UMCG task * @UMCG_CTL_UNREGISTER: unregister the current task as a UMCG task --- a/kernel/sched/umcg.c +++ b/kernel/sched/umcg.c @@ -227,7 +227,6 @@ static int umcg_update_state(struct task #define UMCG_DIE(reason) __UMCG_DIE(,reason) #define UMCG_DIE_PF(reason) __UMCG_DIE(pagefault_enable(), reason) -#define UMCG_DIE_UNPIN(reason) __UMCG_DIE(umcg_unpin_pages(), reason) /* Called from syscall enter path and exceptions that can schedule */ void umcg_sys_enter(struct pt_regs *regs, long syscall) @@ -371,15 +370,23 @@ static int umcg_enqueue_runnable(struct static int umcg_enqueue_and_wake(struct task_struct *tsk) { - int ret; - - ret = umcg_enqueue_runnable(tsk); + int ret = umcg_enqueue_runnable(tsk); if (!ret) ret = umcg_wake_server(tsk); return ret; } +static int umcg_pin_enqueue_and_wake(struct task_struct *tsk) +{ + int ret = umcg_pin_pages(); + if (!ret) { + ret = umcg_enqueue_and_wake(tsk); + umcg_unpin_pages(); + } + return ret; +} + /* * umcg_wait: Wait for ->state to become RUNNING * @@ -469,16 +476,11 @@ static void umcg_unblock_and_wait(void) /* avoid recursion vs schedule() */ tsk->flags &= ~PF_UMCG_WORKER; - if (umcg_pin_pages()) - UMCG_DIE("pin"); - if (umcg_update_state(tsk, self, UMCG_TASK_BLOCKED, UMCG_TASK_RUNNABLE)) - UMCG_DIE_UNPIN("state"); + UMCG_DIE("state"); - if (umcg_enqueue_and_wake(tsk)) - UMCG_DIE_UNPIN("enqueue-wake"); - - umcg_unpin_pages(); + if (umcg_pin_enqueue_and_wake(tsk)) + UMCG_DIE("pin-enqueue-wake"); switch (umcg_wait(0)) { case 0: @@ -544,18 +546,13 @@ void umcg_notify_resume(struct pt_regs * goto done; if (state & UMCG_TF_PREEMPT) { - if (umcg_pin_pages()) - UMCG_DIE("pin"); - if (umcg_update_state(tsk, self, UMCG_TASK_RUNNING, UMCG_TASK_RUNNABLE)) - UMCG_DIE_UNPIN("state"); + UMCG_DIE("state"); - if (umcg_enqueue_and_wake(tsk)) - UMCG_DIE_UNPIN("enqueue-wake"); - - umcg_unpin_pages(); + if (umcg_pin_enqueue_and_wake(tsk)) + UMCG_DIE("pin-enqueue-wake"); } if (WARN_ON_ONCE(timeout && syscall_get_nr(tsk, regs) != __NR_umcg_wait)) @@ -570,6 +567,13 @@ void umcg_notify_resume(struct pt_regs * case -ETIMEDOUT: regs_set_return_value(regs, ret); + if (worker) { + ret = umcg_pin_enqueue_and_wake(tsk); + if (ret) { + umcg_update_state(tsk, self, UMCG_TASK_RUNNABLE, UMCG_TASK_RUNNING); + regs_set_return_value(regs, ret); + } + } break; default: @@ -710,7 +714,6 @@ static int umcg_wake_next(struct task_st * Returns: * 0 - OK; * -ETIMEDOUT - the timeout expired; - * -ERANGE - the timeout is out of range (worker); * -EAGAIN - ::state wasn't RUNNABLE, concurrent wakeup; * -EFAULT - failed accessing struct umcg_task __user of the current * task, the server or next; @@ -725,48 +728,40 @@ SYSCALL_DEFINE2(umcg_wait, u32, flags, u bool worker = tsk->flags & PF_UMCG_WORKER; int ret; - if (!self || flags) + if (!self || (flags & ~(UMCG_WAIT_ENQUEUE))) return -EINVAL; - if (worker) { - tsk->flags &= ~PF_UMCG_WORKER; - if (timo) - return -ERANGE; - } + if ((flags & UMCG_WAIT_ENQUEUE) && (timo || !worker)) + return -EINVAL; - /* see umcg_sys_{enter,exit}() syscall exceptions */ - ret = umcg_pin_pages(); - if (ret) - goto unblock; + if (worker) + tsk->flags &= ~PF_UMCG_WORKER; /* * Clear UMCG_TF_COND_WAIT *and* check state == RUNNABLE. */ ret = umcg_update_state(tsk, self, UMCG_TASK_RUNNABLE, UMCG_TASK_RUNNABLE); if (ret) - goto unpin; + goto unblock; ret = umcg_wake_next(tsk, self); if (ret) - goto unpin; + goto unblock; - if (worker) { + if (flags & UMCG_WAIT_ENQUEUE) { /* * If this fails it is possible ::next_tid is already running * while this task is not going to block. This violates our * constraints. * - * That said, pretty much the only way to make this fail is by - * force munmap()'ing things. In which case one is most welcome - * to the pieces. + * Userspace can detect this case by looking at: ::next_tid & + * TID_RUNNING. */ - ret = umcg_enqueue_and_wake(tsk); + ret = umcg_pin_enqueue_and_wake(tsk); if (ret) - goto unpin; + goto unblock; } - umcg_unpin_pages(); - ret = umcg_wait(timo); switch (ret) { case 0: /* all done */ @@ -775,6 +770,26 @@ SYSCALL_DEFINE2(umcg_wait, u32, flags, u ret = 0; break; + case -ETIMEDOUT: + if (worker) { + /* + * See the UMCG_WAIT_ENQUEUE case above; except this is + * even more complicated because we *did* wait and + * things might have progressed lots. + * + * Still, abort the wait because otherwise nobody would + * ever find us again. Hopefully userspace can make + * sense of things. + */ + ret = umcg_pin_enqueue_and_wake(tsk); + if (ret) + goto unblock; + + ret = -ETIMEDOUT; + break; + } + goto unblock; + default: goto unblock; } @@ -783,8 +798,6 @@ SYSCALL_DEFINE2(umcg_wait, u32, flags, u tsk->flags |= PF_UMCG_WORKER; return ret; -unpin: - umcg_unpin_pages(); unblock: umcg_update_state(tsk, self, UMCG_TASK_RUNNABLE, UMCG_TASK_RUNNING); goto out;