Re: [RFC][PATCH v2 5/5] sched: User Mode Concurency Groups

Peter Zijlstra <peterz@xxxxxxxxxxxxx> · Fri, 21 Jan 2022 16:18:45 +0100

On Fri, Jan 21, 2022 at 12:47:58PM +0100, Peter Zijlstra wrote:
> On Thu, Jan 20, 2022 at 04:55:22PM +0100, Peter Zijlstra wrote:
> 
> > +SYSCALL_DEFINE2(umcg_wait, u32, flags, u64, timo)
> > +{
> > +	struct task_struct *tsk = current;
> > +	struct umcg_task __user *self = READ_ONCE(tsk->umcg_task);
> > +	bool worker = tsk->flags & PF_UMCG_WORKER;
> > +	int ret;
> > +
> > +	if (!self || flags)
> > +		return -EINVAL;
> > +
> > +	if (worker) {
> > +		tsk->flags &= ~PF_UMCG_WORKER;
> > +		if (timo)
> > +			return -ERANGE;
> > +	}
> > +
> > +	/* see umcg_sys_{enter,exit}() syscall exceptions */
> > +	ret = umcg_pin_pages();
> > +	if (ret)
> > +		goto unblock;
> > +
> > +	/*
> > +	 * Clear UMCG_TF_COND_WAIT *and* check state == RUNNABLE.
> > +	 */
> > +	ret = umcg_update_state(tsk, self, UMCG_TASK_RUNNABLE, UMCG_TASK_RUNNABLE);
> > +	if (ret)
> > +		goto unpin;
> > +
> > +	ret = umcg_wake_next(tsk, self);
> > +	if (ret)
> > +		goto unpin;
> > +
> > +	if (worker) {
> > +		/*
> > +		 * If this fails it is possible ::next_tid is already running
> > +		 * while this task is not going to block. This violates our
> > +		 * constraints.
> > +		 *
> > +		 * That said, pretty much the only way to make this fail is by
> > +		 * force munmap()'ing things. In which case one is most welcome
> > +		 * to the pieces.
> > +		 */
> > +		ret = umcg_enqueue_and_wake(tsk);
> > +		if (ret)
> > +			goto unpin;
> > +	}
> > +
> > +	umcg_unpin_pages();
> > +
> > +	ret = umcg_wait(timo);
> > +	switch (ret) {
> > +	case 0:		/* all done */
> > +	case -EINTR:	/* umcg_notify_resume() will continue the wait */
> 
> So I was playing with the whole worker timeout thing last night and
> realized this is broken. If we get a signal while we have a timeout, the
> timeout gets lost.
> 
> I think the easiest solution is to have umcg_notify_resume() also resume
> the timeout, but the first pass of that was yuck, so I need to try
> again.

Something like this, still yuck though. Also still need to write me a
test for this.

--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1300,12 +1300,14 @@ struct task_struct {
 	clockid_t		umcg_clock;
 	struct umcg_task __user	*umcg_task;
 
-	/* setup by umcg_pin_enter() */
+	/* setup by umcg_pin_pages() */
 	struct page		*umcg_page;
 
 	struct task_struct	*umcg_server;
 	struct umcg_task __user *umcg_server_task;
 	struct page		*umcg_server_page;
+
+	u64			umcg_timeout;
 #endif
 
 	struct tlbflush_unmap_batch	tlb_ubc;
--- a/kernel/sched/umcg.c
+++ b/kernel/sched/umcg.c
@@ -232,6 +232,8 @@ static int umcg_update_state(struct task
 /* Called from syscall enter path and exceptions that can schedule */
 void umcg_sys_enter(struct pt_regs *regs, long syscall)
 {
+	current->umcg_timeout = 0;
+
 	/* avoid recursion vs our own syscalls */
 	if (syscall == __NR_umcg_wait ||
 	    syscall == __NR_umcg_ctl)
@@ -519,6 +521,7 @@ void umcg_notify_resume(struct pt_regs *
 	struct umcg_task __user *self = tsk->umcg_task;
 	bool worker = tsk->flags & PF_UMCG_WORKER;
 	u32 state;
+	int ret;
 
 	/* avoid recursion vs schedule() */
 	if (worker)
@@ -554,12 +557,17 @@ void umcg_notify_resume(struct pt_regs *
 		umcg_unpin_pages();
 	}
 
-	switch (umcg_wait(0)) {
+	ret = umcg_wait(tsk->umcg_timeout);
+	switch (ret) {
 	case 0:
 	case -EINTR:
 		/* we will resume the wait after the signal */
 		break;
 
+	case -ETIMEDOUT:
+		regs_set_return_value(regs, ret);
+		break;
+
 	default:
 		UMCG_DIE("wait");
 	}
@@ -759,6 +767,7 @@ SYSCALL_DEFINE2(umcg_wait, u32, flags, u
 	switch (ret) {
 	case 0:		/* all done */
 	case -EINTR:	/* umcg_notify_resume() will continue the wait */
+		tsk->umcg_timeout = timo;
 		ret = 0;
 		break;