Queuing page faults and non-cooperative events on separate wait queues has
no real value; it only makes the code more complicated. Route both kinds of
messages through fault_pending_wqh and remove event_wqh.

Signed-off-by: Mike Rapoport <rppt@xxxxxxxxxxxxxxxxxx>
---
 fs/userfaultfd.c | 64 +++++++++++++++++++++-----------------------------------
 1 file changed, 24 insertions(+), 40 deletions(-)

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 1bd772a..8868229 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -48,8 +48,6 @@ struct userfaultfd_ctx {
 	wait_queue_head_t fault_wqh;
 	/* waitqueue head for the pseudo fd to wakeup poll/read */
 	wait_queue_head_t fd_wqh;
-	/* waitqueue head for events */
-	wait_queue_head_t event_wqh;
 	/* a refile sequence protected by fault_pending_wqh lock */
 	struct seqcount refile_seq;
 	/* pseudo fd refcounting */
@@ -101,6 +99,9 @@ struct userfaultfd_wake_key {
 static bool userfaultfd_should_wake(struct userfaultfd_wait_queue *uwq,
				    struct userfaultfd_wake_key *key)
 {
+	if (key->event != uwq->msg.event)
+		return false;
+
 	if (key->event == UFFD_EVENT_PAGEFAULT) {
		unsigned long start, len, address;

@@ -188,8 +189,6 @@ static void userfaultfd_ctx_put(struct userfaultfd_ctx *ctx)
	VM_BUG_ON(waitqueue_active(&ctx->fault_pending_wqh));
	VM_BUG_ON(spin_is_locked(&ctx->fault_wqh.lock));
	VM_BUG_ON(waitqueue_active(&ctx->fault_wqh));
-	VM_BUG_ON(spin_is_locked(&ctx->event_wqh.lock));
-	VM_BUG_ON(waitqueue_active(&ctx->event_wqh));
	VM_BUG_ON(spin_is_locked(&ctx->fd_wqh.lock));
	VM_BUG_ON(waitqueue_active(&ctx->fd_wqh));
	mmdrop(ctx->mm);
@@ -560,22 +559,21 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
	if (WARN_ON_ONCE(current->flags & PF_EXITING))
		goto out;

-	ewq->ctx = ctx;
-	init_waitqueue_entry(&ewq->wq, current);
+	userfaultfd_init_waitqueue(ctx, ewq);

-	spin_lock(&ctx->event_wqh.lock);
+	spin_lock(&ctx->fault_pending_wqh.lock);
	/*
	 * After the __add_wait_queue the uwq is visible to userland
	 * through poll/read().
	 */
-	__add_wait_queue(&ctx->event_wqh, &ewq->wq);
+	__add_wait_queue(&ctx->fault_pending_wqh, &ewq->wq);
	for (;;) {
		set_current_state(TASK_KILLABLE);
-		if (ewq->msg.event == 0)
+		if (READ_ONCE(ewq->waken))
			break;
		if (ACCESS_ONCE(ctx->released) ||
		    fatal_signal_pending(current)) {
-			__remove_wait_queue(&ctx->event_wqh, &ewq->wq);
+			__remove_wait_queue(&ctx->fault_pending_wqh, &ewq->wq);
			if (ewq->msg.event == UFFD_EVENT_FORK) {
				struct userfaultfd_ctx *new;
@@ -588,15 +586,15 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
			break;
		}

-		spin_unlock(&ctx->event_wqh.lock);
+		spin_unlock(&ctx->fault_pending_wqh.lock);

		wake_up_poll(&ctx->fd_wqh, POLLIN);
		schedule();

-		spin_lock(&ctx->event_wqh.lock);
+		spin_lock(&ctx->fault_pending_wqh.lock);
	}
	__set_current_state(TASK_RUNNING);
-	spin_unlock(&ctx->event_wqh.lock);
+	spin_unlock(&ctx->fault_pending_wqh.lock);

	/*
	 * ctx may go away after this if the userfault pseudo fd is
@@ -609,9 +607,10 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
 static void userfaultfd_event_complete(struct userfaultfd_ctx *ctx,
				       struct userfaultfd_wait_queue *ewq)
 {
-	ewq->msg.event = 0;
-	wake_up_locked(&ctx->event_wqh);
-	__remove_wait_queue(&ctx->event_wqh, &ewq->wq);
+	struct userfaultfd_wake_key key = { 0 };
+
+	key.event = ewq->msg.event;
+	__wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, &key);
 }

 int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
@@ -898,12 +897,6 @@ static inline struct userfaultfd_wait_queue *find_userfault(
	return find_userfault_in(&ctx->fault_pending_wqh);
 }

-static inline struct userfaultfd_wait_queue *find_userfault_evt(
-	struct userfaultfd_ctx *ctx)
-{
-	return find_userfault_in(&ctx->event_wqh);
-}
-
 static unsigned int userfaultfd_poll(struct file *file, poll_table *wait)
 {
	struct userfaultfd_ctx *ctx = file->private_data;
@@ -935,8 +928,6 @@ static unsigned int userfaultfd_poll(struct file *file, poll_table *wait)
		smp_mb();
		if (waitqueue_active(&ctx->fault_pending_wqh))
			ret = POLLIN;
-		else if (waitqueue_active(&ctx->event_wqh))
-			ret = POLLIN;
		return ret;
	default:
@@ -981,7 +972,7 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
	struct userfaultfd_wait_queue *uwq;
	/*
	 * Handling fork event requires sleeping operations, so
-	 * we drop the event_wqh lock, then do these ops, then
+	 * we drop the fault_pending_wqh lock, then do these ops, then
	 * lock it back and wake up the waiter. While the lock is
	 * dropped the ewq may go away so we keep track of it
	 * carefully.
@@ -996,7 +987,7 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
		set_current_state(TASK_INTERRUPTIBLE);
		spin_lock(&ctx->fault_pending_wqh.lock);
		uwq = find_userfault(ctx);
-		if (uwq) {
+		if (uwq && uwq->msg.event == UFFD_EVENT_PAGEFAULT) {
			/*
			 * Use a seqcount to repeat the lockless check
			 * in wake_userfault() to avoid missing
@@ -1037,12 +1028,7 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
			spin_unlock(&ctx->fault_pending_wqh.lock);
			ret = 0;
			break;
-		}
-		spin_unlock(&ctx->fault_pending_wqh.lock);
-
-		spin_lock(&ctx->event_wqh.lock);
-		uwq = find_userfault_evt(ctx);
-		if (uwq) {
+		} else if (uwq) {
			/* non-pagefault event */
			*msg = uwq->msg;
			if (uwq->msg.event == UFFD_EVENT_FORK) {
@@ -1050,17 +1036,16 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
					(unsigned long)
					uwq->msg.arg.reserved.reserved1;
				list_move(&uwq->wq.task_list, &fork_event);
-				spin_unlock(&ctx->event_wqh.lock);
+				spin_unlock(&ctx->fault_pending_wqh.lock);
				ret = 0;
				break;
			}

-			userfaultfd_event_complete(ctx, uwq);
-			spin_unlock(&ctx->event_wqh.lock);
+			spin_unlock(&ctx->fault_pending_wqh.lock);
			ret = 0;
			break;
		}
-		spin_unlock(&ctx->event_wqh.lock);
+		spin_unlock(&ctx->fault_pending_wqh.lock);

		if (signal_pending(current)) {
			ret = -ERESTARTSYS;
@@ -1082,16 +1067,16 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
		ret = resolve_userfault_fork(ctx, fork_nctx, msg);

		if (!ret) {
-			spin_lock(&ctx->event_wqh.lock);
+			spin_lock(&ctx->fault_pending_wqh.lock);
			if (!list_empty(&fork_event)) {
				uwq = list_first_entry(&fork_event,
						       typeof(*uwq),
						       wq.task_list);
				list_del(&uwq->wq.task_list);
-				__add_wait_queue(&ctx->event_wqh, &uwq->wq);
+				__add_wait_queue(&ctx->fault_pending_wqh, &uwq->wq);
				userfaultfd_event_complete(ctx, uwq);
			}
-			spin_unlock(&ctx->event_wqh.lock);
+			spin_unlock(&ctx->fault_pending_wqh.lock);
		}
	}
@@ -1808,7 +1793,6 @@ static void init_once_userfaultfd_ctx(void *mem)
	init_waitqueue_head(&ctx->fault_pending_wqh);
	init_waitqueue_head(&ctx->fault_wqh);
-	init_waitqueue_head(&ctx->event_wqh);
	init_waitqueue_head(&ctx->fd_wqh);
	seqcount_init(&ctx->refile_seq);
 }
-- 
2.7.4
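
P.S. The intent here is purely an internal simplification: both message
classes were already delivered to userland through the same poll()/read()
interface on the uffd, and that ABI does not change. A minimal reader
sketch illustrating the single-queue view from userspace (not part of the
patch; assumes an already created and registered uffd with
UFFD_FEATURE_EVENT_FORK negotiated, error handling elided, and drain_uffd()
is just an illustrative name):

#include <linux/userfaultfd.h>
#include <unistd.h>
#include <stdio.h>

static void drain_uffd(int uffd)
{
	struct uffd_msg msg;

	/* one read() loop is enough for faults and events alike */
	while (read(uffd, &msg, sizeof(msg)) == sizeof(msg)) {
		switch (msg.event) {
		case UFFD_EVENT_PAGEFAULT:
			/* resolve with UFFDIO_COPY or UFFDIO_ZEROPAGE */
			printf("fault at 0x%llx\n",
			       (unsigned long long)msg.arg.pagefault.address);
			break;
		case UFFD_EVENT_FORK:
			/* the child's new userfaultfd */
			printf("fork, child uffd %d\n", (int)msg.arg.fork.ufd);
			break;
		default:
			/* other non-cooperative events (remap/remove/...) */
			break;
		}
	}
}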