The 3.19 merge window brought in a great new warning to catch someone
calling might_sleep with their state != TASK_RUNNING.  The idea was to
find buggy code locking mutexes after calling prepare_to_wait(), kind of
like this:

[  445.464634] WARNING: CPU: 22 PID: 4367 at kernel/sched/core.c:7303 __might_sleep+0x9a/0xb0()
[  445.481699] do not call blocking ops when !TASK_RUNNING; state=1 set at [<ffffffff81093f68>] prepare_to_wait_event+0x68/0x120
[  445.504494] Modules linked in: btrfs raid6_pq zlib_deflate lzo_compress xor xfs exportfs libcrc32c nfsv4 k10temp coretemp hwmon loop tcp_diag inet_diag ip6table_filter ip6_tables xt_NFLOG nfnetlink_log nfnetlink xt_comment xt_statistic iptable_filter ip_tables x_tables mptctl netconsole autofs4 nfsv3 nfs lockd grace rpcsec_gss_krb5 auth_rpcgss oid_registry sunrpc ipv6 ext3 jbd dm_mod iTCO_wdt iTCO_vendor_support ipmi_si ipmi_msghandler rtc_cmos pcspkr i2c_i801 lpc_ich mfd_core shpchp ehci_pci ehci_hcd mlx4_en ptp pps_core mlx4_core ses enclosure sg button megaraid_sas
[  445.612013] CPU: 22 PID: 4367 Comm: mysqld Tainted: G W 3.18.0-mason+ #21
[  445.627862] Hardware name: ZTSYSTEMS Echo Ridge T4 /A9DRPF-10D, BIOS 1.07 05/10/2012
[  445.643727]  0000000000001c87 ffff880e18ae7c58 ffffffff81658fc9 0000000000001c87
[  445.659194]  ffff880e18ae7ca8 ffff880e18ae7c98 ffffffff81054b25 0000000000000000
[  445.674695]  0000000000000000 000000000000026d ffffffff81a13c0b 0000000000000000
[  445.690091] Call Trace:
[  445.695129]  [<ffffffff81658fc9>] dump_stack+0x4f/0x6e
[  445.705545]  [<ffffffff81054b25>] warn_slowpath_common+0x95/0xe0
[  445.717699]  [<ffffffff81054c26>] warn_slowpath_fmt+0x46/0x50
[  445.729341]  [<ffffffff81093f68>] ? prepare_to_wait_event+0x68/0x120
[  445.742191]  [<ffffffff81093f68>] ? prepare_to_wait_event+0x68/0x120
[  445.755037]  [<ffffffff8107b03a>] __might_sleep+0x9a/0xb0
[  445.765975]  [<ffffffff8165b5df>] mutex_lock_nested+0x2f/0x3b0
[  445.777783]  [<ffffffff81093f5e>] ? prepare_to_wait_event+0x5e/0x120
[  445.790617]  [<ffffffff81093f5e>] ? prepare_to_wait_event+0x5e/0x120
[  445.803473]  [<ffffffff811f794b>] aio_read_events+0x4b/0x2a0
[  445.814930]  [<ffffffff811f7d36>] read_events+0x196/0x240
[  445.825873]  [<ffffffff810bb0f0>] ? update_rmtp+0x80/0x80
[  445.836799]  [<ffffffff810bba54>] ? hrtimer_start_range_ns+0x14/0x20
[  445.849656]  [<ffffffff810939e0>] ? wait_woken+0xa0/0xa0
[  445.860419]  [<ffffffff811f7867>] ? lookup_ioctx+0x47/0xe0
[  445.871519]  [<ffffffff811f8c34>] SyS_io_getevents+0x64/0x110
[  445.883140]  [<ffffffff810f433c>] ? __audit_syscall_entry+0xac/0x110
[  445.895986]  [<ffffffff8165fcd2>] system_call_fastpath+0x12/0x17
[  445.908131] ---[ end trace 1df697a0d5d1d796 ]---

The patch below fixes things by pushing the mutex out of aio_read_events
and into its caller.  I ended up open coding the waiting and hrtimer loop
because I couldn't see a cleaner way to avoid taking the mutex extra
times.  I also had to set the state to running before calling kmap or
copy_to_user(), which means we might blow through schedule() if we do end
up copying out a single event.  This has been lightly tested and hasn't
been benchmarked, so RFC for now.
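For anyone who hasn't hit the warning yet, the shape of code it is aimed
at looks roughly like the sketch below.  This is a made-up illustration
with hypothetical names (demo_waitq, demo_mutex), not the actual aio code;
the real offender is aio_read_events() taking ctx->ring_lock from inside
the wait condition:

#include <linux/mutex.h>
#include <linux/sched.h>
#include <linux/wait.h>

static DECLARE_WAIT_QUEUE_HEAD(demo_waitq);
static DEFINE_MUTEX(demo_mutex);
static int demo_ready;

static void buggy_wait(void)
{
	DEFINE_WAIT(wait);

	for (;;) {
		prepare_to_wait(&demo_waitq, &wait, TASK_INTERRUPTIBLE);

		/* BUG: we're TASK_INTERRUPTIBLE here, but mutex_lock() may
		 * sleep.  That trips the new __might_sleep() warning, and a
		 * contended lock silently flips us back to TASK_RUNNING.
		 */
		mutex_lock(&demo_mutex);
		if (demo_ready) {
			mutex_unlock(&demo_mutex);
			break;
		}
		mutex_unlock(&demo_mutex);

		schedule();
	}
	finish_wait(&demo_waitq, &wait);
}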
Signed-off-by: Chris Mason <clm@xxxxxx>
Reported-by: The code of Peter Zijlstra <peterz@xxxxxxxxxxxxx>
---
 fs/aio.c | 117 ++++++++++++++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 93 insertions(+), 24 deletions(-)

diff --git a/fs/aio.c b/fs/aio.c
index 1b7893e..b3f9fd5 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1131,6 +1131,8 @@ EXPORT_SYMBOL(aio_complete);
 /* aio_read_events_ring
  * Pull an event off of the ioctx's event ring.  Returns the number of
  * events fetched
+ *
+ * must be called with ctx->ring locked
  */
 static long aio_read_events_ring(struct kioctx *ctx,
 				 struct io_event __user *event, long nr)
@@ -1139,8 +1141,7 @@ static long aio_read_events_ring(struct kioctx *ctx,
 	unsigned head, tail, pos;
 	long ret = 0;
 	int copy_ret;
-
-	mutex_lock(&ctx->ring_lock);
+	int running = current->state == TASK_RUNNING;
 
 	/* Access to ->ring_pages here is protected by ctx->ring_lock. */
 	ring = kmap_atomic(ctx->ring_pages[0]);
@@ -1179,6 +1180,17 @@ static long aio_read_events_ring(struct kioctx *ctx,
 		page = ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE];
 		pos %= AIO_EVENTS_PER_PAGE;
 
+		/* we're called after calling prepare_to_wait, which means
+		 * our current state might not be TASK_RUNNING.  Set it
+		 * to running here so sleeps in kmap or copy_to_user don't
+		 * last forever.  If we don't copy enough events out, we'll
+		 * loop through schedule() one time before sleeping.
+		 */
+		if (!running) {
+			__set_current_state(TASK_RUNNING);
+			running = 1;
+		}
+
 		ev = kmap(page);
 		copy_ret = copy_to_user(event + ret, ev + pos,
 					sizeof(*ev) * avail);
@@ -1201,11 +1213,10 @@ static long aio_read_events_ring(struct kioctx *ctx,
 
 	pr_debug("%li h%u t%u\n", ret, head, tail);
 out:
-	mutex_unlock(&ctx->ring_lock);
-
 	return ret;
 }
 
+/* must be called with ctx->ring locked */
 static bool aio_read_events(struct kioctx *ctx, long min_nr, long nr,
 			    struct io_event __user *event, long *i)
 {
@@ -1223,6 +1234,73 @@ static bool aio_read_events(struct kioctx *ctx, long min_nr, long nr,
 	return ret < 0 || *i >= min_nr;
 }
 
+/*
+ * will take aio ring lock
+ */
+static long aio_wait_events(struct kioctx *ctx, long min_nr, long nr,
+			    struct io_event __user *event,
+			    long *i_ret, ktime_t until)
+{
+	struct hrtimer_sleeper t;
+	long ret;
+	DEFINE_WAIT(wait);
+
+	mutex_lock(&ctx->ring_lock);
+
+	/*
+	 * this is an open coding of wait_event_interruptible_hrtimeout
+	 * so we can deal with the ctx->mutex nicely.  It starts with the
+	 * fast path for existing events:
+	 */
+	ret = aio_read_events(ctx, min_nr, nr, event, i_ret);
+	if (ret) {
+		mutex_unlock(&ctx->ring_lock);
+		return ret;
+	}
+
+	hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	hrtimer_init_sleeper(&t, current);
+	if (until.tv64 != KTIME_MAX) {
+		hrtimer_start_range_ns(&t.timer, until, current->timer_slack_ns,
+				       HRTIMER_MODE_REL);
+	}
+
+	while (1) {
+		ret = prepare_to_wait_event(&ctx->wait, &wait,
+					    TASK_INTERRUPTIBLE);
+		if (ret)
+			break;
+
+		ret = aio_read_events(ctx, min_nr, nr, event, i_ret);
+		if (ret)
+			break;
+
+		/* make sure we didn't timeout taking the mutex */
+		if (!t.task) {
+			ret = -ETIME;
+			break;
+		}
+
+		mutex_unlock(&ctx->ring_lock);
+		schedule();
+
+		if (!t.task) {
+			ret = -ETIME;
+			goto out;
+		}
+		mutex_lock(&ctx->ring_lock);
+
+	}
+	mutex_unlock(&ctx->ring_lock);
+
+out:
+	finish_wait(&ctx->wait, &wait);
+	hrtimer_cancel(&t.timer);
+	destroy_hrtimer_on_stack(&t.timer);
+	return ret;
+
+}
+
 static long read_events(struct kioctx *ctx, long min_nr, long nr,
 			struct io_event __user *event,
 			struct timespec __user *timeout)
@@ -1239,27 +1317,18 @@ static long read_events(struct kioctx *ctx, long min_nr, long nr,
 		until = timespec_to_ktime(ts);
 	}
 
-	/*
-	 * Note that aio_read_events() is being called as the conditional - i.e.
-	 * we're calling it after prepare_to_wait() has set task state to
-	 * TASK_INTERRUPTIBLE.
-	 *
-	 * But aio_read_events() can block, and if it blocks it's going to flip
-	 * the task state back to TASK_RUNNING.
-	 *
-	 * This should be ok, provided it doesn't flip the state back to
-	 * TASK_RUNNING and return 0 too much - that causes us to spin.  That
-	 * will only happen if the mutex_lock() call blocks, and we then find
-	 * the ringbuffer empty.  So in practice we should be ok, but it's
-	 * something to be aware of when touching this code.
-	 */
-	if (until.tv64 == 0)
-		aio_read_events(ctx, min_nr, nr, event, &ret);
-	else
-		wait_event_interruptible_hrtimeout(ctx->wait,
-				aio_read_events(ctx, min_nr, nr, event, &ret),
-				until);
+	if (until.tv64 == 0) {
+		mutex_lock(&ctx->ring_lock);
+		aio_read_events(ctx, min_nr, nr, event, &ret);
+		mutex_unlock(&ctx->ring_lock);
+	} else {
+		/*
+		 * we're pitching the -ETIME and -ERESTARTSYS return values
+		 * here.  It'll turn into -EINTR? ick
+		 */
+		aio_wait_events(ctx, min_nr, nr, event, &ret, until);
+	}
 
 	if (!ret && signal_pending(current))
 		ret = -EINTR;
-- 
1.8.1
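Not part of the patch, but for anyone who wants to poke at the changed
path from userspace: an io_getevents() call with a timeout and no
completions pending goes through read_events() and, with this patch, the
new aio_wait_events() loop.  A minimal libaio snippet along these lines
(hypothetical test code, build with -laio) should exercise the hrtimer
timeout path:

#include <libaio.h>
#include <stdio.h>
#include <string.h>
#include <time.h>

int main(void)
{
	io_context_t ctx = 0;
	struct io_event events[8];
	struct timespec ts = { .tv_sec = 0, .tv_nsec = 100 * 1000 * 1000 };
	int ret;

	ret = io_setup(8, &ctx);
	if (ret < 0) {
		fprintf(stderr, "io_setup: %s\n", strerror(-ret));
		return 1;
	}

	/* Nothing has been submitted, so this sleeps in read_events()
	 * until the 100ms timeout fires, then returns 0 events. */
	ret = io_getevents(ctx, 1, 8, events, &ts);
	printf("io_getevents returned %d\n", ret);

	io_destroy(ctx);
	return 0;
}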