The 3.19 merge window brought in a great new warning to catch someone
calling might_sleep with their state != TASK_RUNNING.  The idea was to
find buggy code locking mutexes after calling prepare_to_wait(), kind of
like this:

[  445.464634] WARNING: CPU: 22 PID: 4367 at kernel/sched/core.c:7303 __might_sleep+0x9a/0xb0()
[  445.481699] do not call blocking ops when !TASK_RUNNING; state=1 set at [<ffffffff81093f68>] prepare_to_wait_event+0x68/0x120
[  445.504494] Modules linked in: btrfs raid6_pq zlib_deflate lzo_compress xor xfs exportfs libcrc32c nfsv4 k10temp coretemp hwmon loop tcp_diag inet_diag ip6table_filter ip6_tables xt_NFLOG nfnetlink_log nfnetlink xt_comment xt_statistic iptable_filter ip_tables x_tables mptctl netconsole autofs4 nfsv3 nfs lockd grace rpcsec_gss_krb5 auth_rpcgss oid_registry sunrpc ipv6 ext3 jbd dm_mod iTCO_wdt iTCO_vendor_support ipmi_si ipmi_msghandler rtc_cmos pcspkr i2c_i801 lpc_ich mfd_core shpchp ehci_pci ehci_hcd mlx4_en ptp pps_core mlx4_core ses enclosure sg button megaraid_sas
[  445.612013] CPU: 22 PID: 4367 Comm: mysqld Tainted: G W 3.18.0-mason+ #21
[  445.627862] Hardware name: ZTSYSTEMS Echo Ridge T4 /A9DRPF-10D, BIOS 1.07 05/10/2012
[  445.643727]  0000000000001c87 ffff880e18ae7c58 ffffffff81658fc9 0000000000001c87
[  445.659194]  ffff880e18ae7ca8 ffff880e18ae7c98 ffffffff81054b25 0000000000000000
[  445.674695]  0000000000000000 000000000000026d ffffffff81a13c0b 0000000000000000
[  445.690091] Call Trace:
[  445.695129]  [<ffffffff81658fc9>] dump_stack+0x4f/0x6e
[  445.705545]  [<ffffffff81054b25>] warn_slowpath_common+0x95/0xe0
[  445.717699]  [<ffffffff81054c26>] warn_slowpath_fmt+0x46/0x50
[  445.729341]  [<ffffffff81093f68>] ? prepare_to_wait_event+0x68/0x120
[  445.742191]  [<ffffffff81093f68>] ? prepare_to_wait_event+0x68/0x120
[  445.755037]  [<ffffffff8107b03a>] __might_sleep+0x9a/0xb0
[  445.765975]  [<ffffffff8165b5df>] mutex_lock_nested+0x2f/0x3b0
[  445.777783]  [<ffffffff81093f5e>] ? prepare_to_wait_event+0x5e/0x120
[  445.790617]  [<ffffffff81093f5e>] ? prepare_to_wait_event+0x5e/0x120
[  445.803473]  [<ffffffff811f794b>] aio_read_events+0x4b/0x2a0
[  445.814930]  [<ffffffff811f7d36>] read_events+0x196/0x240
[  445.825873]  [<ffffffff810bb0f0>] ? update_rmtp+0x80/0x80
[  445.836799]  [<ffffffff810bba54>] ? hrtimer_start_range_ns+0x14/0x20
[  445.849656]  [<ffffffff810939e0>] ? wait_woken+0xa0/0xa0
[  445.860419]  [<ffffffff811f7867>] ? lookup_ioctx+0x47/0xe0
[  445.871519]  [<ffffffff811f8c34>] SyS_io_getevents+0x64/0x110
[  445.883140]  [<ffffffff810f433c>] ? __audit_syscall_entry+0xac/0x110
[  445.895986]  [<ffffffff8165fcd2>] system_call_fastpath+0x12/0x17
[  445.908131] ---[ end trace 1df697a0d5d1d796 ]---

The patch below fixes things by pushing the mutex out of aio_read_events
and into its caller.  I ended up open coding the waiting and hrtimer loop
because I couldn't see a cleaner way to avoid taking the mutex extra
times.  I also had to set the state to running before calling kmap or
copy_to_user(), which means we might blow through schedule() if we do end
up copying out a single event.  This has been lightly tested and hasn't
been benchmarked, so RFC for now.
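For anyone who hasn't hit the warning yet, the shape of code it is aimed
at looks roughly like the sketch below.  This is a made-up illustration
with hypothetical names (demo_waitq, demo_mutex), not the actual aio code;
the real offender is aio_read_events() taking ctx->ring_lock from inside
the wait condition:

#include <linux/mutex.h>
#include <linux/sched.h>
#include <linux/wait.h>

static DECLARE_WAIT_QUEUE_HEAD(demo_waitq);
static DEFINE_MUTEX(demo_mutex);
static int demo_ready;

static void buggy_wait(void)
{
	DEFINE_WAIT(wait);

	for (;;) {
		prepare_to_wait(&demo_waitq, &wait, TASK_INTERRUPTIBLE);

		/* BUG: we're TASK_INTERRUPTIBLE here, but mutex_lock() may
		 * sleep.  That trips the new __might_sleep() warning, and a
		 * contended lock silently flips us back to TASK_RUNNING.
		 */
		mutex_lock(&demo_mutex);
		if (demo_ready) {
			mutex_unlock(&demo_mutex);
			break;
		}
		mutex_unlock(&demo_mutex);

		schedule();
	}
	finish_wait(&demo_waitq, &wait);
}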
Signed-off-by: Chris Mason <clm@xxxxxx>
Reported-by: The code of Peter Zijlstra <peterz@xxxxxxxxxxxxx>
---
 fs/aio.c | 117 ++++++++++++++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 93 insertions(+), 24 deletions(-)

diff --git a/fs/aio.c b/fs/aio.c
index 1b7893e..b3f9fd5 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1131,6 +1131,8 @@ EXPORT_SYMBOL(aio_complete);
 /* aio_read_events_ring
  * Pull an event off of the ioctx's event ring.  Returns the number of
  * events fetched
+ *
+ * must be called with ctx->ring locked
  */
 static long aio_read_events_ring(struct kioctx *ctx,
 				 struct io_event __user *event, long nr)
@@ -1139,8 +1141,7 @@ static long aio_read_events_ring(struct kioctx *ctx,
 	unsigned head, tail, pos;
 	long ret = 0;
 	int copy_ret;
-
-	mutex_lock(&ctx->ring_lock);
+	int running = current->state == TASK_RUNNING;
 
 	/* Access to ->ring_pages here is protected by ctx->ring_lock. */
 	ring = kmap_atomic(ctx->ring_pages[0]);
@@ -1179,6 +1180,17 @@ static long aio_read_events_ring(struct kioctx *ctx,
 		page = ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE];
 		pos %= AIO_EVENTS_PER_PAGE;
 
+		/* we're called after calling prepare_to_wait, which means
+		 * our current state might not be TASK_RUNNING.  Set it
+		 * to running here so sleeps in kmap or copy_to_user don't
+		 * last forever.  If we don't copy enough events out, we'll
+		 * loop through schedule() one time before sleeping.
+		 */
+		if (!running) {
+			__set_current_state(TASK_RUNNING);
+			running = 1;
+		}
+
 		ev = kmap(page);
 		copy_ret = copy_to_user(event + ret, ev + pos,
 					sizeof(*ev) * avail);
@@ -1201,11 +1213,10 @@ static long aio_read_events_ring(struct kioctx *ctx,
 
 	pr_debug("%li h%u t%u\n", ret, head, tail);
 out:
-	mutex_unlock(&ctx->ring_lock);
-
 	return ret;
 }
 
+/* must be called with ctx->ring locked */
 static bool aio_read_events(struct kioctx *ctx, long min_nr, long nr,
 			    struct io_event __user *event, long *i)
 {
@@ -1223,6 +1234,73 @@ static bool aio_read_events(struct kioctx *ctx, long min_nr, long nr,
 	return ret < 0 || *i >= min_nr;
 }
 
+/*
+ * will take aio ring lock
+ */
+static long aio_wait_events(struct kioctx *ctx, long min_nr, long nr,
+			    struct io_event __user *event,
+			    long *i_ret, ktime_t until)
+{
+	struct hrtimer_sleeper t;
+	long ret;
+	DEFINE_WAIT(wait);
+
+	mutex_lock(&ctx->ring_lock);
+
+	/*
+	 * this is an open coding of wait_event_interruptible_hrtimeout
+	 * so we can deal with the ctx->mutex nicely.  It starts with the
+	 * fast path for existing events:
+	 */
+	ret = aio_read_events(ctx, min_nr, nr, event, i_ret);
+	if (ret) {
+		mutex_unlock(&ctx->ring_lock);
+		return ret;
+	}
+
+	hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	hrtimer_init_sleeper(&t, current);
+	if (until.tv64 != KTIME_MAX) {
+		hrtimer_start_range_ns(&t.timer, until, current->timer_slack_ns,
+				       HRTIMER_MODE_REL);
+	}
+
+	while (1) {
+		ret = prepare_to_wait_event(&ctx->wait, &wait,
+					    TASK_INTERRUPTIBLE);
+		if (ret)
+			break;
+
+		ret = aio_read_events(ctx, min_nr, nr, event, i_ret);
+		if (ret)
+			break;
+
+		/* make sure we didn't timeout taking the mutex */
+		if (!t.task) {
+			ret = -ETIME;
+			break;
+		}
+
+		mutex_unlock(&ctx->ring_lock);
+		schedule();
+
+		if (!t.task) {
+			ret = -ETIME;
+			goto out;
+		}
+		mutex_lock(&ctx->ring_lock);
+
+	}
+	mutex_unlock(&ctx->ring_lock);
+
+out:
+	finish_wait(&ctx->wait, &wait);
+	hrtimer_cancel(&t.timer);
+	destroy_hrtimer_on_stack(&t.timer);
+	return ret;
+
+}
+
 static long read_events(struct kioctx *ctx, long min_nr, long nr,
 			struct io_event __user *event,
 			struct timespec __user *timeout)
@@ -1239,27 +1317,18 @@ static long read_events(struct kioctx *ctx, long min_nr, long nr,
 		until = timespec_to_ktime(ts);
 	}
 
-	/*
-	 * Note that aio_read_events() is being called as the conditional - i.e.
-	 * we're calling it after prepare_to_wait() has set task state to
-	 * TASK_INTERRUPTIBLE.
-	 *
-	 * But aio_read_events() can block, and if it blocks it's going to flip
-	 * the task state back to TASK_RUNNING.
-	 *
-	 * This should be ok, provided it doesn't flip the state back to
-	 * TASK_RUNNING and return 0 too much - that causes us to spin.  That
-	 * will only happen if the mutex_lock() call blocks, and we then find
-	 * the ringbuffer empty.  So in practice we should be ok, but it's
-	 * something to be aware of when touching this code.
-	 */
-	if (until.tv64 == 0)
-		aio_read_events(ctx, min_nr, nr, event, &ret);
-	else
-		wait_event_interruptible_hrtimeout(ctx->wait,
-				aio_read_events(ctx, min_nr, nr, event, &ret),
-				until);
+	if (until.tv64 == 0) {
+		mutex_lock(&ctx->ring_lock);
+		aio_read_events(ctx, min_nr, nr, event, &ret);
+		mutex_unlock(&ctx->ring_lock);
+	} else {
+		/*
+		 * we're pitching the -ETIME and -ERESTARTSYS return values
+		 * here.  It'll turn into -EINTR? ick
+		 */
+		aio_wait_events(ctx, min_nr, nr, event, &ret, until);
+	}
 
 	if (!ret && signal_pending(current))
 		ret = -EINTR;
-- 
1.8.1
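Not part of the patch, but for anyone who wants to poke at the changed
path from userspace: an io_getevents() call with a timeout and no
completions pending goes through read_events() and, with this patch, the
new aio_wait_events() loop.  A minimal libaio snippet along these lines
(hypothetical test code, build with -laio) should exercise the hrtimer
timeout path:

#include <libaio.h>
#include <stdio.h>
#include <string.h>
#include <time.h>

int main(void)
{
	io_context_t ctx = 0;
	struct io_event events[8];
	struct timespec ts = { .tv_sec = 0, .tv_nsec = 100 * 1000 * 1000 };
	int ret;

	ret = io_setup(8, &ctx);
	if (ret < 0) {
		fprintf(stderr, "io_setup: %s\n", strerror(-ret));
		return 1;
	}

	/* Nothing has been submitted, so this sleeps in read_events()
	 * until the 100ms timeout fires, then returns 0 events. */
	ret = io_getevents(ctx, 1, 8, events, &ts);
	printf("io_getevents returned %d\n", ret);

	io_destroy(ctx);
	return 0;
}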