This should cover most cases, I think. I'll have to break this out into separate patches and maybe clean things up a bit (there are certainly comments missing, and there is some repetition). But this boots on my test box and builds a kernel without generating a single WARN -- a big improvement over not booting (partly due to excessive WARN output) with just the sched/core.c bit. --- drivers/tty/n_tty.c | 71 +++++++++++++++++++++++++++++++++------- fs/notify/inotify/inotify_user.c | 34 +++++++++++++++++-- fs/notify/notification.c | 2 +- include/linux/kernel.h | 4 +-- include/linux/sched.h | 46 ++++++++++++++++++++++++-- include/linux/wait.h | 2 ++ kernel/exit.c | 6 ++++ kernel/sched/core.c | 14 ++++++++ kernel/smpboot.c | 15 +++++---- 9 files changed, 168 insertions(+), 26 deletions(-) diff --git a/drivers/tty/n_tty.c b/drivers/tty/n_tty.c index f44f1ba762c3..5e4830979937 100644 --- a/drivers/tty/n_tty.c +++ b/drivers/tty/n_tty.c @@ -2098,6 +2098,36 @@ static int job_control(struct tty_struct *tty, struct file *file) return 0; } +/* + * n_tty_{read,write} use blocking primitives (mutex_lock, down_read, etc.) + * inside an open-coded wait loop. + * + * The wait loop relies on current->state to record wakeups; these will change + * it back to TASK_RUNNING. However blocking primitives themselves also change + * current->state. Therefore we must implement another means of recording + * wakeups. + * + * We do this by setting an alternative waitqueue wake function which changes + * an additional variable.
+ */ + +struct n_tty_wakeup_state { + struct task_struct *task; + bool woken; +}; + +static int n_tty_wake(wait_queue_t *wait, unsigned mode, + int wake_flags, void *key) +{ + struct n_tty_wakeup_state *s = wait->private; + DECLARE_WAITQUEUE(dummy_wait, s->task); + + smp_wmb(); + s->woken = true; + + return default_wake_function(&dummy_wait, mode, wake_flags, key); +} + /** * n_tty_read - read function for tty @@ -2123,7 +2153,11 @@ static ssize_t n_tty_read(struct tty_struct *tty, struct file *file, { struct n_tty_data *ldata = tty->disc_data; unsigned char __user *b = buf; - DECLARE_WAITQUEUE(wait, current); + struct n_tty_wakeup_state s = { + .task = current, + .woken = false, + }; + wait_queue_t wait; int c; int minimum, time; ssize_t retval = 0; @@ -2167,6 +2201,9 @@ static ssize_t n_tty_read(struct tty_struct *tty, struct file *file, packet = tty->packet; + init_waitqueue_func_entry(&wait, n_tty_wake); + wait.private = &s; + add_wait_queue(&tty->read_wait, &wait); while (nr) { /* First test for status change. */ @@ -2186,10 +2223,11 @@ static ssize_t n_tty_read(struct tty_struct *tty, struct file *file, nr--; break; } - /* This statement must be first before checking for input - so that any interrupt will set the state back to - TASK_RUNNING. */ - set_current_state(TASK_INTERRUPTIBLE); + /* + * This statement must be first before checking for input so + * that any interrupt will set it to true. + */ + s.woken = false; if (((minimum - (b - buf)) < ldata->minimum_to_wake) && ((minimum - (b - buf)) >= 1)) @@ -2220,13 +2258,15 @@ static ssize_t n_tty_read(struct tty_struct *tty, struct file *file, n_tty_set_room(tty); up_read(&tty->termios_rwsem); - timeout = schedule_timeout(timeout); + set_current_state(TASK_INTERRUPTIBLE); + if (!s.woken) + timeout = schedule_timeout(timeout); + __set_current_state(TASK_RUNNING); down_read(&tty->termios_rwsem); continue; } } - __set_current_state(TASK_RUNNING); /* Deal with packet mode. 
*/ if (packet && b == buf) { @@ -2273,7 +2313,6 @@ static ssize_t n_tty_read(struct tty_struct *tty, struct file *file, mutex_unlock(&ldata->atomic_read_lock); - __set_current_state(TASK_RUNNING); if (b - buf) retval = b - buf; @@ -2306,7 +2345,11 @@ static ssize_t n_tty_write(struct tty_struct *tty, struct file *file, const unsigned char *buf, size_t nr) { const unsigned char *b = buf; - DECLARE_WAITQUEUE(wait, current); + struct n_tty_wakeup_state s = { + .task = current, + .woken = false, + }; + wait_queue_t wait; int c; ssize_t retval = 0; @@ -2322,9 +2365,12 @@ static ssize_t n_tty_write(struct tty_struct *tty, struct file *file, /* Write out any echoed characters that are still pending */ process_echoes(tty); + init_waitqueue_func_entry(&wait, n_tty_wake); + wait.private = &s; + add_wait_queue(&tty->write_wait, &wait); while (1) { - set_current_state(TASK_INTERRUPTIBLE); + s.woken = false; if (signal_pending(current)) { retval = -ERESTARTSYS; break; @@ -2378,7 +2424,10 @@ static ssize_t n_tty_write(struct tty_struct *tty, struct file *file, } up_read(&tty->termios_rwsem); - schedule(); + set_current_state(TASK_INTERRUPTIBLE); + if (!s.woken) + schedule(); + __set_current_state(TASK_RUNNING); down_read(&tty->termios_rwsem); } diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index cc423a30a0c8..e3b65ce6b312 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -220,6 +220,23 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, return event_size; } +struct inotify_wakeup_state { + struct task_struct *task; + bool woken; +}; + +static int inotify_wake(wait_queue_t *wait, unsigned mode, + int wake_flags, void *key) +{ + struct inotify_wakeup_state *s = wait->private; + DECLARE_WAITQUEUE(dummy_wait, s->task); + + smp_wmb(); + s->woken = true; + + return autoremove_wake_function(&dummy_wait, mode, wake_flags, key); +} + static ssize_t inotify_read(struct file *file, char __user *buf, 
size_t count, loff_t *pos) { @@ -227,13 +244,21 @@ static ssize_t inotify_read(struct file *file, char __user *buf, struct fsnotify_event *kevent; char __user *start; int ret; - DEFINE_WAIT(wait); + struct inotify_wakeup_state s = { + .task = current, + .woken = false, + }; + wait_queue_t wait; + + init_waitqueue_func_entry(&wait, inotify_wake); + wait.private = &s; start = buf; group = file->private_data; + add_wait_queue(&group->notification_waitq, &wait); while (1) { - prepare_to_wait(&group->notification_waitq, &wait, TASK_INTERRUPTIBLE); + s.woken = false; mutex_lock(&group->notification_mutex); kevent = get_one_event(group, count); @@ -264,7 +289,10 @@ static ssize_t inotify_read(struct file *file, char __user *buf, if (start != buf) break; - schedule(); + set_current_state(TASK_INTERRUPTIBLE); + if (!s.woken) + schedule(); + __set_current_state(TASK_RUNNING); } finish_wait(&group->notification_waitq, &wait); diff --git a/fs/notify/notification.c b/fs/notify/notification.c index 1e58402171a5..dcfcdd69d1de 100644 --- a/fs/notify/notification.c +++ b/fs/notify/notification.c @@ -63,7 +63,7 @@ EXPORT_SYMBOL_GPL(fsnotify_get_cookie); /* return true if the notify queue is empty, false otherwise */ bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group) { - BUG_ON(!mutex_is_locked(&group->notification_mutex)); + lockdep_assert_held(&group->notification_mutex); return list_empty(&group->notification_list) ? true : false; } diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 4c52907a6d8b..aac1dc9da2d0 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -162,7 +162,7 @@ extern int _cond_resched(void); #endif #ifdef CONFIG_DEBUG_ATOMIC_SLEEP - void __might_sleep(const char *file, int line, int preempt_offset); +extern void __might_sleep(const char *file, int line, int preempt_offset); /** * might_sleep - annotation for functions that can sleep * @@ -174,7 +174,7 @@ extern int _cond_resched(void); * supposed to. 
*/ # define might_sleep() \ - do { __might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0) + do { __might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0) #else static inline void __might_sleep(const char *file, int line, int preempt_offset) { } diff --git a/include/linux/sched.h b/include/linux/sched.h index 66124d63371a..62dab5738e66 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -241,6 +241,43 @@ extern char ___assert_task_state[1 - 2*!!( ((task->state & TASK_UNINTERRUPTIBLE) != 0 && \ (task->flags & PF_FROZEN) == 0) +#ifdef CONFIG_DEBUG_ATOMIC_SLEEP + +#define __set_task_state(tsk, state_value) \ + do { \ + (tsk)->task_state_change = _THIS_IP_; \ + (tsk)->state = (state_value); \ + } while (0) +#define set_task_state(tsk, state_value) \ + do { \ + (tsk)->task_state_change = _THIS_IP_; \ + set_mb((tsk)->state, (state_value)); \ + } while (0) + +/* + * set_current_state() includes a barrier so that the write of current->state + * is correctly serialised wrt the caller's subsequent test of whether to + * actually sleep: + * + * set_current_state(TASK_UNINTERRUPTIBLE); + * if (do_i_need_to_sleep()) + * schedule(); + * + * If the caller does not need such serialisation then use __set_current_state() + */ +#define __set_current_state(state_value) \ + do { \ + current->task_state_change = _THIS_IP_; \ + current->state = (state_value); \ + } while (0) +#define set_current_state(state_value) \ + do { \ + current->task_state_change = _THIS_IP_; \ + set_mb(current->state, (state_value)); \ + } while (0) + +#else + #define __set_task_state(tsk, state_value) \ do { (tsk)->state = (state_value); } while (0) #define set_task_state(tsk, state_value) \ @@ -257,11 +294,13 @@ extern char ___assert_task_state[1 - 2*!!( * * If the caller does not need such serialisation then use __set_current_state() */ -#define __set_current_state(state_value) \ +#define __set_current_state(state_value) \ do { current->state = (state_value); } while (0) 
-#define set_current_state(state_value) \ +#define set_current_state(state_value) \ set_mb(current->state, (state_value)) +#endif + /* Task command name length */ #define TASK_COMM_LEN 16 @@ -1650,6 +1689,9 @@ struct task_struct { unsigned int sequential_io; unsigned int sequential_io_avg; #endif +#ifdef CONFIG_DEBUG_ATOMIC_SLEEP + unsigned long task_state_change; +#endif }; /* Future-safe accessor for struct task_struct's cpus_allowed. */ diff --git a/include/linux/wait.h b/include/linux/wait.h index 6fb1ba5f9b2f..041b744e99b4 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -211,6 +211,8 @@ wait_queue_head_t *bit_waitqueue(void *, int); wait_queue_t __wait; \ long __ret = ret; /* explicit shadow */ \ \ + might_sleep(); \ + \ INIT_LIST_HEAD(&__wait.task_list); \ if (exclusive) \ __wait.flags = WQ_FLAG_EXCLUSIVE; \ diff --git a/kernel/exit.c b/kernel/exit.c index e5c4668f1799..44cbc8791fca 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1067,6 +1067,12 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) } /* + * Since we're going to terminate the wait loop from do_wait(), + * reset task state. + */ + __set_current_state(TASK_RUNNING); + + /* * Now we are sure this task is interesting, and no other * thread can reap it because we its state == DEAD/TRACE. */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2676866b4394..0577c12c9cf8 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7077,6 +7077,19 @@ void __might_sleep(const char *file, int line, int preempt_offset) { static unsigned long prev_jiffy; /* ratelimiting */ + /* + * Blocking primitives will set (and therefore destroy) current->state, + * since we will exit with TASK_RUNNING make sure we enter with it, + * otherwise we will destroy state. 
+ */ + if (WARN(current->state != TASK_RUNNING, + "do not call blocking ops when !TASK_RUNNING; " + "state=%lx set at [<%p>] %pS\n", + current->state, + current->task_state_change, + current->task_state_change)) + __set_current_state(TASK_RUNNING); + rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */ if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && !is_idle_task(current)) || @@ -7107,6 +7120,7 @@ void __might_sleep(const char *file, int line, int preempt_offset) dump_stack(); } EXPORT_SYMBOL(__might_sleep); + #endif #ifdef CONFIG_MAGIC_SYSRQ diff --git a/kernel/smpboot.c b/kernel/smpboot.c index eb89e1807408..f032fb5284e3 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -110,7 +110,7 @@ static int smpboot_thread_fn(void *data) set_current_state(TASK_INTERRUPTIBLE); preempt_disable(); if (kthread_should_stop()) { - set_current_state(TASK_RUNNING); + __set_current_state(TASK_RUNNING); preempt_enable(); if (ht->cleanup) ht->cleanup(td->cpu, cpu_online(td->cpu)); @@ -136,26 +136,27 @@ static int smpboot_thread_fn(void *data) /* Check for state change setup */ switch (td->status) { case HP_THREAD_NONE: + __set_current_state(TASK_RUNNING); preempt_enable(); if (ht->setup) ht->setup(td->cpu); td->status = HP_THREAD_ACTIVE; - preempt_disable(); - break; + continue; + case HP_THREAD_PARKED: + __set_current_state(TASK_RUNNING); preempt_enable(); if (ht->unpark) ht->unpark(td->cpu); td->status = HP_THREAD_ACTIVE; - preempt_disable(); - break; + continue; } if (!ht->thread_should_run(td->cpu)) { - preempt_enable(); + preempt_enable_no_resched(); schedule(); } else { - set_current_state(TASK_RUNNING); + __set_current_state(TASK_RUNNING); preempt_enable(); ht->thread_fn(td->cpu); } -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html