The md driver currently supports 'poll' on /proc/mdstat. This is unsafe: if the md-mod module is removed while a 'poll' or 'select' is outstanding on /proc/mdstat, an oops occurs when the syscall completes, because poll_freewait() will call remove_wait_queue() on a wait_queue_head_t which was local to the module which no longer exists. This problem is particular to /proc. Most filesystems do not allow the module to be unloaded while any files are open on it. /proc only blocks module unloading while a file_operations call is currently active into the module, not while the file is open. kernfs has this property too, but kernfs allocates a wait_queue_head_t in its internal data structures so the module doesn't need to provide one. (A previous patch to add a similar allocation to procfs was not accepted.) This patch takes a different approach and allows a module to disconnect the wait_queue_head_t that was passed to poll_wait() from all the clients which are waiting on it. Thus after calling remove_proc_entry("mdstat", NULL); we simply call wait_queue_purge(&md_event_waiters); and then know that it is safe to remove the module. RCU infrastructure is used to avoid races. poll_freewait() checks (via remove_wait_queue_purgeable()) whether the purge has happened under rcu_read_lock() to ensure that it never touches any freed memory. wait_queue_purge() uses synchronize_rcu() to ensure no poll_freewait() could still be looking at the wait_queue_head_t. 
Reported-by: "majianpeng" <majianpeng@xxxxxxxxx> Signed-off-by: NeilBrown <neilb@xxxxxxx> diff --git a/drivers/md/md.c b/drivers/md/md.c index 4ad5cc4e63e8..e28c9d2a1166 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -8681,6 +8681,7 @@ static __exit void md_exit(void) unregister_reboot_notifier(&md_notifier); unregister_sysctl_table(raid_table_header); remove_proc_entry("mdstat", NULL); + wait_queue_purge(&md_event_waiters); for_each_mddev(mddev, tmp) { export_array(mddev); mddev->hold_active = 0; diff --git a/fs/eventpoll.c b/fs/eventpoll.c index af903128891c..a095312d01e2 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -521,7 +521,7 @@ static void ep_remove_wait_queue(struct eppoll_entry *pwq) /* If it is cleared by POLLFREE, it should be rcu-safe */ whead = rcu_dereference(pwq->whead); if (whead) - remove_wait_queue(whead, &pwq->wait); + remove_wait_queue_purgeable(whead, &pwq->wait); rcu_read_unlock(); } diff --git a/fs/select.c b/fs/select.c index 467bb1cb3ea5..7c35bcdbd94c 100644 --- a/fs/select.c +++ b/fs/select.c @@ -129,7 +129,7 @@ EXPORT_SYMBOL(poll_initwait); static void free_poll_entry(struct poll_table_entry *entry) { - remove_wait_queue(entry->wait_address, &entry->wait); + remove_wait_queue_purgeable(entry->wait_address, &entry->wait); fput(entry->filp); } diff --git a/include/linux/wait.h b/include/linux/wait.h index 559044c79232..18d0d2fbf3bd 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -106,6 +106,8 @@ static inline int waitqueue_active(wait_queue_head_t *q) extern void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait); extern void add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t *wait); extern void remove_wait_queue(wait_queue_head_t *q, wait_queue_t *wait); +extern void wait_queue_purge(wait_queue_head_t *q); +extern void remove_wait_queue_purgeable(wait_queue_head_t *q, wait_queue_t *wait); static inline void __add_wait_queue(wait_queue_head_t *head, wait_queue_t *new) { diff --git 
a/kernel/sched/wait.c b/kernel/sched/wait.c index 7d50f794e248..12548730c6ed 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -52,6 +52,51 @@ void remove_wait_queue(wait_queue_head_t *q, wait_queue_t *wait) EXPORT_SYMBOL(remove_wait_queue); +/** + * wait_queue_purge - remove all waiters from a wait_queue + * @q: The queue to be purged + * + * Unlink all pending waiters from the queue. + * This can be used prior to freeing a queue providing all waiters are + * prepared for queue purging. + * Waiters must call remove_wait_queue_purgeable() rather than + * remove_wait_queue(). + * + */ +void wait_queue_purge(wait_queue_head_t *q) +{ + spin_lock(&q->lock); + while (!list_empty(&q->task_list)) + list_del_init(q->task_list.next); + spin_unlock(&q->lock); + synchronize_rcu(); +} +EXPORT_SYMBOL(wait_queue_purge); + +/** + * remove_wait_queue_purgeable - remove_wait_queue if wait_queue_purge might be used. + * @q: the queue, which may already be purged, to remove from + * @wait: the waiter to remove + * + * Remove a waiter from a queue if it hasn't already been purged. + * If the queue has already been purged then task_list will be empty. + * If it isn't then it is still safe to lock the queue and remove + * the task. + */ +void remove_wait_queue_purgeable(wait_queue_head_t *q, wait_queue_t *wait) +{ + unsigned long flags; + + rcu_read_lock(); + if (!list_empty(&wait->task_list)) { + spin_lock_irqsave(&q->lock, flags); + list_del_init(&wait->task_list); + spin_unlock_irqrestore(&q->lock, flags); + } + rcu_read_unlock(); +} +EXPORT_SYMBOL(remove_wait_queue_purgeable); + /* * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
Attachment:
signature.asc
Description: PGP signature