On 2020/08/11 4:29, Andrea Arcangeli wrote: > However once the mutex is killable there's no concern anymore and the > hangcheck timer is correct also not reporting any misbehavior anymore. Do you mean something like below untested patch? I think that the difficult part is that mutex for close() operation can't become killable. And I worry that syzbot soon reports a hung task at pipe_release() instead of pipe_read() or pipe_write(). If pagefault with locks held can be avoided, there will be no such worry. fs/pipe.c | 106 ++++++++++++++++++++++++++++++++++++++-------- fs/splice.c | 41 ++++++++++++------ include/linux/pipe_fs_i.h | 5 ++- 3 files changed, 120 insertions(+), 32 deletions(-) diff --git a/fs/pipe.c b/fs/pipe.c index 60dbee4..537d1ef 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -66,6 +66,13 @@ static void pipe_lock_nested(struct pipe_inode_info *pipe, int subclass) mutex_lock_nested(&pipe->mutex, subclass); } +static int __must_check pipe_lock_killable_nested(struct pipe_inode_info *pipe, int subclass) +{ + if (pipe->files) + return mutex_lock_killable_nested(&pipe->mutex, subclass); + return 0; +} + void pipe_lock(struct pipe_inode_info *pipe) { /* @@ -75,6 +82,14 @@ void pipe_lock(struct pipe_inode_info *pipe) } EXPORT_SYMBOL(pipe_lock); +int pipe_lock_killable(struct pipe_inode_info *pipe) +{ + /* + * pipe_lock() nests non-pipe inode locks (for writing to a file) + */ + return pipe_lock_killable_nested(pipe, I_MUTEX_PARENT); +} + void pipe_unlock(struct pipe_inode_info *pipe) { if (pipe->files) @@ -87,23 +102,37 @@ static inline void __pipe_lock(struct pipe_inode_info *pipe) mutex_lock_nested(&pipe->mutex, I_MUTEX_PARENT); } +static inline int __must_check __pipe_lock_killable(struct pipe_inode_info *pipe) +{ + return mutex_lock_killable_nested(&pipe->mutex, I_MUTEX_PARENT); +} + static inline void __pipe_unlock(struct pipe_inode_info *pipe) { mutex_unlock(&pipe->mutex); } -void pipe_double_lock(struct pipe_inode_info *pipe1, - struct pipe_inode_info *pipe2) +int pipe_double_lock_killable(struct pipe_inode_info *pipe1, + struct pipe_inode_info *pipe2) { BUG_ON(pipe1 == pipe2); if (pipe1 < pipe2) { - pipe_lock_nested(pipe1, I_MUTEX_PARENT); - pipe_lock_nested(pipe2, I_MUTEX_CHILD); + if (pipe_lock_killable_nested(pipe1, I_MUTEX_PARENT)) + return -ERESTARTSYS; + if (pipe_lock_killable_nested(pipe2, I_MUTEX_CHILD)) { + pipe_unlock(pipe1); + return -ERESTARTSYS; + } } else { - pipe_lock_nested(pipe2, I_MUTEX_PARENT); - pipe_lock_nested(pipe1, I_MUTEX_CHILD); + if (pipe_lock_killable_nested(pipe2, I_MUTEX_PARENT)) + return -ERESTARTSYS; + if (pipe_lock_killable_nested(pipe1, I_MUTEX_CHILD)) { + pipe_unlock(pipe2); + return -ERESTARTSYS; + } } + return 0; } /* Drop the inode semaphore and wait for a pipe event, atomically */ @@ -125,6 +154,24 @@ void pipe_wait(struct pipe_inode_info *pipe) pipe_lock(pipe); } +int pipe_wait_killable(struct pipe_inode_info *pipe) +{ + DEFINE_WAIT(rdwait); + DEFINE_WAIT(wrwait); + + /* + * Pipes are system-local resources, so sleeping on them + * is considered a noninteractive wait: + */ + prepare_to_wait(&pipe->rd_wait, &rdwait, TASK_INTERRUPTIBLE); + prepare_to_wait(&pipe->wr_wait, &wrwait, TASK_INTERRUPTIBLE); + pipe_unlock(pipe); + schedule(); + finish_wait(&pipe->rd_wait, &rdwait); + finish_wait(&pipe->wr_wait, &wrwait); + return pipe_lock_killable(pipe); +} + static void anon_pipe_buf_release(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { @@ -244,7 +291,8 @@ static inline bool pipe_readable(const struct pipe_inode_info *pipe) return 0; ret = 0; - __pipe_lock(pipe); + if (__pipe_lock_killable(pipe)) + return -ERESTARTSYS; /* * We only wake up writers if the pipe was full when we started @@ -381,7 +429,8 @@ static inline bool pipe_readable(const struct pipe_inode_info *pipe) if (wait_event_interruptible_exclusive(pipe->rd_wait, pipe_readable(pipe)) < 0) return -ERESTARTSYS; - __pipe_lock(pipe); + if (__pipe_lock_killable(pipe)) + return -ERESTARTSYS; was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage); wake_next_reader = true; } @@ -432,7 +481,8 @@ static inline bool pipe_writable(const struct pipe_inode_info *pipe) if (unlikely(total_len == 0)) return 0; - __pipe_lock(pipe); + if (__pipe_lock_killable(pipe)) + return -ERESTARTSYS; if (!pipe->readers) { send_sig(SIGPIPE, current, 0); @@ -577,7 +627,14 @@ static inline bool pipe_writable(const struct pipe_inode_info *pipe) kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); } wait_event_interruptible_exclusive(pipe->wr_wait, pipe_writable(pipe)); - __pipe_lock(pipe); + if (__pipe_lock_killable(pipe)) { + if (!ret) + ret = -ERESTARTSYS; + /* Extra notification is better than missing notification? */ + was_empty = true; + wake_next_writer = true; + goto out_unlocked; + } was_empty = pipe_empty(pipe->head, pipe->tail); wake_next_writer = true; } @@ -586,6 +643,7 @@ static inline bool pipe_writable(const struct pipe_inode_info *pipe) wake_next_writer = false; __pipe_unlock(pipe); +out_unlocked: /* * If we do do a wakeup event, we do a 'sync' wakeup, because we * want the reader to start processing things asap, rather than @@ -617,7 +675,8 @@ static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) switch (cmd) { case FIONREAD: - __pipe_lock(pipe); + if (__pipe_lock_killable(pipe)) + return -ERESTARTSYS; count = 0; head = pipe->head; tail = pipe->tail; @@ -634,7 +693,8 @@ static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) #ifdef CONFIG_WATCH_QUEUE case IOC_WATCH_QUEUE_SET_SIZE: { int ret; - __pipe_lock(pipe); + if (__pipe_lock_killable(pipe)) + return -ERESTARTSYS; ret = watch_queue_set_size(pipe, arg); __pipe_unlock(pipe); return ret; @@ -719,6 +779,10 @@ static void put_pipe_info(struct inode *inode, struct pipe_inode_info *pipe) { struct pipe_inode_info *pipe = file->private_data; + /* + * Think lock can't be killable. How to avoid deadlock if page fault + * with pipe mutex held does not finish quickly? + */ __pipe_lock(pipe); if (file->f_mode & FMODE_READ) pipe->readers--; @@ -744,7 +808,8 @@ static void put_pipe_info(struct inode *inode, struct pipe_inode_info *pipe) struct pipe_inode_info *pipe = filp->private_data; int retval = 0; - __pipe_lock(pipe); + if (__pipe_lock_killable(pipe)) /* Can this be safely killable? */ + return -ERESTARTSYS; if (filp->f_mode & FMODE_READ) retval = fasync_helper(fd, filp, on, &pipe->fasync_readers); if ((filp->f_mode & FMODE_WRITE) && retval >= 0) { @@ -1040,7 +1105,8 @@ static int wait_for_partner(struct pipe_inode_info *pipe, unsigned int *cnt) int cur = *cnt; while (cur == *cnt) { - pipe_wait(pipe); + if (pipe_wait_killable(pipe)) + break; if (signal_pending(current)) break; } @@ -1083,10 +1149,13 @@ static int fifo_open(struct inode *inode, struct file *filp) spin_unlock(&inode->i_lock); } } - filp->private_data = pipe; - /* OK, we have a pipe and it's pinned down */ - __pipe_lock(pipe); + /* OK, we have a pipe and it's pinned down */ + if (__pipe_lock_killable(pipe)) { + put_pipe_info(inode, pipe); + return -ERESTARTSYS; + } + filp->private_data = pipe; /* We can only do regular read/write on fifos */ stream_open(inode, filp); @@ -1349,7 +1418,8 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg) if (!pipe) return -EBADF; - __pipe_lock(pipe); + if (__pipe_lock_killable(pipe)) + return -ERESTARTSYS; switch (cmd) { case F_SETPIPE_SZ: diff --git a/fs/splice.c b/fs/splice.c index d7c8a7c..65d24df 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -563,7 +563,8 @@ static int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_des sd->need_wakeup = false; } - pipe_wait(pipe); + if (pipe_wait_killable(pipe)) + return -ERESTARTSYS; } return 1; @@ -657,7 +658,8 @@ ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out, .u.file = out, }; - pipe_lock(pipe); + if (pipe_lock_killable(pipe)) + return -ERESTARTSYS; ret = __splice_from_pipe(pipe, &sd, actor); pipe_unlock(pipe); @@ -696,7 +698,10 @@ ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out, if (unlikely(!array)) return -ENOMEM; - pipe_lock(pipe); + if (pipe_lock_killable(pipe)) { + kfree(array); + return -ERESTARTSYS; + } splice_from_pipe_begin(&sd); while (sd.total_len) { @@ -1077,7 +1082,8 @@ static int wait_for_space(struct pipe_inode_info *pipe, unsigned flags) return -EAGAIN; if (signal_pending(current)) return -ERESTARTSYS; - pipe_wait(pipe); + if (pipe_wait_killable(pipe)) + return -ERESTARTSYS; } } @@ -1167,7 +1173,8 @@ long do_splice(struct file *in, loff_t __user *off_in, if (out->f_flags & O_NONBLOCK) flags |= SPLICE_F_NONBLOCK; - pipe_lock(opipe); + if (pipe_lock_killable(opipe)) + return -ERESTARTSYS; ret = wait_for_space(opipe, flags); if (!ret) { unsigned int p_space; @@ -1264,7 +1271,8 @@ static long vmsplice_to_user(struct file *file, struct iov_iter *iter, return -EBADF; if (sd.total_len) { - pipe_lock(pipe); + if (pipe_lock_killable(pipe)) + return -ERESTARTSYS; ret = __splice_from_pipe(pipe, &sd, pipe_to_user); pipe_unlock(pipe); } @@ -1291,7 +1299,8 @@ static long vmsplice_to_pipe(struct file *file, struct iov_iter *iter, if (!pipe) return -EBADF; - pipe_lock(pipe); + if (pipe_lock_killable(pipe)) + return -ERESTARTSYS; ret = wait_for_space(pipe, flags); if (!ret) ret = iter_to_pipe(iter, pipe, buf_flag); @@ -1441,7 +1450,8 @@ static int ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags) return 0; ret = 0; - pipe_lock(pipe); + if (pipe_lock_killable(pipe)) + return -ERESTARTSYS; while (pipe_empty(pipe->head, pipe->tail)) { if (signal_pending(current)) { @@ -1454,7 +1464,8 @@ static int ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags) ret = -EAGAIN; break; } - pipe_wait(pipe); + if (pipe_wait_killable(pipe)) + return -ERESTARTSYS; } pipe_unlock(pipe); @@ -1477,7 +1488,8 @@ static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags) return 0; ret = 0; - pipe_lock(pipe); + if (pipe_lock_killable(pipe)) + return -ERESTARTSYS; while (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) { if (!pipe->readers) { @@ -1493,7 +1505,8 @@ static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags) ret = -ERESTARTSYS; break; } - pipe_wait(pipe); + if (pipe_wait_killable(pipe)) + return -ERESTARTSYS; } pipe_unlock(pipe); @@ -1529,7 +1542,8 @@ static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, * grabbing by pipe info address. Otherwise two different processes * could deadlock (one doing tee from A -> B, the other from B -> A). */ - pipe_double_lock(ipipe, opipe); + if (pipe_double_lock_killable(ipipe, opipe)) + return -ERESTARTSYS; i_tail = ipipe->tail; i_mask = ipipe->ring_size - 1; @@ -1655,7 +1669,8 @@ static int link_pipe(struct pipe_inode_info *ipipe, * grabbing by pipe info address. Otherwise two different processes * could deadlock (one doing tee from A -> B, the other from B -> A). */ - pipe_double_lock(ipipe, opipe); + if (pipe_double_lock_killable(ipipe, opipe)) + return -ERESTARTSYS; i_tail = ipipe->tail; i_mask = ipipe->ring_size - 1; diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index 50afd0d..eb99c18 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -233,8 +233,10 @@ static inline bool pipe_buf_try_steal(struct pipe_inode_info *pipe, /* Pipe lock and unlock operations */ void pipe_lock(struct pipe_inode_info *); +int __must_check pipe_lock_killable(struct pipe_inode_info *pipe); void pipe_unlock(struct pipe_inode_info *); -void pipe_double_lock(struct pipe_inode_info *, struct pipe_inode_info *); +int __must_check pipe_double_lock_killable(struct pipe_inode_info *pipe1, + struct pipe_inode_info *pipe2); extern unsigned int pipe_max_size; extern unsigned long pipe_user_pages_hard; @@ -242,6 +244,7 @@ static inline bool pipe_buf_try_steal(struct pipe_inode_info *pipe, /* Drop the inode semaphore and wait for a pipe event, atomically */ void pipe_wait(struct pipe_inode_info *pipe); +int __must_check pipe_wait_killable(struct pipe_inode_info *pipe); struct pipe_inode_info *alloc_pipe_info(void); void free_pipe_info(struct pipe_inode_info *); -- 1.8.3.1