Thers is a out-of-order access problem which would cause endless sleep when we use pipe with epoll. The story is following, we assume the ring size is 2, the ring head is 1, the ring tail is 0, task0 is write task, task1 is read task, task2 is write task. Task0 Task1 Task2 epoll_ctl(fd, EPOLL_CTL_ADD, ...) pipe_poll() poll_wait() tail = READ_ONCE(pipe->tail); // Re-order and get tail=0 pipe_read tail++ //tail=1 pipe_write head++ //head=2 head = READ_ONCE(pipe->head); // head = 2 check ring is full by head - tail Task0 get head = 2 and tail = 0, so it mistake that the pipe ring is full, then task0 is not add into ready list. If the ring is not full anymore, task0 would not be woken up forever The reason of this problem is that we got inconsistent head/tail value of the pipe ring, so we fix the problem by getting them by atomic. It seems that pipe_readable and pipe_writable is safe, so we don't change them. Signed-off-by: Miao Xie <miaoxie@xxxxxxxxxx> --- fs/pipe.c | 6 ++++-- include/linux/pipe_fs_i.h | 32 ++++++++++++++++++++++++++++++-- 2 files changed, 34 insertions(+), 4 deletions(-) diff --git a/fs/pipe.c b/fs/pipe.c index 6d4342bad9f1..454056b1eaad 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -649,6 +649,7 @@ pipe_poll(struct file *filp, poll_table *wait) __poll_t mask; struct pipe_inode_info *pipe = filp->private_data; unsigned int head, tail; + u64 ring_idxs; /* Epoll has some historical nasty semantics, this enables them */ pipe->poll_usage = 1; @@ -669,8 +670,9 @@ pipe_poll(struct file *filp, poll_table *wait) * if something changes and you got it wrong, the poll * table entry will wake you up and fix it. */ - head = READ_ONCE(pipe->head); - tail = READ_ONCE(pipe->tail); + ring_idxs = (u64)atomic64_read(&pipe->ring_idxs); + head = pipe_get_ring_head(ring_idxs); + tail = pipe_get_ring_tail(ring_idxs); mask = 0; if (filp->f_mode & FMODE_READ) { diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index fc5642431b92..9a7cb8077dc8 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -58,8 +58,18 @@ struct pipe_buffer { struct pipe_inode_info { struct mutex mutex; wait_queue_head_t rd_wait, wr_wait; - unsigned int head; - unsigned int tail; + union { + /* + * If someone want to change this structure, you should also + * change the macro *PIPE_GET_DOORBELL* that is used to + * generate the ring head/tail access function. + */ + struct { + unsigned int head; + unsigned int tail; + }; + atomic64_t ring_idxs; + }; unsigned int max_usage; unsigned int ring_size; #ifdef CONFIG_WATCH_QUEUE @@ -82,6 +92,24 @@ struct pipe_inode_info { #endif }; +#define PIPE_GET_DOORBELL(bellname) \ +static inline unsigned int pipe_get_ring_##bellname(u64 ring_idxs) \ +{ \ + unsigned int doorbell; \ + unsigned char *ptr = ((char *)&ring_idxs); \ + int offset; \ + \ + offset = (int)offsetof(struct pipe_inode_info, bellname); \ + offset -= (int)offsetof(struct pipe_inode_info, ring_idxs); \ + ptr += offset; \ + doorbell = *((unsigned int *)ptr); \ + \ + return doorbell; \ +} + +PIPE_GET_DOORBELL(head) +PIPE_GET_DOORBELL(tail) + /* * Note on the nesting of these functions: * -- 2.31.1