Gentel ping.... On 2021/12/12 0:51, miaoxie (A) wrote: > Thers is a out-of-order access problem which would cause endless sleep > when we use pipe with epoll. > > The story is following, we assume the ring size is 2, the ring head > is 1, the ring tail is 0, task0 is write task, task1 is read task, > task2 is write task. > Task0 Task1 Task2 > epoll_ctl(fd, EPOLL_CTL_ADD, ...) > pipe_poll() > poll_wait() > tail = READ_ONCE(pipe->tail); > // Re-order and get tail=0 > pipe_read > tail++ //tail=1 > pipe_write > head++ //head=2 > head = READ_ONCE(pipe->head); > // head = 2 > check ring is full by head - tail > Task0 get head = 2 and tail = 0, so it mistake that the pipe ring is > full, then task0 is not add into ready list. If the ring is not full > anymore, task0 would not be woken up forever > > The reason of this problem is that we got inconsistent head/tail value > of the pipe ring, so we fix the problem by getting them by atomic. > > It seems that pipe_readable and pipe_writable is safe, so we don't > change them. > > Signed-off-by: Miao Xie <miaoxie@xxxxxxxxxx> > --- > fs/pipe.c | 6 ++++-- > include/linux/pipe_fs_i.h | 32 ++++++++++++++++++++++++++++++-- > 2 files changed, 34 insertions(+), 4 deletions(-) > > diff --git a/fs/pipe.c b/fs/pipe.c > index 6d4342bad9f1..454056b1eaad 100644 > --- a/fs/pipe.c > +++ b/fs/pipe.c > @@ -649,6 +649,7 @@ pipe_poll(struct file *filp, poll_table *wait) > __poll_t mask; > struct pipe_inode_info *pipe = filp->private_data; > unsigned int head, tail; > + u64 ring_idxs; > > /* Epoll has some historical nasty semantics, this enables them */ > pipe->poll_usage = 1; > @@ -669,8 +670,9 @@ pipe_poll(struct file *filp, poll_table *wait) > * if something changes and you got it wrong, the poll > * table entry will wake you up and fix it. > */ > - head = READ_ONCE(pipe->head); > - tail = READ_ONCE(pipe->tail); > + ring_idxs = (u64)atomic64_read(&pipe->ring_idxs); > + head = pipe_get_ring_head(ring_idxs); > + tail = pipe_get_ring_tail(ring_idxs); > > mask = 0; > if (filp->f_mode & FMODE_READ) { > diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h > index fc5642431b92..9a7cb8077dc8 100644 > --- a/include/linux/pipe_fs_i.h > +++ b/include/linux/pipe_fs_i.h > @@ -58,8 +58,18 @@ struct pipe_buffer { > struct pipe_inode_info { > struct mutex mutex; > wait_queue_head_t rd_wait, wr_wait; > - unsigned int head; > - unsigned int tail; > + union { > + /* > + * If someone want to change this structure, you should also > + * change the macro *PIPE_GET_DOORBELL* that is used to > + * generate the ring head/tail access function. > + */ > + struct { > + unsigned int head; > + unsigned int tail; > + }; > + atomic64_t ring_idxs; > + }; > unsigned int max_usage; > unsigned int ring_size; > #ifdef CONFIG_WATCH_QUEUE > @@ -82,6 +92,24 @@ struct pipe_inode_info { > #endif > }; > > +#define PIPE_GET_DOORBELL(bellname) \ > +static inline unsigned int pipe_get_ring_##bellname(u64 ring_idxs) \ > +{ \ > + unsigned int doorbell; \ > + unsigned char *ptr = ((char *)&ring_idxs); \ > + int offset; \ > + \ > + offset = (int)offsetof(struct pipe_inode_info, bellname); \ > + offset -= (int)offsetof(struct pipe_inode_info, ring_idxs); \ > + ptr += offset; \ > + doorbell = *((unsigned int *)ptr); \ > + \ > + return doorbell; \ > +} > + > +PIPE_GET_DOORBELL(head) > +PIPE_GET_DOORBELL(tail) > + > /* > * Note on the nesting of these functions: > * >