Hi Vegard,

On 08/17/2016 10:00 AM, Vegard Nossum wrote:
> On 08/16/2016 10:21 PM, Michael Kerrisk (man-pages) wrote:
>>>> @@ -1132,8 +1136,8 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
>>>>          if (!capable(CAP_SYS_RESOURCE) && size > pipe_max_size) {
>>>>                  ret = -EPERM;
>>>>                  goto out;
>>>> -        } else if ((too_many_pipe_buffers_hard(pipe->user) ||
>>>> -                    too_many_pipe_buffers_soft(pipe->user)) &&
>>>> +        } else if ((too_many_pipe_buffers_hard(pipe->user, nr_pages) ||
>>>> +                    too_many_pipe_buffers_soft(pipe->user, nr_pages)) &&
>>>>                     !capable(CAP_SYS_RESOURCE) &&
>>>>                     !capable(CAP_SYS_ADMIN)) {
>>>>                  ret = -EPERM;
>>>>
>>>
>>> Isn't there also a race where two or more concurrent pipe()/fcntl()
>>> calls can together push us over the limits before the accounting is done?
>>
>> I guess there is!
>>
>>> I think there really ought to be a check after doing the accounting if
>>> we really want to be meticulous here.
>>
>> Let me confirm what I understand from your comment: because of the race,
>> a user could subvert the checks and allocate an arbitrary amount of
>> kernel memory for pipes. Right?
>>
>> I'm not sure what you mean by "a check after doing the accounting". Isn't
>> the only solution here some kind of lock around the check+accounting steps?
>
> Instead of doing atomic_long_read() in the check + atomic_long_add() for
> accounting, we could do a single speculative atomic_long_add_return(), and
> then, if it goes above the limit, we can lower it again with atomic_sub()
> when aborting the operation (if it doesn't go above the limit, we don't
> need to do anything).

So, would that mean something like the following (where I've moved some
checks from pipe_fcntl() to pipe_set_size()):

8x---8x---8x---8x---8x---8x---8x---8x---8x---8x---8x---8x---8x---8x---
long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
{
        struct pipe_inode_info *pipe;
        long ret;

        pipe = get_pipe_info(file);
        if (!pipe)
                return -EBADF;

        __pipe_lock(pipe);

        switch (cmd) {
        case F_SETPIPE_SZ: {
                unsigned int size, nr_pages;

                size = round_pipe_size(arg);
                nr_pages = size >> PAGE_SHIFT;

                ret = -EINVAL;
                if (!nr_pages)
                        goto out;

                ret = pipe_set_size(pipe, nr_pages);
                break;
                }
        case F_GETPIPE_SZ:
                ret = pipe->buffers * PAGE_SIZE;
                break;
        default:
                ret = -EINVAL;
                break;
        }

out:
        __pipe_unlock(pipe);
        return ret;
}

/* ... */

static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages)
{
        struct pipe_buffer *bufs;
        unsigned int size;
        long ret = 0;

        size = nr_pages * PAGE_SIZE;

        /* Speculatively charge the new capacity to this user up front */
        account_pipe_buffers(pipe, pipe->buffers, nr_pages);

        /*
         * If trying to increase the pipe capacity, check that an
         * unprivileged user is not trying to exceed various limits.
         * (Decreasing the pipe capacity is always permitted, even
         * if the user is currently over a limit.)
         */
        if (nr_pages > pipe->buffers) {
                if (!capable(CAP_SYS_RESOURCE) && size > pipe_max_size) {
                        ret = -EPERM;
                } else if ((too_many_pipe_buffers_hard(pipe->user, 0) ||
                            too_many_pipe_buffers_soft(pipe->user, 0)) &&
                           !capable(CAP_SYS_RESOURCE) &&
                           !capable(CAP_SYS_ADMIN)) {
                        ret = -EPERM;
                }
        }

        /*
         * If we exceeded a limit, revert the accounting and go no further
         */
        if (ret) {
                account_pipe_buffers(pipe, nr_pages, pipe->buffers);
                return ret;
        }

        /*
         * We can shrink the pipe, if arg >= pipe->nrbufs. Since we don't
         * expect a lot of shrink+grow operations, just free and allocate
         * again like we would do for growing. If the pipe currently
         * contains more buffers than arg, then return busy.
         */
        if (nr_pages < pipe->nrbufs) {
                /* Revert the speculative accounting before bailing out */
                account_pipe_buffers(pipe, nr_pages, pipe->buffers);
                return -EBUSY;
        }

        bufs = kcalloc(nr_pages, sizeof(*bufs),
                       GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
        if (unlikely(!bufs)) {
                /* Likewise, revert the accounting on allocation failure */
                account_pipe_buffers(pipe, nr_pages, pipe->buffers);
                return -ENOMEM;
        }

        /*
         * The pipe array wraps around, so just start the new one at zero
         * and adjust the indexes.
         */
        if (pipe->nrbufs) {
                unsigned int tail;
                unsigned int head;

                tail = pipe->curbuf + pipe->nrbufs;
                if (tail < pipe->buffers)
                        tail = 0;
                else
                        tail &= (pipe->buffers - 1);

                head = pipe->nrbufs - tail;
                if (head)
                        memcpy(bufs, pipe->bufs + pipe->curbuf,
                               head * sizeof(struct pipe_buffer));
                if (tail)
                        memcpy(bufs + head, pipe->bufs,
                               tail * sizeof(struct pipe_buffer));
        }

        pipe->curbuf = 0;
        kfree(pipe->bufs);
        pipe->bufs = bufs;
        pipe->buffers = nr_pages;
        return nr_pages * PAGE_SIZE;
}
8x---8x---8x---8x---8x---8x---8x---8x---8x---8x---8x---8x---8x---8x---

Seem okay? Presumably, an analogous fix is also required in
alloc_pipe_info() when creating a pipe?

Thanks,

Michael

-- 
Michael Kerrisk
Linux man-pages maintainer; http://www.kernel.org/doc/man-pages/
Linux/UNIX System Programming Training: http://man7.org/training/
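
For reference, a minimal user-space sketch of the "speculative add, then
revert" pattern Vegard describes above might look like the following. This
is not kernel code: it uses C11 atomics instead of the kernel's
atomic_long_add_return()/atomic_long_sub() on user->pipe_bufs, and the
counter name and hard limit are invented for the demo. The point is only
that the charge and the limit check happen as a single atomic step, so two
racing callers cannot both slip in under the limit the way a separate
read-then-add allows.

/*
 * User-space demo of speculative accounting: charge first, check the
 * result, and revert the charge if the limit was exceeded.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_long user_pipe_pages;            /* pages charged to this user */
static const long pipe_user_pages_hard = 64;   /* made-up hard limit */

/* Speculatively charge nr pages; undo the charge and fail if the new
   total would exceed the hard limit. */
static bool charge_pipe_pages(long nr)
{
        long new_total = atomic_fetch_add(&user_pipe_pages, nr) + nr;

        if (new_total > pipe_user_pages_hard) {
                atomic_fetch_sub(&user_pipe_pages, nr);   /* revert */
                return false;
        }
        return true;
}

/* Undo an earlier successful charge (e.g. when a pipe shrinks or is freed). */
static void uncharge_pipe_pages(long nr)
{
        atomic_fetch_sub(&user_pipe_pages, nr);
}

int main(void)
{
        printf("charge 60: %s\n", charge_pipe_pages(60) ? "ok" : "over limit");
        printf("charge 10: %s\n", charge_pipe_pages(10) ? "ok" : "over limit");
        uncharge_pipe_pages(60);
        printf("charge 10: %s\n", charge_pipe_pages(10) ? "ok" : "over limit");
        return 0;
}

One consequence of any add-then-revert scheme is that a racing caller may
briefly see the counter inflated by a charge that is about to be reverted
and fail spuriously; that is the usual trade-off accepted in exchange for
avoiding a lock around the check+accounting steps.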
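
On the closing question about alloc_pipe_info(): the pipe-creation path
could reuse the same kind of speculative charge when it sizes the initial
buffer array, keeping the existing fallback behaviour (drop to a single
buffer when over the soft limit, refuse creation when over the hard limit)
rather than checking the limits separately from the accounting. A sketch of
that shape, built on the hypothetical helpers above (again user-space demo
code, not the kernel's alloc_pipe_info(); the soft limit and default size
are made up):

static const long pipe_user_pages_soft = 32;   /* made-up soft limit */
#define PIPE_DEFAULT_PAGES 16                  /* stand-in for the default pipe capacity */

/* Charge capacity for a brand-new pipe: try the default size first, fall
   back to a single page if that leaves the user over the soft limit, and
   fail only if even one page would exceed the hard limit.  Returns the
   number of pages charged, or -1 on failure. */
static long charge_new_pipe(void)
{
        long nr = PIPE_DEFAULT_PAGES;

        if (charge_pipe_pages(nr)) {
                if (atomic_load(&user_pipe_pages) <= pipe_user_pages_soft)
                        return nr;
                uncharge_pipe_pages(nr);      /* over the soft limit: retreat */
        }

        nr = 1;
        if (charge_pipe_pages(nr))
                return nr;

        return -1;                            /* over the hard limit: creation fails */
}

With the made-up limits above, the first two calls to charge_new_pipe()
would each get the full 16 pages, while a third call would already be
dropped to a single page because the user is past the soft limit.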