On Fri, 23 Sep 2016 14:42:53 +0800 "Hillf Danton" <hillf.zj@xxxxxxxxxxxxxxx> wrote: > > > > The select(2) syscall performs a kmalloc(size, GFP_KERNEL) where size grows > > with the number of fds passed. We had a customer report page allocation > > failures of order-4 for this allocation. This is a costly order, so it might > > easily fail, as the VM expects such allocation to have a lower-order fallback. > > > > Such trivial fallback is vmalloc(), as the memory doesn't have to be > > physically contiguous. Also the allocation is temporary for the duration of the > > syscall, so it's unlikely to stress vmalloc too much. > > > > Note that the poll(2) syscall seems to use a linked list of order-0 pages, so > > it doesn't need this kind of fallback. How about something like this? (untested) Eric isn't wrong about vmalloc sucking :) Thanks, Nick --- fs/select.c | 57 +++++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 43 insertions(+), 14 deletions(-) diff --git a/fs/select.c b/fs/select.c index 8ed9da5..3b4834c 100644 --- a/fs/select.c +++ b/fs/select.c @@ -555,6 +555,7 @@ int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, void *bits; int ret, max_fds; unsigned int size; + size_t nr_bytes; struct fdtable *fdt; /* Allocate small arguments on the stack to save memory and be faster */ long stack_fds[SELECT_STACK_ALLOC/sizeof(long)]; @@ -576,21 +577,39 @@ int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, * since we used fdset we need to allocate memory in units of * long-words. */ - size = FDS_BYTES(n); + ret = -ENOMEM; bits = stack_fds; - if (size > sizeof(stack_fds) / 6) { - /* Not enough space in on-stack array; must use kmalloc */ + size = FDS_BYTES(n); + nr_bytes = 6 * size; + + if (unlikely(nr_bytes > PAGE_SIZE)) { + /* Avoid multi-page allocation if possible */ ret = -ENOMEM; - bits = kmalloc(6 * size, GFP_KERNEL); - if (!bits) - goto out_nofds; + fds.in = kmalloc(size, GFP_KERNEL); + fds.out = kmalloc(size, GFP_KERNEL); + fds.ex = kmalloc(size, GFP_KERNEL); + fds.res_in = kmalloc(size, GFP_KERNEL); + fds.res_out = kmalloc(size, GFP_KERNEL); + fds.res_ex = kmalloc(size, GFP_KERNEL); + + if (!(fds.in && fds.out && fds.ex && + fds.res_in && fds.res_out && fds.res_ex)) + goto out; + } else { + if (nr_bytes > sizeof(stack_fds)) { + /* Not enough space in on-stack array */ + if (nr_bytes > PAGE_SIZE * 2) + bits = kmalloc(nr_bytes, GFP_KERNEL); + if (!bits) + goto out_nofds; + } + fds.in = bits; + fds.out = bits + size; + fds.ex = bits + 2*size; + fds.res_in = bits + 3*size; + fds.res_out = bits + 4*size; + fds.res_ex = bits + 5*size; } - fds.in = bits; - fds.out = bits + size; - fds.ex = bits + 2*size; - fds.res_in = bits + 3*size; - fds.res_out = bits + 4*size; - fds.res_ex = bits + 5*size; if ((ret = get_fd_set(n, inp, fds.in)) || (ret = get_fd_set(n, outp, fds.out)) || @@ -617,8 +636,18 @@ int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, ret = -EFAULT; out: - if (bits != stack_fds) - kfree(bits); + if (unlikely(nr_bytes > PAGE_SIZE)) { + kfree(fds.in); + kfree(fds.out); + kfree(fds.ex); + kfree(fds.res_in); + kfree(fds.res_out); + kfree(fds.res_ex); + } else { + if (bits != stack_fds) + kfree(bits); + } + out_nofds: return ret; } -- 2.9.3 -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html