On Mon, Dec 10, 2018 at 12:22:40PM +0100, Håkon Bugge wrote: > > > > On 10 Dec 2018, at 08:53, Shamir Rabinovitch <shamir.rabinovitch@xxxxxxxxxx> wrote: > > > > From: shamir rabinovitch <shamir.rabinovitch@xxxxxxxxxx> > > > > redundant copy_from_user in rds_sendmsg system call exposes rds > > to an issue where rds_rdma_extra_size walks the rds iovec and > > calculates the number of pages (sgs) it needs to add to the tail of > > rds message and later rds_cmsg_rdma_args copies the rds iovec again > > and recalculates the same number and gets a different result causing > > WARN_ON in rds_message_alloc_sgs. > > Am I onto something if I say that the user-data is qualified after the first copy_from_user(), and RDS assumes the validity after the second copy_from_user(), although the user-space app at this point may have fiddled with it? This is correct. > > > > fix this by doing the copy_from_user only once per rds_sendmsg > > system call. > > > > When the issue occurs the below dump is seen: > > > > WARNING: CPU: 0 PID: 19789 at net/rds/message.c:316 rds_message_alloc_sgs+0x10c/0x160 net/rds/message.c:316 > > Kernel panic - not syncing: panic_on_warn set ... 
> > CPU: 0 PID: 19789 Comm: syz-executor827 Not tainted 4.19.0-next-20181030+ #101 > > Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 > > Call Trace: > > __dump_stack lib/dump_stack.c:77 [inline] > > dump_stack+0x244/0x39d lib/dump_stack.c:113 > > panic+0x2ad/0x55c kernel/panic.c:188 > > __warn.cold.8+0x20/0x45 kernel/panic.c:540 > > report_bug+0x254/0x2d0 lib/bug.c:186 > > fixup_bug arch/x86/kernel/traps.c:178 [inline] > > do_error_trap+0x11b/0x200 arch/x86/kernel/traps.c:271 > > do_invalid_op+0x36/0x40 arch/x86/kernel/traps.c:290 > > invalid_op+0x14/0x20 arch/x86/entry/entry_64.S:969 > > RIP: 0010:rds_message_alloc_sgs+0x10c/0x160 net/rds/message.c:316 > > Code: c0 74 04 3c 03 7e 6c 44 01 ab 78 01 00 00 e8 2b 9e 35 fa 4c 89 e0 48 83 c4 08 5b 41 5c 41 5d 41 5e 41 5f 5d c3 e8 14 9e 35 fa <0f> 0b 31 ff 44 89 ee e8 18 9f 35 fa 45 85 ed 75 1b e8 fe 9d 35 fa > > RSP: 0018:ffff8801c51b7460 EFLAGS: 00010293 > > RAX: ffff8801bc412080 RBX: ffff8801d7bf4040 RCX: ffffffff8749c9e6 > > RDX: 0000000000000000 RSI: ffffffff8749ca5c RDI: 0000000000000004 > > RBP: ffff8801c51b7490 R08: ffff8801bc412080 R09: ffffed003b5c5b67 > > R10: ffffed003b5c5b67 R11: ffff8801dae2db3b R12: 0000000000000000 > > R13: 000000000007165c R14: 000000000007165c R15: 0000000000000005 > > rds_cmsg_rdma_args+0x82d/0x1510 net/rds/rdma.c:623 > > rds_cmsg_send net/rds/send.c:971 [inline] > > rds_sendmsg+0x19a2/0x3180 net/rds/send.c:1273 > > sock_sendmsg_nosec net/socket.c:622 [inline] > > sock_sendmsg+0xd5/0x120 net/socket.c:632 > > ___sys_sendmsg+0x7fd/0x930 net/socket.c:2117 > > __sys_sendmsg+0x11d/0x280 net/socket.c:2155 > > __do_sys_sendmsg net/socket.c:2164 [inline] > > __se_sys_sendmsg net/socket.c:2162 [inline] > > __x64_sys_sendmsg+0x78/0xb0 net/socket.c:2162 > > do_syscall_64+0x1b9/0x820 arch/x86/entry/common.c:290 > > entry_SYSCALL_64_after_hwframe+0x49/0xbe > > RIP: 0033:0x44a859 > > Code: e8 dc e6 ff ff 48 83 c4 18 c3 0f 1f 80 00 00 00 00 48 89 f8 48 89 
f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 6b cb fb ff c3 66 2e 0f 1f 84 00 00 00 00 > > RSP: 002b:00007f1d4710ada8 EFLAGS: 00000297 ORIG_RAX: 000000000000002e > > RAX: ffffffffffffffda RBX: 00000000006dcc28 RCX: 000000000044a859 > > RDX: 0000000000000000 RSI: 0000000020001600 RDI: 0000000000000003 > > RBP: 00000000006dcc20 R08: 0000000000000000 R09: 0000000000000000 > > R10: 0000000000000000 R11: 0000000000000297 R12: 00000000006dcc2c > > R13: 646e732f7665642f R14: 00007f1d4710b9c0 R15: 00000000006dcd2c > > Kernel Offset: disabled > > Rebooting in 86400 seconds.. > > > > Reported-by: syzbot+26de17458aeda9d305d8@xxxxxxxxxxxxxxxxxxxxxxxxx > > Acked-by: Santosh Shilimkar <santosh.shilimkar@xxxxxxxxxx> > > Signed-off-by: shamir rabinovitch <shamir.rabinovitch@xxxxxxxxxx> > > --- > > net/rds/rdma.c | 63 +++++++++++++++++++++++++------------------------- > > net/rds/rds.h | 20 ++++++++++++---- > > net/rds/send.c | 47 +++++++++++++++++++++++++++++++------ > > 3 files changed, 88 insertions(+), 42 deletions(-) > > > > diff --git a/net/rds/rdma.c b/net/rds/rdma.c > > index 98237feb607a..e1965d9cbcf8 100644 > > --- a/net/rds/rdma.c > > +++ b/net/rds/rdma.c > > @@ -517,9 +517,10 @@ static int rds_rdma_pages(struct rds_iovec iov[], int nr_iovecs) > > return tot_pages; > > } > > > > -int rds_rdma_extra_size(struct rds_rdma_args *args) > > +int rds_rdma_extra_size(struct rds_rdma_args *args, > > + struct rds_iov_vector *iov) > > { > > - struct rds_iovec vec; > > + struct rds_iovec *vec; > > struct rds_iovec __user *local_vec; > > int tot_pages = 0; > > unsigned int nr_pages; > > @@ -530,13 +531,23 @@ int rds_rdma_extra_size(struct rds_rdma_args *args) > > if (args->nr_local == 0) > > return -EINVAL; > > > > + iov->iov = kcalloc(args->nr_local, > > + sizeof(struct rds_iovec), > > + GFP_KERNEL); > > + if (!iov->iov) > > + return -ENOMEM; > > + > > + vec = &iov->iov[0]; > > + > > + if (copy_from_user(vec, local_vec, args->nr_local * > > + 
sizeof(struct rds_iovec))) > > + return -EFAULT; > > + iov->len = args->nr_local; > > + > > /* figure out the number of pages in the vector */ > > - for (i = 0; i < args->nr_local; i++) { > > - if (copy_from_user(&vec, &local_vec[i], > > - sizeof(struct rds_iovec))) > > - return -EFAULT; > > + for (i = 0; i < args->nr_local; i++, vec++) { > > > > - nr_pages = rds_pages_in_vec(&vec); > > + nr_pages = rds_pages_in_vec(vec); > > if (nr_pages == 0) > > return -EINVAL; > > > > @@ -558,15 +569,15 @@ int rds_rdma_extra_size(struct rds_rdma_args *args) > > * Extract all arguments and set up the rdma_op > > */ > > int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, > > - struct cmsghdr *cmsg) > > + struct cmsghdr *cmsg, > > + struct rds_iov_vector *vec) > > { > > struct rds_rdma_args *args; > > struct rm_rdma_op *op = &rm->rdma; > > int nr_pages; > > unsigned int nr_bytes; > > struct page **pages = NULL; > > - struct rds_iovec iovstack[UIO_FASTIOV], *iovs = iovstack; > > - int iov_size; > > + struct rds_iovec *iovs; > > unsigned int i, j; > > int ret = 0; > > > > @@ -586,31 +597,23 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, > > goto out_ret; > > } > > > > - /* Check whether to allocate the iovec area */ > > - iov_size = args->nr_local * sizeof(struct rds_iovec); > > - if (args->nr_local > UIO_FASTIOV) { > > - iovs = sock_kmalloc(rds_rs_to_sk(rs), iov_size, GFP_KERNEL); > > - if (!iovs) { > > - ret = -ENOMEM; > > - goto out_ret; > > - } > > + if (vec->len != args->nr_local) { > > + ret = -EINVAL; > > + goto out_ret; > > } > > > > - if (copy_from_user(iovs, (struct rds_iovec __user *)(unsigned long) args->local_vec_addr, iov_size)) { > > - ret = -EFAULT; > > - goto out; > > - } > > + iovs = vec->iov; > > > > nr_pages = rds_rdma_pages(iovs, args->nr_local); > > if (nr_pages < 0) { > > ret = -EINVAL; > > - goto out; > > + goto out_ret; > > } > > > > pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL); > > if (!pages) { > > 
ret = -ENOMEM; > > - goto out; > > + goto out_ret; > > } > > > > op->op_write = !!(args->flags & RDS_RDMA_READWRITE); > > @@ -623,7 +626,7 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, > > op->op_sg = rds_message_alloc_sgs(rm, nr_pages); > > if (!op->op_sg) { > > ret = -ENOMEM; > > - goto out; > > + goto out_pages; > > } > > > > if (op->op_notify || op->op_recverr) { > > @@ -635,7 +638,7 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, > > op->op_notifier = kmalloc(sizeof(struct rds_notifier), GFP_KERNEL); > > if (!op->op_notifier) { > > ret = -ENOMEM; > > - goto out; > > + goto out_pages; > > } > > op->op_notifier->n_user_token = args->user_token; > > op->op_notifier->n_status = RDS_RDMA_SUCCESS; > > @@ -681,7 +684,7 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, > > */ > > ret = rds_pin_pages(iov->addr, nr, pages, !op->op_write); > > if (ret < 0) > > - goto out; > > + goto out_pages; > > else > > ret = 0; > > > > @@ -714,13 +717,11 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, > > nr_bytes, > > (unsigned int) args->remote_vec.bytes); > > ret = -EINVAL; > > - goto out; > > + goto out_pages; > > } > > op->op_bytes = nr_bytes; > > > > -out: > > - if (iovs != iovstack) > > - sock_kfree_s(rds_rs_to_sk(rs), iovs, iov_size); > > +out_pages: > > kfree(pages); > > out_ret: > > if (ret) > > diff --git a/net/rds/rds.h b/net/rds/rds.h > > index 6bfaf05b63b2..42f4350444f6 100644 > > --- a/net/rds/rds.h > > +++ b/net/rds/rds.h > > @@ -386,6 +386,18 @@ static inline void rds_message_zcopy_queue_init(struct rds_msg_zcopy_queue *q) > > INIT_LIST_HEAD(&q->zcookie_head); > > } > > > > +struct rds_iov_vector { > > + struct rds_iovec *iov; > > + int len; > > +}; > > + > > +struct rds_iov_vector_arr { > > + struct rds_iov_vector *vec; > > + int len; > > + int ind; > > + int inc; > > Would it be possible to use more descriptive names here for ind and inc? 
"inc" is often used as an instantiation of struct rds_incoming. Or is it increment? Is ind "index"? > Yes. Will do that in v2. > > +}; > > + > > struct rds_message { > > refcount_t m_refcount; > > struct list_head m_sock_item; > > @@ -904,13 +916,13 @@ int rds_get_mr(struct rds_sock *rs, char __user *optval, int optlen); > > int rds_get_mr_for_dest(struct rds_sock *rs, char __user *optval, int optlen); > > int rds_free_mr(struct rds_sock *rs, char __user *optval, int optlen); > > void rds_rdma_drop_keys(struct rds_sock *rs); > > -int rds_rdma_extra_size(struct rds_rdma_args *args); > > -int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, > > - struct cmsghdr *cmsg); > > +int rds_rdma_extra_size(struct rds_rdma_args *args, > > + struct rds_iov_vector *iov); > > int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm, > > struct cmsghdr *cmsg); > > int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, > > - struct cmsghdr *cmsg); > > + struct cmsghdr *cmsg, > > + struct rds_iov_vector *vec); > > int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm, > > struct cmsghdr *cmsg); > > void rds_rdma_free_op(struct rm_rdma_op *ro); > > diff --git a/net/rds/send.c b/net/rds/send.c > > index fe785ee819dd..b23e0b999beb 100644 > > --- a/net/rds/send.c > > +++ b/net/rds/send.c > > @@ -876,13 +876,15 @@ static int rds_send_queue_rm(struct rds_sock *rs, struct rds_connection *conn, > > * rds_message is getting to be quite complicated, and we'd like to allocate > > * it all in one go. This figures out how big it needs to be up front. 
> > */ > > -static int rds_rm_size(struct msghdr *msg, int num_sgs) > > +static int rds_rm_size(struct msghdr *msg, int num_sgs, > > + struct rds_iov_vector_arr *vct) > > { > > struct cmsghdr *cmsg; > > int size = 0; > > int cmsg_groups = 0; > > int retval; > > bool zcopy_cookie = false; > > + struct rds_iov_vector *iov; > > > > for_each_cmsghdr(cmsg, msg) { > > if (!CMSG_OK(msg, cmsg)) > > @@ -893,8 +895,21 @@ static int rds_rm_size(struct msghdr *msg, int num_sgs) > > > > switch (cmsg->cmsg_type) { > > case RDS_CMSG_RDMA_ARGS: > > + if (vct->ind >= vct->len) { > > + vct->len += vct->inc; > > + vct->vec = > > + krealloc(vct->vec, > > + vct->len * > > + sizeof(struct rds_iov_vector), > > + GFP_KERNEL); > > Here you will leak memory if krealloc() fails and returns NULL, because you lose the pointer to what you already have allocated. You must do something like: > > tmp = krealloc(); > if (!tmp) { > kfree(vct->vec); > return -ENOMEM; > } > vct->vec = tmp; > > With that, > > Acked-by: Håkon Bugge <haakon.bugge@xxxxxxxxxx> > > > Thxs, Håkon > Thanks Hakon! Will fix this in v2. 
> > > > + if (!vct->vec) > > + return -ENOMEM; > > + } > > + iov = &vct->vec[vct->ind]; > > + memset(iov, 0, sizeof(struct rds_iov_vector)); > > + vct->ind++; > > cmsg_groups |= 1; > > - retval = rds_rdma_extra_size(CMSG_DATA(cmsg)); > > + retval = rds_rdma_extra_size(CMSG_DATA(cmsg), iov); > > if (retval < 0) > > return retval; > > size += retval; > > @@ -951,10 +966,11 @@ static int rds_cmsg_zcopy(struct rds_sock *rs, struct rds_message *rm, > > } > > > > static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm, > > - struct msghdr *msg, int *allocated_mr) > > + struct msghdr *msg, int *allocated_mr, > > + struct rds_iov_vector_arr *vct) > > { > > struct cmsghdr *cmsg; > > - int ret = 0; > > + int ret = 0, ind = 0; > > > > for_each_cmsghdr(cmsg, msg) { > > if (!CMSG_OK(msg, cmsg)) > > @@ -968,7 +984,10 @@ static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm, > > */ > > switch (cmsg->cmsg_type) { > > case RDS_CMSG_RDMA_ARGS: > > - ret = rds_cmsg_rdma_args(rs, rm, cmsg); > > + if (ind >= vct->ind) > > + return -ENOMEM; > > + ret = rds_cmsg_rdma_args(rs, rm, cmsg, &vct->vec[ind]); > > + ind++; > > break; > > > > case RDS_CMSG_RDMA_DEST: > > @@ -1084,6 +1103,11 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) > > sock_flag(rds_rs_to_sk(rs), SOCK_ZEROCOPY)); > > int num_sgs = ceil(payload_len, PAGE_SIZE); > > int namelen; > > + struct rds_iov_vector_arr vct = {0}; > > + int ind; > > + > > + /* expect 1 RDMA CMSG per rds_sendmsg. can still grow if more needed. 
*/ > > + vct.inc = 1; > > > > /* Mirror Linux UDP mirror of BSD error message compatibility */ > > /* XXX: Perhaps MSG_MORE someday */ > > @@ -1220,7 +1244,7 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) > > num_sgs = iov_iter_npages(&msg->msg_iter, INT_MAX); > > } > > /* size of rm including all sgs */ > > - ret = rds_rm_size(msg, num_sgs); > > + ret = rds_rm_size(msg, num_sgs, &vct); > > if (ret < 0) > > goto out; > > > > @@ -1270,7 +1294,7 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) > > rm->m_conn_path = cpath; > > > > /* Parse any control messages the user may have included. */ > > - ret = rds_cmsg_send(rs, rm, msg, &allocated_mr); > > + ret = rds_cmsg_send(rs, rm, msg, &allocated_mr, &vct); > > if (ret) { > > /* Trigger connection so that its ready for the next retry */ > > if (ret == -EAGAIN) > > @@ -1348,9 +1372,18 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) > > if (ret) > > goto out; > > rds_message_put(rm); > > + > > + for (ind = 0; ind < vct.ind; ind++) > > + kfree(vct.vec[ind].iov); > > + kfree(vct.vec); > > + > > return payload_len; > > > > out: > > + for (ind = 0; ind < vct.ind; ind++) > > + kfree(vct.vec[ind].iov); > > + kfree(vct.vec); > > + > > /* If the user included a RDMA_MAP cmsg, we allocated a MR on the fly. > > * If the sendmsg goes through, we keep the MR. If it fails with EAGAIN > > * or in any other way, we need to destroy the MR again */ > > -- > > 2.19.1 > > >