Hi Steve and Jeff (and others),

Here is a patch that Samba vendors have been using to implement recvfile (copy directly from socket to file). It can improve write performance on these boxes by a significant amount (10% or more).

I'm not qualified to evaluate this code; can someone who is (hi there Steve and Jeff :-) take a look at it and see if it's worth shepherding into the kernel?

Cheers,

Jeremy.
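To make the intended use concrete, here is a rough (untested) sketch of how userspace would drive the new path once the patch is in; the helper name recv_into_file and the 4 KiB page-size assumption behind the 128 KiB cap are mine, not part of the patch. With the patch, splice() takes the TCP socket as fd_in and the file plus an offset pointer as the output side, and a single call is limited to MAX_PAGES_PER_RECVFILE * PAGE_SIZE bytes, so larger transfers have to loop:

#define _GNU_SOURCE
#include <fcntl.h>
#include <sys/types.h>

/* Hypothetical userspace helper (not part of the patch): pull "count" bytes
 * from a connected TCP socket straight into file_fd at *offset using the
 * patched splice(). Each call is capped at MAX_PAGES_PER_RECVFILE pages
 * (32 * 4096 = 128 KiB, assuming 4 KiB pages), so loop for larger transfers. */
#define RECVFILE_CHUNK	(32 * 4096)

static ssize_t recv_into_file(int sock_fd, int file_fd, loff_t *offset,
			      size_t count)
{
	size_t done = 0;

	while (done < count) {
		size_t chunk = count - done;
		ssize_t n;

		if (chunk > RECVFILE_CHUNK)
			chunk = RECVFILE_CHUNK;

		/* fd_in is the socket (off_in unused); off_out points at the
		 * file offset, which the kernel advances on success. */
		n = splice(sock_fd, NULL, file_fd, offset, chunk, 0);
		if (n <= 0)
			return done ? (ssize_t)done : n;
		done += n;
	}
	return done;
}

Note that the patch maps a short receive to -EPIPE rather than returning a short byte count, so as far as I can tell a caller should only ask for data it knows is already on the wire (e.g. the SMB write payload length).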
diff -urp linux-2.6.37-rc5.orig/fs/splice.c linux-2.6.37-rc5/fs/splice.c
--- linux-2.6.37-rc5.orig/fs/splice.c	2010-12-06 20:09:04.000000000 -0800
+++ linux-2.6.37-rc5/fs/splice.c	2010-12-07 16:16:48.000000000 -0800
@@ -31,6 +31,7 @@
 #include <linux/uio.h>
 #include <linux/security.h>
 #include <linux/gfp.h>
+#include <net/sock.h>
 
 /*
  * Attempt to steal a page from a pipe buffer. This should perhaps go into
@@ -1387,6 +1388,141 @@ static long do_splice(struct file *in, l
 	return -EINVAL;
 }
 
+static ssize_t do_splice_from_socket(struct file *file, struct socket *sock,
+				     loff_t __user *ppos, size_t count)
+{
+	struct address_space *mapping = file->f_mapping;
+	struct inode *inode = mapping->host;
+	loff_t pos;
+	int count_tmp;
+	int err = 0;
+	int cPagePtr = 0;
+	int cPagesAllocated = 0;
+	struct recvfile_ctl_blk rv_cb[MAX_PAGES_PER_RECVFILE];
+	struct kvec iov[MAX_PAGES_PER_RECVFILE];
+	struct msghdr msg;
+	long rcvtimeo;
+	int ret;
+
+	if(copy_from_user(&pos, ppos, sizeof(loff_t)))
+		return -EFAULT;
+
+	if(count > MAX_PAGES_PER_RECVFILE * PAGE_SIZE) {
+		printk("%s: count(%u) exceeds maxinum\n", __func__, count);
+		return -EINVAL;
+	}
+	mutex_lock(&inode->i_mutex);
+
+	vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
+
+	/* We can write back this queue in page reclaim */
+	current->backing_dev_info = mapping->backing_dev_info;
+
+	err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
+	if (err != 0 || count == 0)
+		goto done;
+
+	file_remove_suid(file);
+	file_update_time(file);
+
+	count_tmp = count;
+	do {
+		unsigned long bytes;	/* Bytes to write to page */
+		unsigned long offset;	/* Offset into pagecache page */
+		struct page *pageP;
+		void *fsdata;
+
+		offset = (pos & (PAGE_CACHE_SIZE - 1));
+		bytes = PAGE_CACHE_SIZE - offset;
+		if (bytes > count_tmp)
+			bytes = count_tmp;
+		ret = mapping->a_ops->write_begin(file, mapping, pos, bytes,
+						  AOP_FLAG_UNINTERRUPTIBLE,
+						  &pageP, &fsdata);
+
+		if (unlikely(ret)) {
+			err = ret;
+			for(cPagePtr = 0; cPagePtr < cPagesAllocated; cPagePtr++) {
+				kunmap(rv_cb[cPagePtr].rv_page);
+				ret = mapping->a_ops->write_end(file, mapping,
+								rv_cb[cPagePtr].rv_pos,
+								rv_cb[cPagePtr].rv_count,
+								rv_cb[cPagePtr].rv_count,
+								rv_cb[cPagePtr].rv_page,
+								rv_cb[cPagePtr].rv_fsdata);
+			}
+			goto done;
+		}
+		rv_cb[cPagesAllocated].rv_page = pageP;
+		rv_cb[cPagesAllocated].rv_pos = pos;
+		rv_cb[cPagesAllocated].rv_count = bytes;
+		rv_cb[cPagesAllocated].rv_fsdata = fsdata;
+		iov[cPagesAllocated].iov_base = kmap(pageP) + offset;
+		iov[cPagesAllocated].iov_len = bytes;
+		cPagesAllocated++;
+		count_tmp -= bytes;
+		pos += bytes;
+	} while (count_tmp);
+
+	/* IOV is ready, receive the date from socket now */
+	msg.msg_name = NULL;
+	msg.msg_namelen = 0;
+	msg.msg_iov = (struct iovec *)&iov[0];
+	msg.msg_iovlen = cPagesAllocated;
+	msg.msg_control = NULL;
+	msg.msg_controllen = 0;
+	msg.msg_flags = MSG_KERNSPACE;
+	rcvtimeo = sock->sk->sk_rcvtimeo;
+	sock->sk->sk_rcvtimeo = 8 * HZ;
+
+	ret = kernel_recvmsg(sock, &msg, &iov[0], cPagesAllocated, count,
+			     MSG_WAITALL | MSG_NOCATCHSIG);
+
+	sock->sk->sk_rcvtimeo = rcvtimeo;
+	if(ret != count)
+		err = -EPIPE;
+	else
+		err = 0;
+
+	if (unlikely(err < 0)) {
+		for(cPagePtr = 0; cPagePtr < cPagesAllocated; cPagePtr++) {
+			kunmap(rv_cb[cPagePtr].rv_page);
+			ret = mapping->a_ops->write_end(file, mapping,
+							rv_cb[cPagePtr].rv_pos,
+							rv_cb[cPagePtr].rv_count,
+							rv_cb[cPagePtr].rv_count,
+							rv_cb[cPagePtr].rv_page,
+							rv_cb[cPagePtr].rv_fsdata);
+		}
+		goto done;
+	}
+
+	for(cPagePtr=0,count=0;cPagePtr < cPagesAllocated;cPagePtr++) {
+		//flush_dcache_page(pageP);
+		kunmap(rv_cb[cPagePtr].rv_page);
+		ret = mapping->a_ops->write_end(file, mapping,
+						rv_cb[cPagePtr].rv_pos,
+						rv_cb[cPagePtr].rv_count,
+						rv_cb[cPagePtr].rv_count,
+						rv_cb[cPagePtr].rv_page,
+						rv_cb[cPagePtr].rv_fsdata);
+		if (unlikely(ret < 0))
+			printk("%s: write_end fail,ret = %d\n", __func__, ret);
+		count += rv_cb[cPagePtr].rv_count;
+		//cond_resched();
+	}
+	balance_dirty_pages_ratelimited_nr(mapping, cPagesAllocated);
+	copy_to_user(ppos,&pos,sizeof(loff_t));
+
+done:
+	current->backing_dev_info = NULL;
+	mutex_unlock(&inode->i_mutex);
+	if(err)
+		return err;
+	else
+		return count;
+}
+
 /*
  * Map an iov into an array of pages and offset/length tupples. With the
  * partial_page structure, we can map several non-contiguous ranges into
@@ -1698,11 +1834,33 @@ SYSCALL_DEFINE6(splice, int, fd_in, loff
 	long error;
 	struct file *in, *out;
 	int fput_in, fput_out;
+	struct socket *sock = NULL;
 
 	if (unlikely(!len))
 		return 0;
 
 	error = -EBADF;
+
+	/* check if fd_in is a socket */
+	sock = sockfd_lookup(fd_in, &error);
+	if (sock) {
+		out = NULL;
+		if (!sock->sk)
+			goto done;
+		out = fget_light(fd_out, &fput_out);
+
+		if (out) {
+			if (!(out->f_mode & FMODE_WRITE))
+				goto done;
+			error = do_splice_from_socket(out, sock, off_out, len);
+		}
+done:
+		if(out)
+			fput_light(out, fput_out);
+		fput(sock->file);
+		return error;
+	}
+
 	in = fget_light(fd_in, &fput_in);
 	if (in) {
 		if (in->f_mode & FMODE_READ) {
diff -urp linux-2.6.37-rc5.orig/include/linux/fs.h linux-2.6.37-rc5/include/linux/fs.h
--- linux-2.6.37-rc5.orig/include/linux/fs.h	2010-12-06 20:09:04.000000000 -0800
+++ linux-2.6.37-rc5/include/linux/fs.h	2010-12-07 15:58:26.000000000 -0800
@@ -372,6 +372,8 @@ struct inodes_stat_t {
 #define SYNC_FILE_RANGE_WRITE	2
 #define SYNC_FILE_RANGE_WAIT_AFTER	4
 
+#define MAX_PAGES_PER_RECVFILE 32
+
 #ifdef __KERNEL__
 
 #include <linux/linkage.h>
diff -urp linux-2.6.37-rc5.orig/include/linux/skbuff.h linux-2.6.37-rc5/include/linux/skbuff.h
--- linux-2.6.37-rc5.orig/include/linux/skbuff.h	2010-12-06 20:09:04.000000000 -0800
+++ linux-2.6.37-rc5/include/linux/skbuff.h	2010-12-07 15:31:43.000000000 -0800
@@ -1817,6 +1817,9 @@ extern unsigned int    datagram_poll(str
 extern int	       skb_copy_datagram_iovec(const struct sk_buff *from,
					       int offset, struct iovec *to,
					       int size);
+extern int	       skb_copy_datagram_to_kernel_iovec(const struct sk_buff *from,
+					       int offset, struct iovec *to,
+					       int size);
 extern int	       skb_copy_and_csum_datagram_iovec(struct sk_buff *skb,
							int hlen,
							struct iovec *iov);
diff -urp linux-2.6.37-rc5.orig/include/linux/socket.h linux-2.6.37-rc5/include/linux/socket.h
--- linux-2.6.37-rc5.orig/include/linux/socket.h	2010-12-06 20:09:04.000000000 -0800
+++ linux-2.6.37-rc5/include/linux/socket.h	2010-12-07 15:33:52.000000000 -0800
@@ -261,6 +261,8 @@ struct ucred {
 #define MSG_NOSIGNAL	0x4000	/* Do not generate SIGPIPE */
 #define MSG_MORE	0x8000	/* Sender will send more */
 #define MSG_WAITFORONE	0x10000	/* recvmmsg(): block until 1+ packets avail */
+#define MSG_KERNSPACE	0x20000
+#define MSG_NOCATCHSIG	0x40000
 
 #define MSG_EOF         MSG_FIN
 
@@ -326,6 +328,7 @@ extern int verify_iovec(struct msghdr *m
 extern int memcpy_toiovec(struct iovec *v, unsigned char *kdata, int len);
 extern int memcpy_toiovecend(const struct iovec *v, unsigned char *kdata,
			     int offset, int len);
+extern void memcpy_tokerneliovec(struct iovec *iov, unsigned char *kdata, int len);
 extern int move_addr_to_kernel(void __user *uaddr, int ulen,
			       struct sockaddr *kaddr);
 extern int put_cmsg(struct msghdr*, int level, int type, int len, void *data);
diff -urp linux-2.6.37-rc5.orig/include/linux/splice.h linux-2.6.37-rc5/include/linux/splice.h
--- linux-2.6.37-rc5.orig/include/linux/splice.h	2010-12-06 20:09:04.000000000 -0800
+++ linux-2.6.37-rc5/include/linux/splice.h	2010-12-07 15:46:44.000000000 -0800
@@ -57,6 +57,14 @@ struct splice_pipe_desc {
 	void (*spd_release)(struct splice_pipe_desc *, unsigned int);
 };
 
+struct recvfile_ctl_blk
+{
+	struct page *rv_page;
+	loff_t rv_pos;
+	size_t rv_count;
+	void *rv_fsdata;
+};
+
 typedef int (splice_actor)(struct pipe_inode_info *, struct pipe_buffer *,
			   struct splice_desc *);
 typedef int (splice_direct_actor)(struct pipe_inode_info *,
diff -urp linux-2.6.37-rc5.orig/net/core/datagram.c linux-2.6.37-rc5/net/core/datagram.c
--- linux-2.6.37-rc5.orig/net/core/datagram.c	2010-12-06 20:09:04.000000000 -0800
+++ linux-2.6.37-rc5/net/core/datagram.c	2010-12-07 16:01:36.000000000 -0800
@@ -128,6 +128,65 @@ out_noerr:
 	goto out;
 }
 
+/*
+ * skb_copy_datagram_to_kernel_iovec - Copy a datagram to a kernel iovec structure.
+ * @skb: buffer to copy
+ * @offset: offset in the buffer to start copying from
+ * @to: io vector to copy to
+ * @len: amount of data to copy from buffer to iovec
+ *
+ * Note: the iovec is modified during the copy.
+ */
+int skb_copy_datagram_to_kernel_iovec(const struct sk_buff *skb, int offset,
+				      struct iovec *to, int len)
+{
+	int i, fraglen, end = 0;
+	struct sk_buff *next = skb_shinfo(skb)->frag_list;
+
+	if (!len)
+		return 0;
+
+next_skb:
+	fraglen = skb_headlen(skb);
+	i = -1;
+
+	while (1) {
+		int start = end;
+
+		if ((end += fraglen) > offset) {
+			int copy = end - offset;
+			int o = offset - start;
+
+			if (copy > len)
+				copy = len;
+			if (i == -1)
+				memcpy_tokerneliovec(to, skb->data + o, copy);
+			else {
+				skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+				struct page *page = frag->page;
+				void *p = kmap(page) + frag->page_offset + o;
+				memcpy_tokerneliovec(to, p, copy);
+				kunmap(page);
+			}
+
+			if (!(len -= copy))
+				return 0;
+			offset += copy;
+		}
+		if (++i >= skb_shinfo(skb)->nr_frags)
+			break;
+		fraglen = skb_shinfo(skb)->frags[i].size;
+	}
+	if (next) {
+		skb = next;
+		BUG_ON(skb_shinfo(skb)->frag_list);
+		next = skb->next;
+		goto next_skb;
+	}
+
+	return -EFAULT;
+}
+
 /**
  * __skb_recv_datagram - Receive a datagram skbuff
  * @sk: socket
diff -urp linux-2.6.37-rc5.orig/net/core/iovec.c linux-2.6.37-rc5/net/core/iovec.c
--- linux-2.6.37-rc5.orig/net/core/iovec.c	2010-12-06 20:09:04.000000000 -0800
+++ linux-2.6.37-rc5/net/core/iovec.c	2010-12-07 16:03:46.000000000 -0800
@@ -124,6 +124,30 @@ int memcpy_toiovecend(const struct iovec
 }
 EXPORT_SYMBOL(memcpy_toiovecend);
 
+/* This was removed in 2.6. Re-add it for splice from socket to file. */
+/*
+ *	In kernel copy to iovec. Returns -EFAULT on error.
+ *
+ *	Note: this modifies the original iovec.
+ */
+
+void memcpy_tokerneliovec(struct iovec *iov, unsigned char *kdata, int len)
+{
+	while(len>0)
+	{
+		if(iov->iov_len)
+		{
+			int copy = min_t(unsigned int, iov->iov_len, len);
+			memcpy(iov->iov_base, kdata, copy);
+			len -= copy;
+			kdata += copy;
+			iov->iov_base += copy;
+			iov->iov_len -= copy;
+		}
+		iov++;
+	}
+}
+
 /*
  *	Copy iovec to kernel. Returns -EFAULT on error.
 *
diff -urp linux-2.6.37-rc5.orig/net/ipv4/tcp.c linux-2.6.37-rc5/net/ipv4/tcp.c
--- linux-2.6.37-rc5.orig/net/ipv4/tcp.c	2010-12-06 20:09:04.000000000 -0800
+++ linux-2.6.37-rc5/net/ipv4/tcp.c	2010-12-07 15:49:35.000000000 -0800
@@ -1460,8 +1460,23 @@ int tcp_recvmsg(struct kiocb *iocb, stru
 	do {
 		u32 offset;
 
+		if (flags & MSG_NOCATCHSIG) {
+			if (signal_pending(current)) {
+				if (sigismember(&current->pending.signal, SIGQUIT) ||
+				    sigismember(&current->pending.signal, SIGABRT) ||
+				    sigismember(&current->pending.signal, SIGKILL) ||
+				    sigismember(&current->pending.signal, SIGTERM) ||
+				    sigismember(&current->pending.signal, SIGSTOP)) {
+
+					if (copied)
+						break;
+					copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
+					break;
+				}
+			}
+		}
 		/* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
-		if (tp->urg_data && tp->urg_seq == *seq) {
+		else if (tp->urg_data && tp->urg_seq == *seq) {
 			if (copied)
 				break;
 			if (signal_pending(current)) {
@@ -1690,8 +1705,12 @@ do_prequeue:
 		} else
 #endif
 		{
-			err = skb_copy_datagram_iovec(skb, offset,
-					msg->msg_iov, used);
+			if(msg->msg_flags & MSG_KERNSPACE)
+				err = skb_copy_datagram_to_kernel_iovec(skb,
+						offset, msg->msg_iov, used);
+			else
+				err = skb_copy_datagram_iovec(skb, offset,
+						msg->msg_iov, used);
 			if (err) {
 				/* Exception. Bailout! */
 				if (!copied
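One more note for userspace consumers: on a kernel without this patch the same call fails, since do_splice() wants a pipe on at least one side and returns -EINVAL for a socket/regular-file pair, so a server would want a runtime fallback along these lines (again an illustrative sketch only; the helper name and buffer size are mine):

#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <sys/types.h>
#include <unistd.h>

/* Hypothetical fallback wrapper (not part of the patch): try the
 * socket-to-file splice once; if the running kernel rejects it with EINVAL
 * (unpatched do_splice()), copy through an ordinary bounce buffer instead. */
static ssize_t recvfile_or_fallback(int sock_fd, int file_fd, loff_t *offset,
				    size_t count)
{
	char buf[16 * 1024];
	size_t want;
	ssize_t n;

	n = splice(sock_fd, NULL, file_fd, offset, count, 0);
	if (n >= 0 || errno != EINVAL)
		return n;	/* patched kernel handled it, or a real error */

	/* Unpatched kernel: plain read() + pwrite() through userspace. */
	want = count < sizeof(buf) ? count : sizeof(buf);
	n = read(sock_fd, buf, want);
	if (n <= 0)
		return n;
	n = pwrite(file_fd, buf, n, *offset);
	if (n > 0)
		*offset += n;
	return n;
}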