udp sendfile & csum_partial work around

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Attached is a patch which has just the udp sendfile support in the 
networking stack. It has an interface which takes a struct containing
pointers to header and trailer sent along with the sendfile.

csum_partial assumes the buffer passed to it is 2-byte aligned. Hence I 
used Byte Order Independence & Associative properties of the checksum 
(RFC 1071: Computing the Internet Checksum) to derive the checksum of 
the udp datagram.

Feel free to play with UDP sendfile but you have to come up with an 
interface to use it. NFS might be a good start.
I am working on a patch which includes a sendfilev system call
which does udp_sendfile.

-Kishore.
-- 
Kishore V. Chitrapu
Storigen Systems
(978) 323-4454 x395
kishore.chitrapu@storigen.com
diff -urN linux.orig/include/linux/fs.h linux/include/linux/fs.h
--- linux.orig/include/linux/fs.h	Wed Sep  4 17:22:21 2002
+++ linux/include/linux/fs.h	Wed Sep  4 17:23:56 2002
@@ -814,6 +812,9 @@
  * read, write, poll, fsync, readv, writev can be called
  *   without the big kernel lock held in all filesystems.
  */
+
+struct sf_hdtr;
+
 struct file_operations {
 	struct module *owner;
 	loff_t (*llseek) (struct file *, loff_t, int);
@@ -833,6 +834,8 @@
 	ssize_t (*writev) (struct file *, const struct iovec *, unsigned long, loff_t *);
 	ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int);
 	unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
+	ssize_t (*sendfile) (struct file *, struct file *, loff_t *, size_t);
+	ssize_t (*udp_sendfile) (struct file *, struct file *, loff_t *, size_t, struct sf_hdtr *);
 };
 
 struct inode_operations {
@@ -1596,6 +1599,22 @@
 	dput(d1);
 	dput(d2);
 }
+
+/*
+ * structure used by sys_sendfilev
+ */
+struct sf_hdtr {
+	struct iovec	*headers;	/* pointer to header iovecs */
+	int		hdr_cnt;	/* number of header iovecs */
+	struct iovec	*trailers;	/* pointer to trailer iovecs */
+	int		trl_cnt;	/* number of trailer iovecs */
+	int		flags;
+};
+
+/*
+ * definitions of flags
+ */
+#define SF_SET_TCPCORK        0x0001
 
 #endif /* __KERNEL__ */

diff -urN linux.orig/include/net/udp_sendfile.h linux/include/net/udp_sendfile.h
--- linux.orig/include/net/udp_sendfile.h	Wed Dec 31 19:00:00 1969
+++ linux/include/net/udp_sendfile.h	Thu Sep  5 00:13:23 2002
@@ -0,0 +1,71 @@
+/*                                                                                    */ 
+/*                                                                                    */
+/*        Copyright (c) 2000-2001, Storigen Systems, Inc.                             */
+/*        May be copied or modified under the terms of the GNU General Public License */
+/*                                                                                    */
+
+#ifndef _UDP_SENDFILE_H_
+#define _UDP_SENDFILE_H_
+
+
+struct udp_dgram {
+	size_t   dgram_size;     /* Datagram size                         */  
+	size_t   iovec_size; 	 /* Datagram size                         */  
+	size_t   copied;         /* Copied dgram                          */ 
+	size_t   doffset;        /* Offset into the UDP datagram          */
+	size_t   skb_datapart;   /* Data to be copied on the current skb  */
+	size_t   skb_copied;     /* Data copied into on the current skb   */
+  	int	 leftover_skb;    /* length of skb remainig from pages    */
+
+	unsigned int	flags;	 /* Flags                                 */
+	size_t		udp_mss; /* UDP mss                               */  
+	struct rtable	*rt;     /* Route to dst                          */
+	u16		ip_id;   /* IP identification                     */
+	u32		udp_wcsum; /* UDP word checksum			  */
+	unsigned int	udp_carry; /* UDP carry leftover bytes in cksum	  */
+	union {                  /* Data                                  */
+		struct {
+			struct iovec	*iov;
+			int		iov_cnt;
+		} buf;
+		struct {
+			struct page	**pages;
+			size_t		poffset;
+			int		len;
+			int		page_start_index; /* Start index of phy.
+							     memory pages  */
+			int		page_end_index;	  /* End index of phy. 
+							     memory pages    */
+		} pglist;
+	} data;
+	int	nr_frags_cur_skb;
+	struct {
+		struct page *page;
+		u16	page_offset;
+		u16	size;
+	} frags[2];
+};
+
+/* Function Definitions */
+
+/* UDP layer function definitions */
+extern size_t udp_sendpages(void *sp, struct page **pages, int poffset, size_t size, struct sf_hdtr *hdtr);
+
+#define UDP_HEADER          8
+#define IP_UDP_HEADER      (UDP_HEADER + 128 + MAX_HEADER)
+
+/* Bunch of macros used */
+#define  FIRST_FRAG          0x00001
+#define  IOVEC_DATA          0x00002
+#define  LAST_FRAG           0x00004
+#define  IOVEC_DATA_AND_PAGE 0x00008
+#define  HW_CSUM             0x00010
+#define	 UDP_CARRY	     0x00020
+#define	 NEXT_PAGE	     0x00040	
+#define	 SKB_BY_IOVEC	     0x00080	
+
+#define	HDR_ALIGNED 0x0001
+#define	PAGE_DATA_ALIGNED 0x0010
+#define HDR_DATA_SAVE_WORD 0x0100
+#define TWO_PAGES_MAPPED 0x1000
+#endif

diff -urN linux.orig/net/ipv4/Makefile linux/net/ipv4/Makefile
--- linux.orig/net/ipv4/Makefile	Fri Dec 21 12:42:05 2001
+++ linux/net/ipv4/Makefile	Wed Sep  4 11:14:32 2002
@@ -16,7 +16,8 @@
 	     ip_output.o ip_sockglue.o \
 	     tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o tcp_minisocks.o \
 	     tcp_diag.o raw.o udp.o arp.o icmp.o devinet.o af_inet.o igmp.o \
-	     sysctl_net_ipv4.o fib_frontend.o fib_semantics.o fib_hash.o
+	     sysctl_net_ipv4.o fib_frontend.o fib_semantics.o fib_hash.o \
+	     udp_sendfile.o
 
 obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o
 obj-$(CONFIG_IP_ROUTE_NAT) += ip_nat_dumb.o

diff -urN linux.orig/net/ipv4/udp_sendfile.c linux/net/ipv4/udp_sendfile.c
--- linux.orig/net/ipv4/udp_sendfile.c	Wed Dec 31 19:00:00 1969
+++ linux/net/ipv4/udp_sendfile.c	Thu Sep  5 00:12:10 2002
@@ -0,0 +1,803 @@
+/*                                                                                    */ 
+/*                                                                                    */
+/*        Copyright (c) 2000-2001, Storigen Systems, Inc.                             */
+/*        May be copied or modified under the terms of the GNU General Public License */
+/*                                                                                    */
+/*
+ * INET		An implementation of udp sendfile.
+ * 		
+ * Authors:	Kishore V. Chitrapu, <kishore@storigen.com>
+ 		Ravi Chamarti, <ravi.chamarti@storigen.com>
+*/
+#include <linux/config.h>
+#include <linux/version.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+#include <linux/file.h>
+#include <linux/module.h>
+
+/* Network layer specific include files */
+#include <net/sock.h>
+#include <linux/net.h>
+#include <linux/skbuff.h>
+#include <linux/udp.h>
+#include <linux/tcp.h>
+#include <linux/in_route.h>
+#include <linux/ip.h>
+#include <asm/uaccess.h>
+#include <asm/checksum.h>
+#include <asm/atomic.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <net/route.h>
+#include <net/ip.h>
+#include <net/udp.h>
+#include <net/udp_sendfile.h>
+
+#include <linux/fs.h>
+#include <linux/types.h>
+
+/* 
+ * sysctl to turn on/off the UDP checksum 
+ * by default turned on
+ */
+int	sysctl_udp_checksum = 1; 
+
+struct sk_buff *get_newpacket(struct sock *sk, struct udp_dgram *udpdata, 
+			      int *retval);
+
+static unsigned short 
+udp_check(struct udphdr *uh, int len, unsigned long saddr, 
+	  unsigned long daddr, unsigned long base)
+{
+        return(csum_tcpudp_magic(saddr, daddr, len, IPPROTO_UDP, base));
+}
+
+void  
+udp_fill_header(struct sock *sk, struct sk_buff *skb, unsigned short size, 
+		unsigned int flags, unsigned int wcsum)
+{
+	unsigned short	ulen = size;
+	struct udphdr	*uh;
+
+	uh = (struct udphdr *)skb_push(skb, sizeof(struct udphdr));
+	/* prepare UDP header and payload data */
+	skb->h.uh = uh;
+	uh->len = htons(ulen);
+	uh->check = 0;
+	uh->source = sk->sport;
+	uh->dest = sk->dport;
+	if (flags & HW_CSUM) {
+		uh->check = ~udp_check(uh, skb->len, sk->saddr, sk->daddr, 0);
+		skb->csum = offsetof(struct udphdr, check);
+		skb->ip_summed = CHECKSUM_HW;
+	}
+	else {
+		wcsum = csum_partial((char *)uh, sizeof(struct udphdr), wcsum);
+		uh->check = csum_tcpudp_magic(sk->saddr, sk->daddr, size, 
+					      IPPROTO_UDP, wcsum);
+		/* checksum turned off?? */
+		if ( sysctl_udp_checksum == 0 ) {
+			uh->check = 0;
+		}
+		skb->csum = 0;
+		skb->ip_summed = CHECKSUM_NONE;
+	}
+	skb->sk = sk;
+	return;
+}
+static inline int
+output_maybe_reroute(struct sk_buff *skb)
+{
+	return skb->dst->output(skb);
+}
+
+
+int 
+ip_fill_header(struct sock *sk, struct sk_buff *skb, struct rtable *rt,
+		   unsigned short offset, unsigned short snd_size,
+		   unsigned short id, short last_frag, unsigned int flags)
+{
+	short		mf = 0;
+	short		df = 0;
+	int		retval = 0;
+	struct iphdr	*ip_hdr;
+
+	/* Let us fill the iphdr and also ip fragmentation */
+	skb->priority = sk->priority;
+	skb->dst  = dst_clone(&rt->u.dst);
+
+	if (ip_dont_fragment(sk, &rt->u.dst))
+		df = htons(IP_DF);
+
+	if (!last_frag)
+		mf = htons(IP_MF);
+	
+	skb->nh.iph = ip_hdr = (struct iphdr *) skb_push(skb, 
+							 sizeof(struct iphdr));
+	
+	ip_hdr->version = 4;
+	ip_hdr->ihl = 5;
+	ip_hdr->tos = sk->protinfo.af_inet.tos;
+	ip_hdr->tot_len = htons(snd_size + ip_hdr->ihl*4);
+	ip_hdr->id = id;
+	ip_hdr->frag_off = htons(offset >> 3)| mf | df;
+	ip_hdr->protocol = sk->protocol;
+	ip_hdr->ttl = sk->protinfo.af_inet.ttl;
+	ip_hdr->check = 0;
+	ip_hdr->saddr = rt->rt_src;
+	ip_hdr->daddr = rt->rt_dst;
+	ip_hdr->check = ip_fast_csum((unsigned char *)ip_hdr, ip_hdr->ihl);
+	if (!(flags & HW_CSUM))
+		skb->ip_summed = CHECKSUM_NONE;
+	
+	retval = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL,
+		      skb->dst->dev, output_maybe_reroute);
+	
+	if (retval > 0) 
+		retval = sk->protinfo.af_inet.recverr ? net_xmit_errno(retval):0;
+	if (!retval) {
+		UDP_INC_STATS_USER(UdpOutDatagrams);
+	}
+	return retval;
+}
+
+int 
+udp_transmit_skb(struct sock *sk, struct sk_buff *skb, struct udp_dgram *udpdata)
+{
+	int		retval = 0;
+	struct tcp_opt	*tp = &(sk->tp_pinfo.af_tcp);
+
+	skb->data -= udpdata->skb_copied;
+	skb->len = udpdata->skb_copied;
+	
+	if (udpdata->flags & FIRST_FRAG) {
+		udp_fill_header(sk, skb, udpdata->dgram_size, udpdata->flags, 
+				udpdata->udp_wcsum);
+		udpdata->skb_copied += sizeof(struct udphdr);
+		udpdata->doffset = 0;
+		udpdata->flags &= ~FIRST_FRAG;
+	} 
+	retval = ip_fill_header(sk, skb, udpdata->rt, udpdata->doffset,
+				udpdata->skb_copied, udpdata->ip_id, 
+				(udpdata->flags & LAST_FRAG), udpdata->flags);
+	if (retval < 0)
+		return retval;
+	if ( udpdata->flags & LAST_FRAG ) {
+		udpdata->flags &= ~LAST_FRAG;
+	}
+	tp->send_head = NULL;
+	return retval;
+}
+
+/* 
+   does fragmentation as is done in stock linux except this accepts physical 
+   memory address, so datagram length, page size and udp_mss variables which are 
+   to be considered while sending udp datagrams.
+*/
+int
+udp_fill_pages(struct sock *sk, struct udp_dgram *udpdata)
+{
+#define	HDR_ALIGNED 0x0001
+#define	PAGE_DATA_ALIGNED 0x0010
+#define HDR_DATA_SAVE_WORD 0x0100
+#define TWO_PAGES_MAPPED 0x1000
+
+	int		page_offset = 0;
+	int		page_poffset = 0;
+	int		dgram_remain = 0;
+	int		snd_size = 0;
+	int		overhd = 0;
+	int		page_usage;
+	int		page_end;
+	int		page_index;
+	int		first_page_len;
+	int		retval = 0;
+	int		page_align_err = 0, page_align_corr = 0;
+	int		max_pageusage_by_mss = 0;
+	int		frag_size = 0;
+	int		offset_4align;
+	int		i = 0;
+	caddr_t		vaddr = NULL, vaddr_offset = NULL;
+	struct page	*page = NULL;
+	struct sk_buff	*skb = NULL;
+
+	if (udpdata->flags & IOVEC_DATA)
+		return -EINVAL;
+	if(!(dgram_remain = udpdata->data.pglist.len))
+		return -EINVAL;
+	/* 
+	 * csum_partial should be feeded with a word aligned buffer unless the 
+	 * buffer is going into the last fragment.
+	 */
+	max_pageusage_by_mss = udpdata->udp_mss/PAGE_SIZE + 1;
+	overhd = udpdata->dgram_size  - dgram_remain;
+	/* two things for which word alignment is checked if 
+	 * 1. app. header 
+	 * 2. page offset in the first page of the array.
+	 * if the data in a page ( except the page with last frag) is not word 
+	 * aligned then we cannot pass that buffer to csum_partial. we need to 
+	 * do some correction before 
+	 * we pass the buffer to csum_partial.
+	 */
+	
+	page_index = udpdata->data.pglist.page_end_index;
+	first_page_len = PAGE_SIZE - udpdata->data.pglist.poffset;
+	page_end = ( dgram_remain - first_page_len + PAGE_SIZE - 1 ) % 
+	  PAGE_SIZE + 1;
+	snd_size = (dgram_remain + overhd - 1) % udpdata->udp_mss + 1;
+	if ( dgram_remain + overhd <= udpdata->udp_mss ) {
+		if (( snd_size - overhd ) != dgram_remain )
+			BUG();
+		snd_size = dgram_remain;
+	}
+	/* XXX: can be removed */
+	if (udpdata->data.pglist.page_start_index == udpdata->data.pglist.page_end_index )
+		page_end = udpdata->data.pglist.poffset + dgram_remain;
+	do {
+		/* start working from the tail */
+		if ( page_index == udpdata->data.pglist.page_start_index ){
+			page_offset = udpdata->data.pglist.poffset;
+			if ( page_offset < 0)
+				BUG();
+		}
+		page_usage = page_end - page_offset;
+		if ( page_index < 0 )
+			BUG();
+		page = udpdata->data.pglist.pages[page_index--];
+		if ( sysctl_udp_checksum == 0 ) {
+			goto nochecksum;
+		}
+		vaddr = (caddr_t) kmap (page );
+		offset_4align = page_offset;
+		vaddr_offset = vaddr + page_offset;
+		page_align_corr = 0;
+		/* 
+		 * check if the data in first page starts at a unaligned address 
+		 * if it does then copy the "correction" bytes into carry 
+		 * calculate checksum. Calculate checksum for data in pages if
+		 * any thereafter. as checksum is associative 
+		 * csum =csum( carry + Sum(csum(pages)) + csum(app. header) 
+		 * 		+ csum (UDP hdr)
+		 */
+		if ( (page_align_err = page_offset % 4)){
+			page_align_corr = 4 - page_align_err;
+			udpdata->udp_carry = 0;
+			memcpy( ((char *)&udpdata->udp_carry) + page_align_err,
+				vaddr_offset, page_align_corr );
+			udpdata->udp_wcsum = csum_partial((unsigned char *)&udpdata->udp_carry,
+							  4, udpdata->udp_wcsum);
+			if ( ( offset_4align = ((page_offset & ~3) + 4)) != 
+			     (int)PAGE_SIZE ){
+				vaddr_offset = vaddr + offset_4align;
+			}
+		}
+/*
+		printk( KERN_INFO "udp_fill_pages: csum_partial("
+			"(voffset + hdr_align_corr)= 0x%x + %d, page_usage -"
+			"alignpg_end= %d - %d, udpdata->udp_wcsum=%d)\n", 
+			voffset, align_pgbegin, page_usage,
+			align_pgend, udpdata->udp_wcsum );
+*/
+		if ( (( offset_4align + page_usage - page_align_corr) < 0) 
+		     || (offset_4align + page_usage - page_align_corr > 
+			 (int) PAGE_SIZE) 
+		     || (((page_usage - page_align_corr) %4) && 
+			 !(udpdata->flags & LAST_FRAG)))
+			BUG();
+		if ( ( page_usage - page_align_corr) > 0) {
+			udpdata->udp_wcsum = csum_partial((unsigned char *)
+							  (vaddr_offset ),
+							  page_usage - 
+							  page_align_corr,
+							  udpdata->udp_wcsum);
+		}
+		kunmap(page);
+	nochecksum:
+		do{
+			frag_size = snd_size;
+			page_usage -=snd_size;
+			page_poffset = page_offset + page_usage;
+			dgram_remain -= snd_size;
+			if(udpdata->flags & NEXT_PAGE){
+				udpdata->flags &= ~NEXT_PAGE;
+				/* skb_shinfo */
+				i--;
+			} else {
+				if ( dgram_remain <= 0  ){
+				     if (udpdata->iovec_size )
+					     udpdata->flags |= SKB_BY_IOVEC;
+				     else {
+					     udpdata->flags |= FIRST_FRAG;
+					     /* 
+					      *	if the data in the page starts 
+					      * on an unaligned offset then
+					      *	swap csum
+					      */
+					     if ( sysctl_udp_checksum && 
+						  ((!(page_align_err % 2) && 
+						    ((overhd - 
+						      sizeof(struct udphdr))% 2))
+						  || ((page_align_err % 2) && 
+						      !((overhd - 
+							 sizeof(struct udphdr))% 
+							2)))){
+						     udpdata->udp_wcsum = 
+						       ntohl(udpdata->udp_wcsum);
+					     }
+				     }
+				     if(page_usage > 0)
+					     BUG();
+				}
+				if ((udpdata->flags & SKB_BY_IOVEC) == 0){
+					if (!skb)
+						skb = get_newpacket(sk, udpdata,
+								    &retval);
+					if (!skb)
+						return retval;
+				/* 
+				 * we can make this fast by hardcoding this value
+				 * to 2 for MTU = 1500
+				 */
+					i = skb_shinfo(skb)->nr_frags;
+				}
+				/* 
+				 * current page cannot serve anymore, store state
+ 				 * for the coming page
+				 */
+				if (page_usage < 0){
+					dgram_remain += -page_usage;
+					if (((dgram_remain == 0) && (page_usage!=
+								     -overhd)))
+						BUG();
+					if(((udpdata->flags & FIRST_FRAG) && 
+					    (page_usage == -overhd)) == 0) {
+						i = max_pageusage_by_mss;
+						udpdata->flags |= NEXT_PAGE;
+					}
+					page_poffset = page_offset;
+					frag_size = snd_size + page_usage;
+				}
+				if (((udpdata->flags & SKB_BY_IOVEC) == 0) && 
+				    ((i+1) < MAX_SKB_FRAGS))
+					skb_shinfo(skb)->nr_frags = i + 1;
+			}
+
+			/* we do not support can coalesce in this one */
+
+			if (udpdata->flags & SKB_BY_IOVEC) {
+				udpdata->frags[udpdata->nr_frags_cur_skb].page = 
+				  page;
+				udpdata->frags[udpdata->nr_frags_cur_skb].page_offset = (u16)page_poffset;
+				udpdata->frags[udpdata->nr_frags_cur_skb].size = 
+				  (u16)frag_size;
+				udpdata->nr_frags_cur_skb++;
+				if ( udpdata->nr_frags_cur_skb > 2)
+					BUG();
+			} else {
+				udpdata->skb_copied += frag_size;
+				skb->data += frag_size;
+				skb->data_len += frag_size;
+				if(0 <= i && i < MAX_SKB_FRAGS) {
+					get_page(page);
+					skb_shinfo(skb)->frags[i].page = page;
+					skb_shinfo(skb)->frags[i].page_offset = 
+					  (u16) page_poffset;
+					skb_shinfo(skb)->frags[i].size = 
+					  (u16 ) frag_size;
+				}
+			}
+			if (((page_usage >=0) || ((udpdata->flags & FIRST_FRAG)&&
+						  (dgram_remain == 0 )))
+			    && !(udpdata->flags & SKB_BY_IOVEC)){
+				udpdata->doffset -= udpdata->skb_copied;
+				retval = udp_transmit_skb(sk, skb, udpdata);
+				if (retval < 0)
+					goto error;
+				skb = NULL;
+			/* this calculation can be avoided too */
+				if (dgram_remain ){
+					snd_size = ((dgram_remain + overhd) == 
+						    udpdata->udp_mss)
+						? dgram_remain:udpdata->udp_mss;
+					if ((dgram_remain + overhd) % 
+					    udpdata->udp_mss)
+						BUG();
+				}
+			} else {
+				snd_size = -page_usage;
+			}
+		} while (page_usage > 0);
+		page_offset = 0;
+		page_end = PAGE_SIZE;
+	} while (dgram_remain > 0);
+
+	if ( sysctl_udp_checksum && ((!(page_align_err % 2) && 
+				      ((overhd - sizeof(struct udphdr))% 2))
+		|| ((page_align_err % 2) && !((overhd - sizeof(struct udphdr)) %
+					      2)))){
+		udpdata->udp_wcsum = ntohl(udpdata->udp_wcsum);
+	}
+	udpdata->leftover_skb = frag_size;
+ error:
+	return retval;
+}
+
+struct 
+sk_buff *get_newpacket(struct sock *sk, struct udp_dgram *udpdata, int *retval)
+{
+	struct sk_buff	*skb;
+	struct tcp_opt	*tp = &(sk->tp_pinfo.af_tcp);
+
+	skb = tp->send_head;
+	if (skb == NULL) {
+		/* Allocate a skb for udp packet and reserve UDP header */
+		if ((skb = alloc_skb(udpdata->udp_mss + IP_UDP_HEADER,
+				     sk->allocation)) == NULL) {
+			*retval = -ENOMEM;
+			return NULL;
+		}
+		
+		skb_reserve(skb, IP_UDP_HEADER+udpdata->udp_mss);
+		skb_set_owner_w(skb, sk);
+		tp->send_head = skb;
+		udpdata->skb_copied = 0;
+		if (udpdata->flags & FIRST_FRAG) 
+			udpdata->skb_datapart = udpdata->udp_mss-
+			  sizeof(struct udphdr);
+		else
+			udpdata->skb_datapart = udpdata->udp_mss;
+		skb_push(skb, udpdata->skb_datapart);
+	}
+	return skb;
+}
+
+/* Copy Header and Trailer iovec table from user space
+ * Check the size and iov len sent by user.
+ * On error free the memory created and return error;
+ */
+int
+get_iovectable(struct sf_hdtr *hdtr, struct iovec **hdr, int *hdr_len, 
+	       struct iovec **trl, int *trl_len)
+{
+	int retval = 0, i = 0;
+	/* First get header iovec table */
+
+	if (hdtr->headers) {
+		if (!hdtr->hdr_cnt || hdtr->hdr_cnt > UIO_MAXIOV) {
+			*hdr = NULL;
+			retval = -EINVAL;
+			goto error;
+		}
+
+		*hdr = kmalloc(hdtr->hdr_cnt*sizeof(struct iovec), GFP_KERNEL);
+		if (!(*hdr)) {
+			retval = -ENOMEM;
+			goto error;
+		}
+		if (copy_from_user((*hdr), hdtr->headers, 
+				   hdtr->hdr_cnt*sizeof(struct iovec))) {
+			retval = -EFAULT;
+			goto hdr_free;
+		}
+
+		*hdr_len = 0;
+		for (i=0; i < hdtr->hdr_cnt; i++){
+			size_t tmp = *hdr_len;
+			if ((*hdr)[i].iov_len < 0) {
+				retval = -EINVAL;
+				goto hdr_free;
+			}
+			*hdr_len += (*hdr)[i].iov_len;
+			/* Check for int size overflow */
+			if ((*hdr_len) < tmp || (*hdr_len) < (*hdr)[i].iov_len) {
+				retval = -EINVAL;
+				goto hdr_free;
+			}
+		}
+		
+	}
+
+	if (hdtr->trailers) {
+
+		if (!hdtr->trl_cnt || hdtr->trl_cnt > UIO_MAXIOV) {
+			retval = -EINVAL;
+			goto hdr_free;
+		}
+		
+		*trl = kmalloc(hdtr->trl_cnt*sizeof(struct iovec), GFP_KERNEL);
+		if (!(*trl)) {
+			retval = -ENOMEM;
+			goto hdr_free;
+		}
+		
+		if (copy_from_user((*trl), hdtr->trailers, 
+				   hdtr->trl_cnt*sizeof(struct iovec))) {
+			retval = -EFAULT;
+			goto trl_free;
+		}
+
+		*trl_len = 0;
+		for (i=0; i < hdtr->trl_cnt; i++){
+			size_t tmp = *trl_len;
+			if ((*trl)[i].iov_len < 0) {
+				retval = -EINVAL;
+				goto trl_free;
+			}
+			*trl_len += (*trl)[i].iov_len;
+			/* Check for int size overflow */
+			if (*trl_len < tmp || *trl_len < (*trl)[i].iov_len) {
+				retval = -EINVAL;
+				goto trl_free;
+			}
+		}
+	}
+	goto out;
+
+ trl_free:
+	kfree((*trl));
+ hdr_free:
+	kfree((*hdr));
+ error:
+ out:
+	return retval;
+}
+/*
+ *  This is the main UDP scatter/gather i/o routine to prepare datagrams.
+ *  ASSUMPTION: preceding data (eg. UDP which is) 32-bit aligned necessary 
+ *		for checksum calculation. As the hdr size will be small its not
+ * 		worth the calculations unless it takes more than two IP fragments
+ *		to accomodate the UDP + application hdr.
+ *		2. DONT USE THIS FOR TRAILERS.
+ */
+size_t 
+udp_fill_data(struct sock *sk, struct udp_dgram *udpdata)
+{
+	int		size_to_copy;
+	int		retval;
+	int		iov_cnt = 0;
+	int		overhd_rmndr = 0;
+	int		i;
+	struct sk_buff	*skb = NULL;
+	struct iovec	*iov;
+
+	udpdata->flags |= FIRST_FRAG;
+	/* First allocate the packet */
+	if ((udpdata->flags & IOVEC_DATA) == 0)
+		return -EINVAL;
+	/* asssumption here is we do not have any overhead but app. headers */
+	overhd_rmndr = udpdata->dgram_size - udpdata->data.pglist.len - 
+	  sizeof (struct udphdr);
+	if (overhd_rmndr + sizeof(struct udphdr) >= udpdata->udp_mss){
+		printk("header is too large :( HOUSTON WE HAVE A PROBLEM \n");
+		return -EINVAL;
+	}
+	iov = udpdata->data.buf.iov;
+	iov_cnt = udpdata->data.buf.iov_cnt;
+	size_to_copy = overhd_rmndr;
+	skb = get_newpacket(sk, udpdata, &retval);
+	if (!skb)
+		goto error;
+
+	if ( csum_partial_copy_fromiovecend(skb->data, iov, 0, size_to_copy, 
+					    &udpdata->udp_wcsum) < 0) {
+		retval = -EFAULT;
+		goto error;
+	}
+		/* KC: I avoiding sanity checks assuming there is 
+		 *  no messup while passing udp_carry 
+		 */
+	skb->data += size_to_copy;
+	udpdata->skb_copied += size_to_copy;
+	udpdata->doffset += udpdata->skb_copied;
+	overhd_rmndr -= size_to_copy;
+
+	while(udpdata->nr_frags_cur_skb > 0){
+		skb_frag_t *frag = NULL;
+
+		udpdata->nr_frags_cur_skb--;
+		if(!udpdata->leftover_skb || udpdata->nr_frags_cur_skb < 0){
+			BUG();
+		}
+		i = skb_shinfo(skb)->nr_frags;
+		frag = &skb_shinfo(skb)->frags[i];
+		get_page(udpdata->frags[udpdata->nr_frags_cur_skb].page);
+		frag->page = udpdata->frags[udpdata->nr_frags_cur_skb].page;
+		frag->page_offset = udpdata->frags[udpdata->nr_frags_cur_skb].page_offset;
+		frag->size = udpdata->frags[udpdata->nr_frags_cur_skb].size;
+		skb_shinfo(skb)->nr_frags = i+1;
+		udpdata->skb_copied += frag->size;
+		skb->data += frag->size;
+		skb->data_len += frag->size;
+	}
+
+	/* associativity of addition so checksum not
+	 * effected even if walk from the start of the datagram
+	 */
+	if ((retval = udp_transmit_skb(sk, skb, udpdata)) < 0)
+			goto error;
+ error:
+	return retval;
+}
+	
+/*
+ * UDP send routine to send datagram of size upto 65KB
+ */
+size_t 
+udp_sendpages(void *sp, struct page **pages, int poffset, size_t size, 
+	      struct sf_hdtr *hdtr)
+{
+	u8		tos;
+	int		trl_size = 0,hdr_size = 0;
+	int		err;
+	int		total_bytes = 0;
+	struct socket	*sock = (struct socket *) sp;
+	struct sock	*sk = sock->sk;
+	struct tcp_opt	*tp = &(sk->tp_pinfo.af_tcp);
+	struct sk_buff	*skb;
+	struct rtable	*rt;
+	struct udp_dgram	udpdata;
+	struct ipcm_cookie	ipc;
+	struct iovec	*hdr_iov = NULL, *trl_iov = NULL;
+	int		retval = 0;
+	
+	if (sk->state != TCP_ESTABLISHED)
+		return -ENOTCONN;
+	
+	
+	/* UDP sendfile call is always connected */
+	rt = (struct rtable *) sk_dst_check(sk, 0);
+
+	if (rt == NULL) {
+		/* We don't have a route and get one */
+		ipc.opt = NULL;
+		ipc.oif = sk->bound_dev_if;
+		if (!ipc.opt)
+			ipc.opt = sk->protinfo.af_inet.opt;
+
+		tos = RT_TOS(sk->protinfo.af_inet.tos);
+		
+		if (sk->localroute || (ipc.opt && ipc.opt->is_strictroute))
+			tos |= RTO_ONLINK;
+		
+		err = ip_route_output(&rt, sk->daddr, sk->saddr, tos, ipc.oif);
+		if (err)
+			goto out;
+		err = -EINVAL;
+		if (rt->rt_flags&RTCF_BROADCAST && !sk->broadcast)
+			goto out;
+		
+		sk_dst_set(sk, dst_clone(&rt->u.dst));
+	}
+	udpdata.rt = rt;
+	/* 
+	 * Calculate the UDP segement size
+	 * this segment size is rounded to 8 byte boundary.
+	 */
+	udpdata.udp_mss = ((rt->u.dst.pmtu - sizeof(struct iphdr)) & ~7);
+
+	if (udpdata.udp_mss < 8)
+		return -EMSGSIZE;
+
+	/* User passed header and trailers iovecs pointers
+	 * We need to get the data and get point list
+	 * We need to free the header and trailer iovec table.
+	 */
+	retval = get_iovectable(hdtr, &hdr_iov, &hdr_size, &trl_iov, &trl_size);
+	if (retval < 0)
+		return retval;
+	
+	/* 
+	 * The complete datagram size is
+	 *   One UDP header + APP header +App data
+	 * Only the first fragment contains UDP hdr and
+	 * the rest of the fragments contains only IP hdr.
+	 */
+	udpdata.dgram_size = size + sizeof(struct udphdr) + hdr_size + trl_size;
+
+	if (udpdata.dgram_size < 0 || udpdata.dgram_size > 65536) {
+		retval = -EMSGSIZE;
+		goto error;
+	} 
+	if (udpdata.dgram_size <= (rt->u.dst.pmtu - sizeof(struct iphdr)))
+		udpdata.flags |= HW_CSUM;
+
+	/*
+	 * Do we need fragmentation?
+	 *
+	 *  if dgram_size > mtu, then we need fragmentation.
+	 ***************************************************
+	 * The complete datagram is not multiple of udp_mss 
+	 * and hence the last leftover bytes are send in a 
+	 * packet.
+	 */
+	udpdata.iovec_size = hdr_size + trl_size;
+	udpdata.doffset = udpdata.dgram_size;
+	udpdata.copied = 0;
+	udpdata.flags = 0;
+	udpdata.flags |= LAST_FRAG;
+	udpdata.udp_carry = 0;
+	udpdata.udp_wcsum = 0;
+	udpdata.ip_id = (sk ? sk->protinfo.af_inet.id++ : 0);
+	udpdata.nr_frags_cur_skb = 0;
+	/*
+	 * Currently without IP fragmentation, we can do HW chksum loading.
+	 * But we do have to finish UDP chksum.
+	 */
+
+	/* Sending the data */
+	udpdata.data.pglist.pages = pages;
+	udpdata.data.pglist.poffset = poffset;
+	udpdata.data.pglist.page_start_index = ( udpdata.data.pglist.poffset ) /
+	  PAGE_SIZE;
+	udpdata.data.pglist.page_end_index = ( udpdata.data.pglist.poffset + 
+					       size - 1 ) / PAGE_SIZE;
+	udpdata.data.pglist.len = size;
+	if ( size > 0 ) {
+		if ((retval = udp_fill_pages(sk, &udpdata)) < 0) 
+			goto error;
+	} else {
+		retval = -EINVAL;
+		goto error;
+	}
+	
+	total_bytes += size;
+	/* Send the application header part */
+	if (hdtr && hdtr->headers) {
+		udpdata.flags |= IOVEC_DATA;
+		udpdata.data.buf.iov = hdr_iov;
+		udpdata.data.buf.iov_cnt = hdtr->hdr_cnt;
+		
+		if (udp_fill_data(sk, &udpdata) < 0) {
+			retval = -EINVAL;
+			goto error;
+		}
+		udpdata.flags &= ~IOVEC_DATA;
+		kfree( hdr_iov );
+		total_bytes += hdr_size;
+	} 
+	retval = total_bytes;
+		
+ error:
+		if ((skb = tp->send_head) != NULL) {
+			kfree_skb(skb);
+			tp->send_head = NULL;
+		}
+ out:
+		return retval;
+}
+
+/*
+ * Wrapper which makes sure that a socket is connected before calling 
+ * udp_sendpages
+ */
+size_t
+udp_sendpages_sock(void *sp, struct page **pages, int poffset, size_t size,
+		   struct sf_hdtr *hdtr, struct sockaddr *sockaddr, int addr_len)
+{
+	
+	int		connected = 0;
+	struct socket	*sock = (struct socket *) sp;
+	struct sock	*sk = sock->sk;
+	int		retval = 0;
+
+	if ((sk->state != TCP_ESTABLISHED) && (!sockaddr || addr_len < 0))
+		return -EINVAL;
+
+	if (sk->state != TCP_ESTABLISHED) {
+		retval = udp_connect(sk, sockaddr, addr_len);
+		if (retval < 0)
+			return retval;
+		connected = 1;
+	}
+	
+	/* Now the socket is connected, let us call udp_sendpages */
+	retval = udp_sendpages(sp, pages, poffset, size, hdtr);
+
+	if (connected)
+		udp_disconnect(sk, 0);
+	return retval;
+}

[Index of Archives]     [Netdev]     [Ethernet Bridging]     [Linux 802.1Q VLAN]     [Linux Wireless]     [Kernel Newbies]     [Security]     [Linux for Hams]     [Netfilter]     [Git]     [Bugtraq]     [Yosemite News and Information]     [MIPS Linux]     [ARM Linux]     [Linux RAID]     [Linux PCI]     [Linux Admin]     [Samba]

  Powered by Linux