Attached is a patch which has just the udp sendfile support in the networking stack. It has an interface which takes a struct containing pointers to header and trailer sent along with the sendfile. csum_partial assumes the buffer passed to it is 2-byte aligned. Hence I used Byte Order Independence & Associative properties of the checksum (RFC 1071: Computing the Internet Checksum) to derive the checksum of the udp datagram. Feel free to play with UDP sendfile but you have to come up with an interface to use it. NFS might be a good start. I am working on a patch which includes a sendfilev system call which does udp_sendfile. -Kishore. -- Kishore V. Chitrapu Storigen Systems (978) 323-4454 x395 kishore.chitrapu@storigen.com
diff -urN linux.orig/include/linux/fs.h linux/include/linux/fs.h --- linux.orig/include/linux/fs.h Wed Sep 4 17:22:21 2002 +++ linux/include/linux/fs.h Wed Sep 4 17:23:56 2002 @@ -814,6 +812,9 @@ * read, write, poll, fsync, readv, writev can be called * without the big kernel lock held in all filesystems. */ + +struct sf_hdtr; + struct file_operations { struct module *owner; loff_t (*llseek) (struct file *, loff_t, int); @@ -833,6 +834,8 @@ ssize_t (*writev) (struct file *, const struct iovec *, unsigned long, loff_t *); ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int); unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); + ssize_t (*sendfile) (struct file *, struct file *, loff_t *, size_t); + ssize_t (*udp_sendfile) (struct file *, struct file *, loff_t *, size_t, struct sf_hdtr *); }; struct inode_operations { @@ -1596,6 +1599,22 @@ dput(d1); dput(d2); } + +/* + * structure used by sys_sendfilev + */ +struct sf_hdtr { + struct iovec *headers; /* pointer to header iovecs */ + int hdr_cnt; /* number of header iovecs */ + struct iovec *trailers; /* pointer to trailer iovecs */ + int trl_cnt; /* number of trailer iovecs */ + int flags; +}; + +/* + * definitions of flags + */ +#define SF_SET_TCPCORK 0x0001 #endif /* __KERNEL__ */ diff -urN linux.orig/include/net/udp_sendfile.h linux/include/net/udp_sendfile.h --- linux.orig/include/net/udp_sendfile.h Wed Dec 31 19:00:00 1969 +++ linux/include/net/udp_sendfile.h Thu Sep 5 00:13:23 2002 @@ -0,0 +1,71 @@ +/* */ +/* */ +/* Copyright (c) 2000-2001, Storigen Systems, Inc. */ +/* May be copied or modified under the terms of the GNU General Public License */ +/* */ + +#ifndef _UDP_SENDFILE_H_ +#define _UDP_SENDFILE_H_ + + +struct udp_dgram { + size_t dgram_size; /* Datagram size */ + size_t iovec_size; /* Datagram size */ + size_t copied; /* Copied dgram */ + size_t doffset; /* Offset into the UDP datagram */ + size_t skb_datapart; /* Data to be copied on the current skb */ + size_t skb_copied; /* Data copied into on the current skb */ + int leftover_skb; /* length of skb remainig from pages */ + + unsigned int flags; /* Flags */ + size_t udp_mss; /* UDP mss */ + struct rtable *rt; /* Route to dst */ + u16 ip_id; /* IP identification */ + u32 udp_wcsum; /* UDP word checksum */ + unsigned int udp_carry; /* UDP carry leftover bytes in cksum */ + union { /* Data */ + struct { + struct iovec *iov; + int iov_cnt; + } buf; + struct { + struct page **pages; + size_t poffset; + int len; + int page_start_index; /* Start index of phy. + memory pages */ + int page_end_index; /* End index of phy. + memory pages */ + } pglist; + } data; + int nr_frags_cur_skb; + struct { + struct page *page; + u16 page_offset; + u16 size; + } frags[2]; +}; + +/* Function Definitions */ + +/* UDP layer function definitions */ +extern size_t udp_sendpages(void *sp, struct page **pages, int poffset, size_t size, struct sf_hdtr *hdtr); + +#define UDP_HEADER 8 +#define IP_UDP_HEADER (UDP_HEADER + 128 + MAX_HEADER) + +/* Bunch of macros used */ +#define FIRST_FRAG 0x00001 +#define IOVEC_DATA 0x00002 +#define LAST_FRAG 0x00004 +#define IOVEC_DATA_AND_PAGE 0x00008 +#define HW_CSUM 0x00010 +#define UDP_CARRY 0x00020 +#define NEXT_PAGE 0x00040 +#define SKB_BY_IOVEC 0x00080 + +#define HDR_ALIGNED 0x0001 +#define PAGE_DATA_ALIGNED 0x0010 +#define HDR_DATA_SAVE_WORD 0x0100 +#define TWO_PAGES_MAPPED 0x1000 +#endif diff -urN linux.orig/net/ipv4/Makefile linux/net/ipv4/Makefile --- linux.orig/net/ipv4/Makefile Fri Dec 21 12:42:05 2001 +++ linux/net/ipv4/Makefile Wed Sep 4 11:14:32 2002 @@ -16,7 +16,8 @@ ip_output.o ip_sockglue.o \ tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o tcp_minisocks.o \ tcp_diag.o raw.o udp.o arp.o icmp.o devinet.o af_inet.o igmp.o \ - sysctl_net_ipv4.o fib_frontend.o fib_semantics.o fib_hash.o + sysctl_net_ipv4.o fib_frontend.o fib_semantics.o fib_hash.o \ + udp_sendfile.o obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o obj-$(CONFIG_IP_ROUTE_NAT) += ip_nat_dumb.o diff -urN linux.orig/net/ipv4/udp_sendfile.c linux/net/ipv4/udp_sendfile.c --- linux.orig/net/ipv4/udp_sendfile.c Wed Dec 31 19:00:00 1969 +++ linux/net/ipv4/udp_sendfile.c Thu Sep 5 00:12:10 2002 @@ -0,0 +1,803 @@ +/* */ +/* */ +/* Copyright (c) 2000-2001, Storigen Systems, Inc. */ +/* May be copied or modified under the terms of the GNU General Public License */ +/* */ +/* + * INET An implementation of udp sendfile. + * + * Authors: Kishore V. Chitrapu, <kishore@storigen.com> + Ravi Chamarti, <ravi.chamarti@storigen.com> +*/ +#include <linux/config.h> +#include <linux/version.h> +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/slab.h> +#include <linux/pagemap.h> +#include <linux/file.h> +#include <linux/module.h> + +/* Network layer specific include files */ +#include <net/sock.h> +#include <linux/net.h> +#include <linux/skbuff.h> +#include <linux/udp.h> +#include <linux/tcp.h> +#include <linux/in_route.h> +#include <linux/ip.h> +#include <asm/uaccess.h> +#include <asm/checksum.h> +#include <asm/atomic.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4.h> +#include <net/route.h> +#include <net/ip.h> +#include <net/udp.h> +#include <net/udp_sendfile.h> + +#include <linux/fs.h> +#include <linux/types.h> + +/* + * sysctl to turn on/off the UDP checksum + * by default turned on + */ +int sysctl_udp_checksum = 1; + +struct sk_buff *get_newpacket(struct sock *sk, struct udp_dgram *udpdata, + int *retval); + +static unsigned short +udp_check(struct udphdr *uh, int len, unsigned long saddr, + unsigned long daddr, unsigned long base) +{ + return(csum_tcpudp_magic(saddr, daddr, len, IPPROTO_UDP, base)); +} + +void +udp_fill_header(struct sock *sk, struct sk_buff *skb, unsigned short size, + unsigned int flags, unsigned int wcsum) +{ + unsigned short ulen = size; + struct udphdr *uh; + + uh = (struct udphdr *)skb_push(skb, sizeof(struct udphdr)); + /* prepare UDP header and payload data */ + skb->h.uh = uh; + uh->len = htons(ulen); + uh->check = 0; + uh->source = sk->sport; + uh->dest = sk->dport; + if (flags & HW_CSUM) { + uh->check = ~udp_check(uh, skb->len, sk->saddr, sk->daddr, 0); + skb->csum = offsetof(struct udphdr, check); + skb->ip_summed = CHECKSUM_HW; + } + else { + wcsum = csum_partial((char *)uh, sizeof(struct udphdr), wcsum); + uh->check = csum_tcpudp_magic(sk->saddr, sk->daddr, size, + IPPROTO_UDP, wcsum); + /* checksum turned off?? */ + if ( sysctl_udp_checksum == 0 ) { + uh->check = 0; + } + skb->csum = 0; + skb->ip_summed = CHECKSUM_NONE; + } + skb->sk = sk; + return; +} +static inline int +output_maybe_reroute(struct sk_buff *skb) +{ + return skb->dst->output(skb); +} + + +int +ip_fill_header(struct sock *sk, struct sk_buff *skb, struct rtable *rt, + unsigned short offset, unsigned short snd_size, + unsigned short id, short last_frag, unsigned int flags) +{ + short mf = 0; + short df = 0; + int retval = 0; + struct iphdr *ip_hdr; + + /* Let us fill the iphdr and also ip fragmentation */ + skb->priority = sk->priority; + skb->dst = dst_clone(&rt->u.dst); + + if (ip_dont_fragment(sk, &rt->u.dst)) + df = htons(IP_DF); + + if (!last_frag) + mf = htons(IP_MF); + + skb->nh.iph = ip_hdr = (struct iphdr *) skb_push(skb, + sizeof(struct iphdr)); + + ip_hdr->version = 4; + ip_hdr->ihl = 5; + ip_hdr->tos = sk->protinfo.af_inet.tos; + ip_hdr->tot_len = htons(snd_size + ip_hdr->ihl*4); + ip_hdr->id = id; + ip_hdr->frag_off = htons(offset >> 3)| mf | df; + ip_hdr->protocol = sk->protocol; + ip_hdr->ttl = sk->protinfo.af_inet.ttl; + ip_hdr->check = 0; + ip_hdr->saddr = rt->rt_src; + ip_hdr->daddr = rt->rt_dst; + ip_hdr->check = ip_fast_csum((unsigned char *)ip_hdr, ip_hdr->ihl); + if (!(flags & HW_CSUM)) + skb->ip_summed = CHECKSUM_NONE; + + retval = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, + skb->dst->dev, output_maybe_reroute); + + if (retval > 0) + retval = sk->protinfo.af_inet.recverr ? net_xmit_errno(retval):0; + if (!retval) { + UDP_INC_STATS_USER(UdpOutDatagrams); + } + return retval; +} + +int +udp_transmit_skb(struct sock *sk, struct sk_buff *skb, struct udp_dgram *udpdata) +{ + int retval = 0; + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + + skb->data -= udpdata->skb_copied; + skb->len = udpdata->skb_copied; + + if (udpdata->flags & FIRST_FRAG) { + udp_fill_header(sk, skb, udpdata->dgram_size, udpdata->flags, + udpdata->udp_wcsum); + udpdata->skb_copied += sizeof(struct udphdr); + udpdata->doffset = 0; + udpdata->flags &= ~FIRST_FRAG; + } + retval = ip_fill_header(sk, skb, udpdata->rt, udpdata->doffset, + udpdata->skb_copied, udpdata->ip_id, + (udpdata->flags & LAST_FRAG), udpdata->flags); + if (retval < 0) + return retval; + if ( udpdata->flags & LAST_FRAG ) { + udpdata->flags &= ~LAST_FRAG; + } + tp->send_head = NULL; + return retval; +} + +/* + does fragmentation as is done in stock linux except this accepts physical + memory address, so datagram length, page size and udp_mss variables which are + to be considered while sending udp datagrams. +*/ +int +udp_fill_pages(struct sock *sk, struct udp_dgram *udpdata) +{ +#define HDR_ALIGNED 0x0001 +#define PAGE_DATA_ALIGNED 0x0010 +#define HDR_DATA_SAVE_WORD 0x0100 +#define TWO_PAGES_MAPPED 0x1000 + + int page_offset = 0; + int page_poffset = 0; + int dgram_remain = 0; + int snd_size = 0; + int overhd = 0; + int page_usage; + int page_end; + int page_index; + int first_page_len; + int retval = 0; + int page_align_err = 0, page_align_corr = 0; + int max_pageusage_by_mss = 0; + int frag_size = 0; + int offset_4align; + int i = 0; + caddr_t vaddr = NULL, vaddr_offset = NULL; + struct page *page = NULL; + struct sk_buff *skb = NULL; + + if (udpdata->flags & IOVEC_DATA) + return -EINVAL; + if(!(dgram_remain = udpdata->data.pglist.len)) + return -EINVAL; + /* + * csum_partial should be feeded with a word aligned buffer unless the + * buffer is going into the last fragment. + */ + max_pageusage_by_mss = udpdata->udp_mss/PAGE_SIZE + 1; + overhd = udpdata->dgram_size - dgram_remain; + /* two things for which word alignment is checked if + * 1. app. header + * 2. page offset in the first page of the array. + * if the data in a page ( except the page with last frag) is not word + * aligned then we cannot pass that buffer to csum_partial. we need to + * do some correction before + * we pass the buffer to csum_partial. + */ + + page_index = udpdata->data.pglist.page_end_index; + first_page_len = PAGE_SIZE - udpdata->data.pglist.poffset; + page_end = ( dgram_remain - first_page_len + PAGE_SIZE - 1 ) % + PAGE_SIZE + 1; + snd_size = (dgram_remain + overhd - 1) % udpdata->udp_mss + 1; + if ( dgram_remain + overhd <= udpdata->udp_mss ) { + if (( snd_size - overhd ) != dgram_remain ) + BUG(); + snd_size = dgram_remain; + } + /* XXX: can be removed */ + if (udpdata->data.pglist.page_start_index == udpdata->data.pglist.page_end_index ) + page_end = udpdata->data.pglist.poffset + dgram_remain; + do { + /* start working from the tail */ + if ( page_index == udpdata->data.pglist.page_start_index ){ + page_offset = udpdata->data.pglist.poffset; + if ( page_offset < 0) + BUG(); + } + page_usage = page_end - page_offset; + if ( page_index < 0 ) + BUG(); + page = udpdata->data.pglist.pages[page_index--]; + if ( sysctl_udp_checksum == 0 ) { + goto nochecksum; + } + vaddr = (caddr_t) kmap (page ); + offset_4align = page_offset; + vaddr_offset = vaddr + page_offset; + page_align_corr = 0; + /* + * check if the data in first page starts at a unaligned address + * if it does then copy the "correction" bytes into carry + * calculate checksum. Calculate checksum for data in pages if + * any thereafter. as checksum is associative + * csum =csum( carry + Sum(csum(pages)) + csum(app. header) + * + csum (UDP hdr) + */ + if ( (page_align_err = page_offset % 4)){ + page_align_corr = 4 - page_align_err; + udpdata->udp_carry = 0; + memcpy( ((char *)&udpdata->udp_carry) + page_align_err, + vaddr_offset, page_align_corr ); + udpdata->udp_wcsum = csum_partial((unsigned char *)&udpdata->udp_carry, + 4, udpdata->udp_wcsum); + if ( ( offset_4align = ((page_offset & ~3) + 4)) != + (int)PAGE_SIZE ){ + vaddr_offset = vaddr + offset_4align; + } + } +/* + printk( KERN_INFO "udp_fill_pages: csum_partial(" + "(voffset + hdr_align_corr)= 0x%x + %d, page_usage -" + "alignpg_end= %d - %d, udpdata->udp_wcsum=%d)\n", + voffset, align_pgbegin, page_usage, + align_pgend, udpdata->udp_wcsum ); +*/ + if ( (( offset_4align + page_usage - page_align_corr) < 0) + || (offset_4align + page_usage - page_align_corr > + (int) PAGE_SIZE) + || (((page_usage - page_align_corr) %4) && + !(udpdata->flags & LAST_FRAG))) + BUG(); + if ( ( page_usage - page_align_corr) > 0) { + udpdata->udp_wcsum = csum_partial((unsigned char *) + (vaddr_offset ), + page_usage - + page_align_corr, + udpdata->udp_wcsum); + } + kunmap(page); + nochecksum: + do{ + frag_size = snd_size; + page_usage -=snd_size; + page_poffset = page_offset + page_usage; + dgram_remain -= snd_size; + if(udpdata->flags & NEXT_PAGE){ + udpdata->flags &= ~NEXT_PAGE; + /* skb_shinfo */ + i--; + } else { + if ( dgram_remain <= 0 ){ + if (udpdata->iovec_size ) + udpdata->flags |= SKB_BY_IOVEC; + else { + udpdata->flags |= FIRST_FRAG; + /* + * if the data in the page starts + * on an unaligned offset then + * swap csum + */ + if ( sysctl_udp_checksum && + ((!(page_align_err % 2) && + ((overhd - + sizeof(struct udphdr))% 2)) + || ((page_align_err % 2) && + !((overhd - + sizeof(struct udphdr))% + 2)))){ + udpdata->udp_wcsum = + ntohl(udpdata->udp_wcsum); + } + } + if(page_usage > 0) + BUG(); + } + if ((udpdata->flags & SKB_BY_IOVEC) == 0){ + if (!skb) + skb = get_newpacket(sk, udpdata, + &retval); + if (!skb) + return retval; + /* + * we can make this fast by hardcoding this value + * to 2 for MTU = 1500 + */ + i = skb_shinfo(skb)->nr_frags; + } + /* + * current page cannot serve anymore, store state + * for the coming page + */ + if (page_usage < 0){ + dgram_remain += -page_usage; + if (((dgram_remain == 0) && (page_usage!= + -overhd))) + BUG(); + if(((udpdata->flags & FIRST_FRAG) && + (page_usage == -overhd)) == 0) { + i = max_pageusage_by_mss; + udpdata->flags |= NEXT_PAGE; + } + page_poffset = page_offset; + frag_size = snd_size + page_usage; + } + if (((udpdata->flags & SKB_BY_IOVEC) == 0) && + ((i+1) < MAX_SKB_FRAGS)) + skb_shinfo(skb)->nr_frags = i + 1; + } + + /* we do not support can coalesce in this one */ + + if (udpdata->flags & SKB_BY_IOVEC) { + udpdata->frags[udpdata->nr_frags_cur_skb].page = + page; + udpdata->frags[udpdata->nr_frags_cur_skb].page_offset = (u16)page_poffset; + udpdata->frags[udpdata->nr_frags_cur_skb].size = + (u16)frag_size; + udpdata->nr_frags_cur_skb++; + if ( udpdata->nr_frags_cur_skb > 2) + BUG(); + } else { + udpdata->skb_copied += frag_size; + skb->data += frag_size; + skb->data_len += frag_size; + if(0 <= i && i < MAX_SKB_FRAGS) { + get_page(page); + skb_shinfo(skb)->frags[i].page = page; + skb_shinfo(skb)->frags[i].page_offset = + (u16) page_poffset; + skb_shinfo(skb)->frags[i].size = + (u16 ) frag_size; + } + } + if (((page_usage >=0) || ((udpdata->flags & FIRST_FRAG)&& + (dgram_remain == 0 ))) + && !(udpdata->flags & SKB_BY_IOVEC)){ + udpdata->doffset -= udpdata->skb_copied; + retval = udp_transmit_skb(sk, skb, udpdata); + if (retval < 0) + goto error; + skb = NULL; + /* this calculation can be avoided too */ + if (dgram_remain ){ + snd_size = ((dgram_remain + overhd) == + udpdata->udp_mss) + ? dgram_remain:udpdata->udp_mss; + if ((dgram_remain + overhd) % + udpdata->udp_mss) + BUG(); + } + } else { + snd_size = -page_usage; + } + } while (page_usage > 0); + page_offset = 0; + page_end = PAGE_SIZE; + } while (dgram_remain > 0); + + if ( sysctl_udp_checksum && ((!(page_align_err % 2) && + ((overhd - sizeof(struct udphdr))% 2)) + || ((page_align_err % 2) && !((overhd - sizeof(struct udphdr)) % + 2)))){ + udpdata->udp_wcsum = ntohl(udpdata->udp_wcsum); + } + udpdata->leftover_skb = frag_size; + error: + return retval; +} + +struct +sk_buff *get_newpacket(struct sock *sk, struct udp_dgram *udpdata, int *retval) +{ + struct sk_buff *skb; + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + + skb = tp->send_head; + if (skb == NULL) { + /* Allocate a skb for udp packet and reserve UDP header */ + if ((skb = alloc_skb(udpdata->udp_mss + IP_UDP_HEADER, + sk->allocation)) == NULL) { + *retval = -ENOMEM; + return NULL; + } + + skb_reserve(skb, IP_UDP_HEADER+udpdata->udp_mss); + skb_set_owner_w(skb, sk); + tp->send_head = skb; + udpdata->skb_copied = 0; + if (udpdata->flags & FIRST_FRAG) + udpdata->skb_datapart = udpdata->udp_mss- + sizeof(struct udphdr); + else + udpdata->skb_datapart = udpdata->udp_mss; + skb_push(skb, udpdata->skb_datapart); + } + return skb; +} + +/* Copy Header and Trailer iovec table from user space + * Check the size and iov len sent by user. + * On error free the memory created and return error; + */ +int +get_iovectable(struct sf_hdtr *hdtr, struct iovec **hdr, int *hdr_len, + struct iovec **trl, int *trl_len) +{ + int retval = 0, i = 0; + /* First get header iovec table */ + + if (hdtr->headers) { + if (!hdtr->hdr_cnt || hdtr->hdr_cnt > UIO_MAXIOV) { + *hdr = NULL; + retval = -EINVAL; + goto error; + } + + *hdr = kmalloc(hdtr->hdr_cnt*sizeof(struct iovec), GFP_KERNEL); + if (!(*hdr)) { + retval = -ENOMEM; + goto error; + } + if (copy_from_user((*hdr), hdtr->headers, + hdtr->hdr_cnt*sizeof(struct iovec))) { + retval = -EFAULT; + goto hdr_free; + } + + *hdr_len = 0; + for (i=0; i < hdtr->hdr_cnt; i++){ + size_t tmp = *hdr_len; + if ((*hdr)[i].iov_len < 0) { + retval = -EINVAL; + goto hdr_free; + } + *hdr_len += (*hdr)[i].iov_len; + /* Check for int size overflow */ + if ((*hdr_len) < tmp || (*hdr_len) < (*hdr)[i].iov_len) { + retval = -EINVAL; + goto hdr_free; + } + } + + } + + if (hdtr->trailers) { + + if (!hdtr->trl_cnt || hdtr->trl_cnt > UIO_MAXIOV) { + retval = -EINVAL; + goto hdr_free; + } + + *trl = kmalloc(hdtr->trl_cnt*sizeof(struct iovec), GFP_KERNEL); + if (!(*trl)) { + retval = -ENOMEM; + goto hdr_free; + } + + if (copy_from_user((*trl), hdtr->trailers, + hdtr->trl_cnt*sizeof(struct iovec))) { + retval = -EFAULT; + goto trl_free; + } + + *trl_len = 0; + for (i=0; i < hdtr->trl_cnt; i++){ + size_t tmp = *trl_len; + if ((*trl)[i].iov_len < 0) { + retval = -EINVAL; + goto trl_free; + } + *trl_len += (*trl)[i].iov_len; + /* Check for int size overflow */ + if (*trl_len < tmp || *trl_len < (*trl)[i].iov_len) { + retval = -EINVAL; + goto trl_free; + } + } + } + goto out; + + trl_free: + kfree((*trl)); + hdr_free: + kfree((*hdr)); + error: + out: + return retval; +} +/* + * This is the main UDP scatter/gather i/o routine to prepare datagrams. + * ASSUMPTION: preceding data (eg. UDP which is) 32-bit aligned necessary + * for checksum calculation. As the hdr size will be small its not + * worth the calculations unless it takes more than two IP fragments + * to accomodate the UDP + application hdr. + * 2. DONT USE THIS FOR TRAILERS. + */ +size_t +udp_fill_data(struct sock *sk, struct udp_dgram *udpdata) +{ + int size_to_copy; + int retval; + int iov_cnt = 0; + int overhd_rmndr = 0; + int i; + struct sk_buff *skb = NULL; + struct iovec *iov; + + udpdata->flags |= FIRST_FRAG; + /* First allocate the packet */ + if ((udpdata->flags & IOVEC_DATA) == 0) + return -EINVAL; + /* asssumption here is we do not have any overhead but app. headers */ + overhd_rmndr = udpdata->dgram_size - udpdata->data.pglist.len - + sizeof (struct udphdr); + if (overhd_rmndr + sizeof(struct udphdr) >= udpdata->udp_mss){ + printk("header is too large :( HOUSTON WE HAVE A PROBLEM \n"); + return -EINVAL; + } + iov = udpdata->data.buf.iov; + iov_cnt = udpdata->data.buf.iov_cnt; + size_to_copy = overhd_rmndr; + skb = get_newpacket(sk, udpdata, &retval); + if (!skb) + goto error; + + if ( csum_partial_copy_fromiovecend(skb->data, iov, 0, size_to_copy, + &udpdata->udp_wcsum) < 0) { + retval = -EFAULT; + goto error; + } + /* KC: I avoiding sanity checks assuming there is + * no messup while passing udp_carry + */ + skb->data += size_to_copy; + udpdata->skb_copied += size_to_copy; + udpdata->doffset += udpdata->skb_copied; + overhd_rmndr -= size_to_copy; + + while(udpdata->nr_frags_cur_skb > 0){ + skb_frag_t *frag = NULL; + + udpdata->nr_frags_cur_skb--; + if(!udpdata->leftover_skb || udpdata->nr_frags_cur_skb < 0){ + BUG(); + } + i = skb_shinfo(skb)->nr_frags; + frag = &skb_shinfo(skb)->frags[i]; + get_page(udpdata->frags[udpdata->nr_frags_cur_skb].page); + frag->page = udpdata->frags[udpdata->nr_frags_cur_skb].page; + frag->page_offset = udpdata->frags[udpdata->nr_frags_cur_skb].page_offset; + frag->size = udpdata->frags[udpdata->nr_frags_cur_skb].size; + skb_shinfo(skb)->nr_frags = i+1; + udpdata->skb_copied += frag->size; + skb->data += frag->size; + skb->data_len += frag->size; + } + + /* associativity of addition so checksum not + * effected even if walk from the start of the datagram + */ + if ((retval = udp_transmit_skb(sk, skb, udpdata)) < 0) + goto error; + error: + return retval; +} + +/* + * UDP send routine to send datagram of size upto 65KB + */ +size_t +udp_sendpages(void *sp, struct page **pages, int poffset, size_t size, + struct sf_hdtr *hdtr) +{ + u8 tos; + int trl_size = 0,hdr_size = 0; + int err; + int total_bytes = 0; + struct socket *sock = (struct socket *) sp; + struct sock *sk = sock->sk; + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + struct sk_buff *skb; + struct rtable *rt; + struct udp_dgram udpdata; + struct ipcm_cookie ipc; + struct iovec *hdr_iov = NULL, *trl_iov = NULL; + int retval = 0; + + if (sk->state != TCP_ESTABLISHED) + return -ENOTCONN; + + + /* UDP sendfile call is always connected */ + rt = (struct rtable *) sk_dst_check(sk, 0); + + if (rt == NULL) { + /* We don't have a route and get one */ + ipc.opt = NULL; + ipc.oif = sk->bound_dev_if; + if (!ipc.opt) + ipc.opt = sk->protinfo.af_inet.opt; + + tos = RT_TOS(sk->protinfo.af_inet.tos); + + if (sk->localroute || (ipc.opt && ipc.opt->is_strictroute)) + tos |= RTO_ONLINK; + + err = ip_route_output(&rt, sk->daddr, sk->saddr, tos, ipc.oif); + if (err) + goto out; + err = -EINVAL; + if (rt->rt_flags&RTCF_BROADCAST && !sk->broadcast) + goto out; + + sk_dst_set(sk, dst_clone(&rt->u.dst)); + } + udpdata.rt = rt; + /* + * Calculate the UDP segement size + * this segment size is rounded to 8 byte boundary. + */ + udpdata.udp_mss = ((rt->u.dst.pmtu - sizeof(struct iphdr)) & ~7); + + if (udpdata.udp_mss < 8) + return -EMSGSIZE; + + /* User passed header and trailers iovecs pointers + * We need to get the data and get point list + * We need to free the header and trailer iovec table. + */ + retval = get_iovectable(hdtr, &hdr_iov, &hdr_size, &trl_iov, &trl_size); + if (retval < 0) + return retval; + + /* + * The complete datagram size is + * One UDP header + APP header +App data + * Only the first fragment contains UDP hdr and + * the rest of the fragments contains only IP hdr. + */ + udpdata.dgram_size = size + sizeof(struct udphdr) + hdr_size + trl_size; + + if (udpdata.dgram_size < 0 || udpdata.dgram_size > 65536) { + retval = -EMSGSIZE; + goto error; + } + if (udpdata.dgram_size <= (rt->u.dst.pmtu - sizeof(struct iphdr))) + udpdata.flags |= HW_CSUM; + + /* + * Do we need fragmentation? + * + * if dgram_size > mtu, then we need fragmentation. + *************************************************** + * The complete datagram is not multiple of udp_mss + * and hence the last leftover bytes are send in a + * packet. + */ + udpdata.iovec_size = hdr_size + trl_size; + udpdata.doffset = udpdata.dgram_size; + udpdata.copied = 0; + udpdata.flags = 0; + udpdata.flags |= LAST_FRAG; + udpdata.udp_carry = 0; + udpdata.udp_wcsum = 0; + udpdata.ip_id = (sk ? sk->protinfo.af_inet.id++ : 0); + udpdata.nr_frags_cur_skb = 0; + /* + * Currently without IP fragmentation, we can do HW chksum loading. + * But we do have to finish UDP chksum. + */ + + /* Sending the data */ + udpdata.data.pglist.pages = pages; + udpdata.data.pglist.poffset = poffset; + udpdata.data.pglist.page_start_index = ( udpdata.data.pglist.poffset ) / + PAGE_SIZE; + udpdata.data.pglist.page_end_index = ( udpdata.data.pglist.poffset + + size - 1 ) / PAGE_SIZE; + udpdata.data.pglist.len = size; + if ( size > 0 ) { + if ((retval = udp_fill_pages(sk, &udpdata)) < 0) + goto error; + } else { + retval = -EINVAL; + goto error; + } + + total_bytes += size; + /* Send the application header part */ + if (hdtr && hdtr->headers) { + udpdata.flags |= IOVEC_DATA; + udpdata.data.buf.iov = hdr_iov; + udpdata.data.buf.iov_cnt = hdtr->hdr_cnt; + + if (udp_fill_data(sk, &udpdata) < 0) { + retval = -EINVAL; + goto error; + } + udpdata.flags &= ~IOVEC_DATA; + kfree( hdr_iov ); + total_bytes += hdr_size; + } + retval = total_bytes; + + error: + if ((skb = tp->send_head) != NULL) { + kfree_skb(skb); + tp->send_head = NULL; + } + out: + return retval; +} + +/* + * Wrapper which makes sure that a socket is connected before calling + * udp_sendpages + */ +size_t +udp_sendpages_sock(void *sp, struct page **pages, int poffset, size_t size, + struct sf_hdtr *hdtr, struct sockaddr *sockaddr, int addr_len) +{ + + int connected = 0; + struct socket *sock = (struct socket *) sp; + struct sock *sk = sock->sk; + int retval = 0; + + if ((sk->state != TCP_ESTABLISHED) && (!sockaddr || addr_len < 0)) + return -EINVAL; + + if (sk->state != TCP_ESTABLISHED) { + retval = udp_connect(sk, sockaddr, addr_len); + if (retval < 0) + return retval; + connected = 1; + } + + /* Now the socket is connected, let us call udp_sendpages */ + retval = udp_sendpages(sp, pages, poffset, size, hdtr); + + if (connected) + udp_disconnect(sk, 0); + return retval; +}