This patch modifies tun so that a vringfd can supply the send buffer:
the user pushes packets out of the buffer by writing to the ring fd.
As with the receive side, more thought needs to be put into the
possible races with ring registration.

We again use 'struct virtio_net_hdr' so that userspace can send GSO
packets: hdr_len hints how much to copy into the linear skb head, and
the remaining pages are pinned and attached as skb fragments.

Signed-off-by: Rusty Russell <rusty@xxxxxxxxxxxxxxx>

diff -r 8270b5fdf03f drivers/net/tun.c
--- a/drivers/net/tun.c	Sat Apr 05 22:49:10 2008 +1100
+++ b/drivers/net/tun.c	Sat Apr 05 22:51:10 2008 +1100
@@ -101,7 +101,7 @@ struct tun_struct {
 	u32			chr_filter[2];
 	u32			net_filter[2];
 
-	struct vring_info	*inring;
+	struct vring_info	*inring, *outring;
 
 #ifdef TUN_DEBUG
 	int debug;
@@ -258,6 +258,162 @@ static void tun_net_init(struct net_devi
 	}
 }
 
+/* We don't consolidate consecutive iovecs, so huge iovecs can break here.
+ * Users will learn not to do that. */
+static int get_user_skb_frags(const struct iovec *iv, size_t len,
+			      struct skb_frag_struct *f)
+{
+	unsigned int i, j, num_pg = 0;
+	int err;
+	struct page *pages[MAX_SKB_FRAGS];
+
+	down_read(&current->mm->mmap_sem);
+	while (len) {
+		int n, npages;
+		unsigned long base, seglen;
+
+		/* Use 'seglen' for this iovec: a local called 'len' would
+		 * shadow the total we are counting down. */
+		base = (unsigned long)iv->iov_base;
+		seglen = (unsigned long)iv->iov_len;
+		iv++;
+
+		if (seglen == 0)
+			continue;
+
+		/* Don't pin more than the requested total. */
+		if (seglen > len)
+			seglen = len;
+
+		/* How many pages will this take? */
+		npages = 1 + (base + seglen - 1)/PAGE_SIZE - base/PAGE_SIZE;
+		if (unlikely(num_pg + npages > MAX_SKB_FRAGS)) {
+			err = -ENOSPC;
+			goto fail;
+		}
+		n = get_user_pages(current, current->mm, base, npages,
+				   0, 0, pages, NULL);
+		if (unlikely(n < 0)) {
+			err = n;
+			goto fail;
+		}
+
+		/* Transfer pages to the frag array */
+		for (j = 0; j < n; j++) {
+			f[num_pg].page = pages[j];
+			if (j == 0) {
+				f[num_pg].page_offset = offset_in_page(base);
+				f[num_pg].size = min(seglen, PAGE_SIZE -
+						     f[num_pg].page_offset);
+			} else {
+				f[num_pg].page_offset = 0;
+				f[num_pg].size = min(seglen, PAGE_SIZE);
+			}
+			seglen -= f[num_pg].size;
+			len -= f[num_pg].size;
+			base += f[num_pg].size;
+			num_pg++;
+		}
+
+		if (unlikely(n != npages)) {
+			err = -EFAULT;
+			goto fail;
+		}
+	}
+	up_read(&current->mm->mmap_sem);
+	return num_pg;
+
+fail:
+	for (i = 0; i < num_pg; i++)
+		put_page(f[i].page);
+	up_read(&current->mm->mmap_sem);
+	return err;
+}
+
+/* Get packet from user space buffer.  copylen is a hint as to how
+ * much to copy (the rest is pinned). */
+static struct sk_buff *get_user_skb(struct tun_struct *tun, struct iovec *iv,
+				    size_t copylen, size_t len, int extra)
+{
+	struct tun_pi pi = { 0, __constant_htons(ETH_P_IP) };
+	struct sk_buff *skb;
+	size_t align = 0;
+	int err;
+
+	/* You can't have user fragments without room for destruction info. */
+	BUG_ON(!extra && copylen != len);
+
+	if (!(tun->flags & TUN_NO_PI)) {
+		if (len < sizeof(pi)) {
+			err = -EINVAL;
+			goto fail;
+		}
+		len -= sizeof(pi);
+
+		if (memcpy_fromiovec((void *)&pi, iv, sizeof(pi))) {
+			err = -EFAULT;
+			goto fail;
+		}
+		if (copylen > len)
+			copylen = len;
+	}
+
+	if ((tun->flags & TUN_TYPE_MASK) == TUN_TAP_DEV) {
+		align = NET_IP_ALIGN;
+		if (unlikely(copylen < ETH_HLEN)) {
+			if (len < ETH_HLEN) {
+				err = -EINVAL;
+				goto fail;
+			}
+			copylen = ETH_HLEN;
+		}
+	}
+
+	/* We don't need a destructor if we don't have fragments. */
+	if (extra && copylen == len)
+		extra = 0;
+
+	if (!(skb = __alloc_skb(copylen + align, GFP_KERNEL, 0, extra, -1))) {
+		err = -ENOMEM;
+		goto fail;
+	}
+
+	if (align)
+		skb_reserve(skb, align);
+	if (memcpy_fromiovec(skb_put(skb, copylen), iv, copylen)) {
+		err = -EFAULT;
+		goto free_skb;
+	}
+
+	switch (tun->flags & TUN_TYPE_MASK) {
+	case TUN_TUN_DEV:
+		skb_reset_mac_header(skb);
+		skb->protocol = pi.proto;
+		skb->dev = tun->dev;
+		break;
+	case TUN_TAP_DEV:
+		skb->protocol = eth_type_trans(skb, tun->dev);
+		break;
+	}
+
+	if (tun->flags & TUN_NOCHECKSUM)
+		skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+	/* Anything left gets put into frags. */
+	if (extra) {
+		struct skb_shared_info *sinfo = skb_shinfo(skb);
+
+		err = get_user_skb_frags(iv, len - copylen, sinfo->frags);
+		if (err < 0)
+			goto free_skb;
+		sinfo->nr_frags = err;
+
+		/* skb_put() only covered the copied head: account for
+		 * the pinned fragment bytes as well. */
+		skb->data_len = len - copylen;
+		skb->len += skb->data_len;
+	}
+	tun->dev->last_rx = jiffies;
+
+	tun->dev->stats.rx_packets++;
+	tun->dev->stats.rx_bytes += len;
+
+	return skb;
+
+free_skb:
+	kfree_skb(skb);
+fail:
+	tun->dev->stats.rx_dropped++;
+	return ERR_PTR(err);
+}
+
 #ifdef CONFIG_VRINGFD
 static void unset_recv(void *_tun)
 {
@@ -362,8 +518,118 @@ static int set_recv_vring(struct tun_str
 	tun->inring = vi;
 	return 0;
 }
+
+static void unset_xmit(void *_tun)
+{
+	struct tun_struct *tun = _tun;
+
+	tun->outring = NULL;
+}
+
+struct skb_shinfo_tun {
+	struct tun_struct *tun;
+
+	unsigned int id;
+	unsigned int len;
+};
+
+/* We are done with this skb: put it in the used pile. */
+static void skb_finished(struct skb_shared_info *sinfo)
+{
+	struct skb_shinfo_tun *sht = (void *)(sinfo + 1);
+
+	/* FIXME: Race prevention */
+	vring_used_buffer_atomic(sht->tun->outring, sht->id, sht->len);
+	vring_wake(sht->tun->outring);
+
+	/* Release device. */
+	dev_put(sht->tun->dev);
+}
+
+static int xmit_packets(void *_tun)
+{
+	struct tun_struct *tun = _tun;
+	struct iovec iov[1+MAX_SKB_FRAGS];
+	unsigned int iovnum = ARRAY_SIZE(iov);
+	int id, err, wake = 0;
+	unsigned long len;
+
+	while ((id = vring_get_buffer(tun->outring, NULL, NULL, NULL,
+				      iov, &iovnum, &len)) > 0) {
+		struct virtio_net_hdr h;
+		struct sk_buff *skb;
+		struct skb_shared_info *shinfo;
+		struct skb_shinfo_tun *sht;
+
+		if (unlikely(len < sizeof(h)))
+			return -EINVAL;
+
+		err = memcpy_fromiovec((void *)&h, iov, sizeof(h));
+		if (unlikely(err))
+			return -EFAULT;
+
+		len -= sizeof(h);
+		if (h.hdr_len > len)
+			return -EINVAL;
+
+		/* Without GSO, we copy the entire packet. */
+		if (h.gso_type == VIRTIO_NET_HDR_GSO_NONE)
+			h.hdr_len = len;
+
+		skb = get_user_skb(tun, iov, h.hdr_len, len, sizeof(*sht));
+		if (IS_ERR(skb))
+			return PTR_ERR(skb);
+
+		if ((h.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
+		    !skb_partial_csum_set(skb, h.csum_start, h.csum_offset)) {
+			kfree_skb(skb);
+			return -EINVAL;
+		}
+
+		shinfo = skb_shinfo(skb);
+		/* If it has fragments, set up destructor for later. */
+		if (shinfo->nr_frags) {
+			sht = (void *)(shinfo + 1);
+			shinfo->destructor = skb_finished;
+			sht->tun = tun;
+			sht->id = id;
+			sht->len = sizeof(h) + skb->len;
+			/* The destructor does dev_put(): hold the device
+			 * until it runs. */
+			dev_hold(tun->dev);
+		} else {
+			vring_used_buffer(tun->outring, id, sizeof(h)+skb->len);
+			wake = 1;
+		}
+		netif_rx_ni(skb);
+	}
+
+	if (wake)
+		vring_wake(tun->outring);
+
+	/* 0 or error. */
+	return id;
+}
+
+static struct vring_ops xmitops = {
+	.destroy = unset_xmit,
+	.push = xmit_packets,
+};
+
+static int set_xmit_vring(struct tun_struct *tun, int fd)
+{
+	struct vring_info *vi;
+
+	/* FIXME: Racy. */
+	vi = vring_attach(fd, &xmitops, tun, false);
+	if (IS_ERR(vi))
+		return PTR_ERR(vi);
+	tun->outring = vi;
+	return 0;
+}
 #else /* ... !CONFIG_VRINGFD */
 static int set_recv_vring(struct tun_struct *tun, int fd)
+{
+	return -ENOTTY;
+}
+
+static int set_xmit_vring(struct tun_struct *tun, int fd)
 {
 	return -ENOTTY;
 }
@@ -390,74 +656,26 @@ static unsigned int tun_chr_poll(struct
 	return mask;
 }
 
-/* Get packet from user space buffer */
-static __inline__ ssize_t tun_get_user(struct tun_struct *tun, struct iovec *iv, size_t count)
-{
-	struct tun_pi pi = { 0, __constant_htons(ETH_P_IP) };
-	struct sk_buff *skb;
-	size_t len = count, align = 0;
-
-	if (!(tun->flags & TUN_NO_PI)) {
-		if ((len -= sizeof(pi)) > count)
-			return -EINVAL;
-
-		if(memcpy_fromiovec((void *)&pi, iv, sizeof(pi)))
-			return -EFAULT;
-	}
-
-	if ((tun->flags & TUN_TYPE_MASK) == TUN_TAP_DEV) {
-		align = NET_IP_ALIGN;
-		if (unlikely(len < ETH_HLEN))
-			return -EINVAL;
-	}
-
-	if (!(skb = alloc_skb(len + align, GFP_KERNEL))) {
-		tun->dev->stats.rx_dropped++;
-		return -ENOMEM;
-	}
-
-	if (align)
-		skb_reserve(skb, align);
-	if (memcpy_fromiovec(skb_put(skb, len), iv, len)) {
-		tun->dev->stats.rx_dropped++;
-		kfree_skb(skb);
-		return -EFAULT;
-	}
-
-	switch (tun->flags & TUN_TYPE_MASK) {
-	case TUN_TUN_DEV:
-		skb_reset_mac_header(skb);
-		skb->protocol = pi.proto;
-		skb->dev = tun->dev;
-		break;
-	case TUN_TAP_DEV:
-		skb->protocol = eth_type_trans(skb, tun->dev);
-		break;
-	};
-
-	if (tun->flags & TUN_NOCHECKSUM)
-		skb->ip_summed = CHECKSUM_UNNECESSARY;
-
-	netif_rx_ni(skb);
-	tun->dev->last_rx = jiffies;
-
-	tun->dev->stats.rx_packets++;
-	tun->dev->stats.rx_bytes += len;
-
-	return count;
-}
-
 static ssize_t tun_chr_aio_write(struct kiocb *iocb, const struct iovec *iv,
 				 unsigned long count, loff_t pos)
 {
 	struct tun_struct *tun = iocb->ki_filp->private_data;
+	size_t len;
+	struct sk_buff *skb;
 
 	if (!tun)
 		return -EBADFD;
 
 	DBG(KERN_INFO "%s: tun_chr_write %ld\n", tun->dev->name, count);
 
-	return tun_get_user(tun, (struct iovec *) iv, iov_length(iv, count));
+	len = iov_length(iv, count);
+
+	skb = get_user_skb(tun, (struct iovec *)iv, len, len, 0);
+	if (IS_ERR(skb))
+		return PTR_ERR(skb);
+
+	netif_rx_ni(skb);
+	return len;
 }
 
 /* Put packet to the user space buffer */
@@ -795,7 +1013,10 @@ static int tun_chr_ioctl(struct inode *i
 #endif
 
 	case TUNSETRECVVRING:
-		return set_recv_vring(tun, arg);
+		return set_recv_vring(tun, arg);
+
+	case TUNSETXMITVRING:
+		return set_xmit_vring(tun, arg);
 
 	case SIOCGIFFLAGS:
 		ifr.ifr_flags = tun->if_flags;
diff -r 8270b5fdf03f include/linux/if_tun.h
--- a/include/linux/if_tun.h	Sat Apr 05 22:49:10 2008 +1100
+++ b/include/linux/if_tun.h	Sat Apr 05 22:51:10 2008 +1100
@@ -43,6 +43,7 @@
 #define TUNSETLINK   _IOW('T', 205, int)
 #define TUNSETGROUP  _IOW('T', 206, int)
 #define TUNSETRECVVRING _IOW('T', 207, int)
+#define TUNSETXMITVRING _IOW('T', 208, int)
 
 /* TUNSETIFF ifr flags */
 #define IFF_TUN		0x0001
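
For reference, a rough sketch of the userspace side.  The ring fd itself
comes from the earlier (unmerged) vringfd patches in this series and is
not shown here, so treat the vring_fd argument as an assumption;
TUNSETXMITVRING is defined locally since it only exists with this patch
applied.  The TUNSETIFF and virtio_net_hdr parts follow the existing tun
interface.

	/* Sketch only: vring_fd is assumed to have been created via the
	 * vringfd series' API; TUNSETXMITVRING is from this patch. */
	#include <fcntl.h>
	#include <string.h>
	#include <sys/ioctl.h>
	#include <linux/if.h>
	#include <linux/if_tun.h>
	#include <linux/virtio_net.h>

	#ifndef TUNSETXMITVRING
	#define TUNSETXMITVRING _IOW('T', 208, int)
	#endif

	/* Open a tap device and attach a vring as its transmit ring. */
	static int tap_with_xmit_ring(const char *name, int vring_fd)
	{
		struct ifreq ifr;
		int fd = open("/dev/net/tun", O_RDWR);

		if (fd < 0)
			return -1;

		memset(&ifr, 0, sizeof(ifr));
		/* IFF_NO_PI: ring buffers then start with virtio_net_hdr
		 * alone, with no struct tun_pi before the frame. */
		ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
		strncpy(ifr.ifr_name, name, IFNAMSIZ);
		if (ioctl(fd, TUNSETIFF, &ifr) != 0)
			return -1;

		if (ioctl(fd, TUNSETXMITVRING, vring_fd) != 0)
			return -1;
		return fd;
	}

	/* Each buffer pushed into the ring starts with this header.  For a
	 * non-GSO frame the kernel ignores hdr_len and copies everything;
	 * a GSO sender sets gso_type/gso_size and hdr_len to the bytes it
	 * wants copied into the linear head, and the rest of the buffer is
	 * pinned as fragments. */
	static void init_hdr_nongso(struct virtio_net_hdr *h)
	{
		memset(h, 0, sizeof(*h));
		h->gso_type = VIRTIO_NET_HDR_GSO_NONE;
	}

The plain write() path is unaffected: tun_chr_aio_write() calls
get_user_skb() with copylen == len and extra == 0, so a write always
copies the whole frame and never pins user pages.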