This patch adds vnet_hdr processing for mergeable buffer support to vhost-net. Signed-off-by: David L Stevens <dlstevens@xxxxxxxxxx> diff -ruN net-next-p1/drivers/vhost/net.c net-next-p2/drivers/vhost/net.c --- net-next-p1/drivers/vhost/net.c 2010-03-01 11:44:22.000000000 -0800 +++ net-next-p2/drivers/vhost/net.c 2010-03-02 13:01:34.000000000 -0800 @@ -109,7 +109,6 @@ }; size_t len, total_len = 0; int err, wmem; - size_t hdr_size; struct socket *sock = rcu_dereference(vq->private_data); if (!sock) return; @@ -124,7 +123,6 @@ if (wmem < sock->sk->sk_sndbuf * 2) tx_poll_stop(net); - hdr_size = vq->hdr_size; for (;;) { head.iov_base = (void *)vhost_get_vq_desc(&net->dev, vq, @@ -148,25 +146,45 @@ "out %d, int %d\n", out, in); break; } + if (vq->guest_hlen > vq->sock_hlen) { + if (msg.msg_iov[0].iov_len == vq->guest_hlen) + msg.msg_iov[0].iov_len = vq->sock_hlen; + else if (out == ARRAY_SIZE(vq->iov)) + vq_err(vq, "handle_tx iov overflow!"); + else { + int i; + + /* give header its own iov */ + for (i=out; i>0; ++i) + msg.msg_iov[i+1] = msg.msg_iov[i]; + msg.msg_iov[0].iov_len = vq->sock_hlen; + msg.msg_iov[1].iov_base += vq->guest_hlen; + msg.msg_iov[1].iov_len -= vq->guest_hlen; + out++; + } + } /* Skip header. TODO: support TSO. */ - s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, out); msg.msg_iovlen = out; head.iov_len = len = iov_length(vq->iov, out); /* Sanity check */ if (!len) { vq_err(vq, "Unexpected header len for TX: " "%zd expected %zd\n", - iov_length(vq->hdr, s), hdr_size); + len, vq->guest_hlen); break; } /* TODO: Check specific error and bomb out unless ENOBUFS? */ err = sock->ops->sendmsg(NULL, sock, &msg, len); if (unlikely(err < 0)) { - vhost_discard(vq, 1); - tx_poll_start(net, sock); + if (err == -EAGAIN) { + tx_poll_start(net, sock); + } else { + vq_err(vq, "sendmsg: errno %d\n", -err); + /* drop packet; do not discard/resend */ + vhost_add_used_and_signal(&net->dev,vq,&head,1); + } break; - } - if (err != len) + } else if (err != len) pr_err("Truncated TX packet: " " len %d != %zd\n", err, len); vhost_add_used_and_signal(&net->dev, vq, &head, 1); @@ -207,14 +225,8 @@ .msg_flags = MSG_DONTWAIT, }; - struct virtio_net_hdr hdr = { - .flags = 0, - .gso_type = VIRTIO_NET_HDR_GSO_NONE - }; - size_t len, total_len = 0; int err, headcount, datalen; - size_t hdr_size; struct socket *sock = rcu_dereference(vq->private_data); if (!sock || !skb_head_len(&sock->sk->sk_receive_queue)) @@ -223,7 +235,6 @@ use_mm(net->dev.mm); mutex_lock(&vq->mutex); vhost_disable_notify(vq); - hdr_size = vq->hdr_size; vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ? vq->log : NULL; @@ -232,25 +243,18 @@ headcount = vhost_get_heads(vq, datalen, &in, vq_log, &log); /* OK, now we need to know about added descriptors. */ if (!headcount) { - if (unlikely(vhost_enable_notify(vq))) { - /* They have slipped one in as we were - * doing that: check again. */ - vhost_disable_notify(vq); - continue; - } - /* Nothing new? Wait for eventfd to tell us - * they refilled. */ + vhost_enable_notify(vq); break; } /* Skip header. TODO: support TSO/mergeable rx buffers. */ - s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, in); msg.msg_iovlen = in; len = iov_length(vq->iov, in); + /* Sanity check */ if (!len) { vq_err(vq, "Unexpected header len for RX: " "%zd expected %zd\n", - iov_length(vq->hdr, s), hdr_size); + len, vq->guest_hlen); break; } err = sock->ops->recvmsg(NULL, sock, &msg, @@ -268,13 +272,7 @@ continue; } len = err; - err = memcpy_toiovec(vq->hdr, (unsigned char *)&hdr, hdr_size); - if (err) { - vq_err(vq, "Unable to write vnet_hdr at addr %p: %d\n", - vq->iov->iov_base, err); - break; - } - len += hdr_size; + len += vq->guest_hlen - vq->sock_hlen; vhost_add_used_and_signal(&net->dev, vq, vq->heads, headcount); if (unlikely(vq_log)) vhost_log_write(vq, vq_log, log, len); @@ -483,6 +481,13 @@ return ERR_PTR(-ENOTSOCK); } +static int vhost_sock_is_raw(struct socket *sock) +{ + if (!sock || !sock->sk) + return 0; + return sock->sk->sk_type == SOCK_RAW; +} + static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd) { struct socket *sock, *oldsock; @@ -519,6 +524,20 @@ vhost_net_disable_vq(n, vq); rcu_assign_pointer(vq->private_data, sock); + + if (sock && sock->sk) { + if (!vhost_sock_is_raw(sock) || + vhost_has_feature(&n->dev, VHOST_NET_F_VIRTIO_NET_HDR)) { + vq->sock_hlen = sizeof(struct virtio_net_hdr); + if (vhost_has_feature(&n->dev, VIRTIO_NET_F_MRG_RXBUF)) + vq->guest_hlen = + sizeof(struct virtio_net_hdr_mrg_rxbuf); + else + vq->guest_hlen = sizeof(struct virtio_net_hdr); + } else + vq->guest_hlen = vq->sock_hlen = 0; + } else + vq_err(vq, "vhost_net_set_backend: sock->sk is NULL"); vhost_net_enable_vq(n, vq); mutex_unlock(&vq->mutex); done: @@ -566,8 +585,17 @@ n->dev.acked_features = features; smp_wmb(); for (i = 0; i < VHOST_NET_VQ_MAX; ++i) { - mutex_lock(&n->vqs[i].mutex); - n->vqs[i].hdr_size = hdr_size; + struct vhost_virtqueue *vq = n->vqs + i; + struct socket *sock = vq->private_data; + + mutex_lock(&vq->mutex); + if (features & (1 << VIRTIO_NET_F_MRG_RXBUF)) + vq->sock_hlen = sizeof(struct virtio_net_hdr_mrg_rxbuf); + else if (features & (1 << VHOST_NET_F_VIRTIO_NET_HDR) || + !vhost_sock_is_raw(sock)) + vq->sock_hlen = sizeof(struct virtio_net_hdr); + else + vq->sock_hlen = 0; mutex_unlock(&n->vqs[i].mutex); } vhost_net_flush(n); diff -ruN net-next-p1/drivers/vhost/vhost.c net-next-p2/drivers/vhost/vhost.c --- net-next-p1/drivers/vhost/vhost.c 2010-03-01 11:44:06.000000000 -0800 +++ net-next-p2/drivers/vhost/vhost.c 2010-03-02 12:53:02.000000000 -0800 @@ -113,7 +113,8 @@ vq->used_flags = 0; vq->log_used = false; vq->log_addr = -1ull; - vq->hdr_size = 0; + vq->guest_hlen = 0; + vq->sock_hlen = 0; vq->private_data = NULL; vq->log_base = NULL; vq->error_ctx = NULL; @@ -848,20 +849,85 @@ return 0; } +static int +vhost_get_hdr(struct vhost_virtqueue *vq, int *in, struct vhost_log *log, + int *log_num) +{ + struct iovec *heads = vq->heads; + struct iovec *iov = vq->iov; + int out; + + *in = 0; + iov[0].iov_len = 0; + + /* get buffer, starting from iov[1] */ + heads[0].iov_base = (void *)vhost_get_vq_desc(vq->dev, vq, + vq->iov+1, ARRAY_SIZE(vq->iov)-1, &out, in, log, log_num); + if (out || *in <= 0) { + vq_err(vq, "unexpected descriptor format for RX: out %d, " + "in %d\n", out, *in); + return 0; + } + if (heads[0].iov_base == (void *)vq->num) + return 0; + + /* make iov[0] the header */ + if (!vq->guest_hlen) { + if (vq->sock_hlen) { + static struct virtio_net_hdr junk; /* bit bucket */ + + iov[0].iov_base = &junk; + iov[0].iov_len = sizeof(junk); + } else + iov[0].iov_len = 0; + } + if (vq->sock_hlen < vq->guest_hlen) { + iov[0].iov_base = iov[1].iov_base; + iov[0].iov_len = vq->sock_hlen; + + if (iov[1].iov_len < vq->sock_hlen) { + vq_err(vq, "can't fit header in one buffer!"); + vhost_discard(vq, 1); + return 0; + } + if (!vq->sock_hlen) { + static const struct virtio_net_hdr_mrg_rxbuf hdr = { + .hdr.flags = 0, + .hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE + }; + memcpy(iov[0].iov_base, &hdr, vq->guest_hlen); + } + iov[1].iov_base += vq->guest_hlen; + iov[1].iov_len -= vq->guest_hlen; + } + return 1; +} + unsigned vhost_get_heads(struct vhost_virtqueue *vq, int datalen, int *iovcount, struct vhost_log *log, unsigned int *log_num) { struct iovec *heads = vq->heads; - int out, in; + int out, in = 0; + int seg = 0; int hc = 0; + if (vq->guest_hlen != vq->sock_hlen) { + seg = vhost_get_hdr(vq, &in, log, log_num); + if (!seg) + return 0; + hc++; + datalen -= iov_length(vq->iov+seg, in); + seg += in; + } + while (datalen > 0) { if (hc >= VHOST_NET_MAX_SG) { vhost_discard(vq, hc); return 0; } heads[hc].iov_base = (void *)vhost_get_vq_desc(vq->dev, vq, - vq->iov, ARRAY_SIZE(vq->iov), &out, &in, log, log_num); + vq->iov+seg, ARRAY_SIZE(vq->iov)-seg, &out, &in, + log, log_num); if (heads[hc].iov_base == (void *)vq->num) { vhost_discard(vq, hc); return 0; @@ -872,11 +938,12 @@ vhost_discard(vq, hc); return 0; } - heads[hc].iov_len = iov_length(vq->iov, in); - hc++; + heads[hc].iov_len = iov_length(vq->iov+seg, in); datalen -= heads[hc].iov_len; + hc++; + seg += in; } - *iovcount = in; + *iovcount = seg; return hc; } diff -ruN net-next-p1/drivers/vhost/vhost.h net-next-p2/drivers/vhost/vhost.h --- net-next-p1/drivers/vhost/vhost.h 2010-03-01 11:42:18.000000000 -0800 +++ net-next-p2/drivers/vhost/vhost.h 2010-03-02 13:02:03.000000000 -0800 @@ -82,10 +82,9 @@ u64 log_addr; struct iovec indirect[VHOST_NET_MAX_SG]; - struct iovec iov[VHOST_NET_MAX_SG]; - struct iovec hdr[VHOST_NET_MAX_SG]; + struct iovec iov[VHOST_NET_MAX_SG+1]; /* an extra for vnet hdr */ struct iovec heads[VHOST_NET_MAX_SG]; - size_t hdr_size; + size_t guest_hlen, sock_hlen; /* We use a kind of RCU to access private pointer. * All readers access it from workqueue, which makes it possible to * flush the workqueue instead of synchronize_rcu. Therefore readers do
Attachment:
MRXB2.patch
Description: Binary data