We send packets using a copy-free mechanism (this is the Guest to Host transport via VMBUS). While this is obviously optimal for large packets, it may not be optimal for small packets. Hyper-V host supports a second mechanism for sending packets that is "copy based". We implement that mechanism in this patch. In this version of the patch I have addressed a comment from David Miller. With this patch (and all of the other offload and VRSS patches), we are now able to almost saturate a 10G interface between Linux VMs on Hyper-V on different hosts - close to 9 Gbps as measured via iperf. Signed-off-by: K. Y. Srinivasan <kys@xxxxxxxxxxxxx> Reviewed-by: Haiyang Zhang <haiyangz@xxxxxxxxxxxxx> --- drivers/net/hyperv/hyperv_net.h | 14 +++ drivers/net/hyperv/netvsc.c | 226 +++++++++++++++++++++++++++++++++++++-- drivers/net/hyperv/netvsc_drv.c | 3 +- 3 files changed, 234 insertions(+), 9 deletions(-) diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h index d1f7826..4b7df5a 100644 --- a/drivers/net/hyperv/hyperv_net.h +++ b/drivers/net/hyperv/hyperv_net.h @@ -140,6 +140,8 @@ struct hv_netvsc_packet { void *send_completion_ctx; void (*send_completion)(void *context); + u32 send_buf_index; + /* This points to the memory after page_buf */ struct rndis_message *rndis_msg; @@ -582,6 +584,9 @@ struct nvsp_message { #define NETVSC_RECEIVE_BUFFER_SIZE (1024*1024*16) /* 16MB */ #define NETVSC_RECEIVE_BUFFER_SIZE_LEGACY (1024*1024*15) /* 15MB */ +#define NETVSC_SEND_BUFFER_SIZE (1024 * 1024) /* 1MB */ +#define NETVSC_INVALID_INDEX -1 + #define NETVSC_RECEIVE_BUFFER_ID 0xcafe @@ -607,6 +612,15 @@ struct netvsc_device { u32 recv_section_cnt; struct nvsp_1_receive_buffer_section *recv_section; + /* Send buffer allocated by us */ + void *send_buf; + u32 send_buf_size; + u32 send_buf_gpadl_handle; + u32 send_section_cnt; + u32 send_section_size; + unsigned long *send_section_map; + int map_words; + /* Used for NetVSP initialization protocol */ struct completion channel_init_wait; struct nvsp_message channel_init_pkt; diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c index bbee446..c041f63 100644 --- a/drivers/net/hyperv/netvsc.c +++ b/drivers/net/hyperv/netvsc.c @@ -28,6 +28,7 @@ #include <linux/slab.h> #include <linux/netdevice.h> #include <linux/if_ether.h> +#include <asm/sync_bitops.h> #include "hyperv_net.h" @@ -80,7 +81,7 @@ get_in_err: } -static int netvsc_destroy_recv_buf(struct netvsc_device *net_device) +static int netvsc_destroy_buf(struct netvsc_device *net_device) { struct nvsp_message *revoke_packet; int ret = 0; @@ -146,10 +147,62 @@ static int netvsc_destroy_recv_buf(struct netvsc_device *net_device) net_device->recv_section = NULL; } + /* Deal with the send buffer we may have setup. + * If we got a send section size, it means we received a + * SendsendBufferComplete msg (ie sent + * NvspMessage1TypeSendReceiveBuffer msg) therefore, we need + * to send a revoke msg here + */ + if (net_device->send_section_size) { + /* Send the revoke receive buffer */ + revoke_packet = &net_device->revoke_packet; + memset(revoke_packet, 0, sizeof(struct nvsp_message)); + + revoke_packet->hdr.msg_type = + NVSP_MSG1_TYPE_REVOKE_SEND_BUF; + revoke_packet->msg.v1_msg.revoke_recv_buf.id = 0; + + ret = vmbus_sendpacket(net_device->dev->channel, + revoke_packet, + sizeof(struct nvsp_message), + (unsigned long)revoke_packet, + VM_PKT_DATA_INBAND, 0); + /* If we failed here, we might as well return and + * have a leak rather than continue and a bugchk + */ + if (ret != 0) { + netdev_err(ndev, "unable to send " + "revoke send buffer to netvsp\n"); + return ret; + } + } + /* Teardown the gpadl on the vsp end */ + if (net_device->send_buf_gpadl_handle) { + ret = vmbus_teardown_gpadl(net_device->dev->channel, + net_device->send_buf_gpadl_handle); + + /* If we failed here, we might as well return and have a leak + * rather than continue and a bugchk + */ + if (ret != 0) { + netdev_err(ndev, + "unable to teardown send buffer's gpadl\n"); + return ret; + } + net_device->recv_buf_gpadl_handle = 0; + } + if (net_device->send_buf) { + /* Free up the receive buffer */ + free_pages((unsigned long)net_device->send_buf, + get_order(net_device->send_buf_size)); + net_device->send_buf = NULL; + } + kfree(net_device->send_section_map); + return ret; } -static int netvsc_init_recv_buf(struct hv_device *device) +static int netvsc_init_buf(struct hv_device *device) { int ret = 0; int t; @@ -248,10 +301,90 @@ static int netvsc_init_recv_buf(struct hv_device *device) goto cleanup; } + /* Now setup the send buffer. + */ + net_device->send_buf = + (void *)__get_free_pages(GFP_KERNEL|__GFP_ZERO, + get_order(net_device->send_buf_size)); + if (!net_device->send_buf) { + netdev_err(ndev, "unable to allocate send " + "buffer of size %d\n", net_device->send_buf_size); + ret = -ENOMEM; + goto cleanup; + } + + /* Establish the gpadl handle for this buffer on this + * channel. Note: This call uses the vmbus connection rather + * than the channel to establish the gpadl handle. + */ + ret = vmbus_establish_gpadl(device->channel, net_device->send_buf, + net_device->send_buf_size, + &net_device->send_buf_gpadl_handle); + if (ret != 0) { + netdev_err(ndev, + "unable to establish send buffer's gpadl\n"); + goto cleanup; + } + + /* Notify the NetVsp of the gpadl handle */ + init_packet = &net_device->channel_init_pkt; + memset(init_packet, 0, sizeof(struct nvsp_message)); + init_packet->hdr.msg_type = NVSP_MSG1_TYPE_SEND_SEND_BUF; + init_packet->msg.v1_msg.send_recv_buf.gpadl_handle = + net_device->send_buf_gpadl_handle; + init_packet->msg.v1_msg.send_recv_buf.id = 0; + + /* Send the gpadl notification request */ + ret = vmbus_sendpacket(device->channel, init_packet, + sizeof(struct nvsp_message), + (unsigned long)init_packet, + VM_PKT_DATA_INBAND, + VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); + if (ret != 0) { + netdev_err(ndev, + "unable to send send buffer's gpadl to netvsp\n"); + goto cleanup; + } + + t = wait_for_completion_timeout(&net_device->channel_init_wait, 5*HZ); + BUG_ON(t == 0); + + /* Check the response */ + if (init_packet->msg.v1_msg. + send_send_buf_complete.status != NVSP_STAT_SUCCESS) { + netdev_err(ndev, "Unable to complete send buffer " + "initialization with NetVsp - status %d\n", + init_packet->msg.v1_msg. + send_recv_buf_complete.status); + ret = -EINVAL; + goto cleanup; + } + + /* Parse the response */ + net_device->send_section_size = init_packet->msg. + v1_msg.send_send_buf_complete.section_size; + + /* Section count is simply the size divided by the section size. + */ + net_device->send_section_cnt = + net_device->send_buf_size/net_device->send_section_size; + + dev_info(&device->device, "Send section size: %d, Section count:%d\n", + net_device->send_section_size, net_device->send_section_cnt); + + /* Setup state for managing the send buffer. */ + net_device->map_words = DIV_ROUND_UP(net_device->send_section_cnt, + BITS_PER_LONG); + + net_device->send_section_map = + kzalloc(net_device->map_words * sizeof(ulong), GFP_KERNEL); + if (net_device->send_section_map == NULL) + goto cleanup; + goto exit; cleanup: - netvsc_destroy_recv_buf(net_device); + netvsc_destroy_buf(net_device); exit: return ret; @@ -369,8 +502,9 @@ static int netvsc_connect_vsp(struct hv_device *device) net_device->recv_buf_size = NETVSC_RECEIVE_BUFFER_SIZE_LEGACY; else net_device->recv_buf_size = NETVSC_RECEIVE_BUFFER_SIZE; + net_device->send_buf_size = NETVSC_SEND_BUFFER_SIZE; - ret = netvsc_init_recv_buf(device); + ret = netvsc_init_buf(device); cleanup: return ret; @@ -378,7 +512,7 @@ cleanup: static void netvsc_disconnect_vsp(struct netvsc_device *net_device) { - netvsc_destroy_recv_buf(net_device); + netvsc_destroy_buf(net_device); } /* @@ -440,6 +574,12 @@ static inline u32 hv_ringbuf_avail_percent( return avail_write * 100 / ring_info->ring_datasize; } +static inline void netvsc_free_send_slot(struct netvsc_device *net_device, + u32 index) +{ + sync_change_bit(index, net_device->send_section_map); +} + static void netvsc_send_completion(struct netvsc_device *net_device, struct hv_device *device, struct vmpacket_descriptor *packet) @@ -447,6 +587,7 @@ static void netvsc_send_completion(struct netvsc_device *net_device, struct nvsp_message *nvsp_packet; struct hv_netvsc_packet *nvsc_packet; struct net_device *ndev; + u32 send_index; ndev = net_device->ndev; @@ -477,6 +618,9 @@ static void netvsc_send_completion(struct netvsc_device *net_device, /* Notify the layer above us */ if (nvsc_packet) { + send_index = nvsc_packet->send_buf_index; + if (send_index != NETVSC_INVALID_INDEX) + netvsc_free_send_slot(net_device, send_index); q_idx = nvsc_packet->q_idx; channel = nvsc_packet->channel; nvsc_packet->send_completion(nvsc_packet-> @@ -504,6 +648,52 @@ static void netvsc_send_completion(struct netvsc_device *net_device, } +static u32 netvsc_get_next_send_section(struct netvsc_device *net_device) +{ + unsigned long index; + u32 max_words = net_device->map_words; + unsigned long *map_addr = (unsigned long *)net_device->send_section_map; + u32 section_cnt = net_device->send_section_cnt; + int ret_val = NETVSC_INVALID_INDEX; + int i; + int prev_val; + + for (i = 0; i < max_words; i++) { + if (!~(map_addr[i])) + continue; + index = ffz(map_addr[i]); + prev_val = sync_test_and_set_bit(index, &map_addr[i]); + if (prev_val) + continue; + if ((index + (i * BITS_PER_LONG)) >= section_cnt) + break; + ret_val = (index + (i * BITS_PER_LONG)); + break; + } + return ret_val; +} + +u32 netvsc_copy_to_send_buf(struct netvsc_device *net_device, + unsigned int section_index, + struct hv_netvsc_packet *packet) +{ + char *start = net_device->send_buf; + char *dest = (start + (section_index * net_device->send_section_size)); + int i; + u32 msg_size = 0; + + for (i = 0; i < packet->page_buf_cnt; i++) { + char *src = phys_to_virt(packet->page_buf[i].pfn << PAGE_SHIFT); + u32 offset = packet->page_buf[i].offset; + u32 len = packet->page_buf[i].len; + + memcpy(dest, (src + offset), len); + msg_size += len; + dest += len; + } + return msg_size; +} + int netvsc_send(struct hv_device *device, struct hv_netvsc_packet *packet) { @@ -513,6 +703,10 @@ int netvsc_send(struct hv_device *device, struct net_device *ndev; struct vmbus_channel *out_channel = NULL; u64 req_id; + unsigned int section_index = NETVSC_INVALID_INDEX; + u32 msg_size = 0; + struct sk_buff *skb; + net_device = get_outbound_net_device(device); if (!net_device) @@ -528,10 +722,26 @@ int netvsc_send(struct hv_device *device, sendMessage.msg.v1_msg.send_rndis_pkt.channel_type = 1; } - /* Not using send buffer section */ + /* Attempt to send via sendbuf */ + if (packet->total_data_buflen < net_device->send_section_size) { + section_index = netvsc_get_next_send_section(net_device); + if (section_index != NETVSC_INVALID_INDEX) { + msg_size = netvsc_copy_to_send_buf(net_device, + section_index, + packet); + skb = (struct sk_buff *) + (unsigned long)packet->send_completion_tid; + if (skb) + dev_kfree_skb_any(skb); + packet->page_buf_cnt = 0; + } + } + packet->send_buf_index = section_index; + + sendMessage.msg.v1_msg.send_rndis_pkt.send_buf_section_index = - 0xFFFFFFFF; - sendMessage.msg.v1_msg.send_rndis_pkt.send_buf_section_size = 0; + section_index; + sendMessage.msg.v1_msg.send_rndis_pkt.send_buf_section_size = msg_size; if (packet->send_completion) req_id = (ulong)packet; diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index c76b665..939e3af 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -236,10 +236,11 @@ static void netvsc_xmit_completion(void *context) struct hv_netvsc_packet *packet = (struct hv_netvsc_packet *)context; struct sk_buff *skb = (struct sk_buff *) (unsigned long)packet->send_completion_tid; + u32 index = packet->send_buf_index; kfree(packet); - if (skb) + if (skb && (index == NETVSC_INVALID_INDEX)) dev_kfree_skb_any(skb); } -- 1.7.4.1 _______________________________________________ devel mailing list devel@xxxxxxxxxxxxxxxxxxxxxx http://driverdev.linuxdriverproject.org/mailman/listinfo/driverdev-devel