[PATCH] virtio: Use ioeventfd for virtqueue notify

Stefan Hajnoczi <stefanha@xxxxxxxxxxxxxxxxxx> · Thu, 30 Sep 2010 15:01:52 +0100

Virtqueue notify is currently handled synchronously in userspace virtio.
This prevents the vcpu from executing guest code while hardware
emulation code handles the notify.

On systems that support KVM, the ioeventfd mechanism can be used to make
virtqueue notify a lightweight exit by deferring hardware emulation to
the iothread and allowing the VM to continue execution.  This model is
similar to how vhost receives virtqueue notifies.

The result of this change is improved performance for userspace virtio
devices.  Virtio-blk throughput increases especially for multithreaded
scenarios and virtio-net transmit throughput increases substantially.
Full numbers are below.

This patch employs ioeventfd virtqueue notify for all virtio devices.
Linux kernels pre-2.6.34 only allow for 6 ioeventfds per VM and care
must be taken so that vhost-net, the other ioeventfd user in QEMU, is
able to function.  On such kernels ioeventfd virtqueue notify will not
be used.

Khoa Huynh <khoa@xxxxxxxxxx> collected the following data for
virtio-blk with cache=none,aio=native:

FFSB Test          Threads  Unmodified  Patched
                            (MB/s)      (MB/s)
Large file create  1        21.7        21.8
                   8        101.0       118.0
                   16       119.0       157.0

Sequential reads   1        21.9        23.2
                   8        114.0       139.0
                   16       143.0       178.0

Random reads       1        3.3         3.6
                   8        23.0        25.4
                   16       43.3        47.8

Random writes      1        22.2        23.0
                   8        93.1        111.6
                   16       110.5       132.0

Sridhar Samudrala <sri@xxxxxxxxxx> collected the following data for
virtio-net with 2.6.36-rc1 on the host and 2.6.34 on the guest.

Guest to Host TCP_STREAM throughput(Mb/sec)
-------------------------------------------
Msg Size  vhost-net  virtio-net  virtio-net/ioeventfd
65536         12755        6430                  7590
16384          8499        3084                  5764
 4096          4723        1578                  3659
 1024          1827         981                  2060

Host to Guest TCP_STREAM throughput(Mb/sec)
-------------------------------------------
Msg Size  vhost-net  virtio-net  virtio-net/ioeventfd
65536         11156        5790                  5853
16384         10787        5575                  5691
 4096         10452        5556                  4277
 1024          4437        3671                  5277

Guest to Host TCP_RR latency(transactions/sec)
----------------------------------------------

Msg Size  vhost-net  virtio-net  virtio-net/ioeventfd
    1          9903        3459                  3425
 4096          7185        1931                  1899
16384          6108        2102                  1923
65536          3161        1610                  1744

Signed-off-by: Stefan Hajnoczi <stefanha@xxxxxxxxxxxxxxxxxx>
---
Small changes are required for qemu-kvm.git.  I will send them once qemu.git
has virtio-ioeventfd support.

 hw/vhost.c  |    6 ++--
 hw/virtio.c |  106 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 hw/virtio.h |    9 +----
 kvm-all.c   |   39 +++++++++++++++++++++
 kvm-stub.c  |    5 +++
 kvm.h       |    1 +
 6 files changed, 156 insertions(+), 10 deletions(-)

diff --git a/hw/vhost.c b/hw/vhost.c
index 1b8624d..f127a07 100644
--- a/hw/vhost.c
+++ b/hw/vhost.c
@@ -517,7 +517,7 @@ static int vhost_virtqueue_init(struct vhost_dev *dev,
         goto fail_guest_notifier;
     }
 
-    r = vdev->binding->set_host_notifier(vdev->binding_opaque, idx, true);
+    r = virtio_set_host_notifier(vdev, idx, true);
     if (r < 0) {
         fprintf(stderr, "Error binding host notifier: %d\n", -r);
         goto fail_host_notifier;
@@ -539,7 +539,7 @@ static int vhost_virtqueue_init(struct vhost_dev *dev,
 
 fail_call:
 fail_kick:
-    vdev->binding->set_host_notifier(vdev->binding_opaque, idx, false);
+    virtio_set_host_notifier(vdev, idx, false);
 fail_host_notifier:
     vdev->binding->set_guest_notifier(vdev->binding_opaque, idx, false);
 fail_guest_notifier:
@@ -575,7 +575,7 @@ static void vhost_virtqueue_cleanup(struct vhost_dev *dev,
     }
     assert (r >= 0);
 
-    r = vdev->binding->set_host_notifier(vdev->binding_opaque, idx, false);
+    r = virtio_set_host_notifier(vdev, idx, false);
     if (r < 0) {
         fprintf(stderr, "vhost VQ %d host cleanup failed: %d\n", idx, r);
         fflush(stderr);
diff --git a/hw/virtio.c b/hw/virtio.c
index fbef788..f075b3a 100644
--- a/hw/virtio.c
+++ b/hw/virtio.c
@@ -16,6 +16,7 @@
 #include "trace.h"
 #include "virtio.h"
 #include "sysemu.h"
+#include "kvm.h"
 
 /* The alignment to use between consumer and producer parts of vring.
  * x86 pagesize again. */
@@ -77,6 +78,11 @@ struct VirtQueue
     VirtIODevice *vdev;
     EventNotifier guest_notifier;
     EventNotifier host_notifier;
+    enum {
+        HOST_NOTIFIER_DEASSIGNED,   /* inactive */
+        HOST_NOTIFIER_ASSIGNED,     /* active */
+        HOST_NOTIFIER_OFFLIMITS,    /* active but outside our control */
+    } host_notifier_state;
 };
 
 /* virt queue functions */
@@ -453,6 +459,93 @@ void virtio_update_irq(VirtIODevice *vdev)
     virtio_notify_vector(vdev, VIRTIO_NO_VECTOR);
 }
 
+/* Service virtqueue notify from a host notifier */
+static void virtio_read_host_notifier(void *opaque)
+{
+    VirtQueue *vq = opaque;
+    EventNotifier *notifier = virtio_queue_get_host_notifier(vq);
+    if (event_notifier_test_and_clear(notifier)) {
+        if (vq->vring.desc) {
+            vq->handle_output(vq->vdev, vq);
+        }
+    }
+}
+
+/* Transition between host notifier states */
+static int virtio_set_host_notifier_state(VirtIODevice *vdev, int n, int state)
+{
+    VirtQueue *vq = &vdev->vq[n];
+    EventNotifier *notifier = virtio_queue_get_host_notifier(vq);
+    int rc;
+
+    if (!kvm_enabled()) {
+        return -ENOSYS;
+    }
+
+    /* If the number of ioeventfds is limited, use them for vhost only */
+    if (state == HOST_NOTIFIER_ASSIGNED && !kvm_has_many_iobus_devs()) {
+        state = HOST_NOTIFIER_DEASSIGNED;
+    }
+
+    /* Ignore if no state change */
+    if (vq->host_notifier_state == state) {
+        return 0;
+    }
+
+    /* Disable read handler if transitioning away from assigned */
+    if (vq->host_notifier_state == HOST_NOTIFIER_ASSIGNED) {
+        qemu_set_fd_handler(event_notifier_get_fd(notifier), NULL, NULL, NULL);
+    }
+
+    /* Toggle host notifier if transitioning to or from deassigned */
+    if (state == HOST_NOTIFIER_DEASSIGNED ||
+        vq->host_notifier_state == HOST_NOTIFIER_DEASSIGNED) {
+        rc = vdev->binding->set_host_notifier(vdev->binding_opaque, n,
+                state != HOST_NOTIFIER_DEASSIGNED);
+        if (rc < 0) {
+            return rc;
+        }
+    }
+
+    /* Enable read handler if transitioning to assigned */
+    if (state == HOST_NOTIFIER_ASSIGNED) {
+        qemu_set_fd_handler(event_notifier_get_fd(notifier),
+                            virtio_read_host_notifier, NULL, vq);
+    }
+
+    vq->host_notifier_state = state;
+    return 0;
+}
+
+/* Try to assign/deassign host notifiers for all virtqueues */
+static void virtio_set_host_notifiers(VirtIODevice *vdev, bool assigned)
+{
+    int state = assigned ? HOST_NOTIFIER_ASSIGNED : HOST_NOTIFIER_DEASSIGNED;
+    int i;
+
+    if (!vdev->binding->set_host_notifier) {
+        return;
+    }
+
+    for (i = 0; i < VIRTIO_PCI_QUEUE_MAX; i++) {
+        if (vdev->vq[i].host_notifier_state == HOST_NOTIFIER_OFFLIMITS) {
+            continue;
+        }
+
+        if (!vdev->vq[i].vring.desc) {
+            continue;
+        }
+
+        virtio_set_host_notifier_state(vdev, i, state);
+    }
+}
+
+int virtio_set_host_notifier(VirtIODevice *vdev, int n, bool assigned)
+{
+    int state = assigned ? HOST_NOTIFIER_OFFLIMITS : HOST_NOTIFIER_ASSIGNED;
+    return virtio_set_host_notifier_state(vdev, n, state);
+}
+
 void virtio_reset(void *opaque)
 {
     VirtIODevice *vdev = opaque;
@@ -467,6 +560,7 @@ void virtio_reset(void *opaque)
     vdev->isr = 0;
     vdev->config_vector = VIRTIO_NO_VECTOR;
     virtio_notify_vector(vdev, vdev->config_vector);
+    virtio_set_host_notifiers(vdev, false);
 
     for(i = 0; i < VIRTIO_PCI_QUEUE_MAX; i++) {
         vdev->vq[i].vring.desc = 0;
@@ -592,6 +686,16 @@ void virtio_queue_set_vector(VirtIODevice *vdev, int n, uint16_t vector)
         vdev->vq[n].vector = vector;
 }
 
+void virtio_set_status(VirtIODevice *vdev, uint8_t val)
+{
+    virtio_set_host_notifiers(vdev, val & VIRTIO_CONFIG_S_DRIVER_OK);
+
+    if (vdev->set_status) {
+        vdev->set_status(vdev, val);
+    }
+    vdev->status = val;
+}
+
 VirtQueue *virtio_add_queue(VirtIODevice *vdev, int queue_size,
                             void (*handle_output)(VirtIODevice *, VirtQueue *))
 {
@@ -719,6 +823,7 @@ int virtio_load(VirtIODevice *vdev, QEMUFile *f)
     }
 
     virtio_notify_vector(vdev, VIRTIO_NO_VECTOR);
+    virtio_set_host_notifiers(vdev, vdev->status & VIRTIO_CONFIG_S_DRIVER_OK);
     return 0;
 }
 
@@ -746,6 +851,7 @@ VirtIODevice *virtio_common_init(const char *name, uint16_t device_id,
     for(i = 0; i < VIRTIO_PCI_QUEUE_MAX; i++) {
         vdev->vq[i].vector = VIRTIO_NO_VECTOR;
         vdev->vq[i].vdev = vdev;
+        vdev->vq[i].host_notifier_state = HOST_NOTIFIER_DEASSIGNED;
     }
 
     vdev->name = name;
diff --git a/hw/virtio.h b/hw/virtio.h
index 96514e6..d76157e 100644
--- a/hw/virtio.h
+++ b/hw/virtio.h
@@ -125,13 +125,7 @@ struct VirtIODevice
     uint16_t device_id;
 };
 
-static inline void virtio_set_status(VirtIODevice *vdev, uint8_t val)
-{
-    if (vdev->set_status) {
-        vdev->set_status(vdev, val);
-    }
-    vdev->status = val;
-}
+void virtio_set_status(VirtIODevice *vdev, uint8_t val);
 
 VirtQueue *virtio_add_queue(VirtIODevice *vdev, int queue_size,
                             void (*handle_output)(VirtIODevice *,
@@ -217,6 +211,7 @@ target_phys_addr_t virtio_queue_get_ring_size(VirtIODevice *vdev, int n);
 uint16_t virtio_queue_get_last_avail_idx(VirtIODevice *vdev, int n);
 void virtio_queue_set_last_avail_idx(VirtIODevice *vdev, int n, uint16_t idx);
 VirtQueue *virtio_get_queue(VirtIODevice *vdev, int n);
+int virtio_set_host_notifier(VirtIODevice *vdev, int n, bool assigned);
 EventNotifier *virtio_queue_get_guest_notifier(VirtQueue *vq);
 EventNotifier *virtio_queue_get_host_notifier(VirtQueue *vq);
 void virtio_irq(VirtQueue *vq);
diff --git a/kvm-all.c b/kvm-all.c
index 1cc696f..2f09e34 100644
--- a/kvm-all.c
+++ b/kvm-all.c
@@ -24,6 +24,7 @@
 #include "qemu-barrier.h"
 #include "sysemu.h"
 #include "hw/hw.h"
+#include "hw/event_notifier.h"
 #include "gdbstub.h"
 #include "kvm.h"
 #include "bswap.h"
@@ -72,6 +73,7 @@ struct KVMState
     int irqchip_in_kernel;
     int pit_in_kernel;
     int xsave, xcrs;
+    int many_iobus_devs;
 };
 
 static KVMState *kvm_state;
@@ -423,6 +425,36 @@ int kvm_check_extension(KVMState *s, unsigned int extension)
     return ret;
 }
 
+static int kvm_check_many_iobus_devs(void)
+{
+    /* Older kernels have a 6 device limit on the KVM io bus.  In that case
+     * creating many ioeventfds must be avoided.  This tests checks for the
+     * limitation.
+     */
+    EventNotifier notifiers[7];
+    int i, ret = 0;
+    for (i = 0; i < ARRAY_SIZE(notifiers); i++) {
+        ret = event_notifier_init(&notifiers[i], 0);
+        if (ret < 0) {
+            break;
+        }
+        ret = kvm_set_ioeventfd_pio_word(event_notifier_get_fd(&notifiers[i]), 0, i, true);
+        if (ret < 0) {
+            event_notifier_cleanup(&notifiers[i]);
+            break;
+        }
+    }
+
+    /* Decide whether many devices are supported or not */
+    ret = i == ARRAY_SIZE(notifiers);
+
+    while (i-- > 0) {
+        kvm_set_ioeventfd_pio_word(event_notifier_get_fd(&notifiers[i]), 0, i, false);
+        event_notifier_cleanup(&notifiers[i]);
+    }
+    return ret;
+}
+
 static void kvm_set_phys_mem(target_phys_addr_t start_addr,
 			     ram_addr_t size,
 			     ram_addr_t phys_offset)
@@ -699,6 +731,8 @@ int kvm_init(int smp_cpus)
     kvm_state = s;
     cpu_register_phys_memory_client(&kvm_cpu_phys_memory_client);
 
+    s->many_iobus_devs = kvm_check_many_iobus_devs();
+
     return 0;
 
 err:
@@ -1028,6 +1062,11 @@ int kvm_has_xcrs(void)
     return kvm_state->xcrs;
 }
 
+int kvm_has_many_iobus_devs(void)
+{
+    return kvm_state->many_iobus_devs;
+}
+
 void kvm_setup_guest_memory(void *start, size_t size)
 {
     if (!kvm_has_sync_mmu()) {
diff --git a/kvm-stub.c b/kvm-stub.c
index d45f9fa..b0887fb 100644
--- a/kvm-stub.c
+++ b/kvm-stub.c
@@ -99,6 +99,11 @@ int kvm_has_robust_singlestep(void)
     return 0;
 }
 
+int kvm_has_many_iobus_devs(void)
+{
+    return 0;
+}
+
 void kvm_setup_guest_memory(void *start, size_t size)
 {
 }
diff --git a/kvm.h b/kvm.h
index 50b6c01..f405906 100644
--- a/kvm.h
+++ b/kvm.h
@@ -42,6 +42,7 @@ int kvm_has_robust_singlestep(void);
 int kvm_has_debugregs(void);
 int kvm_has_xsave(void);
 int kvm_has_xcrs(void);
+int kvm_has_many_iobus_devs(void);
 
 #ifdef NEED_CPU_H
 int kvm_init_vcpu(CPUState *env);
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html