[RFC PATCH 3/3] Qemu: Introduce pci-sriov device type to support VF live migration

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



This patch migrates VF status between the source and target
machines during live migration.

Three kinds of VF status are involved:
1) PCI configuration space registers
2) MSI-X configuration
3) VF status kept in the PF driver

The PCI configuration space registers and the MSI-X configuration
are already kept in QEMU.

The VF status in the PF driver can be saved and restored via a new
sysfs node, state_in_pf, under the VF's sysfs directory.

A fake PCI configuration space register at offset 0xF0 lets the VF
driver learn the migration status. QEMU sets register 0xF0 to 1 when
migration starts and clears it to 0 when migration completes. The VF
driver tells QEMU that it is ready for migration by writing 1 to
register 0xF1.

QEMU notifies the VF driver of migration status changes by writing to
a new sysfs node, notify_vf, which sends a mailbox message to the VF
driver.

Signed-off-by: Lan Tianyu <tianyu.lan@xxxxxxxxx>
---
 hw/i386/kvm/Makefile.objs |   2 +-
 hw/i386/kvm/pci-assign.c  |   2 +-
 hw/i386/kvm/sriov.c       | 213 ++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 215 insertions(+), 2 deletions(-)
 create mode 100644 hw/i386/kvm/sriov.c

diff --git a/hw/i386/kvm/Makefile.objs b/hw/i386/kvm/Makefile.objs
index d8bce20..09324e9 100644
--- a/hw/i386/kvm/Makefile.objs
+++ b/hw/i386/kvm/Makefile.objs
@@ -1 +1 @@
-obj-y += clock.o apic.o i8259.o ioapic.o i8254.o pci-assign.o
+obj-y += clock.o apic.o i8259.o ioapic.o i8254.o pci-assign.o sriov.o
diff --git a/hw/i386/kvm/pci-assign.c b/hw/i386/kvm/pci-assign.c
index 616532d..84c5ff5 100644
--- a/hw/i386/kvm/pci-assign.c
+++ b/hw/i386/kvm/pci-assign.c
@@ -1770,7 +1770,7 @@ static void assign_class_init(ObjectClass *klass, void *data)
     k->config_read  = assigned_dev_pci_read_config;
     k->config_write = assigned_dev_pci_write_config;
     dc->props       = assigned_dev_properties;
-    dc->vmsd        = &vmstate_assigned_device;
+//    dc->vmsd        = &vmstate_assigned_device;
     dc->reset       = reset_assigned_device;
     set_bit(DEVICE_CATEGORY_MISC, dc->categories);
     dc->desc        = "KVM-based PCI passthrough";
diff --git a/hw/i386/kvm/sriov.c b/hw/i386/kvm/sriov.c
new file mode 100644
index 0000000..ac37035
--- /dev/null
+++ b/hw/i386/kvm/sriov.c
@@ -0,0 +1,213 @@
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/io.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <glob.h>
+#include <unistd.h>
+#include <sys/ioctl.h>
+
+#include "hw/hw.h"
+#include "hw/i386/pc.h"
+#include "pci-assign.h"
+
+
+#define TYPE_PCI_SRIOV "pci-sriov"
+
+#define SRIOV_LM_SETUP 0x01
+#define SRIOV_LM_COMPLETE 0x02
+
+static int pt_save_pf_buf(struct PCIDevice *pdev, unsigned char **buf,
+			   int *len)
+{
+    AssignedDevice *adev = PCI_ASSIGN(pdev);
+    char file[128];
+    FILE *f;
+
+    *len = 0;
+
+    snprintf(file, sizeof(file),
+             "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/state_in_pf",
+             adev->host.domain, adev->host.bus, adev->host.slot,
+             adev->host.function);
+
+    if (!(f = fopen(file, "rb"))) {
+        return -EEXIST;
+    }
+    *buf = g_malloc(4096);
+    *len = fread(*buf, 1, 4096, f);
+    fclose(f);
+
+    return 0;
+}
+
+static void pt_restore_pf_buf(struct PCIDevice *pdev, unsigned char *buf, int len)
+{
+    AssignedDevice *adev = PCI_ASSIGN(pdev);
+    FILE *f;
+    char file[128];
+
+    snprintf(file, sizeof(file),
+             "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/state_in_pf",
+             adev->host.domain, adev->host.bus, adev->host.slot,
+             adev->host.function);
+
+    printf("path: %s\n", file);
+    if (!(f = fopen(file, "wb")))
+        return;
+
+    fwrite(buf, 1, len, f);
+    fclose(f);
+
+}
+
+static void assign_dev_post_load(void *opaque)
+{
+    struct PCIDevice *pdev = (struct PCIDevice *)opaque;
+    AssignedDevice *adev = PCI_ASSIGN(pdev);
+    char file[128];
+    FILE *f;
+
+    snprintf(file, sizeof(file),
+             "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/notify_vf",
+             adev->host.domain, adev->host.bus, adev->host.slot,
+             adev->host.function);
+
+    printf("notify path %s\n", file);
+    if (!(f = fopen(file, "wb")))
+        return;
+    
+    fwrite("1", 1, 1, f);
+    fclose(f);
+}
+
+static int assign_dev_load(QEMUFile *f, void *opaque, int version_id)
+{
+    struct PCIDevice *pdev = (struct PCIDevice *)opaque;
+    AssignedDevice *adev = PCI_ASSIGN(pdev);
+    unsigned char *buf = NULL;
+    int ret, len, num;
+
+    if(qemu_get_byte(f)!= SRIOV_LM_COMPLETE)
+        return 0;
+
+    ret = pci_device_load(pdev, f);
+    if (ret) {
+        printf("pci config error %d\n", ret);
+        return ret;
+    }
+
+    qemu_get_sbe32s(f, &num);
+    qemu_get_buffer(f, (unsigned char *)adev->msix_table,
+	num * PCI_MSIX_ENTRY_SIZE);
+    assigned_dev_update_msix(pdev);
+
+    len = qemu_get_be32(f);
+    if (len) {
+        buf = g_malloc(len);
+        qemu_get_buffer(f, buf, len);
+        pt_restore_pf_buf(pdev, buf, len);
+        g_free(buf);
+    }
+
+
+    pci_default_write_config(pdev, 0xf0, 0x00, 1);
+    pci_default_write_config(pdev, 0xf1, 0x00, 1);
+    return 0;
+}
+
+static int assign_dev_save_complete(QEMUFile *f, void *opaque)
+{
+    struct PCIDevice *pdev = (struct PCIDevice *)opaque;
+    AssignedDevice *adev = PCI_ASSIGN(pdev);
+    int len, entries_nr = 0;
+    unsigned char *buf = NULL;
+    int i;
+    MSIXTableEntry *entry = adev->msix_table;
+
+    qemu_put_byte(f, SRIOV_LM_COMPLETE);
+    pci_device_save(pdev, f);
+
+    for (i = 0; i < adev->msix_max; i++, entry++) {
+        if (assigned_dev_msix_skipped(entry)) {
+            continue;
+        }
+        entries_nr++;
+    }
+
+    qemu_put_sbe32s(f, &entries_nr);
+    qemu_put_buffer(f, (unsigned char *)adev->msix_table, entries_nr * PCI_MSIX_ENTRY_SIZE);
+
+    if (pt_save_pf_buf(pdev, &buf, &len))
+        return -EFAULT;
+
+    qemu_put_be32(f, len);
+    if (len) {
+        printf("pf state saved, size %d\n", len);
+        qemu_put_buffer(f, buf, len);
+    }
+    
+    return 0;
+}
+
+static int assign_dev_setup(QEMUFile *f, void *opaque)
+{
+    struct PCIDevice *pdev = (struct PCIDevice *)opaque;
+    AssignedDevice *adev = PCI_ASSIGN(pdev);
+    char file[128];
+    FILE *fd;
+
+    pci_default_write_config(pdev, 0xf0, 0x01, 1);
+
+    snprintf(file, sizeof(file),
+             "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/notify_vf",
+             adev->host.domain, adev->host.bus, adev->host.slot,
+             adev->host.function);
+
+    if (!(fd = fopen(file, "wb")))
+        return -EFAULT;
+
+    fwrite("1", 1, 1, fd);
+    fclose(fd);
+
+    printf("notify path %s\n", file);
+    qemu_put_byte(f, SRIOV_LM_SETUP);
+    return 0;
+}
+
+static uint64_t assign_dev_save_pending(QEMUFile *f, void *opaque, uint64_t max_size)
+{
+    struct PCIDevice *pdev = (struct PCIDevice *)opaque;
+
+    return pci_default_read_config(pdev, 0xf1,1) ?
+                0 : max_size;
+}
+
+static SaveVMHandlers savevm_pt_handlers = {
+    .save_live_setup = assign_dev_setup,
+    .save_live_complete = assign_dev_save_complete,
+    .save_live_pending = assign_dev_save_pending,
+    .load_state = assign_dev_load,
+    .post_load_state = assign_dev_post_load,
+};
+
+static void sriov_pci_instance_init(Object *obj)
+{
+    PCIDevice *pci_dev = PCI_DEVICE(obj);
+
+    register_savevm_live(NULL, "pci-assign", 1, 1,
+                         &savevm_pt_handlers, pci_dev);
+}
+
+static const TypeInfo sriov_pci_type_info = {
+    .name = TYPE_PCI_SRIOV,
+    .parent = TYPE_PCI_ASSIGN, 
+    .instance_init = sriov_pci_instance_init,
+};
+
+static void sriov_register_types(void)
+{
+    type_register_static(&sriov_pci_type_info);
+}
+type_init(sriov_register_types)
-- 
1.9.3

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html



[Index of Archives]     [KVM ARM]     [KVM ia64]     [KVM ppc]     [Virtualization Tools]     [Spice Development]     [Libvirt]     [Libvirt Users]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite Questions]     [Linux Kernel]     [Linux SCSI]     [XFree86]
  Powered by Linux