On Tue, Nov 24, 2015 at 09:35:26PM +0800, Lan Tianyu wrote: > This patch is to add SRIOV VF migration support. > Create new device type "vfio-sriov" and add faked PCI migration capability > to the type device. > > The purpose of the new capability > 1) sync migration status with VF driver in the VM > 2) Get mailbox irq vector to notify VF driver during migration. > 3) Provide a way to control injecting irq or not. > > Qemu will migrate PCI configure space regs and MSIX config for VF. > Inject mailbox irq at last stage of migration to notify VF about > migration event and wait VF driver ready for migration. I think this last bit, "wait VF driver ready for migration", is wrong. Not much is gained compared to hot-unplug. To really benefit from this feature, migration should succeed even if the guest is stuck; the interrupt should then tell the guest that it has to reset the driver. > VF driver > writeS PCI config reg PCI_VF_MIGRATION_VF_STATUS in the new cap table > to tell Qemu. > > Signed-off-by: Lan Tianyu <tianyu.lan@xxxxxxxxx> > --- > hw/vfio/Makefile.objs | 2 +- > hw/vfio/pci.c | 6 ++ > hw/vfio/pci.h | 4 ++ > hw/vfio/sriov.c | 178 ++++++++++++++++++++++++++++++++++++++++++++++++++ > 4 files changed, 189 insertions(+), 1 deletion(-) > create mode 100644 hw/vfio/sriov.c > > diff --git a/hw/vfio/Makefile.objs b/hw/vfio/Makefile.objs > index d540c9d..9cf0178 100644 > --- a/hw/vfio/Makefile.objs > +++ b/hw/vfio/Makefile.objs > @@ -1,6 +1,6 @@ > ifeq ($(CONFIG_LINUX), y) > obj-$(CONFIG_SOFTMMU) += common.o > -obj-$(CONFIG_PCI) += pci.o > +obj-$(CONFIG_PCI) += pci.o sriov.o > obj-$(CONFIG_SOFTMMU) += platform.o > obj-$(CONFIG_SOFTMMU) += calxeda-xgmac.o > endif > diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c > index 7c43fc1..e7583b5 100644 > --- a/hw/vfio/pci.c > +++ b/hw/vfio/pci.c > @@ -2013,6 +2013,11 @@ void vfio_pci_write_config(PCIDevice *pdev, uint32_t addr, > } else if (was_enabled && !is_enabled) { > vfio_disable_msix(vdev); > } > + } else if (vdev->migration_cap && > 
+ ranges_overlap(addr, len, vdev->migration_cap, 0x10)) { > + /* Write everything to QEMU to keep emulated bits correct */ > + pci_default_write_config(pdev, addr, val, len); > + vfio_migration_cap_handle(pdev, addr, val, len); > } else { > /* Write everything to QEMU to keep emulated bits correct */ > pci_default_write_config(pdev, addr, val, len); > @@ -3517,6 +3522,7 @@ static int vfio_initfn(PCIDevice *pdev) > vfio_register_err_notifier(vdev); > vfio_register_req_notifier(vdev); > vfio_setup_resetfn(vdev); > + vfio_add_migration_capability(vdev); > > return 0; > > diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h > index 6c00575..ee6ca5e 100644 > --- a/hw/vfio/pci.h > +++ b/hw/vfio/pci.h > @@ -134,6 +134,7 @@ typedef struct VFIOPCIDevice { > PCIHostDeviceAddress host; > EventNotifier err_notifier; > EventNotifier req_notifier; > + uint16_t migration_cap; > int (*resetfn)(struct VFIOPCIDevice *); > uint32_t features; > #define VFIO_FEATURE_ENABLE_VGA_BIT 0 > @@ -162,3 +163,6 @@ uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len); > void vfio_pci_write_config(PCIDevice *pdev, uint32_t addr, > uint32_t val, int len); > void vfio_enable_msix(VFIOPCIDevice *vdev); > +void vfio_add_migration_capability(VFIOPCIDevice *vdev); > +void vfio_migration_cap_handle(PCIDevice *pdev, uint32_t addr, > + uint32_t val, int len); > diff --git a/hw/vfio/sriov.c b/hw/vfio/sriov.c > new file mode 100644 > index 0000000..3109538 > --- /dev/null > +++ b/hw/vfio/sriov.c > @@ -0,0 +1,178 @@ > +#include <stdio.h> > +#include <unistd.h> > +#include <sys/io.h> > +#include <sys/mman.h> > +#include <sys/types.h> > +#include <sys/stat.h> > +#include <glob.h> > +#include <unistd.h> > +#include <sys/ioctl.h> > + > +#include "hw/hw.h" > +#include "hw/vfio/pci.h" > +#include "hw/vfio/vfio.h" > +#include "hw/vfio/vfio-common.h" > + > +#define TYPE_VFIO_SRIOV "vfio-sriov" > + > +#define SRIOV_LM_SETUP 0x01 > +#define SRIOV_LM_COMPLETE 0x02 > + > +QemuEvent migration_event; > + > +static 
void vfio_dev_post_load(void *opaque) > +{ > + struct PCIDevice *pdev = (struct PCIDevice *)opaque; > + VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev); > + MSIMessage msg; > + int vector; > + > + if (vfio_pci_read_config(pdev, > + vdev->migration_cap + PCI_VF_MIGRATION_CAP, 1) > + != PCI_VF_MIGRATION_ENABLE) > + return; > + > + vector = vfio_pci_read_config(pdev, > + vdev->migration_cap + PCI_VF_MIGRATION_IRQ, 1); > + > + msg = msix_get_message(pdev, vector); > + kvm_irqchip_send_msi(kvm_state, msg); > +} > + > +static int vfio_dev_load(QEMUFile *f, void *opaque, int version_id) > +{ > + struct PCIDevice *pdev = (struct PCIDevice *)opaque; > + VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev); > + int ret; > + > + if(qemu_get_byte(f)!= SRIOV_LM_COMPLETE) > + return 0; > + > + ret = pci_device_load(pdev, f); > + if (ret) { > + error_report("Faild to load PCI config space.\n"); > + return ret; > + } > + > + if (msix_enabled(pdev)) { > + vfio_enable_msix(vdev); > + msix_load(pdev, f); > + } > + > + vfio_pci_write_config(pdev,vdev->migration_cap + > + PCI_VF_MIGRATION_VMM_STATUS, VMM_MIGRATION_END, 1); > + vfio_pci_write_config(pdev,vdev->migration_cap + > + PCI_VF_MIGRATION_VF_STATUS, PCI_VF_WAIT_FOR_MIGRATION, 1); > + return 0; > +} > + > +static int vfio_dev_save_complete(QEMUFile *f, void *opaque) > +{ > + struct PCIDevice *pdev = (struct PCIDevice *)opaque; > + > + qemu_put_byte(f, SRIOV_LM_COMPLETE); > + pci_device_save(pdev, f); > + > + if (msix_enabled(pdev)) { > + msix_save(pdev, f); > + } > + > + return 0; > +} > + > +static int vfio_dev_setup(QEMUFile *f, void *opaque) > +{ > + qemu_put_byte(f, SRIOV_LM_SETUP); > + return 0; > +} > + > +static void vfio_dev_save_before_stop(QEMUFile *f, void *opaque) > +{ > + struct PCIDevice *pdev = (struct PCIDevice *)opaque; > + VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev); > + int vector; > + MSIMessage msg; > + > + vfio_pci_write_config(pdev, vdev->migration_cap + > + 
PCI_VF_MIGRATION_VMM_STATUS, VMM_MIGRATION_START, 1); > + > + if (vfio_pci_read_config(pdev, > + vdev->migration_cap + PCI_VF_MIGRATION_CAP, 1) > + != PCI_VF_MIGRATION_ENABLE) > + return; > + > + vector = vfio_pci_read_config(pdev, > + vdev->migration_cap + PCI_VF_MIGRATION_IRQ, 1); > + > + qemu_event_reset(&migration_event); > + > + msg = msix_get_message(pdev, vector); > + kvm_irqchip_send_msi(kvm_state, msg); > + > + qemu_event_wait(&migration_event); So this blocks QEMU while holding the QEMU lock, waiting for the qemu_event_set() call below. > +} > + > +static SaveVMHandlers savevm_pt_handlers = { > + .save_live_setup = vfio_dev_setup, > + .save_live_complete = vfio_dev_save_complete, > + .save_before_stop = vfio_dev_save_before_stop, > + .load_state = vfio_dev_load, > + .post_load_state = vfio_dev_post_load, > +}; > + > +void vfio_add_migration_capability(VFIOPCIDevice *vdev) > +{ > + PCIDevice *pdev = &vdev->pdev; > + int free_pos; > + > + if (strcmp(object_get_typename(OBJECT(vdev)), TYPE_VFIO_SRIOV)) > + return; > + > + free_pos = vfio_find_free_cfg_reg(vdev, > + pdev->config[PCI_CAPABILITY_LIST], > + PCI_VF_MIGRATION_CAP_SIZE); > + if (free_pos) { > + vdev->migration_cap = free_pos; > + pci_add_capability(pdev, PCI_CAP_ID_MIGRATION, > + free_pos, PCI_VF_MIGRATION_CAP_SIZE); > + memset(vdev->emulated_config_bits + free_pos, 0xff, > + PCI_VF_MIGRATION_CAP_SIZE); > + memset(vdev->pdev.wmask + free_pos, 0xff, > + PCI_VF_MIGRATION_CAP_SIZE); > + } else > + error_report("vfio: Fail to find free PCI config space regs.\n"); > +} > + > +void vfio_migration_cap_handle(PCIDevice *pdev, uint32_t addr, > + uint32_t val, int len) > +{ > + VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev); > + > + if (addr == vdev->migration_cap + PCI_VF_MIGRATION_VF_STATUS > + && val == PCI_VF_READY_FOR_MIGRATION) { > + qemu_event_set(&migration_event); This would wake the migration thread so it can proceed - except that this handler needs the QEMU lock to run, and that lock is held by the blocked migration thread. 
It seems unlikely that this ever worked - how did you test this? > + } > +} > + > +static void vfio_sriov_instance_init(Object *obj) > +{ > + PCIDevice *pdev = PCI_DEVICE(obj); > + > + register_savevm_live(NULL, "vfio-sriov", 1, 1, > + &savevm_pt_handlers, pdev); > + > + qemu_event_init(&migration_event, false); > + > +} > + > +static const TypeInfo vfio_sriov_type_info = { > + .name = TYPE_VFIO_SRIOV, > + .parent = "vfio-pci", > + .instance_init = vfio_sriov_instance_init, > +}; > + > +static void sriov_register_types(void) > +{ > + type_register_static(&vfio_sriov_type_info); > +} > +type_init(sriov_register_types) > -- > 1.9.3 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html