Re: [RFC PATCH V2 09/10] Qemu/VFIO: Add SRIOV VF migration support

"Michael S. Tsirkin" <mst@xxxxxxxxxx> · Tue, 24 Nov 2015 23:03:29 +0200

On Tue, Nov 24, 2015 at 09:35:26PM +0800, Lan Tianyu wrote:
> This patch is to add SRIOV VF migration support.
> Create new device type "vfio-sriov" and add faked PCI migration capability
> to the type device.
> 
> The purpose of the new capability
> 1) sync migration status with VF driver in the VM
> 2) Get mailbox irq vector to notify VF driver during migration.
> 3) Provide a way to control injecting irq or not.
> 
> Qemu will migrate PCI configure space regs and MSIX config for VF.
> Inject mailbox irq at last stage of migration to notify VF about
> migration event and wait VF driver ready for migration.

I think this last bit, "wait VF driver ready for migration",
is wrong. Not a lot is gained compared to hot-unplug.

To really get a benefit from this feature, migration should
succeed even if the guest is stuck; the interrupt should then
tell the guest that it has to reset the driver.

> VF driver
> writes PCI config reg PCI_VF_MIGRATION_VF_STATUS in the new cap table
> to tell Qemu.
> 
> Signed-off-by: Lan Tianyu <tianyu.lan@xxxxxxxxx>
> ---
>  hw/vfio/Makefile.objs |   2 +-
>  hw/vfio/pci.c         |   6 ++
>  hw/vfio/pci.h         |   4 ++
>  hw/vfio/sriov.c       | 178 ++++++++++++++++++++++++++++++++++++++++++++++++++
>  4 files changed, 189 insertions(+), 1 deletion(-)
>  create mode 100644 hw/vfio/sriov.c
> 
> diff --git a/hw/vfio/Makefile.objs b/hw/vfio/Makefile.objs
> index d540c9d..9cf0178 100644
> --- a/hw/vfio/Makefile.objs
> +++ b/hw/vfio/Makefile.objs
> @@ -1,6 +1,6 @@
>  ifeq ($(CONFIG_LINUX), y)
>  obj-$(CONFIG_SOFTMMU) += common.o
> -obj-$(CONFIG_PCI) += pci.o
> +obj-$(CONFIG_PCI) += pci.o sriov.o
>  obj-$(CONFIG_SOFTMMU) += platform.o
>  obj-$(CONFIG_SOFTMMU) += calxeda-xgmac.o
>  endif
> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index 7c43fc1..e7583b5 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -2013,6 +2013,11 @@ void vfio_pci_write_config(PCIDevice *pdev, uint32_t addr,
>          } else if (was_enabled && !is_enabled) {
>              vfio_disable_msix(vdev);
>          }
> +    } else if (vdev->migration_cap &&
> +        ranges_overlap(addr, len, vdev->migration_cap, 0x10)) {
> +        /* Write everything to QEMU to keep emulated bits correct */
> +        pci_default_write_config(pdev, addr, val, len);
> +        vfio_migration_cap_handle(pdev, addr, val, len);
>      } else {
>          /* Write everything to QEMU to keep emulated bits correct */
>          pci_default_write_config(pdev, addr, val, len);
> @@ -3517,6 +3522,7 @@ static int vfio_initfn(PCIDevice *pdev)
>      vfio_register_err_notifier(vdev);
>      vfio_register_req_notifier(vdev);
>      vfio_setup_resetfn(vdev);
> +    vfio_add_migration_capability(vdev);
>  
>      return 0;
>  
> diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
> index 6c00575..ee6ca5e 100644
> --- a/hw/vfio/pci.h
> +++ b/hw/vfio/pci.h
> @@ -134,6 +134,7 @@ typedef struct VFIOPCIDevice {
>      PCIHostDeviceAddress host;
>      EventNotifier err_notifier;
>      EventNotifier req_notifier;
> +    uint16_t    migration_cap;
>      int (*resetfn)(struct VFIOPCIDevice *);
>      uint32_t features;
>  #define VFIO_FEATURE_ENABLE_VGA_BIT 0
> @@ -162,3 +163,6 @@ uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len);
>  void vfio_pci_write_config(PCIDevice *pdev, uint32_t addr,
>                             uint32_t val, int len);
>  void vfio_enable_msix(VFIOPCIDevice *vdev);
> +void vfio_add_migration_capability(VFIOPCIDevice *vdev);
> +void vfio_migration_cap_handle(PCIDevice *pdev, uint32_t addr,
> +                               uint32_t val, int len);
> diff --git a/hw/vfio/sriov.c b/hw/vfio/sriov.c
> new file mode 100644
> index 0000000..3109538
> --- /dev/null
> +++ b/hw/vfio/sriov.c
> @@ -0,0 +1,178 @@
> +#include <stdio.h>
> +#include <unistd.h>
> +#include <sys/io.h>
> +#include <sys/mman.h>
> +#include <sys/types.h>
> +#include <sys/stat.h>
> +#include <glob.h>
> +#include <unistd.h>
> +#include <sys/ioctl.h>
> +
> +#include "hw/hw.h"
> +#include "hw/vfio/pci.h"
> +#include "hw/vfio/vfio.h"
> +#include "hw/vfio/vfio-common.h"
> +
> +#define TYPE_VFIO_SRIOV "vfio-sriov"
> +
> +#define SRIOV_LM_SETUP 0x01
> +#define SRIOV_LM_COMPLETE 0x02
> +
> +QemuEvent migration_event;
> +
> +static void vfio_dev_post_load(void *opaque)
> +{
> +    struct PCIDevice *pdev = (struct PCIDevice *)opaque;
> +    VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
> +    MSIMessage msg;
> +    int vector;
> +
> +    if (vfio_pci_read_config(pdev,
> +            vdev->migration_cap + PCI_VF_MIGRATION_CAP, 1)
> +            != PCI_VF_MIGRATION_ENABLE)
> +        return;
> +
> +    vector = vfio_pci_read_config(pdev,
> +        vdev->migration_cap + PCI_VF_MIGRATION_IRQ, 1);
> +
> +    msg = msix_get_message(pdev, vector);
> +    kvm_irqchip_send_msi(kvm_state, msg);
> +}
> +
> +static int vfio_dev_load(QEMUFile *f, void *opaque, int version_id)
> +{
> +    struct PCIDevice *pdev = (struct PCIDevice *)opaque;
> +    VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
> +    int ret;
> +
> +    if(qemu_get_byte(f)!= SRIOV_LM_COMPLETE)
> +        return 0;
> +
> +    ret = pci_device_load(pdev, f);
> +    if (ret) {
> +        error_report("Faild to load PCI config space.\n");
> +        return ret;
> +    }
> +
> +    if (msix_enabled(pdev)) {
> +        vfio_enable_msix(vdev);
> +        msix_load(pdev, f);
> +    }
> +
> +    vfio_pci_write_config(pdev,vdev->migration_cap +
> +        PCI_VF_MIGRATION_VMM_STATUS, VMM_MIGRATION_END, 1);
> +    vfio_pci_write_config(pdev,vdev->migration_cap +
> +        PCI_VF_MIGRATION_VF_STATUS, PCI_VF_WAIT_FOR_MIGRATION, 1);
> +    return 0;
> +}
> +
> +static int vfio_dev_save_complete(QEMUFile *f, void *opaque)
> +{
> +    struct PCIDevice *pdev = (struct PCIDevice *)opaque;
> +
> +    qemu_put_byte(f, SRIOV_LM_COMPLETE);
> +    pci_device_save(pdev, f);
> +
> +    if (msix_enabled(pdev)) {
> +        msix_save(pdev, f);
> +    }
> +
> +    return 0;
> +}
> +
> +static int vfio_dev_setup(QEMUFile *f, void *opaque)
> +{
> +    qemu_put_byte(f, SRIOV_LM_SETUP);
> +    return 0;
> +}
> +
> +static void vfio_dev_save_before_stop(QEMUFile *f, void *opaque)
> +{
> +    struct PCIDevice *pdev = (struct PCIDevice *)opaque;
> +    VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
> +    int vector;
> +    MSIMessage msg;
> +
> +    vfio_pci_write_config(pdev, vdev->migration_cap +
> +        PCI_VF_MIGRATION_VMM_STATUS, VMM_MIGRATION_START, 1);
> +
> +    if (vfio_pci_read_config(pdev,
> +            vdev->migration_cap + PCI_VF_MIGRATION_CAP, 1)
> +            != PCI_VF_MIGRATION_ENABLE)
> +        return;
> +
> +    vector = vfio_pci_read_config(pdev,
> +        vdev->migration_cap + PCI_VF_MIGRATION_IRQ, 1);
> +
> +    qemu_event_reset(&migration_event);
> +
> +    msg = msix_get_message(pdev, vector);
> +    kvm_irqchip_send_msi(kvm_state, msg);
> +
> +    qemu_event_wait(&migration_event);

So this blocks QEMU, holding the QEMU lock, and
waits for qemu_event_set below.

> +}
> +
> +static SaveVMHandlers savevm_pt_handlers = {
> +    .save_live_setup = vfio_dev_setup,
> +    .save_live_complete = vfio_dev_save_complete,
> +    .save_before_stop = vfio_dev_save_before_stop,          
> +    .load_state = vfio_dev_load,
> +    .post_load_state = vfio_dev_post_load,
> +};
> +
> +void vfio_add_migration_capability(VFIOPCIDevice *vdev)
> +{
> +    PCIDevice *pdev = &vdev->pdev;
> +    int free_pos;
> +
> +    if (strcmp(object_get_typename(OBJECT(vdev)), TYPE_VFIO_SRIOV))
> +        return;
> +
> +    free_pos = vfio_find_free_cfg_reg(vdev,
> +                pdev->config[PCI_CAPABILITY_LIST],
> +                PCI_VF_MIGRATION_CAP_SIZE);
> +    if (free_pos) {
> +        vdev->migration_cap = free_pos;
> +    	pci_add_capability(pdev, PCI_CAP_ID_MIGRATION,
> +                        free_pos, PCI_VF_MIGRATION_CAP_SIZE);
> +    	memset(vdev->emulated_config_bits + free_pos, 0xff,
> +                        PCI_VF_MIGRATION_CAP_SIZE);
> +    	memset(vdev->pdev.wmask + free_pos, 0xff,
> +                        PCI_VF_MIGRATION_CAP_SIZE);
> +     } else
> +        error_report("vfio: Fail to find free PCI config space regs.\n");
> +}
> +
> +void vfio_migration_cap_handle(PCIDevice *pdev, uint32_t addr,
> +                                  uint32_t val, int len)
> +{
> +    VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
> +
> +    if (addr == vdev->migration_cap + PCI_VF_MIGRATION_VF_STATUS
> +        && val == PCI_VF_READY_FOR_MIGRATION) {       
> +        qemu_event_set(&migration_event);

This would wake the migration thread so it can proceed -
except this code needs the QEMU lock to run, and that lock
is held by the migration thread.

It seems unlikely that this ever worked - how
did you test this?

> +    }
> +}
> +
> +static void vfio_sriov_instance_init(Object *obj)
> +{
> +    PCIDevice *pdev = PCI_DEVICE(obj);
> +
> +    register_savevm_live(NULL, "vfio-sriov", 1, 1,
> +                         &savevm_pt_handlers, pdev);
> +
> +    qemu_event_init(&migration_event, false);
> +
> +}
> +
> +static const TypeInfo vfio_sriov_type_info = {
> +    .name = TYPE_VFIO_SRIOV,
> +    .parent = "vfio-pci", 
> +    .instance_init = vfio_sriov_instance_init,
> +};
> +
> +static void sriov_register_types(void)
> +{
> +    type_register_static(&vfio_sriov_type_info);
> +}
> +type_init(sriov_register_types)
> -- 
> 1.9.3
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html