Add live migration support via the VFIO subsystem. The migration implementation aligns with the definition from uapi/vfio.h and uses the auxiliary bus connection between pds_core/pds_vfio to communicate with the device. The ability to suspend, resume, and transfer VF device state data is included along with the required admin queue command structures and implementations. PDS_LM_CMD_SUSPEND and PDS_LM_CMD_SUSPEND_STATUS are added to support the VF device suspend operation. PDS_LM_CMD_RESUME is added to support the VF device resume operation. PDS_LM_CMD_STATUS is added to determine the exact size of the VF device state data. PDS_LM_CMD_SAVE is added to get the VF device state data. PDS_LM_CMD_RESTORE is added to restore the VF device with the previously saved data from PDS_LM_CMD_SAVE. PDS_LM_CMD_HOST_VF_STATUS is added to notify the device when a migration is in/not-in progress from the host's perspective. Signed-off-by: Brett Creeley <brett.creeley@xxxxxxx> Signed-off-by: Shannon Nelson <shannon.nelson@xxxxxxx> --- drivers/vfio/pci/pds/Makefile | 1 + drivers/vfio/pci/pds/aux_drv.c | 1 + drivers/vfio/pci/pds/cmds.c | 312 ++++++++++++++++++++ drivers/vfio/pci/pds/cmds.h | 15 + drivers/vfio/pci/pds/lm.c | 491 ++++++++++++++++++++++++++++++++ drivers/vfio/pci/pds/lm.h | 43 +++ drivers/vfio/pci/pds/pci_drv.c | 15 + drivers/vfio/pci/pds/vfio_dev.c | 119 +++++++- drivers/vfio/pci/pds/vfio_dev.h | 12 + include/linux/pds/pds_lm.h | 206 ++++++++++++++ 10 files changed, 1214 insertions(+), 1 deletion(-) create mode 100644 drivers/vfio/pci/pds/lm.c create mode 100644 drivers/vfio/pci/pds/lm.h diff --git a/drivers/vfio/pci/pds/Makefile b/drivers/vfio/pci/pds/Makefile index 49bf8289739b..b4980323cff4 100644 --- a/drivers/vfio/pci/pds/Makefile +++ b/drivers/vfio/pci/pds/Makefile @@ -4,5 +4,6 @@ obj-$(CONFIG_PDS_VFIO_PCI) += pds_vfio.o pds_vfio-y := \ aux_drv.o \ cmds.o \ + lm.o \ pci_drv.o \ vfio_dev.o diff --git a/drivers/vfio/pci/pds/aux_drv.c b/drivers/vfio/pci/pds/aux_drv.c index 494551894926..59b43075e85f 100644 --- a/drivers/vfio/pci/pds/aux_drv.c +++ b/drivers/vfio/pci/pds/aux_drv.c @@ -77,6 +77,7 @@ pds_vfio_aux_probe(struct auxiliary_device *aux_dev, dev_dbg(dev, "%s: id %#04x busnr %#x devfn %#x bus %p pds_vfio %p\n", __func__, padev->id, busnr, devfn, bus, vfio_aux->pds_vfio); + vfio_aux->pds_vfio->coredev = aux_dev->dev.parent; vfio_aux->pds_vfio->vfio_aux = vfio_aux; vfio_aux->padrv.event_handler = pds_vfio_aux_notify_handler; diff --git a/drivers/vfio/pci/pds/cmds.c b/drivers/vfio/pci/pds/cmds.c index 5a3fadcd38d8..11823d824ccc 100644 --- a/drivers/vfio/pci/pds/cmds.c +++ b/drivers/vfio/pci/pds/cmds.c @@ -3,9 +3,11 @@ #include <linux/io.h> #include <linux/types.h> +#include <linux/delay.h> #include <linux/pds/pds_core_if.h> #include <linux/pds/pds_adminq.h> +#include <linux/pds/pds_lm.h> #include <linux/pds/pds_auxbus.h> #include "vfio_dev.h" @@ -28,3 +30,313 @@ pds_vfio_unregister_client_cmd(struct pds_vfio_pci_device *pds_vfio) padev->ops->unregister_client(padev); } + +#define SUSPEND_TIMEOUT_S 5 +#define SUSPEND_CHECK_INTERVAL_MS 1 + +static int +pds_vfio_suspend_wait_device_cmd(struct pds_vfio_pci_device *pds_vfio) +{ + struct pds_lm_suspend_status_cmd cmd = { + .opcode = PDS_LM_CMD_SUSPEND_STATUS, + .vf_id = cpu_to_le16(pds_vfio->vf_id), + }; + struct pci_dev *pdev = pds_vfio->pdev; + struct pds_lm_comp comp = { 0 }; + struct pds_auxiliary_dev *padev; + unsigned long time_limit; + unsigned long time_start; + unsigned long time_done; + int err; + + padev = pds_vfio->vfio_aux->padev; + + time_start = jiffies; + time_limit = time_start + HZ * SUSPEND_TIMEOUT_S; + do { + err = padev->ops->adminq_cmd(padev, + (union pds_core_adminq_cmd *)&cmd, + sizeof(cmd), + (union pds_core_adminq_comp *)&comp, + PDS_AQ_FLAG_FASTPOLL); + if (err != -EAGAIN) + break; + + msleep(SUSPEND_CHECK_INTERVAL_MS); + } while (time_before(jiffies, time_limit)); + + time_done = jiffies; + dev_dbg(&pdev->dev, "%s: vf%u: Suspend comp received in %d msecs\n", + __func__, pds_vfio->vf_id, + jiffies_to_msecs(time_done - time_start)); + + /* Check the results */ + if (time_after_eq(time_done, time_limit)) { + dev_err(&pdev->dev, "%s: vf%u: Suspend comp timeout\n", __func__, + pds_vfio->vf_id); + err = -ETIMEDOUT; + } + + return err; +} + +int +pds_vfio_suspend_device_cmd(struct pds_vfio_pci_device *pds_vfio) +{ + struct pds_lm_suspend_cmd cmd = { + .opcode = PDS_LM_CMD_SUSPEND, + .vf_id = cpu_to_le16(pds_vfio->vf_id), + }; + struct pds_lm_suspend_comp comp = {0}; + struct pci_dev *pdev = pds_vfio->pdev; + struct pds_auxiliary_dev *padev; + int err; + + dev_dbg(&pdev->dev, "vf%u: Suspend device\n", pds_vfio->vf_id); + + padev = pds_vfio->vfio_aux->padev; + err = padev->ops->adminq_cmd(padev, + (union pds_core_adminq_cmd *)&cmd, + sizeof(cmd), + (union pds_core_adminq_comp *)&comp, + PDS_AQ_FLAG_FASTPOLL); + if (err) { + dev_err(&pdev->dev, "vf%u: Suspend failed: %pe\n", + pds_vfio->vf_id, ERR_PTR(err)); + return err; + } + + return pds_vfio_suspend_wait_device_cmd(pds_vfio); +} + +int +pds_vfio_resume_device_cmd(struct pds_vfio_pci_device *pds_vfio) +{ + struct pds_lm_resume_cmd cmd = { + .opcode = PDS_LM_CMD_RESUME, + .vf_id = cpu_to_le16(pds_vfio->vf_id), + }; + struct pds_auxiliary_dev *padev; + struct pds_lm_comp comp = {0}; + + dev_dbg(&pds_vfio->pdev->dev, "vf%u: Resume device\n", pds_vfio->vf_id); + + padev = pds_vfio->vfio_aux->padev; + return padev->ops->adminq_cmd(padev, + (union pds_core_adminq_cmd *)&cmd, + sizeof(cmd), + (union pds_core_adminq_comp *)&comp, + 0); +} + +int +pds_vfio_get_lm_status_cmd(struct pds_vfio_pci_device *pds_vfio, u64 *size) +{ + struct pds_lm_status_cmd cmd = { + .opcode = PDS_LM_CMD_STATUS, + .vf_id = cpu_to_le16(pds_vfio->vf_id), + }; + struct pds_lm_status_comp comp = {0}; + struct pds_auxiliary_dev *padev; + int err = 0; + + dev_dbg(&pds_vfio->pdev->dev, "vf%u: Get migration status\n", + pds_vfio->vf_id); + + padev = pds_vfio->vfio_aux->padev; + err = padev->ops->adminq_cmd(padev, + (union pds_core_adminq_cmd *)&cmd, + sizeof(cmd), + (union pds_core_adminq_comp *)&comp, + 0); + if (err) + return err; + + *size = le64_to_cpu(comp.size); + return 0; +} + +static int +pds_vfio_dma_map_lm_file(struct device *dev, enum dma_data_direction dir, + struct pds_vfio_lm_file *lm_file) +{ + struct pds_lm_sg_elem *sgl, *sge; + struct scatterlist *sg; + int err = 0; + int i; + + if (!lm_file) + return -EINVAL; + + /* dma map file pages */ + err = dma_map_sgtable(dev, &lm_file->sg_table, dir, 0); + if (err) + goto err_dma_map_sg; + + lm_file->num_sge = lm_file->sg_table.nents; + + /* alloc sgl */ + sgl = dma_alloc_coherent(dev, lm_file->num_sge * + sizeof(struct pds_lm_sg_elem), + &lm_file->sgl_addr, GFP_KERNEL); + if (!sgl) { + err = -ENOMEM; + goto err_alloc_sgl; + } + + lm_file->sgl = sgl; + + /* fill sgl */ + sge = sgl; + for_each_sgtable_dma_sg(&lm_file->sg_table, sg, i) { + sge->addr = cpu_to_le64(sg_dma_address(sg)); + sge->len = cpu_to_le32(sg_dma_len(sg)); + dev_dbg(dev, "addr = %llx, len = %u\n", sge->addr, sge->len); + sge++; + } + + return 0; + +err_alloc_sgl: + dma_unmap_sgtable(dev, &lm_file->sg_table, dir, 0); +err_dma_map_sg: + return err; +} + +static void +pds_vfio_dma_unmap_lm_file(struct device *dev, enum dma_data_direction dir, + struct pds_vfio_lm_file *lm_file) +{ + if (!lm_file) + return; + + /* free sgl */ + if (lm_file->sgl) { + dma_free_coherent(dev, lm_file->num_sge * + sizeof(struct pds_lm_sg_elem), + lm_file->sgl, lm_file->sgl_addr); + lm_file->sgl = NULL; + lm_file->sgl_addr = DMA_MAPPING_ERROR; + lm_file->num_sge = 0; + } + + /* dma unmap file pages */ + dma_unmap_sgtable(dev, &lm_file->sg_table, dir, 0); +} + +int +pds_vfio_get_lm_state_cmd(struct pds_vfio_pci_device *pds_vfio) +{ + struct pds_lm_save_cmd cmd = { + .opcode = PDS_LM_CMD_SAVE, + .vf_id = cpu_to_le16(pds_vfio->vf_id), + }; + struct pci_dev *pdev = pds_vfio->pdev; + struct pds_vfio_lm_file *lm_file; + struct pds_auxiliary_dev *padev; + struct pds_lm_comp comp = {0}; + int err; + + dev_dbg(&pdev->dev, "vf%u: Get migration state\n", pds_vfio->vf_id); + + lm_file = pds_vfio->save_file; + + padev = pds_vfio->vfio_aux->padev; + err = pds_vfio_dma_map_lm_file(pds_vfio->coredev, DMA_FROM_DEVICE, lm_file); + if (err) { + dev_err(&pdev->dev, "failed to map save migration file: %pe\n", + ERR_PTR(err)); + return err; + } + + cmd.sgl_addr = cpu_to_le64(lm_file->sgl_addr); + cmd.num_sge = cpu_to_le32(lm_file->num_sge); + + err = padev->ops->adminq_cmd(padev, + (union pds_core_adminq_cmd *)&cmd, + sizeof(cmd), + (union pds_core_adminq_comp *)&comp, + 0); + if (err) + dev_err(&pdev->dev, "failed to get migration state: %pe\n", + ERR_PTR(err)); + + pds_vfio_dma_unmap_lm_file(pds_vfio->coredev, DMA_FROM_DEVICE, lm_file); + + return err; +} + +int +pds_vfio_set_lm_state_cmd(struct pds_vfio_pci_device *pds_vfio) +{ + struct pds_lm_restore_cmd cmd = { + .opcode = PDS_LM_CMD_RESTORE, + .vf_id = cpu_to_le16(pds_vfio->vf_id), + }; + struct pci_dev *pdev = pds_vfio->pdev; + struct pds_vfio_lm_file *lm_file; + struct pds_auxiliary_dev *padev; + struct pds_lm_comp comp = {0}; + int err; + + dev_dbg(&pdev->dev, "vf%u: Set migration state\n", pds_vfio->vf_id); + + lm_file = pds_vfio->restore_file; + + padev = pds_vfio->vfio_aux->padev; + err = pds_vfio_dma_map_lm_file(pds_vfio->coredev, DMA_TO_DEVICE, lm_file); + if (err) { + dev_err(&pdev->dev, "failed to map restore migration file: %pe\n", + ERR_PTR(err)); + return err; + } + + cmd.sgl_addr = cpu_to_le64(lm_file->sgl_addr); + cmd.num_sge = cpu_to_le32(lm_file->num_sge); + + err = padev->ops->adminq_cmd(padev, + (union pds_core_adminq_cmd *)&cmd, + sizeof(cmd), + (union pds_core_adminq_comp *)&comp, + 0); + if (err) + dev_err(&pdev->dev, "failed to set migration state: %pe\n", + ERR_PTR(err)); + + pds_vfio_dma_unmap_lm_file(pds_vfio->coredev, DMA_TO_DEVICE, lm_file); + + return err; +} + +void +pds_vfio_send_host_vf_lm_status_cmd(struct pds_vfio_pci_device *pds_vfio, + enum pds_lm_host_vf_status vf_status) +{ + struct pds_auxiliary_dev *padev = pds_vfio->vfio_aux->padev; + struct pds_lm_host_vf_status_cmd cmd = { + .opcode = PDS_LM_CMD_HOST_VF_STATUS, + .vf_id = cpu_to_le16(pds_vfio->vf_id), + .status = vf_status, + }; + struct pci_dev *pdev = pds_vfio->pdev; + struct pds_lm_comp comp = {0}; + int err; + + dev_dbg(&pdev->dev, "vf%u: Set host VF LM status: %u", + pds_vfio->vf_id, cmd.status); + if (vf_status != PDS_LM_STA_IN_PROGRESS && + vf_status != PDS_LM_STA_NONE) { + dev_warn(&pdev->dev, "Invalid host VF migration status, %d\n", + vf_status); + return; + } + + err = padev->ops->adminq_cmd(padev, + (union pds_core_adminq_cmd *)&cmd, + sizeof(cmd), + (union pds_core_adminq_comp *)&comp, + 0); + if (err) + dev_warn(&pdev->dev, "failed to send host VF migration status: %pe\n", + ERR_PTR(err)); +} diff --git a/drivers/vfio/pci/pds/cmds.h b/drivers/vfio/pci/pds/cmds.h index 7fe2d1efd894..5f9ac45ee5a3 100644 --- a/drivers/vfio/pci/pds/cmds.h +++ b/drivers/vfio/pci/pds/cmds.h @@ -4,11 +4,26 @@ #ifndef _CMDS_H_ #define _CMDS_H_ +#include <linux/pds/pds_lm.h> + struct pds_vfio_pci_device; int pds_vfio_register_client_cmd(struct pds_vfio_pci_device *pds_vfio); void pds_vfio_unregister_client_cmd(struct pds_vfio_pci_device *pds_vfio); +int +pds_vfio_suspend_device_cmd(struct pds_vfio_pci_device *pds_vfio); +int +pds_vfio_resume_device_cmd(struct pds_vfio_pci_device *pds_vfio); +int +pds_vfio_get_lm_status_cmd(struct pds_vfio_pci_device *pds_vfio, u64 *size); +int +pds_vfio_get_lm_state_cmd(struct pds_vfio_pci_device *pds_vfio); +int +pds_vfio_set_lm_state_cmd(struct pds_vfio_pci_device *pds_vfio); +void +pds_vfio_send_host_vf_lm_status_cmd(struct pds_vfio_pci_device *pds_vfio, + enum pds_lm_host_vf_status vf_status); #endif /* _CMDS_H_ */ diff --git a/drivers/vfio/pci/pds/lm.c b/drivers/vfio/pci/pds/lm.c new file mode 100644 index 000000000000..86152b18a5b7 --- /dev/null +++ b/drivers/vfio/pci/pds/lm.c @@ -0,0 +1,491 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2022 Pensando Systems, Inc */ + +#include <linux/anon_inodes.h> +#include <linux/file.h> +#include <linux/fs.h> +#include <linux/highmem.h> +#include <linux/vfio.h> +#include <linux/vfio_pci_core.h> + +#include "cmds.h" +#include "vfio_dev.h" + +#define PDS_VFIO_LM_FILENAME "pds_vfio_lm" + +const char * +pds_vfio_lm_state(enum vfio_device_mig_state state) +{ + switch (state) { + case VFIO_DEVICE_STATE_ERROR: + return "VFIO_DEVICE_STATE_ERROR"; + case VFIO_DEVICE_STATE_STOP: + return "VFIO_DEVICE_STATE_STOP"; + case VFIO_DEVICE_STATE_RUNNING: + return "VFIO_DEVICE_STATE_RUNNING"; + case VFIO_DEVICE_STATE_STOP_COPY: + return "VFIO_DEVICE_STATE_STOP_COPY"; + case VFIO_DEVICE_STATE_RESUMING: + return "VFIO_DEVICE_STATE_RESUMING"; + case VFIO_DEVICE_STATE_RUNNING_P2P: + return "VFIO_DEVICE_STATE_RUNNING_P2P"; + default: + return "VFIO_DEVICE_STATE_INVALID"; + } + + return "VFIO_DEVICE_STATE_INVALID"; +} + +static struct pds_vfio_lm_file * +pds_vfio_get_lm_file(const char *name, const struct file_operations *fops, + int flags, u64 size) +{ + struct pds_vfio_lm_file *lm_file = NULL; + unsigned long long npages; + unsigned long long i; + struct page **pages; + int err = 0; + + if (!size) + return NULL; + + /* Alloc file structure */ + lm_file = kzalloc(sizeof(*lm_file), GFP_KERNEL); + if (!lm_file) + return NULL; + + /* Create file */ + lm_file->filep = anon_inode_getfile(name, fops, lm_file, flags); + if (!lm_file->filep) + goto err_get_file; + + stream_open(lm_file->filep->f_inode, lm_file->filep); + mutex_init(&lm_file->lock); + + lm_file->size = size; + + /* Allocate memory for file pages */ + npages = DIV_ROUND_UP_ULL(lm_file->size, PAGE_SIZE); + + pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL); + if (!pages) + goto err_alloc_pages; + + for (i = 0; i < npages; i++) { + pages[i] = alloc_page(GFP_KERNEL); + if (!pages[i]) + goto err_alloc_page; + } + + lm_file->pages = pages; + lm_file->npages = npages; + lm_file->alloc_size = npages * PAGE_SIZE; + + /* Create scatterlist of file pages to use for DMA mapping later */ + err = sg_alloc_table_from_pages(&lm_file->sg_table, pages, npages, + 0, size, GFP_KERNEL); + if (err) + goto err_alloc_sg_table; + + return lm_file; + +err_alloc_sg_table: +err_alloc_page: + /* free allocated pages */ + for (i = 0; i < npages && pages[i]; i++) + __free_page(pages[i]); + kfree(pages); +err_alloc_pages: + fput(lm_file->filep); + mutex_destroy(&lm_file->lock); +err_get_file: + kfree(lm_file); + + return NULL; +} + +static void +pds_vfio_put_lm_file(struct pds_vfio_lm_file *lm_file) +{ + unsigned long long i; + + mutex_lock(&lm_file->lock); + + lm_file->size = 0; + lm_file->alloc_size = 0; + + /* Free scatter list of file pages*/ + sg_free_table(&lm_file->sg_table); + + /* Free allocated file pages */ + for (i = 0; i < lm_file->npages && lm_file->pages[i]; i++) + __free_page(lm_file->pages[i]); + kfree(lm_file->pages); + lm_file->pages = NULL; + + /* Delete file */ + fput(lm_file->filep); + lm_file->filep = NULL; + + mutex_unlock(&lm_file->lock); + + mutex_destroy(&lm_file->lock); + + /* Free file structure */ + kfree(lm_file); +} + +void +pds_vfio_put_save_file(struct pds_vfio_pci_device *pds_vfio) +{ + if (!pds_vfio->save_file) + return; + + pds_vfio_put_lm_file(pds_vfio->save_file); + pds_vfio->save_file = NULL; +} + +void +pds_vfio_put_restore_file(struct pds_vfio_pci_device *pds_vfio) +{ + if (!pds_vfio->restore_file) + return; + + pds_vfio_put_lm_file(pds_vfio->restore_file); + pds_vfio->restore_file = NULL; +} + +static struct page * +pds_vfio_get_file_page(struct pds_vfio_lm_file *lm_file, + unsigned long offset) +{ + unsigned long cur_offset = 0; + struct scatterlist *sg; + unsigned int i; + + /* All accesses are sequential */ + if (offset < lm_file->last_offset || !lm_file->last_offset_sg) { + lm_file->last_offset = 0; + lm_file->last_offset_sg = lm_file->sg_table.sgl; + lm_file->sg_last_entry = 0; + } + + cur_offset = lm_file->last_offset; + + for_each_sg(lm_file->last_offset_sg, sg, + lm_file->sg_table.orig_nents - lm_file->sg_last_entry, + i) { + if (offset < sg->length + cur_offset) { + lm_file->last_offset_sg = sg; + lm_file->sg_last_entry += i; + lm_file->last_offset = cur_offset; + return nth_page(sg_page(sg), + (offset - cur_offset) / PAGE_SIZE); + } + cur_offset += sg->length; + } + + return NULL; +} + +static int +pds_vfio_release_file(struct inode *inode, struct file *filp) +{ + struct pds_vfio_lm_file *lm_file = filp->private_data; + + lm_file->size = 0; + + return 0; +} + +static ssize_t +pds_vfio_save_read(struct file *filp, char __user *buf, size_t len, loff_t *pos) +{ + struct pds_vfio_lm_file *lm_file = filp->private_data; + ssize_t done = 0; + + if (pos) + return -ESPIPE; + pos = &filp->f_pos; + + mutex_lock(&lm_file->lock); + if (*pos > lm_file->size) { + done = -EINVAL; + goto out_unlock; + } + + len = min_t(size_t, lm_file->size - *pos, len); + while (len) { + size_t page_offset; + struct page *page; + size_t page_len; + u8 *from_buff; + int err; + + page_offset = (*pos) % PAGE_SIZE; + page = pds_vfio_get_file_page(lm_file, *pos - page_offset); + if (!page) { + if (done == 0) + done = -EINVAL; + goto out_unlock; + } + + page_len = min_t(size_t, len, PAGE_SIZE - page_offset); + from_buff = kmap_local_page(page); + err = copy_to_user(buf, from_buff + page_offset, page_len); + kunmap_local(from_buff); + if (err) { + done = -EFAULT; + goto out_unlock; + } + *pos += page_len; + len -= page_len; + done += page_len; + buf += page_len; + } + +out_unlock: + mutex_unlock(&lm_file->lock); + return done; +} + +static const struct file_operations +pds_vfio_save_fops = { + .owner = THIS_MODULE, + .read = pds_vfio_save_read, + .release = pds_vfio_release_file, + .llseek = no_llseek, +}; + +static int +pds_vfio_get_save_file(struct pds_vfio_pci_device *pds_vfio) +{ + struct pci_dev *pdev = pds_vfio->pdev; + struct pds_vfio_lm_file *lm_file; + int err = 0; + u64 size; + + /* Get live migration state size in this state */ + err = pds_vfio_get_lm_status_cmd(pds_vfio, &size); + if (err) { + dev_err(&pdev->dev, "failed to get save status: %pe\n", + ERR_PTR(err)); + goto err_get_lm_status; + } + + dev_dbg(&pdev->dev, "save status, size = %lld\n", size); + + if (!size) { + err = -EIO; + dev_err(&pdev->dev, "invalid state size\n"); + goto err_get_lm_status; + } + + lm_file = pds_vfio_get_lm_file(PDS_VFIO_LM_FILENAME, + &pds_vfio_save_fops, + O_RDONLY, size); + if (!lm_file) { + err = -ENOENT; + dev_err(&pdev->dev, "failed to create save file\n"); + goto err_get_lm_file; + } + + dev_dbg(&pdev->dev, "size = %lld, alloc_size = %lld, npages = %lld\n", + lm_file->size, lm_file->alloc_size, lm_file->npages); + + pds_vfio->save_file = lm_file; + + return 0; + +err_get_lm_file: +err_get_lm_status: + return err; +} + +static ssize_t +pds_vfio_restore_write(struct file *filp, const char __user *buf, size_t len, loff_t *pos) +{ + struct pds_vfio_lm_file *lm_file = filp->private_data; + loff_t requested_length; + ssize_t done = 0; + + if (pos) + return -ESPIPE; + + pos = &filp->f_pos; + + if (*pos < 0 || + check_add_overflow((loff_t)len, *pos, &requested_length)) + return -EINVAL; + + mutex_lock(&lm_file->lock); + + while (len) { + size_t page_offset; + struct page *page; + size_t page_len; + u8 *to_buff; + int err; + + page_offset = (*pos) % PAGE_SIZE; + page = pds_vfio_get_file_page(lm_file, *pos - page_offset); + if (!page) { + if (done == 0) + done = -EINVAL; + goto out_unlock; + } + + page_len = min_t(size_t, len, PAGE_SIZE - page_offset); + to_buff = kmap_local_page(page); + err = copy_from_user(to_buff + page_offset, buf, page_len); + kunmap_local(to_buff); + if (err) { + done = -EFAULT; + goto out_unlock; + } + *pos += page_len; + len -= page_len; + done += page_len; + buf += page_len; + lm_file->size += page_len; + } +out_unlock: + mutex_unlock(&lm_file->lock); + return done; +} + +static const struct file_operations +pds_vfio_restore_fops = { + .owner = THIS_MODULE, + .write = pds_vfio_restore_write, + .release = pds_vfio_release_file, + .llseek = no_llseek, +}; + +static int +pds_vfio_get_restore_file(struct pds_vfio_pci_device *pds_vfio) +{ + struct pci_dev *pdev = pds_vfio->pdev; + struct pds_vfio_lm_file *lm_file; + int err = 0; + u64 size; + + size = sizeof(union pds_lm_dev_state); + + dev_dbg(&pdev->dev, "restore status, size = %lld\n", size); + + if (!size) { + err = -EIO; + dev_err(&pdev->dev, "invalid state size"); + goto err_get_lm_status; + } + + lm_file = pds_vfio_get_lm_file(PDS_VFIO_LM_FILENAME, + &pds_vfio_restore_fops, + O_WRONLY, size); + if (!lm_file) { + err = -ENOENT; + dev_err(&pdev->dev, "failed to create restore file"); + goto err_get_lm_file; + } + pds_vfio->restore_file = lm_file; + + return 0; + +err_get_lm_file: +err_get_lm_status: + return err; +} + +struct file * +pds_vfio_step_device_state_locked(struct pds_vfio_pci_device *pds_vfio, + enum vfio_device_mig_state next) +{ + enum vfio_device_mig_state cur = pds_vfio->state; + struct device *dev = &pds_vfio->pdev->dev; + int err = 0; + + dev_dbg(dev, "%s => %s\n", + pds_vfio_lm_state(cur), pds_vfio_lm_state(next)); + + if (cur == VFIO_DEVICE_STATE_STOP && next == VFIO_DEVICE_STATE_STOP_COPY) { + /* Device is already stopped + * create save device data file & get device state from firmware + */ + err = pds_vfio_get_save_file(pds_vfio); + if (err) + return ERR_PTR(err); + + /* Get device state */ + err = pds_vfio_get_lm_state_cmd(pds_vfio); + if (err) { + pds_vfio_put_save_file(pds_vfio); + return ERR_PTR(err); + } + + return pds_vfio->save_file->filep; + } + + if (cur == VFIO_DEVICE_STATE_STOP_COPY && next == VFIO_DEVICE_STATE_STOP) { + /* Device is already stopped + * delete the save device state file + */ + pds_vfio_put_save_file(pds_vfio); + pds_vfio_send_host_vf_lm_status_cmd(pds_vfio, + PDS_LM_STA_NONE); + return NULL; + } + + if (cur == VFIO_DEVICE_STATE_STOP && next == VFIO_DEVICE_STATE_RESUMING) { + /* create resume device data file */ + err = pds_vfio_get_restore_file(pds_vfio); + if (err) + return ERR_PTR(err); + + return pds_vfio->restore_file->filep; + } + + if (cur == VFIO_DEVICE_STATE_RESUMING && next == VFIO_DEVICE_STATE_STOP) { + /* Set device state */ + err = pds_vfio_set_lm_state_cmd(pds_vfio); + if (err) + return ERR_PTR(err); + + /* delete resume device data file */ + pds_vfio_put_restore_file(pds_vfio); + return NULL; + } + + if (cur == VFIO_DEVICE_STATE_RUNNING && next == VFIO_DEVICE_STATE_RUNNING_P2P) { + pds_vfio_send_host_vf_lm_status_cmd(pds_vfio, PDS_LM_STA_IN_PROGRESS); + /* Device should be stopped + * no interrupts, dma or change in internal state + */ + err = pds_vfio_suspend_device_cmd(pds_vfio); + if (err) + return ERR_PTR(err); + + return NULL; + } + + if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && next == VFIO_DEVICE_STATE_RUNNING) { + /* Device should be functional + * interrupts, dma, mmio or changes to internal state is allowed + */ + err = pds_vfio_resume_device_cmd(pds_vfio); + if (err) + return ERR_PTR(err); + + pds_vfio_send_host_vf_lm_status_cmd(pds_vfio, + PDS_LM_STA_NONE); + return NULL; + } + + if (cur == VFIO_DEVICE_STATE_STOP && next == VFIO_DEVICE_STATE_RUNNING_P2P) + return NULL; + + if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && next == VFIO_DEVICE_STATE_STOP) + return NULL; + + return ERR_PTR(-EINVAL); +} diff --git a/drivers/vfio/pci/pds/lm.h b/drivers/vfio/pci/pds/lm.h new file mode 100644 index 000000000000..3dd97b807db6 --- /dev/null +++ b/drivers/vfio/pci/pds/lm.h @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2022 Pensando Systems, Inc */ + +#ifndef _LM_H_ +#define _LM_H_ + +#include <linux/fs.h> +#include <linux/mutex.h> +#include <linux/scatterlist.h> +#include <linux/types.h> +#include <linux/vfio.h> + +#include <linux/pds/pds_lm.h> + +struct pds_vfio_lm_file { + struct file *filep; + struct mutex lock; /* protect live migration data file */ + u64 size; /* Size with valid data */ + u64 alloc_size; /* Total allocated size. Always >= len */ + struct page **pages; /* Backing pages for file */ + unsigned long long npages; + struct sg_table sg_table; /* SG table for backing pages */ + struct pds_lm_sg_elem *sgl; /* DMA mapping */ + dma_addr_t sgl_addr; + u16 num_sge; + struct scatterlist *last_offset_sg; /* Iterator */ + unsigned int sg_last_entry; + unsigned long last_offset; +}; + +struct pds_vfio_pci_device; + +struct file * +pds_vfio_step_device_state_locked(struct pds_vfio_pci_device *pds_vfio, + enum vfio_device_mig_state next); +const char * +pds_vfio_lm_state(enum vfio_device_mig_state state); +void +pds_vfio_put_save_file(struct pds_vfio_pci_device *pds_vfio); +void +pds_vfio_put_restore_file(struct pds_vfio_pci_device *pds_vfio); + +#endif /* _LM_H_ */ diff --git a/drivers/vfio/pci/pds/pci_drv.c b/drivers/vfio/pci/pds/pci_drv.c index d6ad15719ec4..aafa013ee8c2 100644 --- a/drivers/vfio/pci/pds/pci_drv.c +++ b/drivers/vfio/pci/pds/pci_drv.c @@ -67,12 +67,27 @@ pds_vfio_pci_table[] = { }; MODULE_DEVICE_TABLE(pci, pds_vfio_pci_table); +static void +pds_vfio_pci_aer_reset_done(struct pci_dev *pdev) +{ + struct pds_vfio_pci_device *pds_vfio = pds_vfio_pci_drvdata(pdev); + + pds_vfio_reset(pds_vfio); +} + +static const struct +pci_error_handlers pds_vfio_pci_err_handlers = { + .reset_done = pds_vfio_pci_aer_reset_done, + .error_detected = vfio_pci_core_aer_err_detected, +}; + static struct pci_driver pds_vfio_pci_driver = { .name = PDS_VFIO_DRV_NAME, .id_table = pds_vfio_pci_table, .probe = pds_vfio_pci_probe, .remove = pds_vfio_pci_remove, + .err_handler = &pds_vfio_pci_err_handlers, .driver_managed_dma = true, }; diff --git a/drivers/vfio/pci/pds/vfio_dev.c b/drivers/vfio/pci/pds/vfio_dev.c index 30c3bb47a2be..d28f96f93097 100644 --- a/drivers/vfio/pci/pds/vfio_dev.c +++ b/drivers/vfio/pci/pds/vfio_dev.c @@ -4,6 +4,7 @@ #include <linux/vfio.h> #include <linux/vfio_pci_core.h> +#include "lm.h" #include "vfio_dev.h" #include "aux_drv.h" @@ -16,6 +17,97 @@ pds_vfio_pci_drvdata(struct pci_dev *pdev) vfio_coredev); } +static void +pds_vfio_state_mutex_unlock(struct pds_vfio_pci_device *pds_vfio) +{ +again: + spin_lock(&pds_vfio->reset_lock); + if (pds_vfio->deferred_reset) { + pds_vfio->deferred_reset = false; + if (pds_vfio->state == VFIO_DEVICE_STATE_ERROR) { + pds_vfio->state = VFIO_DEVICE_STATE_RUNNING; + pds_vfio_put_restore_file(pds_vfio); + pds_vfio_put_save_file(pds_vfio); + } + spin_unlock(&pds_vfio->reset_lock); + goto again; + } + mutex_unlock(&pds_vfio->state_mutex); + spin_unlock(&pds_vfio->reset_lock); +} + +void +pds_vfio_reset(struct pds_vfio_pci_device *pds_vfio) +{ + spin_lock(&pds_vfio->reset_lock); + pds_vfio->deferred_reset = true; + if (!mutex_trylock(&pds_vfio->state_mutex)) { + spin_unlock(&pds_vfio->reset_lock); + return; + } + spin_unlock(&pds_vfio->reset_lock); + pds_vfio_state_mutex_unlock(pds_vfio); +} + +static struct file * +pds_vfio_set_device_state(struct vfio_device *vdev, + enum vfio_device_mig_state new_state) +{ + struct pds_vfio_pci_device *pds_vfio = + container_of(vdev, struct pds_vfio_pci_device, + vfio_coredev.vdev); + struct file *res = NULL; + + if (!pds_vfio->vfio_aux) + return ERR_PTR(-ENODEV); + + mutex_lock(&pds_vfio->state_mutex); + while (new_state != pds_vfio->state) { + enum vfio_device_mig_state next_state; + + int err = vfio_mig_get_next_state(vdev, pds_vfio->state, + new_state, &next_state); + if (err) { + res = ERR_PTR(err); + break; + } + + res = pds_vfio_step_device_state_locked(pds_vfio, next_state); + if (IS_ERR(res)) + break; + + pds_vfio->state = next_state; + + if (WARN_ON(res && new_state != pds_vfio->state)) { + res = ERR_PTR(-EINVAL); + break; + } + } + pds_vfio_state_mutex_unlock(pds_vfio); + + return res; +} + +static int +pds_vfio_get_device_state(struct vfio_device *vdev, + enum vfio_device_mig_state *current_state) +{ + struct pds_vfio_pci_device *pds_vfio = + container_of(vdev, struct pds_vfio_pci_device, + vfio_coredev.vdev); + + mutex_lock(&pds_vfio->state_mutex); + *current_state = pds_vfio->state; + pds_vfio_state_mutex_unlock(pds_vfio); + return 0; +} + +static const struct vfio_migration_ops +pds_vfio_lm_ops = { + .migration_set_state = pds_vfio_set_device_state, + .migration_get_state = pds_vfio_get_device_state +}; + static int pds_vfio_init_device(struct vfio_device *vdev) { @@ -35,10 +127,19 @@ pds_vfio_init_device(struct vfio_device *vdev) vfio_aux = pds_vfio_aux_get_drvdata(pds_vfio->pci_id); if (vfio_aux) { vfio_aux->pds_vfio = pds_vfio; + pds_vfio->coredev = vfio_aux->padev->aux_dev.dev.parent; pds_vfio->vfio_aux = vfio_aux; pds_vfio_put_aux_dev(vfio_aux); } + vdev->migration_flags = VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P; + vdev->mig_ops = &pds_vfio_lm_ops; + + dev_dbg(&pdev->dev, "%s: PF %#04x VF %#04x (%d) vf_id %d domain %d vfio_aux %p pds_vfio %p\n", + __func__, pci_dev_id(pdev->physfn), + pds_vfio->pci_id, pds_vfio->pci_id, pds_vfio->vf_id, + pci_domain_nr(pdev->bus), pds_vfio->vfio_aux, pds_vfio); + return 0; } @@ -54,18 +155,34 @@ pds_vfio_open_device(struct vfio_device *vdev) if (err) return err; + mutex_init(&pds_vfio->state_mutex); + dev_dbg(&pds_vfio->pdev->dev, "%s: %s => VFIO_DEVICE_STATE_RUNNING\n", + __func__, pds_vfio_lm_state(pds_vfio->state)); + pds_vfio->state = VFIO_DEVICE_STATE_RUNNING; + vfio_pci_core_finish_enable(&pds_vfio->vfio_coredev); return 0; } +static void +pds_vfio_close_device(struct vfio_device *vdev) +{ + struct pds_vfio_pci_device *pds_vfio = + container_of(vdev, struct pds_vfio_pci_device, + vfio_coredev.vdev); + + mutex_destroy(&pds_vfio->state_mutex); + vfio_pci_core_close_device(vdev); +} + static const struct vfio_device_ops pds_vfio_ops = { .name = "pds-vfio", .init = pds_vfio_init_device, .release = vfio_pci_core_release_dev, .open_device = pds_vfio_open_device, - .close_device = vfio_pci_core_close_device, + .close_device = pds_vfio_close_device, .ioctl = vfio_pci_core_ioctl, .device_feature = vfio_pci_core_ioctl_feature, .read = vfio_pci_core_read, diff --git a/drivers/vfio/pci/pds/vfio_dev.h b/drivers/vfio/pci/pds/vfio_dev.h index b16668693e1f..a09570eec6fa 100644 --- a/drivers/vfio/pci/pds/vfio_dev.h +++ b/drivers/vfio/pci/pds/vfio_dev.h @@ -7,10 +7,20 @@ #include <linux/pci.h> #include <linux/vfio_pci_core.h> +#include "lm.h" + struct pds_vfio_pci_device { struct vfio_pci_core_device vfio_coredev; struct pci_dev *pdev; struct pds_vfio_aux *vfio_aux; + struct device *coredev; + + struct pds_vfio_lm_file *save_file; + struct pds_vfio_lm_file *restore_file; + struct mutex state_mutex; /* protect migration state */ + enum vfio_device_mig_state state; + spinlock_t reset_lock; /* protect reset_done flow */ + u8 deferred_reset; int vf_id; int pci_id; @@ -20,5 +30,7 @@ const struct vfio_device_ops * pds_vfio_ops_info(void); struct pds_vfio_pci_device * pds_vfio_pci_drvdata(struct pci_dev *pdev); +void +pds_vfio_reset(struct pds_vfio_pci_device *pds_vfio); #endif /* _VFIO_DEV_H_ */ diff --git a/include/linux/pds/pds_lm.h b/include/linux/pds/pds_lm.h index fdaf2bf71d35..28ebd62f7583 100644 --- a/include/linux/pds/pds_lm.h +++ b/include/linux/pds/pds_lm.h @@ -8,5 +8,211 @@ #define PDS_DEV_TYPE_LM_STR "LM" #define PDS_LM_DEV_NAME PDS_CORE_DRV_NAME "." PDS_DEV_TYPE_LM_STR +#define PDS_LM_DEVICE_STATE_LENGTH 65536 +#define PDS_LM_CHECK_DEVICE_STATE_LENGTH(X) \ + PDS_CORE_SIZE_CHECK(union, PDS_LM_DEVICE_STATE_LENGTH, X) + +/* + * enum pds_lm_cmd_opcode - Live Migration Device commands + */ +enum pds_lm_cmd_opcode { + PDS_LM_CMD_HOST_VF_STATUS = 1, + + /* Device state commands */ + PDS_LM_CMD_STATUS = 16, + PDS_LM_CMD_SUSPEND = 18, + PDS_LM_CMD_SUSPEND_STATUS = 19, + PDS_LM_CMD_RESUME = 20, + PDS_LM_CMD_SAVE = 21, + PDS_LM_CMD_RESTORE = 22, +}; + +/** + * struct pds_lm_cmd - generic command + * @opcode: Opcode + * @rsvd: Word boundary padding + * @vf_id: VF id + * @rsvd2: Structure padding to 60 Bytes + */ +struct pds_lm_cmd { + u8 opcode; + u8 rsvd; + __le16 vf_id; + u8 rsvd2[56]; +}; + +/** + * struct pds_lm_comp - generic command completion + * @status: Status of the command (enum pds_core_status_code) + * @rsvd: Structure padding to 16 Bytes + */ +struct pds_lm_comp { + u8 status; + u8 rsvd[15]; +}; + +/** + * struct pds_lm_status_cmd - STATUS command + * @opcode: Opcode + * @rsvd: Word boundary padding + * @vf_id: VF id + */ +struct pds_lm_status_cmd { + u8 opcode; + u8 rsvd; + __le16 vf_id; +}; + +/** + * struct pds_lm_status_comp - STATUS command completion + * @status: Status of the command (enum pds_core_status_code) + * @rsvd: Word boundary padding + * @comp_index: Index in the desc ring for which this is the completion + * @size: Size of the device state + * @rsvd2: Word boundary padding + * @color: Color bit + */ +struct pds_lm_status_comp { + u8 status; + u8 rsvd; + __le16 comp_index; + union { + __le64 size; + u8 rsvd2[11]; + } __packed; + u8 color; +}; + +/** + * struct pds_lm_suspend_cmd - SUSPEND command + * @opcode: Opcode PDS_LM_CMD_SUSPEND + * @rsvd: Word boundary padding + * @vf_id: VF id + */ +struct pds_lm_suspend_cmd { + u8 opcode; + u8 rsvd; + __le16 vf_id; +}; + +/** + * struct pds_lm_suspend_comp - SUSPEND command completion + * @status: Status of the command (enum pds_core_status_code) + * @rsvd: Word boundary padding + * @comp_index: Index in the desc ring for which this is the completion + * @state_size: Size of the device state computed post suspend. + * @rsvd2: Word boundary padding + * @color: Color bit + */ +struct pds_lm_suspend_comp { + u8 status; + u8 rsvd; + __le16 comp_index; + union { + __le64 state_size; + u8 rsvd2[11]; + } __packed; + u8 color; +}; + +/** + * struct pds_lm_suspend_status_cmd - SUSPEND status command + * @opcode: Opcode PDS_AQ_CMD_LM_SUSPEND_STATUS + * @rsvd: Word boundary padding + * @vf_id: VF id + */ +struct pds_lm_suspend_status_cmd { + u8 opcode; + u8 rsvd; + __le16 vf_id; +}; + +/** + * struct pds_lm_resume_cmd - RESUME command + * @opcode: Opcode PDS_LM_CMD_RESUME + * @rsvd: Word boundary padding + * @vf_id: VF id + */ +struct pds_lm_resume_cmd { + u8 opcode; + u8 rsvd; + __le16 vf_id; +}; + +/** + * struct pds_lm_sg_elem - Transmit scatter-gather (SG) descriptor element + * @addr: DMA address of SG element data buffer + * @len: Length of SG element data buffer, in bytes + * @rsvd: Word boundary padding + */ +struct pds_lm_sg_elem { + __le64 addr; + __le32 len; + __le16 rsvd[2]; +}; + +/** + * struct pds_lm_save_cmd - SAVE command + * @opcode: Opcode PDS_LM_CMD_SAVE + * @rsvd: Word boundary padding + * @vf_id: VF id + * @rsvd2: Word boundary padding + * @sgl_addr: IOVA address of the SGL to dma the device state + * @num_sge: Total number of SG elements + */ +struct pds_lm_save_cmd { + u8 opcode; + u8 rsvd; + __le16 vf_id; + u8 rsvd2[4]; + __le64 sgl_addr; + __le32 num_sge; +} __packed; + +/** + * struct pds_lm_restore_cmd - RESTORE command + * @opcode: Opcode PDS_LM_CMD_RESTORE + * @rsvd: Word boundary padding + * @vf_id: VF id + * @rsvd2: Word boundary padding + * @sgl_addr: IOVA address of the SGL to dma the device state + * @num_sge: Total number of SG elements + */ +struct pds_lm_restore_cmd { + u8 opcode; + u8 rsvd; + __le16 vf_id; + u8 rsvd2[4]; + __le64 sgl_addr; + __le32 num_sge; +} __packed; + +/** + * union pds_lm_dev_state - device state information + * @words: Device state words + */ +union pds_lm_dev_state { + __le32 words[PDS_LM_DEVICE_STATE_LENGTH / sizeof(__le32)]; +}; + +enum pds_lm_host_vf_status { + PDS_LM_STA_NONE = 0, + PDS_LM_STA_IN_PROGRESS, + PDS_LM_STA_MAX, +}; + +/** + * struct pds_lm_host_vf_status_cmd - HOST_VF_STATUS command + * @opcode: Opcode PDS_LM_CMD_HOST_VF_STATUS + * @rsvd: Word boundary padding + * @vf_id: VF id + * @status: Current LM status of host VF driver (enum pds_lm_host_status) + */ +struct pds_lm_host_vf_status_cmd { + u8 opcode; + u8 rsvd; + __le16 vf_id; + u8 status; +}; #endif /* _PDS_LM_H_ */ -- 2.17.1