From: Shay Drory <shayd@xxxxxxxxxx>

In order to support PRE_COPY, the mlx5 driver transfers multiple states
(images) of the device: the source VF saves and transfers multiple
states, and the target VF loads them in that order.

The device saves three kinds of states:
1) Initial state - when the device moves to PRE_COPY state.
2) Middle state - during the PRE_COPY phase, via
   VFIO_MIG_GET_PRECOPY_INFO. There can be multiple states of this kind.
3) Final state - when the device moves to STOP_COPY state.

After moving to PRE_COPY state, the user holds the saving migf FD and
can use it; for example, the user can start transferring data via the
read() callback. The user can also switch from PRE_COPY to STOP_COPY
whenever they see fit, which invokes saving of the final state.

This means the mlx5 VFIO device can be switched to STOP_COPY without
transferring any data in PRE_COPY state. Therefore, when the device
moves to STOP_COPY, mlx5 stores the final state in a dedicated data
structure.

Signed-off-by: Shay Drory <shayd@xxxxxxxxxx>
Signed-off-by: Yishai Hadas <yishaih@xxxxxxxxxx>
---
 drivers/vfio/pci/mlx5/cmd.c  | 56 +++++++++++++++-------
 drivers/vfio/pci/mlx5/cmd.h  | 16 ++++++-
 drivers/vfio/pci/mlx5/main.c | 93 +++++++++++++++++++++++++++++++-----
 3 files changed, 134 insertions(+), 31 deletions(-)
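
[Note, not part of the patch: below is a minimal userspace sketch of the
PRE_COPY flow this enables, assuming the generic VFIO migration uAPI
(VFIO_DEVICE_FEATURE, VFIO_MIG_GET_PRECOPY_INFO) plus the PRE_COPY states
this series builds on. Device-fd setup and error handling are omitted,
forward_to_target() is a hypothetical stand-in for the migration channel,
and the exact read() semantics during PRE_COPY are simplified.]

#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

/* Hypothetical transport helper - stands in for the real migration channel. */
static void forward_to_target(const void *buf, ssize_t len)
{
	(void)buf; (void)len;
}

/* Move the device to a new migration state; returns the data FD, if any. */
static int set_mig_state(int device_fd, unsigned int new_state)
{
	char buf[sizeof(struct vfio_device_feature) +
		 sizeof(struct vfio_device_feature_mig_state)] = {};
	struct vfio_device_feature *feature = (void *)buf;
	struct vfio_device_feature_mig_state *mig = (void *)feature->data;

	feature->argsz = sizeof(buf);
	feature->flags = VFIO_DEVICE_FEATURE_SET |
			 VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE;
	mig->device_state = new_state;
	if (ioctl(device_fd, VFIO_DEVICE_FEATURE, feature) < 0)
		return -1;
	return mig->data_fd;
}

static void precopy_flow(int device_fd)
{
	struct vfio_precopy_info info = { .argsz = sizeof(info) };
	char chunk[4096];
	ssize_t n;
	int migf;

	/* RUNNING -> PRE_COPY: the device saves the initial state. */
	migf = set_mig_state(device_fd, VFIO_DEVICE_STATE_PRE_COPY);

	/* Transfer initial/middle state data while the VF keeps running. */
	ioctl(migf, VFIO_MIG_GET_PRECOPY_INFO, &info);
	while ((n = read(migf, chunk, sizeof(chunk))) > 0)
		forward_to_target(chunk, n);

	/* PRE_COPY -> STOP_COPY: the device saves the final state. */
	set_mig_state(device_fd, VFIO_DEVICE_STATE_STOP_COPY);
	while ((n = read(migf, chunk, sizeof(chunk))) > 0)
		forward_to_target(chunk, n);

	close(migf);
}

The point the sketch illustrates: the same migf FD stays usable across the
PRE_COPY -> STOP_COPY switch, which is why the driver keeps the final state
in a dedicated table instead of overwriting data the user may still be
reading.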
diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c
index 24c6d2e4c2be..eb684455c2b2 100644
--- a/drivers/vfio/pci/mlx5/cmd.c
+++ b/drivers/vfio/pci/mlx5/cmd.c
@@ -14,6 +14,7 @@ _mlx5vf_free_page_tracker_resources(struct mlx5vf_pci_core_device *mvdev);
 
 int mlx5vf_cmd_suspend_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod)
 {
+	struct mlx5_vf_migration_file *migf = mvdev->saving_migf;
 	u32 out[MLX5_ST_SZ_DW(suspend_vhca_out)] = {};
 	u32 in[MLX5_ST_SZ_DW(suspend_vhca_in)] = {};
 
@@ -21,6 +22,14 @@ int mlx5vf_cmd_suspend_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod)
 	if (mvdev->mdev_detach)
 		return -ENOTCONN;
 
+	/*
+	 * In case PRE_COPY is used, saving_migf is exposed while the device is
+	 * running. Make sure to run only once there is no active save command.
+	 * Running both in parallel might end up with a failure in the save
+	 * command once it tries to turn on 'tracking' on a suspended device.
+	 */
+	if (migf)
+		wait_event(migf->save_wait, !migf->save_cb_active);
 	MLX5_SET(suspend_vhca_in, in, opcode, MLX5_CMD_OP_SUSPEND_VHCA);
 	MLX5_SET(suspend_vhca_in, in, vhca_id, mvdev->vhca_id);
 	MLX5_SET(suspend_vhca_in, in, op_mod, op_mod);
@@ -45,7 +54,7 @@ int mlx5vf_cmd_resume_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod)
 }
 
 int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev,
-					  size_t *state_size)
+					  size_t *state_size, u8 query_flags)
 {
 	u32 out[MLX5_ST_SZ_DW(query_vhca_migration_state_out)] = {};
 	u32 in[MLX5_ST_SZ_DW(query_vhca_migration_state_in)] = {};
@@ -59,6 +68,8 @@ int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev,
 		 MLX5_CMD_OP_QUERY_VHCA_MIGRATION_STATE);
 	MLX5_SET(query_vhca_migration_state_in, in, vhca_id, mvdev->vhca_id);
 	MLX5_SET(query_vhca_migration_state_in, in, op_mod, 0);
+	MLX5_SET(query_vhca_migration_state_in, in, incremental,
+		 query_flags & MLX5VF_QUERY_INC);
 
 	ret = mlx5_cmd_exec_inout(mvdev->mdev, query_vhca_migration_state, in,
 				  out);
@@ -210,11 +221,11 @@ static int mlx5vf_cmd_get_vhca_id(struct mlx5_core_dev *mdev, u16 function_id,
 }
 
 static int _create_mkey(struct mlx5_core_dev *mdev, u32 pdn,
-			struct mlx5_vf_migration_file *migf,
+			struct sg_table *sgt,
 			struct mlx5_vhca_recv_buf *recv_buf,
 			u32 *mkey, size_t length)
 {
-	size_t npages = migf ? DIV_ROUND_UP(length, PAGE_SIZE) :
+	size_t npages = sgt ? DIV_ROUND_UP(length, PAGE_SIZE) :
 				recv_buf->npages;
 	int err = 0, inlen;
 	__be64 *mtt;
@@ -232,10 +243,10 @@ static int _create_mkey(struct mlx5_core_dev *mdev, u32 pdn,
 		 DIV_ROUND_UP(npages, 2));
 	mtt = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt);
 
-	if (migf) {
+	if (sgt) {
 		struct sg_dma_page_iter dma_iter;
 
-		for_each_sgtable_dma_page(&migf->table.sgt, &dma_iter, 0)
+		for_each_sgtable_dma_page(sgt, &dma_iter, 0)
 			*mtt++ = cpu_to_be64(sg_page_iter_dma_address(&dma_iter));
 	} else {
 		int i;
@@ -255,7 +266,7 @@ static int _create_mkey(struct mlx5_core_dev *mdev, u32 pdn,
 	MLX5_SET(mkc, mkc, qpn, 0xffffff);
 	MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT);
 	MLX5_SET(mkc, mkc, translations_octword_size, DIV_ROUND_UP(npages, 2));
-	MLX5_SET64(mkc, mkc, len, migf ? length : (npages * PAGE_SIZE));
+	MLX5_SET64(mkc, mkc, len, sgt ? length : (npages * PAGE_SIZE));
 	err = mlx5_core_create_mkey(mdev, mkey, in, inlen);
 	kvfree(in);
 	return err;
@@ -277,7 +288,7 @@ void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work)
 	mutex_unlock(&migf->lock);
 
 	mlx5_core_destroy_mkey(mdev, async_data->mkey);
-	dma_unmap_sgtable(mdev->device, &migf->table.sgt, DMA_FROM_DEVICE, 0);
+	dma_unmap_sgtable(mdev->device, async_data->sgt, DMA_FROM_DEVICE, 0);
 	mlx5_core_dealloc_pd(mdev, async_data->pdn);
 	kvfree(async_data->out);
 	migf->save_cb_active = false;
@@ -293,9 +304,14 @@ static void mlx5vf_save_callback(int status, struct mlx5_async_work *context)
 		struct mlx5_vf_migration_file, async_data);
 
 	if (!status) {
-		WRITE_ONCE(migf->image_length,
-			   MLX5_GET(save_vhca_state_out, async_data->out,
-				    actual_image_size));
+		size_t len = MLX5_GET(save_vhca_state_out, async_data->out,
+				      actual_image_size);
+
+		if (async_data->sgt == &migf->final_table.sgt)
+			WRITE_ONCE(migf->final_length, len);
+		else
+			WRITE_ONCE(migf->image_length, len);
+
 		wake_up_interruptible(&migf->poll_wait);
 	}
 
@@ -308,7 +324,8 @@ static void mlx5vf_save_callback(int status, struct mlx5_async_work *context)
 }
 
 int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev,
-			       struct mlx5_vf_migration_file *migf)
+			       struct mlx5_vf_migration_file *migf, bool inc,
+			       bool track)
 {
 	u32 out_size = MLX5_ST_SZ_BYTES(save_vhca_state_out);
 	u32 in[MLX5_ST_SZ_DW(save_vhca_state_in)] = {};
@@ -327,12 +344,15 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev,
 	if (err)
 		return err;
 
-	err = dma_map_sgtable(mdev->device, &migf->table.sgt, DMA_FROM_DEVICE,
-			      0);
+	async_data = &migf->async_data;
+	async_data->sgt = (!track && inc) ? &migf->final_table.sgt :
+			  &migf->table.sgt;
+	err = dma_map_sgtable(mdev->device, async_data->sgt,
+			      DMA_FROM_DEVICE, 0);
 	if (err)
 		goto err_dma_map;
 
-	err = _create_mkey(mdev, pdn, migf, NULL,
+	err = _create_mkey(mdev, pdn, async_data->sgt, NULL,
 			   &mkey, migf->allocated_length);
 	if (err)
 		goto err_create_mkey;
@@ -343,8 +363,9 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev,
 	MLX5_SET(save_vhca_state_in, in, vhca_id, mvdev->vhca_id);
 	MLX5_SET(save_vhca_state_in, in, mkey, mkey);
 	MLX5_SET(save_vhca_state_in, in, size, migf->allocated_length);
+	MLX5_SET(save_vhca_state_in, in, incremental, inc);
+	MLX5_SET(save_vhca_state_in, in, set_track, track);
 
-	async_data = &migf->async_data;
 	async_data->out = kvzalloc(out_size, GFP_KERNEL);
 	if (!async_data->out) {
 		err = -ENOMEM;
@@ -370,7 +391,7 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev,
 err_out:
 	mlx5_core_destroy_mkey(mdev, mkey);
 err_create_mkey:
-	dma_unmap_sgtable(mdev->device, &migf->table.sgt, DMA_FROM_DEVICE, 0);
+	dma_unmap_sgtable(mdev->device, async_data->sgt, DMA_FROM_DEVICE, 0);
 err_dma_map:
 	mlx5_core_dealloc_pd(mdev, pdn);
 	migf->save_cb_active = false;
@@ -405,7 +426,8 @@ int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev,
 	if (err)
 		goto err_reg;
 
-	err = _create_mkey(mdev, pdn, migf, NULL, &mkey, migf->image_length);
+	err = _create_mkey(mdev, pdn, &migf->table.sgt, NULL, &mkey,
+			   migf->image_length);
 	if (err)
 		goto err_mkey;
 
diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h
index b1fa1a0418a5..c12fa81ba53f 100644
--- a/drivers/vfio/pci/mlx5/cmd.h
+++ b/drivers/vfio/pci/mlx5/cmd.h
@@ -15,6 +15,7 @@
 struct mlx5vf_async_data {
 	struct mlx5_async_work cb_work;
 	struct work_struct work;
+	struct sg_table *sgt;
 	int status;
 	u32 pdn;
 	u32 mkey;
@@ -31,6 +32,12 @@ struct mlx5_vf_migration_file {
 
 	struct sg_append_table table;
 	size_t image_length;
 	size_t allocated_length;
+	/*
+	 * The device can be moved to stop_copy before the previous state was
+	 * fully read. Another set of variables is needed to maintain it.
+	 */
+	size_t final_length;
+	struct sg_append_table final_table;
 	/* Optimize mlx5vf_get_migration_page() for sequential access */
 	struct scatterlist *last_offset_sg;
@@ -115,17 +122,22 @@ struct mlx5vf_pci_core_device {
 	struct mlx5_core_dev *mdev;
 };
 
+enum {
+	MLX5VF_QUERY_INC = (1UL << 0),
+};
+
 int mlx5vf_cmd_suspend_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod);
 int mlx5vf_cmd_resume_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod);
 int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev,
-					  size_t *state_size);
+					  size_t *state_size, u8 query_flags);
 void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev,
 			       const struct vfio_migration_ops *mig_ops,
 			       const struct vfio_log_ops *log_ops);
 void mlx5vf_cmd_remove_migratable(struct mlx5vf_pci_core_device *mvdev);
 void mlx5vf_cmd_close_migratable(struct mlx5vf_pci_core_device *mvdev);
 int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev,
-			       struct mlx5_vf_migration_file *migf);
+			       struct mlx5_vf_migration_file *migf, bool inc,
+			       bool track);
 int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev,
 			       struct mlx5_vf_migration_file *migf);
 void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev);
diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c
index 624b1a99dc21..10e073c32ab1 100644
--- a/drivers/vfio/pci/mlx5/main.c
+++ b/drivers/vfio/pci/mlx5/main.c
@@ -64,7 +64,8 @@ mlx5vf_get_migration_page(struct mlx5_vf_migration_file *migf,
 }
 
 static int mlx5vf_add_migration_pages(struct mlx5_vf_migration_file *migf,
-				      unsigned int npages)
+				      unsigned int npages,
+				      struct sg_append_table *table)
 {
 	unsigned int to_alloc = npages;
 	struct page **page_list;
@@ -85,7 +86,7 @@ static int mlx5vf_add_migration_pages(struct mlx5_vf_migration_file *migf,
 		}
 		to_alloc -= filled;
 		ret = sg_alloc_append_table_from_pages(
-			&migf->table, page_list, filled, 0,
+			table, page_list, filled, 0,
 			filled << PAGE_SHIFT, UINT_MAX, SG_MAX_SINGLE_ALLOC,
 			GFP_KERNEL);
 
@@ -118,7 +119,11 @@ static void mlx5vf_disable_fd(struct mlx5_vf_migration_file *migf)
 	migf->disabled = true;
 	migf->image_length = 0;
 	migf->allocated_length = 0;
+	migf->final_length = 0;
 	migf->filp->f_pos = 0;
+	for_each_sgtable_page(&migf->final_table.sgt, &sg_iter, 0)
+		__free_page(sg_page_iter_page(&sg_iter));
+	sg_free_append_table(&migf->final_table);
 	mutex_unlock(&migf->lock);
 }
 
@@ -215,6 +220,16 @@ static __poll_t mlx5vf_save_poll(struct file *filp,
 	return pollflags;
 }
 
+/*
+ * The FD is exposed and the user can use it after receiving an error.
+ * Mark migf in error, and wake the user.
+ */
+static void mlx5vf_mark_err(struct mlx5_vf_migration_file *migf)
+{
+	migf->is_err = true;
+	wake_up_interruptible(&migf->poll_wait);
+}
+
 static const struct file_operations mlx5vf_save_fops = {
 	.owner = THIS_MODULE,
 	.read = mlx5vf_save_read,
@@ -223,8 +238,35 @@ static const struct file_operations mlx5vf_save_fops = {
 	.llseek = no_llseek,
 };
 
+static int mlx5vf_pci_save_device_inc_data(struct mlx5vf_pci_core_device *mvdev)
+{
+	struct mlx5_vf_migration_file *migf = mvdev->saving_migf;
+	size_t length;
+	int ret;
+
+	ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length,
+						    MLX5VF_QUERY_INC);
+	if (ret)
+		return ret;
+
+	if (migf->is_err)
+		return -ENODEV;
+
+	ret = mlx5vf_add_migration_pages(
+		migf, DIV_ROUND_UP_ULL(length, PAGE_SIZE), &migf->final_table);
+	if (ret) {
+		mlx5vf_mark_err(migf);
+		return ret;
+	}
+
+	ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, true, false);
+	if (ret)
+		mlx5vf_mark_err(migf);
+	return ret;
+}
+
 static struct mlx5_vf_migration_file *
-mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev)
+mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track)
 {
 	struct mlx5_vf_migration_file *migf;
 	size_t length;
@@ -249,17 +291,17 @@ mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev)
 	init_waitqueue_head(&migf->save_wait);
 	mlx5_cmd_init_async_ctx(mvdev->mdev, &migf->async_ctx);
 	INIT_WORK(&migf->async_data.work, mlx5vf_mig_file_cleanup_cb);
-	ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length);
+	ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, 0);
 	if (ret)
 		goto out_free;
 
 	ret = mlx5vf_add_migration_pages(
-		migf, DIV_ROUND_UP_ULL(length, PAGE_SIZE));
+		migf, DIV_ROUND_UP_ULL(length, PAGE_SIZE), &migf->table);
 	if (ret)
 		goto out_free;
 
 	migf->mvdev = mvdev;
-	ret = mlx5vf_cmd_save_vhca_state(mvdev, migf);
+	ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, false, track);
 	if (ret)
 		goto out_free;
 	return migf;
@@ -296,7 +338,7 @@ static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf,
 		done = mlx5vf_add_migration_pages(
 			migf,
 			DIV_ROUND_UP(requested_length - migf->allocated_length,
-				     PAGE_SIZE));
+				     PAGE_SIZE), &migf->table);
 		if (done)
 			goto out_unlock;
 	}
@@ -403,7 +445,8 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev,
 		return NULL;
 	}
 
-	if (cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P) {
+	if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P) ||
+	    (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) {
 		ret = mlx5vf_cmd_suspend_vhca(mvdev,
 			MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_INITIATOR);
 		if (ret)
@@ -411,7 +454,8 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev,
 		return NULL;
 	}
 
-	if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) {
+	if ((cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) ||
+	    (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_PRE_COPY)) {
 		ret = mlx5vf_cmd_resume_vhca(mvdev,
 			MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_INITIATOR);
 		if (ret)
@@ -422,7 +466,7 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev,
 	if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_STOP_COPY) {
 		struct mlx5_vf_migration_file *migf;
 
-		migf = mlx5vf_pci_save_device_data(mvdev);
+		migf = mlx5vf_pci_save_device_data(mvdev, false);
 		if (IS_ERR(migf))
 			return ERR_CAST(migf);
 		get_file(migf->filp);
@@ -430,7 +474,10 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev,
 		return migf->filp;
 	}
 
-	if ((cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP)) {
+	if ((cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP) ||
+	    (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_RUNNING) ||
+	    (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P &&
+	     new == VFIO_DEVICE_STATE_RUNNING_P2P)) {
 		mlx5vf_disable_fds(mvdev);
 		return NULL;
 	}
@@ -455,6 +502,28 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev,
 		return NULL;
 	}
 
+	if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_PRE_COPY) ||
+	    (cur == VFIO_DEVICE_STATE_RUNNING_P2P &&
+	     new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) {
+		struct mlx5_vf_migration_file *migf;
+
+		migf = mlx5vf_pci_save_device_data(mvdev, true);
+		if (IS_ERR(migf))
+			return ERR_CAST(migf);
+		get_file(migf->filp);
+		mvdev->saving_migf = migf;
+		return migf->filp;
+	}
+
+	if (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_STOP_COPY) {
+		ret = mlx5vf_cmd_suspend_vhca(mvdev,
+			MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_RESPONDER);
+		if (ret)
+			return ERR_PTR(ret);
+		ret = mlx5vf_pci_save_device_inc_data(mvdev);
+		return ret ? ERR_PTR(ret) : NULL;
+	}
+
 	/*
 	 * vfio_mig_get_next_state() does not use arcs other than the above
 	 */
@@ -523,7 +592,7 @@ static int mlx5vf_pci_get_data_size(struct vfio_device *vdev,
 
 	mutex_lock(&mvdev->state_mutex);
 	ret = mlx5vf_cmd_query_vhca_migration_state(mvdev,
-						    &state_size);
+						    &state_size, 0);
 	if (!ret)
 		*stop_copy_length = state_size;
 	mlx5vf_state_mutex_unlock(mvdev);
-- 
2.18.1
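
[Also not part of the patch: the wait_event() added in
mlx5vf_cmd_suspend_vhca() is the classic "wait for an in-flight async
completion before issuing a conflicting command" pattern. For readers less
familiar with kernel waitqueues, here is a rough userspace analogue in
pthreads; the names are hypothetical and this sketches the pattern, not the
kernel code.]

#include <pthread.h>
#include <stdbool.h>

/* Hypothetical stand-ins for migf->save_wait / migf->save_cb_active. */
struct migf_sync {
	pthread_mutex_t lock;
	pthread_cond_t save_wait;
	bool save_cb_active;
};

static struct migf_sync sync_state = {
	.lock = PTHREAD_MUTEX_INITIALIZER,
	.save_wait = PTHREAD_COND_INITIALIZER,
};

/* Async save completion path (kernel: mlx5vf_mig_file_cleanup_cb()). */
static void save_done(struct migf_sync *s)
{
	pthread_mutex_lock(&s->lock);
	s->save_cb_active = false;
	pthread_cond_broadcast(&s->save_wait);	/* wake_up(&migf->save_wait) */
	pthread_mutex_unlock(&s->lock);
}

/* Suspend path (kernel: mlx5vf_cmd_suspend_vhca()). */
static void suspend_vhca(struct migf_sync *s)
{
	pthread_mutex_lock(&s->lock);
	/* wait_event(migf->save_wait, !migf->save_cb_active) */
	while (s->save_cb_active)
		pthread_cond_wait(&s->save_wait, &s->lock);
	pthread_mutex_unlock(&s->lock);
	/* Only now issue SUSPEND_VHCA: no in-flight save can race with it. */
}

The condition is re-checked after every wakeup, mirroring how wait_event()
re-evaluates !migf->save_cb_active before returning.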