From: Shay Drory <shayd@xxxxxxxxxx>

vfio precopy ioctl returns an estimation of data available for
transferring from the device.

Whenever a user is using VFIO_MIG_GET_PRECOPY_INFO, track the current
state of the device, and if needed, append the dirty data to the
transfer FD data. This is done by saving a middle state.

As mlx5 runs the SAVE command asynchronously, make sure to query for
incremental data only once there is no active save command.
Running both in parallel might end up with a failure in the incremental
query command on an un-tracked vhca.

Also, a middle state will be saved only after the previous state has
finished its SAVE command and has been fully transferred; this enables
re-use of the resources.

In order to map between FD position and the new saved state data, store
the current FD position.

Signed-off-by: Shay Drory <shayd@xxxxxxxxxx>
Signed-off-by: Yishai Hadas <yishaih@xxxxxxxxxx>
---
 drivers/vfio/pci/mlx5/cmd.c  |   9 +++
 drivers/vfio/pci/mlx5/cmd.h  |   1 +
 drivers/vfio/pci/mlx5/main.c | 138 +++++++++++++++++++++++++++++++++++
 3 files changed, 148 insertions(+)

diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c
index eb684455c2b2..2d2171191218 100644
--- a/drivers/vfio/pci/mlx5/cmd.c
+++ b/drivers/vfio/pci/mlx5/cmd.c
@@ -64,6 +64,15 @@ int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev,
 	if (mvdev->mdev_detach)
 		return -ENOTCONN;
 
+	/*
+	 * In case PRE_COPY is used, saving_migf is exposed while device is
+	 * running. Make sure to run only once there is no active save command.
+	 * Running both in parallel might end up with a failure in the
+	 * incremental query command on an un-tracked vhca.
+	 */
+	if (query_flags & MLX5VF_QUERY_INC)
+		wait_event(mvdev->saving_migf->save_wait,
+			   !mvdev->saving_migf->save_cb_active);
 	MLX5_SET(query_vhca_migration_state_in, in, opcode,
 		 MLX5_CMD_OP_QUERY_VHCA_MIGRATION_STATE);
 	MLX5_SET(query_vhca_migration_state_in, in, vhca_id, mvdev->vhca_id);
diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h
index c12fa81ba53f..07a2fc54c9d8 100644
--- a/drivers/vfio/pci/mlx5/cmd.h
+++ b/drivers/vfio/pci/mlx5/cmd.h
@@ -30,6 +30,7 @@ struct mlx5_vf_migration_file {
 	u8 save_cb_active:1;
 
 	struct sg_append_table table;
+	size_t table_start_pos;
 	size_t image_length;
 	size_t allocated_length;
 	/*
diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c
index 10e073c32ab1..266626066fed 100644
--- a/drivers/vfio/pci/mlx5/main.c
+++ b/drivers/vfio/pci/mlx5/main.c
@@ -107,6 +107,22 @@ static int mlx5vf_add_migration_pages(struct mlx5_vf_migration_file *migf,
 	return ret;
 }
 
+static void mlx5vf_prep_next_table(struct mlx5_vf_migration_file *migf)
+{
+	struct sg_page_iter sg_iter;
+
+	lockdep_assert_held(&migf->lock);
+	migf->table_start_pos += migf->image_length;
+	/* clear sgtable, all data has been transferred */
+	for_each_sgtable_page(&migf->table.sgt, &sg_iter, 0)
+		__free_page(sg_page_iter_page(&sg_iter));
+	sg_free_append_table(&migf->table);
+	memset(&migf->table, 0, sizeof(migf->table));
+	migf->image_length = 0;
+	migf->allocated_length = 0;
+	migf->last_offset_sg = NULL;
+}
+
 static void mlx5vf_disable_fd(struct mlx5_vf_migration_file *migf)
 {
 	struct sg_page_iter sg_iter;
@@ -120,6 +136,7 @@ static void mlx5vf_disable_fd(struct mlx5_vf_migration_file *migf)
 	migf->image_length = 0;
 	migf->allocated_length = 0;
 	migf->final_length = 0;
+	migf->table_start_pos = 0;
 	migf->filp->f_pos = 0;
 	for_each_sgtable_page(&migf->final_table.sgt, &sg_iter, 0)
 		__free_page(sg_page_iter_page(&sg_iter));
@@ -137,6 +154,13 @@ static int mlx5vf_release_file(struct inode *inode, struct file *filp)
 	return 0;
 }
 
+#define MIGF_TOTAL_DATA(migf) \
+	((migf)->table_start_pos + (migf)->image_length + (migf)->final_length)
+
+#define VFIO_MIG_STATE_PRE_COPY(mvdev) \
+	((mvdev)->mig_state == VFIO_DEVICE_STATE_PRE_COPY || \
+	 (mvdev)->mig_state == VFIO_DEVICE_STATE_PRE_COPY_P2P)
+
 static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len,
 			       loff_t *pos)
 {
@@ -230,10 +254,124 @@ static void mlx5vf_mark_err(struct mlx5_vf_migration_file *migf)
 	wake_up_interruptible(&migf->poll_wait);
 }
 
+/*
+ * VFIO_MIG_GET_PRECOPY_INFO handler of the saving FD. Reports the bytes left
+ * to read and, once the current state was fully read while in PRE_COPY, starts
+ * saving the device's incremental (dirty) state. Returns 0 or negative errno.
+ */
+static long mlx5vf_precopy_ioctl(struct file *filp, unsigned int cmd,
+				 unsigned long arg)
+{
+	struct mlx5_vf_migration_file *migf = filp->private_data;
+	struct mlx5vf_pci_core_device *mvdev = migf->mvdev;
+	bool first_state, state_finish_transfer;
+	struct vfio_precopy_info info;
+	loff_t *pos = &filp->f_pos;
+	unsigned long minsz;
+	size_t inc_length = 0;
+	int ret;
+
+	if (cmd != VFIO_MIG_GET_PRECOPY_INFO)
+		return -ENOTTY;
+
+	minsz = offsetofend(struct vfio_precopy_info, dirty_bytes);
+
+	if (copy_from_user(&info, (void __user *)arg, minsz))
+		return -EFAULT;
+
+	if (info.argsz < minsz)
+		return -EINVAL;
+
+	mutex_lock(&mvdev->state_mutex);
+	if (!VFIO_MIG_STATE_PRE_COPY(migf->mvdev)) {
+		ret = -EINVAL;
+		goto err_state_unlock;
+	}
+
+	/*
+	 * We can't issue a SAVE command when the device is suspended, so as
+	 * part of VFIO_DEVICE_STATE_PRE_COPY_P2P no reason to query for extra
+	 * bytes that can't be read; inc_length then keeps its initial 0.
+	 */
+	if (mvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY_P2P) {
+		/*
+		 * Once the query returns it's guaranteed that there is no
+		 * active SAVE command.
+		 * As so, the other code below is safe with the proper locks.
+		 */
+		ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &inc_length,
+							    MLX5VF_QUERY_INC);
+		if (ret)
+			goto err_state_unlock;
+	}
+
+	mutex_lock(&migf->lock);
+	if (*pos > MIGF_TOTAL_DATA(migf)) {
+		ret = -EINVAL;
+		goto err_migf_unlock;
+	}
+
+	if (migf->disabled || migf->is_err) {
+		ret = -ENODEV;
+		goto err_migf_unlock;
+	}
+
+	first_state = migf->table_start_pos == 0;
+	if (first_state) {
+		info.initial_bytes = MIGF_TOTAL_DATA(migf) - *pos;
+		info.dirty_bytes = 0;
+	} else {
+		info.initial_bytes = 0;
+		info.dirty_bytes = MIGF_TOTAL_DATA(migf) - *pos;
+	}
+	state_finish_transfer = *pos == MIGF_TOTAL_DATA(migf);
+	if (!(state_finish_transfer && inc_length &&
+	      mvdev->mig_state == VFIO_DEVICE_STATE_PRE_COPY)) {
+		mutex_unlock(&migf->lock);
+		goto done;
+	}
+
+	/*
+	 * We finished transferring the current state and the device has a
+	 * dirty state, save a new state to be ready for.
+	 */
+	mlx5vf_prep_next_table(migf);
+	ret = mlx5vf_add_migration_pages(migf,
+					 DIV_ROUND_UP_ULL(inc_length, PAGE_SIZE),
+					 &migf->table);
+	mutex_unlock(&migf->lock);
+	if (ret) {
+		mlx5vf_mark_err(migf);
+		goto err_state_unlock;
+	}
+
+	ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, true, true);
+	if (ret) {
+		mlx5vf_mark_err(migf);
+		goto err_state_unlock;
+	}
+
+	info.dirty_bytes += inc_length;
+
+done:
+	mlx5vf_state_mutex_unlock(mvdev);
+	if (copy_to_user((void __user *)arg, &info, minsz))
+		return -EFAULT;
+	return 0;
+
+err_migf_unlock:
+	mutex_unlock(&migf->lock);
+err_state_unlock:
+	mlx5vf_state_mutex_unlock(mvdev);
+	return ret;
+}
+
 static const struct file_operations mlx5vf_save_fops = {
 	.owner = THIS_MODULE,
 	.read = mlx5vf_save_read,
 	.poll = mlx5vf_save_poll,
+	.unlocked_ioctl = mlx5vf_precopy_ioctl,
+	.compat_ioctl = compat_ptr_ioctl,
 	.release = mlx5vf_release_file,
 	.llseek = no_llseek,
 };
-- 
2.18.1