Add PRE_COPY support for live migration.

This functionality may reduce the downtime upon STOP_COPY by letting
the target machine obtain some 'initial data' from the source while the
machine is still in its RUNNING state, and prepare itself ahead of time
to receive the final STOP_COPY data.

Signed-off-by: Yishai Hadas <yishaih@xxxxxxxxxx>
---
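Note for reviewers: below is a rough userspace sketch of how the new
PRE_COPY arcs might be driven through the existing VFIO uAPI
(VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE plus the VFIO_MIG_GET_PRECOPY_INFO
ioctl on the data fd). It is not part of the patch; mig_set_state() and
precopy_save() are hypothetical helpers, and error cleanup is elided.

#include <errno.h>
#include <stdint.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

/* Hypothetical helper: request a migration state transition and return
 * any new data fd (the kernel reports -1 when the arc creates none). */
static int mig_set_state(int device_fd, uint32_t state, int *data_fd)
{
	uint8_t buf[sizeof(struct vfio_device_feature) +
		    sizeof(struct vfio_device_feature_mig_state)] = {0};
	struct vfio_device_feature *feature = (void *)buf;
	struct vfio_device_feature_mig_state *mig = (void *)feature->data;

	feature->argsz = sizeof(buf);
	feature->flags = VFIO_DEVICE_FEATURE_SET |
			 VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE;
	mig->device_state = state;
	if (ioctl(device_fd, VFIO_DEVICE_FEATURE, feature))
		return -errno;
	if (data_fd)
		*data_fd = mig->data_fd;
	return 0;
}

/* Hypothetical flow: stream pre-copy data while the VM still runs,
 * then stop the device and drain the final state. */
static int precopy_save(int device_fd)
{
	struct vfio_precopy_info info = { .argsz = sizeof(info) };
	char chunk[4096];
	ssize_t n;
	int data_fd, ret;

	/* RUNNING -> PRE_COPY opens the save data fd */
	ret = mig_set_state(device_fd, VFIO_DEVICE_STATE_PRE_COPY, &data_fd);
	if (ret)
		return ret;

	/* initial_bytes/dirty_bytes hint how much is readable right now */
	if (ioctl(data_fd, VFIO_MIG_GET_PRECOPY_INFO, &info))
		return -errno;

	/* -ENOMSG from read() means "no more data for now", not an error */
	while ((n = read(data_fd, chunk, sizeof(chunk))) != 0) {
		if (n < 0) {
			if (errno == ENOMSG)
				break;
			return -errno;
		}
		/* ... forward chunk to the migration destination ... */
	}

	/* PRE_COPY -> STOP_COPY reuses the same data fd; read it to EOF */
	ret = mig_set_state(device_fd, VFIO_DEVICE_STATE_STOP_COPY, NULL);
	if (ret)
		return ret;
	while ((n = read(data_fd, chunk, sizeof(chunk))) > 0)
		; /* ... forward the final chunks ... */
	return n < 0 ? -errno : 0;
}

As implemented below, VFIO_MIG_GET_PRECOPY_INFO reports initial_bytes as
what remains of the snapshot taken at the RUNNING -> PRE_COPY arc, and
dirty_bytes as the rest of the stream plus the device's current context
size.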
 drivers/vfio/pci/virtio/common.h  |   4 +
 drivers/vfio/pci/virtio/migrate.c | 234 +++++++++++++++++++++++++++++-
 2 files changed, 231 insertions(+), 7 deletions(-)

diff --git a/drivers/vfio/pci/virtio/common.h b/drivers/vfio/pci/virtio/common.h
index 3bdfb3ea1174..37796e1d70bc 100644
--- a/drivers/vfio/pci/virtio/common.h
+++ b/drivers/vfio/pci/virtio/common.h
@@ -10,6 +10,8 @@
 
 enum virtiovf_migf_state {
 	VIRTIOVF_MIGF_STATE_ERROR = 1,
+	VIRTIOVF_MIGF_STATE_PRECOPY = 2,
+	VIRTIOVF_MIGF_STATE_COMPLETE = 3,
 };
 
 enum virtiovf_load_state {
@@ -57,6 +59,7 @@ struct virtiovf_migration_file {
 	/* synchronize access to the file state */
 	struct mutex lock;
 	loff_t max_pos;
+	u64 pre_copy_initial_bytes;
 	u64 record_size;
 	u32 record_tag;
 	u8 has_obj_id:1;
@@ -90,6 +93,7 @@ struct virtiovf_pci_core_device {
 	/* protect migration state */
 	struct mutex state_mutex;
 	enum vfio_device_mig_state mig_state;
+	u16 num_pre_copy_calls;
 	/* protect the reset_done flow */
 	spinlock_t reset_lock;
 	struct virtiovf_migration_file *resuming_migf;
diff --git a/drivers/vfio/pci/virtio/migrate.c b/drivers/vfio/pci/virtio/migrate.c
index 2a9614c2ef07..5ffcff3425c6 100644
--- a/drivers/vfio/pci/virtio/migrate.c
+++ b/drivers/vfio/pci/virtio/migrate.c
@@ -26,6 +26,12 @@
 /* Initial target buffer size */
 #define VIRTIOVF_TARGET_INITIAL_BUF_SIZE SZ_1M
 
+#define VIRTIOVF_MAX_PRE_COPY_CALLS 128
+
+static int
+virtiovf_read_device_context_chunk(struct virtiovf_migration_file *migf,
+				   u32 ctx_size);
+
 static struct page *
 virtiovf_get_migration_page(struct virtiovf_data_buffer *buf,
 			    unsigned long offset)
@@ -155,6 +161,41 @@ virtiovf_pci_free_obj_id(struct virtiovf_pci_core_device *virtvdev, u32 obj_id)
 			       VIRTIO_RESOURCE_OBJ_DEV_PARTS, obj_id);
 }
 
+static struct virtiovf_data_buffer *
+virtiovf_get_data_buffer(struct virtiovf_migration_file *migf, size_t length)
+{
+	struct virtiovf_data_buffer *buf, *temp_buf;
+	struct list_head free_list;
+
+	INIT_LIST_HEAD(&free_list);
+
+	spin_lock_irq(&migf->list_lock);
+	list_for_each_entry_safe(buf, temp_buf, &migf->avail_list, buf_elm) {
+		list_del_init(&buf->buf_elm);
+		if (buf->allocated_length >= length) {
+			spin_unlock_irq(&migf->list_lock);
+			goto found;
+		}
+		/*
+		 * Prevent holding redundant buffers. Put them on a free
+		 * list and free them at the end, outside the spin lock
+		 * (&migf->list_lock), to minimize the time it is held.
+		 */
+		list_add(&buf->buf_elm, &free_list);
+	}
+	spin_unlock_irq(&migf->list_lock);
+	buf = virtiovf_alloc_data_buffer(migf, length);
+
+found:
+	while ((temp_buf = list_first_entry_or_null(&free_list,
+				struct virtiovf_data_buffer, buf_elm))) {
+		list_del(&temp_buf->buf_elm);
+		virtiovf_free_data_buffer(temp_buf);
+	}
+
+	return buf;
+}
+
 static void virtiovf_clean_migf_resources(struct virtiovf_migration_file *migf)
 {
 	struct virtiovf_data_buffer *entry;
@@ -217,6 +258,7 @@ static void virtiovf_state_mutex_unlock(struct virtiovf_pci_core_device *virtvde
 		virtvdev->deferred_reset = false;
 		spin_unlock(&virtvdev->reset_lock);
 		virtvdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
+		virtvdev->num_pre_copy_calls = 0;
 		virtiovf_disable_fds(virtvdev);
 		goto again;
 	}
@@ -341,6 +383,7 @@ static ssize_t virtiovf_save_read(struct file *filp, char __user *buf, size_t le
 {
 	struct virtiovf_migration_file *migf = filp->private_data;
 	struct virtiovf_data_buffer *vhca_buf;
+	bool first_loop_call = true;
 	bool end_of_data;
 	ssize_t done = 0;
 
@@ -358,6 +401,19 @@ static ssize_t virtiovf_save_read(struct file *filp, char __user *buf, size_t le
 		ssize_t count;
 
 		vhca_buf = virtiovf_get_data_buff_from_pos(migf, *pos, &end_of_data);
+		if (first_loop_call) {
+			first_loop_call = false;
+			/* Temporary end of file as part of PRE_COPY */
+			if (end_of_data && migf->state == VIRTIOVF_MIGF_STATE_PRECOPY) {
+				done = -ENOMSG;
+				goto out_unlock;
+			}
+			if (end_of_data && migf->state != VIRTIOVF_MIGF_STATE_COMPLETE) {
+				done = -EINVAL;
+				goto out_unlock;
+			}
+		}
+
 		if (end_of_data)
 			goto out_unlock;
 
@@ -379,9 +435,103 @@ static ssize_t virtiovf_save_read(struct file *filp, char __user *buf, size_t le
 	return done;
 }
 
+static long virtiovf_precopy_ioctl(struct file *filp, unsigned int cmd,
+				   unsigned long arg)
+{
+	struct virtiovf_migration_file *migf = filp->private_data;
+	struct virtiovf_pci_core_device *virtvdev = migf->virtvdev;
+	struct vfio_precopy_info info = {};
+	loff_t *pos = &filp->f_pos;
+	bool end_of_data = false;
+	unsigned long minsz;
+	u32 ctx_size;
+	int ret;
+
+	if (cmd != VFIO_MIG_GET_PRECOPY_INFO)
+		return -ENOTTY;
+
+	minsz = offsetofend(struct vfio_precopy_info, dirty_bytes);
+	if (copy_from_user(&info, (void __user *)arg, minsz))
+		return -EFAULT;
+
+	if (info.argsz < minsz)
+		return -EINVAL;
+
+	mutex_lock(&virtvdev->state_mutex);
+	if (virtvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY &&
+	    virtvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY_P2P) {
+		ret = -EINVAL;
+		goto err_state_unlock;
+	}
+
+	virtvdev->num_pre_copy_calls++;
+	/*
+	 * There is no PRE_COPY concept in the virtio spec; prevent an
+	 * unbounded number of calls for potentially the same data.
+	 */
+	if (virtvdev->num_pre_copy_calls > VIRTIOVF_MAX_PRE_COPY_CALLS) {
+		ret = 0;
+		goto done;
+	}
+
+	ret = virtio_pci_admin_dev_parts_metadata_get(virtvdev->core_device.pdev,
+				VIRTIO_RESOURCE_OBJ_DEV_PARTS, migf->obj_id,
+				VIRTIO_ADMIN_CMD_DEV_PARTS_METADATA_TYPE_SIZE,
+				&ctx_size);
+	if (ret)
+		goto err_state_unlock;
+
+	mutex_lock(&migf->lock);
+	if (migf->state == VIRTIOVF_MIGF_STATE_ERROR) {
+		ret = -ENODEV;
+		goto err_migf_unlock;
+	}
+
+	if (migf->pre_copy_initial_bytes > *pos) {
+		info.initial_bytes = migf->pre_copy_initial_bytes - *pos;
+	} else {
+		info.dirty_bytes = migf->max_pos - *pos;
+		if (!info.dirty_bytes)
+			end_of_data = true;
+		info.dirty_bytes += ctx_size;
+	}
+
+	if (!end_of_data || !ctx_size) {
+		mutex_unlock(&migf->lock);
+		goto done;
+	}
+
+	mutex_unlock(&migf->lock);
+	/*
+	 * We finished transferring the current state and the device has a
+	 * dirty state; read a new state.
+	 */
+	ret = virtiovf_read_device_context_chunk(migf, ctx_size);
+	if (ret)
+		/*
+		 * The machine is running, and the context size could grow, so
+		 * there is no reason to mark the device state as
+		 * VIRTIOVF_MIGF_STATE_ERROR.
+		 */
+		goto err_state_unlock;
+
+done:
+	virtiovf_state_mutex_unlock(virtvdev);
+	if (copy_to_user((void __user *)arg, &info, minsz))
+		return -EFAULT;
+	return 0;
+
+err_migf_unlock:
+	mutex_unlock(&migf->lock);
+err_state_unlock:
+	virtiovf_state_mutex_unlock(virtvdev);
+	return ret;
+}
+
 static const struct file_operations virtiovf_save_fops = {
 	.owner = THIS_MODULE,
 	.read = virtiovf_save_read,
+	.unlocked_ioctl = virtiovf_precopy_ioctl,
+	.compat_ioctl = compat_ptr_ioctl,
 	.release = virtiovf_release_file,
 };
 
@@ -425,7 +575,7 @@ virtiovf_read_device_context_chunk(struct virtiovf_migration_file *migf,
 	int nent;
 	int ret;
 
-	buf = virtiovf_alloc_data_buffer(migf, ctx_size);
+	buf = virtiovf_get_data_buffer(migf, ctx_size);
 	if (IS_ERR(buf))
 		return PTR_ERR(buf);
 
@@ -460,7 +610,7 @@ virtiovf_read_device_context_chunk(struct virtiovf_migration_file *migf,
 		goto out;
 
 	buf->length = res_size;
-	header_buf = virtiovf_alloc_data_buffer(migf,
+	header_buf = virtiovf_get_data_buffer(migf,
 				sizeof(struct virtiovf_migration_header));
 	if (IS_ERR(header_buf)) {
 		ret = PTR_ERR(header_buf);
@@ -485,8 +635,43 @@ virtiovf_read_device_context_chunk(struct virtiovf_migration_file *migf,
 	return ret;
 }
 
+static int
+virtiovf_pci_save_device_final_data(struct virtiovf_pci_core_device *virtvdev)
+{
+	struct virtiovf_migration_file *migf = virtvdev->saving_migf;
+	u32 ctx_size;
+	int ret;
+
+	if (migf->state == VIRTIOVF_MIGF_STATE_ERROR)
+		return -ENODEV;
+
+	ret = virtio_pci_admin_dev_parts_metadata_get(virtvdev->core_device.pdev,
+				VIRTIO_RESOURCE_OBJ_DEV_PARTS, migf->obj_id,
+				VIRTIO_ADMIN_CMD_DEV_PARTS_METADATA_TYPE_SIZE,
+				&ctx_size);
+	if (ret)
+		goto err;
+
+	if (!ctx_size) {
+		ret = -EINVAL;
+		goto err;
+	}
+
+	ret = virtiovf_read_device_context_chunk(migf, ctx_size);
+	if (ret)
+		goto err;
+
+	migf->state = VIRTIOVF_MIGF_STATE_COMPLETE;
+	return 0;
+
+err:
+	migf->state = VIRTIOVF_MIGF_STATE_ERROR;
+	return ret;
+}
+
 static struct virtiovf_migration_file *
-virtiovf_pci_save_device_data(struct virtiovf_pci_core_device *virtvdev)
+virtiovf_pci_save_device_data(struct virtiovf_pci_core_device *virtvdev,
+			      bool pre_copy)
 {
 	struct virtiovf_migration_file *migf;
 	u32 ctx_size;
@@ -536,6 +721,13 @@ virtiovf_pci_save_device_data(struct virtiovf_pci_core_device *virtvdev)
 	if (ret)
 		goto out_clean;
 
+	if (pre_copy) {
+		migf->pre_copy_initial_bytes = migf->max_pos;
+		migf->state = VIRTIOVF_MIGF_STATE_PRECOPY;
+	} else {
+		migf->state = VIRTIOVF_MIGF_STATE_COMPLETE;
+	}
+
 	return migf;
 
 out_clean:
@@ -948,7 +1140,8 @@ virtiovf_pci_step_device_state_locked(struct virtiovf_pci_core_device *virtvdev,
 		return NULL;
 	}
 
-	if (cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P) {
+	if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P) ||
+	    (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) {
 		ret = virtio_pci_admin_mode_set(virtvdev->core_device.pdev,
 						BIT(VIRTIO_ADMIN_CMD_DEV_MODE_F_STOPPED));
 		if (ret)
@@ -956,7 +1149,8 @@ virtiovf_pci_step_device_state_locked(struct virtiovf_pci_core_device *virtvdev,
 		return NULL;
 	}
 
-	if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) {
+	if ((cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) ||
+	    (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_PRE_COPY)) {
 		ret = virtio_pci_admin_mode_set(virtvdev->core_device.pdev, 0);
 		if (ret)
 			return ERR_PTR(ret);
@@ -966,7 +1160,7 @@ virtiovf_pci_step_device_state_locked(struct virtiovf_pci_core_device *virtvdev,
 	if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_STOP_COPY) {
 		struct virtiovf_migration_file *migf;
 
-		migf = virtiovf_pci_save_device_data(virtvdev);
+		migf = virtiovf_pci_save_device_data(virtvdev, false);
 		if (IS_ERR(migf))
 			return ERR_CAST(migf);
 		get_file(migf->filp);
@@ -974,6 +1168,13 @@ virtiovf_pci_step_device_state_locked(struct virtiovf_pci_core_device *virtvdev,
 		return migf->filp;
 	}
 
+	if ((cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_RUNNING) ||
+	    (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_RUNNING_P2P)) {
+		virtvdev->num_pre_copy_calls = 0;
+		virtiovf_disable_fds(virtvdev);
+		return NULL;
+	}
+
 	if (cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP) {
 		virtiovf_disable_fds(virtvdev);
 		return NULL;
@@ -995,6 +1196,24 @@ virtiovf_pci_step_device_state_locked(struct virtiovf_pci_core_device *virtvdev,
 		return NULL;
 	}
 
+	if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_PRE_COPY) ||
+	    (cur == VFIO_DEVICE_STATE_RUNNING_P2P &&
+	     new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) {
+		struct virtiovf_migration_file *migf;
+
+		migf = virtiovf_pci_save_device_data(virtvdev, true);
+		if (IS_ERR(migf))
+			return ERR_CAST(migf);
+		get_file(migf->filp);
+		virtvdev->saving_migf = migf;
+		return migf->filp;
+	}
+
+	if (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_STOP_COPY) {
+		ret = virtiovf_pci_save_device_final_data(virtvdev);
+		return ret ? ERR_PTR(ret) : NULL;
+	}
+
 	/*
 	 * vfio_mig_get_next_state() does not use arcs other than the above
 	 */
@@ -1098,7 +1317,8 @@ void virtiovf_set_migratable(struct virtiovf_pci_core_device *virtvdev)
 	spin_lock_init(&virtvdev->reset_lock);
 	virtvdev->core_device.vdev.migration_flags =
 		VFIO_MIGRATION_STOP_COPY |
-		VFIO_MIGRATION_P2P;
+		VFIO_MIGRATION_P2P |
+		VFIO_MIGRATION_PRE_COPY;
 	virtvdev->core_device.vdev.mig_ops = &virtvdev_pci_mig_ops;
 }
 
-- 
2.27.0