On Mon, 4 Nov 2024 12:21:30 +0200 Yishai Hadas <yishaih@xxxxxxxxxx> wrote: > Add PRE_COPY support for live migration. > > This functionality may reduce the downtime upon STOP_COPY as of letting > the target machine to get some 'initial data' from the source once the > machine is still in its RUNNING state and let it prepares itself > pre-ahead to get the final STOP_COPY data. > > As the Virtio specification does not support reading partial or > incremental device contexts. This means that during the PRE_COPY state, > the vfio-virtio driver reads the full device state. > > As the device state can be changed and the benefit is highest when the > pre copy data closely matches the final data we read it in a rate > limiter mode and reporting no data available for some time interval > after the previous call. > > With PRE_COPY enabled, we observed a downtime reduction of approximately > 70-75% in various scenarios compared to when PRE_COPY was disabled, > while keeping the total migration time nearly the same. > > Signed-off-by: Yishai Hadas <yishaih@xxxxxxxxxx> > --- > drivers/vfio/pci/virtio/common.h | 4 + > drivers/vfio/pci/virtio/migrate.c | 233 +++++++++++++++++++++++++++++- > 2 files changed, 229 insertions(+), 8 deletions(-) > > diff --git a/drivers/vfio/pci/virtio/common.h b/drivers/vfio/pci/virtio/common.h > index 3bdfb3ea1174..5704603f0f9d 100644 > --- a/drivers/vfio/pci/virtio/common.h > +++ b/drivers/vfio/pci/virtio/common.h > @@ -10,6 +10,8 @@ > > enum virtiovf_migf_state { > VIRTIOVF_MIGF_STATE_ERROR = 1, > + VIRTIOVF_MIGF_STATE_PRECOPY = 2, > + VIRTIOVF_MIGF_STATE_COMPLETE = 3, > }; > > enum virtiovf_load_state { > @@ -57,6 +59,8 @@ struct virtiovf_migration_file { > /* synchronize access to the file state */ > struct mutex lock; > loff_t max_pos; > + u64 pre_copy_initial_bytes; > + struct ratelimit_state pre_copy_rl_state; > u64 record_size; > u32 record_tag; > u8 has_obj_id:1; > diff --git a/drivers/vfio/pci/virtio/migrate.c b/drivers/vfio/pci/virtio/migrate.c > index 2a9614c2ef07..cdb252f6fd80 100644 > --- a/drivers/vfio/pci/virtio/migrate.c > +++ b/drivers/vfio/pci/virtio/migrate.c ... > @@ -379,9 +432,104 @@ static ssize_t virtiovf_save_read(struct file *filp, char __user *buf, size_t le > return done; > } > > +static long virtiovf_precopy_ioctl(struct file *filp, unsigned int cmd, > + unsigned long arg) > +{ > + struct virtiovf_migration_file *migf = filp->private_data; > + struct virtiovf_pci_core_device *virtvdev = migf->virtvdev; > + struct vfio_precopy_info info = {}; > + loff_t *pos = &filp->f_pos; > + bool end_of_data = false; > + unsigned long minsz; > + u32 ctx_size; > + int ret; > + > + if (cmd != VFIO_MIG_GET_PRECOPY_INFO) > + return -ENOTTY; > + > + minsz = offsetofend(struct vfio_precopy_info, dirty_bytes); > + if (copy_from_user(&info, (void __user *)arg, minsz)) > + return -EFAULT; > + > + if (info.argsz < minsz) > + return -EINVAL; > + > + mutex_lock(&virtvdev->state_mutex); > + if (virtvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY && > + virtvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY_P2P) { > + ret = -EINVAL; > + goto err_state_unlock; > + } > + > + /* > + * The virtio specification does not include a PRE_COPY concept. > + * Since we can expect the data to remain the same for a certain period, > + * we use a rate limiter mechanism before making a call to the device. > + */ > + if (!__ratelimit(&migf->pre_copy_rl_state)) { > + /* Reporting no data available */ > + ret = 0; > + goto done; @ret is not used by the done: goto target. Don't we need to zero dirty bytes, or account for initial bytes not being fully read yet? > + } > + > + ret = virtio_pci_admin_dev_parts_metadata_get(virtvdev->core_device.pdev, > + VIRTIO_RESOURCE_OBJ_DEV_PARTS, migf->obj_id, > + VIRTIO_ADMIN_CMD_DEV_PARTS_METADATA_TYPE_SIZE, > + &ctx_size); > + if (ret) > + goto err_state_unlock; > + > + mutex_lock(&migf->lock); > + if (migf->state == VIRTIOVF_MIGF_STATE_ERROR) { > + ret = -ENODEV; > + goto err_migf_unlock; > + } > + > + if (migf->pre_copy_initial_bytes > *pos) { > + info.initial_bytes = migf->pre_copy_initial_bytes - *pos; > + } else { > + info.dirty_bytes = migf->max_pos - *pos; > + if (!info.dirty_bytes) > + end_of_data = true; > + info.dirty_bytes += ctx_size; > + } > + > + if (!end_of_data || !ctx_size) { > + mutex_unlock(&migf->lock); > + goto done; > + } > + > + mutex_unlock(&migf->lock); > + /* > + * We finished transferring the current state and the device has a > + * dirty state, read a new state. > + */ > + ret = virtiovf_read_device_context_chunk(migf, ctx_size); > + if (ret) > + /* > + * The machine is running, and context size could be grow, so no reason to mark > + * the device state as VIRTIOVF_MIGF_STATE_ERROR. > + */ > + goto err_state_unlock; > + > +done: > + virtiovf_state_mutex_unlock(virtvdev); > + if (copy_to_user((void __user *)arg, &info, minsz)) > + return -EFAULT; > + return 0; > + > +err_migf_unlock: > + mutex_unlock(&migf->lock); > +err_state_unlock: > + virtiovf_state_mutex_unlock(virtvdev); > + return ret; > +} > + ... > @@ -536,6 +719,17 @@ virtiovf_pci_save_device_data(struct virtiovf_pci_core_device *virtvdev) > if (ret) > goto out_clean; > > + if (pre_copy) { > + migf->pre_copy_initial_bytes = migf->max_pos; > + ratelimit_state_init(&migf->pre_copy_rl_state, 1 * HZ, 1); A comment describing the choice of this heuristic would be useful for future maintenance, even if the comment is "Arbitrarily rate limit pre-copy to 1s intervals." Thanks, Alex > + /* Prevent any rate messages upon its usage */ > + ratelimit_set_flags(&migf->pre_copy_rl_state, > + RATELIMIT_MSG_ON_RELEASE); > + migf->state = VIRTIOVF_MIGF_STATE_PRECOPY; > + } else { > + migf->state = VIRTIOVF_MIGF_STATE_COMPLETE; > + } > + > return migf; > > out_clean: