Re: [PATCH V1 vfio 6/7] vfio/virtio: Add PRE_COPY support for live migration

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Mon, 4 Nov 2024 12:21:30 +0200
Yishai Hadas <yishaih@xxxxxxxxxx> wrote:

> Add PRE_COPY support for live migration.
> 
> This functionality may reduce the downtime upon STOP_COPY as of letting
> the target machine to get some 'initial data' from the source once the
> machine is still in its RUNNING state and let it prepares itself
> pre-ahead to get the final STOP_COPY data.
> 
> As the Virtio specification does not support reading partial or
> incremental device contexts. This means that during the PRE_COPY state,
> the vfio-virtio driver reads the full device state.
> 
> As the device state can be changed and the benefit is highest when the
> pre copy data closely matches the final data we read it in a rate
> limiter mode and reporting no data available for some time interval
> after the previous call.
> 
> With PRE_COPY enabled, we observed a downtime reduction of approximately
> 70-75% in various scenarios compared to when PRE_COPY was disabled,
> while keeping the total migration time nearly the same.
> 
> Signed-off-by: Yishai Hadas <yishaih@xxxxxxxxxx>
> ---
>  drivers/vfio/pci/virtio/common.h  |   4 +
>  drivers/vfio/pci/virtio/migrate.c | 233 +++++++++++++++++++++++++++++-
>  2 files changed, 229 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/vfio/pci/virtio/common.h b/drivers/vfio/pci/virtio/common.h
> index 3bdfb3ea1174..5704603f0f9d 100644
> --- a/drivers/vfio/pci/virtio/common.h
> +++ b/drivers/vfio/pci/virtio/common.h
> @@ -10,6 +10,8 @@
>  
>  enum virtiovf_migf_state {
>  	VIRTIOVF_MIGF_STATE_ERROR = 1,
> +	VIRTIOVF_MIGF_STATE_PRECOPY = 2,
> +	VIRTIOVF_MIGF_STATE_COMPLETE = 3,
>  };
>  
>  enum virtiovf_load_state {
> @@ -57,6 +59,8 @@ struct virtiovf_migration_file {
>  	/* synchronize access to the file state */
>  	struct mutex lock;
>  	loff_t max_pos;
> +	u64 pre_copy_initial_bytes;
> +	struct ratelimit_state pre_copy_rl_state;
>  	u64 record_size;
>  	u32 record_tag;
>  	u8 has_obj_id:1;
> diff --git a/drivers/vfio/pci/virtio/migrate.c b/drivers/vfio/pci/virtio/migrate.c
> index 2a9614c2ef07..cdb252f6fd80 100644
> --- a/drivers/vfio/pci/virtio/migrate.c
> +++ b/drivers/vfio/pci/virtio/migrate.c
...
> @@ -379,9 +432,104 @@ static ssize_t virtiovf_save_read(struct file *filp, char __user *buf, size_t le
>  	return done;
>  }
>  
> +static long virtiovf_precopy_ioctl(struct file *filp, unsigned int cmd,
> +				   unsigned long arg)
> +{
> +	struct virtiovf_migration_file *migf = filp->private_data;
> +	struct virtiovf_pci_core_device *virtvdev = migf->virtvdev;
> +	struct vfio_precopy_info info = {};
> +	loff_t *pos = &filp->f_pos;
> +	bool end_of_data = false;
> +	unsigned long minsz;
> +	u32 ctx_size;
> +	int ret;
> +
> +	if (cmd != VFIO_MIG_GET_PRECOPY_INFO)
> +		return -ENOTTY;
> +
> +	minsz = offsetofend(struct vfio_precopy_info, dirty_bytes);
> +	if (copy_from_user(&info, (void __user *)arg, minsz))
> +		return -EFAULT;
> +
> +	if (info.argsz < minsz)
> +		return -EINVAL;
> +
> +	mutex_lock(&virtvdev->state_mutex);
> +	if (virtvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY &&
> +	    virtvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY_P2P) {
> +		ret = -EINVAL;
> +		goto err_state_unlock;
> +	}
> +
> +	/*
> +	 * The virtio specification does not include a PRE_COPY concept.
> +	 * Since we can expect the data to remain the same for a certain period,
> +	 * we use a rate limiter mechanism before making a call to the device.
> +	 */
> +	if (!__ratelimit(&migf->pre_copy_rl_state)) {
> +		/* Reporting no data available */
> +		ret = 0;
> +		goto done;

@ret is not used by the done: goto target.  Don't we need to zero dirty
bytes, or account for initial bytes not being fully read yet?

> +	}
> +
> +	ret = virtio_pci_admin_dev_parts_metadata_get(virtvdev->core_device.pdev,
> +				VIRTIO_RESOURCE_OBJ_DEV_PARTS, migf->obj_id,
> +				VIRTIO_ADMIN_CMD_DEV_PARTS_METADATA_TYPE_SIZE,
> +				&ctx_size);
> +	if (ret)
> +		goto err_state_unlock;
> +
> +	mutex_lock(&migf->lock);
> +	if (migf->state == VIRTIOVF_MIGF_STATE_ERROR) {
> +		ret = -ENODEV;
> +		goto err_migf_unlock;
> +	}
> +
> +	if (migf->pre_copy_initial_bytes > *pos) {
> +		info.initial_bytes = migf->pre_copy_initial_bytes - *pos;
> +	} else {
> +		info.dirty_bytes = migf->max_pos - *pos;
> +		if (!info.dirty_bytes)
> +			end_of_data = true;
> +		info.dirty_bytes += ctx_size;
> +	}
> +
> +	if (!end_of_data || !ctx_size) {
> +		mutex_unlock(&migf->lock);
> +		goto done;
> +	}
> +
> +	mutex_unlock(&migf->lock);
> +	/*
> +	 * We finished transferring the current state and the device has a
> +	 * dirty state, read a new state.
> +	 */
> +	ret = virtiovf_read_device_context_chunk(migf, ctx_size);
> +	if (ret)
> +		/*
> +		 * The machine is running, and context size could be grow, so no reason to mark
> +		 * the device state as VIRTIOVF_MIGF_STATE_ERROR.
> +		 */
> +		goto err_state_unlock;
> +
> +done:
> +	virtiovf_state_mutex_unlock(virtvdev);
> +	if (copy_to_user((void __user *)arg, &info, minsz))
> +		return -EFAULT;
> +	return 0;
> +
> +err_migf_unlock:
> +	mutex_unlock(&migf->lock);
> +err_state_unlock:
> +	virtiovf_state_mutex_unlock(virtvdev);
> +	return ret;
> +}
> +
...
> @@ -536,6 +719,17 @@ virtiovf_pci_save_device_data(struct virtiovf_pci_core_device *virtvdev)
>  	if (ret)
>  		goto out_clean;
>  
> +	if (pre_copy) {
> +		migf->pre_copy_initial_bytes = migf->max_pos;
> +		ratelimit_state_init(&migf->pre_copy_rl_state, 1 * HZ, 1);

A comment describing the choice of this heuristic would be useful for
future maintenance, even if the comment is "Arbitrarily rate limit
pre-copy to 1s intervals."  Thanks,

Alex

> +		/* Prevent any rate messages upon its usage */
> +		ratelimit_set_flags(&migf->pre_copy_rl_state,
> +				    RATELIMIT_MSG_ON_RELEASE);
> +		migf->state = VIRTIOVF_MIGF_STATE_PRECOPY;
> +	} else {
> +		migf->state = VIRTIOVF_MIGF_STATE_COMPLETE;
> +	}
> +
>  	return migf;
>  
>  out_clean:





[Index of Archives]     [KVM Development]     [Libvirt Development]     [Libvirt Users]     [CentOS Virtualization]     [Netdev]     [Ethernet Bridging]     [Linux Wireless]     [Kernel Newbies]     [Security]     [Linux for Hams]     [Netfilter]     [Bugtraq]     [Yosemite Forum]     [MIPS Linux]     [ARM Linux]     [Linux RAID]     [Linux Admin]     [Samba]

  Powered by Linux