Re: [RFC PATCH v4 17/17] KVM: PPC: Book3S HV: XIVE: introduce a 'release' device operation

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On 4/15/19 5:32 AM, David Gibson wrote:
> On Tue, Apr 09, 2019 at 04:13:47PM +0200, Cédric Le Goater wrote:
>> When the VM boots, the CAS negotiation process determines which
>> interrupt mode to use and invokes a machine reset. At that time, any
>> links to the previous KVM interrupt device should be 'destroyed'
>> before the new chosen one is created.
>>
>> To perform the necessary cleanups in KVM, we extend the KVM device
>> interface with a new 'release' operation which is called when the file
>> descriptor of the device is closed.
>>
>> Such operations are defined for the XICS-on-XIVE and the XIVE native
>> KVM devices. They clear the vCPU interrupt presenters that could be
>> attached and then destroy the device.
>>
>> Signed-off-by: Cédric Le Goater <clg@xxxxxxxx>
>> ---
>>  include/linux/kvm_host.h              |  1 +
>>  arch/powerpc/kvm/book3s_xive.c        | 50 +++++++++++++++++++++++++--
>>  arch/powerpc/kvm/book3s_xive_native.c | 23 ++++++++++++
>>  virt/kvm/kvm_main.c                   | 13 +++++++
>>  4 files changed, 85 insertions(+), 2 deletions(-)
>>
>> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
>> index 831d963451d8..3b444620d8fc 100644
>> --- a/include/linux/kvm_host.h
>> +++ b/include/linux/kvm_host.h
>> @@ -1246,6 +1246,7 @@ struct kvm_device_ops {
>>  	long (*ioctl)(struct kvm_device *dev, unsigned int ioctl,
>>  		      unsigned long arg);
>>  	int (*mmap)(struct kvm_device *dev, struct vm_area_struct *vma);
>> +	void (*release)(struct kvm_device *dev);
>>  };
>>  
>>  void kvm_device_get(struct kvm_device *dev);
>> diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c
>> index 4d4e1730de84..ba777db849d7 100644
>> --- a/arch/powerpc/kvm/book3s_xive.c
>> +++ b/arch/powerpc/kvm/book3s_xive.c
>> @@ -1100,11 +1100,19 @@ void kvmppc_xive_disable_vcpu_interrupts(struct kvm_vcpu *vcpu)
>>  void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu)
>>  {
>>  	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
>> -	struct kvmppc_xive *xive = xc->xive;
>> +	struct kvmppc_xive *xive;
>>  	int i;
>>  
>> +	if (!kvmppc_xics_enabled(vcpu))
>> +		return;
>> +
>> +	if (!xc)
>> +		return;
>> +
>>  	pr_devel("cleanup_vcpu(cpu=%d)\n", xc->server_num);
>>  
>> +	xive = xc->xive;
>> +
>>  	/* Ensure no interrupt is still routed to that VP */
>>  	xc->valid = false;
>>  	kvmppc_xive_disable_vcpu_interrupts(vcpu);
>> @@ -1141,6 +1149,10 @@ void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu)
>>  	}
>>  	/* Free the VP */
>>  	kfree(xc);
>> +
>> +	/* Cleanup the vcpu */
>> +	vcpu->arch.irq_type = KVMPPC_IRQ_DEFAULT;
>> +	vcpu->arch.xive_vcpu = NULL;
>>  }
>>  
>>  int kvmppc_xive_connect_vcpu(struct kvm_device *dev,
>> @@ -1158,7 +1170,7 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev,
>>  	}
>>  	if (xive->kvm != vcpu->kvm)
>>  		return -EPERM;
>> -	if (vcpu->arch.irq_type)
>> +	if (vcpu->arch.irq_type != KVMPPC_IRQ_DEFAULT)
>>  		return -EBUSY;
>>  	if (kvmppc_xive_find_server(vcpu->kvm, cpu)) {
>>  		pr_devel("Duplicate !\n");
>> @@ -1855,6 +1867,39 @@ static void kvmppc_xive_free(struct kvm_device *dev)
>>  	kfree(dev);
>>  }
>>  
>> +static void kvmppc_xive_release(struct kvm_device *dev)
>> +{
>> +	struct kvmppc_xive *xive = dev->private;
>> +	struct kvm *kvm = xive->kvm;
>> +	struct kvm_vcpu *vcpu;
>> +	int i;
>> +
>> +	pr_devel("Releasing xive device\n");
>> +
>> +	/*
>> +	 * When releasing the KVM device fd, the vCPUs can still be
>> +	 * running and we should clean up the vCPU interrupt
>> +	 * presenters first.
>> +	 */
>> +	if (atomic_read(&kvm->online_vcpus) != 0) {
> 
> What prevents online_vcpus from becoming non-zero after this test, but
> before the kvmppc_xive_free()?

I am not sure what you mean. kvmppc_xive_free() is gone with this patch. 
It has been replaced by kvmppc_xive_release().

> Is the test actually necessary?  The operations below should be safe
> even if there are no online cpus, yes?

ah, yes. kvm_for_each_vcpu() should be safe to use anyhow.

>> +		/*
>> +		 * call kick_all_cpus_sync() to ensure that all CPUs
>> +		 * have executed any pending interrupts
>> +		 */
>> +		if (is_kvmppc_hv_enabled(kvm))
>> +			kick_all_cpus_sync();>> +		/*
>> +		 * TODO: There is still a race window with the early
>> +		 * checks in kvmppc_native_connect_vcpu()
>> +		 */
> 
> That's... not reassuring.  What are the consequences of that race, 

a bogus ->xive pointer under the XIVE vCPU

> and what do you plan to do about it?

I don't think this is true any more with the release operation
which will be called by the last user of the device file. 

Anyhow, xc->xive does not seem very useful (just like xc->valid) 
We should try to use only vcpu->kvm->arch.xive instead.

I will propose some preliminary cleanups before introducing the
new release operation.

>> +		kvm_for_each_vcpu(i, vcpu, kvm)
>> +			kvmppc_xive_cleanup_vcpu(vcpu);
>> +	}
>> +
>> +	kvmppc_xive_free(dev);
>> +}
>> +
>>  struct kvmppc_xive *kvmppc_xive_get_device(struct kvm *kvm, u32 type)
>>  {
>>  	struct kvmppc_xive *xive;
>> @@ -2043,6 +2088,7 @@ struct kvm_device_ops kvm_xive_ops = {
>>  	.name = "kvm-xive",
>>  	.create = kvmppc_xive_create,
>>  	.init = kvmppc_xive_init,
>> +	.release = kvmppc_xive_release,
>>  	.destroy = kvmppc_xive_free,
>>  	.set_attr = xive_set_attr,
>>  	.get_attr = xive_get_attr,
>> diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c
>> index 092db0efe628..629da7bf2a89 100644
>> --- a/arch/powerpc/kvm/book3s_xive_native.c
>> +++ b/arch/powerpc/kvm/book3s_xive_native.c
>> @@ -996,6 +996,28 @@ static void kvmppc_xive_native_free(struct kvm_device *dev)
>>  	kfree(dev);
>>  }
>>  
>> +static void kvmppc_xive_native_release(struct kvm_device *dev)
>> +{
>> +	struct kvmppc_xive *xive = dev->private;
>> +	struct kvm *kvm = xive->kvm;
>> +	struct kvm_vcpu *vcpu;
>> +	int i;
>> +
>> +	pr_devel("Releasing xive native device\n");
>> +
>> +	/*
>> +	 * When releasing the KVM device fd, the vCPUs can still be
>> +	 * running and we should clean up the vCPU interrupt
>> +	 * presenters first.
>> +	 */
>> +	if (atomic_read(&kvm->online_vcpus) != 0) {
> 
> Likewise here.
> 
>> +		kvm_for_each_vcpu(i, vcpu, kvm)
>> +			kvmppc_xive_native_cleanup_vcpu(vcpu);
>> +	}
>> +
>> +	kvmppc_xive_native_free(dev);
>> +}
>> +
>>  static int kvmppc_xive_native_create(struct kvm_device *dev, u32 type)
>>  {
>>  	struct kvmppc_xive *xive;
>> @@ -1187,6 +1209,7 @@ struct kvm_device_ops kvm_xive_native_ops = {
>>  	.name = "kvm-xive-native",
>>  	.create = kvmppc_xive_native_create,
>>  	.init = kvmppc_xive_native_init,
>> +	.release = kvmppc_xive_native_release,
>>  	.destroy = kvmppc_xive_native_free,
>>  	.set_attr = kvmppc_xive_native_set_attr,
>>  	.get_attr = kvmppc_xive_native_get_attr,
>> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
>> index ea2018ae1cd7..ea2619d5ca98 100644
>> --- a/virt/kvm/kvm_main.c
>> +++ b/virt/kvm/kvm_main.c
>> @@ -2938,6 +2938,19 @@ static int kvm_device_release(struct inode *inode, struct file *filp)
>>  	struct kvm_device *dev = filp->private_data;
>>  	struct kvm *kvm = dev->kvm;
>>  
>> +	if (!dev)
>> +		return -ENODEV;
>> +
>> +	if (dev->kvm != kvm)
>> +		return -EPERM;
>> +
>> +	if (dev->ops->release) {
>> +		mutex_lock(&kvm->lock);
>> +		list_del(&dev->vm_node);
>> +		dev->ops->release(dev);
>> +		mutex_unlock(&kvm->lock);
>> +	}
>> +
> 
> Wasn't there a big comment that explained that release replaced
> destroy somewhere?

Yes. I did add a comment in the "V5 errata" series. 

I should be sending a v6 this week, to clarify all these attempts 
to solve the device switching.

Thanks,

C. 

> 
>>  	kvm_put_kvm(kvm);
>>  	return 0;
>>  }
> 




[Index of Archives]     [KVM Development]     [KVM ARM]     [KVM ia64]     [Linux Virtualization]     [Linux USB Devel]     [Linux Video]     [Linux Audio Users]     [Linux Kernel]     [Linux SCSI]     [Big List of Linux Books]

  Powered by Linux