On 4/15/19 5:32 AM, David Gibson wrote: > On Tue, Apr 09, 2019 at 04:13:47PM +0200, Cédric Le Goater wrote: >> When the VM boots, the CAS negotiation process determines which >> interrupt mode to use and invokes a machine reset. At that time, any >> links to the previous KVM interrupt device should be 'destroyed' >> before the new chosen one is created. >> >> To perform the necessary cleanups in KVM, we extend the KVM device >> interface with a new 'release' operation which is called when the file >> descriptor of the device is closed. >> >> Such operations are defined for the XICS-on-XIVE and the XIVE native >> KVM devices. They clear the vCPU interrupt presenters that could be >> attached and then destroy the device. >> >> Signed-off-by: Cédric Le Goater <clg@xxxxxxxx> >> --- >> include/linux/kvm_host.h | 1 + >> arch/powerpc/kvm/book3s_xive.c | 50 +++++++++++++++++++++++++-- >> arch/powerpc/kvm/book3s_xive_native.c | 23 ++++++++++++ >> virt/kvm/kvm_main.c | 13 +++++++ >> 4 files changed, 85 insertions(+), 2 deletions(-) >> >> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h >> index 831d963451d8..3b444620d8fc 100644 >> --- a/include/linux/kvm_host.h >> +++ b/include/linux/kvm_host.h >> @@ -1246,6 +1246,7 @@ struct kvm_device_ops { >> long (*ioctl)(struct kvm_device *dev, unsigned int ioctl, >> unsigned long arg); >> int (*mmap)(struct kvm_device *dev, struct vm_area_struct *vma); >> + void (*release)(struct kvm_device *dev); >> }; >> >> void kvm_device_get(struct kvm_device *dev); >> diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c >> index 4d4e1730de84..ba777db849d7 100644 >> --- a/arch/powerpc/kvm/book3s_xive.c >> +++ b/arch/powerpc/kvm/book3s_xive.c >> @@ -1100,11 +1100,19 @@ void kvmppc_xive_disable_vcpu_interrupts(struct kvm_vcpu *vcpu) >> void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu) >> { >> struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; >> - struct kvmppc_xive *xive = xc->xive; >> + struct kvmppc_xive *xive; >> int i; >> >> + if (!kvmppc_xics_enabled(vcpu)) >> + return; >> + >> + if (!xc) >> + return; >> + >> pr_devel("cleanup_vcpu(cpu=%d)\n", xc->server_num); >> >> + xive = xc->xive; >> + >> /* Ensure no interrupt is still routed to that VP */ >> xc->valid = false; >> kvmppc_xive_disable_vcpu_interrupts(vcpu); >> @@ -1141,6 +1149,10 @@ void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu) >> } >> /* Free the VP */ >> kfree(xc); >> + >> + /* Cleanup the vcpu */ >> + vcpu->arch.irq_type = KVMPPC_IRQ_DEFAULT; >> + vcpu->arch.xive_vcpu = NULL; >> } >> >> int kvmppc_xive_connect_vcpu(struct kvm_device *dev, >> @@ -1158,7 +1170,7 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev, >> } >> if (xive->kvm != vcpu->kvm) >> return -EPERM; >> - if (vcpu->arch.irq_type) >> + if (vcpu->arch.irq_type != KVMPPC_IRQ_DEFAULT) >> return -EBUSY; >> if (kvmppc_xive_find_server(vcpu->kvm, cpu)) { >> pr_devel("Duplicate !\n"); >> @@ -1855,6 +1867,39 @@ static void kvmppc_xive_free(struct kvm_device *dev) >> kfree(dev); >> } >> >> +static void kvmppc_xive_release(struct kvm_device *dev) >> +{ >> + struct kvmppc_xive *xive = dev->private; >> + struct kvm *kvm = xive->kvm; >> + struct kvm_vcpu *vcpu; >> + int i; >> + >> + pr_devel("Releasing xive device\n"); >> + >> + /* >> + * When releasing the KVM device fd, the vCPUs can still be >> + * running and we should clean up the vCPU interrupt >> + * presenters first. >> + */ >> + if (atomic_read(&kvm->online_vcpus) != 0) { > > What prevents online_vcpus from becoming non-zero after this test, but > before the kvmppc_xive_free()? I am not sure what you mean. kvmppc_xive_free() is gone with this patch. It has been replaced by kvmppc_xive_release(). > Is the test actually necessary? The operations below should be safe > even if there are no online cpus, yes? ah, yes. kvm_for_each_vcpu() should be safe to use anyhow. >> + /* >> + * call kick_all_cpus_sync() to ensure that all CPUs >> + * have executed any pending interrupts >> + */ >> + if (is_kvmppc_hv_enabled(kvm)) >> + kick_all_cpus_sync();>> + /* >> + * TODO: There is still a race window with the early >> + * checks in kvmppc_native_connect_vcpu() >> + */ > > That's... not reassuring. What are the consequences of that race, a bogus ->xive pointer under the XIVE vCPU > and what do you plan to do about it? I don't think this is true any more with the release operation which will be called by the last user of the device file. Anyhow, xc->xive does not seem very useful (just like xc->valid) We should try to use only vcpu->kvm->arch.xive instead. I will propose some preliminary cleanups before introducing the new release operation. >> + kvm_for_each_vcpu(i, vcpu, kvm) >> + kvmppc_xive_cleanup_vcpu(vcpu); >> + } >> + >> + kvmppc_xive_free(dev); >> +} >> + >> struct kvmppc_xive *kvmppc_xive_get_device(struct kvm *kvm, u32 type) >> { >> struct kvmppc_xive *xive; >> @@ -2043,6 +2088,7 @@ struct kvm_device_ops kvm_xive_ops = { >> .name = "kvm-xive", >> .create = kvmppc_xive_create, >> .init = kvmppc_xive_init, >> + .release = kvmppc_xive_release, >> .destroy = kvmppc_xive_free, >> .set_attr = xive_set_attr, >> .get_attr = xive_get_attr, >> diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c >> index 092db0efe628..629da7bf2a89 100644 >> --- a/arch/powerpc/kvm/book3s_xive_native.c >> +++ b/arch/powerpc/kvm/book3s_xive_native.c >> @@ -996,6 +996,28 @@ static void kvmppc_xive_native_free(struct kvm_device *dev) >> kfree(dev); >> } >> >> +static void kvmppc_xive_native_release(struct kvm_device *dev) >> +{ >> + struct kvmppc_xive *xive = dev->private; >> + struct kvm *kvm = xive->kvm; >> + struct kvm_vcpu *vcpu; >> + int i; >> + >> + pr_devel("Releasing xive native device\n"); >> + >> + /* >> + * When releasing the KVM device fd, the vCPUs can still be >> + * running and we should clean up the vCPU interrupt >> + * presenters first. >> + */ >> + if (atomic_read(&kvm->online_vcpus) != 0) { > > Likewise here. > >> + kvm_for_each_vcpu(i, vcpu, kvm) >> + kvmppc_xive_native_cleanup_vcpu(vcpu); >> + } >> + >> + kvmppc_xive_native_free(dev); >> +} >> + >> static int kvmppc_xive_native_create(struct kvm_device *dev, u32 type) >> { >> struct kvmppc_xive *xive; >> @@ -1187,6 +1209,7 @@ struct kvm_device_ops kvm_xive_native_ops = { >> .name = "kvm-xive-native", >> .create = kvmppc_xive_native_create, >> .init = kvmppc_xive_native_init, >> + .release = kvmppc_xive_native_release, >> .destroy = kvmppc_xive_native_free, >> .set_attr = kvmppc_xive_native_set_attr, >> .get_attr = kvmppc_xive_native_get_attr, >> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c >> index ea2018ae1cd7..ea2619d5ca98 100644 >> --- a/virt/kvm/kvm_main.c >> +++ b/virt/kvm/kvm_main.c >> @@ -2938,6 +2938,19 @@ static int kvm_device_release(struct inode *inode, struct file *filp) >> struct kvm_device *dev = filp->private_data; >> struct kvm *kvm = dev->kvm; >> >> + if (!dev) >> + return -ENODEV; >> + >> + if (dev->kvm != kvm) >> + return -EPERM; >> + >> + if (dev->ops->release) { >> + mutex_lock(&kvm->lock); >> + list_del(&dev->vm_node); >> + dev->ops->release(dev); >> + mutex_unlock(&kvm->lock); >> + } >> + > > Wasn't there a big comment that explained that release replaced > destroy somewhere? Yes. I did add a comment in the "V5 errata" series. I should be sending a v6 this week, to clarify all these attempts to solve the device switching. Thanks, C. > >> kvm_put_kvm(kvm); >> return 0; >> } >