On Thu, Apr 18, 2019 at 12:39:42PM +0200, Cédric Le Goater wrote: > When a P9 sPAPR VM boots, the CAS negotiation process determines which > interrupt mode to use (XICS legacy or XIVE native) and invokes a > machine reset to activate the chosen mode. > > We introduce 'release' methods for the XICS-on-XIVE and the XIVE > native KVM devices which are called when the file descriptor of the > device is closed after the TIMA and ESB pages have been unmapped. > They perform the necessary cleanups : clear the vCPU interrupt > presenters that could be attached and then destroy the device. The > 'release' methods replace the 'destroy' methods as 'destroy' is not > called anymore once 'release' is. Compatibility with older QEMU is > nevertheless maintained. > > This is not considered as a safe operation as the vCPUs are still > running and could be referencing the KVM device through their > presenters. To protect the system from any breakage, the kvmppc_xive > objects representing both KVM devices are now stored in an array under > the VM. Allocation is performed on first usage and memory is freed > only when the VM exits. > > Signed-off-by: Cédric Le Goater <clg@xxxxxxxx> Reviewed-by: David Gibson <david@xxxxxxxxxxxxxxxxxxxxx> > --- > > Changes since v5 : > > - replaced the kvmppc_xive device array with named kvmppc_xive structs. > - in kvmppc_xive_cleanup_vcpu(), replaced xc->xive by vcpu->kvm->arch.xive > which is always valid in this context > - in kvmppc_xive_release(), removed the comment on the race. The > device being destroyed when the fd is closed, dev->private should > always be valid when used. Also removed the kick_all_cpus_sync() to > flush the IPIs. > - in the both release operation, removed the test on &kvm->online_vcpus. > - devices are now freed when the machine exits. > > arch/powerpc/include/asm/kvm_host.h | 6 ++- > arch/powerpc/kvm/book3s_xive.h | 1 + > arch/powerpc/kvm/book3s_xive.c | 67 ++++++++++++++++++++++++--- > arch/powerpc/kvm/book3s_xive_native.c | 28 +++++++++-- > arch/powerpc/kvm/powerpc.c | 9 ++++ > 5 files changed, 99 insertions(+), 12 deletions(-) > > diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h > index 9cc6abdce1b9..c7abc7c3c6cd 100644 > --- a/arch/powerpc/include/asm/kvm_host.h > +++ b/arch/powerpc/include/asm/kvm_host.h > @@ -313,7 +313,11 @@ struct kvm_arch { > #endif > #ifdef CONFIG_KVM_XICS > struct kvmppc_xics *xics; > - struct kvmppc_xive *xive; > + struct kvmppc_xive *xive; /* Current XIVE device in use */ > + struct { > + struct kvmppc_xive *native; > + struct kvmppc_xive *xics_on_xive; > + } xive_devices; > struct kvmppc_passthru_irqmap *pimap; > #endif > struct kvmppc_ops *kvm_ops; > diff --git a/arch/powerpc/kvm/book3s_xive.h b/arch/powerpc/kvm/book3s_xive.h > index e011622dc038..426146332984 100644 > --- a/arch/powerpc/kvm/book3s_xive.h > +++ b/arch/powerpc/kvm/book3s_xive.h > @@ -283,6 +283,7 @@ void kvmppc_xive_free_sources(struct kvmppc_xive_src_block *sb); > int kvmppc_xive_select_target(struct kvm *kvm, u32 *server, u8 prio); > int kvmppc_xive_attach_escalation(struct kvm_vcpu *vcpu, u8 prio, > bool single_escalation); > +struct kvmppc_xive *kvmppc_xive_get_device(struct kvm *kvm, u32 type); > > #endif /* CONFIG_KVM_XICS */ > #endif /* _KVM_PPC_BOOK3S_XICS_H */ > diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c > index 480a3fc6b9fd..922689b768e6 100644 > --- a/arch/powerpc/kvm/book3s_xive.c > +++ b/arch/powerpc/kvm/book3s_xive.c > @@ -1100,9 +1100,15 @@ void kvmppc_xive_disable_vcpu_interrupts(struct kvm_vcpu *vcpu) > void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu) > { > struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; > - struct kvmppc_xive *xive = xc->xive; > + struct kvmppc_xive *xive = vcpu->kvm->arch.xive; > int i; > > + if (!kvmppc_xics_enabled(vcpu)) > + return; > + > + if (!xc) > + return; > + > pr_devel("cleanup_vcpu(cpu=%d)\n", xc->server_num); > > /* Ensure no interrupt is still routed to that VP */ > @@ -1141,6 +1147,10 @@ void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu) > } > /* Free the VP */ > kfree(xc); > + > + /* Cleanup the vcpu */ > + vcpu->arch.irq_type = KVMPPC_IRQ_DEFAULT; > + vcpu->arch.xive_vcpu = NULL; > } > > int kvmppc_xive_connect_vcpu(struct kvm_device *dev, > @@ -1158,7 +1168,7 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev, > } > if (xive->kvm != vcpu->kvm) > return -EPERM; > - if (vcpu->arch.irq_type) > + if (vcpu->arch.irq_type != KVMPPC_IRQ_DEFAULT) > return -EBUSY; > if (kvmppc_xive_find_server(vcpu->kvm, cpu)) { > pr_devel("Duplicate !\n"); > @@ -1824,12 +1834,26 @@ void kvmppc_xive_free_sources(struct kvmppc_xive_src_block *sb) > } > } > > -static void kvmppc_xive_free(struct kvm_device *dev) > +/* > + * Called when device fd is closed > + */ > +static void kvmppc_xive_release(struct kvm_device *dev) > { > struct kvmppc_xive *xive = dev->private; > struct kvm *kvm = xive->kvm; > + struct kvm_vcpu *vcpu; > int i; > > + pr_devel("Releasing xive device\n"); > + > + /* > + * When releasing the KVM device fd, the vCPUs can still be > + * running and we should clean up the vCPU interrupt > + * presenters first. > + */ > + kvm_for_each_vcpu(i, vcpu, kvm) > + kvmppc_xive_cleanup_vcpu(vcpu); > + > debugfs_remove(xive->dentry); > > if (kvm) > @@ -1846,11 +1870,42 @@ static void kvmppc_xive_free(struct kvm_device *dev) > if (xive->vp_base != XIVE_INVALID_VP) > xive_native_free_vp_block(xive->vp_base); > > + /* > + * A reference of the kvmppc_xive pointer is now kept under > + * the xive_devices struct of the machine for reuse. It is > + * freed when the VM is destroyed for now until we fix all the > + * execution paths. > + */ > > - kfree(xive); > kfree(dev); > } > > +/* > + * When the guest chooses the interrupt mode (XICS legacy or XIVE > + * native), the VM will switch of KVM device. The previous device will > + * be "released" before the new one is created. > + * > + * Until we are sure all execution paths are well protected, provide a > + * fail safe (transitional) method for device destruction, in which > + * the XIVE device pointer is recycled and not directly freed. > + */ > +struct kvmppc_xive *kvmppc_xive_get_device(struct kvm *kvm, u32 type) > +{ > + struct kvmppc_xive **kvm_xive_device = type == KVM_DEV_TYPE_XIVE ? > + &kvm->arch.xive_devices.native : > + &kvm->arch.xive_devices.xics_on_xive; > + struct kvmppc_xive *xive = *kvm_xive_device; > + > + if (!xive) { > + xive = kzalloc(sizeof(*xive), GFP_KERNEL); > + *kvm_xive_device = xive; > + } else { > + memset(xive, 0, sizeof(*xive)); > + } > + > + return xive; > +} > + > static int kvmppc_xive_create(struct kvm_device *dev, u32 type) > { > struct kvmppc_xive *xive; > @@ -1859,7 +1914,7 @@ static int kvmppc_xive_create(struct kvm_device *dev, u32 type) > > pr_devel("Creating xive for partition\n"); > > - xive = kzalloc(sizeof(*xive), GFP_KERNEL); > + xive = kvmppc_xive_get_device(kvm, type); > if (!xive) > return -ENOMEM; > > @@ -2024,7 +2079,7 @@ struct kvm_device_ops kvm_xive_ops = { > .name = "kvm-xive", > .create = kvmppc_xive_create, > .init = kvmppc_xive_init, > - .destroy = kvmppc_xive_free, > + .release = kvmppc_xive_release, > .set_attr = xive_set_attr, > .get_attr = xive_get_attr, > .has_attr = xive_has_attr, > diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c > index 62648f833adf..0497272a72fa 100644 > --- a/arch/powerpc/kvm/book3s_xive_native.c > +++ b/arch/powerpc/kvm/book3s_xive_native.c > @@ -964,15 +964,27 @@ static int kvmppc_xive_native_has_attr(struct kvm_device *dev, > return -ENXIO; > } > > -static void kvmppc_xive_native_free(struct kvm_device *dev) > +/* > + * Called when device fd is closed > + */ > +static void kvmppc_xive_native_release(struct kvm_device *dev) > { > struct kvmppc_xive *xive = dev->private; > struct kvm *kvm = xive->kvm; > + struct kvm_vcpu *vcpu; > int i; > > debugfs_remove(xive->dentry); > > - pr_devel("Destroying xive native device\n"); > + pr_devel("Releasing xive native device\n"); > + > + /* > + * When releasing the KVM device fd, the vCPUs can still be > + * running and we should clean up the vCPU interrupt > + * presenters first. > + */ > + kvm_for_each_vcpu(i, vcpu, kvm) > + kvmppc_xive_native_cleanup_vcpu(vcpu); > > if (kvm) > kvm->arch.xive = NULL; > @@ -987,7 +999,13 @@ static void kvmppc_xive_native_free(struct kvm_device *dev) > if (xive->vp_base != XIVE_INVALID_VP) > xive_native_free_vp_block(xive->vp_base); > > - kfree(xive); > + /* > + * A reference of the kvmppc_xive pointer is now kept under > + * the xive_devices struct of the machine for reuse. It is > + * freed when the VM is destroyed for now until we fix all the > + * execution paths. > + */ > + > kfree(dev); > } > > @@ -1002,7 +1020,7 @@ static int kvmppc_xive_native_create(struct kvm_device *dev, u32 type) > if (kvm->arch.xive) > return -EEXIST; > > - xive = kzalloc(sizeof(*xive), GFP_KERNEL); > + xive = kvmppc_xive_get_device(kvm, type); > if (!xive) > return -ENOMEM; > > @@ -1182,7 +1200,7 @@ struct kvm_device_ops kvm_xive_native_ops = { > .name = "kvm-xive-native", > .create = kvmppc_xive_native_create, > .init = kvmppc_xive_native_init, > - .destroy = kvmppc_xive_native_free, > + .release = kvmppc_xive_native_release, > .set_attr = kvmppc_xive_native_set_attr, > .get_attr = kvmppc_xive_native_get_attr, > .has_attr = kvmppc_xive_native_has_attr, > diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c > index f54926c78320..7f41a68d8094 100644 > --- a/arch/powerpc/kvm/powerpc.c > +++ b/arch/powerpc/kvm/powerpc.c > @@ -501,6 +501,15 @@ void kvm_arch_destroy_vm(struct kvm *kvm) > > mutex_unlock(&kvm->lock); > > + /* > + * Free the XIVE devices which are not directly freed by the > + * device 'release' method > + */ > + kfree(kvm->arch.xive_devices.native); > + kvm->arch.xive_devices.native = NULL; > + kfree(kvm->arch.xive_devices.xics_on_xive); > + kvm->arch.xive_devices.xics_on_xive = NULL; > + > /* drop the module reference */ > module_put(kvm->arch.kvm_ops->owner); > } -- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibson
Attachment:
signature.asc
Description: PGP signature