On Thu, Apr 11, 2019 at 03:53:02PM +0200, Cédric Le Goater wrote: > When a P9 sPAPR VM boots, the CAS negotiation process determines which > interrupt mode to use (XICS legacy or XIVE native) and invokes a > machine reset to activate the chosen mode. > > We introduce 'release' methods for the XICS-on-XIVE and the XIVE > native KVM devices which are called when the file descriptor of the > device is closed after the TIMA and ESB pages have been unmapped. > They perform the necessary cleanups : clear the vCPU interrupt > presenters that could be attached and then destroy the device. The > 'release' methods replace the 'destroy' methods as 'destroy' is not > called anymore once 'release' is. Compatibility with older QEMU is > nevertheless maintained. > > This is not considered as a safe operation as the vCPUs are still > running and could be referencing the KVM device through their > presenters. To protect the system from any breakage, the kvmppc_xive > objects representing both KVM devices are now stored in an array under > the VM. Allocation is performed on first usage and memory is freed > only when the VM exits. > > Signed-off-by: Cédric Le Goater <clg@xxxxxxxx> Reviewed-by: David Gibson <david@xxxxxxxxxxxxxxxxxxxxx> Although a nit.. > --- > arch/powerpc/include/asm/kvm_host.h | 1 + > arch/powerpc/kvm/book3s_xive.h | 1 + > arch/powerpc/kvm/book3s_xive.c | 82 +++++++++++++++++++++++++-- > arch/powerpc/kvm/book3s_xive_native.c | 30 ++++++++-- > arch/powerpc/kvm/powerpc.c | 9 +++ > 5 files changed, 112 insertions(+), 11 deletions(-) > > diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h > index 9cc6abdce1b9..ed059c95e56a 100644 > --- a/arch/powerpc/include/asm/kvm_host.h > +++ b/arch/powerpc/include/asm/kvm_host.h > @@ -314,6 +314,7 @@ struct kvm_arch { > #ifdef CONFIG_KVM_XICS > struct kvmppc_xics *xics; > struct kvmppc_xive *xive; > + struct kvmppc_xive *xive_devices[2]; This is essentially a bool-indexed array, which is a fairly confusing thing. It thing using separate fields with meaningful names would be better. It'll mean slightly more code in get_device() but not really that much, and I think the intent will be clearer. > struct kvmppc_passthru_irqmap *pimap; > #endif > struct kvmppc_ops *kvm_ops; > diff --git a/arch/powerpc/kvm/book3s_xive.h b/arch/powerpc/kvm/book3s_xive.h > index e011622dc038..426146332984 100644 > --- a/arch/powerpc/kvm/book3s_xive.h > +++ b/arch/powerpc/kvm/book3s_xive.h > @@ -283,6 +283,7 @@ void kvmppc_xive_free_sources(struct kvmppc_xive_src_block *sb); > int kvmppc_xive_select_target(struct kvm *kvm, u32 *server, u8 prio); > int kvmppc_xive_attach_escalation(struct kvm_vcpu *vcpu, u8 prio, > bool single_escalation); > +struct kvmppc_xive *kvmppc_xive_get_device(struct kvm *kvm, u32 type); > > #endif /* CONFIG_KVM_XICS */ > #endif /* _KVM_PPC_BOOK3S_XICS_H */ > diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c > index 480a3fc6b9fd..7dec0c350a14 100644 > --- a/arch/powerpc/kvm/book3s_xive.c > +++ b/arch/powerpc/kvm/book3s_xive.c > @@ -1100,11 +1100,19 @@ void kvmppc_xive_disable_vcpu_interrupts(struct kvm_vcpu *vcpu) > void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu) > { > struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; > - struct kvmppc_xive *xive = xc->xive; > + struct kvmppc_xive *xive; > int i; > > + if (!kvmppc_xics_enabled(vcpu)) > + return; > + > + if (!xc) > + return; > + > pr_devel("cleanup_vcpu(cpu=%d)\n", xc->server_num); > > + xive = xc->xive; > + > /* Ensure no interrupt is still routed to that VP */ > xc->valid = false; > kvmppc_xive_disable_vcpu_interrupts(vcpu); > @@ -1141,6 +1149,10 @@ void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu) > } > /* Free the VP */ > kfree(xc); > + > + /* Cleanup the vcpu */ > + vcpu->arch.irq_type = KVMPPC_IRQ_DEFAULT; > + vcpu->arch.xive_vcpu = NULL; > } > > int kvmppc_xive_connect_vcpu(struct kvm_device *dev, > @@ -1158,7 +1170,7 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev, > } > if (xive->kvm != vcpu->kvm) > return -EPERM; > - if (vcpu->arch.irq_type) > + if (vcpu->arch.irq_type != KVMPPC_IRQ_DEFAULT) > return -EBUSY; > if (kvmppc_xive_find_server(vcpu->kvm, cpu)) { > pr_devel("Duplicate !\n"); > @@ -1824,12 +1836,39 @@ void kvmppc_xive_free_sources(struct kvmppc_xive_src_block *sb) > } > } > > -static void kvmppc_xive_free(struct kvm_device *dev) > +/* > + * Called when device fd is closed > + */ > +static void kvmppc_xive_release(struct kvm_device *dev) > { > struct kvmppc_xive *xive = dev->private; > struct kvm *kvm = xive->kvm; > + struct kvm_vcpu *vcpu; > int i; > > + pr_devel("Releasing xive device\n"); > + > + /* > + * When releasing the KVM device fd, the vCPUs can still be > + * running and we should clean up the vCPU interrupt > + * presenters first. > + */ > + if (atomic_read(&kvm->online_vcpus) != 0) { > + /* > + * call kick_all_cpus_sync() to ensure that all CPUs > + * have executed any pending interrupts > + */ > + if (is_kvmppc_hv_enabled(kvm)) > + kick_all_cpus_sync(); > + > + /* > + * TODO: There is still a race window with the early > + * checks in kvmppc_native_connect_vcpu() > + */ > + kvm_for_each_vcpu(i, vcpu, kvm) > + kvmppc_xive_cleanup_vcpu(vcpu); > + } > + > debugfs_remove(xive->dentry); > > if (kvm) > @@ -1846,11 +1885,42 @@ static void kvmppc_xive_free(struct kvm_device *dev) > if (xive->vp_base != XIVE_INVALID_VP) > xive_native_free_vp_block(xive->vp_base); > > + /* > + * A reference of the kvmppc_xive pointer is now kept under > + * the xive_devices[] array of the machine for reuse. It is > + * freed when the VM is destroyed for now until we fix all the > + * execution paths. > + */ > > - kfree(xive); > kfree(dev); > } > > +/* > + * When the guest chooses the interrupt mode (XICS legacy or XIVE > + * native), the VM will switch of KVM device. The previous device will > + * be "released" before the new one is created. > + * > + * Until we are sure all execution paths are well protected, provide a > + * fail safe (transitional) method for device destruction, in which > + * the XIVE device pointer is recycled and not directly freed. > + */ > +struct kvmppc_xive *kvmppc_xive_get_device(struct kvm *kvm, u32 type) > +{ > + struct kvmppc_xive *xive; > + bool xive_native_index = type == KVM_DEV_TYPE_XIVE; > + > + xive = kvm->arch.xive_devices[xive_native_index]; > + > + if (!xive) { > + xive = kzalloc(sizeof(*xive), GFP_KERNEL); > + kvm->arch.xive_devices[xive_native_index] = xive; > + } else { > + memset(xive, 0, sizeof(*xive)); > + } > + > + return xive; > +} > + > static int kvmppc_xive_create(struct kvm_device *dev, u32 type) > { > struct kvmppc_xive *xive; > @@ -1859,7 +1929,7 @@ static int kvmppc_xive_create(struct kvm_device *dev, u32 type) > > pr_devel("Creating xive for partition\n"); > > - xive = kzalloc(sizeof(*xive), GFP_KERNEL); > + xive = kvmppc_xive_get_device(kvm, type); > if (!xive) > return -ENOMEM; > > @@ -2024,7 +2094,7 @@ struct kvm_device_ops kvm_xive_ops = { > .name = "kvm-xive", > .create = kvmppc_xive_create, > .init = kvmppc_xive_init, > - .destroy = kvmppc_xive_free, > + .release = kvmppc_xive_release, > .set_attr = xive_set_attr, > .get_attr = xive_get_attr, > .has_attr = xive_has_attr, > diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c > index 62648f833adf..a99af2766ce3 100644 > --- a/arch/powerpc/kvm/book3s_xive_native.c > +++ b/arch/powerpc/kvm/book3s_xive_native.c > @@ -964,15 +964,29 @@ static int kvmppc_xive_native_has_attr(struct kvm_device *dev, > return -ENXIO; > } > > -static void kvmppc_xive_native_free(struct kvm_device *dev) > +/* > + * Called when device fd is closed > + */ > +static void kvmppc_xive_native_release(struct kvm_device *dev) > { > struct kvmppc_xive *xive = dev->private; > struct kvm *kvm = xive->kvm; > + struct kvm_vcpu *vcpu; > int i; > > debugfs_remove(xive->dentry); > > - pr_devel("Destroying xive native device\n"); > + pr_devel("Releasing xive native device\n"); > + > + /* > + * When releasing the KVM device fd, the vCPUs can still be > + * running and we should clean up the vCPU interrupt > + * presenters first. > + */ > + if (atomic_read(&kvm->online_vcpus) != 0) { > + kvm_for_each_vcpu(i, vcpu, kvm) > + kvmppc_xive_native_cleanup_vcpu(vcpu); > + } > > if (kvm) > kvm->arch.xive = NULL; > @@ -987,7 +1001,13 @@ static void kvmppc_xive_native_free(struct kvm_device *dev) > if (xive->vp_base != XIVE_INVALID_VP) > xive_native_free_vp_block(xive->vp_base); > > - kfree(xive); > + /* > + * A reference of the kvmppc_xive pointer is now kept under > + * the xive_devices[] array of the machine for reuse. It is > + * freed when the VM is destroyed for now until we fix all the > + * execution paths. > + */ > + > kfree(dev); > } > > @@ -1002,7 +1022,7 @@ static int kvmppc_xive_native_create(struct kvm_device *dev, u32 type) > if (kvm->arch.xive) > return -EEXIST; > > - xive = kzalloc(sizeof(*xive), GFP_KERNEL); > + xive = kvmppc_xive_get_device(kvm, type); > if (!xive) > return -ENOMEM; > > @@ -1182,7 +1202,7 @@ struct kvm_device_ops kvm_xive_native_ops = { > .name = "kvm-xive-native", > .create = kvmppc_xive_native_create, > .init = kvmppc_xive_native_init, > - .destroy = kvmppc_xive_native_free, > + .release = kvmppc_xive_native_release, > .set_attr = kvmppc_xive_native_set_attr, > .get_attr = kvmppc_xive_native_get_attr, > .has_attr = kvmppc_xive_native_has_attr, > diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c > index f54926c78320..9b9c8a9f28b5 100644 > --- a/arch/powerpc/kvm/powerpc.c > +++ b/arch/powerpc/kvm/powerpc.c > @@ -501,6 +501,15 @@ void kvm_arch_destroy_vm(struct kvm *kvm) > > mutex_unlock(&kvm->lock); > > + /* > + * Free the XIVE devices which are not directly freed by the > + * device 'release' method > + */ > + for (i = 0; i < ARRAY_SIZE(kvm->arch.xive_devices); i++) { > + kfree(kvm->arch.xive_devices[i]); > + kvm->arch.xive_devices[i] = NULL; > + } > + > /* drop the module reference */ > module_put(kvm->arch.kvm_ops->owner); > } -- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibson
Attachment:
signature.asc
Description: PGP signature