On machines without the Destroy Secure Configuration Fast UVC, the
topmost level of page tables is set aside and freed asynchronously as
the last step of the asynchronous teardown.

Each gmap has a host_to_guest radix tree mapping host (userspace)
addresses (with 1M granularity) to gmap segment table entries (pmds).

If a guest is smaller than 2GB, the topmost level of page tables is the
segment table (i.e. there are only 2 levels). Replacing it means that
the pointers in the host_to_guest mapping would become stale and cause
all kinds of nasty issues.

This patch fixes the issue by synchronously destroying all guests with
only 2 levels of page tables in kvm_s390_pv_set_aside. This will speed
up the process and avoid the issue altogether.

Update s390_replace_asce so it refuses to replace segment type ASCEs.

Signed-off-by: Claudio Imbrenda <imbrenda@xxxxxxxxxxxxx>
Fixes: fb491d5500a7 ("KVM: s390: pv: asynchronous destroy for reboot")
---
 arch/s390/kvm/pv.c  | 35 ++++++++++++++++++++---------------
 arch/s390/mm/gmap.c |  7 +++++++
 2 files changed, 27 insertions(+), 15 deletions(-)

diff --git a/arch/s390/kvm/pv.c b/arch/s390/kvm/pv.c
index e032ebbf51b9..ceb8cb628d62 100644
--- a/arch/s390/kvm/pv.c
+++ b/arch/s390/kvm/pv.c
@@ -39,6 +39,7 @@ struct pv_vm_to_be_destroyed {
 	u64 handle;
 	void *stor_var;
 	unsigned long stor_base;
+	bool small;
 };
 
 static void kvm_s390_clear_pv_state(struct kvm *kvm)
@@ -318,7 +319,11 @@ int kvm_s390_pv_set_aside(struct kvm *kvm, u16 *rc, u16 *rrc)
 	if (!priv)
 		return -ENOMEM;
 
-	if (is_destroy_fast_available()) {
+	if ((kvm->arch.gmap->asce & _ASCE_TYPE_MASK) == _ASCE_TYPE_SEGMENT) {
+		/* No need to do things asynchronously for VMs under 2GB */
+		res = kvm_s390_pv_deinit_vm(kvm, rc, rrc);
+		priv->small = true;
+	} else if (is_destroy_fast_available()) {
 		res = kvm_s390_pv_deinit_vm_fast(kvm, rc, rrc);
 	} else {
 		priv->stor_var = kvm->arch.pv.stor_var;
@@ -335,7 +340,8 @@
 		return res;
 	}
 
-	kvm_s390_destroy_lower_2g(kvm);
+	if (!priv->small)
+		kvm_s390_destroy_lower_2g(kvm);
 	kvm_s390_clear_pv_state(kvm);
 	kvm->arch.pv.set_aside = priv;
 
@@ -418,7 +424,10 @@ int kvm_s390_pv_deinit_cleanup_all(struct kvm *kvm, u16 *rc, u16 *rrc)
 
 	/* If a previous protected VM was set aside, put it in the need_cleanup list */
 	if (kvm->arch.pv.set_aside) {
-		list_add(kvm->arch.pv.set_aside, &kvm->arch.pv.need_cleanup);
+		if (((struct pv_vm_to_be_destroyed *)kvm->arch.pv.set_aside)->small)
+			kfree(kvm->arch.pv.set_aside);
+		else
+			list_add(kvm->arch.pv.set_aside, &kvm->arch.pv.need_cleanup);
 		kvm->arch.pv.set_aside = NULL;
 	}
 
@@ -485,26 +494,22 @@ int kvm_s390_pv_deinit_aside_vm(struct kvm *kvm, u16 *rc, u16 *rrc)
 	if (!p)
 		return -EINVAL;
 
-	/* When a fatal signal is received, stop immediately */
-	if (s390_uv_destroy_range_interruptible(kvm->mm, 0, TASK_SIZE_MAX))
+	if (p->small)
 		goto done;
-	if (kvm_s390_pv_dispose_one_leftover(kvm, p, rc, rrc))
-		ret = -EIO;
-	kfree(p);
-	p = NULL;
-done:
-	/*
-	 * p is not NULL if we aborted because of a fatal signal, in which
-	 * case queue the leftover for later cleanup.
-	 */
-	if (p) {
+	/* When a fatal signal is received, stop immediately */
+	if (s390_uv_destroy_range_interruptible(kvm->mm, 0, TASK_SIZE_MAX)) {
 		mutex_lock(&kvm->lock);
 		list_add(&p->list, &kvm->arch.pv.need_cleanup);
 		mutex_unlock(&kvm->lock);
 		/* Did not finish, but pretend things went well */
 		*rc = UVC_RC_EXECUTED;
 		*rrc = 42;
+		return 0;
 	}
+	if (kvm_s390_pv_dispose_one_leftover(kvm, p, rc, rrc))
+		ret = -EIO;
+done:
+	kfree(p);
 	return ret;
 }
 
diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
index 5a716bdcba05..2267cf9819b2 100644
--- a/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@ -2833,6 +2833,9 @@ EXPORT_SYMBOL_GPL(s390_unlist_old_asce);
  * s390_replace_asce - Try to replace the current ASCE of a gmap with a copy
  * @gmap: the gmap whose ASCE needs to be replaced
  *
+ * If the ASCE is a SEGMENT type then this function will return -EINVAL,
+ * otherwise the pointers in the host_to_guest radix tree will keep pointing
+ * to the wrong pages, causing use-after-free and memory corruption.
  * If the allocation of the new top level page table fails, the ASCE is not
  * replaced.
  * In any case, the old ASCE is always removed from the gmap CRST list.
@@ -2847,6 +2850,10 @@ int s390_replace_asce(struct gmap *gmap)
 
 	s390_unlist_old_asce(gmap);
 
+	/* Replacing segment type ASCEs would cause serious issues */
+	if ((gmap->asce & _ASCE_TYPE_MASK) == _ASCE_TYPE_SEGMENT)
+		return -EINVAL;
+
 	page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER);
 	if (!page)
 		return -ENOMEM;
-- 
2.39.2