On Mon, 16 May 2022 09:08:13 +0000 Janosch Frank <frankja@xxxxxxxxxxxxx> wrote: > Sometimes dumping inside of a VM fails, is unavailable or doesn't > yield the required data. For these occasions we dump the VM from the > outside, writing memory and cpu data to a file. > > Up to now PV guests only supported dumping from the inside of the > guest through dumpers like KDUMP. A PV guest can be dumped from the > hypervisor but the data will be stale and / or encrypted. > > To get the actual state of the PV VM we need the help of the > Ultravisor who safeguards the VM state. New UV calls have been added > to initialize the dump, dump storage state data, dump cpu data and > complete the dump process. We expose these calls in this patch via a > new UV ioctl command. > > The sensitive parts of the dump data are encrypted, the dump key is > derived from the Customer Communication Key (CCK). This ensures that > only the owner of the VM who has the CCK can decrypt the dump data. > > The memory is dumped / read via a normal export call and a re-import > after the dump initialization is not needed (no re-encryption with a > dump key). > > Signed-off-by: Janosch Frank <frankja@xxxxxxxxxxxxx> > --- > arch/s390/include/asm/kvm_host.h | 1 + > arch/s390/kvm/kvm-s390.c | 126 +++++++++++++++++++++++++++++++ > arch/s390/kvm/kvm-s390.h | 2 + > arch/s390/kvm/pv.c | 113 +++++++++++++++++++++++++++ > include/uapi/linux/kvm.h | 15 ++++ > 5 files changed, 257 insertions(+) > > diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h > index 766028d54a3e..a0fbe4820e0a 100644 > --- a/arch/s390/include/asm/kvm_host.h > +++ b/arch/s390/include/asm/kvm_host.h > @@ -923,6 +923,7 @@ struct kvm_s390_pv { > u64 guest_len; > unsigned long stor_base; > void *stor_var; > + bool dumping; > }; > > struct kvm_arch{ > diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c > index de54f14e081e..6bf9dd85d50f 100644 > --- a/arch/s390/kvm/kvm-s390.c > +++ b/arch/s390/kvm/kvm-s390.c > @@ -2271,6 +2271,101 @@ static ssize_t kvm_s390_handle_pv_info(struct kvm_s390_pv_info *info) > } > } > > +static int kvm_s390_pv_dmp(struct kvm *kvm, struct kvm_pv_cmd *cmd, > + struct kvm_s390_pv_dmp dmp) > +{ > + int r = -EINVAL; > + void __user *result_buff = (void __user *)dmp.buff_addr; > + > + switch (dmp.subcmd) { > + case KVM_PV_DUMP_INIT: { > + if (kvm->arch.pv.dumping) > + break; > + > + /* > + * Block SIE entry as concurrent dump UVCs could lead > + * to validities. to fatal validity intercepts > + */ > + kvm_s390_vcpu_block_all(kvm); > + > + r = uv_cmd_nodata(kvm_s390_pv_get_handle(kvm), > + UVC_CMD_DUMP_INIT, &cmd->rc, &cmd->rrc); > + KVM_UV_EVENT(kvm, 3, "PROTVIRT DUMP INIT: rc %x rrc %x", > + cmd->rc, cmd->rrc); > + if (!r) { > + kvm->arch.pv.dumping = true; > + } else { > + kvm_s390_vcpu_unblock_all(kvm); > + r = -EINVAL; > + } > + break; > + } > + case KVM_PV_DUMP_CONFIG_STOR_STATE: { > + if (!kvm->arch.pv.dumping) > + break; > + > + /* > + * gaddr is an output parameter since we might stop > + * early. As dmp will be copied back in our caller, we > + * don't need to do it ourselves. > + */ > + r = kvm_s390_pv_dump_stor_state(kvm, result_buff, &dmp.gaddr, dmp.buff_len, > + &cmd->rc, &cmd->rrc); > + break; > + } > + case KVM_PV_DUMP_COMPLETE: { this starts to be quite complex, maybe push it into a separate function as well, like you did for KVM_PV_DUMP_CONFIG_STOR_STATE > + struct uv_cb_dump_complete complete = { > + .header.len = sizeof(complete), > + .header.cmd = UVC_CMD_DUMP_COMPLETE, > + .config_handle = kvm_s390_pv_get_handle(kvm), > + }; > + u64 *compl_data; > + > + r = -EINVAL; > + if (!kvm->arch.pv.dumping) > + break; > + > + if (dmp.buff_len < uv_info.conf_dump_finalize_len) > + break; > + > + /* Allocate dump area */ > + r = -ENOMEM; > + compl_data = vzalloc(uv_info.conf_dump_finalize_len); > + if (!compl_data) > + break; > + complete.dump_area_origin = (u64)compl_data; > + > + r = uv_call_sched(0, (u64)&complete); > + cmd->rc = complete.header.rc; > + cmd->rrc = complete.header.rrc; > + KVM_UV_EVENT(kvm, 3, "PROTVIRT DUMP COMPLETE: rc %x rrc %x", > + complete.header.rc, complete.header.rrc); > + > + if (!r) { > + /* > + * kvm_s390_pv_dealloc_vm() will also (mem)set > + * this to false on a reboot or other destroy > + * operation for this vm. > + */ > + kvm->arch.pv.dumping = false; > + kvm_s390_vcpu_unblock_all(kvm); > + r = copy_to_user(result_buff, compl_data, uv_info.conf_dump_finalize_len); > + if (r) > + r = -EFAULT; > + } > + vfree(compl_data); > + if (r > 0) > + r = -EINVAL; > + break; > + } > + default: > + r = -ENOTTY; > + break; > + } > + > + return r; > +} > + > static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd) > { > int r = 0; > @@ -2447,6 +2542,28 @@ static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd) > r = 0; > break; > } > + case KVM_PV_DUMP: { > + struct kvm_s390_pv_dmp dmp; > + > + r = -EINVAL; > + if (!kvm_s390_pv_is_protected(kvm)) > + break; > + > + r = -EFAULT; > + if (copy_from_user(&dmp, argp, sizeof(dmp))) > + break; > + > + r = kvm_s390_pv_dmp(kvm, cmd, dmp); > + if (r) > + break; > + > + if (copy_to_user(argp, &dmp, sizeof(dmp))) { > + r = -EFAULT; > + break; > + } > + > + break; > + } > default: > r = -ENOTTY; > } > @@ -4564,6 +4681,15 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) > struct kvm_run *kvm_run = vcpu->run; > int rc; > > + /* > + * Running a VM while dumping always has the potential to > + * produce inconsistent dump data. But for PV vcpus a SIE > + * entry while dumping could also lead to a validity which we "a fatal validity intercept" > + * absolutely want to avoid. > + */ > + if (vcpu->kvm->arch.pv.dumping) > + return -EINVAL; > + > if (kvm_run->immediate_exit) > return -EINTR; > > diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h > index 497d52a83c78..2868dd0bba25 100644 > --- a/arch/s390/kvm/kvm-s390.h > +++ b/arch/s390/kvm/kvm-s390.h > @@ -250,6 +250,8 @@ int kvm_s390_pv_set_sec_parms(struct kvm *kvm, void *hdr, u64 length, u16 *rc, > int kvm_s390_pv_unpack(struct kvm *kvm, unsigned long addr, unsigned long size, > unsigned long tweak, u16 *rc, u16 *rrc); > int kvm_s390_pv_set_cpu_state(struct kvm_vcpu *vcpu, u8 state); > +int kvm_s390_pv_dump_stor_state(struct kvm *kvm, void __user *buff_user, > + u64 *gaddr, u64 buff_user_len, u16 *rc, u16 *rrc); > > static inline u64 kvm_s390_pv_get_handle(struct kvm *kvm) > { > diff --git a/arch/s390/kvm/pv.c b/arch/s390/kvm/pv.c > index cc7c9599f43e..fd261667d2c2 100644 > --- a/arch/s390/kvm/pv.c > +++ b/arch/s390/kvm/pv.c > @@ -7,6 +7,7 @@ > */ > #include <linux/kvm.h> > #include <linux/kvm_host.h> > +#include <linux/minmax.h> > #include <linux/pagemap.h> > #include <linux/sched/signal.h> > #include <asm/gmap.h> > @@ -298,3 +299,115 @@ int kvm_s390_pv_set_cpu_state(struct kvm_vcpu *vcpu, u8 state) > return -EINVAL; > return 0; > } > + > +/* Size of the cache for the storage state dump data. 1MB for now */ > +#define DUMP_BUFF_LEN HPAGE_SIZE > + > +/* > + * kvm_s390_pv_dump_stor_state > + * > + * @kvm: pointer to the guest's KVM struct > + * @buff_user: Userspace pointer where we will write the results to > + * @gaddr: Starting absolute guest address for which the storage state > + * is requested. This value will be updated with the last > + * address for which data was written when returning to > + * userspace. > + * @buff_user_len: Length of the buff_user buffer > + * @rc: Pointer to where the uvcb return code is stored > + * @rrc: Pointer to where the uvcb return reason code is stored > + * please add: Context: kvm->lock needs to be held also explain that part of the user buffer might be written to even in case of failure (this also needs to go in the documentation) > + * Return: > + * 0 on success rc and rrc will also be set in case of success > + * -ENOMEM if allocating the cache fails > + * -EINVAL if gaddr is not aligned to 1MB > + * -EINVAL if buff_user_len is not aligned to uv_info.conf_dump_storage_state_len > + * -EINVAL if the UV call fails, rc and rrc will be set in this case have you considered a different code for UVC failure? so the caller can know that rc and rrc are meaningful or just explain that rc and rrc will always be set; if the UVC is not performed, rc and rrc will be 0 > + * -EFAULT if copying the result to buff_user failed > + */ > +int kvm_s390_pv_dump_stor_state(struct kvm *kvm, void __user *buff_user, > + u64 *gaddr, u64 buff_user_len, u16 *rc, u16 *rrc) > +{ > + struct uv_cb_dump_stor_state uvcb = { > + .header.cmd = UVC_CMD_DUMP_CONF_STOR_STATE, > + .header.len = sizeof(uvcb), > + .config_handle = kvm->arch.pv.handle, > + .gaddr = *gaddr, > + .dump_area_origin = 0, > + }; > + size_t buff_kvm_size; > + size_t size_done = 0; > + u8 *buff_kvm = NULL; > + int cc, ret; > + > + ret = -EINVAL; > + /* UV call processes 1MB guest storage chunks at a time */ > + if (!IS_ALIGNED(*gaddr, HPAGE_SIZE)) > + goto out; > + > + /* > + * We provide the storage state for 1MB chunks of guest > + * storage. The buffer will need to be aligned to > + * conf_dump_storage_state_len so we don't end on a partial > + * chunk. > + */ > + if (!buff_user_len || > + !IS_ALIGNED(buff_user_len, uv_info.conf_dump_storage_state_len)) > + goto out; > + > + /* > + * Allocate a buffer from which we will later copy to the user > + * process. We don't want userspace to dictate our buffer size > + * so we limit it to DUMP_BUFF_LEN. > + */ > + ret = -ENOMEM; > + buff_kvm_size = min_t(u64, buff_user_len, DUMP_BUFF_LEN); > + buff_kvm = vzalloc(buff_kvm_size); > + if (!buff_kvm) > + goto out; > + > + ret = 0; > + uvcb.dump_area_origin = (u64)buff_kvm; > + /* We will loop until the user buffer is filled or an error occurs */ > + do { > + /* Get 1MB worth of guest storage state data */ > + cc = uv_call_sched(0, (u64)&uvcb); > + > + /* All or nothing */ > + if (cc) { > + ret = -EINVAL; > + break; > + } > + > + size_done += uv_info.conf_dump_storage_state_len; maybe save this in a local const variable with a shorter name? would be more readable? const u64 dump_len = uv_info.conf_dump_storage_state_len; > + uvcb.dump_area_origin += uv_info.conf_dump_storage_state_len; > + buff_user_len -= uv_info.conf_dump_storage_state_len; > + uvcb.gaddr += HPAGE_SIZE; > + > + /* KVM Buffer full, time to copy to the process */ > + if (!buff_user_len || size_done == DUMP_BUFF_LEN) { > + if (copy_to_user(buff_user, buff_kvm, size_done)) { > + ret = -EFAULT; > + break; > + } > + > + buff_user += size_done; > + size_done = 0; > + uvcb.dump_area_origin = (u64)buff_kvm; > + } > + } while (buff_user_len); > + > + /* Report back where we ended dumping */ > + *gaddr = uvcb.gaddr; > + > + /* Lets only log errors, we don't want to spam */ > +out: > + if (ret) > + KVM_UV_EVENT(kvm, 3, > + "PROTVIRT DUMP STORAGE STATE: addr %llx ret %d, uvcb rc %x rrc %x", > + uvcb.gaddr, ret, uvcb.header.rc, uvcb.header.rrc); > + *rc = uvcb.header.rc; > + *rrc = uvcb.header.rrc; > + vfree(buff_kvm); > + > + return ret; > +} > diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h > index bb2f91bc2305..1c60c2d314ba 100644 > --- a/include/uapi/linux/kvm.h > +++ b/include/uapi/linux/kvm.h > @@ -1653,6 +1653,20 @@ struct kvm_s390_pv_unp { > __u64 tweak; > }; > > +enum pv_cmd_dmp_id { > + KVM_PV_DUMP_INIT, > + KVM_PV_DUMP_CONFIG_STOR_STATE, > + KVM_PV_DUMP_COMPLETE, > +}; > + > +struct kvm_s390_pv_dmp { > + __u64 subcmd; > + __u64 buff_addr; > + __u64 buff_len; > + __u64 gaddr; /* For dump storage state */ > + __u64 reserved[4]; > +}; > + > enum pv_cmd_info_id { > KVM_PV_INFO_VM, > KVM_PV_INFO_DUMP, > @@ -1696,6 +1710,7 @@ enum pv_cmd_id { > KVM_PV_PREP_RESET, > KVM_PV_UNSHARE_ALL, > KVM_PV_INFO, > + KVM_PV_DUMP, > }; > > struct kvm_pv_cmd {