On 08/10/2018 20:29, Jim Mattson wrote: > Under nested virtualization, the L1 hypervisor may intercept an > exception raised during the execution of L2 before the exception > is delivered. When the intercepted exception is #PF, the VM-exit > to the L1 hypervisor precedes the modification of CR2. When the > intercepted exception is #DB, the VM-exit to the L1 hypervisor > precedes the modifications of DR6 and DR7 under VMX, but the > VM-exit to the L1 hypervisor follows the modifications of DR6 and > DR7 under SVM. > > At present, CR2 is modified too early under both VMX and SVM. DR6 is > modified too early under VMX. DR7 is modified at the appropriate time. > Unfortunately, it is possible to exit to userspace with one of these > exceptions pending, and userspace may rely on the premature > side-effects. It is also possible for userspace to inject one of these > exceptions, in which case, userspace will presumably have already > processed the side-effects. > > To address this problem, a new per-VM capability > (KVM_CAP_EXCEPTION_PAYLOAD) will be introduced. When this capability > is enabled by userspace, the faulting linear address will be included > with the information about a pending #PF in L2, and the "new DR6 bits" > will be included with the information about a pending #DB in L2. This > ancillary exception information is carried in a new "payload" field. 
> > Reported-by: Jim Mattson <jmattson@xxxxxxxxxx> > Suggested-by: Paolo Bonzini <pbonzini@xxxxxxxxxx> > Signed-off-by: Jim Mattson <jmattson@xxxxxxxxxx> > Reviewed-by: Peter Shier <pshier@xxxxxxxxxx> > --- > Documentation/virtual/kvm/api.txt | 13 +++++++++++-- > arch/x86/include/asm/kvm_host.h | 3 +++ > arch/x86/include/uapi/asm/kvm.h | 6 ++++-- > arch/x86/kvm/x86.c | 28 ++++++++++++++++++++++++--- > tools/arch/x86/include/uapi/asm/kvm.h | 6 ++++-- > 5 files changed, 47 insertions(+), 9 deletions(-) > > diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt > index 647f94128a85..2df2cca81cf5 100644 > --- a/Documentation/virtual/kvm/api.txt > +++ b/Documentation/virtual/kvm/api.txt > @@ -850,7 +850,7 @@ struct kvm_vcpu_events { > __u8 injected; > __u8 nr; > __u8 has_error_code; > - __u8 pad; > + __u8 has_payload; > __u32 error_code; > } exception; > struct { > @@ -873,9 +873,11 @@ struct kvm_vcpu_events { > __u8 smm_inside_nmi; > __u8 latched_init; > } smi; > + __u32 reserved[7]; > + __u64 exception_payload; > }; > > -Only two fields are defined in the flags field: > +The following bits are defined in the flags field: > > - KVM_VCPUEVENT_VALID_SHADOW may be set in the flags field to signal that > interrupt.shadow contains a valid state. > @@ -883,6 +885,10 @@ Only two fields are defined in the flags field: > - KVM_VCPUEVENT_VALID_SMM may be set in the flags field to signal that > smi contains a valid state. > > +- KVM_VCPUEVENT_VALID_PAYLOAD may be set in the flags field to signal that > + exception.has_payload and exception_payload contain valid state > + (i.e. KVM_CAP_EXCEPTION_PAYLOAD is enabled). > + > ARM/ARM64: > > If the guest accesses a device that is being emulated by the host kernel in > @@ -961,6 +967,9 @@ shall be written into the VCPU. > > KVM_VCPUEVENT_VALID_SMM can only be set if KVM_CAP_X86_SMM is available. > > +KVM_VCPUEVENT_VALID_PAYLOAD can only be set if KVM_CAP_EXCEPTION_PAYLOAD > +is enabled. 
> + > ARM/ARM64: > > Set the pending SError exception state for this VCPU. It is not possible to > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h > index 09b2e3e2cf1b..026229a593f2 100644 > --- a/arch/x86/include/asm/kvm_host.h > +++ b/arch/x86/include/asm/kvm_host.h > @@ -585,6 +585,8 @@ struct kvm_vcpu_arch { > bool has_error_code; > u8 nr; > u32 error_code; > + unsigned long payload; > + bool has_payload; > u8 nested_apf; > } exception; > > @@ -871,6 +873,7 @@ struct kvm_arch { > bool x2apic_broadcast_quirk_disabled; > > bool guest_can_read_msr_platform_info; > + bool exception_payload_enabled; > }; > > struct kvm_vm_stat { > diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h > index fd23d5778ea1..e3ea52bdd461 100644 > --- a/arch/x86/include/uapi/asm/kvm.h > +++ b/arch/x86/include/uapi/asm/kvm.h > @@ -288,6 +288,7 @@ struct kvm_reinject_control { > #define KVM_VCPUEVENT_VALID_SIPI_VECTOR 0x00000002 > #define KVM_VCPUEVENT_VALID_SHADOW 0x00000004 > #define KVM_VCPUEVENT_VALID_SMM 0x00000008 > +#define KVM_VCPUEVENT_VALID_PAYLOAD 0x00000010 > > /* Interrupt shadow states */ > #define KVM_X86_SHADOW_INT_MOV_SS 0x01 > @@ -299,7 +300,7 @@ struct kvm_vcpu_events { > __u8 injected; > __u8 nr; > __u8 has_error_code; > - __u8 pad; > + __u8 has_payload; > __u32 error_code; > } exception; > struct { > @@ -322,7 +323,8 @@ struct kvm_vcpu_events { > __u8 smm_inside_nmi; > __u8 latched_init; > } smi; > - __u32 reserved[9]; > + __u32 reserved[7]; > + __u64 exception_payload; > }; > > /* for KVM_GET/SET_DEBUGREGS */ > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > index edbf00ec56b3..dbc538d66505 100644 > --- a/arch/x86/kvm/x86.c > +++ b/arch/x86/kvm/x86.c > @@ -431,6 +431,8 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu, > vcpu->arch.exception.has_error_code = has_error; > vcpu->arch.exception.nr = nr; > vcpu->arch.exception.error_code = error_code; > + vcpu->arch.exception.has_payload = false; > 
+ vcpu->arch.exception.payload = 0; > return; > } > > @@ -455,6 +457,8 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu, > vcpu->arch.exception.has_error_code = true; > vcpu->arch.exception.nr = DF_VECTOR; > vcpu->arch.exception.error_code = 0; > + vcpu->arch.exception.has_payload = false; > + vcpu->arch.exception.payload = 0; > } else > /* replace previous exception with a new one in a hope > that instruction re-execution will regenerate lost > @@ -3373,8 +3377,9 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, > !kvm_exception_is_soft(vcpu->arch.exception.nr); > events->exception.nr = vcpu->arch.exception.nr; > events->exception.has_error_code = vcpu->arch.exception.has_error_code; > - events->exception.pad = 0; > events->exception.error_code = vcpu->arch.exception.error_code; > + events->exception.has_payload = vcpu->arch.exception.has_payload; > + events->exception_payload = vcpu->arch.exception.payload; > > events->interrupt.injected = > vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft; > @@ -3398,6 +3403,8 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, > events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING > | KVM_VCPUEVENT_VALID_SHADOW > | KVM_VCPUEVENT_VALID_SMM); > + if (vcpu->kvm->arch.exception_payload_enabled) > + events->flags |= KVM_VCPUEVENT_VALID_PAYLOAD; > memset(&events->reserved, 0, sizeof(events->reserved)); > } > > @@ -3409,12 +3416,18 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, > if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING > | KVM_VCPUEVENT_VALID_SIPI_VECTOR > | KVM_VCPUEVENT_VALID_SHADOW > - | KVM_VCPUEVENT_VALID_SMM)) > + | KVM_VCPUEVENT_VALID_SMM > + | KVM_VCPUEVENT_VALID_PAYLOAD)) > + return -EINVAL; > + > + if ((events->flags & KVM_VCPUEVENT_VALID_PAYLOAD) && > + !vcpu->kvm->arch.exception_payload_enabled) > return -EINVAL; > > if (events->exception.injected && > (events->exception.nr > 31 || events->exception.nr == NMI_VECTOR || > - 
is_guest_mode(vcpu))) > + (is_guest_mode(vcpu) && > + !vcpu->kvm->arch.exception_payload_enabled))) > return -EINVAL; > > /* INITs are latched while in SMM */ > @@ -3429,6 +3442,13 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, > vcpu->arch.exception.nr = events->exception.nr; > vcpu->arch.exception.has_error_code = events->exception.has_error_code; > vcpu->arch.exception.error_code = events->exception.error_code; > + if (vcpu->kvm->arch.exception_payload_enabled) { > + vcpu->arch.exception.has_payload = events->exception.has_payload; > + vcpu->arch.exception.payload = events->exception_payload; > + } else { > + vcpu->arch.exception.has_payload = 0; > + vcpu->arch.exception.payload = 0; > + } > > vcpu->arch.interrupt.injected = events->interrupt.injected; > vcpu->arch.interrupt.nr = events->interrupt.nr; > @@ -9463,6 +9483,8 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, > vcpu->arch.exception.nr = 0; > vcpu->arch.exception.has_error_code = false; > vcpu->arch.exception.error_code = 0; > + vcpu->arch.exception.has_payload = false; > + vcpu->arch.exception.payload = 0; > } else if (!apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) { > fault.vector = PF_VECTOR; > fault.error_code_valid = true; > diff --git a/tools/arch/x86/include/uapi/asm/kvm.h b/tools/arch/x86/include/uapi/asm/kvm.h > index 86299efa804a..c4e4f03ce436 100644 > --- a/tools/arch/x86/include/uapi/asm/kvm.h > +++ b/tools/arch/x86/include/uapi/asm/kvm.h > @@ -288,6 +288,7 @@ struct kvm_reinject_control { > #define KVM_VCPUEVENT_VALID_SIPI_VECTOR 0x00000002 > #define KVM_VCPUEVENT_VALID_SHADOW 0x00000004 > #define KVM_VCPUEVENT_VALID_SMM 0x00000008 > +#define KVM_VCPUEVENT_VALID_PAYLOAD 0x00000010 > > /* Interrupt shadow states */ > #define KVM_X86_SHADOW_INT_MOV_SS 0x01 > @@ -299,7 +300,7 @@ struct kvm_vcpu_events { > __u8 injected; > __u8 nr; > __u8 has_error_code; > - __u8 pad; > + __u8 has_payload; > __u32 error_code; > } exception; > struct { > @@ -322,7 +323,8 @@ 
struct kvm_vcpu_events { > __u8 smm_inside_nmi; > __u8 latched_init; > } smi; > - __u32 reserved[9]; > + __u32 reserved[7]; > + __u64 exception_payload; > }; > > /* for KVM_GET/SET_DEBUGREGS */ > I'm not applying any of the patches yet, but I'd be happy to queue it for 4.20 even (slightly) after -rc1. Once this is done, I think we can flip nested=1. Paolo