On Thu, Aug 13, 2020 at 7:58 PM Wanpeng Li <kernellwp@xxxxxxxxx> wrote:
>
> On Fri, 7 Aug 2020 at 01:54, Oliver Upton <oupton@xxxxxxxxxx> wrote:
> >
> > KVM unconditionally provides PV features to the guest, regardless of the
> > configured CPUID. An unwitting guest that doesn't check
> > KVM_CPUID_FEATURES before use could access paravirt features that
> > userspace did not intend to provide. Fix this by checking the guest's
> > CPUID before performing any paravirtual operations.
> >
> > Introduce a capability, KVM_CAP_ENFORCE_PV_FEATURE_CPUID, to gate the
> > aforementioned enforcement. Migrating a VM from a host w/o this patch to
> > a host with this patch could silently change the ABI exposed to the
> > guest, warranting that we default to the old behavior and opt-in for
> > the new one.
> >
> > Reviewed-by: Jim Mattson <jmattson@xxxxxxxxxx>
> > Reviewed-by: Peter Shier <pshier@xxxxxxxxxx>
> > Signed-off-by: Oliver Upton <oupton@xxxxxxxxxx>
> > ---
> >  Documentation/virt/kvm/api.rst  | 11 ++++++
> >  arch/x86/include/asm/kvm_host.h |  6 +++
> >  arch/x86/kvm/cpuid.h            | 16 ++++++++
> >  arch/x86/kvm/x86.c              | 67 ++++++++++++++++++++++++++++++---
> >  include/uapi/linux/kvm.h        |  1 +
> >  5 files changed, 96 insertions(+), 5 deletions(-)
> >
> > diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
> > index 644e5326aa50..e8fc6e34f344 100644
> > --- a/Documentation/virt/kvm/api.rst
> > +++ b/Documentation/virt/kvm/api.rst
> > @@ -6155,3 +6155,14 @@ KVM can therefore start protected VMs.
> >  This capability governs the KVM_S390_PV_COMMAND ioctl and the
> >  KVM_MP_STATE_LOAD MP_STATE. KVM_SET_MP_STATE can fail for protected
> >  guests when the state change is invalid.
> > +
> > +
> > +8.24 KVM_CAP_ENFORCE_PV_CPUID
> > +-----------------------------
> > +
> > +Architectures: x86
> > +
> > +When enabled, KVM will disable paravirtual features provided to the
> > +guest according to the bits in the KVM_CPUID_FEATURES CPUID leaf
> > +(0x40000001). Otherwise, a guest may use the paravirtual features
> > +regardless of what has actually been exposed through the CPUID leaf.
> > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> > index 5ab3af7275d8..a641c3840a1e 100644
> > --- a/arch/x86/include/asm/kvm_host.h
> > +++ b/arch/x86/include/asm/kvm_host.h
> > @@ -788,6 +788,12 @@ struct kvm_vcpu_arch {
> >
> >         /* AMD MSRC001_0015 Hardware Configuration */
> >         u64 msr_hwcr;
> > +
> > +       /*
> > +        * Indicates whether PV emulation should be disabled if not present in
> > +        * the guest's cpuid.
> > +        */
> > +       bool enforce_pv_feature_cpuid;
> >  };
> >
> >  struct kvm_lpage_info {
> > diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
> > index 3a923ae15f2f..c364c2877583 100644
> > --- a/arch/x86/kvm/cpuid.h
> > +++ b/arch/x86/kvm/cpuid.h
> > @@ -5,6 +5,7 @@
> >  #include "x86.h"
> >  #include <asm/cpu.h>
> >  #include <asm/processor.h>
> > +#include <uapi/asm/kvm_para.h>
> >
> >  extern u32 kvm_cpu_caps[NCAPINTS] __read_mostly;
> >  void kvm_set_cpu_caps(void);
> > @@ -308,4 +309,19 @@ static inline bool page_address_valid(struct kvm_vcpu *vcpu, gpa_t gpa)
> >         return PAGE_ALIGNED(gpa) && !(gpa >> cpuid_maxphyaddr(vcpu));
> >  }
> >
> > +static __always_inline bool guest_pv_has(struct kvm_vcpu *vcpu,
> > +                                        unsigned int kvm_feature)
> > +{
> > +       struct kvm_cpuid_entry2 *cpuid;
> > +
> > +       if (!vcpu->arch.enforce_pv_feature_cpuid)
> > +               return true;
> > +
> > +       cpuid = kvm_find_cpuid_entry(vcpu, KVM_CPUID_FEATURES, 0);
> > +       if (!cpuid)
> > +               return false;
> > +
> > +       return cpuid->eax & (1u << kvm_feature);
> > +}
> > +
> >  #endif
> > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > index 683ce68d96b2..9900a846dfc0 100644
> > --- a/arch/x86/kvm/x86.c
> > +++ b/arch/x86/kvm/x86.c
> > @@ -2763,6 +2763,14 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
> >         if (data & 0x30)
> >                 return 1;
> >
> > +       if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_VMEXIT) &&
> > +           (data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT))
> > +               return 1;
> > +
> > +       if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT) &&
> > +           (data & KVM_ASYNC_PF_DELIVERY_AS_INT))
> > +               return 1;
> > +
> >         if (!lapic_in_kernel(vcpu))
> >                 return 1;
> >
> > @@ -2840,10 +2848,12 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
> >          * Doing a TLB flush here, on the guest's behalf, can avoid
> >          * expensive IPIs.
> >          */
> > -       trace_kvm_pv_tlb_flush(vcpu->vcpu_id,
> > -               st->preempted & KVM_VCPU_FLUSH_TLB);
> > -       if (xchg(&st->preempted, 0) & KVM_VCPU_FLUSH_TLB)
> > -               kvm_vcpu_flush_tlb_guest(vcpu);
> > +       if (guest_pv_has(vcpu, KVM_FEATURE_PV_TLB_FLUSH)) {
>
> Is it expensive to recalculate it every time if vcpu is
> sched_in/sched_out frequently?

Perhaps so, since we wind up iterating the cpuid array every time. We
could instead just save the feature bitmap in kvm_vcpu_arch when the
guest's CPUID is set to avoid the lookup cost.
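Roughly something like the below (completely untested sketch; the field
and helper names are placeholders, not part of this series):

/* in struct kvm_vcpu_arch (arch/x86/include/asm/kvm_host.h) */
	u32 pv_cpuid_features;

/* recompute once whenever the guest's CPUID is set (KVM_SET_CPUID{,2}) */
static void kvm_update_pv_cpuid_features(struct kvm_vcpu *vcpu)
{
	struct kvm_cpuid_entry2 *best;

	best = kvm_find_cpuid_entry(vcpu, KVM_CPUID_FEATURES, 0);
	vcpu->arch.pv_cpuid_features = best ? best->eax : 0;
}

/* guest_pv_has() then degenerates to a bit test */
static __always_inline bool guest_pv_has(struct kvm_vcpu *vcpu,
					 unsigned int kvm_feature)
{
	if (!vcpu->arch.enforce_pv_feature_cpuid)
		return true;

	return vcpu->arch.pv_cpuid_features & (1u << kvm_feature);
}

That would keep the per-call cost to a single bit test, so checking
KVM_FEATURE_PV_TLB_FLUSH in record_steal_time() on every sched_in
should be negligible.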
> >
> > +               trace_kvm_pv_tlb_flush(vcpu->vcpu_id,
> > +                                      st->preempted & KVM_VCPU_FLUSH_TLB);
> > +               if (xchg(&st->preempted, 0) & KVM_VCPU_FLUSH_TLB)
> > +                       kvm_vcpu_flush_tlb_guest(vcpu);
> > +       }
> >
> >         vcpu->arch.st.preempted = 0;
> >
> > @@ -2998,30 +3008,54 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
> >                 vcpu->arch.smi_count = data;
> >                 break;
> >         case MSR_KVM_WALL_CLOCK_NEW:
> > +               if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
> > +                       return 1;
> > +
> > +               kvm_write_wall_clock(vcpu->kvm, data);
> > +               break;
> >         case MSR_KVM_WALL_CLOCK:
> > +               if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
> > +                       return 1;
> > +
> >                 kvm_write_wall_clock(vcpu->kvm, data);
> >                 break;
> >         case MSR_KVM_SYSTEM_TIME_NEW:
> > +               if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
> > +                       return 1;
> > +
> >                 kvm_write_system_time(vcpu, data, false, msr_info->host_initiated);
> >                 break;
> >         case MSR_KVM_SYSTEM_TIME:
> > -               kvm_write_system_time(vcpu, data, true, msr_info->host_initiated);
> > +               if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
> > +                       return 1;
> > +
> > +               kvm_write_system_time(vcpu, data, true, msr_info->host_initiated);
> >                 break;
> >         case MSR_KVM_ASYNC_PF_EN:
> > +               if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF))
> > +                       return 1;
> > +
> >                 if (kvm_pv_enable_async_pf(vcpu, data))
> >                         return 1;
> >                 break;
> >         case MSR_KVM_ASYNC_PF_INT:
> > +               if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
> > +                       return 1;
> > +
> >                 if (kvm_pv_enable_async_pf_int(vcpu, data))
> >                         return 1;
> >                 break;
> >         case MSR_KVM_ASYNC_PF_ACK:
> > +               if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF))
> > +                       return 1;
> >                 if (data & 0x1) {
> >                         vcpu->arch.apf.pageready_pending = false;
> >                         kvm_check_async_pf_completion(vcpu);
> >                 }
> >                 break;
> >         case MSR_KVM_STEAL_TIME:
> > +               if (!guest_pv_has(vcpu, KVM_FEATURE_STEAL_TIME))
> > +                       return 1;
> >
> >                 if (unlikely(!sched_info_on()))
> >                         return 1;
> > @@ -3038,11 +3072,17 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
> >
> >                 break;
> >         case MSR_KVM_PV_EOI_EN:
> > +               if (!guest_pv_has(vcpu, KVM_FEATURE_PV_EOI))
> > +                       return 1;
> > +
> >                 if (kvm_lapic_enable_pv_eoi(vcpu, data, sizeof(u8)))
> >                         return 1;
> >                 break;
> >
> >         case MSR_KVM_POLL_CONTROL:
> > +               if (!guest_pv_has(vcpu, KVM_FEATURE_POLL_CONTROL))
> > +                       return 1;
> > +
> >                 /* only enable bit supported */
> >                 if (data & (-1ULL << 1))
> >                         return 1;
> > @@ -3522,6 +3562,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
> >         case KVM_CAP_EXCEPTION_PAYLOAD:
> >         case KVM_CAP_SET_GUEST_DEBUG:
> >         case KVM_CAP_LAST_CPU:
> > +       case KVM_CAP_ENFORCE_PV_FEATURE_CPUID:
> >                 r = 1;
> >                 break;
> >         case KVM_CAP_SYNC_REGS:
> > @@ -4389,6 +4430,11 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
> >
> >                 return kvm_x86_ops.enable_direct_tlbflush(vcpu);
> >
> > +       case KVM_CAP_ENFORCE_PV_FEATURE_CPUID:
> > +               vcpu->arch.enforce_pv_feature_cpuid = cap->args[0];
> > +
> > +               return 0;
> > +
> >         default:
> >                 return -EINVAL;
> >         }
> > @@ -7723,11 +7769,16 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
> >                 goto out;
> >         }
> >
> > +       ret = -KVM_ENOSYS;
> > +
> >         switch (nr) {
> >         case KVM_HC_VAPIC_POLL_IRQ:
> >                 ret = 0;
> >                 break;
> >         case KVM_HC_KICK_CPU:
> > +               if (!guest_pv_has(vcpu, KVM_FEATURE_PV_UNHALT))
> > +                       break;
> > +
> >                 kvm_pv_kick_cpu_op(vcpu->kvm, a0, a1);
> >                 kvm_sched_yield(vcpu->kvm, a1);
> >                 ret = 0;
> > @@ -7738,9 +7789,15 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
> >                 break;
> >  #endif
> >         case KVM_HC_SEND_IPI:
> > +               if (!guest_pv_has(vcpu, KVM_FEATURE_PV_SEND_IPI))
> > +                       break;
> > +
> >                 ret = kvm_pv_send_ipi(vcpu->kvm, a0, a1, a2, a3, op_64_bit);
> >                 break;
> >         case KVM_HC_SCHED_YIELD:
> > +               if (!guest_pv_has(vcpu, KVM_FEATURE_PV_SCHED_YIELD))
> > +                       break;
> > +
> >                 kvm_sched_yield(vcpu->kvm, a0);
> >                 ret = 0;
> >                 break;
> > diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> > index f6d86033c4fa..48c2d5c10b1e 100644
> > --- a/include/uapi/linux/kvm.h
> > +++ b/include/uapi/linux/kvm.h
> > @@ -1035,6 +1035,7 @@ struct kvm_ppc_resize_hpt {
> >  #define KVM_CAP_LAST_CPU 184
> >  #define KVM_CAP_SMALLER_MAXPHYADDR 185
> >  #define KVM_CAP_S390_DIAG318 186
> > +#define KVM_CAP_ENFORCE_PV_FEATURE_CPUID 187
> >
> >  #ifdef KVM_CAP_IRQ_ROUTING
> >
> > --
> > 2.28.0.236.gb10cc79966-goog
> >
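As an aside, the userspace side of opting in is just the usual per-vCPU
KVM_ENABLE_CAP call; a rough, untested sketch (helper name is made up,
error handling omitted):

#include <linux/kvm.h>
#include <sys/ioctl.h>

/* Ask KVM to enforce KVM_CPUID_FEATURES for this vCPU. */
static int enable_pv_feature_enforcement(int vcpu_fd)
{
	struct kvm_enable_cap cap = {
		.cap = KVM_CAP_ENFORCE_PV_FEATURE_CPUID,
		.args[0] = 1,	/* non-zero enables enforcement */
	};

	return ioctl(vcpu_fd, KVM_ENABLE_CAP, &cap);
}

Leaving args[0] at zero (or never enabling the cap) keeps the old
behavior, so existing VMMs and migration from older hosts are
unaffected.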