Re: [PATCH] KVM: x86: KVM_CAP_SYNC_REGS

Reviewed-by: Jim Mattson <jmattson@xxxxxxxxxx>

On Wed, Sep 20, 2017 at 10:42 AM, Ken Hofsass <hofsass@xxxxxxxxxx> wrote:
> This commit implements an enhanced x86 version of S390
> KVM_CAP_SYNC_REGS functionality. KVM_CAP_SYNC_REGS "allow[s]
> userspace to access certain guest registers without having
> to call SET/GET_*REGS". This reduces ioctl overhead, which
> is particularly important when userspace is making synchronous
> guest state modifications (e.g. when emulating and/or intercepting
> instructions).
>
> KVM_CAP_SYNC_REGS was originally implemented upstream for s390; the x86
> version differs as follows:
> - the capability can be enabled/disabled
> - the register sets to be copied out to kvm_run are selectable
> by userspace
> - vcpu_events and selectable MSRs are available in addition to the
> basic register sets (regs, sregs, debug_regs, and fpu).
>
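For illustration only (not part of the patch): a minimal userspace sketch
of enabling the capability on a vcpu fd. It assumes the KVM_SYNC_X86_*
flags added by this patch's <asm/kvm.h> changes; error handling is elided.

    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    /* Ask KVM to copy the GP and system register sets out to kvm_run
     * on every exit, so no KVM_GET_REGS/KVM_GET_SREGS calls are needed. */
    static int enable_sync_regs(int vcpu_fd)
    {
            struct kvm_enable_cap cap;

            memset(&cap, 0, sizeof(cap));
            cap.cap = KVM_CAP_SYNC_REGS;
            cap.args[0] = KVM_SYNC_X86_REGS | KVM_SYNC_X86_SREGS;
            return ioctl(vcpu_fd, KVM_ENABLE_CAP, &cap);
    }
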
> Signed-off-by: Ken Hofsass <hofsass@xxxxxxxxxx>
> ---
>  Documentation/virtual/kvm/api.txt |  49 ++++++++++++++
>  arch/x86/include/uapi/asm/kvm.h   |  32 ++++++++-
>  arch/x86/kvm/x86.c                | 134 +++++++++++++++++++++++++++++++++++++-
>  include/uapi/linux/kvm.h          |   6 +-
>  4 files changed, 216 insertions(+), 5 deletions(-)
>
> diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
> index e63a35fafef0..9536eb310d04 100644
> --- a/Documentation/virtual/kvm/api.txt
> +++ b/Documentation/virtual/kvm/api.txt
> @@ -3951,6 +3951,55 @@ Once this is done the KVM_REG_MIPS_VEC_* and KVM_REG_MIPS_MSA_* registers can be
>  accessed, and the Config5.MSAEn bit is accessible via the KVM API and also from
>  the guest.
>
> +6.74 KVM_CAP_SYNC_REGS
> +Architectures: s390, x86 (vmx-only)
> +Target: s390: always enabled, x86: vcpu
> +Parameters: s390: none, x86: args[0] takes bitmask selecting which register
> +sets to copy out (bitfields defined in arch/x86/include/uapi/asm/kvm.h).
> +
> +As described above in the kvm_sync_regs struct info in section 5 (kvm_run):
> +KVM_CAP_SYNC_REGS "allow[s] userspace to access certain guest registers
> +without having to call SET/GET_*REGS". This reduces overhead by eliminating
> +repeated ioctl calls for setting and/or getting register values. This is
> +particularly important when userspace is making synchronous guest state
> +modifications, e.g. when emulating and/or intercepting instructions in
> +userspace.
> +
> +For s390 specifics, please see the source code.
> +
> +For x86:
> +- the capability can be enabled/disabled (on s390 it is always enabled).
> +- the register sets to be copied out to kvm_run are selectable
> +  by userspace (rather than all sets being copied out for every exit).
> +- vcpu_events and selectable MSRs are available in addition to the
> +  basic register sets (regs, sregs, debug_regs, and fpu).
> +
> +In addition to the register sets, the x86 'kvm_sync_regs' struct definition
> +includes the bitarray, 'sync_regs', that toggles whether specific register
> +sets are to be copied out. When enabling SYNC_REGS via KVM_ENABLE_CAP, the
> +value in args[0] is copied into the 'sync_regs' bitarray value by KVM. But
> +like the register sets, the copy-out field can be modified while handling
> +an exit so that additional KVM_ENABLE_CAP ioctl calls can be avoided.
> +
> +To indicate that userspace has modified values that should be copied into the
> +vCPU, the architecture-independent bitarray 'kvm_dirty_regs' must be set.
> +This is done using the same bitflags as for the 'sync_regs' copy-out field.
> +If the dirty bit is not set, then the register values will not be copied
> +into the vCPU even if they've been modified.
> +
> +Unused bitfields in the bitarrays must be set to zero.
> +
> +struct kvm_sync_regs {
> +        __u64 sync_regs;
> +        struct kvm_regs regs;
> +        struct kvm_sregs sregs;
> +        struct kvm_debugregs debugregs;
> +        struct kvm_fpu fpu;
> +        struct kvm_vcpu_events events;
> +        struct kvm_msrs msrs;
> +        struct kvm_msr_entry msr_entries[SYNC_REGS_NUM_MSRS];
> +};
> +
>  7. Capabilities that can be enabled on VMs
>  ------------------------------------------
>
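To make the documented copy-out/dirty-bit flow concrete, here is a
hypothetical exit handler continuing the sketch above ('insn_len' and the
mmap()ed 'run' pointer are assumptions, not taken from the patch):

    /* Skip an instruction that was emulated in userspace, then re-enter
     * the guest. RIP is edited directly in kvm_run and only the GP set
     * is marked dirty; KVM loads the dirty sets before guest entry. */
    static void skip_insn_and_resume(int vcpu_fd, struct kvm_run *run,
                                     unsigned int insn_len)
    {
            if (run->kvm_valid_regs & KVM_SYNC_X86_REGS) {
                    run->s.regs.regs.rip += insn_len;
                    run->kvm_dirty_regs |= KVM_SYNC_X86_REGS;
            }
            ioctl(vcpu_fd, KVM_RUN, 0);
    }
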
> diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
> index c2824d02ba37..cdee17a4b0ee 100644
> --- a/arch/x86/include/uapi/asm/kvm.h
> +++ b/arch/x86/include/uapi/asm/kvm.h
> @@ -353,8 +353,38 @@ struct kvm_xcrs {
>         __u64 padding[16];
>  };
>
> -/* definition of registers in kvm_run */
> +#define KVM_SYNC_X86_REGS      (1UL << 0)
> +#define KVM_SYNC_X86_SREGS     (1UL << 1)
> +#define KVM_SYNC_X86_DEBUGREGS (1UL << 2)
> +#define KVM_SYNC_X86_FPU       (1UL << 3)
> +#define KVM_SYNC_X86_EVENTS    (1UL << 4)
> +#define KVM_SYNC_X86_MSRS      (1UL << 5)
> +#define KVM_SYNC_X86_NUM_FIELDS                6
> +
> +#define KVM_SYNC_X86_VALID_BITS \
> +       (KVM_SYNC_X86_REGS| \
> +        KVM_SYNC_X86_SREGS| \
> +        KVM_SYNC_X86_DEBUGREGS| \
> +        KVM_SYNC_X86_FPU| \
> +        KVM_SYNC_X86_EVENTS| \
> +        KVM_SYNC_X86_MSRS)
> +
> +#define SYNC_REGS_NUM_MSRS 8
> +/* kvm_sync_regs struct included by kvm_run struct */
>  struct kvm_sync_regs {
> +       /* Members of this structure are potentially malicious.
> +        * Care must be taken by code reading, esp. interpreting,
> +        * data fields from them inside KVM to prevent TOCTOU and
> +        * double-fetch types of vulnerabilities.
> +        */
> +       __u64 sync_regs;
> +       struct kvm_regs regs;
> +       struct kvm_sregs sregs;
> +       struct kvm_debugregs debugregs;
> +       struct kvm_fpu fpu;
> +       struct kvm_vcpu_events events;
> +       struct kvm_msrs msrs;
> +       struct kvm_msr_entry msr_entries[SYNC_REGS_NUM_MSRS];
>  };
>
>  #define KVM_X86_QUIRK_LINT0_REENABLED  (1 << 0)
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index cd17b7d9a107..ad3f84cbf7f3 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -100,6 +100,8 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu);
>  static void process_nmi(struct kvm_vcpu *vcpu);
>  static void enter_smm(struct kvm_vcpu *vcpu);
>  static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
> +static void sync_regs_store_to_kvmrun(struct kvm_vcpu *vcpu);
> +static int sync_regs_load_from_kvmrun(struct kvm_vcpu *vcpu);
>
>  struct kvm_x86_ops *kvm_x86_ops __read_mostly;
>  EXPORT_SYMBOL_GPL(kvm_x86_ops);
> @@ -2587,15 +2589,22 @@ EXPORT_SYMBOL_GPL(kvm_get_msr_common);
>   *
>   * @return number of msrs set successfully.
>   */
> -static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
> +static int __msr_io(struct kvm_vcpu *vcpu, u32 nmsrs, struct kvm_msrs *msrs,
>                     struct kvm_msr_entry *entries,
>                     int (*do_msr)(struct kvm_vcpu *vcpu,
>                                   unsigned index, u64 *data))
>  {
>         int i, idx;
>
> +       /* For SYNC_REGS calls into __msr_io, it is possible that userspace
> +        * might have modified the count since it was snapshotted. That could
> +        * be accidental, but might be an intentional TOCTOU attempt.
> +        */
> +       if (nmsrs != msrs->nmsrs)
> +               return -EINVAL;
> +
>         idx = srcu_read_lock(&vcpu->kvm->srcu);
> -       for (i = 0; i < msrs->nmsrs; ++i)
> +       for (i = 0; i < nmsrs; ++i)
>                 if (do_msr(vcpu, entries[i].index, &entries[i].data))
>                         break;
>         srcu_read_unlock(&vcpu->kvm->srcu, idx);
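
The nmsrs check above is the usual anti-double-fetch guard. Distilled to
its essence (hypothetical pseudo-C, not code from the patch):

    u32 snap = shared->nmsrs;          /* fetch the shared count once   */
    if (snap > SYNC_REGS_NUM_MSRS ||   /* bound it ...                  */
        snap != shared->nmsrs)         /* ... and reject a racing edit  */
            return -EINVAL;
    /* iterate over exactly 'snap' entries; never re-read shared->nmsrs */
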
> @@ -2633,7 +2642,7 @@ static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
>                 goto out;
>         }
>
> -       r = n = __msr_io(vcpu, &msrs, entries, do_msr);
> +       r = n = __msr_io(vcpu, msrs.nmsrs, &msrs, entries, do_msr);
>         if (r < 0)
>                 goto out_free;
>
> @@ -2665,6 +2674,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
>         case KVM_CAP_NOP_IO_DELAY:
>         case KVM_CAP_MP_STATE:
>         case KVM_CAP_SYNC_MMU:
> +       case KVM_CAP_SYNC_REGS:
>         case KVM_CAP_USER_NMI:
>         case KVM_CAP_REINJECT_CONTROL:
>         case KVM_CAP_IRQ_INJECT_STATUS:
> @@ -3435,6 +3445,11 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
>                         return -EINVAL;
>                 return kvm_hv_activate_synic(vcpu, cap->cap ==
>                                              KVM_CAP_HYPERV_SYNIC2);
> +       case KVM_CAP_SYNC_REGS:
> +               if (cap->args[0] & ~KVM_SYNC_X86_VALID_BITS)
> +                       return -EINVAL;
> +               vcpu->run->s.regs.sync_regs = cap->args[0];
> +               return 0;
>         default:
>                 return -EINVAL;
>         }
> @@ -7247,6 +7262,12 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
>                 goto out;
>         }
>
> +       if (vcpu->run->kvm_dirty_regs) {
> +               r = sync_regs_load_from_kvmrun(vcpu);
> +               if (r != 0)
> +                       goto out;
> +       }
> +
>         /* re-sync apic's tpr */
>         if (!lapic_in_kernel(vcpu)) {
>                 if (kvm_set_cr8(vcpu, kvm_run->cr8) != 0) {
> @@ -7270,6 +7291,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
>                 r = vcpu_run(vcpu);
>
>  out:
> +       if (vcpu->run->s.regs.sync_regs)
> +               sync_regs_store_to_kvmrun(vcpu);
>         post_kvm_run_save(vcpu);
>         if (vcpu->sigset_active)
>                 sigprocmask(SIG_SETMASK, &sigsaved, NULL);
> @@ -7648,6 +7671,111 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
>         return 0;
>  }
>
> +static void sync_regs_store_to_kvmrun(struct kvm_vcpu *vcpu)
> +{
> +       BUILD_BUG_ON(sizeof(struct kvm_sync_regs) > SYNC_REGS_UNION_SIZE_BYTES);
> +       vcpu->run->kvm_valid_regs = 0;
> +
> +       if (vcpu->run->s.regs.sync_regs & KVM_SYNC_X86_REGS) {
> +               kvm_arch_vcpu_ioctl_get_regs(vcpu, &vcpu->run->s.regs.regs);
> +               vcpu->run->kvm_valid_regs |= KVM_SYNC_X86_REGS;
> +       }
> +       if (vcpu->run->s.regs.sync_regs & KVM_SYNC_X86_SREGS) {
> +               kvm_arch_vcpu_ioctl_get_sregs(vcpu, &vcpu->run->s.regs.sregs);
> +               vcpu->run->kvm_valid_regs |= KVM_SYNC_X86_SREGS;
> +       }
> +       if (vcpu->run->s.regs.sync_regs & KVM_SYNC_X86_DEBUGREGS) {
> +               kvm_vcpu_ioctl_x86_get_debugregs(
> +                               vcpu, &vcpu->run->s.regs.debugregs);
> +               vcpu->run->kvm_valid_regs |= KVM_SYNC_X86_DEBUGREGS;
> +       }
> +       if (vcpu->run->s.regs.sync_regs & KVM_SYNC_X86_FPU) {
> +               kvm_arch_vcpu_ioctl_get_fpu(vcpu, &vcpu->run->s.regs.fpu);
> +               vcpu->run->kvm_valid_regs |= KVM_SYNC_X86_FPU;
> +       }
> +       if (vcpu->run->s.regs.sync_regs & KVM_SYNC_X86_EVENTS) {
> +               kvm_vcpu_ioctl_x86_get_vcpu_events(
> +                               vcpu, &vcpu->run->s.regs.events);
> +               vcpu->run->kvm_valid_regs |= KVM_SYNC_X86_EVENTS;
> +       }
> +       if (vcpu->run->s.regs.sync_regs & KVM_SYNC_X86_MSRS) {
> +               u32 nmsrs = vcpu->run->s.regs.msrs.nmsrs;
> +
> +               if (nmsrs > SYNC_REGS_NUM_MSRS)
> +                       nmsrs = vcpu->run->s.regs.msrs.nmsrs
> +                                       = SYNC_REGS_NUM_MSRS;
> +               __msr_io(vcpu, nmsrs, &vcpu->run->s.regs.msrs,
> +                        vcpu->run->s.regs.msr_entries, do_get_msr);
> +               vcpu->run->kvm_valid_regs |= KVM_SYNC_X86_MSRS;
> +       }
> +}
> +
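
As an aside, to have selected MSRs copied out on every exit, userspace
would pre-populate the shared MSR block once, e.g. (illustrative only;
MSR_IA32_TSC is assumed to be defined by the caller as 0x10):

    run->s.regs.msrs.nmsrs = 1;
    run->s.regs.msr_entries[0].index = MSR_IA32_TSC;
    run->s.regs.sync_regs |= KVM_SYNC_X86_MSRS;  /* start copying out */
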
> +static int sync_regs_load_from_kvmrun(struct kvm_vcpu *vcpu)
> +{
> +       int r = -EINVAL;
> +
> +       if (unlikely(vcpu->run->kvm_dirty_regs == 0))
> +               goto out;
> +
> +       if (vcpu->run->kvm_dirty_regs & ~KVM_SYNC_X86_VALID_BITS)
> +               goto out;
> +
> +       if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_REGS) {
> +               if (kvm_arch_vcpu_ioctl_set_regs(
> +                               vcpu, &vcpu->run->s.regs.regs))
> +                       goto out;
> +               vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_REGS;
> +       }
> +       if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_SREGS) {
> +               if (kvm_arch_vcpu_ioctl_set_sregs(
> +                               vcpu, &vcpu->run->s.regs.sregs))
> +                       goto out;
> +               vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_SREGS;
> +       }
> +       if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_DEBUGREGS) {
> +               if (kvm_vcpu_ioctl_x86_set_debugregs(
> +                               vcpu, &vcpu->run->s.regs.debugregs))
> +                       goto out;
> +               vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_DEBUGREGS;
> +       }
> +       if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_FPU) {
> +               if (kvm_arch_vcpu_ioctl_set_fpu(
> +                               vcpu, &vcpu->run->s.regs.fpu))
> +                       goto out;
> +               vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_FPU;
> +       }
> +       if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_EVENTS) {
> +               if (kvm_vcpu_ioctl_x86_set_vcpu_events(
> +                               vcpu, &vcpu->run->s.regs.events))
> +                       goto out;
> +               vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_EVENTS;
> +       }
> +       if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_MSRS) {
> +               u32 nmsrs_in = vcpu->run->s.regs.msrs.nmsrs;
> +               u32 nmsrs_out = 0;
> +
> +               if (!nmsrs_in)
> +                       goto out;
> +               else if (nmsrs_in > SYNC_REGS_NUM_MSRS) {
> +                       r = -E2BIG;
> +                       goto out;
> +               }
> +               nmsrs_out = __msr_io(vcpu, nmsrs_in, &vcpu->run->s.regs.msrs,
> +                                    vcpu->run->s.regs.msr_entries, do_set_msr);
> +               if (!nmsrs_out) {
> +                       r = -EINVAL;
> +                       goto out;
> +               } else if (nmsrs_in != nmsrs_out) {
> +                       r = nmsrs_out;
> +                       goto out;
> +               }
> +               vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_MSRS;
> +       }
> +       r = 0;
> +out:
> +       return r;
> +}
> +
>  static void fx_init(struct kvm_vcpu *vcpu)
>  {
>         fpstate_init(&vcpu->arch.guest_fpu.state);
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index 838887587411..90a6b24df901 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -395,6 +395,10 @@ struct kvm_run {
>                 char padding[256];
>         };
>
> +       /* 2048 is the size of the char array IBM used to bound/pad the size
> +        * of the union that holds sync regs.
> +        */
> +#define SYNC_REGS_UNION_SIZE_BYTES 2048
>         /*
>          * shared registers between kvm and userspace.
>          * kvm_valid_regs specifies the register classes set by the host
> @@ -406,7 +410,7 @@ struct kvm_run {
>         __u64 kvm_dirty_regs;
>         union {
>                 struct kvm_sync_regs regs;
> -               char padding[2048];
> +               char padding[SYNC_REGS_UNION_SIZE_BYTES];
>         } s;
>  };
>
> --
> 2.14.1.690.gbb1197296e-goog
>



