> -----Original Message-----
> From: David Woodhouse <dwmw2@xxxxxxxxxxxxx>
> Sent: 19 November 2022 09:47
> To: Paolo Bonzini <pbonzini@xxxxxxxxxx>; Sean Christopherson
> <seanjc@xxxxxxxxxx>
> Cc: kvm@xxxxxxxxxxxxxxx; mhal@xxxxxxx
> Subject: [EXTERNAL] [PATCH 2/4] KVM: x86/xen: Compatibility fixes for
> shared runstate area
>
> From: David Woodhouse <dwmw@xxxxxxxxxxxx>
>
> The guest runstate area can be arbitrarily byte-aligned. In fact, even
> when a sane 32-bit guest aligns the overall structure nicely, the 64-bit
> fields in the structure end up being unaligned due to the fact that the
> 32-bit ABI only aligns them to 32 bits.
>
> So setting the ->state_entry_time field to something|XEN_RUNSTATE_UPDATE
> is buggy, because if it's unaligned then we can't update the whole field
> atomically; the low bytes might be observable before the _UPDATE bit is.
> Xen actually updates the *byte* containing that top bit, on its own. KVM
> should do the same.
>
> In addition, we cannot assume that the runstate area fits within a single
> page. One option might be to make the gfn_to_pfn cache cope with regions
> that cross a page -- but getting a contiguous virtual kernel mapping of a
> discontiguous set of IOMEM pages is a distinctly non-trivial exercise, and
> it seems this is the *only* current use case for the GPC which would
> benefit from it.
>
> An earlier version of the runstate code did use a gfn_to_hva cache for
> this purpose, but it still had the single-page restriction because it used
> the uhva directly -- because it needs to be able to do so atomically when
> the vCPU is being scheduled out, so it used pagefault_disable() around the
> accesses and didn't just use kvm_write_guest_cached() which has a fallback
> path.
>
> So... use a pair of GPCs for the first and potential second page covering
> the runstate area. We can get away with locking both at once because
> nothing else takes more than one GPC lock at a time so we can invent a
> trivial ordering rule.
>
> Keep the trivial fast path for the common case where it's all in the same
> page, but fixed to use a byte access for the XEN_RUNSTATE_UPDATE bit. And
> in the cross-page case, build the structure locally on the stack and then
> copy it over with two memcpy() calls, again handling the
> XEN_RUNSTATE_UPDATE bit through a single byte access.
>
> Finally, Xen also does write the runstate area immediately when it's
> configured. Flip the kvm_xen_update_runstate() and …_guest() functions and
> call the latter directly when the runstate area is set. This means that
> other ioctls which modify the runstate also write it immediately to the
> guest when they do so, which is also intended.
>
> Update the xen_shinfo_test to exercise the pathological case where the
> XEN_RUNSTATE_UPDATE flag in the top byte of the state_entry_time is
> actually in a different page to the rest of the 64-bit word.
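
The single-byte handling of the _UPDATE flag makes sense. Just to confirm
I've understood it: on x86 (little-endian) the flag is simply the top bit
of the last byte of state_entry_time, so it can be raised before the copy
and dropped after it with plain byte stores, along the lines of the sketch
below (purely illustrative, not the patch code, single-page case only, and
the names are invented):

/*
 * Publish a new runstate with the XEN_RUNSTATE_UPDATE flag handled via
 * single-byte stores, so a torn write of an unaligned 64-bit
 * state_entry_time can never be observed by the guest. 'rs_ptr',
 * 'times_ofs' and 'new_rs' are invented names for the kernel mapping of
 * the guest area, the offset of state_entry_time within it, and the new
 * contents (assumed to have XEN_RUNSTATE_UPDATE already set, so the copy
 * itself never clears the flag early).
 */
static void publish_runstate(u8 *rs_ptr, size_t times_ofs,
			     const u8 *new_rs, size_t user_len)
{
	/* Little-endian: bit 63 of state_entry_time lives in its last byte */
	u8 *update_bit = rs_ptr + times_ofs + sizeof(u64) - 1;
	u8 flag = XEN_RUNSTATE_UPDATE >> 56;

	/* Tell the guest an update is in progress... */
	*update_bit |= flag;
	smp_wmb();

	/* ...copy the new contents... */
	memcpy(rs_ptr, new_rs, user_len);
	smp_wmb();

	/* ...then clear the flag again with another single-byte store. */
	*update_bit &= ~flag;
}
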
>
> Signed-off-by: David Woodhouse <dwmw@xxxxxxxxxxxx>
> ---
>  arch/x86/include/asm/kvm_host.h               |   1 +
>  arch/x86/kvm/xen.c                            | 367 +++++++++++++-----
>  arch/x86/kvm/xen.h                            |   6 +-
>  .../selftests/kvm/x86_64/xen_shinfo_test.c    |  12 +-
>  4 files changed, 272 insertions(+), 114 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index d1013c4f673c..70af7240a1d5 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -686,6 +686,7 @@ struct kvm_vcpu_xen {
>  	struct gfn_to_pfn_cache vcpu_info_cache;
>  	struct gfn_to_pfn_cache vcpu_time_info_cache;
>  	struct gfn_to_pfn_cache runstate_cache;
> +	struct gfn_to_pfn_cache runstate2_cache;
>  	u64 last_steal;
>  	u64 runstate_entry_time;
>  	u64 runstate_times[4];
> diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
> index 4b8e9628fbf5..8aa953b1f0e0 100644
> --- a/arch/x86/kvm/xen.c
> +++ b/arch/x86/kvm/xen.c
> @@ -170,148 +170,269 @@ static void kvm_xen_init_timer(struct kvm_vcpu *vcpu)
>  	vcpu->arch.xen.timer.function = xen_timer_callback;
>  }
>
> -static void kvm_xen_update_runstate(struct kvm_vcpu *v, int state)
> +static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic)
>  {
>  	struct kvm_vcpu_xen *vx = &v->arch.xen;
> -	u64 now = get_kvmclock_ns(v->kvm);
> -	u64 delta_ns = now - vx->runstate_entry_time;
> -	u64 run_delay = current->sched_info.run_delay;
> +	struct gfn_to_pfn_cache *gpc1 = &vx->runstate_cache;
> +	struct gfn_to_pfn_cache *gpc2 = &vx->runstate2_cache;
> +	size_t user_len, user_len1, user_len2;

I think it might make the code marginally neater to use two-element
arrays... but only marginally. (I'll sketch what I mean at the end of
this mail.)

> +	struct vcpu_runstate_info rs;
> +	int *rs_state = &rs.state;
> +	unsigned long flags;
> +	size_t times_ofs;
> +	u8 *update_bit;
>
> -	if (unlikely(!vx->runstate_entry_time))
> -		vx->current_runstate = RUNSTATE_offline;
> +	/*
> +	 * The only difference between 32-bit and 64-bit versions of the
> +	 * runstate struct is the alignment of uint64_t in 32-bit, which
> +	 * means that the 64-bit version has an additional 4 bytes of
> +	 * padding after the first field 'state'.
> +	 */
> +	BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state) != 0);
> +	BUILD_BUG_ON(offsetof(struct compat_vcpu_runstate_info, state) != 0);
> +	BUILD_BUG_ON(sizeof(struct compat_vcpu_runstate_info) != 0x2c);
> +#ifdef CONFIG_X86_64
> +	BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state_entry_time) !=
> +		     offsetof(struct compat_vcpu_runstate_info, state_entry_time) + 4);
> +	BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, time) !=
> +		     offsetof(struct compat_vcpu_runstate_info, time) + 4);
> +#endif
> +
> +	if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode) {
> +		user_len = sizeof(struct vcpu_runstate_info);
> +		times_ofs = offsetof(struct vcpu_runstate_info,
> +				     state_entry_time);
> +	} else {
> +		user_len = sizeof(struct compat_vcpu_runstate_info);
> +		times_ofs = offsetof(struct compat_vcpu_runstate_info,
> +				     state_entry_time);
> +		rs_state++;
> +	}
>
>  	/*
> -	 * Time waiting for the scheduler isn't "stolen" if the
> -	 * vCPU wasn't running anyway.
> +	 * There are basically no alignment constraints. The guest can set it
> +	 * up so it crosses from one page to the next, and at arbitrary byte
> +	 * alignment (and the 32-bit ABI doesn't align the 64-bit integers
> +	 * anyway, even if the overall struct had been 64-bit aligned).
>  	 */
> -	if (vx->current_runstate == RUNSTATE_running) {
> -		u64 steal_ns = run_delay - vx->last_steal;
> +	if ((gpc1->gpa & ~PAGE_MASK) + user_len >= PAGE_SIZE) {
> +		user_len1 = PAGE_SIZE - (gpc1->gpa & ~PAGE_MASK);
> +		user_len2 = user_len - user_len1;
> +	} else {
> +		user_len1 = user_len;
> +		user_len2 = 0;
> +	}
> +	BUG_ON(user_len1 + user_len2 != user_len);
>
> -		delta_ns -= steal_ns;
> + retry:
> +	/*
> +	 * Attempt to obtain the GPC lock on *both* (if there are two)
> +	 * gfn_to_pfn caches that cover the region.
> +	 */
> +	read_lock_irqsave(&gpc1->lock, flags);
> +	while (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc1, gpc1->gpa, user_len1)) {
> +		read_unlock_irqrestore(&gpc1->lock, flags);
>
> -		vx->runstate_times[RUNSTATE_runnable] += steal_ns;
> +		/* When invoked from kvm_sched_out() we cannot sleep */
> +		if (atomic)
> +			return;
> +
> +		if (kvm_gfn_to_pfn_cache_refresh(v->kvm, gpc1, gpc1->gpa, user_len1))
> +			return;
> +
> +		read_lock_irqsave(&gpc1->lock, flags);

This is a repeated pattern now, so how about a helper function?

  Paul
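
P.S. For concreteness, something like this (completely untested, name
invented) is roughly the kind of helper I had in mind for the
lock/check/refresh pattern; it returns with the GPC read lock held on
success and dropped on failure, so both the single-page and cross-page
paths could just bail out on error (dropping gpc1's lock first in the
gpc2 case):

/*
 * Untested sketch: take the GPC read lock and make sure the cache still
 * covers gpa/len, refreshing it if we are allowed to sleep.
 */
static int xen_runstate_gpc_lock(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
				 unsigned long len, bool atomic,
				 unsigned long *flags)
{
	read_lock_irqsave(&gpc->lock, *flags);
	while (!kvm_gfn_to_pfn_cache_check(kvm, gpc, gpc->gpa, len)) {
		read_unlock_irqrestore(&gpc->lock, *flags);

		/* When invoked from kvm_sched_out() we cannot sleep */
		if (atomic)
			return -EWOULDBLOCK;

		if (kvm_gfn_to_pfn_cache_refresh(kvm, gpc, gpc->gpa, len))
			return -EFAULT;

		read_lock_irqsave(&gpc->lock, *flags);
	}

	return 0;
}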
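
Also, as promised above, this is all I meant by the two-element arrays;
it's purely a declaration-level change, something like (again untested):

	/* Index 0/1 for the first and (optional) second page of the area */
	struct gfn_to_pfn_cache *gpc[2] = {
		&vx->runstate_cache,
		&vx->runstate2_cache,
	};
	size_t user_len[2];	/* user_len[1] == 0 if it all fits in one page */

so that the lock/check/refresh and the eventual copy could loop over an
index rather than being open-coded for gpc1 and gpc2. As I said, only
marginally neater.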