Re: [PATCH v2 4/6] KVM: x86/pmu: Add pmc->intr to refactor kvm_perf_overflow{_intr}()

Jim Mattson <jmattson@xxxxxxxxxx> · Wed, 8 Dec 2021 20:25:55 -0800

On Mon, Nov 29, 2021 at 11:42 PM Like Xu <like.xu.linux@xxxxxxxxx> wrote:
>
> From: Like Xu <likexu@xxxxxxxxxxx>
>
> Depending on whether intr should be triggered or not, KVM registers
> two different event overflow callbacks in the perf_event context.
>
> The code skeletons of these two functions are very similar, so
> the intr flag can be stored in pmc->intr by pmc_reprogram_counter()
> and a single overflow callback registered, which provides a smaller
> instruction footprint for the micro-architectural branch predictor.
>
> __kvm_perf_overflow() can also be called in non-NMI contexts, so
> a flag is needed to distinguish the caller's context and thus
> avoid the check on kvm_is_in_guest() there; otherwise we might get
> suspicious-RCU or check_preemption_disabled() warnings.
>
> Suggested-by: Paolo Bonzini <pbonzini@xxxxxxxxxx>
> Signed-off-by: Like Xu <likexu@xxxxxxxxxxx>
> ---
>  arch/x86/include/asm/kvm_host.h |  1 +
>  arch/x86/kvm/pmu.c              | 58 ++++++++++++++++-----------------
>  2 files changed, 29 insertions(+), 30 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index e41ad1ead721..6c2b2331ffeb 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -495,6 +495,7 @@ struct kvm_pmc {
>          */
>         u64 current_config;
>         bool is_paused;
> +       bool intr;
>  };
>
>  struct kvm_pmu {
> diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
> index b7a1ae28ab87..a20207ee4014 100644
> --- a/arch/x86/kvm/pmu.c
> +++ b/arch/x86/kvm/pmu.c
> @@ -55,43 +55,41 @@ static void kvm_pmi_trigger_fn(struct irq_work *irq_work)
>         kvm_pmu_deliver_pmi(vcpu);
>  }
>
> -static void kvm_perf_overflow(struct perf_event *perf_event,
> -                             struct perf_sample_data *data,
> -                             struct pt_regs *regs)
> +static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi)
>  {
> -       struct kvm_pmc *pmc = perf_event->overflow_handler_context;
>         struct kvm_pmu *pmu = pmc_to_pmu(pmc);
>
> -       if (!test_and_set_bit(pmc->idx, pmu->reprogram_pmi)) {
> -               __set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
> -               kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
> -       }
> +       /* Ignore counters that have been reprogrammed already. */
> +       if (test_and_set_bit(pmc->idx, pmu->reprogram_pmi))
> +               return;
> +
> +       __set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
> +       kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
> +
> +       if (!pmc->intr)
> +               return;
> +
> +       /*
> +        * Inject PMI. If vcpu was in a guest mode during NMI, PMI
> +        * can be injected on a guest mode re-entry. Otherwise we can't
> +        * be sure that vcpu wasn't executing hlt instruction at the
> +        * time of vmexit and is not going to re-enter guest mode until
> +        * woken up. So we should wake it, but this is impossible from
> +        * NMI context. Do it from irq work instead.
> +        */
> +       if (in_pmi && !kvm_is_in_guest())
> +               irq_work_queue(&pmc_to_pmu(pmc)->irq_work);
> +       else
> +               kvm_make_request(KVM_REQ_PMI, pmc->vcpu);
>  }
>
> -static void kvm_perf_overflow_intr(struct perf_event *perf_event,
> -                                  struct perf_sample_data *data,
> -                                  struct pt_regs *regs)
> +static void kvm_perf_overflow(struct perf_event *perf_event,
> +                             struct perf_sample_data *data,
> +                             struct pt_regs *regs)
>  {
>         struct kvm_pmc *pmc = perf_event->overflow_handler_context;
> -       struct kvm_pmu *pmu = pmc_to_pmu(pmc);
> -
> -       if (!test_and_set_bit(pmc->idx, pmu->reprogram_pmi)) {
> -               __set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
> -               kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
>
> -               /*
> -                * Inject PMI. If vcpu was in a guest mode during NMI PMI
> -                * can be ejected on a guest mode re-entry. Otherwise we can't
> -                * be sure that vcpu wasn't executing hlt instruction at the
> -                * time of vmexit and is not going to re-enter guest mode until
> -                * woken up. So we should wake it, but this is impossible from
> -                * NMI context. Do it from irq work instead.
> -                */
> -               if (!kvm_is_in_guest())
> -                       irq_work_queue(&pmc_to_pmu(pmc)->irq_work);
> -               else
> -                       kvm_make_request(KVM_REQ_PMI, pmc->vcpu);
> -       }
> +       __kvm_perf_overflow(pmc, true);
>  }
>
>  static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
> @@ -126,7 +124,6 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
>         }
>
>         event = perf_event_create_kernel_counter(&attr, -1, current,
> -                                                intr ? kvm_perf_overflow_intr :
>                                                  kvm_perf_overflow, pmc);

Not your change, but if the event is counting anything based on
cycles, and the guest TSC is scaled to run at a different rate from
the host TSC, doesn't the initial value of the underlying hardware
counter have to be adjusted as well, so that the interrupt arrives
when the guest's counter overflows rather than when the host's counter
overflows?

Reviewed-by: Jim Mattson <jmattson@xxxxxxxxxx>