Re: [PATCH v2 1/2] KVM: arm64: Add PMU event filtering infrastructure

Auger Eric <eric.auger@xxxxxxxxxx> · Tue, 10 Mar 2020 18:40:31 +0100

Hi Marc,

On 3/10/20 12:03 PM, Marc Zyngier wrote:
> Hi Eric,
> 
> On 2020-03-09 18:05, Auger Eric wrote:
>> Hi Marc,
>>
>> On 3/9/20 1:48 PM, Marc Zyngier wrote:
>>> It can be desirable to expose a PMU to a guest, and yet not want the
>>> guest to be able to count some of the implemented events (because this
>>> would give information on shared resources, for example.
>>>
>>> For this, let's extend the PMUv3 device API, and offer a way to setup a
>>> bitmap of the allowed events (the default being no bitmap, and thus no
>>> filtering).
>>>
>>> Userspace can thus allow/deny ranges of event. The default policy
>>> depends on the "polarity" of the first filter setup (default deny if the
>>> filter allows events, and default allow if the filter denies events).
>>> This allows to setup exactly what is allowed for a given guest.
>>>
>>> Note that although the ioctl is per-vcpu, the map of allowed events is
>>> global to the VM (it can be setup from any vcpu until the vcpu PMU is
>>> initialized).
>>>
>>> Signed-off-by: Marc Zyngier <maz@xxxxxxxxxx>
>>> ---
>>>  arch/arm64/include/asm/kvm_host.h |  6 +++
>>>  arch/arm64/include/uapi/asm/kvm.h | 16 ++++++
>>>  virt/kvm/arm/arm.c                |  2 +
>>>  virt/kvm/arm/pmu.c                | 84 +++++++++++++++++++++++++------
>>>  4 files changed, 92 insertions(+), 16 deletions(-)
>>>
>>> diff --git a/arch/arm64/include/asm/kvm_host.h
>>> b/arch/arm64/include/asm/kvm_host.h
>>> index 57fd46acd058..8e63c618688d 100644
>>> --- a/arch/arm64/include/asm/kvm_host.h
>>> +++ b/arch/arm64/include/asm/kvm_host.h
>>> @@ -91,6 +91,12 @@ struct kvm_arch {
>>>       * supported.
>>>       */
>>>      bool return_nisv_io_abort_to_user;
>>> +
>>> +    /*
>>> +     * VM-wide PMU filter, implemented as a bitmap and big enough
>>> +     * for up to 65536 events
>>> +     */
>>> +    unsigned long *pmu_filter;
>>>  };
>>>
>>>  #define KVM_NR_MEM_OBJS     40
>>> diff --git a/arch/arm64/include/uapi/asm/kvm.h
>>> b/arch/arm64/include/uapi/asm/kvm.h
>>> index ba85bb23f060..7b1511d6ce44 100644
>>> --- a/arch/arm64/include/uapi/asm/kvm.h
>>> +++ b/arch/arm64/include/uapi/asm/kvm.h
>>> @@ -159,6 +159,21 @@ struct kvm_sync_regs {
>>>  struct kvm_arch_memory_slot {
>>>  };
>>>
>>> +/*
>>> + * PMU filter structure. Describe a range of events with a particular
>>> + * action. To be used with KVM_ARM_VCPU_PMU_V3_FILTER.
>>> + */
>>> +struct kvm_pmu_event_filter {
>>> +    __u16    base_event;
>>> +    __u16    nevents;
>>> +
>>> +#define KVM_PMU_EVENT_ALLOW    0
>>> +#define KVM_PMU_EVENT_DENY    1
>>> +
>>> +    __u8    action;
>>> +    __u8    pad[3];
>>> +};
>>> +
>>>  /* for KVM_GET/SET_VCPU_EVENTS */
>>>  struct kvm_vcpu_events {
>>>      struct {
>>> @@ -329,6 +344,7 @@ struct kvm_vcpu_events {
>>>  #define KVM_ARM_VCPU_PMU_V3_CTRL    0
>>>  #define   KVM_ARM_VCPU_PMU_V3_IRQ    0
>>>  #define   KVM_ARM_VCPU_PMU_V3_INIT    1
>>> +#define   KVM_ARM_VCPU_PMU_V3_FILTER    2
>>>  #define KVM_ARM_VCPU_TIMER_CTRL        1
>>>  #define   KVM_ARM_VCPU_TIMER_IRQ_VTIMER        0
>>>  #define   KVM_ARM_VCPU_TIMER_IRQ_PTIMER        1
>>> diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c
>>> index eda7b624eab8..8d849ac88a44 100644
>>> --- a/virt/kvm/arm/arm.c
>>> +++ b/virt/kvm/arm/arm.c
>>> @@ -164,6 +164,8 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
>>>      free_percpu(kvm->arch.last_vcpu_ran);
>>>      kvm->arch.last_vcpu_ran = NULL;
>>>
>>> +    bitmap_free(kvm->arch.pmu_filter);
>>> +
>>>      for (i = 0; i < KVM_MAX_VCPUS; ++i) {
>>>          if (kvm->vcpus[i]) {
>>>              kvm_vcpu_destroy(kvm->vcpus[i]);
>>> diff --git a/virt/kvm/arm/pmu.c b/virt/kvm/arm/pmu.c
>>> index f0d0312c0a55..9f0fd0224d5b 100644
>>> --- a/virt/kvm/arm/pmu.c
>>> +++ b/virt/kvm/arm/pmu.c
>>> @@ -579,10 +579,19 @@ static void kvm_pmu_create_perf_event(struct
>>> kvm_vcpu *vcpu, u64 select_idx)
>>>
>>>      kvm_pmu_stop_counter(vcpu, pmc);
>>>      eventsel = data & ARMV8_PMU_EVTYPE_EVENT;
>>> +    if (pmc->idx == ARMV8_PMU_CYCLE_IDX)
>>> +        eventsel = ARMV8_PMUV3_PERFCTR_CPU_CYCLES;
>> nit:
>>     if (pmc->idx == ARMV8_PMU_CYCLE_IDX)
>>         eventsel = ARMV8_PMUV3_PERFCTR_CPU_CYCLES;
>>     else
>>         eventsel = data & ARMV8_PMU_EVTYPE_EVENT;
> 
> You don't like it? ;-)
? eventset set only once instead of 2 times
> 
>>>
>>>      /* Software increment event does't need to be backed by a perf
>>> event */
>> nit: while wer are at it fix the does't typo
>>> -    if (eventsel == ARMV8_PMUV3_PERFCTR_SW_INCR &&
>>> -        pmc->idx != ARMV8_PMU_CYCLE_IDX)
>>> +    if (eventsel == ARMV8_PMUV3_PERFCTR_SW_INCR)
>>> +        return;
>>> +
>>> +    /*
>>> +     * If we have a filter in place and that the event isn't
>>> allowed, do
>>> +     * not install a perf event either.
>>> +     */
>>> +    if (vcpu->kvm->arch.pmu_filter &&
>>> +        !test_bit(eventsel, vcpu->kvm->arch.pmu_filter))
>>>          return;
>>>
>>>      memset(&attr, 0, sizeof(struct perf_event_attr));
>>> @@ -594,8 +603,7 @@ static void kvm_pmu_create_perf_event(struct
>>> kvm_vcpu *vcpu, u64 select_idx)
>>>      attr.exclude_kernel = data & ARMV8_PMU_EXCLUDE_EL1 ? 1 : 0;
>>>      attr.exclude_hv = 1; /* Don't count EL2 events */
>>>      attr.exclude_host = 1; /* Don't count host events */
>>> -    attr.config = (pmc->idx == ARMV8_PMU_CYCLE_IDX) ?
>>> -        ARMV8_PMUV3_PERFCTR_CPU_CYCLES : eventsel;
>>> +    attr.config = eventsel;
>> So in that case the guest counter will not increment but the guest does
>> not know the counter is not implemented. Can't this lead to bad user
>> experience. Shouldn't this disablement be reflected in PMCEID0/1 regs?
> 
> The whole point is that we want to keep things hidden from the guest.
> Also, PMCEID{0,1} only describe a small set of events (the architected
> common events), and not the whole range of microarchitectural events
> that the CPU implements.

I am still not totally convinced. Things are not totally hidden to the
guest as the counter does not increment, right? So a guest may try to
use as it is advertised in PMCEID0/1 but not get the expected results
leading to potential support request. I agree not all the events are
described there but your API also allows to filter out some of the ones
that are advertised.
> 
>>>
>>>      counter = kvm_pmu_get_pair_counter_value(vcpu, pmc);
>>>
>>> @@ -735,15 +743,6 @@ int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu)
>>>
>>>  static int kvm_arm_pmu_v3_init(struct kvm_vcpu *vcpu)
>>>  {
>>> -    if (!kvm_arm_support_pmu_v3())
>>> -        return -ENODEV;
>>> -
>>> -    if (!test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features))
>>> -        return -ENXIO;
>>> -
>>> -    if (vcpu->arch.pmu.created)
>>> -        return -EBUSY;
>>> -
>>>      if (irqchip_in_kernel(vcpu->kvm)) {
>>>          int ret;
>>>
>>> @@ -794,8 +793,19 @@ static bool pmu_irq_is_valid(struct kvm *kvm,
>>> int irq)
>>>      return true;
>>>  }
>>>
>>> +#define NR_EVENTS    (ARMV8_PMU_EVTYPE_EVENT + 1)
>>> +
>>>  int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct
>>> kvm_device_attr *attr)
>>>  {
>>> +    if (!kvm_arm_support_pmu_v3())
>>> +        return -ENODEV;
>>> +
>>> +    if (!test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features))
>>> +        return -ENODEV;
>> I see you changed -ENXIO into -ENODEV. wanted?
> 
> Probably not... but see below.
> 
>>> +
>>> +    if (vcpu->arch.pmu.created)
>>> +        return -EBUSY;
>>> +
>>>      switch (attr->attr) {
>>>      case KVM_ARM_VCPU_PMU_V3_IRQ: {
>>>          int __user *uaddr = (int __user *)(long)attr->addr;
>>> @@ -804,9 +814,6 @@ int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu
>>> *vcpu, struct kvm_device_attr *attr)
>>>          if (!irqchip_in_kernel(vcpu->kvm))
>>>              return -EINVAL;
>>>
>>> -        if (!test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features))
>>> -            return -ENODEV;
>>> -
> 
> Here's why. I wonder if we already have a problem with the consistency
> of the
> error codes returned to userspace.
OK. Then you may document it in the commit message?
> 
>>>          if (get_user(irq, uaddr))
>>>              return -EFAULT;
>>>
>>> @@ -824,6 +831,50 @@ int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu
>>> *vcpu, struct kvm_device_attr *attr)
>>>          vcpu->arch.pmu.irq_num = irq;
>>>          return 0;
>>>      }
>>> +    case KVM_ARM_VCPU_PMU_V3_FILTER: {
>>> +        struct kvm_pmu_event_filter __user *uaddr;
>>> +        struct kvm_pmu_event_filter filter;
>>> +
>>> +        uaddr = (struct kvm_pmu_event_filter __user *)(long)attr->addr;
>>> +
>>> +        if (copy_from_user(&filter, uaddr, sizeof(filter)))
>>> +            return -EFAULT;
>>> +
>>> +        if (((u32)filter.base_event + filter.nevents) > NR_EVENTS ||
>> isnt't it >= ?
> 
> No, I think this is correct. I want to be able to filter event 0xFFFF,
> for example,
> so I have base_event=0xFFFF and nevents=1.
hum my mistake. Sorry
> 
>>> +            (filter.action != KVM_PMU_EVENT_ALLOW &&
>>> +             filter.action != KVM_PMU_EVENT_DENY))
>>> +            return -EINVAL;
>> -EINVAL is not documented in the API doc.
> 
> Good point.
> 
>>> +
>>> +        mutex_lock(&vcpu->kvm->lock);
>>> +
>>> +        if (!vcpu->kvm->arch.pmu_filter) {
>>> +            vcpu->kvm->arch.pmu_filter = bitmap_alloc(NR_EVENTS,
>>> GFP_KERNEL);
>>> +            if (!vcpu->kvm->arch.pmu_filter) {
>>> +                mutex_unlock(&vcpu->kvm->lock);
>>> +                return -ENOMEM;
>>> +            }
>>> +
>>> +            /*
>>> +             * The default depends on the first applied filter.
>>> +             * If it allows events, the default is to deny.
>>> +             * Conversely, if the first filter denies a set of
>>> +             * events, the default is to allow.
>>> +             */
>>> +            if (filter.action == KVM_PMU_EVENT_ALLOW)
>>> +                bitmap_zero(vcpu->kvm->arch.pmu_filter, NR_EVENTS);
>>> +            else
>>> +                bitmap_fill(vcpu->kvm->arch.pmu_filter, NR_EVENTS);
>>> +        }
>>> +
>>> +        if (filter.action == KVM_PMU_EVENT_ALLOW)
>>> +            bitmap_set(vcpu->kvm->arch.pmu_filter,
>>> filter.base_event, filter.nevents);
>>> +        else
>>> +            bitmap_clear(vcpu->kvm->arch.pmu_filter,
>>> filter.base_event, filter.nevents);
>>> +
>>> +        mutex_unlock(&vcpu->kvm->lock);
>>> +
>>> +        return 0;
>>> +    }
>>>      case KVM_ARM_VCPU_PMU_V3_INIT:
>>>          return kvm_arm_pmu_v3_init(vcpu);
>>>      }
>>> @@ -860,6 +911,7 @@ int kvm_arm_pmu_v3_has_attr(struct kvm_vcpu
>>> *vcpu, struct kvm_device_attr *attr)
>>>      switch (attr->attr) {
>>>      case KVM_ARM_VCPU_PMU_V3_IRQ:
>> not related to this patch but shouldn't we advertise this only with
>> in-kernel irqchip?
> 
> We do support the PMU without the in-kernel chip, unfortunately... Yes,
> supporting this feature was a big mistake.
But I see in kvm_arm_pmu_v3_set_attr:
case KVM_ARM_VCPU_PMU_V3_IRQ:
../..
                if (!irqchip_in_kernel(vcpu->kvm))
                        return -EINVAL;

Thanks

Eric

> 
>>>      case KVM_ARM_VCPU_PMU_V3_INIT:
>>> +    case KVM_ARM_VCPU_PMU_V3_FILTER:
>>>          if (kvm_arm_support_pmu_v3() &&
>>>              test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features))
>>>              return 0;
> 
> Thanks for the review,
> 
>         M.