Hi Marc, On 3/10/20 12:03 PM, Marc Zyngier wrote: > Hi Eric, > > On 2020-03-09 18:05, Auger Eric wrote: >> Hi Marc, >> >> On 3/9/20 1:48 PM, Marc Zyngier wrote: >>> It can be desirable to expose a PMU to a guest, and yet not want the >>> guest to be able to count some of the implemented events (because this >>> would give information on shared resources, for example. >>> >>> For this, let's extend the PMUv3 device API, and offer a way to setup a >>> bitmap of the allowed events (the default being no bitmap, and thus no >>> filtering). >>> >>> Userspace can thus allow/deny ranges of event. The default policy >>> depends on the "polarity" of the first filter setup (default deny if the >>> filter allows events, and default allow if the filter denies events). >>> This allows to setup exactly what is allowed for a given guest. >>> >>> Note that although the ioctl is per-vcpu, the map of allowed events is >>> global to the VM (it can be setup from any vcpu until the vcpu PMU is >>> initialized). >>> >>> Signed-off-by: Marc Zyngier <maz@xxxxxxxxxx> >>> --- >>> arch/arm64/include/asm/kvm_host.h | 6 +++ >>> arch/arm64/include/uapi/asm/kvm.h | 16 ++++++ >>> virt/kvm/arm/arm.c | 2 + >>> virt/kvm/arm/pmu.c | 84 +++++++++++++++++++++++++------ >>> 4 files changed, 92 insertions(+), 16 deletions(-) >>> >>> diff --git a/arch/arm64/include/asm/kvm_host.h >>> b/arch/arm64/include/asm/kvm_host.h >>> index 57fd46acd058..8e63c618688d 100644 >>> --- a/arch/arm64/include/asm/kvm_host.h >>> +++ b/arch/arm64/include/asm/kvm_host.h >>> @@ -91,6 +91,12 @@ struct kvm_arch { >>> * supported. >>> */ >>> bool return_nisv_io_abort_to_user; >>> + >>> + /* >>> + * VM-wide PMU filter, implemented as a bitmap and big enough >>> + * for up to 65536 events >>> + */ >>> + unsigned long *pmu_filter; >>> }; >>> >>> #define KVM_NR_MEM_OBJS 40 >>> diff --git a/arch/arm64/include/uapi/asm/kvm.h >>> b/arch/arm64/include/uapi/asm/kvm.h >>> index ba85bb23f060..7b1511d6ce44 100644 >>> --- a/arch/arm64/include/uapi/asm/kvm.h >>> +++ b/arch/arm64/include/uapi/asm/kvm.h >>> @@ -159,6 +159,21 @@ struct kvm_sync_regs { >>> struct kvm_arch_memory_slot { >>> }; >>> >>> +/* >>> + * PMU filter structure. Describe a range of events with a particular >>> + * action. To be used with KVM_ARM_VCPU_PMU_V3_FILTER. >>> + */ >>> +struct kvm_pmu_event_filter { >>> + __u16 base_event; >>> + __u16 nevents; >>> + >>> +#define KVM_PMU_EVENT_ALLOW 0 >>> +#define KVM_PMU_EVENT_DENY 1 >>> + >>> + __u8 action; >>> + __u8 pad[3]; >>> +}; >>> + >>> /* for KVM_GET/SET_VCPU_EVENTS */ >>> struct kvm_vcpu_events { >>> struct { >>> @@ -329,6 +344,7 @@ struct kvm_vcpu_events { >>> #define KVM_ARM_VCPU_PMU_V3_CTRL 0 >>> #define KVM_ARM_VCPU_PMU_V3_IRQ 0 >>> #define KVM_ARM_VCPU_PMU_V3_INIT 1 >>> +#define KVM_ARM_VCPU_PMU_V3_FILTER 2 >>> #define KVM_ARM_VCPU_TIMER_CTRL 1 >>> #define KVM_ARM_VCPU_TIMER_IRQ_VTIMER 0 >>> #define KVM_ARM_VCPU_TIMER_IRQ_PTIMER 1 >>> diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c >>> index eda7b624eab8..8d849ac88a44 100644 >>> --- a/virt/kvm/arm/arm.c >>> +++ b/virt/kvm/arm/arm.c >>> @@ -164,6 +164,8 @@ void kvm_arch_destroy_vm(struct kvm *kvm) >>> free_percpu(kvm->arch.last_vcpu_ran); >>> kvm->arch.last_vcpu_ran = NULL; >>> >>> + bitmap_free(kvm->arch.pmu_filter); >>> + >>> for (i = 0; i < KVM_MAX_VCPUS; ++i) { >>> if (kvm->vcpus[i]) { >>> kvm_vcpu_destroy(kvm->vcpus[i]); >>> diff --git a/virt/kvm/arm/pmu.c b/virt/kvm/arm/pmu.c >>> index f0d0312c0a55..9f0fd0224d5b 100644 >>> --- a/virt/kvm/arm/pmu.c >>> +++ b/virt/kvm/arm/pmu.c >>> @@ -579,10 +579,19 @@ static void kvm_pmu_create_perf_event(struct >>> kvm_vcpu *vcpu, u64 select_idx) >>> >>> kvm_pmu_stop_counter(vcpu, pmc); >>> eventsel = data & ARMV8_PMU_EVTYPE_EVENT; >>> + if (pmc->idx == ARMV8_PMU_CYCLE_IDX) >>> + eventsel = ARMV8_PMUV3_PERFCTR_CPU_CYCLES; >> nit: >> if (pmc->idx == ARMV8_PMU_CYCLE_IDX) >> eventsel = ARMV8_PMUV3_PERFCTR_CPU_CYCLES; >> else >> eventsel = data & ARMV8_PMU_EVTYPE_EVENT; > > You don't like it? ;-) ? eventset set only once instead of 2 times > >>> >>> /* Software increment event does't need to be backed by a perf >>> event */ >> nit: while wer are at it fix the does't typo >>> - if (eventsel == ARMV8_PMUV3_PERFCTR_SW_INCR && >>> - pmc->idx != ARMV8_PMU_CYCLE_IDX) >>> + if (eventsel == ARMV8_PMUV3_PERFCTR_SW_INCR) >>> + return; >>> + >>> + /* >>> + * If we have a filter in place and that the event isn't >>> allowed, do >>> + * not install a perf event either. >>> + */ >>> + if (vcpu->kvm->arch.pmu_filter && >>> + !test_bit(eventsel, vcpu->kvm->arch.pmu_filter)) >>> return; >>> >>> memset(&attr, 0, sizeof(struct perf_event_attr)); >>> @@ -594,8 +603,7 @@ static void kvm_pmu_create_perf_event(struct >>> kvm_vcpu *vcpu, u64 select_idx) >>> attr.exclude_kernel = data & ARMV8_PMU_EXCLUDE_EL1 ? 1 : 0; >>> attr.exclude_hv = 1; /* Don't count EL2 events */ >>> attr.exclude_host = 1; /* Don't count host events */ >>> - attr.config = (pmc->idx == ARMV8_PMU_CYCLE_IDX) ? >>> - ARMV8_PMUV3_PERFCTR_CPU_CYCLES : eventsel; >>> + attr.config = eventsel; >> So in that case the guest counter will not increment but the guest does >> not know the counter is not implemented. Can't this lead to bad user >> experience. Shouldn't this disablement be reflected in PMCEID0/1 regs? > > The whole point is that we want to keep things hidden from the guest. > Also, PMCEID{0,1} only describe a small set of events (the architected > common events), and not the whole range of microarchitectural events > that the CPU implements. I am still not totally convinced. Things are not totally hidden to the guest as the counter does not increment, right? So a guest may try to use as it is advertised in PMCEID0/1 but not get the expected results leading to potential support request. I agree not all the events are described there but your API also allows to filter out some of the ones that are advertised. > >>> >>> counter = kvm_pmu_get_pair_counter_value(vcpu, pmc); >>> >>> @@ -735,15 +743,6 @@ int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu) >>> >>> static int kvm_arm_pmu_v3_init(struct kvm_vcpu *vcpu) >>> { >>> - if (!kvm_arm_support_pmu_v3()) >>> - return -ENODEV; >>> - >>> - if (!test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features)) >>> - return -ENXIO; >>> - >>> - if (vcpu->arch.pmu.created) >>> - return -EBUSY; >>> - >>> if (irqchip_in_kernel(vcpu->kvm)) { >>> int ret; >>> >>> @@ -794,8 +793,19 @@ static bool pmu_irq_is_valid(struct kvm *kvm, >>> int irq) >>> return true; >>> } >>> >>> +#define NR_EVENTS (ARMV8_PMU_EVTYPE_EVENT + 1) >>> + >>> int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct >>> kvm_device_attr *attr) >>> { >>> + if (!kvm_arm_support_pmu_v3()) >>> + return -ENODEV; >>> + >>> + if (!test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features)) >>> + return -ENODEV; >> I see you changed -ENXIO into -ENODEV. wanted? > > Probably not... but see below. > >>> + >>> + if (vcpu->arch.pmu.created) >>> + return -EBUSY; >>> + >>> switch (attr->attr) { >>> case KVM_ARM_VCPU_PMU_V3_IRQ: { >>> int __user *uaddr = (int __user *)(long)attr->addr; >>> @@ -804,9 +814,6 @@ int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu >>> *vcpu, struct kvm_device_attr *attr) >>> if (!irqchip_in_kernel(vcpu->kvm)) >>> return -EINVAL; >>> >>> - if (!test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features)) >>> - return -ENODEV; >>> - > > Here's why. I wonder if we already have a problem with the consistency > of the > error codes returned to userspace. OK. Then you may document it in the commit message? > >>> if (get_user(irq, uaddr)) >>> return -EFAULT; >>> >>> @@ -824,6 +831,50 @@ int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu >>> *vcpu, struct kvm_device_attr *attr) >>> vcpu->arch.pmu.irq_num = irq; >>> return 0; >>> } >>> + case KVM_ARM_VCPU_PMU_V3_FILTER: { >>> + struct kvm_pmu_event_filter __user *uaddr; >>> + struct kvm_pmu_event_filter filter; >>> + >>> + uaddr = (struct kvm_pmu_event_filter __user *)(long)attr->addr; >>> + >>> + if (copy_from_user(&filter, uaddr, sizeof(filter))) >>> + return -EFAULT; >>> + >>> + if (((u32)filter.base_event + filter.nevents) > NR_EVENTS || >> isnt't it >= ? > > No, I think this is correct. I want to be able to filter event 0xFFFF, > for example, > so I have base_event=0xFFFF and nevents=1. hum my mistake. Sorry > >>> + (filter.action != KVM_PMU_EVENT_ALLOW && >>> + filter.action != KVM_PMU_EVENT_DENY)) >>> + return -EINVAL; >> -EINVAL is not documented in the API doc. > > Good point. > >>> + >>> + mutex_lock(&vcpu->kvm->lock); >>> + >>> + if (!vcpu->kvm->arch.pmu_filter) { >>> + vcpu->kvm->arch.pmu_filter = bitmap_alloc(NR_EVENTS, >>> GFP_KERNEL); >>> + if (!vcpu->kvm->arch.pmu_filter) { >>> + mutex_unlock(&vcpu->kvm->lock); >>> + return -ENOMEM; >>> + } >>> + >>> + /* >>> + * The default depends on the first applied filter. >>> + * If it allows events, the default is to deny. >>> + * Conversely, if the first filter denies a set of >>> + * events, the default is to allow. >>> + */ >>> + if (filter.action == KVM_PMU_EVENT_ALLOW) >>> + bitmap_zero(vcpu->kvm->arch.pmu_filter, NR_EVENTS); >>> + else >>> + bitmap_fill(vcpu->kvm->arch.pmu_filter, NR_EVENTS); >>> + } >>> + >>> + if (filter.action == KVM_PMU_EVENT_ALLOW) >>> + bitmap_set(vcpu->kvm->arch.pmu_filter, >>> filter.base_event, filter.nevents); >>> + else >>> + bitmap_clear(vcpu->kvm->arch.pmu_filter, >>> filter.base_event, filter.nevents); >>> + >>> + mutex_unlock(&vcpu->kvm->lock); >>> + >>> + return 0; >>> + } >>> case KVM_ARM_VCPU_PMU_V3_INIT: >>> return kvm_arm_pmu_v3_init(vcpu); >>> } >>> @@ -860,6 +911,7 @@ int kvm_arm_pmu_v3_has_attr(struct kvm_vcpu >>> *vcpu, struct kvm_device_attr *attr) >>> switch (attr->attr) { >>> case KVM_ARM_VCPU_PMU_V3_IRQ: >> not related to this patch but shouldn't we advertise this only with >> in-kernel irqchip? > > We do support the PMU without the in-kernel chip, unfortunately... Yes, > supporting this feature was a big mistake. But I see in kvm_arm_pmu_v3_set_attr: case KVM_ARM_VCPU_PMU_V3_IRQ: ../.. if (!irqchip_in_kernel(vcpu->kvm)) return -EINVAL; Thanks Eric > >>> case KVM_ARM_VCPU_PMU_V3_INIT: >>> + case KVM_ARM_VCPU_PMU_V3_FILTER: >>> if (kvm_arm_support_pmu_v3() && >>> test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features)) >>> return 0; > > Thanks for the review, > > M.