+Mark

On 10/01/17 11:38, Punit Agrawal wrote:
> Both AArch32 and AArch64 mode of the ARMv8 architecture support trapping
> certain VM operations, e.g., TLB and cache maintenance
> operations. Selective trapping of these operations for specific VMs can
> be used to track the frequency with which these occur during execution.
>
> Add a software PMU on the host that can support tracking VM
> operations (in the form of PMU events). Supporting new events requires
> providing callbacks to configure the VM to enable/disable the trapping
> and read a count of the frequency.
>
> The host PMU events can be controlled by tools like perf that use
> standard kernel perf interfaces.
>
> Signed-off-by: Punit Agrawal <punit.agrawal@xxxxxxx>
> Cc: Christoffer Dall <christoffer.dall@xxxxxxxxxx>
> Cc: Marc Zyngier <marc.zyngier@xxxxxxx>
> Cc: Will Deacon <will.deacon@xxxxxxx>
> ---
>  arch/arm/include/asm/kvm_host.h   |   8 ++
>  arch/arm/kvm/arm.c                |   2 +
>  arch/arm64/include/asm/kvm_host.h |   8 ++
>  virt/kvm/arm/host_pmu.c           | 272 ++++++++++++++++++++++++++++++++++++++
>  4 files changed, 290 insertions(+)
>  create mode 100644 virt/kvm/arm/host_pmu.c
>
> diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
> index 26f0c8a0b790..b988f8801b86 100644
> --- a/arch/arm/include/asm/kvm_host.h
> +++ b/arch/arm/include/asm/kvm_host.h
> @@ -289,6 +289,14 @@ static inline int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext)
>  int kvm_perf_init(void);
>  int kvm_perf_teardown(void);
>  
> +#if !defined(CONFIG_KVM_HOST_PMU)
> +static inline int kvm_host_pmu_init(void) { return 0; }
> +static inline void kvm_host_pmu_teardown(void) { }
> +#else
> +int kvm_host_pmu_init(void);
> +void kvm_host_pmu_teardown(void);
> +#endif
> +
>  void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot);
>  
>  struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr);
> diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
> index 11676787ad49..058626b65b8d 100644
> --- a/arch/arm/kvm/arm.c
> +++ b/arch/arm/kvm/arm.c
> @@ -1263,6 +1263,7 @@ static int init_subsystems(void)
>  		goto out;
>  
>  	kvm_perf_init();
> +	kvm_host_pmu_init();
>  	kvm_coproc_table_init();
>  
>  out:
> @@ -1453,6 +1454,7 @@ int kvm_arch_init(void *opaque)
>  void kvm_arch_exit(void)
>  {
>  	kvm_perf_teardown();
> +	kvm_host_pmu_teardown();
>  }
>  
>  static int arm_init(void)
> diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
> index 1e83b707f14c..018f887e8964 100644
> --- a/arch/arm64/include/asm/kvm_host.h
> +++ b/arch/arm64/include/asm/kvm_host.h
> @@ -349,6 +349,14 @@ int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run,
>  int kvm_perf_init(void);
>  int kvm_perf_teardown(void);
>  
> +#if !defined(CONFIG_KVM_HOST_PMU)
> +static inline int kvm_host_pmu_init(void) { return 0; }
> +static inline void kvm_host_pmu_teardown(void) { }
> +#else
> +int kvm_host_pmu_init(void);
> +void kvm_host_pmu_teardown(void);
> +#endif
> +
>  struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr);
>  
>  static inline void __cpu_init_hyp_mode(phys_addr_t pgd_ptr,
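
As an aside, this stub/prototype block is now duplicated verbatim in the
arch/arm and arch/arm64 headers. A minimal consolidation sketch, assuming
a shared header -- the include/kvm/host_pmu.h name is invented here and is
not part of this patch:

	/* include/kvm/host_pmu.h -- hypothetical shared location */
	#ifdef CONFIG_KVM_HOST_PMU
	int kvm_host_pmu_init(void);
	void kvm_host_pmu_teardown(void);
	#else
	static inline int kvm_host_pmu_init(void) { return 0; }
	static inline void kvm_host_pmu_teardown(void) { }
	#endif

Both arch headers could then include it unchanged, which also ties in
with the question at the end of this mail about how arm-specific this
code really is.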
> diff --git a/virt/kvm/arm/host_pmu.c b/virt/kvm/arm/host_pmu.c
> new file mode 100644
> index 000000000000..fc610ccc169a
> --- /dev/null
> +++ b/virt/kvm/arm/host_pmu.c
> @@ -0,0 +1,272 @@
> +#include <linux/cpumask.h>
> +#include <linux/device.h>
> +#include <linux/kernel.h>
> +#include <linux/kvm_host.h>
> +#include <linux/list.h>
> +#include <linux/perf_event.h>
> +#include <linux/pid.h>
> +#include <linux/sched.h>
> +#include <linux/slab.h>
> +#include <linux/spinlock_types.h>
> +#include <linux/sysfs.h>
> +
> +#include <asm/kvm_emulate.h>
> +
> +enum host_pmu_events {
> +	KVM_HOST_MAX_EVENTS,
> +};
> +
> +struct host_pmu {
> +	struct pmu pmu;
> +	spinlock_t event_list_lock;
> +	struct list_head event_list_head;
> +} host_pmu;
> +#define to_host_pmu(p) (container_of(p, struct host_pmu, pmu))
> +
> +typedef void (*configure_event_fn)(struct kvm *kvm, bool enable);
> +typedef u64 (*get_event_count_fn)(struct kvm *kvm);
> +
> +struct kvm_event_cb {
> +	enum host_pmu_events event;
> +	get_event_count_fn get_event_count;
> +	configure_event_fn configure_event;
> +};
> +
> +struct event_data {
> +	bool enable;
> +	struct kvm *kvm;
> +	struct kvm_event_cb *cb;
> +	struct work_struct work;
> +	struct list_head event_list;
> +};
> +
> +static struct kvm_event_cb event_callbacks[] = {
> +};
> +
> +static struct attribute *event_attrs[] = {
> +	NULL,
> +};
> +
> +static struct attribute_group events_attr_group = {
> +	.name = "events",
> +	.attrs = event_attrs,
> +};
> +
> +
> +#define VM_MASK		GENMASK_ULL(31, 0)
> +#define EVENT_MASK	GENMASK_ULL(39, 32)
> +#define EVENT_SHIFT	(32)
> +
> +#define to_pid(cfg)	((cfg) & VM_MASK)
> +#define to_event(cfg)	(((cfg) & EVENT_MASK) >> EVENT_SHIFT)
> +
> +PMU_FORMAT_ATTR(vm, "config:0-31");
> +PMU_FORMAT_ATTR(event, "config:32-39");

I'm a bit confused by these. Can't you get the PID of the VM you're
tracing directly from perf, without having to encode things? And if you
can't, surely this should be a function of the size of pid_t?

Mark, can you shine some light here?

> +
> +static struct attribute *format_attrs[] = {
> +	&format_attr_vm.attr,
> +	&format_attr_event.attr,
> +	NULL,
> +};
> +
> +static struct attribute_group format_attr_group = {
> +	.name = "format",
> +	.attrs = format_attrs,
> +};
> +
> +static const struct attribute_group *attr_groups[] = {
> +	&events_attr_group,
> +	&format_attr_group,
> +	NULL,
> +};
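
For concreteness, the vm/event format attributes above imply userspace
usage along these lines (a sketch, not part of the patch; the dynamic PMU
type is read from /sys/bus/event_source/devices/kvm/type, and the helper
name is invented):

	#include <linux/perf_event.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	/* Open a counter for one VM event; vm_pid is the VMM process pid. */
	static int open_kvm_vm_event(__u32 pmu_type, pid_t vm_pid,
				     unsigned int event_id)
	{
		struct perf_event_attr attr = { 0 };

		attr.type = pmu_type;	/* from .../devices/kvm/type */
		attr.size = sizeof(attr);
		/* config:0-31 = VM pid, config:32-39 = event id */
		attr.config = ((__u64)event_id << 32) | (__u32)vm_pid;

		/*
		 * pid = -1, cpu = 0: host_event_init() below rejects
		 * task-bound events and negative CPUs, so the event must
		 * be opened per-CPU.
		 */
		return syscall(__NR_perf_event_open, &attr, -1, 0, -1, 0);
	}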
> +static void host_event_destroy(struct perf_event *event)
> +{
> +	struct host_pmu *host_pmu = to_host_pmu(event->pmu);
> +	struct event_data *e_data = event->pmu_private;
> +
> +	/*
> +	 * Ensure outstanding work items related to this event are
> +	 * completed before freeing resources.
> +	 */
> +	flush_work(&e_data->work);
> +
> +	kvm_put_kvm(e_data->kvm);
> +
> +	spin_lock(&host_pmu->event_list_lock);
> +	list_del(&e_data->event_list);
> +	spin_unlock(&host_pmu->event_list_lock);
> +	kfree(e_data);
> +}
> +
> +static void host_event_work(struct work_struct *work)
> +{
> +	struct event_data *e_data = container_of(work, struct event_data, work);
> +	struct kvm *kvm = e_data->kvm;
> +
> +	e_data->cb->configure_event(kvm, e_data->enable);
> +}
> +
> +static int host_event_init(struct perf_event *event)
> +{
> +	struct host_pmu *host_pmu = to_host_pmu(event->pmu);
> +	int event_id = to_event(event->attr.config);
> +	pid_t task_pid = to_pid(event->attr.config);
> +	struct event_data *e_data, *pos;
> +	bool found = false;
> +	struct pid *pid;
> +	struct kvm *kvm;
> +	int ret = 0;
> +
> +	if (event->attr.type != event->pmu->type)
> +		return -ENOENT;
> +
> +	if (has_branch_stack(event) ||
> +	    is_sampling_event(event) ||
> +	    event->attr.exclude_user ||
> +	    event->attr.exclude_kernel ||
> +	    event->attr.exclude_hv ||
> +	    event->attr.exclude_idle ||
> +	    event->attr.exclude_guest) {
> +		return -EINVAL;
> +	}
> +
> +	if (event->attach_state == PERF_ATTACH_TASK)
> +		return -EOPNOTSUPP;
> +
> +	if (event->cpu < 0)
> +		return -EINVAL;
> +
> +	if (event_id >= KVM_HOST_MAX_EVENTS)
> +		return -EINVAL;
> +
> +	pid = find_get_pid(task_pid);
> +	spin_lock(&kvm_lock);
> +	list_for_each_entry(kvm, &vm_list, vm_list) {
> +		if (kvm->pid == pid) {
> +			kvm_get_kvm(kvm);
> +			found = true;
> +			break;
> +		}
> +	}
> +	spin_unlock(&kvm_lock);
> +	put_pid(pid);
> +
> +	if (!found)
> +		return -EINVAL;
> +
> +	/* Allocate before taking the lock; kzalloc(GFP_KERNEL) may sleep */
> +	e_data = kzalloc(sizeof(*e_data), GFP_KERNEL);
> +	if (!e_data) {
> +		kvm_put_kvm(kvm);
> +		return -ENOMEM;
> +	}
> +
> +	spin_lock(&host_pmu->event_list_lock);
> +	/* Make sure we don't already have the (event_id, kvm) pair */
> +	list_for_each_entry(pos, &host_pmu->event_list_head, event_list) {
> +		if (pos->cb->event == event_id &&
> +		    pos->kvm->pid == pid) {
> +			kvm_put_kvm(kvm);
> +			kfree(e_data);
> +			ret = -EOPNOTSUPP;
> +			goto unlock;
> +		}
> +	}
> +
> +	e_data->kvm = kvm;
> +	e_data->cb = &event_callbacks[event_id];
> +	INIT_WORK(&e_data->work, host_event_work);
> +	event->pmu_private = e_data;
> +	event->cpu = cpumask_first(cpu_online_mask);
> +	event->destroy = host_event_destroy;
> +
> +	list_add_tail(&e_data->event_list, &host_pmu->event_list_head);
> +
> +unlock:
> +	spin_unlock(&host_pmu->event_list_lock);
> +
> +	return ret;
> +}
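
Since event_callbacks[] is empty at this point, here is a hypothetical
sketch of the provider that host_event_init() would look up -- the event
name, the kvm->arch counter field and the trap bit mentioned below are
all invented for illustration, none of them exist in this patch:

	enum host_pmu_events {
		KVM_HOST_TLB_MAINT,
		KVM_HOST_MAX_EVENTS,
	};

	/* Read the count accumulated by the (hypothetical) trap handler. */
	static u64 tlb_maint_get_count(struct kvm *kvm)
	{
		return atomic64_read(&kvm->arch.tlb_maint_count);
	}

	/*
	 * Enable/disable trapping of guest TLB maintenance, e.g. via
	 * HCR_EL2.TTLB on each vcpu of this VM.
	 */
	static void tlb_maint_configure(struct kvm *kvm, bool enable)
	{
		/* set/clear the trap bit on each vcpu */
	}

	static struct kvm_event_cb event_callbacks[] = {
		[KVM_HOST_TLB_MAINT] = {
			.event		 = KVM_HOST_TLB_MAINT,
			.get_event_count = tlb_maint_get_count,
			.configure_event = tlb_maint_configure,
		},
	};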
> +static void host_event_update(struct perf_event *event)
> +{
> +	struct event_data *e_data = event->pmu_private;
> +	struct kvm_event_cb *cb = e_data->cb;
> +	struct kvm *kvm = e_data->kvm;
> +	struct hw_perf_event *hw = &event->hw;
> +	u64 prev_count, new_count;
> +
> +	do {
> +		prev_count = local64_read(&hw->prev_count);
> +		new_count = cb->get_event_count(kvm);
> +	} while (local64_xchg(&hw->prev_count, new_count) != prev_count);
> +
> +	local64_add(new_count - prev_count, &event->count);
> +}
> +
> +static void host_event_start(struct perf_event *event, int flags)
> +{
> +	struct event_data *e_data = event->pmu_private;
> +	struct kvm_event_cb *cb = e_data->cb;
> +	struct kvm *kvm = e_data->kvm;
> +	u64 val;
> +
> +	val = cb->get_event_count(kvm);
> +	local64_set(&event->hw.prev_count, val);
> +
> +	e_data->enable = true;
> +	schedule_work(&e_data->work);
> +}
> +
> +static void host_event_stop(struct perf_event *event, int flags)
> +{
> +	struct event_data *e_data = event->pmu_private;
> +
> +	e_data->enable = false;
> +	schedule_work(&e_data->work);
> +
> +	if (flags & PERF_EF_UPDATE)
> +		host_event_update(event);
> +}
> +
> +static int host_event_add(struct perf_event *event, int flags)
> +{
> +	if (flags & PERF_EF_START)
> +		host_event_start(event, flags);
> +
> +	return 0;
> +}
> +
> +static void host_event_del(struct perf_event *event, int flags)
> +{
> +	host_event_stop(event, PERF_EF_UPDATE);
> +}
> +
> +static void host_event_read(struct perf_event *event)
> +{
> +	host_event_update(event);
> +}
> +
> +static void init_host_pmu(struct host_pmu *host_pmu)
> +{
> +	host_pmu->pmu = (struct pmu) {
> +		.task_ctx_nr = perf_sw_context,
> +		.attr_groups = attr_groups,
> +		.event_init = host_event_init,
> +		.add = host_event_add,
> +		.del = host_event_del,
> +		.start = host_event_start,
> +		.stop = host_event_stop,
> +		.read = host_event_read,
> +		.capabilities = PERF_PMU_CAP_NO_INTERRUPT,
> +	};
> +
> +	INIT_LIST_HEAD(&host_pmu->event_list_head);
> +	spin_lock_init(&host_pmu->event_list_lock);
> +}
> +
> +int kvm_host_pmu_init(void)
> +{
> +	init_host_pmu(&host_pmu);
> +
> +	return perf_pmu_register(&host_pmu.pmu, "kvm", -1);
> +}
> +
> +void kvm_host_pmu_teardown(void)
> +{
> +	perf_pmu_unregister(&host_pmu.pmu);
> +}

This patch really makes me think that there is nothing arm-specific in
here at all. Why can't it be a generic feature through which
architectures can expose events in a generic way (or as close as
possible to being generic)?

Thanks,

	M.
-- 
Jazz is not dead. It just smells funny...