Add a new argument *group_leader to perf_event_create_kernel_counter(), so that group events can be created from kernel space just as they can be from user space. The current perf logic requires that a perf event group be created to handle topdown metrics profiling. To support the topdown metrics feature in KVM, kernel space also needs the capability to create group events. Co-developed-by: Like Xu <likexu@xxxxxxxxxxx> Signed-off-by: Like Xu <likexu@xxxxxxxxxxx> Signed-off-by: Dapeng Mi <dapeng1.mi@xxxxxxxxxxxxxxx> --- arch/x86/kernel/cpu/resctrl/pseudo_lock.c | 4 ++-- arch/x86/kvm/pmu.c | 2 +- arch/x86/kvm/vmx/pmu_intel.c | 4 ++-- include/linux/perf_event.h | 1 + kernel/events/core.c | 17 ++++++++++++++++- kernel/events/hw_breakpoint.c | 4 ++-- kernel/events/hw_breakpoint_test.c | 2 +- kernel/watchdog_perf.c | 2 +- 8 files changed, 26 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c index 458cb7419502..6494b2701204 100644 --- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c +++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c @@ -952,12 +952,12 @@ static int measure_residency_fn(struct perf_event_attr *miss_attr, u64 tmp; miss_event = perf_event_create_kernel_counter(miss_attr, plr->cpu, - NULL, NULL, NULL); + NULL, NULL, NULL, NULL); if (IS_ERR(miss_event)) goto out; hit_event = perf_event_create_kernel_counter(hit_attr, plr->cpu, - NULL, NULL, NULL); + NULL, NULL, NULL, NULL); if (IS_ERR(hit_event)) goto out_miss; diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index edb89b51b383..760d293f4a4a 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -221,7 +221,7 @@ static int pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, u64 config, attr.precise_ip = pmc_get_pebs_precise_level(pmc); } - event = perf_event_create_kernel_counter(&attr, -1, current, + event = perf_event_create_kernel_counter(&attr, -1, current, NULL, kvm_perf_overflow, pmc); if (IS_ERR(event)) { pr_debug_ratelimited("kvm_pmu: 
event creation failed %ld for pmc->idx = %d\n", diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 044d61aa63dc..9bf80fee34fb 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -302,8 +302,8 @@ int intel_pmu_create_guest_lbr_event(struct kvm_vcpu *vcpu) return 0; } - event = perf_event_create_kernel_counter(&attr, -1, - current, NULL, NULL); + event = perf_event_create_kernel_counter(&attr, -1, current, + NULL, NULL, NULL); if (IS_ERR(event)) { pr_debug_ratelimited("%s: failed %ld\n", __func__, PTR_ERR(event)); diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 2166a69e3bf2..c182f811f5f8 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1102,6 +1102,7 @@ extern struct perf_event * perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, struct task_struct *task, + struct perf_event *group_leader, perf_overflow_handler_t callback, void *context); extern void perf_pmu_migrate_context(struct pmu *pmu, diff --git a/kernel/events/core.c b/kernel/events/core.c index 15eb82d1a010..a3af2e740dea 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -12754,12 +12754,14 @@ SYSCALL_DEFINE5(perf_event_open, * @attr: attributes of the counter to create * @cpu: cpu in which the counter is bound * @task: task to profile (NULL for percpu) + * @group_leader: the group leader event of the created event * @overflow_handler: callback to trigger when we hit the event * @context: context data could be used in overflow_handler callback */ struct perf_event * perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, struct task_struct *task, + struct perf_event *group_leader, perf_overflow_handler_t overflow_handler, void *context) { @@ -12767,6 +12769,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, struct perf_event_context *ctx; struct perf_event *event; struct pmu *pmu; + int move_group = 0; int err; /* @@ 
-12776,7 +12779,11 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, if (attr->aux_output) return ERR_PTR(-EINVAL); - event = perf_event_alloc(attr, cpu, task, NULL, NULL, + if (task && group_leader && + group_leader->attr.inherit != attr->inherit) + return ERR_PTR(-EINVAL); + + event = perf_event_alloc(attr, cpu, task, group_leader, NULL, overflow_handler, context, -1); if (IS_ERR(event)) { err = PTR_ERR(event); @@ -12806,6 +12813,11 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, goto err_unlock; } + err = perf_event_group_leader_check(group_leader, event, attr, ctx, + &pmu, &move_group); + if (err) + goto err_unlock; + pmu_ctx = find_get_pmu_context(pmu, ctx, event); if (IS_ERR(pmu_ctx)) { err = PTR_ERR(pmu_ctx); @@ -12833,6 +12845,9 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, goto err_pmu_ctx; } + if (move_group) + perf_event_move_group(group_leader, pmu_ctx, ctx); + perf_install_in_context(ctx, event, event->cpu); perf_unpin_context(ctx); mutex_unlock(&ctx->mutex); diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index c3797701339c..65b5b1421e62 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -771,7 +771,7 @@ register_user_hw_breakpoint(struct perf_event_attr *attr, void *context, struct task_struct *tsk) { - return perf_event_create_kernel_counter(attr, -1, tsk, triggered, + return perf_event_create_kernel_counter(attr, -1, tsk, NULL, triggered, context); } EXPORT_SYMBOL_GPL(register_user_hw_breakpoint); @@ -881,7 +881,7 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr, cpus_read_lock(); for_each_online_cpu(cpu) { - bp = perf_event_create_kernel_counter(attr, cpu, NULL, + bp = perf_event_create_kernel_counter(attr, cpu, NULL, NULL, triggered, context); if (IS_ERR(bp)) { err = PTR_ERR(bp); diff --git a/kernel/events/hw_breakpoint_test.c b/kernel/events/hw_breakpoint_test.c index 2cfeeecf8de9..694db7645676 
100644 --- a/kernel/events/hw_breakpoint_test.c +++ b/kernel/events/hw_breakpoint_test.c @@ -39,7 +39,7 @@ static struct perf_event *register_test_bp(int cpu, struct task_struct *tsk, int attr.bp_addr = (unsigned long)&break_vars[idx]; attr.bp_len = HW_BREAKPOINT_LEN_1; attr.bp_type = HW_BREAKPOINT_RW; - return perf_event_create_kernel_counter(&attr, cpu, tsk, NULL, NULL); + return perf_event_create_kernel_counter(&attr, cpu, tsk, NULL, NULL, NULL); } static void unregister_test_bp(struct perf_event **bp) diff --git a/kernel/watchdog_perf.c b/kernel/watchdog_perf.c index 8ea00c4a24b2..f8a52c4df079 100644 --- a/kernel/watchdog_perf.c +++ b/kernel/watchdog_perf.c @@ -120,7 +120,7 @@ static int hardlockup_detector_event_create(void) wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh); /* Try to register using hardware perf events */ - evt = perf_event_create_kernel_counter(wd_attr, cpu, NULL, + evt = perf_event_create_kernel_counter(wd_attr, cpu, NULL, NULL, watchdog_overflow_callback, NULL); if (IS_ERR(evt)) { pr_debug("Perf event create on CPU %d failed with %ld\n", cpu, -- 2.34.1