On Wed, May 31, 2023 at 8:02 AM Andrew Jones <ajones@xxxxxxxxxxxxxxxx> wrote:
>
> On Fri, May 12, 2023 at 10:53:18AM +0200, Alexandre Ghiti wrote:
> > We used to unconditionally expose the cycle and instret csrs to
> > userspace, which gives rise to security concerns.
> >
> > So now we only allow access to hw counters from userspace through the
> > perf framework which will handle context switches, per-task events,
> > etc. But as we cannot break userspace, we give the user the choice to
> > go back to the previous behaviour by setting the sysctl
> > perf_user_access.
> >
> > Signed-off-by: Alexandre Ghiti <alexghiti@xxxxxxxxxxxx>
> > ---
> >  arch/riscv/kernel/perf_event.c |  18 ++-
> >  drivers/perf/riscv_pmu_sbi.c   | 194 ++++++++++++++++++++++++++++++++-
> >  2 files changed, 205 insertions(+), 7 deletions(-)
> >
> > diff --git a/arch/riscv/kernel/perf_event.c b/arch/riscv/kernel/perf_event.c
> > index 94174a0fc251..3af9ca45b43f 100644
> > --- a/arch/riscv/kernel/perf_event.c
> > +++ b/arch/riscv/kernel/perf_event.c
> > @@ -1,9 +1,13 @@
> >  // SPDX-License-Identifier: GPL-2.0-only
> > +#include <linux/perf/riscv_pmu.h>
> >  #include <linux/sched_clock.h>
> >
> >  void arch_perf_update_userpage(struct perf_event *event,
> >                                struct perf_event_mmap_page *userpg, u64 now)
> >  {
> > +#ifdef CONFIG_RISCV_PMU_SBI
> > +        struct riscv_pmu *rvpmu = to_riscv_pmu(event->pmu);
> > +#endif
>
> Can avoid this pair of #ifdef/#endif's by just declaring rvpmu inside the
> if block below where it's needed. Or even just using to_riscv_pmu()
> directly in place of rvpmu.
>
> >          struct clock_read_data *rd;
> >          unsigned int seq;
> >          u64 ns;
> > @@ -14,7 +18,19 @@ void arch_perf_update_userpage(struct perf_event *event,
> >          userpg->cap_user_rdpmc =
> >                  !!(event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT);
> >
> > -        userpg->pmc_width = 64;
> > +#ifdef CONFIG_RISCV_PMU_SBI
> > +        /*
> > +         * The counters are 64-bit but the priv spec doesn't mandate all the
> > +         * bits to be implemented: that's why the counter width can vary
> > +         * based on the cpu vendor.
> > +         */
> > +        if (event->pmu->name &&
> > +            !strncmp(event->pmu->name,
> > +                     RISCV_PMU_PDEV_NAME, sizeof(RISCV_PMU_PDEV_NAME)))
> > +                userpg->pmc_width = rvpmu->ctr_get_width(event->hw.idx) + 1;
> > +        else
> > +#endif
> > +                userpg->pmc_width = 64;
>
> Can leave the initialization to 64 above the #ifdef CONFIG_RISCV_PMU_SBI
> as is and drop the else.
>
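FWIW, folding both of those suggestions together, the hunk would end up
looking something like this (untested sketch, not the actual patch):

        userpg->pmc_width = 64;

#ifdef CONFIG_RISCV_PMU_SBI
        /*
         * The counters are 64-bit but the priv spec doesn't mandate all
         * the bits to be implemented, so the width can vary with the cpu
         * vendor: override the default only for the SBI PMU.
         */
        if (event->pmu->name &&
            !strncmp(event->pmu->name,
                     RISCV_PMU_PDEV_NAME, sizeof(RISCV_PMU_PDEV_NAME)))
                userpg->pmc_width =
                        to_riscv_pmu(event->pmu)->ctr_get_width(event->hw.idx) + 1;
#endif

i.e. the initialization to 64 stays unconditional and only the override
needs the CONFIG_RISCV_PMU_SBI guard.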
> >
> >          do {
> >                  rd = sched_clock_read_begin(&seq);
> > diff --git a/drivers/perf/riscv_pmu_sbi.c b/drivers/perf/riscv_pmu_sbi.c
> > index 3b0ee2148054..d9bcc5cc6df5 100644
> > --- a/drivers/perf/riscv_pmu_sbi.c
> > +++ b/drivers/perf/riscv_pmu_sbi.c
> > @@ -24,6 +24,14 @@
> >  #include <asm/sbi.h>
> >  #include <asm/hwcap.h>
> >
> > +#define SYSCTL_NO_USER_ACCESS        0
> > +#define SYSCTL_USER_ACCESS           1
> > +#define SYSCTL_LEGACY                2
> > +
> > +#define PERF_EVENT_FLAG_NO_USER_ACCESS        BIT(SYSCTL_NO_USER_ACCESS)
> > +#define PERF_EVENT_FLAG_USER_ACCESS           BIT(SYSCTL_USER_ACCESS)
> > +#define PERF_EVENT_FLAG_LEGACY                BIT(SYSCTL_LEGACY)
> > +
> >  PMU_FORMAT_ATTR(event, "config:0-47");
> >  PMU_FORMAT_ATTR(firmware, "config:63");
> >
> > @@ -43,6 +51,9 @@ static const struct attribute_group *riscv_pmu_attr_groups[] = {
> >          NULL,
> >  };
> >
> > +/* Allow legacy access by default */
> > +static int sysctl_perf_user_access __read_mostly = SYSCTL_LEGACY;
> > +
>
> I'm still not in favor of this. Hopefully the distro discussions result in
> it being changed.
>

I did not see any feedback from distro guys. I talked to David (fedora
maintainer) and he is even okay with SYSCTL_NO_USER_ACCESS :). I would
love to hear back from others (cc'd a few distro folks to this thread).

> >  /*
> >   * RISC-V doesn't have heterogeneous harts yet. This need to be part of
> >   * per_cpu in case of harts with different pmu counters
> > @@ -301,6 +312,11 @@ int riscv_pmu_get_hpm_info(u32 *hw_ctr_width, u32 *num_hw_ctr)
> >  }
> >  EXPORT_SYMBOL_GPL(riscv_pmu_get_hpm_info);
> >
> > +static uint8_t pmu_sbi_csr_index(struct perf_event *event)
> > +{
> > +        return pmu_ctr_list[event->hw.idx].csr - CSR_CYCLE;
> > +}
> > +
> >  static unsigned long pmu_sbi_get_filter_flags(struct perf_event *event)
> >  {
> >          unsigned long cflags = 0;
> > @@ -329,18 +345,34 @@ static int pmu_sbi_ctr_get_idx(struct perf_event *event)
> >          struct cpu_hw_events *cpuc = this_cpu_ptr(rvpmu->hw_events);
> >          struct sbiret ret;
> >          int idx;
> > -        uint64_t cbase = 0;
> > +        uint64_t cbase = 0, cmask = rvpmu->cmask;
> >          unsigned long cflags = 0;
> >
> >          cflags = pmu_sbi_get_filter_flags(event);
> > +
> > +        /*
> > +         * In legacy mode, we have to force the fixed counters for those
> > +         * events but not in the user access mode as we want to use the
> > +         * other counters that support sampling/filtering.
> > +         */
> > +        if (hwc->flags & PERF_EVENT_FLAG_LEGACY) {
> > +                if (event->attr.config == PERF_COUNT_HW_CPU_CYCLES) {
> > +                        cflags |= SBI_PMU_CFG_FLAG_SKIP_MATCH;
> > +                        cmask = 1;
> > +                } else if (event->attr.config == PERF_COUNT_HW_INSTRUCTIONS) {
> > +                        cflags |= SBI_PMU_CFG_FLAG_SKIP_MATCH;
> > +                        cmask = 1UL << (CSR_INSTRET - CSR_CYCLE);
> > +                }
> > +        }
> > +
> >          /* retrieve the available counter index */
> >  #if defined(CONFIG_32BIT)
> >          ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_CFG_MATCH, cbase,
> > -                        rvpmu->cmask, cflags, hwc->event_base, hwc->config,
> > +                        cmask, cflags, hwc->event_base, hwc->config,
> >                          hwc->config >> 32);
> >  #else
> >          ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_CFG_MATCH, cbase,
> > -                        rvpmu->cmask, cflags, hwc->event_base, hwc->config, 0);
> > +                        cmask, cflags, hwc->event_base, hwc->config, 0);
> >  #endif
> >          if (ret.error) {
> >                  pr_debug("Not able to find a counter for event %lx config %llx\n",
> > @@ -474,6 +506,14 @@ static u64 pmu_sbi_ctr_read(struct perf_event *event)
> >          return val;
> >  }
> >
> > +static void pmu_sbi_set_scounteren(void *arg)
> > +{
> > +        struct perf_event *event = (struct perf_event *)arg;
> > +
> > +        csr_write(CSR_SCOUNTEREN,
> > +                  csr_read(CSR_SCOUNTEREN) | (1 << pmu_sbi_csr_index(event)));
> > +}
> > +
> >  static void pmu_sbi_ctr_start(struct perf_event *event, u64 ival)
> >  {
> >          struct sbiret ret;
> > @@ -490,6 +530,18 @@ static void pmu_sbi_ctr_start(struct perf_event *event, u64 ival)
> >          if (ret.error && (ret.error != SBI_ERR_ALREADY_STARTED))
> >                  pr_err("Starting counter idx %d failed with error %d\n",
> >                         hwc->idx, sbi_err_map_linux_errno(ret.error));
> > +
> > +        if (hwc->flags & PERF_EVENT_FLAG_USER_ACCESS &&
> > +            hwc->flags & PERF_EVENT_FLAG_USER_READ_CNT)
> > +                pmu_sbi_set_scounteren((void *)event);
> > +}
> > +
> > +static void pmu_sbi_reset_scounteren(void *arg)
> > +{
> > +        struct perf_event *event = (struct perf_event *)arg;
> > +
> > +        csr_write(CSR_SCOUNTEREN,
> > +                  csr_read(CSR_SCOUNTEREN) & ~(1 << pmu_sbi_csr_index(event)));
> >  }
> >
> >  static void pmu_sbi_ctr_stop(struct perf_event *event, unsigned long flag)
> > @@ -497,6 +549,10 @@ static void pmu_sbi_ctr_stop(struct perf_event *event, unsigned long flag)
> >          struct sbiret ret;
> >          struct hw_perf_event *hwc = &event->hw;
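For reference, the consumer side of all this is the usual perf
self-monitoring read loop against the mmapped user page, roughly like the
sketch below (which hardcodes rdcycle for simplicity; a real reader would
derive the CSR from pc->index, and read_user_counter is a made-up name):

        #include <stdint.h>
        #include <linux/perf_event.h>

        static uint64_t read_user_counter(volatile struct perf_event_mmap_page *pc)
        {
                uint64_t count;
                int64_t pmc;
                uint32_t seq;

                do {
                        seq = pc->lock;
                        __sync_synchronize();
                        count = pc->offset;
                        if (pc->cap_user_rdpmc && pc->index) {
                                /* Traps unless the scounteren bit is set */
                                asm volatile ("rdcycle %0" : "=r" (pmc));
                                /* Only pmc_width bits are implemented */
                                pmc <<= 64 - pc->pmc_width;
                                pmc >>= 64 - pc->pmc_width;
                                count += pmc;
                        }
                        __sync_synchronize();
                } while (pc->lock != seq);

                return count;
        }

which is why the scounteren bit and cap_user_rdpmc/pmc_width have to be
kept consistent with each other, as the start/stop hooks here do.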
> >
> > +        if (hwc->flags & PERF_EVENT_FLAG_USER_ACCESS &&
> > +            hwc->flags & PERF_EVENT_FLAG_USER_READ_CNT)
> > +                pmu_sbi_reset_scounteren((void *)event);
> > +
> >          ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_STOP, hwc->idx, 1, flag, 0, 0, 0);
> >          if (ret.error && (ret.error != SBI_ERR_ALREADY_STOPPED) &&
> >              flag != SBI_PMU_STOP_FLAG_RESET)
> > @@ -704,10 +760,13 @@ static int pmu_sbi_starting_cpu(unsigned int cpu, struct hlist_node *node)
> >          struct cpu_hw_events *cpu_hw_evt = this_cpu_ptr(pmu->hw_events);
> >
> >          /*
> > -         * Enable the access for CYCLE, TIME, and INSTRET CSRs from userspace,
> > -         * as is necessary to maintain uABI compatibility.
> > +         * We keep enabling userspace access to CYCLE, TIME and INSTRET via
> > +         * the legacy option but that will be removed in the future.
> >           */
> > -        csr_write(CSR_SCOUNTEREN, 0x7);
> > +        if (sysctl_perf_user_access == SYSCTL_LEGACY)
> > +                csr_write(CSR_SCOUNTEREN, 0x7);
> > +        else
> > +                csr_write(CSR_SCOUNTEREN, 0x2);
> >
> >          /* Stop all the counters so that they can be enabled from perf */
> >          pmu_sbi_stop_all(pmu);
> > @@ -851,6 +910,123 @@ static void riscv_pmu_destroy(struct riscv_pmu *pmu)
> >          cpuhp_state_remove_instance(CPUHP_AP_PERF_RISCV_STARTING, &pmu->node);
> >  }
> >
> > +static void pmu_sbi_event_init(struct perf_event *event)
> > +{
> > +        /*
> > +         * The permissions are set at event_init so that we do not depend
> > +         * on the sysctl value that can change.
> > +         */
> > +        if (sysctl_perf_user_access == SYSCTL_NO_USER_ACCESS)
> > +                event->hw.flags |= PERF_EVENT_FLAG_NO_USER_ACCESS;
> > +        else if (sysctl_perf_user_access == SYSCTL_USER_ACCESS)
> > +                event->hw.flags |= PERF_EVENT_FLAG_USER_ACCESS;
> > +        else
> > +                event->hw.flags |= PERF_EVENT_FLAG_LEGACY;
> > +}
> > +
> > +static void pmu_sbi_event_mapped(struct perf_event *event, struct mm_struct *mm)
> > +{
> > +        if (event->hw.flags & PERF_EVENT_FLAG_NO_USER_ACCESS)
> > +                return;
> > +
> > +        /* In legacy mode, the first 3 CSRs are available. */
>
> first and third
>
> > +        if (event->hw.flags & PERF_EVENT_FLAG_LEGACY) {
> > +                if (event->attr.config != PERF_COUNT_HW_CPU_CYCLES &&
> > +                    event->attr.config != PERF_COUNT_HW_INSTRUCTIONS) {
> > +                        return;
> > +                }
> > +        }
> > +
> > +        /*
> > +         * The user mmapped the event to directly access it: this is where
> > +         * we determine based on sysctl_perf_user_access if we grant userspace
> > +         * the direct access to this event. That means that within the same
> > +         * task, some events may be directly accessible and some others may
> > +         * not, if the user changes the value of sysctl_perf_user_access in
> > +         * the meantime.
> > +         */
> > +
> > +        event->hw.flags |= PERF_EVENT_FLAG_USER_READ_CNT;
> > +
> > +        /*
> > +         * We must enable userspace access *before* advertising in the user
> > +         * page that it is possible to do so to avoid any race.
> > +         * And we must notify all cpus here because threads that currently run
> > +         * on other cpus will try to directly access the counter too without
> > +         * calling pmu_sbi_ctr_start.
> > +         */
> > +        if (event->hw.flags & PERF_EVENT_FLAG_USER_ACCESS)
> > +                on_each_cpu_mask(mm_cpumask(mm),
> > +                                 pmu_sbi_set_scounteren, (void *)event, 1);
> > +}
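As a usage note, the path that actually triggers ->event_mapped() is just
the standard self-monitoring setup; something like this hypothetical
helper (error paths trimmed):

        #include <linux/perf_event.h>
        #include <string.h>
        #include <sys/mman.h>
        #include <sys/syscall.h>
        #include <unistd.h>

        static struct perf_event_mmap_page *open_cycles_userpage(void)
        {
                struct perf_event_attr attr;
                void *page;
                int fd;

                memset(&attr, 0, sizeof(attr));
                attr.type = PERF_TYPE_HARDWARE;
                attr.size = sizeof(attr);
                attr.config = PERF_COUNT_HW_CPU_CYCLES;

                /* Self-monitoring event: current task, any cpu */
                fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
                if (fd < 0)
                        return NULL;

                /* mmapping the user page is what invokes ->event_mapped() */
                page = mmap(NULL, sysconf(_SC_PAGESIZE), PROT_READ,
                            MAP_SHARED, fd, 0);
                return page == MAP_FAILED ? NULL : page;
        }

Only once that mmap has returned should cap_user_rdpmc be trusted, which
matches the "enable access before advertising it" ordering above.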
> > +
> > +static void pmu_sbi_event_unmapped(struct perf_event *event, struct mm_struct *mm)
> > +{
> > +        if (event->hw.flags & PERF_EVENT_FLAG_NO_USER_ACCESS)
> > +                return;
> > +
> > +        /* In legacy mode, the first 3 CSRs are available. */
>
> first and third
>
> > +        if (event->hw.flags & PERF_EVENT_FLAG_LEGACY) {
> > +                if (event->attr.config != PERF_COUNT_HW_CPU_CYCLES &&
> > +                    event->attr.config != PERF_COUNT_HW_INSTRUCTIONS) {
> > +                        return;
> > +                }
> > +        }
> > +
> > +        /*
> > +         * Here we can directly remove user access since the user does not
> > +         * have access to the user page anymore so we avoid the racy window
> > +         * where the user could have read cap_user_rdpmc to true right before
> > +         * we disable it.
> > +         */
> > +        event->hw.flags &= ~PERF_EVENT_FLAG_USER_READ_CNT;
> > +
> > +        if (event->hw.flags & PERF_EVENT_FLAG_USER_ACCESS)
> > +                on_each_cpu_mask(mm_cpumask(mm),
> > +                                 pmu_sbi_reset_scounteren, (void *)event, 1);
> > +}
> > +
> > +static void riscv_pmu_update_counter_access(void *info)
> > +{
> > +        if (sysctl_perf_user_access == SYSCTL_LEGACY)
> > +                csr_write(CSR_SCOUNTEREN, 0x7);
> > +        else
> > +                csr_write(CSR_SCOUNTEREN, 0x2);
> > +}
> > +
> > +static int riscv_pmu_proc_user_access_handler(struct ctl_table *table,
> > +                                              int write, void *buffer,
> > +                                              size_t *lenp, loff_t *ppos)
> > +{
> > +        int prev = sysctl_perf_user_access;
> > +        int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
> > +
> > +        /*
> > +         * Test against the previous value since we clear SCOUNTEREN when
> > +         * sysctl_perf_user_access is set to SYSCTL_USER_ACCESS, but we should
> > +         * not do that if that was already the case.
> > +         */
> > +        if (ret || !write || prev == sysctl_perf_user_access)
> > +                return ret;
> > +
> > +        on_each_cpu(riscv_pmu_update_counter_access, (void *)&prev, 1);
>
> Instead of passing prev shouldn't we pass NULL, as it's not used?
>
> > +
> > +        return 0;
> > +}
> > +
> > +static struct ctl_table sbi_pmu_sysctl_table[] = {
> > +        {
> > +                .procname = "perf_user_access",
> > +                .data = &sysctl_perf_user_access,
> > +                .maxlen = sizeof(unsigned int),
> > +                .mode = 0644,
> > +                .proc_handler = riscv_pmu_proc_user_access_handler,
> > +                .extra1 = SYSCTL_ZERO,
> > +                .extra2 = SYSCTL_TWO,
> > +        },
> > +        { }
> > +};
> > +
> >  static int pmu_sbi_device_probe(struct platform_device *pdev)
> >  {
> >          struct riscv_pmu *pmu = NULL;
> > @@ -888,6 +1064,10 @@ static int pmu_sbi_device_probe(struct platform_device *pdev)
> >          pmu->ctr_get_width = pmu_sbi_ctr_get_width;
> >          pmu->ctr_clear_idx = pmu_sbi_ctr_clear_idx;
> >          pmu->ctr_read = pmu_sbi_ctr_read;
> > +        pmu->event_init = pmu_sbi_event_init;
> > +        pmu->event_mapped = pmu_sbi_event_mapped;
> > +        pmu->event_unmapped = pmu_sbi_event_unmapped;
> > +        pmu->csr_index = pmu_sbi_csr_index;
> >
> >          ret = cpuhp_state_add_instance(CPUHP_AP_PERF_RISCV_STARTING, &pmu->node);
> >          if (ret)
> > @@ -901,6 +1081,8 @@ static int pmu_sbi_device_probe(struct platform_device *pdev)
> >          if (ret)
> >                  goto out_unregister;
> >
> > +        register_sysctl("kernel", sbi_pmu_sysctl_table);
> > +
> >          return 0;
> >
> >  out_unregister:
> > --
> > 2.37.2
> >
>
> Thanks,
> drew

--
Regards,
Atish
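P.S. For anyone who wants to try the three modes on a running system, the
sysctl added here can be flipped at runtime (0 = no user access, 1 = user
access, 2 = legacy); a minimal root-only sketch, with set_user_access
being a made-up name:

        #include <stdio.h>

        static int set_user_access(int mode)
        {
                FILE *f = fopen("/proc/sys/kernel/perf_user_access", "w");

                if (!f)
                        return -1;
                /* 0 = no user access, 1 = user access, 2 = legacy */
                fprintf(f, "%d\n", mode);
                return fclose(f);
        }

Note that, per pmu_sbi_event_init() above, the mode is latched when an
event is created, so already-open events keep their behaviour.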