On 11/02/15 18:12, Peter Zijlstra wrote: > > How about something like the below? I _think_ it should mostly work for > x86, where the tsc is a 64bit wide cycle counter. It would have to be based on CLOCK_MONOTONIC_RAW, not CLOCK_MONOTONIC, and you would have to check the clocksource is TSC. Why is CLOCK_MONOTONIC preferred anyway - I would have thought any adjustment would skew performance timings? > > I suppose we should extend the perf userpage time data with > time_last_cycle and time_mask if/when we want to make this work on > something with a short counter. > > Of course, at that time we also need to somehow deal with that counter > wrapping, it's hardly practical to go iterate all possible userpg > instances from a timer handler. > > > --- > Documentation/kernel-parameters.txt | 9 +++++++ > arch/x86/kernel/cpu/perf_event.c | 44 ++++++++++++++++++++++++--------- > include/linux/perf_event.h | 6 +++++ > kernel/events/core.c | 49 ++++++++++++++++++++++++++++++++++--- > kernel/time/timekeeping.c | 30 +++++++++++++++++++++++ > 5 files changed, 123 insertions(+), 15 deletions(-) > > diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt > index 176d4fe4f076..52255676b6e2 100644 > --- a/Documentation/kernel-parameters.txt > +++ b/Documentation/kernel-parameters.txt > @@ -91,6 +91,7 @@ the beginning of each description states the restrictions within which a > NUMA NUMA support is enabled. > NFS Appropriate NFS support is enabled. > OSS OSS sound support is enabled. > + PERF Performance events and counters support is enabled. > PV_OPS A paravirtualized kernel is enabled. > PARIDE The ParIDE (parallel port IDE) subsystem is enabled. > PARISC The PA-RISC architecture is enabled. > @@ -2796,6 +2797,14 @@ bytes respectively. Such letter suffixes can also be entirely omitted. > allocator. This parameter is primarily for debugging > and performance comparison. 
> > + perf_use_local_clock > + [PERF] > + Use local_clock() as a source for perf timestamps > + generation. This was the default behaviour and > + this parameter can be used to maintain backward > + compatibility or on older hardware with an expensive > + monotonic clock source. > + > pf. [PARIDE] > See Documentation/blockdev/paride.txt. > > diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c > index b71a7f86d68a..436a66632f76 100644 > --- a/arch/x86/kernel/cpu/perf_event.c > +++ b/arch/x86/kernel/cpu/perf_event.c > @@ -1952,6 +1952,35 @@ static struct pmu pmu = { > .flush_branch_stack = x86_pmu_flush_branch_stack, > }; > > +static void local_clock_user_time(struct perf_event_mmap_page *userpg, u64 now) > +{ > + data = cyc2ns_read_begin(); > + > + userpg->cap_user_time = 1; > + userpg->time_mult = data->cyc2ns_mul; > + userpg->time_shift = data->cyc2ns_shift; > + userpg->time_offset = data->cyc2ns_offset - now; > + > + userpg->cap_user_time_zero = 1; > + userpg->time_zero = data->cyc2ns_offset; > + > + cyc2ns_read_end(data); > +} > + > +extern void notrace __ktime_get_mono_fast(u64 *offset, u32 *mult, u16 *shift); > + > +static void ktime_fast_mono_user_time(struct perf_event_mmap_page *userpg, u64 now) > +{ > + userpg->cap_user_time = 1; > + userpg->cap_user_time_zero = 1; > + > + __ktime_get_mono_fast(&userpg->time_zero, > + &userpg->time_mult, > + &userpg->time_shift); > + > + userpg->offset = userpg->time_zero - now; > +} > + > void arch_perf_update_userpage(struct perf_event *event, > struct perf_event_mmap_page *userpg, u64 now) > { > @@ -1966,17 +1995,10 @@ void arch_perf_update_userpage(struct perf_event *event, > if (!sched_clock_stable()) > return; > > - data = cyc2ns_read_begin(); > - > - userpg->cap_user_time = 1; > - userpg->time_mult = data->cyc2ns_mul; > - userpg->time_shift = data->cyc2ns_shift; > - userpg->time_offset = data->cyc2ns_offset - now; > - > - userpg->cap_user_time_zero = 1; > - userpg->time_zero = 
data->cyc2ns_offset; > - > - cyc2ns_read_end(data); > + if (static_key_false(&perf_use_local_clock_key)) > + local_clock_user_time(userpg, now); > + else > + ktime_fast_mono_user_time(userpg, now); > } > > /* > diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h > index 33262004c310..1d61f968113a 100644 > --- a/include/linux/perf_event.h > +++ b/include/linux/perf_event.h > @@ -561,6 +561,12 @@ extern void perf_pmu_enable(struct pmu *pmu); > extern int perf_event_task_disable(void); > extern int perf_event_task_enable(void); > extern int perf_event_refresh(struct perf_event *event, int refresh); > + > +extern struct static_key perf_use_local_clock_key = STATIC_KEY_INIT_FALSE; > +extern void __weak > +arch_perf_update_userpage(struct perf_event *event, > + struct perf_event_mmap_page *userpg, u64 now); > + > extern void perf_event_update_userpage(struct perf_event *event); > extern int perf_event_release_kernel(struct perf_event *event); > extern struct perf_event * > diff --git a/kernel/events/core.c b/kernel/events/core.c > index 13209a90b751..7bad385103ea 100644 > --- a/kernel/events/core.c > +++ b/kernel/events/core.c > @@ -42,6 +42,8 @@ > #include <linux/module.h> > #include <linux/mman.h> > #include <linux/compat.h> > +#include <linux/sysctl.h> > +#include <linux/jump_label.h> > > #include "internal.h" > > @@ -322,9 +324,43 @@ extern __weak const char *perf_pmu_name(void) > return "pmu"; > } > > +struct static_key perf_use_local_clock_key = STATIC_KEY_INIT_FALSE; > +static bool perf_use_local_clock_param __initdata; > +static int __init perf_use_local_clock_setup(char *__unused) > +{ > + perf_use_local_clock_param = true; > + return 1; > +} > +__setup("perf_use_local_clock", perf_use_local_clock_setup); > + > +static int sysctl_perf_sample_time_clk_id = CLOCK_MONOTONIC; > + > +static struct ctl_table perf_sample_time_kern_table[] = { > + { > + .procname = "perf_sample_time_clk_id", > + .data = &sysctl_perf_sample_time_clk_id, > + .maxlen = 
sizeof(int), > + .mode = 0444, > + .proc_handler = proc_dointvec, > + }, > + {} > +}; > + > +static struct ctl_table perf_sample_time_root_table[] = { > + { > + .procname = "kernel", > + .mode = 0555, > + .child = perf_sample_time_kern_table, > + }, > + {} > +}; > + > static inline u64 perf_clock(void) > { > - return local_clock(); > + if (static_key_false(&perf_use_local_clock_key)) > + return local_clock(); > + else > + return ktime_get_mono_fast_ns(); > } > > static inline struct perf_cpu_context * > @@ -4101,8 +4137,8 @@ static void perf_event_init_userpage(struct perf_event *event) > rcu_read_unlock(); > } > > -void __weak arch_perf_update_userpage( > - struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now) > +void __weak arch_perf_update_userpage(struct perf_event *event, > + struct perf_event_mmap_page *userpg, u64 now) > { > } > > @@ -4487,7 +4523,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) > if (vma->vm_flags & VM_WRITE) > flags |= RING_BUFFER_WRITABLE; > > - rb = rb_alloc(nr_pages, > + rb = rb_alloc(nr_pages, > event->attr.watermark ? 
event->attr.wakeup_watermark : 0, > event->cpu, flags); > > @@ -8516,6 +8552,11 @@ void __init perf_event_init(void) > */ > BUILD_BUG_ON((offsetof(struct perf_event_mmap_page, data_head)) > != 1024); > + > + if (perf_use_local_clock_param) > + static_key_slow_inc(&perf_use_local_clock_key); > + else > + register_sysctl_table(perf_sample_time_root_table); > } > > static int __init perf_event_sysfs_init(void) > diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c > index b124af259800..37bed5931a91 100644 > --- a/kernel/time/timekeeping.c > +++ b/kernel/time/timekeeping.c > @@ -334,6 +334,36 @@ u64 notrace ktime_get_mono_fast_ns(void) > } > EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns); > > +void notrace __ktime_get_mono_fast(u64 *offset, u32 *mult, u16 *shift) > +{ > + struct tk_read_base *tkr; > + unsigned int seq; > + cycle_t cycle_now, delta; > + u64 nsecs, now; > + > + do { > + seq = raw_read_seqcount(&tk_fast_mono.seq); > + tkr = tk_fast_mono.base + (seq & 0x01); > + > + cycle_now = tkr->read(tkr->clock); > + delta = clocksource_delta(cycle_now, tkr->cycle_last, tkr->mask); > + > + nsec = delta * tkr->mult + tkr->xtime_nsec; > + nsec >>= tkr->shift; > + nsec += arch_gettimeoffset(); > + > + now = ktime_to_ns(tkr->base_mono) + nsec; > + > + *mult = tkr->mult; > + *shift = tkr->shift; > + > + nsec = mul_u64_u32_shr(cycle_now, tkr->mult, tkr->shift); > + > + *offset = now - nsec; > + > + } while (read_seqcount_retry(&tk_fast_mono.seq, seq)); > +} > + > #ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD > > static inline void update_vsyscall(struct timekeeper *tk) > > -- To unsubscribe from this list: send the line "unsubscribe linux-api" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html