On 23/01/30 04:22, Martin KaFai Lau wrote: > On 1/27/23 10:14 AM, Anton Protopopov wrote: > > +/* The number of slots to store times */ > > +#define NR_SLOTS 32 > > + > > +/* Configured by userspace */ > > +u64 nr_entries; > > +u64 nr_loops; > > +u32 __attribute__((__aligned__(8))) key[256]; > > + > > +/* Filled by us */ > > +u64 __attribute__((__aligned__(256))) percpu_times_index[NR_SLOTS]; > +u64 __attribute__((__aligned__(256))) percpu_times[256][NR_SLOTS]; > > + > > +static inline void patch_key(u32 i) > > +{ > > +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ > > + key[0] = i + 1; > > +#else > > + key[0] = __builtin_bswap32(i + 1); > > +#endif > > + /* the rest of key is random and is configured by userspace */ > > +} > > + > > +static int lookup_callback(__u32 index, u32 *unused) > > +{ > > + patch_key(index); > > + return bpf_map_lookup_elem(&hash_map_bench, key) ? 0 : 1; > > +} > > + > > +static int loop_lookup_callback(__u32 index, u32 *unused) > > +{ > > + return bpf_loop(nr_entries, lookup_callback, NULL, 0) ? 0 : 1; > > +} > > + > > +SEC("fentry/" SYS_PREFIX "sys_getpgid") > > +int benchmark(void *ctx) > > +{ > > + u32 cpu = bpf_get_smp_processor_id(); > > + u32 times_index; > > + u64 start_time; > > + > > + times_index = percpu_times_index[cpu & 255] % NR_SLOTS; > > percpu_times_index only has NR_SLOTS (32) elements? Yes, the idea was the following. One measurement (bpf prog execution) takes about 20-80 ms (depending on the key/map size). So in 2-3 seconds we can get about NR_SLOTS elements. For me 32 looked like enough to get stats for this benchmark. Do you think this is better to make the NR_SLOTS bigger/configurable? > > + start_time = bpf_ktime_get_ns(); > > + bpf_loop(nr_loops, loop_lookup_callback, NULL, 0); > > + percpu_times[cpu & 255][times_index] = bpf_ktime_get_ns() - start_time; > > + percpu_times_index[cpu & 255] += 1; > > + return 0; > > +} >