----- On Jul 28, 2016, at 12:43 AM, Boqun Feng boqun.feng@xxxxxxxxx wrote: > On Thu, Jul 28, 2016 at 02:59:45AM +0000, Mathieu Desnoyers wrote: >> ----- On Jul 27, 2016, at 11:05 AM, Boqun Feng boqun.feng@xxxxxxxxx wrote: >> >> > As rseq syscall is enabled on PPC, implement the self-tests on PPC to >> > verify the implementation of the syscall. >> > >> > Please note we only support 32bit userspace on BE kernel. >> > >> > Signed-off-by: Boqun Feng <boqun.feng@xxxxxxxxx> >> > --- >> > tools/testing/selftests/rseq/param_test.c | 14 ++++ >> > tools/testing/selftests/rseq/rseq.h | 120 ++++++++++++++++++++++++++++++ >> > 2 files changed, 134 insertions(+) >> > >> > diff --git a/tools/testing/selftests/rseq/param_test.c >> > b/tools/testing/selftests/rseq/param_test.c >> > index db25e0a818e5..e2cb1b165f81 100644 >> > --- a/tools/testing/selftests/rseq/param_test.c >> > +++ b/tools/testing/selftests/rseq/param_test.c >> > @@ -75,6 +75,20 @@ static __thread unsigned int yield_mod_cnt, nr_retry; >> > "bne 222b\n\t" \ >> > "333:\n\t" >> > >> > +#elif __PPC__ >> > +#define INJECT_ASM_REG "r18" >> > + >> > +#define RSEQ_INJECT_CLOBBER \ >> > + , INJECT_ASM_REG >> > + >> > +#define RSEQ_INJECT_ASM(n) \ >> > + "lwz %%" INJECT_ASM_REG ", %[loop_cnt_" #n "]\n\t" \ >> > + "cmpwi %%" INJECT_ASM_REG ", 0\n\t" \ >> > + "beq 333f\n\t" \ >> > + "222:\n\t" \ >> > + "subic. 
%%" INJECT_ASM_REG ", %%" INJECT_ASM_REG ", 1\n\t" \ >> > + "bne 222b\n\t" \ >> > + "333:\n\t" >> > #else >> > #error unsupported target >> > #endif >> > diff --git a/tools/testing/selftests/rseq/rseq.h >> > b/tools/testing/selftests/rseq/rseq.h >> > index 791e14cf42ae..dea0bea52566 100644 >> > --- a/tools/testing/selftests/rseq/rseq.h >> > +++ b/tools/testing/selftests/rseq/rseq.h >> > @@ -138,6 +138,35 @@ do { \ >> > #define has_fast_acquire_release() 0 >> > #define has_single_copy_load_64() 1 >> > >> > +#elif __PPC__ >> > +#define smp_mb() __asm__ __volatile__ ("sync" : : : "memory") >> > +#define smp_lwsync() __asm__ __volatile__ ("lwsync" : : : "memory") >> > +#define smp_rmb() smp_lwsync() >> > +#define smp_wmb() smp_lwsync() >> > + >> > +#define smp_load_acquire(p) \ >> > +__extension__ ({ \ >> > + __typeof(*p) ____p1 = READ_ONCE(*p); \ >> > + smp_lwsync(); \ >> > + ____p1; \ >> > +}) >> > + >> > +#define smp_acquire__after_ctrl_dep() smp_lwsync() >> > + >> > +#define smp_store_release(p, v) \ >> > +do { \ >> > + smp_lwsync(); \ >> > + WRITE_ONCE(*p, v); \ >> > +} while (0) >> > + >> > +#define has_fast_acquire_release() 1 >> >> Can you check if defining has_fast_acquire_release() to 0 improves >> performance significantly? It turns the smp_lwsync() into a >> compiler barrier() on the smp_load_acquire() side (fast-path), and >> turns the smp_lwsync() into a membarrier system call instead of the >> matching smp_store_release() (slow path). >> > > Good point. Here are the numbers: > > Power8 PSeries KVM Guest (64 VCPUs, the host has 16 cores, 128 hardware > threads): > > Counter increment speed (ns/increment) > 1 thread 2 threads 4 threads 8 threads 16 threads 32 threads > global increment (baseline) 6.5 N/A N/A N/A > N/A N/A > percpu rseq increment 7.0 7.0 7.2 7.2 > 9.3 14.5 > percpu rseq spinlock 18.5 18.5 18.6 18.8 > 25.5 52.7 > > So it looks like defining has_fast_acquire_release() to 0 could benefit the > cases with more threads in the current benchmark. 
I will send an updated > patch doing this. Good to know the lwsync barrier overhead kicks in at that level of workload on Power8. > > And as discussed on IRC, I will also remove the jump from rseq_finish() > fast-path in powerpc asm in the updated patch as you did for x86 and > ARM. All right, thanks! Mathieu > > Regards, > Boqun > > >> Thanks, >> >> Mathieu >> >> > + >> > +# if __PPC64__ >> > +# define has_single_copy_load_64() 1 >> > +# else >> > +# define has_single_copy_load_64() 0 >> > +# endif >> > + >> > #else >> > #error unsupported target >> > #endif >> > @@ -404,6 +433,97 @@ bool rseq_finish(struct rseq_lock *rlock, >> > : succeed >> > ); >> > } >> > +#elif __PPC64__ >> > + { >> > + /* >> > + * The __rseq_table section can be used by debuggers to better >> > + * handle single-stepping through the restartable critical >> > + * sections. >> > + */ >> > + __asm__ __volatile__ goto ( >> > + ".pushsection __rseq_table, \"aw\"\n\t" >> > + ".balign 8\n\t" >> > + "4:\n\t" >> > + ".quad 1f, 2f, 3f\n\t" >> > + ".popsection\n\t" >> > + "1:\n\t" >> > + RSEQ_INJECT_ASM(1) >> > + "lis %%r17, (4b)@highest\n\t" >> > + "ori %%r17, %%r17, (4b)@higher\n\t" >> > + "rldicr %%r17, %%r17, 32, 31\n\t" >> > + "oris %%r17, %%r17, (4b)@h\n\t" >> > + "ori %%r17, %%r17, (4b)@l\n\t" >> > + "std %%r17, 0(%[rseq_cs])\n\t" >> > + RSEQ_INJECT_ASM(2) >> > + "lwz %%r17, %[current_event_counter]\n\t" >> > + "li %%r16, 0\n\t" >> > + "cmpw cr7, %[start_event_counter], %%r17\n\t" >> > + "bne cr7, 3f\n\t" >> > + RSEQ_INJECT_ASM(3) >> > + "std %[to_write], 0(%[target])\n\t" >> > + "2:\n\t" >> > + RSEQ_INJECT_ASM(4) >> > + "std %%r16, 0(%[rseq_cs])\n\t" >> > + "b %l[succeed]\n\t" >> > + "3:\n\t" >> > + "li %%r16, 0\n\t" >> > + "std %%r16, 0(%[rseq_cs])\n\t" >> > + : /* no outputs */ >> > + : [start_event_counter]"r"(start_value.event_counter), >> > + [current_event_counter]"m"(start_value.rseqp->abi.u.e.event_counter), >> > + [to_write]"r"(to_write), >> > + [target]"b"(p), >> > + 
[rseq_cs]"b"(&start_value.rseqp->abi.rseq_cs) >> > + RSEQ_INJECT_INPUT >> > + : "r16", "r17", "memory", "cc" >> > + RSEQ_INJECT_CLOBBER >> > + : succeed >> > + ); >> > + } >> > +#elif __PPC__ >> > + { >> > + /* >> > + * The __rseq_table section can be used by debuggers to better >> > + * handle single-stepping through the restartable critical >> > + * sections. >> > + */ >> > + __asm__ __volatile__ goto ( >> > + ".pushsection __rseq_table, \"aw\"\n\t" >> > + ".balign 8\n\t" >> > + "4:\n\t" >> > + ".long 0x0, 1f, 0x0, 2f, 0x0, 3f\n\t" /* 32 bit only supported on BE */ >> > + ".popsection\n\t" >> > + "1:\n\t" >> > + RSEQ_INJECT_ASM(1) >> > + "lis %%r17, (4b)@ha\n\t" >> > + "addi %%r17, %%r17, (4b)@l\n\t" >> > + "stw %%r17, 0(%[rseq_cs])\n\t" >> > + RSEQ_INJECT_ASM(2) >> > + "lwz %%r17, %[current_event_counter]\n\t" >> > + "li %%r16, 0\n\t" >> > + "cmpw cr7, %[start_event_counter], %%r17\n\t" >> > + "bne cr7, 3f\n\t" >> > + RSEQ_INJECT_ASM(3) >> > + "stw %[to_write], 0(%[target])\n\t" >> > + "2:\n\t" >> > + RSEQ_INJECT_ASM(4) >> > + "stw %%r16, 0(%[rseq_cs])\n\t" >> > + "b %l[succeed]\n\t" >> > + "3:\n\t" >> > + "li %%r16, 0\n\t" >> > + "stw %%r16, 0(%[rseq_cs])\n\t" >> > + : /* no outputs */ >> > + : [start_event_counter]"r"(start_value.event_counter), >> > + [current_event_counter]"m"(start_value.rseqp->abi.u.e.event_counter), >> > + [to_write]"r"(to_write), >> > + [target]"b"(p), >> > + [rseq_cs]"b"(&start_value.rseqp->abi.rseq_cs) >> > + RSEQ_INJECT_INPUT >> > + : "r16", "r17", "memory", "cc" >> > + RSEQ_INJECT_CLOBBER >> > + : succeed >> > + ); >> > + } >> > #else >> > #error unsupported target >> > #endif >> > -- >> > 2.9.0 >> >> -- >> Mathieu Desnoyers >> EfficiOS Inc. > > http://www.efficios.com -- Mathieu Desnoyers EfficiOS Inc. http://www.efficios.com -- To unsubscribe from this list: send the line "unsubscribe linux-api" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html