On Thu, Jan 07, 2016 at 01:05:35PM -0800, H. Peter Anvin wrote: > On 01/07/16 11:44, Chris Wilson wrote: > > > > Now I feel silly. Looking at the .s, there is no difference with the > > addition of the barrier to clflush_cache_range(). And sure enough > > letting the test run for longer, we see a failure. I fell for a placebo. > > > > The failing assertion is always on the last cacheline and is always one > > value behind. Oh well, back to wondering where we miss the flush. > > -Chris > > > > Could you include the assembly here? Sure, here you go: .LHOTB18: .p2align 4,,15 .globl clflush_cache_range .type clflush_cache_range, @function clflush_cache_range: .LFB2505: .loc 1 131 0 .cfi_startproc .LVL194: 1: call __fentry__ .loc 1 132 0 movzwl boot_cpu_data+198(%rip), %eax .loc 1 131 0 pushq %rbp .cfi_def_cfa_offset 16 .cfi_offset 6, -16 .loc 1 133 0 movl %esi, %esi .LVL195: addq %rdi, %rsi .LVL196: .loc 1 131 0 movq %rsp, %rbp .cfi_def_cfa_register 6 .loc 1 132 0 subl $1, %eax cltq .LVL197: .loc 1 136 0 #APP # 136 "arch/x86/mm/pageattr.c" 1 mfence # 0 "" 2 .loc 1 138 0 #NO_APP notq %rax .LVL198: andq %rax, %rdi .LVL199: cmpq %rdi, %rsi jbe .L216 .L217: .LBB1741: .LBB1742: .loc 8 198 0 #APP # 198 "./arch/x86/include/asm/special_insns.h" 1 661: .byte 0x3e; clflush (%rdi) 662: .skip -(((6651f-6641f)-(662b-661b)) > 0) * ((6651f-6641f)-(662b-661b)),0x90 663: .pushsection .altinstructions,"a" .long 661b - . .long 6641f - . .word ( 9*32+23) .byte 663b-661b .byte 6651f-6641f .byte 663b-662b .popsection .pushsection .altinstr_replacement, "ax" 6641: .byte 0x66; clflush (%rdi) 6651: .popsection # 0 "" 2 #NO_APP .LBE1742: .LBE1741: .loc 1 141 0 .loc 1 139 0 movzwl boot_cpu_data+198(%rip), %eax addq %rax, %rdi .loc 1 138 0 cmpq %rdi, %rsi ja .L217 .L216: .loc 1 144 0 #APP # 144 "arch/x86/mm/pageattr.c" 1 mfence # 0 "" 2 .loc 1 145 0 #NO_APP popq %rbp .cfi_restore 6 .cfi_def_cfa 7, 8 ret .cfi_endproc .LFE2505: .size clflush_cache_range, .-clflush_cache_range .section .text.unlikely Whilst you are looking at this asm, note that we reload boot_cpu_data.x86_cflush_size everytime around the loop. That's a small but noticeable extra cost (especially when we are only flushing a single cacheline). diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index a3137a4..2cd2b4b 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -129,14 +129,13 @@ within(unsigned long addr, unsigned long start, unsigned long end) */ void clflush_cache_range(void *vaddr, unsigned int size) { - unsigned long clflush_mask = boot_cpu_data.x86_clflush_size - 1; + unsigned long clflush_size = boot_cpu_data.x86_clflush_size; void *vend = vaddr + size; - void *p; + void *p = (void *)((unsigned long)vaddr & ~(clflush_size - 1)); mb(); - for (p = (void *)((unsigned long)vaddr & ~clflush_mask); - p < vend; p += boot_cpu_data.x86_clflush_size) + for (; p < vend; p += clflush_size) clflushopt(p); mb(); -Chris -- Chris Wilson, Intel Open Source Technology Centre _______________________________________________ dri-devel mailing list dri-devel@xxxxxxxxxxxxxxxxxxxxx http://lists.freedesktop.org/mailman/listinfo/dri-devel