> Before I get to the vfs layer, there is a significant loss in the > memory allocator because of memcg -- it takes several irq off/on trips > for every alloc (needed to grab struct file *). I have a plan for what to > do about it (handle stuff with local cmpxchg (note no lock prefix)), > which I'm trying to get around to. Apart from that, you may note the > allocator fast path performs a 16-byte cmpxchg, which is again dog > slow and executes twice (once for the file obj, another time for the > namei buffer). Someone(tm) should patch it up and I have some vague > ideas, but no idea when I can take a serious stab. I just LBR-sampled it on my Skylake and it doesn't look particularly slow. As you can see, the whole massive block, including the CMPXCHG16B, gets an IPC of 2.7, which is rather good. If you see lots of cycles on it, it's likely a cache miss. kmem_cache_free: ffffffff9944ce20 nop %edi, %edx ffffffff9944ce24 nopl %eax, (%rax,%rax,1) ffffffff9944ce29 pushq %rbp ffffffff9944ce2a mov %rdi, %rdx ffffffff9944ce2d mov %rsp, %rbp ffffffff9944ce30 pushq %r15 ffffffff9944ce32 pushq %r14 ffffffff9944ce34 pushq %r13 ffffffff9944ce36 pushq %r12 ffffffff9944ce38 mov $0x80000000, %r12d ffffffff9944ce3e pushq %rbx ffffffff9944ce3f mov %rsi, %rbx ffffffff9944ce42 and $0xfffffffffffffff0, %rsp ffffffff9944ce46 sub $0x10, %rsp ffffffff9944ce4a movq %gs:0x28, %rax ffffffff9944ce53 movq %rax, 0x8(%rsp) ffffffff9944ce58 xor %eax, %eax ffffffff9944ce5a add %rsi, %r12 ffffffff9944ce5d jb 0xffffffff9944d1ea ffffffff9944ce63 mov $0xffffffff80000000, %rax ffffffff9944ce6a xor %r13d, %r13d ffffffff9944ce6d subq 0x17b068c(%rip), %rax ffffffff9944ce74 add %r12, %rax ffffffff9944ce77 shr $0xc, %rax ffffffff9944ce7b shl $0x6, %rax ffffffff9944ce7f addq 0x17b066a(%rip), %rax ffffffff9944ce86 movq 0x8(%rax), %rcx ffffffff9944ce8a test $0x1, %cl ffffffff9944ce8d jnz 0xffffffff9944d15c ffffffff9944ce93 nopl %eax, (%rax,%rax,1) ffffffff9944ce98 movq (%rax), %rcx ffffffff9944ce9b and $0x8, %ch ffffffff9944ce9e jz 
0xffffffff9944cfea ffffffff9944cea4 test %rax, %rax ffffffff9944cea7 jz 0xffffffff9944cfea ffffffff9944cead movq 0x8(%rax), %r14 ffffffff9944ceb1 test %r14, %r14 ffffffff9944ceb4 jz 0xffffffff9944cfac ffffffff9944ceba cmp %r14, %rdx ffffffff9944cebd jnz 0xffffffff9944d165 ffffffff9944cec3 test %r14, %r14 ffffffff9944cec6 jz 0xffffffff9944cfac ffffffff9944cecc movq 0x8(%rbp), %r15 ffffffff9944ced0 nopl %eax, (%rax,%rax,1) ffffffff9944ced5 movq 0x1fe5134(%rip), %rax ffffffff9944cedc test %r13, %r13 ffffffff9944cedf jnz 0xffffffff9944ceef ffffffff9944cee1 mov $0xffffffff80000000, %rax ffffffff9944cee8 subq 0x17b0611(%rip), %rax ffffffff9944ceef add %rax, %r12 ffffffff9944cef2 shr $0xc, %r12 ffffffff9944cef6 shl $0x6, %r12 ffffffff9944cefa addq 0x17b05ef(%rip), %r12 ffffffff9944cf01 movq 0x8(%r12), %rax ffffffff9944cf06 mov %r12, %r13 ffffffff9944cf09 test $0x1, %al ffffffff9944cf0b jnz 0xffffffff9944d1b1 ffffffff9944cf11 nopl %eax, (%rax,%rax,1) ffffffff9944cf16 movq (%r13), %rax ffffffff9944cf1a movq %rbx, (%rsp) ffffffff9944cf1e test $0x8, %ah ffffffff9944cf21 mov $0x0, %eax ffffffff9944cf26 cmovz %rax, %r13 ffffffff9944cf2a data16 nop ffffffff9944cf2c movq 0x38(%r13), %r8 ffffffff9944cf30 cmp $0x3, %r8 ffffffff9944cf34 jnbe 0xffffffff9944d1ca ffffffff9944cf3a nopl %eax, (%rax,%rax,1) ffffffff9944cf3f movq 0x23d6f72(%rip), %rax ffffffff9944cf46 mov %rbx, %rdx ffffffff9944cf49 sub %rax, %rdx ffffffff9944cf4c cmp $0x1fffff, %rdx ffffffff9944cf53 jbe 0xffffffff9944d03a ffffffff9944cf59 movq (%r14), %rax ffffffff9944cf5c addq %gs:0x66bccab4(%rip), %rax ffffffff9944cf64 movq 0x8(%rax), %rdx ffffffff9944cf68 cmpq %r13, 0x10(%rax) ffffffff9944cf6c jnz 0xffffffff9944d192 ffffffff9944cf72 movl 0x28(%r14), %ecx ffffffff9944cf76 movq (%rax), %rax ffffffff9944cf79 add %rbx, %rcx ffffffff9944cf7c cmp %rbx, %rax ffffffff9944cf7f jz 0xffffffff9944d1ba ffffffff9944cf85 movq 0xb8(%r14), %rsi ffffffff9944cf8c mov %rcx, %rdi ffffffff9944cf8f bswap %rdi ffffffff9944cf92 xor %rax, %rsi 
ffffffff9944cf95 xor %rdi, %rsi ffffffff9944cf98 movq %rsi, (%rcx) ffffffff9944cf9b leaq 0x2000(%rdx), %rcx ffffffff9944cfa2 movq (%r14), %rsi ffffffff9944cfa5 cmpxchg16bx %gs:(%rsi) ffffffff9944cfaa jnz 0xffffffff9944cf59 ffffffff9944cfac movq 0x8(%rsp), %rax ffffffff9944cfb1 subq %gs:0x28, %rax ffffffff9944cfba jnz 0xffffffff9944d1fc ffffffff9944cfc0 leaq -0x28(%rbp), %rsp ffffffff9944cfc4 popq %rbx ffffffff9944cfc5 popq %r12 ffffffff9944cfc7 popq %r13 ffffffff9944cfc9 popq %r14 ffffffff9944cfcb popq %r15 ffffffff9944cfcd popq %rbp ffffffff9944cfce retq # PRED 38 cycles [126] 2.74 IPC <-------------