ip_queue_xmit(): Re: [Bug #11308] tbench regression on each kernel release from 2.6.22 -> 2.6.28

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



* Ingo Molnar <mingo@xxxxxxx> wrote:

> 100.000000 total
> ................
>   3.356152 ip_queue_xmit

                      hits (335615 total)
                 .........
ffffffff804b7045:     1001 <ip_queue_xmit>:
ffffffff804b7045:     1001 	41 57                	push   %r15
ffffffff804b7047:    36698 	41 56                	push   %r14
ffffffff804b7049:        0 	49 89 fe             	mov    %rdi,%r14
ffffffff804b704c:        0 	41 55                	push   %r13
ffffffff804b704e:      447 	41 54                	push   %r12
ffffffff804b7050:        0 	55                   	push   %rbp
ffffffff804b7051:        4 	53                   	push   %rbx
ffffffff804b7052:      465 	48 83 ec 68          	sub    $0x68,%rsp
ffffffff804b7056:        1 	89 74 24 08          	mov    %esi,0x8(%rsp)
ffffffff804b705a:      486 	48 8b 47 28          	mov    0x28(%rdi),%rax
ffffffff804b705e:        0 	48 8b 6f 10          	mov    0x10(%rdi),%rbp
ffffffff804b7062:        7 	48 85 c0             	test   %rax,%rax
ffffffff804b7065:      480 	48 89 44 24 58       	mov    %rax,0x58(%rsp)
ffffffff804b706a:        0 	4c 8b bd 48 02 00 00 	mov    0x248(%rbp),%r15
ffffffff804b7071:        7 	0f 85 0d 01 00 00    	jne    ffffffff804b7184 <ip_queue_xmit+0x13f>
ffffffff804b7077:      452 	31 f6                	xor    %esi,%esi
ffffffff804b7079:        0 	48 89 ef             	mov    %rbp,%rdi
ffffffff804b707c:        5 	e8 c1 eb fc ff       	callq  ffffffff80485c42 <__sk_dst_check>
ffffffff804b7081:      434 	48 85 c0             	test   %rax,%rax
ffffffff804b7084:       54 	48 89 44 24 58       	mov    %rax,0x58(%rsp)
ffffffff804b7089:        0 	0f 85 e0 00 00 00    	jne    ffffffff804b716f <ip_queue_xmit+0x12a>
ffffffff804b708f:        0 	4d 85 ff             	test   %r15,%r15
ffffffff804b7092:        0 	44 8b ad 30 02 00 00 	mov    0x230(%rbp),%r13d
ffffffff804b7099:        0 	74 0a                	je     ffffffff804b70a5 <ip_queue_xmit+0x60>
ffffffff804b709b:        0 	41 80 7f 05 00       	cmpb   $0x0,0x5(%r15)
ffffffff804b70a0:        0 	74 03                	je     ffffffff804b70a5 <ip_queue_xmit+0x60>
ffffffff804b70a2:        0 	45 8b 2f             	mov    (%r15),%r13d
ffffffff804b70a5:        0 	8b 85 3c 02 00 00    	mov    0x23c(%rbp),%eax
ffffffff804b70ab:        0 	48 8d b5 10 01 00 00 	lea    0x110(%rbp),%rsi
ffffffff804b70b2:        0 	44 8b 65 04          	mov    0x4(%rbp),%r12d
ffffffff804b70b6:        0 	bf 0d 00 00 00       	mov    $0xd,%edi
ffffffff804b70bb:        0 	89 44 24 0c          	mov    %eax,0xc(%rsp)
ffffffff804b70bf:        0 	8a 9d 54 02 00 00    	mov    0x254(%rbp),%bl
ffffffff804b70c5:        0 	e8 9a df ff ff       	callq  ffffffff804b5064 <constant_test_bit>
ffffffff804b70ca:        0 	31 d2                	xor    %edx,%edx
ffffffff804b70cc:        0 	48 8d 7c 24 10       	lea    0x10(%rsp),%rdi
ffffffff804b70d1:        0 	41 89 c3             	mov    %eax,%r11d
ffffffff804b70d4:        0 	fc                   	cld    
ffffffff804b70d5:        0 	89 d0                	mov    %edx,%eax
ffffffff804b70d7:        0 	b9 10 00 00 00       	mov    $0x10,%ecx
ffffffff804b70dc:        0 	44 8a 45 39          	mov    0x39(%rbp),%r8b
ffffffff804b70e0:        0 	40 8a b5 57 02 00 00 	mov    0x257(%rbp),%sil
ffffffff804b70e7:        0 	44 8b 8d 50 02 00 00 	mov    0x250(%rbp),%r9d
ffffffff804b70ee:        0 	83 e3 1e             	and    $0x1e,%ebx
ffffffff804b70f1:        0 	44 8b 95 38 02 00 00 	mov    0x238(%rbp),%r10d
ffffffff804b70f8:        0 	44 09 db             	or     %r11d,%ebx
ffffffff804b70fb:        0 	f3 ab                	rep stos %eax,%es:(%rdi)
ffffffff804b70fd:        0 	40 c0 ee 05          	shr    $0x5,%sil
ffffffff804b7101:        0 	88 5c 24 24          	mov    %bl,0x24(%rsp)
ffffffff804b7105:        0 	48 8d 5c 24 10       	lea    0x10(%rsp),%rbx
ffffffff804b710a:        0 	83 e6 01             	and    $0x1,%esi
ffffffff804b710d:        0 	48 89 ef             	mov    %rbp,%rdi
ffffffff804b7110:        0 	44 88 44 24 40       	mov    %r8b,0x40(%rsp)
ffffffff804b7115:        0 	8b 44 24 0c          	mov    0xc(%rsp),%eax
ffffffff804b7119:        0 	40 88 74 24 41       	mov    %sil,0x41(%rsp)
ffffffff804b711e:        0 	48 89 de             	mov    %rbx,%rsi
ffffffff804b7121:        0 	66 44 89 4c 24 44    	mov    %r9w,0x44(%rsp)
ffffffff804b7127:        0 	66 44 89 54 24 46    	mov    %r10w,0x46(%rsp)
ffffffff804b712d:        0 	44 89 64 24 10       	mov    %r12d,0x10(%rsp)
ffffffff804b7132:        0 	44 89 6c 24 1c       	mov    %r13d,0x1c(%rsp)
ffffffff804b7137:        0 	89 44 24 20          	mov    %eax,0x20(%rsp)
ffffffff804b713b:        0 	e8 2d 9f e5 ff       	callq  ffffffff8031106d <security_sk_classify_flow>
ffffffff804b7140:        0 	48 8d 74 24 58       	lea    0x58(%rsp),%rsi
ffffffff804b7145:        0 	45 31 c0             	xor    %r8d,%r8d
ffffffff804b7148:        0 	48 89 e9             	mov    %rbp,%rcx
ffffffff804b714b:        0 	48 89 da             	mov    %rbx,%rdx
ffffffff804b714e:        0 	48 c7 c7 d0 15 ab 80 	mov    $0xffffffff80ab15d0,%rdi
ffffffff804b7155:        0 	e8 1a 91 ff ff       	callq  ffffffff804b0274 <ip_route_output_flow>
ffffffff804b715a:        0 	85 c0                	test   %eax,%eax
ffffffff804b715c:        0 	0f 85 9f 01 00 00    	jne    ffffffff804b7301 <ip_queue_xmit+0x2bc>
ffffffff804b7162:        0 	48 8b 74 24 58       	mov    0x58(%rsp),%rsi
ffffffff804b7167:        0 	48 89 ef             	mov    %rbp,%rdi
ffffffff804b716a:        0 	e8 a8 eb fc ff       	callq  ffffffff80485d17 <sk_setup_caps>
ffffffff804b716f:      441 	48 8b 44 24 58       	mov    0x58(%rsp),%rax
ffffffff804b7174:     1388 	48 85 c0             	test   %rax,%rax
ffffffff804b7177:        0 	74 07                	je     ffffffff804b7180 <ip_queue_xmit+0x13b>
ffffffff804b7179:        0 	f0 ff 80 b0 00 00 00 	lock incl 0xb0(%rax)
ffffffff804b7180:      556 	49 89 46 28          	mov    %rax,0x28(%r14)
ffffffff804b7184:     8351 	4d 85 ff             	test   %r15,%r15
ffffffff804b7187:        0 	be 14 00 00 00       	mov    $0x14,%esi
ffffffff804b718c:      461 	74 26                	je     ffffffff804b71b4 <ip_queue_xmit+0x16f>
ffffffff804b718e:        0 	41 f6 47 08 01       	testb  $0x1,0x8(%r15)
ffffffff804b7193:        0 	74 17                	je     ffffffff804b71ac <ip_queue_xmit+0x167>
ffffffff804b7195:        0 	48 8b 54 24 58       	mov    0x58(%rsp),%rdx
ffffffff804b719a:        0 	8b 82 28 01 00 00    	mov    0x128(%rdx),%eax
ffffffff804b71a0:        0 	39 82 1c 01 00 00    	cmp    %eax,0x11c(%rdx)
ffffffff804b71a6:        0 	0f 85 55 01 00 00    	jne    ffffffff804b7301 <ip_queue_xmit+0x2bc>
ffffffff804b71ac:        0 	41 0f b6 47 04       	movzbl 0x4(%r15),%eax
ffffffff804b71b1:        0 	8d 70 14             	lea    0x14(%rax),%esi
ffffffff804b71b4:       39 	4c 89 f7             	mov    %r14,%rdi
ffffffff804b71b7:      493 	e8 f8 18 fd ff       	callq  ffffffff80488ab4 <skb_push>
ffffffff804b71bc:        0 	4c 89 f7             	mov    %r14,%rdi
ffffffff804b71bf:     1701 	e8 99 df ff ff       	callq  ffffffff804b515d <skb_reset_network_header>
ffffffff804b71c4:      481 	0f b6 85 54 02 00 00 	movzbl 0x254(%rbp),%eax
ffffffff804b71cb:     4202 	41 8b 9e bc 00 00 00 	mov    0xbc(%r14),%ebx
ffffffff804b71d2:        3 	48 89 ef             	mov    %rbp,%rdi
ffffffff804b71d5:        0 	49 03 9e d0 00 00 00 	add    0xd0(%r14),%rbx
ffffffff804b71dc:      466 	80 cc 45             	or     $0x45,%ah
ffffffff804b71df:        7 	66 c1 c0 08          	rol    $0x8,%ax
ffffffff804b71e3:        0 	66 89 03             	mov    %ax,(%rbx)
ffffffff804b71e6:      492 	48 8b 74 24 58       	mov    0x58(%rsp),%rsi
ffffffff804b71eb:        3 	e8 a0 df ff ff       	callq  ffffffff804b5190 <ip_dont_fragment>
ffffffff804b71f0:     1405 	85 c0                	test   %eax,%eax
ffffffff804b71f2:     4391 	74 0f                	je     ffffffff804b7203 <ip_queue_xmit+0x1be>
ffffffff804b71f4:        0 	83 7c 24 08 00       	cmpl   $0x0,0x8(%rsp)
ffffffff804b71f9:      417 	75 08                	jne    ffffffff804b7203 <ip_queue_xmit+0x1be>
ffffffff804b71fb:      503 	66 c7 43 06 40 00    	movw   $0x40,0x6(%rbx)
ffffffff804b7201:     6743 	eb 06                	jmp    ffffffff804b7209 <ip_queue_xmit+0x1c4>
ffffffff804b7203:        0 	66 c7 43 06 00 00    	movw   $0x0,0x6(%rbx)
ffffffff804b7209:      118 	0f bf 85 40 02 00 00 	movswl 0x240(%rbp),%eax
ffffffff804b7210:    10867 	48 8b 54 24 58       	mov    0x58(%rsp),%rdx
ffffffff804b7215:      340 	85 c0                	test   %eax,%eax
ffffffff804b7217:        0 	79 06                	jns    ffffffff804b721f <ip_queue_xmit+0x1da>
ffffffff804b7219:   107464 	8b 82 9c 00 00 00    	mov    0x9c(%rdx),%eax
ffffffff804b721f:     4963 	88 43 08             	mov    %al,0x8(%rbx)
ffffffff804b7222:    26297 	8a 45 39             	mov    0x39(%rbp),%al
ffffffff804b7225:    76658 	4d 85 ff             	test   %r15,%r15
ffffffff804b7228:     1712 	88 43 09             	mov    %al,0x9(%rbx)
ffffffff804b722b:      148 	48 8b 44 24 58       	mov    0x58(%rsp),%rax
ffffffff804b7230:     2971 	8b 80 20 01 00 00    	mov    0x120(%rax),%eax
ffffffff804b7236:    14849 	89 43 0c             	mov    %eax,0xc(%rbx)
ffffffff804b7239:       84 	48 8b 44 24 58       	mov    0x58(%rsp),%rax
ffffffff804b723e:      360 	8b 80 1c 01 00 00    	mov    0x11c(%rax),%eax
ffffffff804b7244:      174 	89 43 10             	mov    %eax,0x10(%rbx)
ffffffff804b7247:       96 	74 32                	je     ffffffff804b727b <ip_queue_xmit+0x236>
ffffffff804b7249:        0 	41 8a 57 04          	mov    0x4(%r15),%dl
ffffffff804b724d:        0 	84 d2                	test   %dl,%dl
ffffffff804b724f:        0 	74 2a                	je     ffffffff804b727b <ip_queue_xmit+0x236>
ffffffff804b7251:        0 	c0 ea 02             	shr    $0x2,%dl
ffffffff804b7254:        0 	03 13                	add    (%rbx),%edx
ffffffff804b7256:        0 	8a 03                	mov    (%rbx),%al
ffffffff804b7258:        0 	45 31 c0             	xor    %r8d,%r8d
ffffffff804b725b:        0 	4c 89 fe             	mov    %r15,%rsi
ffffffff804b725e:        0 	4c 89 f7             	mov    %r14,%rdi
ffffffff804b7261:        0 	83 e0 f0             	and    $0xfffffffffffffff0,%eax
ffffffff804b7264:        0 	83 e2 0f             	and    $0xf,%edx
ffffffff804b7267:        0 	09 d0                	or     %edx,%eax
ffffffff804b7269:        0 	88 03                	mov    %al,(%rbx)
ffffffff804b726b:        0 	48 8b 4c 24 58       	mov    0x58(%rsp),%rcx
ffffffff804b7270:        0 	8b 95 30 02 00 00    	mov    0x230(%rbp),%edx
ffffffff804b7276:        0 	e8 e4 d8 ff ff       	callq  ffffffff804b4b5f <ip_options_build>
ffffffff804b727b:      541 	41 8b 86 c8 00 00 00 	mov    0xc8(%r14),%eax
ffffffff804b7282:      570 	31 d2                	xor    %edx,%edx
ffffffff804b7284:        0 	49 03 86 d0 00 00 00 	add    0xd0(%r14),%rax
ffffffff804b728b:       34 	8b 40 08             	mov    0x8(%rax),%eax
ffffffff804b728e:      496 	66 85 c0             	test   %ax,%ax
ffffffff804b7291:       11 	74 06                	je     ffffffff804b7299 <ip_queue_xmit+0x254>
ffffffff804b7293:        9 	0f b7 c0             	movzwl %ax,%eax
ffffffff804b7296:      495 	8d 50 ff             	lea    -0x1(%rax),%edx
ffffffff804b7299:        2 	f6 43 06 40          	testb  $0x40,0x6(%rbx)
ffffffff804b729d:        9 	48 8b 74 24 58       	mov    0x58(%rsp),%rsi
ffffffff804b72a2:      497 	74 34                	je     ffffffff804b72d8 <ip_queue_xmit+0x293>
ffffffff804b72a4:        8 	83 bd 30 02 00 00 00 	cmpl   $0x0,0x230(%rbp)
ffffffff804b72ab:       10 	74 23                	je     ffffffff804b72d0 <ip_queue_xmit+0x28b>
ffffffff804b72ad:     1044 	66 8b 85 52 02 00 00 	mov    0x252(%rbp),%ax
ffffffff804b72b4:        7 	66 c1 c0 08          	rol    $0x8,%ax
ffffffff804b72b8:        8 	66 89 43 04          	mov    %ax,0x4(%rbx)
ffffffff804b72bc:      432 	66 8b 85 52 02 00 00 	mov    0x252(%rbp),%ax
ffffffff804b72c3:        9 	ff c0                	inc    %eax
ffffffff804b72c5:       14 	01 d0                	add    %edx,%eax
ffffffff804b72c7:     1141 	66 89 85 52 02 00 00 	mov    %ax,0x252(%rbp)
ffffffff804b72ce:        7 	eb 10                	jmp    ffffffff804b72e0 <ip_queue_xmit+0x29b>
ffffffff804b72d0:        0 	66 c7 43 04 00 00    	movw   $0x0,0x4(%rbx)
ffffffff804b72d6:        0 	eb 08                	jmp    ffffffff804b72e0 <ip_queue_xmit+0x29b>
ffffffff804b72d8:        0 	48 89 df             	mov    %rbx,%rdi
ffffffff804b72db:        0 	e8 b7 9d ff ff       	callq  ffffffff804b1097 <__ip_select_ident>
ffffffff804b72e0:        6 	8b 85 54 01 00 00    	mov    0x154(%rbp),%eax
ffffffff804b72e6:      458 	4c 89 f7             	mov    %r14,%rdi
ffffffff804b72e9:        2 	41 89 46 78          	mov    %eax,0x78(%r14)
ffffffff804b72ed:        4 	8b 85 f0 01 00 00    	mov    0x1f0(%rbp),%eax
ffffffff804b72f3:      841 	41 89 86 b0 00 00 00 	mov    %eax,0xb0(%r14)
ffffffff804b72fa:       11 	e8 30 f2 ff ff       	callq  ffffffff804b652f <ip_local_out>
ffffffff804b72ff:        0 	eb 44                	jmp    ffffffff804b7345 <ip_queue_xmit+0x300>
ffffffff804b7301:        0 	65 48 8b 04 25 10 00 	mov    %gs:0x10,%rax
ffffffff804b7308:        0 	00 00 
ffffffff804b730a:        0 	8b 80 48 e0 ff ff    	mov    -0x1fb8(%rax),%eax
ffffffff804b7310:        0 	4c 89 f7             	mov    %r14,%rdi
ffffffff804b7313:        0 	30 c0                	xor    %al,%al
ffffffff804b7315:        0 	66 83 f8 01          	cmp    $0x1,%ax
ffffffff804b7319:        0 	48 19 c0             	sbb    %rax,%rax
ffffffff804b731c:        0 	83 e0 08             	and    $0x8,%eax
ffffffff804b731f:        0 	48 8b 90 a8 16 ab 80 	mov    -0x7f54e958(%rax),%rdx
ffffffff804b7326:        0 	65 8b 04 25 24 00 00 	mov    %gs:0x24,%eax
ffffffff804b732d:        0 	00 
ffffffff804b732e:        0 	89 c0                	mov    %eax,%eax
ffffffff804b7330:        0 	48 f7 d2             	not    %rdx
ffffffff804b7333:        0 	48 8b 04 c2          	mov    (%rdx,%rax,8),%rax
ffffffff804b7337:        0 	48 ff 40 68          	incq   0x68(%rax)
ffffffff804b733b:        0 	e8 b1 18 fd ff       	callq  ffffffff80488bf1 <kfree_skb>
ffffffff804b7340:        0 	b8 8f ff ff ff       	mov    $0xffffff8f,%eax
ffffffff804b7345:     9196 	48 83 c4 68          	add    $0x68,%rsp
ffffffff804b7349:      892 	5b                   	pop    %rbx
ffffffff804b734a:        0 	5d                   	pop    %rbp
ffffffff804b734b:      488 	41 5c                	pop    %r12
ffffffff804b734d:        0 	41 5d                	pop    %r13
ffffffff804b734f:        0 	41 5e                	pop    %r14
ffffffff804b7351:      513 	41 5f                	pop    %r15
ffffffff804b7353:        0 	c3                   	retq   

about 10% of this function's cost is artificial:

ffffffff804b7045:     1001 <ip_queue_xmit>:
ffffffff804b7045:     1001 	41 57                	push   %r15
ffffffff804b7047:    36698 	41 56                	push   %r14

there are profiler hits that leaked in via out-of-order execution from 
the callsites. The callsites are hard to map unfortunately, as this 
function is called via function pointers.

the most likely callsite is tcp_transmit_skb().

30% of the overhead of this function comes from:

ffffffff804b7203:        0 	66 c7 43 06 00 00    	movw   $0x0,0x6(%rbx)
ffffffff804b7209:      118 	0f bf 85 40 02 00 00 	movswl 0x240(%rbp),%eax
ffffffff804b7210:    10867 	48 8b 54 24 58       	mov    0x58(%rsp),%rdx
ffffffff804b7215:      340 	85 c0                	test   %eax,%eax
ffffffff804b7217:        0 	79 06                	jns    ffffffff804b721f <ip_queue_xmit+0x1da>
ffffffff804b7219:   107464 	8b 82 9c 00 00 00    	mov    0x9c(%rdx),%eax
ffffffff804b721f:     4963 	88 43 08             	mov    %al,0x8(%rbx)

the 16-bit movw looks a bit weird. It comes from line 372:

 0xffffffff804b7203 is in ip_queue_xmit (net/ipv4/ip_output.c:372).
 367		iph = ip_hdr(skb);
 368		*((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
 369		if (ip_dont_fragment(sk, &rt->u.dst) && !ipfragok)
 370			iph->frag_off = htons(IP_DF);
 371		else
 372			iph->frag_off = 0;
 373		iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
 374		iph->protocol = sk->sk_protocol;
 375		iph->saddr    = rt->rt_src;
 376		iph->daddr    = rt->rt_dst;

the ip-header fragment flag setting to zero.

16-bit ops are an on-off love/hate affair on x86 CPUs. The trend is 
towards eliminating them as much as possible.

_But_, the real overhead probably comes from:

 ffffffff804b7210:    10867 	48 8b 54 24 58       	mov    0x58(%rsp),%rdx

which is the next line, the ttl field:

 373             iph->ttl      = ip_select_ttl(inet, &rt->u.dst);

this shows that we are doing a hard cachemiss on the net-localhost 
route dst structure cacheline. We do a plain load instruction from it 
here and get a hefty cachemiss. (because 16 CPUs are banging on that 
single route)

And let make sure we see this in perspective as well: that single 
cachemiss is _1.0 percent_ of the total tbench cost. (!) We could make 
the scheduler 10% slower straight away and it would have less of a 
real-life effect than this single iph->ttl field setting.

	Ingo
--
To unsubscribe from this list: send the line "unsubscribe kernel-testers" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[Index of Archives]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux