Re: [Bug #11308] tbench regression on each kernel release from 2.6.22 -> 2.6.28

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



* Ingo Molnar <mingo@xxxxxxx> wrote:

> 100.000000 total
> ................
>   1.469183 tcp_current_mss

                      hits (total: 146918)
                 .........
ffffffff804c5237:      526 <tcp_current_mss>:
ffffffff804c5237:      526 	41 54                	push   %r12
ffffffff804c5239:     5929 	55                   	push   %rbp
ffffffff804c523a:       32 	53                   	push   %rbx
ffffffff804c523b:      294 	48 89 fb             	mov    %rdi,%rbx
ffffffff804c523e:      539 	48 83 ec 30          	sub    $0x30,%rsp
ffffffff804c5242:     2590 	85 f6                	test   %esi,%esi
ffffffff804c5244:      444 	48 8b 4f 78          	mov    0x78(%rdi),%rcx
ffffffff804c5248:      521 	8b af 4c 04 00 00    	mov    0x44c(%rdi),%ebp
ffffffff804c524e:      791 	74 2a                	je     ffffffff804c527a <tcp_current_mss+0x43>
ffffffff804c5250:      433 	8b 87 00 01 00 00    	mov    0x100(%rdi),%eax
ffffffff804c5256:      236 	c1 e0 10             	shl    $0x10,%eax
ffffffff804c5259:      191 	89 c2                	mov    %eax,%edx
ffffffff804c525b:      487 	23 97 fc 00 00 00    	and    0xfc(%rdi),%edx
ffffffff804c5261:      362 	39 c2                	cmp    %eax,%edx
ffffffff804c5263:      342 	75 15                	jne    ffffffff804c527a <tcp_current_mss+0x43>
ffffffff804c5265:      473 	45 31 e4             	xor    %r12d,%r12d
ffffffff804c5268:      221 	8b 87 00 04 00 00    	mov    0x400(%rdi),%eax
ffffffff804c526e:      194 	3b 87 80 04 00 00    	cmp    0x480(%rdi),%eax
ffffffff804c5274:      445 	41 0f 94 c4          	sete   %r12b
ffffffff804c5278:      261 	eb 03                	jmp    ffffffff804c527d <tcp_current_mss+0x46>
ffffffff804c527a:        0 	45 31 e4             	xor    %r12d,%r12d
ffffffff804c527d:      185 	48 85 c9             	test   %rcx,%rcx
ffffffff804c5280:      686 	74 15                	je     ffffffff804c5297 <tcp_current_mss+0x60>
ffffffff804c5282:     1806 	8b 71 7c             	mov    0x7c(%rcx),%esi
ffffffff804c5285:        1 	3b b3 5c 03 00 00    	cmp    0x35c(%rbx),%esi
ffffffff804c528b:       21 	74 0a                	je     ffffffff804c5297 <tcp_current_mss+0x60>
ffffffff804c528d:        0 	48 89 df             	mov    %rbx,%rdi
ffffffff804c5290:        0 	e8 8b fb ff ff       	callq  ffffffff804c4e20 <tcp_sync_mss>
ffffffff804c5295:        0 	89 c5                	mov    %eax,%ebp
ffffffff804c5297:      864 	48 8d 4c 24 28       	lea    0x28(%rsp),%rcx
ffffffff804c529c:      634 	48 8d 54 24 10       	lea    0x10(%rsp),%rdx
ffffffff804c52a1:      995 	31 f6                	xor    %esi,%esi
ffffffff804c52a3:        0 	48 89 df             	mov    %rbx,%rdi
ffffffff804c52a6:        2 	e8 f2 fe ff ff       	callq  ffffffff804c519d <tcp_established_options>
ffffffff804c52ab:      859 	8b 8b e8 03 00 00    	mov    0x3e8(%rbx),%ecx
ffffffff804c52b1:      936 	83 c0 14             	add    $0x14,%eax
ffffffff804c52b4:        6 	0f b7 d1             	movzwl %cx,%edx
ffffffff804c52b7:        0 	39 d0                	cmp    %edx,%eax
ffffffff804c52b9:      911 	74 04                	je     ffffffff804c52bf <tcp_current_mss+0x88>
ffffffff804c52bb:        0 	29 d0                	sub    %edx,%eax
ffffffff804c52bd:        0 	29 c5                	sub    %eax,%ebp
ffffffff804c52bf:        0 	45 85 e4             	test   %r12d,%r12d
ffffffff804c52c2:     6894 	89 e8                	mov    %ebp,%eax
ffffffff804c52c4:        0 	74 38                	je     ffffffff804c52fe <tcp_current_mss+0xc7>
ffffffff804c52c6:      990 	48 8b 83 68 03 00 00 	mov    0x368(%rbx),%rax
ffffffff804c52cd:      642 	8b b3 04 01 00 00    	mov    0x104(%rbx),%esi
ffffffff804c52d3:        3 	48 89 df             	mov    %rbx,%rdi
ffffffff804c52d6:      240 	66 2b 70 30          	sub    0x30(%rax),%si
ffffffff804c52da:      588 	66 2b b3 7e 03 00 00 	sub    0x37e(%rbx),%si
ffffffff804c52e1:        2 	66 29 ce             	sub    %cx,%si
ffffffff804c52e4:      284 	ff ce                	dec    %esi
ffffffff804c52e6:      664 	0f b7 f6             	movzwl %si,%esi
ffffffff804c52e9:        2 	e8 0a fb ff ff       	callq  ffffffff804c4df8 <tcp_bound_to_half_wnd>
ffffffff804c52ee:       68 	0f b7 d0             	movzwl %ax,%edx
ffffffff804c52f1:     1870 	89 c1                	mov    %eax,%ecx
ffffffff804c52f3:        0 	89 d0                	mov    %edx,%eax
ffffffff804c52f5:        0 	31 d2                	xor    %edx,%edx
ffffffff804c52f7:     2135 	f7 f5                	div    %ebp
ffffffff804c52f9:   107010 	89 c8                	mov    %ecx,%eax
ffffffff804c52fb:     1670 	66 29 d0             	sub    %dx,%ax
ffffffff804c52fe:        0 	66 89 83 ea 03 00 00 	mov    %ax,0x3ea(%rbx)
ffffffff804c5305:        4 	48 83 c4 30          	add    $0x30,%rsp
ffffffff804c5309:      855 	89 e8                	mov    %ebp,%eax
ffffffff804c530b:        0 	5b                   	pop    %rbx
ffffffff804c530c:      797 	5d                   	pop    %rbp
ffffffff804c530d:        0 	41 5c                	pop    %r12
ffffffff804c530f:        0 	c3                   	retq   

Apparently this division causes about 1.0% of the total tbench overhead:

ffffffff804c52f5:        0 	31 d2                	xor    %edx,%edx
ffffffff804c52f7:     2135 	f7 f5                	div    %ebp
ffffffff804c52f9:   107010 	89 c8                	mov    %ecx,%eax

(gdb) list *0xffffffff804c52f7
0xffffffff804c52f7 is in tcp_current_mss (net/ipv4/tcp_output.c:1078).
1073					  inet_csk(sk)->icsk_af_ops->net_header_len -
1074					  inet_csk(sk)->icsk_ext_hdr_len -
1075					  tp->tcp_header_len);
1076	
1077			xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal);
1078			xmit_size_goal -= (xmit_size_goal % mss_now);
1079		}
1080		tp->xmit_size_goal = xmit_size_goal;
1081	
1082		return mss_now;
(gdb) 

It's this division:

        if (doing_tso) {
        [...]
			xmit_size_goal -= (xmit_size_goal % mss_now);

Has no one hit this before? Perhaps this is why switching loopback 
networking to TSO had a performance impact for others?

It's still a bit weird ... how can a single division cause this much 
overhead? tcp_bound_to_half_wnd() [which is called straight before 
this sequence] seems low-overhead.

	Ingo
--
To unsubscribe from this list: send the line "unsubscribe kernel-testers" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[Index of Archives]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux