Re: fio server/client disconnect bug

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On 3/21/18 12:16 PM, Jeff Furlong wrote:
> Here's the failing case where host1 does not print:
> 
> back=1, groupid=0, nr_ts=1
> client: got cmd op PROBE from host2 (pdu=112)
> client host2 removed 2
> client: got cmd op PROBE from host1 (pdu=112)
> client host1 removed 2
> client: got cmd op START from host2 (pdu=8)
> client host2 removed 2
> client: got cmd op START from host1 (pdu=8)
> client host1 removed 2
> client: got cmd op ADD_JOB from host2 (pdu=23204)
> client host2 removed 2
> client: got cmd op TEXT from host2 (pdu=70)
> client host2 removed 2
> client: got cmd op TEXT from host2 (pdu=90)
> client host2 removed 2
> client: got cmd op ADD_JOB from host1 (pdu=23204)
> client host1 removed 2
> client: got cmd op TEXT from host2 (pdu=51)
> client host2 removed 2
> client: got cmd op TEXT from host1 (pdu=70)
> client host1 removed 2
> client: got cmd op START from host2 (pdu=8)
> client host2 removed 2
> client: got cmd op TEXT from host1 (pdu=90)
> client host1 removed 2
> client: got cmd op TEXT from host1 (pdu=51)
> client host1 removed 2
> client: got cmd op START from host1 (pdu=8)
> client host1 removed 2
> client: got cmd op TEXT from host2 (pdu=33)
> client host2 removed 2
> client: got cmd op TEXT from host2 (pdu=33)
> client host2 removed 2
> client: got cmd op TEXT from host2 (pdu=25)
> client host2 removed 2
> client: got cmd op TEXT from host1 (pdu=33)
> client host1 removed 2
> client: got cmd op TEXT from host1 (pdu=33)
> client host1 removed 2
> client: got cmd op TEXT from host1 (pdu=25)
> client host1 removed 2
> client: got cmd op SERVER_START from host2 (pdu=0)
> client host2 removed 2
> client: got cmd op SERVER_START from host1 (pdu=0)
> client host1 removed 2
> client: got cmd op ETA from host2 (pdu=165)
> client host2 removed 2
> client: got cmd op ETA from host1 (pdu=165)
> client host1 removed 2
> client: got cmd op ETA from host2 (pdu=165)
> client host2 removed 2
> client: got cmd op ETA from host1 (pdu=165)
> client host1 removed 2
> client: got cmd op ETA from host2 (pdu=165)
> client host2 removed 2
> client: got cmd op ETA from host1 (pdu=165)
> client host1 removed 25.0%][r=2667M,w=0k][r=41.7k,w=0 IOPS][eta 00m:02s]
> client: got cmd op ETA from host2 (pdu=165)
> client host2 removed 2
> client: got cmd op ETA from host1 (pdu=165)
> client host1 removed 28.9%][r=2661M,w=0k][r=41.6k,w=0 IOPS][eta 00m:01s]
> client: got cmd op ETA from host2 (pdu=165)
> client host2 removed 2
> client: got cmd op ETA from host1 (pdu=165)
> client host1 removed 200.0%][r=2669M,w=0k][r=41.7k,w=0 IOPS][eta 00m:00s]
> client: got cmd op IOLOG from host2 (pdu=338)
> client host2 removed 2
> client: got cmd op IOLOG from host2 (pdu=327)
> client host2 removed 2
> client: got cmd op IOLOG from host2 (pdu=337)
> client host2 removed 2
> client: got cmd op IOLOG from host1 (pdu=338)
> client host1 removed 2
> client: got cmd op IOLOG from host1 (pdu=325)
> client host1 removed 2
> client: got cmd op IOLOG from host1 (pdu=337)
> client host1 removed 2
> client: got cmd op IOLOG from host1 (pdu=323)
> client host1 removed 2
> client: got cmd op IOLOG from host1 (pdu=320)
> client host1 removed 2
> client: got cmd op JOB_OPT from host1 (pdu=200)
> client host1 removed 2
> client: got cmd op JOB_OPT from host1 (pdu=200)
> client host1 removed 2
> client: got cmd op JOB_OPT from host1 (pdu=200)
> client host1 removed 2
> client: got cmd op JOB_OPT from host1 (pdu=200)
> client host1 removed 2
> client: got cmd op JOB_OPT from host1 (pdu=200)
> client host1 removed 2
> client: got cmd op JOB_OPT from host1 (pdu=200)
> client host1 removed 2
> client: got cmd op JOB_OPT from host1 (pdu=200)
> client host1 removed 2
> client: got cmd op JOB_OPT from host1 (pdu=200)
> client host1 removed 2
> client: got cmd op JOB_OPT from host1 (pdu=200)
> client host1 removed 2
> client: got cmd op JOB_OPT from host1 (pdu=200)
> client host1 removed 2
> client: got cmd op JOB_OPT from host1 (pdu=200)
> client host1 removed 2
> client: got cmd op JOB_OPT from host1 (pdu=200)
> client host1 removed 2
> client: got cmd op JOB_OPT from host1 (pdu=200)
> client host1 removed 2
> client: got cmd op JOB_OPT from host1 (pdu=200)
> client host1 removed 2
> client: got cmd op JOB_OPT from host1 (pdu=200)
> client host1 removed 2
> client: got cmd op JOB_OPT from host1 (pdu=200)
> client host1 removed 2
> client: got cmd op JOB_OPT from host1 (pdu=200)
> client host1 removed 2
> client: got cmd op JOB_OPT from host1 (pdu=200)
> client host1 removed 2
> client: got cmd op STOP from host1 (pdu=8)
> client host1 removed 2
> client: got cmd op QUIT from host1 (pdu=0)
> client host1 removed 2
> client host1 removed 1
> client: got cmd op IOLOG from host2 (pdu=331)
> client host2 removed 2
> client: got cmd op IOLOG from host2 (pdu=327)
> client host2 removed 2
> client: got cmd op JOB_OPT from host2 (pdu=200)
> client host2 removed 2
> client: got cmd op JOB_OPT from host2 (pdu=200)
> client host2 removed 2
> client: got cmd op JOB_OPT from host2 (pdu=200)
> client host2 removed 2
> client: got cmd op JOB_OPT from host2 (pdu=200)
> client host2 removed 2
> client: got cmd op JOB_OPT from host2 (pdu=200)
> client host2 removed 2
> client: got cmd op JOB_OPT from host2 (pdu=200)
> client host2 removed 2
> client: got cmd op JOB_OPT from host2 (pdu=200)
> client host2 removed 2
> client: got cmd op JOB_OPT from host2 (pdu=200)
> client host2 removed 2
> client: got cmd op JOB_OPT from host2 (pdu=200)
> client host2 removed 2
> client: got cmd op JOB_OPT from host2 (pdu=200)
> client host2 removed 2
> client: got cmd op JOB_OPT from host2 (pdu=200)
> client host2 removed 2
> client: got cmd op JOB_OPT from host2 (pdu=200)
> client host2 removed 2
> client: got cmd op JOB_OPT from host2 (pdu=200)
> client host2 removed 2
> client: got cmd op JOB_OPT from host2 (pdu=200)
> client host2 removed 2
> client: got cmd op JOB_OPT from host2 (pdu=200)
> client host2 removed 2
> client: got cmd op JOB_OPT from host2 (pdu=200)
> client host2 removed 2
> client: got cmd op JOB_OPT from host2 (pdu=200)
> client host2 removed 2
> client: got cmd op JOB_OPT from host2 (pdu=200)
> client host2 removed 2
> client: got cmd op TS from host2 (pdu=94888)
> client host2 removed 2
> client: got cmd op GS from host2 (pdu=164)
> client host2 removed 2
> client: got cmd op DISK_UTIL from host2 (pdu=256)
> client host2 removed 2
> client: got cmd op TEXT from host2 (pdu=25)
> client host2 removed 2
> client: got cmd op STOP from host2 (pdu=8)
> client host2 removed 2
> client: got cmd op QUIT from host2 (pdu=0)
> client host2 removed 2
> client host2 removed 1
> 
> And here's a good case:
> 
> client: got cmd op PROBE from host2 (pdu=112)
> client host2 removed 2
> client: got cmd op PROBE from host1 (pdu=112)
> client host1 removed 2
> client: got cmd op START from host2 (pdu=8)
> client host2 removed 2
> client: got cmd op START from host1 (pdu=8)
> client host1 removed 2
> client: got cmd op ADD_JOB from host2 (pdu=23204)
> client host2 removed 2
> client: got cmd op TEXT from host2 (pdu=70)
> client host2 removed 2
> client: got cmd op TEXT from host2 (pdu=90)
> client host2 removed 2
> client: got cmd op TEXT from host2 (pdu=51)
> client host2 removed 2
> client: got cmd op START from host2 (pdu=8)
> client host2 removed 2
> client: got cmd op ADD_JOB from host1 (pdu=23204)
> client host1 removed 2
> client: got cmd op TEXT from host1 (pdu=70)
> client host1 removed 2
> client: got cmd op TEXT from host1 (pdu=90)
> client host1 removed 2
> client: got cmd op TEXT from host1 (pdu=51)
> client host1 removed 2
> client: got cmd op START from host1 (pdu=8)
> client host1 removed 2
> client: got cmd op TEXT from host2 (pdu=33)
> client host2 removed 2
> client: got cmd op TEXT from host2 (pdu=33)
> client host2 removed 2
> client: got cmd op TEXT from host2 (pdu=25)
> client host2 removed 2
> client: got cmd op SERVER_START from host2 (pdu=0)
> client host2 removed 2
> client: got cmd op TEXT from host1 (pdu=33)
> client host1 removed 2
> client: got cmd op TEXT from host1 (pdu=33)
> client host1 removed 2
> client: got cmd op TEXT from host1 (pdu=25)
> client host1 removed 2
> client: got cmd op SERVER_START from host1 (pdu=0)
> client host1 removed 2
> client: got cmd op ETA from host2 (pdu=165)
> client host2 removed 2
> client: got cmd op ETA from host1 (pdu=165)
> client host1 removed 2
> client: got cmd op ETA from host2 (pdu=165)
> client host2 removed 2
> client: got cmd op ETA from host1 (pdu=165)
> client host1 removed 2
> client: got cmd op ETA from host2 (pdu=165)
> client host2 removed 2
> client: got cmd op ETA from host1 (pdu=165)
> client host1 removed 25.0%][r=2667M,w=0k][r=41.7k,w=0 IOPS][eta 00m:02s]
> client: got cmd op ETA from host2 (pdu=165)
> client host2 removed 2
> client: got cmd op ETA from host1 (pdu=165)
> client host1 removed 28.9%][r=2669M,w=0k][r=41.7k,w=0 IOPS][eta 00m:01s]
> client: got cmd op ETA from host2 (pdu=165)
> client host2 removed 2
> client: got cmd op ETA from host1 (pdu=165)
> client host1 removed 200.0%][r=2675M,w=0k][r=41.8k,w=0 IOPS][eta 00m:00s]
> client: got cmd op IOLOG from host2 (pdu=336)
> client host2 removed 2
> client: got cmd op IOLOG from host2 (pdu=327)
> client host2 removed 2
> client: got cmd op IOLOG from host2 (pdu=337)
> client host2 removed 2
> client: got cmd op IOLOG from host1 (pdu=338)
> client host1 removed 2
> client: got cmd op IOLOG from host1 (pdu=327)
> client host1 removed 2
> client: got cmd op IOLOG from host1 (pdu=338)
> client host1 removed 2
> client: got cmd op IOLOG from host2 (pdu=330)
> client host2 removed 2
> client: got cmd op IOLOG from host2 (pdu=327)
> client host2 removed 2
> client: got cmd op JOB_OPT from host2 (pdu=200)
> client host2 removed 2
> client: got cmd op JOB_OPT from host2 (pdu=200)
> client host2 removed 2
> client: got cmd op JOB_OPT from host2 (pdu=200)
> client host2 removed 2
> client: got cmd op JOB_OPT from host2 (pdu=200)
> client host2 removed 2
> client: got cmd op JOB_OPT from host2 (pdu=200)
> client host2 removed 2
> client: got cmd op JOB_OPT from host2 (pdu=200)
> client host2 removed 2
> client: got cmd op JOB_OPT from host2 (pdu=200)
> client host2 removed 2
> client: got cmd op JOB_OPT from host2 (pdu=200)
> client host2 removed 2
> client: got cmd op JOB_OPT from host2 (pdu=200)
> client host2 removed 2
> client: got cmd op JOB_OPT from host2 (pdu=200)
> client host2 removed 2
> client: got cmd op JOB_OPT from host2 (pdu=200)
> client host2 removed 2
> client: got cmd op JOB_OPT from host2 (pdu=200)
> client host2 removed 2
> client: got cmd op JOB_OPT from host2 (pdu=200)
> client host2 removed 2
> client: got cmd op JOB_OPT from host2 (pdu=200)
> client host2 removed 2
> client: got cmd op JOB_OPT from host2 (pdu=200)
> client host2 removed 2
> client: got cmd op JOB_OPT from host2 (pdu=200)
> client host2 removed 2
> client: got cmd op JOB_OPT from host2 (pdu=200)
> client host2 removed 2
> client: got cmd op JOB_OPT from host2 (pdu=200)
> client host2 removed 2
> client: got cmd op TS from host2 (pdu=94888)
> client host2 removed 2
> client: got cmd op GS from host2 (pdu=164)
> client host2 removed 2
> client: got cmd op DISK_UTIL from host2 (pdu=256)
> client host2 removed 2
> client: got cmd op TEXT from host2 (pdu=25)
> client host2 removed 2
> client: got cmd op STOP from host2 (pdu=8)
> client host2 removed 2
> client: got cmd op QUIT from host2 (pdu=0)
> client host2 removed 2
> client host2 removed 1
> client: got cmd op IOLOG from host1 (pdu=330)
> client host1 removed 2
> client: got cmd op IOLOG from host1 (pdu=327)
> back=1, groupid=0, nr_ts=1
> client host1 removed 2
> client: got cmd op JOB_OPT from host1 (pdu=200)
> client host1 removed 2
> client: got cmd op JOB_OPT from host1 (pdu=200)
> client host1 removed 2
> client: got cmd op JOB_OPT from host1 (pdu=200)
> client host1 removed 2
> client: got cmd op JOB_OPT from host1 (pdu=200)
> client host1 removed 2
> client: got cmd op JOB_OPT from host1 (pdu=200)
> client host1 removed 2
> client: got cmd op JOB_OPT from host1 (pdu=200)
> client host1 removed 2
> client: got cmd op JOB_OPT from host1 (pdu=200)
> client host1 removed 2
> client: got cmd op JOB_OPT from host1 (pdu=200)
> client host1 removed 2
> client: got cmd op JOB_OPT from host1 (pdu=200)
> client host1 removed 2
> client: got cmd op JOB_OPT from host1 (pdu=200)
> client host1 removed 2
> client: got cmd op JOB_OPT from host1 (pdu=200)
> client host1 removed 2
> client: got cmd op JOB_OPT from host1 (pdu=200)
> client host1 removed 2
> client: got cmd op JOB_OPT from host1 (pdu=200)
> client host1 removed 2
> client: got cmd op JOB_OPT from host1 (pdu=200)
> client host1 removed 2
> client: got cmd op JOB_OPT from host1 (pdu=200)
> client host1 removed 2
> client: got cmd op JOB_OPT from host1 (pdu=200)
> client host1 removed 2
> client: got cmd op JOB_OPT from host1 (pdu=200)
> client host1 removed 2
> client: got cmd op JOB_OPT from host1 (pdu=200)
> client host1 removed 2
> client: got cmd op TS from host1 (pdu=94888)
> client host1 removed 2
> client: got cmd op GS from host1 (pdu=164)
> client host1 removed 2
> client: got cmd op DISK_UTIL from host1 (pdu=256)
> client host1 removed 2
> client: got cmd op TEXT from host1 (pdu=25)
> client host1 removed 2
> client: got cmd op STOP from host1 (pdu=8)
> client host1 removed 2
> client: got cmd op QUIT from host1 (pdu=0)
> client host1 removed 2
> client host1 removed 1

So puzzling... It prints all the right things, but for some reason we
are not doing the ts/gs/disk_util send for the host in some cases.
For the life of me, I can't see how that would happen! We just go
straight from the job option sends, and then don't do the rest in
__show_run_stats().

And thanks for the quick turn-around on the debugging, it's key
to getting this resolved since I can't reproduce this myself. Can you
run with this one? Backout the previous one first.


diff --git a/client.c b/client.c
index bff0adc0d972..76c89a496964 100644
--- a/client.c
+++ b/client.c
@@ -1664,7 +1664,7 @@ int fio_handle_client(struct fio_client *client)
 	if (!cmd)
 		return 0;
 
-	dprint(FD_NET, "client: got cmd op %s from %s (pdu=%u)\n",
+	printf("client: got cmd op %s from %s (pdu=%u)\n",
 		fio_server_op(cmd->opcode), client->hostname, cmd->pdu_len);
 
 	client->last_cmd = cmd->opcode;
diff --git a/server.c b/server.c
index 65d4484edae5..e78f6f64824d 100644
--- a/server.c
+++ b/server.c
@@ -551,6 +551,8 @@ static void fio_net_queue_entry(struct sk_entry *entry)
 {
 	struct sk_out *sk_out = pthread_getspecific(sk_out_key);
 
+	printf("queue cmd op=%x, sock=%d\n", entry->opcode, sk_out->sk);
+
 	if (entry->flags & SK_F_INLINE)
 		handle_sk_entry(sk_out, entry);
 	else {
@@ -1177,6 +1179,7 @@ static int handle_xmits(struct sk_out *sk_out)
 	while (!flist_empty(&list)) {
 		entry = flist_entry(list.next, struct sk_entry, list);
 		flist_del(&entry->list);
+		printf("xmit op %x\n", entry->opcode);
 		ret += handle_sk_entry(sk_out, entry);
 	}
 
diff --git a/stat.c b/stat.c
index 98ab63893900..558786f52751 100644
--- a/stat.c
+++ b/stat.c
@@ -1891,6 +1891,7 @@ void __show_run_stats(void)
 
 		if (is_backend) {
 			fio_server_send_job_options(opt_lists[i], i);
+			printf("should send TS\n");
 			fio_server_send_ts(ts, rs);
 		} else {
 			if (output_format & FIO_OUTPUT_TERSE)
@@ -1905,6 +1906,7 @@ void __show_run_stats(void)
 	}
 	if (!is_backend && (output_format & FIO_OUTPUT_JSON)) {
 		/* disk util stats, if any */
+		printf("should send DU\n");
 		show_disk_util(1, root, &output[__FIO_OUTPUT_JSON]);
 
 		show_idle_prof_stats(FIO_OUTPUT_JSON, root, &output[__FIO_OUTPUT_JSON]);
@@ -1917,6 +1919,8 @@ void __show_run_stats(void)
 	for (i = 0; i < groupid + 1; i++) {
 		rs = &runstats[i];
 
+		printf("should send GS\n");
+
 		rs->groupid = i;
 		if (is_backend)
 			fio_server_send_gs(rs);

-- 
Jens Axboe

--
To unsubscribe from this list: send the line "unsubscribe fio" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html



[Index of Archives]     [Linux Kernel]     [Linux SCSI]     [Linux IDE]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux SCSI]

  Powered by Linux