Packet loss with delay on sending host, but not on receiving host


 



Hi,

I have two hosts on a 10 Gbit network which can send and receive at 10 Gbit/s with my TCP settings.
iperf3 -c 192.168.120.14
Connecting to host 192.168.120.14, port 5201
[  4] local 192.168.120.10 port 57217 connected to 192.168.120.14 port 5201
[ ID] Interval           Transfer     Bandwidth       Retr  Cwnd
[  4]   0.00-1.00   sec  1.09 GBytes  9.41 Gbits/sec    0    872 KBytes       
[  4]   1.00-2.00   sec  1.10 GBytes  9.42 Gbits/sec    0    915 KBytes       
[  4]   2.00-3.00   sec  1.10 GBytes  9.42 Gbits/sec    0    915 KBytes       
[  4]   3.00-4.00   sec  1.09 GBytes  9.41 Gbits/sec    0    915 KBytes       
[  4]   4.00-5.00   sec  1.10 GBytes  9.42 Gbits/sec    0    915 KBytes       
[  4]   5.00-6.00   sec  1.10 GBytes  9.42 Gbits/sec    0    915 KBytes       
[  4]   6.00-7.00   sec  1.10 GBytes  9.42 Gbits/sec    0    966 KBytes       
[  4]   7.00-8.00   sec  1.10 GBytes  9.42 Gbits/sec    0    966 KBytes       
^C[  4]   8.00-8.05   sec  60.0 MBytes  9.34 Gbits/sec    0    966 KBytes       
- - - - - - - - - - - - - - - - - - - - - - - - -
[ ID] Interval           Transfer     Bandwidth       Retr
[  4]   0.00-8.05   sec  8.83 GBytes  9.41 Gbits/sec    0             sender
[  4]   0.00-8.05   sec  0.00 Bytes  0.00 bits/sec                  receiver

In the final setup, the hosts will be deployed at different geographical sites and communicate via a WAN connection with max. 80 ms latency. I want to use netem to simulate that latency for testing. To do so, I defined a qdisc with 80 ms latency on both servers and ran some tests with iperf; a sketch of the setup follows the output below. TCP buffers are sized to 6 MB default and 32 MB max. I expect to be able to reach ~800 MB/s with these settings, but sometimes the system fails to reach this:
iperf3 -c 192.168.120.14
Connecting to host 192.168.120.14, port 5201
[  4] local 192.168.120.10 port 57245 connected to 192.168.120.14 port 5201
[ ID] Interval           Transfer     Bandwidth       Retr  Cwnd
[  4]   0.00-1.00   sec  6.25 MBytes  52.4 Mbits/sec    0   1.10 MBytes       
[  4]   1.00-2.00   sec  48.8 MBytes   409 Mbits/sec  152   8.44 MBytes       
[  4]   2.00-3.00   sec  51.2 MBytes   430 Mbits/sec    0   8.45 MBytes       
[  4]   3.00-4.00   sec  50.0 MBytes   419 Mbits/sec    0   8.45 MBytes       
[  4]   4.00-5.00   sec  58.8 MBytes   493 Mbits/sec    0   8.47 MBytes       
[  4]   5.00-6.00   sec  51.2 MBytes   430 Mbits/sec    0   8.49 MBytes       
[  4]   6.00-7.00   sec  50.0 MBytes   419 Mbits/sec    0   8.53 MBytes       
[  4]   7.00-8.00   sec  51.2 MBytes   430 Mbits/sec    0   8.58 MBytes       
[  4]   8.00-9.00   sec  60.0 MBytes   503 Mbits/sec    0   8.67 MBytes       
[  4]   9.00-10.00  sec  52.5 MBytes   440 Mbits/sec    0   8.77 MBytes       
- - - - - - - - - - - - - - - - - - - - - - - - -
[ ID] Interval           Transfer     Bandwidth       Retr
[  4]   0.00-10.00  sec   480 MBytes   403 Mbits/sec  152             sender
[  4]   0.00-10.00  sec   467 MBytes   392 Mbits/sec                  receiver
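
For reference, the setup described above boils down to something like the following on each host (a sketch, not the literal commands from my shell history; the interface name matches the transcripts below, and the sysctl values spell out the 6 MB default / 32 MB max buffers as min/default/max in bytes):

tc qdisc add dev eno49 root netem delay 80ms
sysctl -w net.ipv4.tcp_rmem="4096 6291456 33554432"
sysctl -w net.ipv4.tcp_wmem="4096 6291456 33554432"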

In other runs, I observed retransmissions occurring. To localize the problem, I ran a simulation with only 1 ms latency on both hosts, to see if there is a fundamental problem with netem or its settings.
10:29:42 1290 0 ~ # tc qdisc add dev eno50 root netem delay 1ms limit 1250; tc qdisc add dev eno49 root netem delay 1ms limit 1250
10:29:50 1291 0 ~ # ping 192.168.120.14
PING 192.168.120.14 (192.168.120.14) 56(84) bytes of data.
64 bytes from 192.168.120.14: icmp_seq=1 ttl=64 time=2.29 ms
64 bytes from 192.168.120.14: icmp_seq=2 ttl=64 time=2.27 ms
^C
--- 192.168.120.14 ping statistics ---
2 packets transmitted, 2 received, 0% packet loss, time 1001ms
rtt min/avg/max/mdev = 2.276/2.287/2.299/0.049 ms
10:30:13 1292 0 ~ # iperf3 -c 192.168.120.14 -i 1 -t 0
Connecting to host 192.168.120.14, port 5201
[  4] local 192.168.120.10 port 59752 connected to 192.168.120.14 port 5201
[ ID] Interval           Transfer     Bandwidth       Retr  Cwnd
[  4]   0.00-1.00   sec   314 MBytes  2.63 Gbits/sec   13    936 KBytes       
[  4]   1.00-2.00   sec   471 MBytes  3.95 Gbits/sec    0   1.24 MBytes       
[  4]   2.00-3.00   sec   614 MBytes  5.15 Gbits/sec    0   1.56 MBytes       
[  4]   3.00-4.00   sec   759 MBytes  6.36 Gbits/sec  170   1.87 MBytes       
[  4]   4.00-5.00   sec   890 MBytes  7.47 Gbits/sec   21   2.19 MBytes       
[  4]   5.00-6.00   sec  1011 MBytes  8.48 Gbits/sec  912   2.46 MBytes       
[  4]   6.00-7.00   sec  1018 MBytes  8.54 Gbits/sec  2183   1.92 MBytes       
[  4]   7.00-8.00   sec   909 MBytes  7.62 Gbits/sec  360   2.21 MBytes       
[  4]   8.00-9.00   sec  1014 MBytes  8.50 Gbits/sec  1428   2.44 MBytes       
[  4]   9.00-10.00  sec   972 MBytes  8.16 Gbits/sec  2154   1.92 MBytes       
[  4]  10.00-11.00  sec   911 MBytes  7.64 Gbits/sec   53   2.21 MBytes       
[  4]  11.00-12.00  sec  1.01 GBytes  8.64 Gbits/sec  431   2.48 MBytes       
[  4]  12.00-13.00  sec   880 MBytes  7.38 Gbits/sec  1493   2.02 MBytes       
[  4]  13.00-14.00  sec   960 MBytes  8.05 Gbits/sec    0   2.34 MBytes       
[  4]  14.00-15.00  sec  1.02 GBytes  8.77 Gbits/sec  2396   2.56 MBytes       
[  4]  15.00-16.00  sec   925 MBytes  7.76 Gbits/sec  2565   2.02 MBytes       
[  4]  16.00-17.00  sec   958 MBytes  8.03 Gbits/sec   65   2.32 MBytes       
[  4]  17.00-18.00  sec  1024 MBytes  8.59 Gbits/sec  2936   2.49 MBytes       
[  4]  18.00-19.00  sec  1.01 GBytes  8.70 Gbits/sec  6058   2.62 MBytes       
[  4]  19.00-20.00  sec   904 MBytes  7.58 Gbits/sec  1215   2.12 MBytes       
[  4]  20.00-21.00  sec   995 MBytes  8.35 Gbits/sec  421   2.40 MBytes       
[  4]  21.00-22.00  sec  1.06 GBytes  9.13 Gbits/sec  1494   2.65 MBytes       
[  4]  22.00-23.00  sec   960 MBytes  8.05 Gbits/sec  1774   2.11 MBytes       
[  4]  23.00-24.00  sec   995 MBytes  8.35 Gbits/sec   39   2.41 MBytes       
[  4]  24.00-25.00  sec  1.06 GBytes  9.14 Gbits/sec  2185   2.30 MBytes       
[  4]  25.00-26.00  sec   948 MBytes  7.95 Gbits/sec  8771   2.08 MBytes       
[  4]  26.00-27.00  sec   982 MBytes  8.24 Gbits/sec   82   2.36 MBytes       
[  4]  27.00-28.00  sec  1.05 GBytes  9.06 Gbits/sec  1144   2.65 MBytes       
[  4]  28.00-29.00  sec   975 MBytes  8.18 Gbits/sec  12405   1.96 MBytes       
[  4]  29.00-30.00  sec   936 MBytes  7.85 Gbits/sec    1   2.28 MBytes       
[  4]  30.00-31.00  sec  1.04 GBytes  8.93 Gbits/sec   47   2.58 MBytes       
[  4]  31.00-32.00  sec  1.02 GBytes  8.74 Gbits/sec  9484   2.75 MBytes       
[  4]  32.00-33.00  sec   891 MBytes  7.49 Gbits/sec  22296   2.00 MBytes       
[  4]  33.00-34.00  sec   925 MBytes  7.76 Gbits/sec  576   2.26 MBytes       
[  4]  34.00-35.00  sec  1010 MBytes  8.47 Gbits/sec  1679   2.47 MBytes       
[  4]  35.00-36.00  sec  1.02 GBytes  8.77 Gbits/sec  5385   1.84 MBytes       
[  4]  36.00-37.00  sec   866 MBytes  7.27 Gbits/sec    9   2.14 MBytes       
^C[  4]  37.00-37.00  sec  3.75 MBytes  9.12 Gbits/sec    0   2.14 MBytes       
- - - - - - - - - - - - - - - - - - - - - - - - -
[ ID] Interval           Transfer     Bandwidth       Retr
[  4]   0.00-37.00  sec  33.7 GBytes  7.83 Gbits/sec  92245             sender
[  4]   0.00-37.00  sec  0.00 Bytes  0.00 bits/sec                  receiver
iperf3: interrupt - the client has terminated

When I run the same experiment with the 1 ms delay only on the iperf server, the retransmissions disappear:
iperf3 -c 192.168.120.14 -i 1 -t 0 -w 12m
Connecting to host 192.168.120.14, port 5201
[  4] local 192.168.120.10 port 59851 connected to 192.168.120.14 port 5201
[ ID] Interval           Transfer     Bandwidth       Retr  Cwnd
[  4]   0.00-1.00   sec  1.03 GBytes  8.84 Gbits/sec   13   1.90 MBytes       
[  4]   1.00-2.00   sec  1.10 GBytes  9.42 Gbits/sec    0   1.91 MBytes       
[  4]   2.00-3.00   sec  1.10 GBytes  9.42 Gbits/sec    0   1.92 MBytes       
[  4]   3.00-4.00   sec  1.10 GBytes  9.42 Gbits/sec    0   1.93 MBytes       
[  4]   4.00-5.00   sec  1.10 GBytes  9.41 Gbits/sec    0   1.95 MBytes       
[  4]   5.00-6.00   sec  1.09 GBytes  9.41 Gbits/sec    0   1.99 MBytes       
[  4]   6.00-7.00   sec  1.10 GBytes  9.42 Gbits/sec    0   2.03 MBytes       
[  4]   7.00-8.00   sec  1.10 GBytes  9.42 Gbits/sec    0   2.13 MBytes       
[  4]   8.00-9.00   sec  1.10 GBytes  9.41 Gbits/sec    0   2.13 MBytes       
[  4]   9.00-10.00  sec  1.09 GBytes  9.40 Gbits/sec    0   2.29 MBytes       
[  4]  10.00-11.00  sec  1.10 GBytes  9.42 Gbits/sec    0   2.29 MBytes       
[  4]  11.00-12.00  sec  1.10 GBytes  9.42 Gbits/sec    0   2.29 MBytes       
^C[  4]  12.00-12.65  sec   725 MBytes  9.41 Gbits/sec    0   2.29 MBytes       
- - - - - - - - - - - - - - - - - - - - - - - - -
[ ID] Interval           Transfer     Bandwidth       Retr
[  4]   0.00-12.65  sec  13.8 GBytes  9.37 Gbits/sec   13             sender
[  4]   0.00-12.65  sec  0.00 Bytes  0.00 bits/sec                  receiver
iperf3: interrupt - the client has terminated

To double-check, I disabled the delay on the iperf server and added 1 ms latency on the client. The problems start to re-appear:
PING 192.168.120.14 (192.168.120.14) 56(84) bytes of data.
64 bytes from 192.168.120.14: icmp_seq=1 ttl=64 time=1.19 ms
64 bytes from 192.168.120.14: icmp_seq=2 ttl=64 time=1.18 ms
64 bytes from 192.168.120.14: icmp_seq=3 ttl=64 time=1.17 ms
64 bytes from 192.168.120.14: icmp_seq=4 ttl=64 time=1.17 ms

MDA1PFP-S01 11:23:41 1386 1 ~ # iperf3 -c 192.168.120.14 -i 1 -t 0
Connecting to host 192.168.120.14, port 5201
[  4] local 192.168.120.10 port 59975 connected to 192.168.120.14 port 5201
[ ID] Interval           Transfer     Bandwidth       Retr  Cwnd
[  4]   0.00-1.00   sec  1.57 GBytes  13.5 Gbits/sec   37    776 KBytes       
[  4]   1.00-2.00   sec   798 MBytes  6.69 Gbits/sec    0   1.32 MBytes       
[  4]   2.00-3.00   sec  1.04 GBytes  8.95 Gbits/sec  1244   1.60 MBytes       
[  4]   3.00-4.00   sec  1.04 GBytes  8.97 Gbits/sec  2506   1.64 MBytes       
[  4]   4.00-5.00   sec  1.03 GBytes  8.86 Gbits/sec  336   1.61 MBytes       
[  4]   5.00-6.00   sec  1.02 GBytes  8.74 Gbits/sec  2892   1.64 MBytes       
[  4]   6.00-7.00   sec  1.00 GBytes  8.63 Gbits/sec  2896   1.33 MBytes       
[  4]   7.00-8.00   sec  1.00 GBytes  8.60 Gbits/sec  262   1.38 MBytes       
[  4]   8.00-9.00   sec  1.05 GBytes  8.99 Gbits/sec  1234   1.58 MBytes       
[  4]   9.00-10.00  sec  1.01 GBytes  8.64 Gbits/sec  2367   1.64 MBytes       
[  4]  10.00-11.00  sec  1.01 GBytes  8.72 Gbits/sec  5315   1.67 MBytes       
[  4]  11.00-12.00  sec  1.01 GBytes  8.69 Gbits/sec  2248   1.39 MBytes       
[  4]  12.00-13.00  sec  1.06 GBytes  9.13 Gbits/sec  1086   1.63 MBytes       
[  4]  13.00-14.00  sec  1.02 GBytes  8.80 Gbits/sec  3066   1.65 MBytes       
[  4]  14.00-15.00  sec  1.05 GBytes  9.04 Gbits/sec  3889   1.65 MBytes       
[  4]  15.00-16.00  sec  1.01 GBytes  8.68 Gbits/sec  3168   1.21 MBytes       
[  4]  16.00-17.00  sec  1.05 GBytes  9.04 Gbits/sec  483   1.59 MBytes       
[  4]  17.00-18.00  sec  1.02 GBytes  8.78 Gbits/sec  3070   1.17 MBytes       
[  4]  18.00-19.00  sec  1.04 GBytes  8.92 Gbits/sec  748   1.59 MBytes       
[  4]  19.00-20.00  sec  1.00 GBytes  8.63 Gbits/sec  1181   1.38 MBytes       
[  4]  20.00-21.00  sec  1.07 GBytes  9.21 Gbits/sec  805   1.59 MBytes       
[  4]  21.00-22.00  sec  1021 MBytes  8.57 Gbits/sec  2500   1.27 MBytes       
[  4]  22.00-23.00  sec  1.06 GBytes  9.07 Gbits/sec  766   1.63 MBytes       
[  4]  23.00-24.00  sec  1.04 GBytes  8.93 Gbits/sec  3376   1.65 MBytes       
[  4]  24.00-25.00  sec  1.01 GBytes  8.71 Gbits/sec  1223   1.46 MBytes       
[  4]  25.00-26.00  sec  1.06 GBytes  9.13 Gbits/sec  1534   1.62 MBytes       
[  4]  26.00-27.00  sec  1.05 GBytes  9.01 Gbits/sec  3015   1.64 MBytes       
^C[  4]  27.00-27.39  sec   411 MBytes  8.96 Gbits/sec  1266   1.66 MBytes  

It seems that even with a 1 ms delay, netem has a very negative impact on socket performance.

The command I use to set the delay is
tc qdisc add dev eno49 root netem delay 1ms limit 1250

The limit is computed as 10 Gbit/s / 1500 bytes MTU * 1 ms * 1.5 = 1250 packets.
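
Spelled out as a quick sanity check of that arithmetic:

echo '10 * 10^9 / 8 / 1500 * 0.001 * 1.5' | bc -l
# 10 Gbit/s = 1.25e9 bytes/s; / 1500-byte packets = ~833333 pkt/s;
# * 1 ms of delay = ~833 packets buffered; * 1.5 safety factor = ~1250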

Is there anything wrong with my settings?

Best wishes,
  Jens Auer

--
Dr. Jens Auer | CGI | Software Engineer
CGI Deutschland Ltd. & Co. KG
Rheinstraße 95 | 64295 Darmstadt | Germany
T: +49 6151 36860 154
jens.auer at cgi.com
Our mandatory disclosures pursuant to § 35a GmbHG / §§ 161, 125a HGB can be found at de.cgi.com/pflichtangaben.




From sasha.levin at oracle.com  Tue Jul 12 02:59:52 2016
From: sasha.levin at oracle.com (Sasha Levin)
Date: Tue, 12 Jul 2016 02:59:52 -0000
Subject: [added to the 4.1 stable tree] netem: Segment GSO packets
	on enqueue
In-Reply-To: <1468292170-22812-1-git-send-email-sasha.levin@xxxxxxxxxx>
References: <1468292170-22812-1-git-send-email-sasha.levin@xxxxxxxxxx>
Message-ID: <1468292170-22812-216-git-send-email-sasha.levin@xxxxxxxxxx>

From: Neil Horman <nhorman at tuxdriver.com>

This patch has been added to the 4.1 stable tree. If you have any
objections, please let us know.

===============

[ Upstream commit 6071bd1aa13ed9e41824bafad845b7b7f4df5cfd ]

This was recently reported to me, and reproduced on the latest net kernel,
when attempting to run netperf from a host that had a netem qdisc attached
to the egress interface:

[  788.073771] ---------------------[ cut here ]---------------------------
[  788.096716] WARNING: at net/core/dev.c:2253 skb_warn_bad_offload+0xcd/0xda()
[  788.129521] bnx2: caps=(0x00000001801949b3, 0x0000000000000000) len=2962
data_len=0 gso_size=1448 gso_type=1 ip_summed=3
[  788.182150] Modules linked in: sch_netem kvm_amd kvm crc32_pclmul ipmi_ssif
ghash_clmulni_intel sp5100_tco amd64_edac_mod aesni_intel lrw gf128mul
glue_helper ablk_helper edac_mce_amd cryptd pcspkr sg edac_core hpilo ipmi_si
i2c_piix4 k10temp fam15h_power hpwdt ipmi_msghandler shpchp acpi_power_meter
pcc_cpufreq nfsd auth_rpcgss nfs_acl lockd grace sunrpc ip_tables xfs libcrc32c
sd_mod crc_t10dif crct10dif_generic mgag200 syscopyarea sysfillrect sysimgblt
i2c_algo_bit drm_kms_helper ahci ata_generic pata_acpi ttm libahci
crct10dif_pclmul pata_atiixp tg3 libata crct10dif_common drm crc32c_intel ptp
serio_raw bnx2 r8169 hpsa pps_core i2c_core mii dm_mirror dm_region_hash dm_log
dm_mod
[  788.465294] CPU: 16 PID: 0 Comm: swapper/16 Tainted: G        W
------------   3.10.0-327.el7.x86_64 #1
[  788.511521] Hardware name: HP ProLiant DL385p Gen8, BIOS A28 12/17/2012
[  788.542260]  ffff880437c036b8 f7afc56532a53db9 ffff880437c03670
ffffffff816351f1
[  788.576332]  ffff880437c036a8 ffffffff8107b200 ffff880633e74200
ffff880231674000
[  788.611943]  0000000000000001 0000000000000003 0000000000000000
ffff880437c03710
[  788.647241] Call Trace:
[  788.658817]  <IRQ>  [<ffffffff816351f1>] dump_stack+0x19/0x1b
[  788.686193]  [<ffffffff8107b200>] warn_slowpath_common+0x70/0xb0
[  788.713803]  [<ffffffff8107b29c>] warn_slowpath_fmt+0x5c/0x80
[  788.741314]  [<ffffffff812f92f3>] ? ___ratelimit+0x93/0x100
[  788.767018]  [<ffffffff81637f49>] skb_warn_bad_offload+0xcd/0xda
[  788.796117]  [<ffffffff8152950c>] skb_checksum_help+0x17c/0x190
[  788.823392]  [<ffffffffa01463a1>] netem_enqueue+0x741/0x7c0 [sch_netem]
[  788.854487]  [<ffffffff8152cb58>] dev_queue_xmit+0x2a8/0x570
[  788.880870]  [<ffffffff8156ae1d>] ip_finish_output+0x53d/0x7d0
...

The problem occurs because netem is not prepared to handle GSO packets (as it
uses skb_checksum_help in its enqueue path, which cannot manipulate these
frames).

The solution I think is to simply segment the skb in a similar fashion to the
way we do in __dev_queue_xmit (via validate_xmit_skb), with some minor changes.
When we decide to corrupt an skb, if the frame is GSO, we segment it, corrupt
the first segment, and enqueue the remaining ones.

Tested successfully by myself on the latest net kernel, to which this applies.

Signed-off-by: Neil Horman <nhorman at tuxdriver.com>
CC: Jamal Hadi Salim <jhs at mojatatu.com>
CC: "David S. Miller" <davem at davemloft.net>
CC: netem at lists.linux-foundation.org
CC: eric.dumazet at gmail.com
CC: stephen at networkplumber.org
Acked-by: Eric Dumazet <edumazet at google.com>
Signed-off-by: David S. Miller <davem at davemloft.net>
Signed-off-by: Sasha Levin <sasha.levin at oracle.com>
---
 net/sched/sch_netem.c | 61 +++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 59 insertions(+), 2 deletions(-)

diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index cc00329..80124c1 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -395,6 +395,25 @@ static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
 	sch->q.qlen++;
 }
 
+/* netem can't properly corrupt a megapacket (like we get from GSO), so instead
+ * when we statistically choose to corrupt one, we instead segment it, returning
+ * the first packet to be corrupted, and re-enqueue the remaining frames
+ */
+static struct sk_buff *netem_segment(struct sk_buff *skb, struct Qdisc *sch)
+{
+	struct sk_buff *segs;
+	netdev_features_t features = netif_skb_features(skb);
+
+	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
+
+	if (IS_ERR_OR_NULL(segs)) {
+		qdisc_reshape_fail(skb, sch);
+		return NULL;
+	}
+	consume_skb(skb);
+	return segs;
+}
+
 /*
  * Insert one skb into qdisc.
  * Note: parent depends on return value to account for queue length.
@@ -407,7 +426,11 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 	/* We don't fill cb now as skb_unshare() may invalidate it */
 	struct netem_skb_cb *cb;
 	struct sk_buff *skb2;
+	struct sk_buff *segs = NULL;
+	unsigned int len = 0, last_len, prev_len = qdisc_pkt_len(skb);
+	int nb = 0;
 	int count = 1;
+	int rc = NET_XMIT_SUCCESS;
 
 	/* Random duplication */
 	if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor))
@@ -453,10 +476,23 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 	 * do it now in software before we mangle it.
 	 */
 	if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) {
+		if (skb_is_gso(skb)) {
+			segs = netem_segment(skb, sch);
+			if (!segs)
+				return NET_XMIT_DROP;
+		} else {
+			segs = skb;
+		}
+
+		skb = segs;
+		segs = segs->next;
+
 		if (!(skb = skb_unshare(skb, GFP_ATOMIC)) ||
 		    (skb->ip_summed == CHECKSUM_PARTIAL &&
-		     skb_checksum_help(skb)))
-			return qdisc_drop(skb, sch);
+		     skb_checksum_help(skb))) {
+			rc = qdisc_drop(skb, sch);
+			goto finish_segs;
+		}
 
 		skb->data[prandom_u32() % skb_headlen(skb)] ^=
 			1<<(prandom_u32() % 8);
@@ -516,6 +552,27 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 		sch->qstats.requeues++;
 	}
 
+finish_segs:
+	if (segs) {
+		while (segs) {
+			skb2 = segs->next;
+			segs->next = NULL;
+			qdisc_skb_cb(segs)->pkt_len = segs->len;
+			last_len = segs->len;
+			rc = qdisc_enqueue(segs, sch);
+			if (rc != NET_XMIT_SUCCESS) {
+				if (net_xmit_drop_count(rc))
+					qdisc_qstats_drop(sch);
+			} else {
+				nb++;
+				len += last_len;
+			}
+			segs = skb2;
+		}
+		sch->q.qlen += nb;
+		if (nb > 1)
+			qdisc_tree_reduce_backlog(sch, 1 - nb, prev_len - len);
+	}
 	return NET_XMIT_SUCCESS;
 }
 
-- 
2.5.0
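
For context: the path this patch changes is only taken when netem's packet corruption is enabled, i.e. with a qdisc along the lines of the following (a minimal sketch; the interface name and corruption percentage are placeholders):

tc qdisc add dev eth0 root netem corrupt 0.1%

With GSO enabled, large TCP writes reach the qdisc as multi-MSS super-packets; skb_checksum_help() cannot mangle those, which is what produced the skb_warn_bad_offload() warning quoted above.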


From sasha.levin at oracle.com  Tue Jul 12 03:04:31 2016
From: sasha.levin at oracle.com (Sasha Levin)
Date: Tue, 12 Jul 2016 03:04:31 -0000
Subject: [added to the 3.18 stable tree] netem: Segment GSO packets
	on enqueue
In-Reply-To: <1468292479-23684-1-git-send-email-sasha.levin@xxxxxxxxxx>
References: <1468292479-23684-1-git-send-email-sasha.levin@xxxxxxxxxx>
Message-ID: <1468292479-23684-181-git-send-email-sasha.levin@xxxxxxxxxx>

From: Neil Horman <nhorman at tuxdriver.com>

This patch has been added to the 3.18 stable tree. If you have any
objections, please let us know.

===============

[ Upstream commit 6071bd1aa13ed9e41824bafad845b7b7f4df5cfd ]

This was recently reported to me, and reproduced on the latest net kernel,
when attempting to run netperf from a host that had a netem qdisc attached
to the egress interface:

[  788.073771] ---------------------[ cut here ]---------------------------
[  788.096716] WARNING: at net/core/dev.c:2253 skb_warn_bad_offload+0xcd/0xda()
[  788.129521] bnx2: caps=(0x00000001801949b3, 0x0000000000000000) len=2962
data_len=0 gso_size=1448 gso_type=1 ip_summed=3
[  788.182150] Modules linked in: sch_netem kvm_amd kvm crc32_pclmul ipmi_ssif
ghash_clmulni_intel sp5100_tco amd64_edac_mod aesni_intel lrw gf128mul
glue_helper ablk_helper edac_mce_amd cryptd pcspkr sg edac_core hpilo ipmi_si
i2c_piix4 k10temp fam15h_power hpwdt ipmi_msghandler shpchp acpi_power_meter
pcc_cpufreq nfsd auth_rpcgss nfs_acl lockd grace sunrpc ip_tables xfs libcrc32c
sd_mod crc_t10dif crct10dif_generic mgag200 syscopyarea sysfillrect sysimgblt
i2c_algo_bit drm_kms_helper ahci ata_generic pata_acpi ttm libahci
crct10dif_pclmul pata_atiixp tg3 libata crct10dif_common drm crc32c_intel ptp
serio_raw bnx2 r8169 hpsa pps_core i2c_core mii dm_mirror dm_region_hash dm_log
dm_mod
[  788.465294] CPU: 16 PID: 0 Comm: swapper/16 Tainted: G        W
------------   3.10.0-327.el7.x86_64 #1
[  788.511521] Hardware name: HP ProLiant DL385p Gen8, BIOS A28 12/17/2012
[  788.542260]  ffff880437c036b8 f7afc56532a53db9 ffff880437c03670
ffffffff816351f1
[  788.576332]  ffff880437c036a8 ffffffff8107b200 ffff880633e74200
ffff880231674000
[  788.611943]  0000000000000001 0000000000000003 0000000000000000
ffff880437c03710
[  788.647241] Call Trace:
[  788.658817]  <IRQ>  [<ffffffff816351f1>] dump_stack+0x19/0x1b
[  788.686193]  [<ffffffff8107b200>] warn_slowpath_common+0x70/0xb0
[  788.713803]  [<ffffffff8107b29c>] warn_slowpath_fmt+0x5c/0x80
[  788.741314]  [<ffffffff812f92f3>] ? ___ratelimit+0x93/0x100
[  788.767018]  [<ffffffff81637f49>] skb_warn_bad_offload+0xcd/0xda
[  788.796117]  [<ffffffff8152950c>] skb_checksum_help+0x17c/0x190
[  788.823392]  [<ffffffffa01463a1>] netem_enqueue+0x741/0x7c0 [sch_netem]
[  788.854487]  [<ffffffff8152cb58>] dev_queue_xmit+0x2a8/0x570
[  788.880870]  [<ffffffff8156ae1d>] ip_finish_output+0x53d/0x7d0
...

The problem occurs because netem is not prepared to handle GSO packets (as it
uses skb_checksum_help in its enqueue path, which cannot manipulate these
frames).

The solution I think is to simply segment the skb in a similar fashion to the
way we do in __dev_queue_xmit (via validate_xmit_skb), with some minor changes.
When we decide to corrupt an skb, if the frame is GSO, we segment it, corrupt
the first segment, and enqueue the remaining ones.

Tested successfully by myself on the latest net kernel, to which this applies.

Signed-off-by: Neil Horman <nhorman at tuxdriver.com>
CC: Jamal Hadi Salim <jhs at mojatatu.com>
CC: "David S. Miller" <davem at davemloft.net>
CC: netem at lists.linux-foundation.org
CC: eric.dumazet at gmail.com
CC: stephen at networkplumber.org
Acked-by: Eric Dumazet <edumazet at google.com>
Signed-off-by: David S. Miller <davem at davemloft.net>
Signed-off-by: Sasha Levin <sasha.levin at oracle.com>
---
 net/sched/sch_netem.c | 61 +++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 59 insertions(+), 2 deletions(-)

diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index fac07d5..f60db2b 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -408,6 +408,25 @@ static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
 	sch->q.qlen++;
 }
 
+/* netem can't properly corrupt a megapacket (like we get from GSO), so instead
+ * when we statistically choose to corrupt one, we instead segment it, returning
+ * the first packet to be corrupted, and re-enqueue the remaining frames
+ */
+static struct sk_buff *netem_segment(struct sk_buff *skb, struct Qdisc *sch)
+{
+	struct sk_buff *segs;
+	netdev_features_t features = netif_skb_features(skb);
+
+	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
+
+	if (IS_ERR_OR_NULL(segs)) {
+		qdisc_reshape_fail(skb, sch);
+		return NULL;
+	}
+	consume_skb(skb);
+	return segs;
+}
+
 /*
  * Insert one skb into qdisc.
  * Note: parent depends on return value to account for queue length.
@@ -420,7 +439,11 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 	/* We don't fill cb now as skb_unshare() may invalidate it */
 	struct netem_skb_cb *cb;
 	struct sk_buff *skb2;
+	struct sk_buff *segs = NULL;
+	unsigned int len = 0, last_len, prev_len = qdisc_pkt_len(skb);
+	int nb = 0;
 	int count = 1;
+	int rc = NET_XMIT_SUCCESS;
 
 	/* Random duplication */
 	if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor))
@@ -466,10 +489,23 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 	 * do it now in software before we mangle it.
 	 */
 	if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) {
+		if (skb_is_gso(skb)) {
+			segs = netem_segment(skb, sch);
+			if (!segs)
+				return NET_XMIT_DROP;
+		} else {
+			segs = skb;
+		}
+
+		skb = segs;
+		segs = segs->next;
+
 		if (!(skb = skb_unshare(skb, GFP_ATOMIC)) ||
 		    (skb->ip_summed == CHECKSUM_PARTIAL &&
-		     skb_checksum_help(skb)))
-			return qdisc_drop(skb, sch);
+		     skb_checksum_help(skb))) {
+			rc = qdisc_drop(skb, sch);
+			goto finish_segs;
+		}
 
 		skb->data[prandom_u32() % skb_headlen(skb)] ^=
 			1<<(prandom_u32() % 8);
@@ -529,6 +565,27 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 		sch->qstats.requeues++;
 	}
 
+finish_segs:
+	if (segs) {
+		while (segs) {
+			skb2 = segs->next;
+			segs->next = NULL;
+			qdisc_skb_cb(segs)->pkt_len = segs->len;
+			last_len = segs->len;
+			rc = qdisc_enqueue(segs, sch);
+			if (rc != NET_XMIT_SUCCESS) {
+				if (net_xmit_drop_count(rc))
+					qdisc_qstats_drop(sch);
+			} else {
+				nb++;
+				len += last_len;
+			}
+			segs = skb2;
+		}
+		sch->q.qlen += nb;
+		if (nb > 1)
+			qdisc_tree_reduce_backlog(sch, 1 - nb, prev_len - len);
+	}
 	return NET_XMIT_SUCCESS;
 }
 
-- 
2.5.0


