Hi, I have two hosts on a 10GB network which can send and receive 10GB/s with my TCP settings. iperf3 -c 192.168.120.14 Connecting to host 192.168.120.14, port 5201 [ 4] local 192.168.120.10 port 57217 connected to 192.168.120.14 port 5201 [ ID] Interval Transfer Bandwidth Retr Cwnd [ 4] 0.00-1.00 sec 1.09 GBytes 9.41 Gbits/sec 0 872 KBytes [ 4] 1.00-2.00 sec 1.10 GBytes 9.42 Gbits/sec 0 915 KBytes [ 4] 2.00-3.00 sec 1.10 GBytes 9.42 Gbits/sec 0 915 KBytes [ 4] 3.00-4.00 sec 1.09 GBytes 9.41 Gbits/sec 0 915 KBytes [ 4] 4.00-5.00 sec 1.10 GBytes 9.42 Gbits/sec 0 915 KBytes [ 4] 5.00-6.00 sec 1.10 GBytes 9.42 Gbits/sec 0 915 KBytes [ 4] 6.00-7.00 sec 1.10 GBytes 9.42 Gbits/sec 0 966 KBytes [ 4] 7.00-8.00 sec 1.10 GBytes 9.42 Gbits/sec 0 966 KBytes ^C[ 4] 8.00-8.05 sec 60.0 MBytes 9.34 Gbits/sec 0 966 KBytes - - - - - - - - - - - - - - - - - - - - - - - - - [ ID] Interval Transfer Bandwidth Retr [ 4] 0.00-8.05 sec 8.83 GBytes 9.41 Gbits/sec 0 sender [ 4] 0.00-8.05 sec 0.00 Bytes 0.00 bits/sec receiver In the final setup, the hosts will be deployed at different geographical sites and communicate via a WAN connection with max. 80ms latency. I want to use netem to simulate that latency for testing. To do so, I define a qdisc with 80ms latency on both servers and ran some tests with iperf. TCP buffers are sized to 6MB default and 32MB max. 
I expect to be able to reach ~800MB/s with these settings, but sometimes the system fails to reach this: iperf3 -c 192.168.120.14 Connecting to host 192.168.120.14, port 5201 [ 4] local 192.168.120.10 port 57245 connected to 192.168.120.14 port 5201 [ ID] Interval Transfer Bandwidth Retr Cwnd [ 4] 0.00-1.00 sec 6.25 MBytes 52.4 Mbits/sec 0 1.10 MBytes [ 4] 1.00-2.00 sec 48.8 MBytes 409 Mbits/sec 152 8.44 MBytes [ 4] 2.00-3.00 sec 51.2 MBytes 430 Mbits/sec 0 8.45 MBytes [ 4] 3.00-4.00 sec 50.0 MBytes 419 Mbits/sec 0 8.45 MBytes [ 4] 4.00-5.00 sec 58.8 MBytes 493 Mbits/sec 0 8.47 MBytes [ 4] 5.00-6.00 sec 51.2 MBytes 430 Mbits/sec 0 8.49 MBytes [ 4] 6.00-7.00 sec 50.0 MBytes 419 Mbits/sec 0 8.53 MBytes [ 4] 7.00-8.00 sec 51.2 MBytes 430 Mbits/sec 0 8.58 MBytes [ 4] 8.00-9.00 sec 60.0 MBytes 503 Mbits/sec 0 8.67 MBytes [ 4] 9.00-10.00 sec 52.5 MBytes 440 Mbits/sec 0 8.77 MBytes - - - - - - - - - - - - - - - - - - - - - - - - - [ ID] Interval Transfer Bandwidth Retr [ 4] 0.00-10.00 sec 480 MBytes 403 Mbits/sec 152 sender [ 4] 0.00-10.00 sec 467 MBytes 392 Mbits/sec receiver In other runs, I observed retransmissions occurring. To localize the problems, I started to run a simulation with 1ms latency only on both hosts to see if there is a principal problem with netem or its settings. 10:29:42 1290 0 ~ # tc qdisc add dev eno50 root netem delay 1ms limit 1250; tc qdisc add dev eno49 root netem delay 1ms limit 1250 10:29:50 1291 0 ~ # ping 192.168.120.14 PING 192.168.120.14 (192.168.120.14) 56(84) bytes of data. 
64 bytes fro192.168.120.14: icmp_seq=1 ttl=64 time=2.29 ms 64 bytes fro192.168.120.14: icmp_seq=2 ttl=64 time=2.27 ms ^C --- 192.168.120.14 ping statistics --- 2 packets transmitted, 2 received, 0% packeloss, tim1001ms rtmin/avg/max/mdev = 2.276/2.287/2.299/0.049 ms 10:30:13 1292 0 ~ # iperf3 -c 192.168.120.14 -i 1 -0 Connecting to hos192.168.120.14, por5201 [ 4] local 192.168.120.10 por59752 connected to 192.168.120.14 por5201 [ ID] Interval Transfer Bandwidth Retr Cwnd [ 4] 0.00-1.00 sec 314 MBytes 2.63 Gbits/sec 13 936 KBytes [ 4] 1.00-2.00 sec 471 MBytes 3.95 Gbits/sec 0 1.24 MBytes [ 4] 2.00-3.00 sec 614 MBytes 5.15 Gbits/sec 0 1.56 MBytes [ 4] 3.00-4.00 sec 759 MBytes 6.36 Gbits/sec 170 1.87 MBytes [ 4] 4.00-5.00 sec 890 MBytes 7.47 Gbits/sec 21 2.19 MBytes [ 4] 5.00-6.00 sec 1011 MBytes 8.48 Gbits/sec 912 2.46 MBytes [ 4] 6.00-7.00 sec 1018 MBytes 8.54 Gbits/sec 2183 1.92 MBytes [ 4] 7.00-8.00 sec 909 MBytes 7.62 Gbits/sec 360 2.21 MBytes [ 4] 8.00-9.00 sec 1014 MBytes 8.50 Gbits/sec 1428 2.44 MBytes [ 4] 9.00-10.00 sec 972 MBytes 8.16 Gbits/sec 2154 1.92 MBytes [ 4] 10.00-11.00 sec 911 MBytes 7.64 Gbits/sec 53 2.21 MBytes [ 4] 11.00-12.00 sec 1.01 GBytes 8.64 Gbits/sec 431 2.48 MBytes [ 4] 12.00-13.00 sec 880 MBytes 7.38 Gbits/sec 1493 2.02 MBytes [ 4] 13.00-14.00 sec 960 MBytes 8.05 Gbits/sec 0 2.34 MBytes [ 4] 14.00-15.00 sec 1.02 GBytes 8.77 Gbits/sec 2396 2.56 MBytes [ 4] 15.00-16.00 sec 925 MBytes 7.76 Gbits/sec 2565 2.02 MBytes [ 4] 16.00-17.00 sec 958 MBytes 8.03 Gbits/sec 65 2.32 MBytes [ 4] 17.00-18.00 sec 1024 MBytes 8.59 Gbits/sec 2936 2.49 MBytes [ 4] 18.00-19.00 sec 1.01 GBytes 8.70 Gbits/sec 6058 2.62 MBytes [ 4] 19.00-20.00 sec 904 MBytes 7.58 Gbits/sec 1215 2.12 MBytes [ 4] 20.00-21.00 sec 995 MBytes 8.35 Gbits/sec 421 2.40 MBytes [ 4] 21.00-22.00 sec 1.06 GBytes 9.13 Gbits/sec 1494 2.65 MBytes [ 4] 22.00-23.00 sec 960 MBytes 8.05 Gbits/sec 1774 2.11 MBytes [ 4] 23.00-24.00 sec 995 MBytes 8.35 Gbits/sec 39 2.41 MBytes [ 4] 24.00-25.00 sec 
1.06 GBytes 9.14 Gbits/sec 2185 2.30 MBytes [ 4] 25.00-26.00 sec 948 MBytes 7.95 Gbits/sec 8771 2.08 MBytes [ 4] 26.00-27.00 sec 982 MBytes 8.24 Gbits/sec 82 2.36 MBytes [ 4] 27.00-28.00 sec 1.05 GBytes 9.06 Gbits/sec 1144 2.65 MBytes [ 4] 28.00-29.00 sec 975 MBytes 8.18 Gbits/sec 12405 1.96 MBytes [ 4] 29.00-30.00 sec 936 MBytes 7.85 Gbits/sec 1 2.28 MBytes [ 4] 30.00-31.00 sec 1.04 GBytes 8.93 Gbits/sec 47 2.58 MBytes [ 4] 31.00-32.00 sec 1.02 GBytes 8.74 Gbits/sec 9484 2.75 MBytes [ 4] 32.00-33.00 sec 891 MBytes 7.49 Gbits/sec 22296 2.00 MBytes [ 4] 33.00-34.00 sec 925 MBytes 7.76 Gbits/sec 576 2.26 MBytes [ 4] 34.00-35.00 sec 1010 MBytes 8.47 Gbits/sec 1679 2.47 MBytes [ 4] 35.00-36.00 sec 1.02 GBytes 8.77 Gbits/sec 5385 1.84 MBytes [ 4] 36.00-37.00 sec 866 MBytes 7.27 Gbits/sec 9 2.14 MBytes ^C[ 4] 37.00-37.00 sec 3.75 MBytes 9.12 Gbits/sec 0 2.14 MBytes - - - - - - - - - - - - - - - - - - - - - - - - - [ ID] Interval Transfer Bandwidth Retr [ 4] 0.00-37.00 sec 33.7 GBytes 7.83 Gbits/sec 92245 sender [ 4] 0.00-37.00 sec 0.00 Bytes 0.00 bits/sec receiver iperf3: interrup- thclient has terminated WheI run thsame experiment with delay 1ms only on the iperf server the retransmissions disappear: iperf3 -c 192.168.120.14-i 1 -0 -w 12m Connecting to hos192.168.120.14, por5201 [ 4] local 192.168.120.10 por59851 connected to 192.168.120.14 por5201 [ ID] Interval Transfer Bandwidth Retr Cwnd [ 4] 0.00-1.00 sec 1.03 GBytes 8.84 Gbits/sec 13 1.90 MBytes [ 4] 1.00-2.00 sec 1.10 GBytes 9.42 Gbits/sec 0 1.91 MBytes [ 4] 2.00-3.00 sec 1.10 GBytes 9.42 Gbits/sec 0 1.92 MBytes [ 4] 3.00-4.00 sec 1.10 GBytes 9.42 Gbits/sec 0 1.93 MBytes [ 4] 4.00-5.00 sec 1.10 GBytes 9.41 Gbits/sec 0 1.95 MBytes [ 4] 5.00-6.00 sec 1.09 GBytes 9.41 Gbits/sec 0 1.99 MBytes [ 4] 6.00-7.00 sec 1.10 GBytes 9.42 Gbits/sec 0 2.03 MBytes [ 4] 7.00-8.00 sec 1.10 GBytes 9.42 Gbits/sec 0 2.13 MBytes [ 4] 8.00-9.00 sec 1.10 GBytes 9.41 Gbits/sec 0 2.13 MBytes [ 4] 9.00-10.00 sec 1.09 GBytes 9.40 Gbits/sec 0 
2.29 MBytes [ 4] 10.00-11.00 sec 1.10 GBytes 9.42 Gbits/sec 0 2.29 MBytes [ 4] 11.00-12.00 sec 1.10 GBytes 9.42 Gbits/sec 0 2.29 MBytes ^C[ 4] 12.00-12.65 sec 725 MBytes 9.41 Gbits/sec 0 2.29 MBytes - - - - - - - - - - - - - - - - - - - - - - - - - [ ID] Interval Transfer Bandwidth Retr [ 4] 0.00-12.65 sec 13.8 GBytes 9.37 Gbits/sec 13 sender [ 4] 0.00-12.65 sec 0.00 Bytes 0.00 bits/sec receiver iperf3: interrup- thclient has terminated To doublcheck, I disabled delay on thiperf server and add 1ms latency on the client. The problems start re-appear: PING 192.168.120.14 (192.168.120.14) 56(84) bytes of data. 64 bytes fro192.168.120.14: icmp_seq=1 ttl=64 time=1.19 ms 64 bytes fro192.168.120.14: icmp_seq=2 ttl=64 time=1.18 ms 64 bytes fro192.168.120.14: icmp_seq=3 ttl=64 time=1.17 ms 64 bytes fro192.168.120.14: icmp_seq=4 ttl=64 time=1.17 ms MDA1PFP-S01 11:23:41 1386 1 ~ # iperf3 -c 192.168.120.14 -i 1 -0 Connecting to hos192.168.120.14, por5201 [ 4] local 192.168.120.10 por59975 connected to 192.168.120.14 por5201 [ ID] Interval Transfer Bandwidth Retr Cwnd [ 4] 0.00-1.00 sec 1.57 GBytes 13.5 Gbits/sec 37 776 KBytes [ 4] 1.00-2.00 sec 798 MBytes 6.69 Gbits/sec 0 1.32 MBytes [ 4] 2.00-3.00 sec 1.04 GBytes 8.95 Gbits/sec 1244 1.60 MBytes [ 4] 3.00-4.00 sec 1.04 GBytes 8.97 Gbits/sec 2506 1.64 MBytes [ 4] 4.00-5.00 sec 1.03 GBytes 8.86 Gbits/sec 336 1.61 MBytes [ 4] 5.00-6.00 sec 1.02 GBytes 8.74 Gbits/sec 2892 1.64 MBytes [ 4] 6.00-7.00 sec 1.00 GBytes 8.63 Gbits/sec 2896 1.33 MBytes [ 4] 7.00-8.00 sec 1.00 GBytes 8.60 Gbits/sec 262 1.38 MBytes [ 4] 8.00-9.00 sec 1.05 GBytes 8.99 Gbits/sec 1234 1.58 MBytes [ 4] 9.00-10.00 sec 1.01 GBytes 8.64 Gbits/sec 2367 1.64 MBytes [ 4] 10.00-11.00 sec 1.01 GBytes 8.72 Gbits/sec 5315 1.67 MBytes [ 4] 11.00-12.00 sec 1.01 GBytes 8.69 Gbits/sec 2248 1.39 MBytes [ 4] 12.00-13.00 sec 1.06 GBytes 9.13 Gbits/sec 1086 1.63 MBytes [ 4] 13.00-14.00 sec 1.02 GBytes 8.80 Gbits/sec 3066 1.65 MBytes [ 4] 14.00-15.00 sec 1.05 GBytes 9.04 
Gbits/sec 3889 1.65 MBytes [ 4] 15.00-16.00 sec 1.01 GBytes 8.68 Gbits/sec 3168 1.21 MBytes [ 4] 16.00-17.00 sec 1.05 GBytes 9.04 Gbits/sec 483 1.59 MBytes [ 4] 17.00-18.00 sec 1.02 GBytes 8.78 Gbits/sec 3070 1.17 MBytes [ 4] 18.00-19.00 sec 1.04 GBytes 8.92 Gbits/sec 748 1.59 MBytes [ 4] 19.00-20.00 sec 1.00 GBytes 8.63 Gbits/sec 1181 1.38 MBytes [ 4] 20.00-21.00 sec 1.07 GBytes 9.21 Gbits/sec 805 1.59 MBytes [ 4] 21.00-22.00 sec 1021 MBytes 8.57 Gbits/sec 2500 1.27 MBytes [ 4] 22.00-23.00 sec 1.06 GBytes 9.07 Gbits/sec 766 1.63 MBytes [ 4] 23.00-24.00 sec 1.04 GBytes 8.93 Gbits/sec 3376 1.65 MBytes [ 4] 24.00-25.00 sec 1.01 GBytes 8.71 Gbits/sec 1223 1.46 MBytes [ 4] 25.00-26.00 sec 1.06 GBytes 9.13 Gbits/sec 1534 1.62 MBytes [ 4] 26.00-27.00 sec 1.05 GBytes 9.01 Gbits/sec 3015 1.64 MBytes ^C[ 4] 27.00-27.39 sec 411 MBytes 8.96 Gbits/sec 1266 1.66 MBytes It seems that even with 1ms delay netem has a very negative impact on the socket performance. The command I use to set the delay is tc qdisc add dev eno49 root netem delay 1ms limit 1250 The limit is computed as 10 Gbps / 1500 bytes MTU * 1 ms * 1.5 = 1250. Is there anything wrong with my settings? Best wishes, Jens Auer -- Dr. Jens Auer | CGI | Software Engineer CGI Deutschland Ltd. & Co. KG Rheinstraße 95 | 64295 Darmstadt | Germany T: +49 6151 36860 154 jens.auer at cgi.com Unsere Pflichtangaben gemäß § 35a GmbHG / §§ 161, 125a HGB finden Sie unter de.cgi.com/pflichtangaben. CONFIDENTIALITY NOTICE: Proprietary/Confidential information belonging to CGI Group Inc. and its affiliates may be contained in this message. If you are not a recipient indicated or intended in this message (or responsible for delivery of this message to such person), or you think for any reason that this message may have been addressed to you in error, you may not use or copy or deliver this message to anyone else. In such case, you should destroy this message and are asked to notify the sender by reply e-mail. 
Frosasha.levin aoracle.com Tue Jul 12 02:59:52 2016 From: sasha.leviaoracle.com (Sasha Levin) Date: Tue, 12 Jul 2016 02:59:52 -0000 Subject: [added to th4.1 stabltree] netem: Segment GSO packets oenqueue In-Reply-To: <1468292170-22812-1-git-send-email-sasha.levin@xxxxxxxxxx> References: <1468292170-22812-1-git-send-email-sasha.levin@xxxxxxxxxx> Message-ID: <1468292170-22812-216-git-send-email-sasha.levin@xxxxxxxxxx> From: Neil Horma<nhorman atuxdriver.com> This patch has beeadded to th4.1 stable tree. If you have any objections, pleasleus know. =============== [ Upstreacommi6071bd1aa13ed9e41824bafad845b7b7f4df5cfd ] This was recently reported to me, and reproduced othlatest net kernel, wheattempting to run netperf froa host that had a netem qdisc attached to thegress interface: [ 788.073771] ---------------------[ cuher]--------------------------- [ 788.096716] WARNING: anet/core/dev.c:2253 skb_warn_bad_offload+0xcd/0xda() [ 788.129521] bnx2: caps=(0x00000001801949b3, 0x0000000000000000) len=2962 data_len=0 gso_size=1448 gso_type=1 ip_summed=3 [ 788.182150] Modules linked in: sch_netekvm_amd kvcrc32_pclmul ipmi_ssif ghash_clmulni_intel sp5100_tco amd64_edac_mod aesni_intel lrw gf128mul glue_helper ablk_helper edac_mce_amd cryptd pcspkr sg edac_corhpilo ipmi_si i2c_piix4 k10temp fam15h_power hpwdipmi_msghandler shpchp acpi_power_meter pcc_cpufreq nfsd auth_rpcgss nfs_acl lockd gracsunrpc ip_tables xfs libcrc32c sd_mod crc_t10dif crct10dif_generic mgag200 syscopyarea sysfillrecsysimgblt i2c_algo_bidrm_kms_helper ahci ata_generic pata_acpi ttlibahci crct10dif_pclmul pata_atiixp tg3 libata crct10dif_commodrcrc32c_intel ptp serio_raw bnx2 r8169 hpsa pps_cori2c_cormii dm_mirror dm_region_hash dm_log dm_mod [ 788.465294] CPU: 16 PID: 0 Comm: swapper/16 Tainted: G W ------------ 3.10.0-327.el7.x86_64 #1 [ 788.511521] Hardwarname: HP ProLianDL385p Gen8, BIOS A28 12/17/2012 [ 788.542260] ffff880437c036b8 f7afc56532a53db9 ffff880437c03670 ffffffff816351f1 [ 788.576332] 
ffff880437c036a8 ffffffff8107b200 ffff880633e74200 ffff880231674000 [ 788.611943] 0000000000000001 0000000000000003 0000000000000000 ffff880437c03710 [ 788.647241] Call Trace: [ 788.658817] <IRQ> [<ffffffff816351f1>] dump_stack+0x19/0x1b [ 788.686193] [<ffffffff8107b200>] warn_slowpath_common+0x70/0xb0 [ 788.713803] [<ffffffff8107b29c>] warn_slowpath_fmt+0x5c/0x80 [ 788.741314] [<ffffffff812f92f3>] ? ___ratelimit+0x93/0x100 [ 788.767018] [<ffffffff81637f49>] skb_warn_bad_offload+0xcd/0xda [ 788.796117] [<ffffffff8152950c>] skb_checksum_help+0x17c/0x190 [ 788.823392] [<ffffffffa01463a1>] netem_enqueue+0x741/0x7c0 [sch_netem] [ 788.854487] [<ffffffff8152cb58>] dev_queue_xmit+0x2a8/0x570 [ 788.880870] [<ffffffff8156ae1d>] ip_finish_output+0x53d/0x7d0 ... Thprobleoccurs because netem is not prepared to handle GSO packets (as it uses skb_checksum_help iits enqueupath, which cannot manipulate these frames). Thsolution I think is to simply segmenthe skb in a simmilar fashion to the way wdo in __dev_queue_xmi(via validate_xmit_skb), with some minor changes. Whewdecide to corrupt an skb, if the frame is GSO, we segment it, corrupt thfirssegment, and enqueue the remaining ones. tested successfully by myself othlatest net kernel, to which this applies Signed-off-by: Neil Horma<nhorman atuxdriver.com> CC: Jamal Hadi Sali<jhs amojatatu.com> CC: "David S. Miller" <daveadavemloft.net> CC: netealists.linux-foundation.org CC: eric.dumazeagmail.com CC: stepheanetworkplumber.org Acked-by: Eric Dumaze<edumazeat google.com> Signed-off-by: David S. 
Miller <daveadavemloft.net> Signed-off-by: Sasha Levi<sasha.levin aoracle.com> --- net/sched/sch_netem.c | 61 +++++++++++++++++++++++++++++++++++++++++++++++++-- 1 filchanged, 59 insertions(+), 2 deletions(-) diff --gia/net/sched/sch_netem.c b/net/sched/sch_netem.c index cc00329..80124c1 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -395,6 +395,25 @@ static void tfifo_enqueue(strucsk_buff *nskb, strucQdisc *sch) sch->q.qlen++; } +/* netecan'properly corrupt a megapacket (like we get from GSO), so instead + * whewstatistically choose to corrupt one, we instead segment it, returning + * thfirspacket to be corrupted, and re-enqueue the remaining frames + */ +static strucsk_buff *netem_segment(strucsk_buff *skb, struct Qdisc *sch) +{ + strucsk_buff *segs; + netdev_features_features = netif_skb_features(skb); + + segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); + + if (IS_ERR_OR_NULL(segs)) { + qdisc_reshape_fail(skb, sch); + returNULL; + } + consume_skb(skb); + retursegs; +} + /* * Inseronskb into qdisc. * Note: parendepends on return valuto account for queue length. @@ -407,7 +426,11 @@ static innetem_enqueue(strucsk_buff *skb, struct Qdisc *sch) /* Wdon'fill cb now as skb_unshare() may invalidate it */ strucnetem_skb_cb *cb; strucsk_buff *skb2; + strucsk_buff *segs = NULL; + unsigned inlen = 0, last_len, prev_len = qdisc_pkt_len(skb); + innb = 0; incoun= 1; + inrc = NET_XMIT_SUCCESS; /* Randoduplication */ if (q->duplicat&& q->duplicat>= get_crandom(&q->dup_cor)) @@ -453,10 +476,23 @@ static innetem_enqueue(strucsk_buff *skb, struct Qdisc *sch) * do inow in softwarbefore we mangle it. 
*/ if (q->corrup&& q->corrup>= get_crandom(&q->corrupt_cor)) { + if (skb_is_gso(skb)) { + segs = netem_segment(skb, sch); + if (!segs) + returNET_XMIT_DROP; + } els{ + segs = skb; + } + + skb = segs; + segs = segs->next; + if (!(skb = skb_unshare(skb, GFP_ATOMIC)) || (skb->ip_summed == CHECKSUM_PARTIAL && - skb_checksum_help(skb))) - returqdisc_drop(skb, sch); + skb_checksum_help(skb))) { + rc = qdisc_drop(skb, sch); + goto finish_segs; + } skb->data[prandom_u32() % skb_headlen(skb)] ^= 1<<(prandom_u32() % 8); @@ -516,6 +552,27 @@ static innetem_enqueue(strucsk_buff *skb, struct Qdisc *sch) sch->qstats.requeues++; } +finish_segs: + if (segs) { + whil(segs) { + skb2 = segs->next; + segs->nex= NULL; + qdisc_skb_cb(segs)->pkt_le= segs->len; + last_le= segs->len; + rc = qdisc_enqueue(segs, sch); + if (rc != NET_XMIT_SUCCESS) { + if (net_xmit_drop_count(rc)) + qdisc_qstats_drop(sch); + } els{ + nb++; + le+= last_len; + } + segs = skb2; + } + sch->q.qle+= nb; + if (nb > 1) + qdisc_tree_reduce_backlog(sch, 1 - nb, prev_le- len); + } returNET_XMIT_SUCCESS; } -- 2.5.0 Frosasha.levin aoracle.com Tue Jul 12 03:04:31 2016 From: sasha.leviaoracle.com (Sasha Levin) Date: Tue, 12 Jul 2016 03:04:31 -0000 Subject: [added to th3.18 stabltree] netem: Segment GSO packets oenqueue In-Reply-To: <1468292479-23684-1-git-send-email-sasha.levin@xxxxxxxxxx> References: <1468292479-23684-1-git-send-email-sasha.levin@xxxxxxxxxx> Message-ID: <1468292479-23684-181-git-send-email-sasha.levin@xxxxxxxxxx> From: Neil Horma<nhorman atuxdriver.com> This patch has beeadded to th3.18 stable tree. If you have any objections, pleasleus know. 
=============== [ Upstreacommi6071bd1aa13ed9e41824bafad845b7b7f4df5cfd ] This was recently reported to me, and reproduced othlatest net kernel, wheattempting to run netperf froa host that had a netem qdisc attached to thegress interface: [ 788.073771] ---------------------[ cuher]--------------------------- [ 788.096716] WARNING: anet/core/dev.c:2253 skb_warn_bad_offload+0xcd/0xda() [ 788.129521] bnx2: caps=(0x00000001801949b3, 0x0000000000000000) len=2962 data_len=0 gso_size=1448 gso_type=1 ip_summed=3 [ 788.182150] Modules linked in: sch_netekvm_amd kvcrc32_pclmul ipmi_ssif ghash_clmulni_intel sp5100_tco amd64_edac_mod aesni_intel lrw gf128mul glue_helper ablk_helper edac_mce_amd cryptd pcspkr sg edac_corhpilo ipmi_si i2c_piix4 k10temp fam15h_power hpwdipmi_msghandler shpchp acpi_power_meter pcc_cpufreq nfsd auth_rpcgss nfs_acl lockd gracsunrpc ip_tables xfs libcrc32c sd_mod crc_t10dif crct10dif_generic mgag200 syscopyarea sysfillrecsysimgblt i2c_algo_bidrm_kms_helper ahci ata_generic pata_acpi ttlibahci crct10dif_pclmul pata_atiixp tg3 libata crct10dif_commodrcrc32c_intel ptp serio_raw bnx2 r8169 hpsa pps_cori2c_cormii dm_mirror dm_region_hash dm_log dm_mod [ 788.465294] CPU: 16 PID: 0 Comm: swapper/16 Tainted: G W ------------ 3.10.0-327.el7.x86_64 #1 [ 788.511521] Hardwarname: HP ProLianDL385p Gen8, BIOS A28 12/17/2012 [ 788.542260] ffff880437c036b8 f7afc56532a53db9 ffff880437c03670 ffffffff816351f1 [ 788.576332] ffff880437c036a8 ffffffff8107b200 ffff880633e74200 ffff880231674000 [ 788.611943] 0000000000000001 0000000000000003 0000000000000000 ffff880437c03710 [ 788.647241] Call Trace: [ 788.658817] <IRQ> [<ffffffff816351f1>] dump_stack+0x19/0x1b [ 788.686193] [<ffffffff8107b200>] warn_slowpath_common+0x70/0xb0 [ 788.713803] [<ffffffff8107b29c>] warn_slowpath_fmt+0x5c/0x80 [ 788.741314] [<ffffffff812f92f3>] ? 
___ratelimit+0x93/0x100 [ 788.767018] [<ffffffff81637f49>] skb_warn_bad_offload+0xcd/0xda [ 788.796117] [<ffffffff8152950c>] skb_checksum_help+0x17c/0x190 [ 788.823392] [<ffffffffa01463a1>] netem_enqueue+0x741/0x7c0 [sch_netem] [ 788.854487] [<ffffffff8152cb58>] dev_queue_xmit+0x2a8/0x570 [ 788.880870] [<ffffffff8156ae1d>] ip_finish_output+0x53d/0x7d0 ... Thprobleoccurs because netem is not prepared to handle GSO packets (as it uses skb_checksum_help iits enqueupath, which cannot manipulate these frames). Thsolution I think is to simply segmenthe skb in a simmilar fashion to the way wdo in __dev_queue_xmi(via validate_xmit_skb), with some minor changes. Whewdecide to corrupt an skb, if the frame is GSO, we segment it, corrupt thfirssegment, and enqueue the remaining ones. tested successfully by myself othlatest net kernel, to which this applies Signed-off-by: Neil Horma<nhorman atuxdriver.com> CC: Jamal Hadi Sali<jhs amojatatu.com> CC: "David S. Miller" <daveadavemloft.net> CC: netealists.linux-foundation.org CC: eric.dumazeagmail.com CC: stepheanetworkplumber.org Acked-by: Eric Dumaze<edumazeat google.com> Signed-off-by: David S. 
Miller <daveadavemloft.net> Signed-off-by: Sasha Levi<sasha.levin aoracle.com> --- net/sched/sch_netem.c | 61 +++++++++++++++++++++++++++++++++++++++++++++++++-- 1 filchanged, 59 insertions(+), 2 deletions(-) diff --gia/net/sched/sch_netem.c b/net/sched/sch_netem.c index fac07d5..f60db2b 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -408,6 +408,25 @@ static void tfifo_enqueue(strucsk_buff *nskb, strucQdisc *sch) sch->q.qlen++; } +/* netecan'properly corrupt a megapacket (like we get from GSO), so instead + * whewstatistically choose to corrupt one, we instead segment it, returning + * thfirspacket to be corrupted, and re-enqueue the remaining frames + */ +static strucsk_buff *netem_segment(strucsk_buff *skb, struct Qdisc *sch) +{ + strucsk_buff *segs; + netdev_features_features = netif_skb_features(skb); + + segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); + + if (IS_ERR_OR_NULL(segs)) { + qdisc_reshape_fail(skb, sch); + returNULL; + } + consume_skb(skb); + retursegs; +} + /* * Inseronskb into qdisc. * Note: parendepends on return valuto account for queue length. @@ -420,7 +439,11 @@ static innetem_enqueue(strucsk_buff *skb, struct Qdisc *sch) /* Wdon'fill cb now as skb_unshare() may invalidate it */ strucnetem_skb_cb *cb; strucsk_buff *skb2; + strucsk_buff *segs = NULL; + unsigned inlen = 0, last_len, prev_len = qdisc_pkt_len(skb); + innb = 0; incoun= 1; + inrc = NET_XMIT_SUCCESS; /* Randoduplication */ if (q->duplicat&& q->duplicat>= get_crandom(&q->dup_cor)) @@ -466,10 +489,23 @@ static innetem_enqueue(strucsk_buff *skb, struct Qdisc *sch) * do inow in softwarbefore we mangle it. 
*/ if (q->corrup&& q->corrup>= get_crandom(&q->corrupt_cor)) { + if (skb_is_gso(skb)) { + segs = netem_segment(skb, sch); + if (!segs) + returNET_XMIT_DROP; + } els{ + segs = skb; + } + + skb = segs; + segs = segs->next; + if (!(skb = skb_unshare(skb, GFP_ATOMIC)) || (skb->ip_summed == CHECKSUM_PARTIAL && - skb_checksum_help(skb))) - returqdisc_drop(skb, sch); + skb_checksum_help(skb))) { + rc = qdisc_drop(skb, sch); + goto finish_segs; + } skb->data[prandom_u32() % skb_headlen(skb)] ^= 1<<(prandom_u32() % 8); @@ -529,6 +565,27 @@ static innetem_enqueue(strucsk_buff *skb, struct Qdisc *sch) sch->qstats.requeues++; } +finish_segs: + if (segs) { + whil(segs) { + skb2 = segs->next; + segs->nex= NULL; + qdisc_skb_cb(segs)->pkt_le= segs->len; + last_le= segs->len; + rc = qdisc_enqueue(segs, sch); + if (rc != NET_XMIT_SUCCESS) { + if (net_xmit_drop_count(rc)) + qdisc_qstats_drop(sch); + } els{ + nb++; + le+= last_len; + } + segs = skb2; + } + sch->q.qle+= nb; + if (nb > 1) + qdisc_tree_reduce_backlog(sch, 1 - nb, prev_le- len); + } returNET_XMIT_SUCCESS; } -- 2.5.0