[PATCH RFC v2 net-next 1/5] net: Introduce Qdisc backpressure infrastructure

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



From: Peilin Ye <peilin.ye@xxxxxxxxxxxxx>

Currently sockets (especially UDP ones) can drop a lot of traffic at TC
egress when rate limited by shaper Qdiscs like HTB.  Improve this by
introducing a Qdisc backpressure infrastructure:

  a. A new 'struct sock' field, @sk_overlimits, which keeps track of the
     number of bytes in the socket send buffer that are currently
     unavailable due to TC egress congestion.  The size of an overlimit
     socket's "effective" send buffer is represented by @sk_sndbuf minus
     @sk_overlimits, with a lower limit of SOCK_MIN_SNDBUF:

     max(@sk_sndbuf - @sk_overlimits, SOCK_MIN_SNDBUF)

  b. A new (*backpressure) 'struct proto' callback, which is the
     protocol's private algorithm for Qdisc backpressure.

Working together:

  1. When a shaper Qdisc (TBF, HTB, CBQ, etc.) drops a packet that
     belongs to a local socket, it calls qdisc_backpressure().

  2. qdisc_backpressure() eventually invokes the socket protocol's
     (*backpressure) callback, which should increase @sk_overlimits.

  3. The transport layer then sees a smaller "effective" send buffer and
     will send more slowly.

  4. It is the per-protocol (*backpressure) implementation's
     responsibility to decrease @sk_overlimits when TC egress becomes
     idle again, potentially by using a timer.

Suggested-by: Cong Wang <cong.wang@xxxxxxxxxxxxx>
Signed-off-by: Peilin Ye <peilin.ye@xxxxxxxxxxxxx>
---
 include/net/sch_generic.h | 11 +++++++++++
 include/net/sock.h        | 21 +++++++++++++++++++++
 net/core/sock.c           |  1 +
 3 files changed, 33 insertions(+)

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index ec693fe7c553..afdf4bf64936 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -19,6 +19,7 @@
 #include <net/gen_stats.h>
 #include <net/rtnetlink.h>
 #include <net/flow_offload.h>
+#include <net/sock.h>
 
 struct Qdisc_ops;
 struct qdisc_walker;
@@ -1188,6 +1189,16 @@ static inline int qdisc_drop_all(struct sk_buff *skb, struct Qdisc *sch,
 	return NET_XMIT_DROP;
 }
 
+static inline void qdisc_backpressure(struct sk_buff *skb)
+{
+	struct sock *sk = skb->sk;
+
+	if (!sk || !sk_fullsock(sk))
+		return;
+
+	sk_backpressure(sk);
+}
+
 /* Length to Time (L2T) lookup in a qdisc_rate_table, to determine how
    long it will take to send a packet given its size.
  */
diff --git a/include/net/sock.h b/include/net/sock.h
index 05a1bbdf5805..ef10ca66cf26 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -277,6 +277,7 @@ struct sk_filter;
   *	@sk_pacing_status: Pacing status (requested, handled by sch_fq)
   *	@sk_max_pacing_rate: Maximum pacing rate (%SO_MAX_PACING_RATE)
   *	@sk_sndbuf: size of send buffer in bytes
+  *	@sk_overlimits: size of temporarily unavailable send buffer in bytes
   *	@__sk_flags_offset: empty field used to determine location of bitfield
   *	@sk_padding: unused element for alignment
   *	@sk_no_check_tx: %SO_NO_CHECK setting, set checksum in TX packets
@@ -439,6 +440,7 @@ struct sock {
 	struct dst_entry __rcu	*sk_dst_cache;
 	atomic_t		sk_omem_alloc;
 	int			sk_sndbuf;
+	int			sk_overlimits;
 
 	/* ===== cache line for TX ===== */
 	int			sk_wmem_queued;
@@ -1264,6 +1266,7 @@ struct proto {
 
 	bool			(*stream_memory_free)(const struct sock *sk, int wake);
 	bool			(*sock_is_readable)(struct sock *sk);
+	void			(*backpressure)(struct sock *sk);
 	/* Memory pressure */
 	void			(*enter_memory_pressure)(struct sock *sk);
 	void			(*leave_memory_pressure)(struct sock *sk);
@@ -2499,6 +2502,24 @@ static inline void sk_stream_moderate_sndbuf(struct sock *sk)
 	WRITE_ONCE(sk->sk_sndbuf, max_t(u32, val, SOCK_MIN_SNDBUF));
 }
 
+static inline int sk_sndbuf_avail(struct sock *sk)
+{
+	int overlimits, sndbuf = READ_ONCE(sk->sk_sndbuf);
+
+	if (!sk->sk_prot->backpressure)
+		return sndbuf;
+
+	overlimits = READ_ONCE(sk->sk_overlimits);
+
+	return max_t(int, sndbuf - overlimits, SOCK_MIN_SNDBUF);
+}
+
+static inline void sk_backpressure(struct sock *sk)
+{
+	if (sk->sk_prot->backpressure)
+		sk->sk_prot->backpressure(sk);
+}
+
 /**
  * sk_page_frag - return an appropriate page_frag
  * @sk: socket
diff --git a/net/core/sock.c b/net/core/sock.c
index 4cb957d934a2..167d471b176f 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2194,6 +2194,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
 
 	/* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
 	refcount_set(&newsk->sk_wmem_alloc, 1);
+	newsk->sk_overlimits	= 0;
 
 	atomic_set(&newsk->sk_omem_alloc, 0);
 	sk_init_common(newsk);
-- 
2.20.1




[Index of Archives]     [Kernel Newbies]     [Security]     [Netfilter]     [Bugtraq]     [Linux FS]     [Yosemite Forum]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Samba]     [Video 4 Linux]     [Device Mapper]     [Linux Resources]

  Powered by Linux