[PATCH bpf-next] bpf: add sock_ops callbacks for data send/recv/acked events

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Add 3 sock_ops operators, namely BPF_SOCK_OPS_DATA_SEND_CB,
BPF_SOCK_OPS_DATA_RECV_CB, and BPF_SOCK_OPS_DATA_ACKED_CB. A flag
BPF_SOCK_OPS_DATA_EVENT_CB_FLAG is provided to minimize the performance
impact. The flag must be explicitly set to enable these callbacks.

If the flag is enabled, the bpf sock_ops program will be called every
time a tcp data packet is sent, received, or acked.
BPF_SOCK_OPS_DATA_SEND_CB: call bpf after a data packet is sent.
BPF_SOCK_OPS_DATA_RECV_CB: call bpf after a data packet is received.
BPF_SOCK_OPS_DATA_ACKED_CB: call bpf after a valid ack packet is
processed (some sent data is acknowledged).

We use these callbacks for fine-grained tcp monitoring, which collects
and analyses every tcp request/response event information. The whole
system has been described in SIGMOD'18 (see
https://dl.acm.org/doi/pdf/10.1145/3183713.3190659 for details). To
achieve this with bpf, we require hooks for data events that call
sock_ops bpf (1) when any data packet is sent/received/acked, and (2)
after critical tcp state variables have been updated (e.g., snd_una,
snd_nxt, rcv_nxt). However, existing sock_ops operators cannot meet our
requirements.

Besides, these hooks also help to debug tcp when data is sent, received,
or acked.

Signed-off-by: Philo Lu <lulie@xxxxxxxxxxxxxxxxx>
---
 include/net/tcp.h        |  9 +++++++++
 include/uapi/linux/bpf.h | 14 +++++++++++++-
 net/ipv4/tcp_input.c     |  4 ++++
 net/ipv4/tcp_output.c    |  2 ++
 4 files changed, 28 insertions(+), 1 deletion(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index d2f0736b76b8..73eda03fdda5 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2660,6 +2660,15 @@ static inline void tcp_bpf_rtt(struct sock *sk)
 		tcp_call_bpf(sk, BPF_SOCK_OPS_RTT_CB, 0, NULL);
 }
 
+/* op must be one of BPF_SOCK_OPS_DATA_SEND_CB, BPF_SOCK_OPS_DATA_RECV_CB,
+ * or BPF_SOCK_OPS_DATA_ACKED_CB.
+ */
+static inline void tcp_bpf_data_event(struct sock *sk, int op)
+{
+	if (BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_DATA_EVENT_CB_FLAG))
+		tcp_call_bpf(sk, op, 0, NULL);
+}
+
 #if IS_ENABLED(CONFIG_SMC)
 extern struct static_key_false tcp_have_smc;
 #endif
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 7cf8bcf9f6a2..2154a6235901 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -3016,6 +3016,7 @@ union bpf_attr {
  * 		* **BPF_SOCK_OPS_RETRANS_CB_FLAG** (retransmission)
  * 		* **BPF_SOCK_OPS_STATE_CB_FLAG** (TCP state change)
  * 		* **BPF_SOCK_OPS_RTT_CB_FLAG** (every RTT)
+ * 		* **BPF_SOCK_OPS_DATA_EVENT_CB_FLAG** (data packet send/recv/acked)
  *
  * 		Therefore, this function can be used to clear a callback flag by
  * 		setting the appropriate bit to zero. e.g. to disable the RTO
@@ -6755,8 +6756,10 @@ enum {
 	 * options first before the BPF program does.
 	 */
 	BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG = (1<<6),
+	/* Call bpf when data send/recv/acked. */
+	BPF_SOCK_OPS_DATA_EVENT_CB_FLAG = (1<<7),
 /* Mask of all currently supported cb flags */
-	BPF_SOCK_OPS_ALL_CB_FLAGS       = 0x7F,
+	BPF_SOCK_OPS_ALL_CB_FLAGS       = 0xFF,
 };
 
 /* List of known BPF sock_ops operators.
@@ -6869,6 +6872,15 @@ enum {
 					 * by the kernel or the
 					 * earlier bpf-progs.
 					 */
+	BPF_SOCK_OPS_DATA_SEND_CB,		/* Calls BPF program when a
+					 * data packet is sent. Pure ack is ignored.
+					 */
+	BPF_SOCK_OPS_DATA_RECV_CB,		/* Calls BPF program when a
+					 * data packet is received. Pure ack is ignored.
+					 */
+	BPF_SOCK_OPS_DATA_ACKED_CB,		/* Calls BPF program when sent
+					 * data are acknowledged.
+					 */
 };
 
 /* List of TCP states. There is a build check in net/ipv4/tcp.c to detect
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index bcb55d98004c..72c6192e7cd0 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -824,6 +824,8 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb)
 
 	now = tcp_jiffies32;
 
+	tcp_bpf_data_event(sk, BPF_SOCK_OPS_DATA_RECV_CB);
+
 	if (!icsk->icsk_ack.ato) {
 		/* The _first_ data packet received, initialize
 		 * delayed ACK engine.
@@ -3454,6 +3456,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, const struct sk_buff *ack_skb,
 		flag |= FLAG_SET_XMIT_TIMER;  /* set TLP or RTO timer */
 	}
 
+	tcp_bpf_data_event(sk, BPF_SOCK_OPS_DATA_ACKED_CB);
+
 	if (icsk->icsk_ca_ops->pkts_acked) {
 		struct ack_sample sample = { .pkts_acked = pkts_acked,
 					     .rtt_us = sack->rate->rtt_us };
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index eb13a55d660c..ddd6a9c2150f 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2821,6 +2821,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 		/* Send one loss probe per tail loss episode. */
 		if (push_one != 2)
 			tcp_schedule_loss_probe(sk, false);
+
+		tcp_bpf_data_event(sk, BPF_SOCK_OPS_DATA_SEND_CB);
 		return false;
 	}
 	return !tp->packets_out && !tcp_write_queue_empty(sk);
-- 
2.32.0.3.g01195cf9f





[Index of Archives]     [Linux Samsung SoC]     [Linux Rockchip SoC]     [Linux Actions SoC]     [Linux for Synopsys ARC Processors]     [Linux NFS]     [Linux NILFS]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]


  Powered by Linux