Re: [RFC PATCH bpf-next v5 2/2] net: Add additional bit to support clockid_t timestamp type

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On 4/24/24 3:20 PM, Abhishek Chauhan wrote:
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index e464d0ebc9c1..3ad0de07d261 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -711,6 +711,8 @@ typedef unsigned char *sk_buff_data_t;
  enum skb_tstamp_type {
  	SKB_CLOCK_REALTIME,
  	SKB_CLOCK_MONOTONIC,
+	SKB_CLOCK_TAI,
+	__SKB_CLOCK_MAX = SKB_CLOCK_TAI,
  };
/**
@@ -831,8 +833,8 @@ enum skb_tstamp_type {
   *	@decrypted: Decrypted SKB
   *	@slow_gro: state present at GRO time, slower prepare step required
   *	@tstamp_type: When set, skb->tstamp has the
- *		delivery_time in mono clock base Otherwise, the
- *		timestamp is considered real clock base.
+ *		delivery_time in mono clock base or clock base of skb->tstamp.
+ *		Otherwise, the timestamp is considered real clock base
   *	@napi_id: id of the NAPI struct this skb came from
   *	@sender_cpu: (aka @napi_id) source CPU in XPS
   *	@alloc_cpu: CPU which did the skb allocation.
@@ -960,7 +962,7 @@ struct sk_buff {
  	/* private: */
  	__u8			__mono_tc_offset[0];
  	/* public: */
-	__u8			tstamp_type:1;	/* See skb_tstamp_type */
+	__u8			tstamp_type:2;	/* See skb_tstamp_type */
  #ifdef CONFIG_NET_XGRESS
  	__u8			tc_at_ingress:1;	/* See TC_AT_INGRESS_MASK */
  	__u8			tc_skip_classify:1;
@@ -1090,15 +1092,17 @@ struct sk_buff {
  #endif
  #define PKT_TYPE_OFFSET		offsetof(struct sk_buff, __pkt_type_offset)
-/* if you move tc_at_ingress or mono_delivery_time
+/* if you move tc_at_ingress or tstamp_type:2
   * around, you also must adapt these constants.
   */
  #ifdef __BIG_ENDIAN_BITFIELD
-#define SKB_MONO_DELIVERY_TIME_MASK	(1 << 7)
-#define TC_AT_INGRESS_MASK		(1 << 6)
+#define SKB_TSTAMP_TYPE_MASK		(3 << 6)
+#define SKB_TSTAMP_TYPE_RSH		(6)
+#define TC_AT_INGRESS_RSH		(5)

TC_AT_INGRESS_RSH is not used.
+#define TC_AT_INGRESS_MASK		(1 << 5)
  #else
-#define SKB_MONO_DELIVERY_TIME_MASK	(1 << 0)
-#define TC_AT_INGRESS_MASK		(1 << 1)
+#define SKB_TSTAMP_TYPE_MASK		(3)
+#define TC_AT_INGRESS_MASK		(1 << 2)
  #endif
  #define SKB_BF_MONO_TC_OFFSET		offsetof(struct sk_buff, __mono_tc_offset)
@@ -4204,6 +4208,12 @@ static inline void skb_set_tstamp_type_frm_clkid(struct sk_buff *skb,
  	case CLOCK_MONOTONIC:
  		skb->tstamp_type = SKB_CLOCK_MONOTONIC;
  		break;
+	case CLOCK_TAI:
+		skb->tstamp_type = SKB_CLOCK_TAI;
+		break;
+	default:
+		WARN_ONCE(true, "clockid %d not supported", clockid);
+		skb->tstamp_type = SKB_CLOCK_REALTIME;
  	}
  }
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index cee0a7915c08..1376ed5ece10 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h

The bpf.h changes need to be synced to tools/include/uapi/linux/bpf.h.
Otherwise, the bpf CI cannot compile the tests:

https://patchwork.kernel.org/project/netdevbpf/patch/20240424222028.1080134-2-quic_abchauha@xxxxxxxxxxx/

Please monitor the bpf CI test result after submitting the patches.

@@ -6209,6 +6209,7 @@ union {					\
  enum {
  	BPF_SKB_TSTAMP_UNSPEC,
  	BPF_SKB_TSTAMP_DELIVERY_MONO,	/* tstamp has mono delivery time */
+	BPF_SKB_TSTAMP_DELIVERY_TAI,	/* tstamp has tai delivery time */
  	/* For any BPF_SKB_TSTAMP_* that the bpf prog cannot handle,
  	 * the bpf prog should handle it like BPF_SKB_TSTAMP_UNSPEC
  	 * and try to deduce it by ingress, egress or skb->sk->sk_clockid.

SKB_CLOCK_TAI is now properly defined as an enum value, and there is a
WARN for any clock other than REAL, MONO, and TAI. I think it is
time to remove UNSPEC and give it back its proper name, REALTIME.

I want to take this chance to do some renaming:

/* The enum used in skb->tstamp_type. It specifies the clock type
 * of the time stored in the skb->tstamp.
 */
enum {
	BPF_SKB_TSTAMP_UNSPEC = 0,              /* DEPRECATED */
	BPF_SKB_TSTAMP_DELIVERY_MONO = 1,       /* DEPRECATED */
	BPF_SKB_CLOCK_REALTIME = 0,             /* Realtime clock */
	BPF_SKB_CLOCK_MONOTONIC = 1,            /* Monotonic clock */
	BPF_SKB_CLOCK_TAI = 2,                  /* TAI clock */
	/* For any future BPF_SKB_CLOCK_* that the bpf prog cannot handle,
	 * the bpf prog can try to deduce it by ingress/egress/skb->sk->sk_clockid.
	 */
};


diff --git a/net/core/filter.c b/net/core/filter.c
index 957c2fc724eb..c67622f4fe98 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -7733,6 +7733,12 @@ BPF_CALL_3(bpf_skb_set_tstamp, struct sk_buff *, skb,
  		skb->tstamp = tstamp;
  		skb->tstamp_type = SKB_CLOCK_MONOTONIC;
  		break;
+	case BPF_SKB_TSTAMP_DELIVERY_TAI:
+		if (!tstamp)
+			return -EINVAL;
+		skb->tstamp = tstamp;
+		skb->tstamp_type = SKB_CLOCK_TAI;
+		break;
  	case BPF_SKB_TSTAMP_UNSPEC:
  		if (tstamp)

Allow storing any realtime tstamp here, since BPF_SKB_TSTAMP_UNSPEC
becomes BPF_SKB_CLOCK_REALTIME.

Like:

BPF_CALL_3(bpf_skb_set_tstamp, struct sk_buff *, skb,
           u64, tstamp, u32, tstamp_type)
{
	/* ... */
	case BPF_SKB_CLOCK_TAI:
		if (!tstamp)
			return -EINVAL;
		skb->tstamp = tstamp;
		skb->tstamp_type = SKB_CLOCK_TAI;
		break;
        case BPF_SKB_CLOCK_REALTIME:
		skb->tstamp = tstamp;
		skb->tstamp_type = SKB_CLOCK_REALTIME;
		break;

	/* ... */
}

  			return -EINVAL;

@@ -9388,17 +9394,17 @@ static struct bpf_insn *bpf_convert_tstamp_type_read(const struct bpf_insn *si,
  {
  	__u8 value_reg = si->dst_reg;
  	__u8 skb_reg = si->src_reg;
-	/* AX is needed because src_reg and dst_reg could be the same */
-	__u8 tmp_reg = BPF_REG_AX;
-
-	*insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg,
-			      SKB_BF_MONO_TC_OFFSET);
-	*insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg,
-				SKB_MONO_DELIVERY_TIME_MASK, 2);
-	*insn++ = BPF_MOV32_IMM(value_reg, BPF_SKB_TSTAMP_UNSPEC);
-	*insn++ = BPF_JMP_A(1);
-	*insn++ = BPF_MOV32_IMM(value_reg, BPF_SKB_TSTAMP_DELIVERY_MONO);
-
+	BUILD_BUG_ON(__SKB_CLOCK_MAX != BPF_SKB_TSTAMP_DELIVERY_TAI);

Add these also:

	BUILD_BUG_ON(SKB_CLOCK_REALTIME != BPF_SKB_CLOCK_REALTIME);
	BUILD_BUG_ON(SKB_CLOCK_MONOTONIC != BPF_SKB_CLOCK_MONOTONIC);
	BUILD_BUG_ON(SKB_CLOCK_TAI != BPF_SKB_CLOCK_TAI);

+	*insn++ = BPF_LDX_MEM(BPF_B, value_reg, skb_reg, SKB_BF_MONO_TC_OFFSET);
+	*insn++ = BPF_ALU32_IMM(BPF_AND, value_reg, SKB_TSTAMP_TYPE_MASK);
+#ifdef __BIG_ENDIAN_BITFIELD
+	*insn++ = BPF_ALU32_IMM(BPF_RSH, value_reg, SKB_TSTAMP_TYPE_RSH);
+#else
+	BUILD_BUG_ON(!(SKB_TSTAMP_TYPE_MASK & 0x1));
+#endif
+	*insn++ = BPF_JMP32_IMM(BPF_JNE, value_reg, SKB_TSTAMP_TYPE_MASK, 1);
+	/* Both the bits set then mark it BPF_SKB_TSTAMP_UNSPEC */
+	*insn++ = BPF_MOV64_IMM(value_reg, BPF_SKB_TSTAMP_UNSPEC);

The kernel should never have both bits set in skb->tstamp_type, so there is
no need to add two extra bpf insns to check for this. If there is a bug in the
kernel, it is better for it to be uncovered than hidden under
BPF_SKB_TSTAMP_UNSPEC (which is renamed to BPF_SKB_CLOCK_REALTIME anyway).
Hence, the last two bpf insns should be removed.

  	return insn;
  }
@@ -9430,6 +9436,7 @@ static struct bpf_insn *bpf_convert_tstamp_read(const struct bpf_prog *prog,
  	__u8 value_reg = si->dst_reg;
  	__u8 skb_reg = si->src_reg;
+BUILD_BUG_ON(__SKB_CLOCK_MAX != BPF_SKB_TSTAMP_DELIVERY_TAI);

This is a duplicate of the one in bpf_convert_tstamp_type_read() and can be removed.

  #ifdef CONFIG_NET_XGRESS
  	/* If the tstamp_type is read,
  	 * the bpf prog is aware the tstamp could have delivery time.
@@ -9440,11 +9447,12 @@ static struct bpf_insn *bpf_convert_tstamp_read(const struct bpf_prog *prog,
  		__u8 tmp_reg = BPF_REG_AX;
*insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, SKB_BF_MONO_TC_OFFSET);
-		*insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg,
-					TC_AT_INGRESS_MASK | SKB_MONO_DELIVERY_TIME_MASK);
-		*insn++ = BPF_JMP32_IMM(BPF_JNE, tmp_reg,
-					TC_AT_INGRESS_MASK | SKB_MONO_DELIVERY_TIME_MASK, 2);
-		/* skb->tc_at_ingress && skb->tstamp_type:1,
+		/* check if ingress mask bits is set */
+		*insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, TC_AT_INGRESS_MASK, 1);
+		*insn++ = BPF_JMP_A(4);
+		*insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, SKB_TSTAMP_TYPE_MASK, 1);
+		*insn++ = BPF_JMP_A(2);
+		/* skb->tc_at_ingress && skb->tstamp_type:2,
  		 * read 0 as the (rcv) timestamp.
  		 */
  		*insn++ = BPF_MOV64_IMM(value_reg, 0);
@@ -9469,7 +9477,7 @@ static struct bpf_insn *bpf_convert_tstamp_write(const struct bpf_prog *prog,
  	 * the bpf prog is aware the tstamp could have delivery time.
  	 * Thus, write skb->tstamp as is if tstamp_type_access is true.
  	 * Otherwise, writing at ingress will have to clear the
-	 * mono_delivery_time (skb->tstamp_type:1)bit also.
+	 * mono_delivery_time (skb->tstamp_type:2)bit also.
  	 */
  	if (!prog->tstamp_type_access) {
  		__u8 tmp_reg = BPF_REG_AX;
@@ -9479,8 +9487,8 @@ static struct bpf_insn *bpf_convert_tstamp_write(const struct bpf_prog *prog,
  		*insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, TC_AT_INGRESS_MASK, 1);
  		/* goto <store> */
  		*insn++ = BPF_JMP_A(2);
-		/* <clear>: mono_delivery_time or (skb->tstamp_type:1) */
-		*insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, ~SKB_MONO_DELIVERY_TIME_MASK);
+		/* <clear>: skb->tstamp_type:2 */
+		*insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, ~SKB_TSTAMP_TYPE_MASK);
  		*insn++ = BPF_STX_MEM(BPF_B, skb_reg, tmp_reg, SKB_BF_MONO_TC_OFFSET);
  	}
  #endif
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 591226dcde26..f195b31d6e75 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1457,7 +1457,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk,
skb->priority = (cork->tos != -1) ? cork->priority: READ_ONCE(sk->sk_priority);
  	skb->mark = cork->mark;
-	skb->tstamp = cork->transmit_time;
+	skb_set_tstamp_type_frm_clkid(skb, cork->transmit_time, sk->sk_clockid);

Hmm... I think this will break for TCP, this sequence in particular:

tcp_v4_timewait_ack()
  tcp_v4_send_ack()
    ip_send_unicast_reply()
      ip_push_pending_frames()
        ip_finish_skb()
          __ip_make_skb()
            /* sk_clockid is REAL but cork->transmit_time should be in mono */
            skb_set_tstamp_type_frm_clkid(skb, cork->transmit_time, sk->sk_clockid);

I think I hit it from time to time when running the test in this patch set.

[ ... ]

diff --git a/tools/testing/selftests/bpf/progs/test_tc_dtime.c b/tools/testing/selftests/bpf/progs/test_tc_dtime.c
index 74ec09f040b7..19dba6d88265 100644
--- a/tools/testing/selftests/bpf/progs/test_tc_dtime.c
+++ b/tools/testing/selftests/bpf/progs/test_tc_dtime.c

Please separate the selftests/bpf changes into another patch.

@@ -227,6 +227,12 @@ int egress_host(struct __sk_buff *skb)
  			inc_dtimes(EGRESS_ENDHOST);
  		else
  			inc_errs(EGRESS_ENDHOST);
+	} else if (skb_proto(skb_type) == IPPROTO_UDP) {
+		if (skb->tstamp_type == BPF_SKB_TSTAMP_DELIVERY_TAI &&
+		    skb->tstamp)
+			inc_dtimes(EGRESS_ENDHOST);
+		else
+			inc_errs(EGRESS_ENDHOST);
  	} else {
  		if (skb->tstamp_type == BPF_SKB_TSTAMP_UNSPEC &&
  		    skb->tstamp)
@@ -255,6 +261,9 @@ int ingress_host(struct __sk_buff *skb)
  	if (skb->tstamp_type == BPF_SKB_TSTAMP_DELIVERY_MONO &&
  	    skb->tstamp == EGRESS_FWDNS_MAGIC)
  		inc_dtimes(INGRESS_ENDHOST);
+	else if (skb->tstamp_type == BPF_SKB_TSTAMP_DELIVERY_TAI &&
+		       skb->tstamp == EGRESS_FWDNS_MAGIC)
+		inc_dtimes(INGRESS_ENDHOST);
  	else
  		inc_errs(INGRESS_ENDHOST);
@@ -323,12 +332,14 @@ int ingress_fwdns_prio101(struct __sk_buff *skb)
  		/* Should have handled in prio100 */
  		return TC_ACT_SHOT;
- if (skb_proto(skb_type) == IPPROTO_UDP)
+	if (skb_proto(skb_type) == IPPROTO_UDP &&
+		  skb->tstamp_type != BPF_SKB_TSTAMP_DELIVERY_TAI)
  		expected_dtime = 0;

The IPPROTO_UDP check and expected_dtime can be removed. The UDP test
can expect the same EGRESS_ENDHOST_MAGIC in the skb->tstamp since
the TAI tstamp is also forwarded from egress to ingress now.

if (skb->tstamp_type) {
  		if (fwdns_clear_dtime() ||
-		    skb->tstamp_type != BPF_SKB_TSTAMP_DELIVERY_MONO ||
+		    (skb->tstamp_type != BPF_SKB_TSTAMP_DELIVERY_MONO &&
+		    skb->tstamp_type != BPF_SKB_TSTAMP_DELIVERY_TAI) ||
  		    skb->tstamp != expected_dtime)
  			inc_errs(INGRESS_FWDNS_P101);
  		else
@@ -338,7 +349,8 @@ int ingress_fwdns_prio101(struct __sk_buff *skb)
  			inc_errs(INGRESS_FWDNS_P101);
  	}
- if (skb->tstamp_type == BPF_SKB_TSTAMP_DELIVERY_MONO) {
+	if (skb->tstamp_type == BPF_SKB_TSTAMP_DELIVERY_MONO ||
+		  skb->tstamp_type == BPF_SKB_TSTAMP_DELIVERY_TAI) {

There is no need to check BPF_SKB_TSTAMP_DELIVERY_TAI here, so that the
bpf_skb_set_tstamp() helper can still be tested.

Some other minor changes are needed in test_tc_dtime.c and
tc_redirect.c. I quickly made those changes and put them here (first patch):

https://git.kernel.org/pub/scm/linux/kernel/git/martin.lau/bpf-next.git/log/?h=skb.tstamp_type



  		skb->tstamp = INGRESS_FWDNS_MAGIC;
  	} else {
  		if (bpf_skb_set_tstamp(skb, INGRESS_FWDNS_MAGIC,
@@ -370,7 +382,8 @@ int egress_fwdns_prio101(struct __sk_buff *skb)
if (skb->tstamp_type) {
  		if (fwdns_clear_dtime() ||
-		    skb->tstamp_type != BPF_SKB_TSTAMP_DELIVERY_MONO ||
+		    (skb->tstamp_type != BPF_SKB_TSTAMP_DELIVERY_MONO &&
+		     skb->tstamp_type != BPF_SKB_TSTAMP_DELIVERY_TAI) ||
  		    skb->tstamp != INGRESS_FWDNS_MAGIC)
  			inc_errs(EGRESS_FWDNS_P101);
  		else
@@ -380,7 +393,8 @@ int egress_fwdns_prio101(struct __sk_buff *skb)
  			inc_errs(EGRESS_FWDNS_P101);
  	}
- if (skb->tstamp_type == BPF_SKB_TSTAMP_DELIVERY_MONO) {
+	if (skb->tstamp_type == BPF_SKB_TSTAMP_DELIVERY_MONO ||
+		  skb->tstamp_type == BPF_SKB_TSTAMP_DELIVERY_TAI) {
  		skb->tstamp = EGRESS_FWDNS_MAGIC;
  	} else {
  		if (bpf_skb_set_tstamp(skb, EGRESS_FWDNS_MAGIC,





[Index of Archives]     [Linux Samsung SoC]     [Linux Rockchip SoC]     [Linux Actions SoC]     [Linux for Synopsys ARC Processors]     [Linux NFS]     [Linux NILFS]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]


  Powered by Linux