On 4/25/2024 9:37 PM, Martin KaFai Lau wrote: > On 4/24/24 3:20 PM, Abhishek Chauhan wrote: >> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h >> index e464d0ebc9c1..3ad0de07d261 100644 >> --- a/include/linux/skbuff.h >> +++ b/include/linux/skbuff.h >> @@ -711,6 +711,8 @@ typedef unsigned char *sk_buff_data_t; >> enum skb_tstamp_type { >> SKB_CLOCK_REALTIME, >> SKB_CLOCK_MONOTONIC, >> + SKB_CLOCK_TAI, >> + __SKB_CLOCK_MAX = SKB_CLOCK_TAI, >> }; >> /** >> @@ -831,8 +833,8 @@ enum skb_tstamp_type { >> * @decrypted: Decrypted SKB >> * @slow_gro: state present at GRO time, slower prepare step required >> * @tstamp_type: When set, skb->tstamp has the >> - * delivery_time in mono clock base Otherwise, the >> - * timestamp is considered real clock base. >> + * delivery_time in mono clock base or clock base of skb->tstamp. >> + * Otherwise, the timestamp is considered real clock base >> * @napi_id: id of the NAPI struct this skb came from >> * @sender_cpu: (aka @napi_id) source CPU in XPS >> * @alloc_cpu: CPU which did the skb allocation. >> @@ -960,7 +962,7 @@ struct sk_buff { >> /* private: */ >> __u8 __mono_tc_offset[0]; >> /* public: */ >> - __u8 tstamp_type:1; /* See skb_tstamp_type */ >> + __u8 tstamp_type:2; /* See skb_tstamp_type */ >> #ifdef CONFIG_NET_XGRESS >> __u8 tc_at_ingress:1; /* See TC_AT_INGRESS_MASK */ >> __u8 tc_skip_classify:1; >> @@ -1090,15 +1092,17 @@ struct sk_buff { >> #endif >> #define PKT_TYPE_OFFSET offsetof(struct sk_buff, __pkt_type_offset) >> -/* if you move tc_at_ingress or mono_delivery_time >> +/* if you move tc_at_ingress or tstamp_type:2 >> * around, you also must adapt these constants. >> */ >> #ifdef __BIG_ENDIAN_BITFIELD >> -#define SKB_MONO_DELIVERY_TIME_MASK (1 << 7) >> -#define TC_AT_INGRESS_MASK (1 << 6) >> +#define SKB_TSTAMP_TYPE_MASK (3 << 6) >> +#define SKB_TSTAMP_TYPE_RSH (6) >> +#define TC_AT_INGRESS_RSH (5) > > TC_AT_INGRESS_RSH is not used. > Noted. I will remove it. >> +#define TC_AT_INGRESS_MASK (1 << 5) >> #else >> -#define SKB_MONO_DELIVERY_TIME_MASK (1 << 0) >> -#define TC_AT_INGRESS_MASK (1 << 1) >> +#define SKB_TSTAMP_TYPE_MASK (3) >> +#define TC_AT_INGRESS_MASK (1 << 2) >> #endif >> #define SKB_BF_MONO_TC_OFFSET offsetof(struct sk_buff, __mono_tc_offset) >> @@ -4204,6 +4208,12 @@ static inline void skb_set_tstamp_type_frm_clkid(struct sk_buff *skb, >> case CLOCK_MONOTONIC: >> skb->tstamp_type = SKB_CLOCK_MONOTONIC; >> break; >> + case CLOCK_TAI: >> + skb->tstamp_type = SKB_CLOCK_TAI; >> + break; >> + default: >> + WARN_ONCE(true, "clockid %d not supported", clockid); >> + skb->tstamp_type = SKB_CLOCK_REALTIME; >> } >> } >> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h >> index cee0a7915c08..1376ed5ece10 100644 >> --- a/include/uapi/linux/bpf.h >> +++ b/include/uapi/linux/bpf.h > > The bpf.h needs to be sync to tools/include/uapi/linux/bpf.h. Oh i see. There is a bpf.h in tools as well . I was not aware of this part. I will further check this. > Otherwise, the bpf CI cannot compile the tests: > > https://patchwork.kernel.org/project/netdevbpf/patch/20240424222028.1080134-2-quic_abchauha@xxxxxxxxxxx/ > > Please monitor the bpf CI test result after submitting the patches. > >> @@ -6209,6 +6209,7 @@ union { \ >> enum { >> BPF_SKB_TSTAMP_UNSPEC, >> BPF_SKB_TSTAMP_DELIVERY_MONO, /* tstamp has mono delivery time */ >> + BPF_SKB_TSTAMP_DELIVERY_TAI, /* tstamp has tai delivery time */ >> /* For any BPF_SKB_TSTAMP_* that the bpf prog cannot handle, >> * the bpf prog should handle it like BPF_SKB_TSTAMP_UNSPEC >> * and try to deduce it by ingress, egress or skb->sk->sk_clockid. > > SKB_CLOCK_TAI is properly defined as an enum now and there is a > WARN for clock other than REAL, MONO, and TAI. I think it is > time to remove UNSPEC and give it back the proper name REALTIME. > > I want to take this chance to do some renaming: > > /* The enum used in skb->tstamp_type. It specifies the clock type > * of the time stored in the skb->tstamp. > */ > enum { > BPF_SKB_TSTAMP_UNSPEC = 0, /* DEPRECATED */ > BPF_SKB_TSTAMP_DELIVERY_MONO = 1, /* DEPRECATED */ > BPF_SKB_CLOCK_REALTIME = 0, /* Realtime clock */ > BPF_SKB_CLOCK_MONOTONIC = 1, /* Monotonic clock */ > BPF_SKB_CLOCK_TAI = 2, /* TAI clock */ > /* For any future BPF_SKB_CLOCK_* that the bpf prog cannot handle, > * the bpf prog can try to deduce it by ingress/egress/skb->sk->sk_clockid. > */ > }; > Okay let me evalute this and get back > >> diff --git a/net/core/filter.c b/net/core/filter.c >> index 957c2fc724eb..c67622f4fe98 100644 >> --- a/net/core/filter.c >> +++ b/net/core/filter.c >> @@ -7733,6 +7733,12 @@ BPF_CALL_3(bpf_skb_set_tstamp, struct sk_buff *, skb, >> skb->tstamp = tstamp; >> skb->tstamp_type = SKB_CLOCK_MONOTONIC; >> break; >> + case BPF_SKB_TSTAMP_DELIVERY_TAI: >> + if (!tstamp) >> + return -EINVAL; >> + skb->tstamp = tstamp; >> + skb->tstamp_type = SKB_CLOCK_TAI; >> + break; >> case BPF_SKB_TSTAMP_UNSPEC: >> if (tstamp) > > Allow to store any realtime tstamp here since BPF_SKB_TSTAMP_UNSPEC > becomes BPF_SKB_CLOCK_REALTIME. > > Like: > > BPF_CALL_3(bpf_skb_set_tstamp, struct sk_buff *, skb, > u64, tstamp, u32, tstamp_type) > { > /* ... */ > case BPF_SKB_CLOCK_TAI: > if (!tstamp) > return -EINVAL; > skb->tstamp = tstamp; > skb->tstamp_type = SKB_CLOCK_TAI; > break; > case BPF_SKB_CLOCK_REALTIME: > skb->tstamp = tstamp; > skb->tstamp_type = SKB_CLOCK_REALTIME; > break; > > /* ... */ > } Noted! > >> return -EINVAL; > >> @@ -9388,17 +9394,17 @@ static struct bpf_insn *bpf_convert_tstamp_type_read(const struct bpf_insn *si, >> { >> __u8 value_reg = si->dst_reg; >> __u8 skb_reg = si->src_reg; >> - /* AX is needed because src_reg and dst_reg could be the same */ >> - __u8 tmp_reg = BPF_REG_AX; >> - >> - *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, >> - SKB_BF_MONO_TC_OFFSET); >> - *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, >> - SKB_MONO_DELIVERY_TIME_MASK, 2); >> - *insn++ = BPF_MOV32_IMM(value_reg, BPF_SKB_TSTAMP_UNSPEC); >> - *insn++ = BPF_JMP_A(1); >> - *insn++ = BPF_MOV32_IMM(value_reg, BPF_SKB_TSTAMP_DELIVERY_MONO); >> - >> + BUILD_BUG_ON(__SKB_CLOCK_MAX != BPF_SKB_TSTAMP_DELIVERY_TAI); > > Add these also: > Noted! > BUILD_BUG_ON(SKB_CLOCK_REALTIME != BPF_SKB_CLOCK_REALTIME); > BUILD_BUG_ON(SKB_CLOCK_MONOTONIC != BPF_SKB_CLOCK_MONOTONIC); > BUILD_BUG_ON(SKB_CLOCK_TAI != BPF_SKB_CLOCK_TAI); > >> + *insn++ = BPF_LDX_MEM(BPF_B, value_reg, skb_reg, SKB_BF_MONO_TC_OFFSET); >> + *insn++ = BPF_ALU32_IMM(BPF_AND, value_reg, SKB_TSTAMP_TYPE_MASK); >> +#ifdef __BIG_ENDIAN_BITFIELD >> + *insn++ = BPF_ALU32_IMM(BPF_RSH, value_reg, SKB_TSTAMP_TYPE_RSH); >> +#else >> + BUILD_BUG_ON(!(SKB_TSTAMP_TYPE_MASK & 0x1)); >> +#endif >> + *insn++ = BPF_JMP32_IMM(BPF_JNE, value_reg, SKB_TSTAMP_TYPE_MASK, 1); >> + /* Both the bits set then mark it BPF_SKB_TSTAMP_UNSPEC */ >> + *insn++ = BPF_MOV64_IMM(value_reg, BPF_SKB_TSTAMP_UNSPEC); > > The kernel should not have both bits set in skb->tstamp_type. No need to > add two extra bpf insns to check this. If there is a bug in the kernel, > it is better to be uncovered instead of hiding it under BPF_SKB_TSTAMP_UNSPEC (which > is renamed to BPF_SKB_CLOCK_REALTIME anyway). > Hence, the last two bpf insns should be removed. > Got it !. >> return insn; >> } >> @@ -9430,6 +9436,7 @@ static struct bpf_insn *bpf_convert_tstamp_read(const struct bpf_prog *prog, >> __u8 value_reg = si->dst_reg; >> __u8 skb_reg = si->src_reg; >> +BUILD_BUG_ON(__SKB_CLOCK_MAX != BPF_SKB_TSTAMP_DELIVERY_TAI); > > It is a dup of the one in bpf_convert_tstamp_type_read and can be removed. > Noted! >> #ifdef CONFIG_NET_XGRESS >> /* If the tstamp_type is read, >> * the bpf prog is aware the tstamp could have delivery time. >> @@ -9440,11 +9447,12 @@ static struct bpf_insn *bpf_convert_tstamp_read(const struct bpf_prog *prog, >> __u8 tmp_reg = BPF_REG_AX; >> *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, SKB_BF_MONO_TC_OFFSET); >> - *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, >> - TC_AT_INGRESS_MASK | SKB_MONO_DELIVERY_TIME_MASK); >> - *insn++ = BPF_JMP32_IMM(BPF_JNE, tmp_reg, >> - TC_AT_INGRESS_MASK | SKB_MONO_DELIVERY_TIME_MASK, 2); >> - /* skb->tc_at_ingress && skb->tstamp_type:1, >> + /* check if ingress mask bits is set */ >> + *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, TC_AT_INGRESS_MASK, 1); >> + *insn++ = BPF_JMP_A(4); >> + *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, SKB_TSTAMP_TYPE_MASK, 1); >> + *insn++ = BPF_JMP_A(2); >> + /* skb->tc_at_ingress && skb->tstamp_type:2, >> * read 0 as the (rcv) timestamp. >> */ >> *insn++ = BPF_MOV64_IMM(value_reg, 0); >> @@ -9469,7 +9477,7 @@ static struct bpf_insn *bpf_convert_tstamp_write(const struct bpf_prog *prog, >> * the bpf prog is aware the tstamp could have delivery time. >> * Thus, write skb->tstamp as is if tstamp_type_access is true. >> * Otherwise, writing at ingress will have to clear the >> - * mono_delivery_time (skb->tstamp_type:1)bit also. >> + * mono_delivery_time (skb->tstamp_type:2)bit also. >> */ >> if (!prog->tstamp_type_access) { >> __u8 tmp_reg = BPF_REG_AX; >> @@ -9479,8 +9487,8 @@ static struct bpf_insn *bpf_convert_tstamp_write(const struct bpf_prog *prog, >> *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, TC_AT_INGRESS_MASK, 1); >> /* goto <store> */ >> *insn++ = BPF_JMP_A(2); >> - /* <clear>: mono_delivery_time or (skb->tstamp_type:1) */ >> - *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, ~SKB_MONO_DELIVERY_TIME_MASK); >> + /* <clear>: skb->tstamp_type:2 */ >> + *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, ~SKB_TSTAMP_TYPE_MASK); >> *insn++ = BPF_STX_MEM(BPF_B, skb_reg, tmp_reg, SKB_BF_MONO_TC_OFFSET); >> } >> #endif >> diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c >> index 591226dcde26..f195b31d6e75 100644 >> --- a/net/ipv4/ip_output.c >> +++ b/net/ipv4/ip_output.c >> @@ -1457,7 +1457,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk, >> skb->priority = (cork->tos != -1) ? cork->priority: READ_ONCE(sk->sk_priority); >> skb->mark = cork->mark; >> - skb->tstamp = cork->transmit_time; >> + skb_set_tstamp_type_frm_clkid(skb, cork->transmit_time, sk->sk_clockid); > > hmm... I think this will break for tcp. This sequence in particular: > > tcp_v4_timewait_ack() > tcp_v4_send_ack() > ip_send_unicast_reply() > ip_push_pending_frames() > ip_finish_skb() > __ip_make_skb() > /* sk_clockid is REAL but cork->transmit_time should be in mono */ > skb_set_tstamp_type_frm_clkid(skb, cork->transmit_time, sk->sk_clockid);; > > I think I hit it from time to time when running the test in this patch set. > do you think i need to check for protocol type here . since tcp uses Mono and the rest according to the new design is based on sk->sk_clockid if (iph->protocol == IPPROTO_TCP) skb_set_tstamp_type_frm_clkid(skb, cork->transmit_time, CLOCK_MONOTONIC); else skb_set_tstamp_type_frm_clkid(skb, cork->transmit_time, sk->sk_clockid); > [ ... ] > >> diff --git a/tools/testing/selftests/bpf/progs/test_tc_dtime.c b/tools/testing/selftests/bpf/progs/test_tc_dtime.c >> index 74ec09f040b7..19dba6d88265 100644 >> --- a/tools/testing/selftests/bpf/progs/test_tc_dtime.c >> +++ b/tools/testing/selftests/bpf/progs/test_tc_dtime.c > > Please separate the selftests/bpf changes into another patch. > I will do that. >> @@ -227,6 +227,12 @@ int egress_host(struct __sk_buff *skb) >> inc_dtimes(EGRESS_ENDHOST); >> else >> inc_errs(EGRESS_ENDHOST); >> + } else if (skb_proto(skb_type) == IPPROTO_UDP) { >> + if (skb->tstamp_type == BPF_SKB_TSTAMP_DELIVERY_TAI && >> + skb->tstamp) >> + inc_dtimes(EGRESS_ENDHOST); >> + else >> + inc_errs(EGRESS_ENDHOST); >> } else { >> if (skb->tstamp_type == BPF_SKB_TSTAMP_UNSPEC && >> skb->tstamp) >> @@ -255,6 +261,9 @@ int ingress_host(struct __sk_buff *skb) >> if (skb->tstamp_type == BPF_SKB_TSTAMP_DELIVERY_MONO && >> skb->tstamp == EGRESS_FWDNS_MAGIC) >> inc_dtimes(INGRESS_ENDHOST); >> + else if (skb->tstamp_type == BPF_SKB_TSTAMP_DELIVERY_TAI && >> + skb->tstamp == EGRESS_FWDNS_MAGIC) >> + inc_dtimes(INGRESS_ENDHOST); >> else >> inc_errs(INGRESS_ENDHOST); >> @@ -323,12 +332,14 @@ int ingress_fwdns_prio101(struct __sk_buff *skb) >> /* Should have handled in prio100 */ >> return TC_ACT_SHOT; >> - if (skb_proto(skb_type) == IPPROTO_UDP) >> + if (skb_proto(skb_type) == IPPROTO_UDP && >> + skb->tstamp_type != BPF_SKB_TSTAMP_DELIVERY_TAI) >> expected_dtime = 0; > > The IPPROTO_UDP check and expected_dtime can be removed. The UDP test > can expect the same EGRESS_ENDHOST_MAGIC in the skb->tstamp since > the TAI tstamp is also forwarded from egress to ingress now. > >> if (skb->tstamp_type) { >> if (fwdns_clear_dtime() || >> - skb->tstamp_type != BPF_SKB_TSTAMP_DELIVERY_MONO || >> + (skb->tstamp_type != BPF_SKB_TSTAMP_DELIVERY_MONO && >> + skb->tstamp_type != BPF_SKB_TSTAMP_DELIVERY_TAI) || >> skb->tstamp != expected_dtime) >> inc_errs(INGRESS_FWDNS_P101); >> else >> @@ -338,7 +349,8 @@ int ingress_fwdns_prio101(struct __sk_buff *skb) >> inc_errs(INGRESS_FWDNS_P101); >> } >> - if (skb->tstamp_type == BPF_SKB_TSTAMP_DELIVERY_MONO) { >> + if (skb->tstamp_type == BPF_SKB_TSTAMP_DELIVERY_MONO || >> + skb->tstamp_type == BPF_SKB_TSTAMP_DELIVERY_TAI) { > > No need to check BPF_SKB_TSTAMP_DELIVERY_TAI such that the > bpf_skb_set_tstamp() helper can still be tested. > > There are some other minor changes needed for the test_tc_dtime.c and > the tc_redirect.c. I quickly made the changes and put them here (first patch): > > https://git.kernel.org/pub/scm/linux/kernel/git/martin.lau/bpf-next.git/log/?h=skb.tstamp_type > Thanks for your help Martin. I will check the changes you have made in BPF test framework and see what i have missed. > > >> skb->tstamp = INGRESS_FWDNS_MAGIC; >> } else { >> if (bpf_skb_set_tstamp(skb, INGRESS_FWDNS_MAGIC, >> @@ -370,7 +382,8 @@ int egress_fwdns_prio101(struct __sk_buff *skb) >> if (skb->tstamp_type) { >> if (fwdns_clear_dtime() || >> - skb->tstamp_type != BPF_SKB_TSTAMP_DELIVERY_MONO || >> + (skb->tstamp_type != BPF_SKB_TSTAMP_DELIVERY_MONO && >> + skb->tstamp_type != BPF_SKB_TSTAMP_DELIVERY_TAI) || >> skb->tstamp != INGRESS_FWDNS_MAGIC) >> inc_errs(EGRESS_FWDNS_P101); >> else >> @@ -380,7 +393,8 @@ int egress_fwdns_prio101(struct __sk_buff *skb) >> inc_errs(EGRESS_FWDNS_P101); >> } >> - if (skb->tstamp_type == BPF_SKB_TSTAMP_DELIVERY_MONO) { >> + if (skb->tstamp_type == BPF_SKB_TSTAMP_DELIVERY_MONO || >> + skb->tstamp_type == BPF_SKB_TSTAMP_DELIVERY_TAI) { >> skb->tstamp = EGRESS_FWDNS_MAGIC; >> } else { >> if (bpf_skb_set_tstamp(skb, EGRESS_FWDNS_MAGIC, >