On 4/15/2024 1:00 PM, Martin KaFai Lau wrote: > On 4/13/24 12:07 PM, Willem de Bruijn wrote: >>> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h >>> index a83a2120b57f..b6346c21c3d4 100644 >>> --- a/include/linux/skbuff.h >>> +++ b/include/linux/skbuff.h >>> @@ -827,7 +827,8 @@ enum skb_tstamp_type { >>> * @tstamp_type: When set, skb->tstamp has the >>> * delivery_time in mono clock base (i.e. EDT). Otherwise, the >>> * skb->tstamp has the (rcv) timestamp at ingress and >>> - * delivery_time at egress. >>> + * delivery_time at egress or skb->tstamp defined by skb->sk->sk_clockid >>> + * coming from userspace >>> * @napi_id: id of the NAPI struct this skb came from >>> * @sender_cpu: (aka @napi_id) source CPU in XPS >>> * @alloc_cpu: CPU which did the skb allocation. >>> @@ -955,7 +956,7 @@ struct sk_buff { >>> /* private: */ >>> __u8 __mono_tc_offset[0]; >>> /* public: */ >>> - __u8 tstamp_type:1; /* See SKB_MONO_DELIVERY_TIME_MASK */ >>> + __u8 tstamp_type:2; /* See SKB_MONO_DELIVERY_TIME_MASK */ >>> #ifdef CONFIG_NET_XGRESS >>> __u8 tc_at_ingress:1; /* See TC_AT_INGRESS_MASK */ >>> __u8 tc_skip_classify:1; >> >> A quick pahole for a fairly standard .config that I had laying around >> shows a hole after this list of bits, so no huge concerns there from >> adding a bit: >> >> __u8 slow_gro:1; /* 3: 4 1 */ >> __u8 csum_not_inet:1; /* 3: 5 1 */ >> >> /* XXX 2 bits hole, try to pack */ >> >> __u16 tc_index; /* 4 2 */ >> >>> @@ -1090,10 +1091,10 @@ struct sk_buff { >>> */ >>> #ifdef __BIG_ENDIAN_BITFIELD >>> #define SKB_MONO_DELIVERY_TIME_MASK (1 << 7) >>> -#define TC_AT_INGRESS_MASK (1 << 6) >>> +#define TC_AT_INGRESS_MASK (1 << 5) >> >> Have to be careful when adding a new 2 bit tstamp_type with both bits >> set, that this does not incorrectly get interpreted as MONO. >> >> I haven't looked closely at the BPF API, but hopefully it can be >> extensible to return the specific type. If it is hardcoded to return >> either MONO or not, then only 0x1 should match, not 0x3. > > Good point. I believe it is the best to have bpf to consider both bits in tstamp_type:2 in filter.c to avoid the 0x3 surprise in the future. The BPF API can be extended to support SKB_CLOCK_TAI. > > Regardless, in bpf_convert_tstamp_write(), it still needs to clear both bits in tstamp_type when it is at ingress. Right now it only clears the mono bit. > > Then it may as well consider both tstamp_type:2 bits in bpf_convert_tstamp_read() and bpf_convert_tstamp_type_read(). e.g. bpf_convert_tstamp_type_read(), it should be a pretty straight forward change because the SKB_CLOCK_* enum value should be a 1:1 mapping to the BPF_SKB_TSTAMP_*. > >> >>> #else >>> #define SKB_MONO_DELIVERY_TIME_MASK (1 << 0) >>> -#define TC_AT_INGRESS_MASK (1 << 1) >>> +#define TC_AT_INGRESS_MASK (1 << 2) >>> #endif >>> #define SKB_BF_MONO_TC_OFFSET offsetof(struct sk_buff, __mono_tc_offset) >>> > Hi Martin and Willem, I have made the changes as per your guidelines . I will be raising RFC patch bpf-next v4 soon. Giving you heads up of the changes i am bringing in the BPF code. If you feel i have done something incorrectly, please do correct me here. I apologies for adding the code here and making the content of the email huge for other upstream reviewers. //Introduce a new BPF tstamp type mask in bpf.h #ifdef __BIG_ENDIAN_BITFIELD #define SKB_MONO_DELIVERY_TIME_MASK (1 << 7) + #define SKB_TAI_DELIVERY_TIME_MASK (1 << 6) (new) #define TC_AT_INGRESS_MASK (1 << 5) #else #define SKB_MONO_DELIVERY_TIME_MASK (1 << 0) + #define SKB_TAI_DELIVERY_TIME_MASK (1 << 1) (new) #define TC_AT_INGRESS_MASK (1 << 2) #endif //changes in the filter.c (bpf_convert_tstamp_{read,write}, bpf_convert_tstamp_type_read())code are accordingly taken care since now we have 3 bits instead of 2 Value 3 => unspec Value 2 => tai Value 1 => mono static struct bpf_insn *bpf_convert_tstamp_type_read(const struct bpf_insn *si, struct bpf_insn *insn) { __u8 value_reg = si->dst_reg; __u8 skb_reg = si->src_reg; /* AX is needed because src_reg and dst_reg could be the same */ __u8 tmp_reg = BPF_REG_AX; *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, SKB_BF_MONO_TC_OFFSET); *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, SKB_MONO_DELIVERY_TIME_MASK | SKB_TAI_DELIVERY_TIME_MASK, 2); <== check for both bits are set (if so make it tstamp_unspec) *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, SKB_MONO_DELIVERY_TIME_MASK, 3); <== if mono is set then its mono base and jump to 3rd instruction from here. *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, SKB_TAI_DELIVERY_TIME_MASK, 4); <== if tai is set then its tai base and jump to 4th instruction from here. *insn++ = BPF_MOV32_IMM(value_reg, BPF_SKB_TSTAMP_UNSPEC); *insn++ = BPF_JMP_A(1); *insn++ = BPF_MOV32_IMM(value_reg, BPF_SKB_TSTAMP_DELIVERY_MONO); *insn++ = BPF_JMP_A(1); *insn++ = BPF_MOV32_IMM(value_reg, BPF_SKB_TSTAMP_DELIVERY_TAI); return insn; } //Values 7, 6 and 5 as input will set the value reg to 0 //otherwise the skb_reg has correct configuration and will store the content in value reg. static struct bpf_insn *bpf_convert_tstamp_read(const struct bpf_prog *prog, const struct bpf_insn *si, struct bpf_insn *insn) { __u8 value_reg = si->dst_reg; __u8 skb_reg = si->src_reg; #ifdef CONFIG_NET_XGRESS /* If the tstamp_type is read, * the bpf prog is aware the tstamp could have delivery time. * Thus, read skb->tstamp as is if tstamp_type_access is true. */ if (!prog->tstamp_type_access) { /* AX is needed because src_reg and dst_reg could be the same */ __u8 tmp_reg = BPF_REG_AX; *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, SKB_BF_MONO_TC_OFFSET); /*check if all three bits are set*/ *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, TC_AT_INGRESS_MASK | SKB_MONO_DELIVERY_TIME_MASK | SKB_TAI_DELIVERY_TIME_MASK); <== check if all 3 bits are set which is value 7 /*if all 3 bits are set jump 3 instructions and clear the register */ *insn++ = BPF_JMP32_IMM(BPF_JEQ, tmp_reg, TC_AT_INGRESS_MASK | SKB_MONO_DELIVERY_TIME_MASK | SKB_TAI_DELIVERY_TIME_MASK, 4); <== if all 3 bits are set then value reg is set to 0 /*Now check Mono is set with ingress mask if so clear*/ *insn++ = BPF_JMP32_IMM(BPF_JEQ, tmp_reg, TC_AT_INGRESS_MASK | SKB_MONO_DELIVERY_TIME_MASK, 3); <== check if value is 5 (mono + ingress) if so then value reg is set to 0 /*Now Check tai is set with ingress mask if so clear*/ *insn++ = BPF_JMP32_IMM(BPF_JEQ, tmp_reg, TC_AT_INGRESS_MASK | SKB_TAI_DELIVERY_TIME_MASK, 2); <== check if value is 6 (tai + ingress) if so then value reg is set to 0 /*Now Check tai and mono are set if so clear */ *insn++ = BPF_JMP32_IMM(BPF_JEQ, tmp_reg, SKB_MONO_DELIVERY_TIME_MASK | SKB_TAI_DELIVERY_TIME_MASK, 1); <== check if value 3 (tai + mono) if so then value reg is set to 0 /* goto <store> */ *insn++ = BPF_JMP_A(2); /* skb->tc_at_ingress && skb->tstamp_type:1, * read 0 as the (rcv) timestamp. */ *insn++ = BPF_MOV64_IMM(value_reg, 0); *insn++ = BPF_JMP_A(1); } #endif *insn++ = BPF_LDX_MEM(BPF_DW, value_reg, skb_reg, offsetof(struct sk_buff, tstamp)); return insn; } //this was pretty straight forward //if ingress mask is set just go ahead and unset both tai and mono delivery time. static struct bpf_insn *bpf_convert_tstamp_write(const struct bpf_prog *prog, const struct bpf_insn *si, struct bpf_insn *insn) { __u8 value_reg = si->src_reg; __u8 skb_reg = si->dst_reg; #ifdef CONFIG_NET_XGRESS /* If the tstamp_type is read, * the bpf prog is aware the tstamp could have delivery time. * Thus, write skb->tstamp as is if tstamp_type_access is true. * Otherwise, writing at ingress will have to clear the * mono_delivery_time (skb->tstamp_type:1)bit also. */ if (!prog->tstamp_type_access) { __u8 tmp_reg = BPF_REG_AX; *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, SKB_BF_MONO_TC_OFFSET); /* Writing __sk_buff->tstamp as ingress, goto <clear> */ *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, TC_AT_INGRESS_MASK, 1); /* goto <store> */ *insn++ = BPF_JMP_A(3); /* <clear>: mono_delivery_time or (skb->tstamp_type:1) */ *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, ~SKB_MONO_DELIVERY_TIME_MASK); /* <clear>: tai delivery_time or (skb->tstamp_type:2) */ (new) *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, ~SKB_TAI_DELIVERY_TIME_MASK); (new) <== reset tai delivery mask if ingress bit is set. *insn++ = BPF_STX_MEM(BPF_B, skb_reg, tmp_reg, SKB_BF_MONO_TC_OFFSET); } #endif /* <store>: skb->tstamp = tstamp */ *insn++ = BPF_RAW_INSN(BPF_CLASS(si->code) | BPF_DW | BPF_MEM, skb_reg, value_reg, offsetof(struct sk_buff, tstamp), si->imm); return insn; And the ctx_rewrite will be as follows .read = "r11 = *(u8 *)($ctx + sk_buff::__mono_tc_offset);" "w11 &= 7;" "if w11 == 0x7 goto pc+4;" "if w11 == 0x5 goto pc+3;" "if w11 == 0x6 goto pc+2;" "if w11 == 0x3 goto pc+1;" "goto pc+2" "$dst = 0;" "goto pc+1;" "$dst = *(u64 *)($ctx + sk_buff::tstamp);", .write = "r11 = *(u8 *)($ctx + sk_buff::__mono_tc_offset);" "if w11 & 0x4 goto pc+1;" "goto pc+3;" "w11 &= -2;" "w11 &= -3;" "*(u8 *)($ctx + sk_buff::__mono_tc_offset) = r11;" "*(u64 *)($ctx + sk_buff::tstamp) = $src;",