Re: [RFC PATCH bpf-next v3 2/2] net: Add additional bit to support userspace timestamp type

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 




On 4/15/2024 1:00 PM, Martin KaFai Lau wrote:
> On 4/13/24 12:07 PM, Willem de Bruijn wrote:
>>> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
>>> index a83a2120b57f..b6346c21c3d4 100644
>>> --- a/include/linux/skbuff.h
>>> +++ b/include/linux/skbuff.h
>>> @@ -827,7 +827,8 @@ enum skb_tstamp_type {
>>>    *    @tstamp_type: When set, skb->tstamp has the
>>>    *        delivery_time in mono clock base (i.e. EDT).  Otherwise, the
>>>    *        skb->tstamp has the (rcv) timestamp at ingress and
>>> - *        delivery_time at egress.
>>> + *        delivery_time at egress or skb->tstamp defined by skb->sk->sk_clockid
>>> + *        coming from userspace
>>>    *    @napi_id: id of the NAPI struct this skb came from
>>>    *    @sender_cpu: (aka @napi_id) source CPU in XPS
>>>    *    @alloc_cpu: CPU which did the skb allocation.
>>> @@ -955,7 +956,7 @@ struct sk_buff {
>>>       /* private: */
>>>       __u8            __mono_tc_offset[0];
>>>       /* public: */
>>> -    __u8            tstamp_type:1;    /* See SKB_MONO_DELIVERY_TIME_MASK */
>>> +    __u8            tstamp_type:2;    /* See SKB_MONO_DELIVERY_TIME_MASK */
>>>   #ifdef CONFIG_NET_XGRESS
>>>       __u8            tc_at_ingress:1;    /* See TC_AT_INGRESS_MASK */
>>>       __u8            tc_skip_classify:1;
>>
>> A quick pahole for a fairly standard .config that I had laying around
>> shows a hole after this list of bits, so no huge concerns there from
>> adding a bit:
>>
>>             __u8               slow_gro:1;           /*     3: 4  1 */
>>             __u8               csum_not_inet:1;      /*     3: 5  1 */
>>
>>             /* XXX 2 bits hole, try to pack */
>>
>>             __u16              tc_index;             /*     4     2 */
>>
>>> @@ -1090,10 +1091,10 @@ struct sk_buff {
>>>    */
>>>   #ifdef __BIG_ENDIAN_BITFIELD
>>>   #define SKB_MONO_DELIVERY_TIME_MASK    (1 << 7)
>>> -#define TC_AT_INGRESS_MASK        (1 << 6)
>>> +#define TC_AT_INGRESS_MASK        (1 << 5)
>>
>> Have to be careful when adding a new 2 bit tstamp_type with both bits
>> set, that this does not incorrectly get interpreted as MONO.
>>
>> I haven't looked closely at the BPF API, but hopefully it can be
>> extensible to return the specific type. If it is hardcoded to return
>> either MONO or not, then only 0x1 should match, not 0x3.
> 
> Good point. I believe it is the best to have bpf to consider both bits in tstamp_type:2 in filter.c to avoid the 0x3 surprise in the future. The BPF API can be extended to support SKB_CLOCK_TAI.
> 
> Regardless, in bpf_convert_tstamp_write(), it still needs to clear both bits in tstamp_type when it is at ingress. Right now it only clears the mono bit.
> 
> Then it may as well consider both tstamp_type:2 bits in bpf_convert_tstamp_read() and bpf_convert_tstamp_type_read(). e.g. bpf_convert_tstamp_type_read(), it should be a pretty straight forward change because the SKB_CLOCK_* enum value should be a 1:1 mapping to the BPF_SKB_TSTAMP_*.
> 
>>
>>>   #else
>>>   #define SKB_MONO_DELIVERY_TIME_MASK    (1 << 0)
>>> -#define TC_AT_INGRESS_MASK        (1 << 1)
>>> +#define TC_AT_INGRESS_MASK        (1 << 2)
>>>   #endif
>>>   #define SKB_BF_MONO_TC_OFFSET        offsetof(struct sk_buff, __mono_tc_offset)
>>>   
> 
Hi Martin and Willem,

I have made the changes as per your guidelines . I will be raising RFC patch bpf-next v4 soon. Giving you heads up of the changes i am bringing in the BPF code. If you feel i have done something incorrectly, please do correct me here.  
I apologies for adding the code here and making the content of the email huge for other upstream reviewers. 

//Introduce a new BPF tstamp type mask in bpf.h 

#ifdef __BIG_ENDIAN_BITFIELD
#define SKB_MONO_DELIVERY_TIME_MASK	(1 << 7)
+ #define SKB_TAI_DELIVERY_TIME_MASK	(1 << 6) (new)
#define TC_AT_INGRESS_MASK		(1 << 5)
#else
#define SKB_MONO_DELIVERY_TIME_MASK	(1 << 0)
+ #define SKB_TAI_DELIVERY_TIME_MASK	(1 << 1) (new)
#define TC_AT_INGRESS_MASK		(1 << 2)
#endif

//changes in the filter.c (bpf_convert_tstamp_{read,write}, bpf_convert_tstamp_type_read())code are accordingly taken care since now we have 3 bits instead of 2 

Value 3 => unspec
Value 2 => tai 
Value 1 => mono

static struct bpf_insn *bpf_convert_tstamp_type_read(const struct bpf_insn *si,
						     struct bpf_insn *insn)
{
	__u8 value_reg = si->dst_reg;
	__u8 skb_reg = si->src_reg;
	/* AX is needed because src_reg and dst_reg could be the same */
	__u8 tmp_reg = BPF_REG_AX;

	*insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg,
			      SKB_BF_MONO_TC_OFFSET);
	*insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg,
				SKB_MONO_DELIVERY_TIME_MASK | SKB_TAI_DELIVERY_TIME_MASK, 2); <== check for both bits are set (if so make it tstamp_unspec)
	*insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, 
				SKB_MONO_DELIVERY_TIME_MASK, 3); <== if mono is set then its mono base and jump to 3rd instruction from here.  
	*insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg,
				SKB_TAI_DELIVERY_TIME_MASK, 4); <== if tai is set then its tai base and jump to 4th instruction from here. 
	*insn++ = BPF_MOV32_IMM(value_reg, BPF_SKB_TSTAMP_UNSPEC);
	*insn++ = BPF_JMP_A(1);
	*insn++ = BPF_MOV32_IMM(value_reg, BPF_SKB_TSTAMP_DELIVERY_MONO);
	*insn++ = BPF_JMP_A(1);
	*insn++ = BPF_MOV32_IMM(value_reg, BPF_SKB_TSTAMP_DELIVERY_TAI);

	return insn;
}

//Values 7, 6 and 5 as input will set the value reg to 0 
//otherwise the skb_reg has correct configuration and will store the content in value reg. 
                 
static struct bpf_insn *bpf_convert_tstamp_read(const struct bpf_prog *prog,
						const struct bpf_insn *si,
						struct bpf_insn *insn)
{
	__u8 value_reg = si->dst_reg;
	__u8 skb_reg = si->src_reg;

#ifdef CONFIG_NET_XGRESS
	/* If the tstamp_type is read,
	 * the bpf prog is aware the tstamp could have delivery time.
	 * Thus, read skb->tstamp as is if tstamp_type_access is true.
	 */
	if (!prog->tstamp_type_access) {
		/* AX is needed because src_reg and dst_reg could be the same */
		__u8 tmp_reg = BPF_REG_AX;

		*insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, SKB_BF_MONO_TC_OFFSET);
		/*check if all three bits are set*/
		*insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg,
					TC_AT_INGRESS_MASK | SKB_MONO_DELIVERY_TIME_MASK |
					SKB_TAI_DELIVERY_TIME_MASK);  <== check if all 3 bits are set which is value 7 
		/*if all 3 bits are set jump 3 instructions and clear the register */
		*insn++ = BPF_JMP32_IMM(BPF_JEQ, tmp_reg,
					TC_AT_INGRESS_MASK | SKB_MONO_DELIVERY_TIME_MASK | 
					SKB_TAI_DELIVERY_TIME_MASK, 4); <== if all 3 bits are set then value reg is set to 0 
		/*Now check Mono is set with ingress mask if so clear*/
		*insn++ = BPF_JMP32_IMM(BPF_JEQ, tmp_reg,
					TC_AT_INGRESS_MASK | SKB_MONO_DELIVERY_TIME_MASK, 3); <== check if value is 5 (mono + ingress) if so then value reg is set to 0 
		/*Now Check tai is set with ingress mask if so clear*/
		*insn++ = BPF_JMP32_IMM(BPF_JEQ, tmp_reg,
					TC_AT_INGRESS_MASK | SKB_TAI_DELIVERY_TIME_MASK, 2); <== check if value is 6 (tai + ingress) if so then value reg is set to 0 
		/*Now Check tai and mono are set if so clear */
		*insn++ = BPF_JMP32_IMM(BPF_JEQ, tmp_reg,
					SKB_MONO_DELIVERY_TIME_MASK | SKB_TAI_DELIVERY_TIME_MASK, 1); <== check if value 3 (tai + mono) if so then value reg is set to 0
		/* goto <store> */
		*insn++ = BPF_JMP_A(2);
		/* skb->tc_at_ingress && skb->tstamp_type:1,
		 * read 0 as the (rcv) timestamp.
		 */
		*insn++ = BPF_MOV64_IMM(value_reg, 0);
		*insn++ = BPF_JMP_A(1);
	}
#endif

	*insn++ = BPF_LDX_MEM(BPF_DW, value_reg, skb_reg,
			      offsetof(struct sk_buff, tstamp));
	return insn;
}

//this was pretty straight forward 
//if ingress mask is set just go ahead and unset both tai and mono delivery time. 

static struct bpf_insn *bpf_convert_tstamp_write(const struct bpf_prog *prog,
						 const struct bpf_insn *si,
						 struct bpf_insn *insn)
{
	__u8 value_reg = si->src_reg;
	__u8 skb_reg = si->dst_reg;

#ifdef CONFIG_NET_XGRESS
	/* If the tstamp_type is read,
	 * the bpf prog is aware the tstamp could have delivery time.
	 * Thus, write skb->tstamp as is if tstamp_type_access is true.
	 * Otherwise, writing at ingress will have to clear the
	 * mono_delivery_time (skb->tstamp_type:1)bit also.
	 */
	if (!prog->tstamp_type_access) {
		__u8 tmp_reg = BPF_REG_AX;

		*insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, SKB_BF_MONO_TC_OFFSET);
		/* Writing __sk_buff->tstamp as ingress, goto <clear> */
		*insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, TC_AT_INGRESS_MASK, 1);
		/* goto <store> */
		*insn++ = BPF_JMP_A(3);
		/* <clear>: mono_delivery_time or (skb->tstamp_type:1) */
		*insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, ~SKB_MONO_DELIVERY_TIME_MASK);
		/* <clear>: tai delivery_time or (skb->tstamp_type:2) */ (new)
		*insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, ~SKB_TAI_DELIVERY_TIME_MASK); (new) <== reset tai delivery mask if ingress bit is set. 
		*insn++ = BPF_STX_MEM(BPF_B, skb_reg, tmp_reg, SKB_BF_MONO_TC_OFFSET); 
	}
#endif

	/* <store>: skb->tstamp = tstamp */
	*insn++ = BPF_RAW_INSN(BPF_CLASS(si->code) | BPF_DW | BPF_MEM,
			       skb_reg, value_reg, offsetof(struct sk_buff, tstamp), si->imm);
	return insn;


And the ctx_rewrite will be as follows 

		.read  = "r11 = *(u8 *)($ctx + sk_buff::__mono_tc_offset);"
			 "w11 &= 7;"
			 "if w11 == 0x7 goto pc+4;"
			 "if w11 == 0x5 goto pc+3;"
			 "if w11 == 0x6 goto pc+2;"
			 "if w11 == 0x3 goto pc+1;"
			 "goto pc+2"
			 "$dst = 0;"
			 "goto pc+1;"
			 "$dst = *(u64 *)($ctx + sk_buff::tstamp);",
		.write = "r11 = *(u8 *)($ctx + sk_buff::__mono_tc_offset);"
			 "if w11 & 0x4 goto pc+1;"
			 "goto pc+3;"
			 "w11 &= -2;"
			 "w11 &= -3;"
			 "*(u8 *)($ctx + sk_buff::__mono_tc_offset) = r11;"
			 "*(u64 *)($ctx + sk_buff::tstamp) = $src;",




[Index of Archives]     [Linux Samsung SoC]     [Linux Rockchip SoC]     [Linux Actions SoC]     [Linux for Synopsys ARC Processors]     [Linux NFS]     [Linux NILFS]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]


  Powered by Linux