Re: [RFC PATCH bpf-next v5 2/2] net: Add additional bit to support clockid_t timestamp type

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 




On 4/25/2024 9:37 PM, Martin KaFai Lau wrote:
> On 4/24/24 3:20 PM, Abhishek Chauhan wrote:
>> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
>> index e464d0ebc9c1..3ad0de07d261 100644
>> --- a/include/linux/skbuff.h
>> +++ b/include/linux/skbuff.h
>> @@ -711,6 +711,8 @@ typedef unsigned char *sk_buff_data_t;
>>   enum skb_tstamp_type {
>>       SKB_CLOCK_REALTIME,
>>       SKB_CLOCK_MONOTONIC,
>> +    SKB_CLOCK_TAI,
>> +    __SKB_CLOCK_MAX = SKB_CLOCK_TAI,
>>   };
>>     /**
>> @@ -831,8 +833,8 @@ enum skb_tstamp_type {
>>    *    @decrypted: Decrypted SKB
>>    *    @slow_gro: state present at GRO time, slower prepare step required
>>    *    @tstamp_type: When set, skb->tstamp has the
>> - *        delivery_time in mono clock base Otherwise, the
>> - *        timestamp is considered real clock base.
>> + *        delivery_time in mono clock base or clock base of skb->tstamp.
>> + *        Otherwise, the timestamp is considered real clock base
>>    *    @napi_id: id of the NAPI struct this skb came from
>>    *    @sender_cpu: (aka @napi_id) source CPU in XPS
>>    *    @alloc_cpu: CPU which did the skb allocation.
>> @@ -960,7 +962,7 @@ struct sk_buff {
>>       /* private: */
>>       __u8            __mono_tc_offset[0];
>>       /* public: */
>> -    __u8            tstamp_type:1;    /* See skb_tstamp_type */
>> +    __u8            tstamp_type:2;    /* See skb_tstamp_type */
>>   #ifdef CONFIG_NET_XGRESS
>>       __u8            tc_at_ingress:1;    /* See TC_AT_INGRESS_MASK */
>>       __u8            tc_skip_classify:1;
>> @@ -1090,15 +1092,17 @@ struct sk_buff {
>>   #endif
>>   #define PKT_TYPE_OFFSET        offsetof(struct sk_buff, __pkt_type_offset)
>>   -/* if you move tc_at_ingress or mono_delivery_time
>> +/* if you move tc_at_ingress or tstamp_type:2
>>    * around, you also must adapt these constants.
>>    */
>>   #ifdef __BIG_ENDIAN_BITFIELD
>> -#define SKB_MONO_DELIVERY_TIME_MASK    (1 << 7)
>> -#define TC_AT_INGRESS_MASK        (1 << 6)
>> +#define SKB_TSTAMP_TYPE_MASK        (3 << 6)
>> +#define SKB_TSTAMP_TYPE_RSH        (6)
>> +#define TC_AT_INGRESS_RSH        (5)
> 
> TC_AT_INGRESS_RSH is not used.
>  
Noted. I will remove it. 

>> +#define TC_AT_INGRESS_MASK        (1 << 5)
>>   #else
>> -#define SKB_MONO_DELIVERY_TIME_MASK    (1 << 0)
>> -#define TC_AT_INGRESS_MASK        (1 << 1)
>> +#define SKB_TSTAMP_TYPE_MASK        (3)
>> +#define TC_AT_INGRESS_MASK        (1 << 2)
>>   #endif
>>   #define SKB_BF_MONO_TC_OFFSET        offsetof(struct sk_buff, __mono_tc_offset)
>>   @@ -4204,6 +4208,12 @@ static inline void skb_set_tstamp_type_frm_clkid(struct sk_buff *skb,
>>       case CLOCK_MONOTONIC:
>>           skb->tstamp_type = SKB_CLOCK_MONOTONIC;
>>           break;
>> +    case CLOCK_TAI:
>> +        skb->tstamp_type = SKB_CLOCK_TAI;
>> +        break;
>> +    default:
>> +        WARN_ONCE(true, "clockid %d not supported", clockid);
>> +        skb->tstamp_type = SKB_CLOCK_REALTIME;
>>       }
>>   }
>>   diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>> index cee0a7915c08..1376ed5ece10 100644
>> --- a/include/uapi/linux/bpf.h
>> +++ b/include/uapi/linux/bpf.h
> 
> The bpf.h needs to be sync to tools/include/uapi/linux/bpf.h.

Oh i see. There is a bpf.h in tools as well . I was not aware of this part. I will further check this. 
> Otherwise, the bpf CI cannot compile the tests:
> 
> https://patchwork.kernel.org/project/netdevbpf/patch/20240424222028.1080134-2-quic_abchauha@xxxxxxxxxxx/
> 
> Please monitor the bpf CI test result after submitting the patches.
> 
>> @@ -6209,6 +6209,7 @@ union {                    \
>>   enum {
>>       BPF_SKB_TSTAMP_UNSPEC,
>>       BPF_SKB_TSTAMP_DELIVERY_MONO,    /* tstamp has mono delivery time */
>> +    BPF_SKB_TSTAMP_DELIVERY_TAI,    /* tstamp has tai delivery time */
>>       /* For any BPF_SKB_TSTAMP_* that the bpf prog cannot handle,
>>        * the bpf prog should handle it like BPF_SKB_TSTAMP_UNSPEC
>>        * and try to deduce it by ingress, egress or skb->sk->sk_clockid.
> 
> SKB_CLOCK_TAI is properly defined as an enum now and there is a
> WARN for clock other than REAL, MONO, and TAI. I think it is
> time to remove UNSPEC and give it back the proper name REALTIME.
> 
> I want to take this chance to do some renaming:
> 
> /* The enum used in skb->tstamp_type. It specifies the clock type
>  * of the time stored in the skb->tstamp.
>  */
> enum {
>     BPF_SKB_TSTAMP_UNSPEC = 0,              /* DEPRECATED */
>     BPF_SKB_TSTAMP_DELIVERY_MONO = 1,       /* DEPRECATED */
>     BPF_SKB_CLOCK_REALTIME = 0,             /* Realtime clock */
>     BPF_SKB_CLOCK_MONOTONIC = 1,            /* Monotonic clock */
>     BPF_SKB_CLOCK_TAI = 2,                  /* TAI clock */
>     /* For any future BPF_SKB_CLOCK_* that the bpf prog cannot handle,
>      * the bpf prog can try to deduce it by ingress/egress/skb->sk->sk_clockid.
>      */
> };
> 
Okay let me evalute this and get back 
> 
>> diff --git a/net/core/filter.c b/net/core/filter.c
>> index 957c2fc724eb..c67622f4fe98 100644
>> --- a/net/core/filter.c
>> +++ b/net/core/filter.c
>> @@ -7733,6 +7733,12 @@ BPF_CALL_3(bpf_skb_set_tstamp, struct sk_buff *, skb,
>>           skb->tstamp = tstamp;
>>           skb->tstamp_type = SKB_CLOCK_MONOTONIC;
>>           break;
>> +    case BPF_SKB_TSTAMP_DELIVERY_TAI:
>> +        if (!tstamp)
>> +            return -EINVAL;
>> +        skb->tstamp = tstamp;
>> +        skb->tstamp_type = SKB_CLOCK_TAI;
>> +        break;
>>       case BPF_SKB_TSTAMP_UNSPEC:
>>           if (tstamp)
> 
> Allow to store any realtime tstamp here since BPF_SKB_TSTAMP_UNSPEC
> becomes BPF_SKB_CLOCK_REALTIME.
> 
> Like:
> 
> BPF_CALL_3(bpf_skb_set_tstamp, struct sk_buff *, skb,
>            u64, tstamp, u32, tstamp_type)
> {
>     /* ... */
>     case BPF_SKB_CLOCK_TAI:
>         if (!tstamp)
>             return -EINVAL;
>         skb->tstamp = tstamp;
>         skb->tstamp_type = SKB_CLOCK_TAI;
>         break;
>         case BPF_SKB_CLOCK_REALTIME:
>         skb->tstamp = tstamp;
>         skb->tstamp_type = SKB_CLOCK_REALTIME;
>         break;
> 
>     /* ... */
> }
Noted! 
> 
>>               return -EINVAL;
> 
>> @@ -9388,17 +9394,17 @@ static struct bpf_insn *bpf_convert_tstamp_type_read(const struct bpf_insn *si,
>>   {
>>       __u8 value_reg = si->dst_reg;
>>       __u8 skb_reg = si->src_reg;
>> -    /* AX is needed because src_reg and dst_reg could be the same */
>> -    __u8 tmp_reg = BPF_REG_AX;
>> -
>> -    *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg,
>> -                  SKB_BF_MONO_TC_OFFSET);
>> -    *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg,
>> -                SKB_MONO_DELIVERY_TIME_MASK, 2);
>> -    *insn++ = BPF_MOV32_IMM(value_reg, BPF_SKB_TSTAMP_UNSPEC);
>> -    *insn++ = BPF_JMP_A(1);
>> -    *insn++ = BPF_MOV32_IMM(value_reg, BPF_SKB_TSTAMP_DELIVERY_MONO);
>> -
>> +    BUILD_BUG_ON(__SKB_CLOCK_MAX != BPF_SKB_TSTAMP_DELIVERY_TAI);
> 
> Add these also:
> 
Noted! 
>     BUILD_BUG_ON(SKB_CLOCK_REALTIME != BPF_SKB_CLOCK_REALTIME);
>     BUILD_BUG_ON(SKB_CLOCK_MONOTONIC != BPF_SKB_CLOCK_MONOTONIC);
>     BUILD_BUG_ON(SKB_CLOCK_TAI != BPF_SKB_CLOCK_TAI);
> 
>> +    *insn++ = BPF_LDX_MEM(BPF_B, value_reg, skb_reg, SKB_BF_MONO_TC_OFFSET);
>> +    *insn++ = BPF_ALU32_IMM(BPF_AND, value_reg, SKB_TSTAMP_TYPE_MASK);
>> +#ifdef __BIG_ENDIAN_BITFIELD
>> +    *insn++ = BPF_ALU32_IMM(BPF_RSH, value_reg, SKB_TSTAMP_TYPE_RSH);
>> +#else
>> +    BUILD_BUG_ON(!(SKB_TSTAMP_TYPE_MASK & 0x1));
>> +#endif
>> +    *insn++ = BPF_JMP32_IMM(BPF_JNE, value_reg, SKB_TSTAMP_TYPE_MASK, 1);
>> +    /* Both the bits set then mark it BPF_SKB_TSTAMP_UNSPEC */
>> +    *insn++ = BPF_MOV64_IMM(value_reg, BPF_SKB_TSTAMP_UNSPEC);
> 
> The kernel should not have both bits set in skb->tstamp_type. No need to
> add two extra bpf insns to check this. If there is a bug in the kernel,
> it is better to be uncovered instead of hiding it under BPF_SKB_TSTAMP_UNSPEC (which
> is renamed to BPF_SKB_CLOCK_REALTIME anyway).
> Hence, the last two bpf insns should be removed.
> 
Got it !.
>>       return insn;
>>   }
>>   @@ -9430,6 +9436,7 @@ static struct bpf_insn *bpf_convert_tstamp_read(const struct bpf_prog *prog,
>>       __u8 value_reg = si->dst_reg;
>>       __u8 skb_reg = si->src_reg;
>>   +BUILD_BUG_ON(__SKB_CLOCK_MAX != BPF_SKB_TSTAMP_DELIVERY_TAI);
> 
> It is a dup of the one in bpf_convert_tstamp_type_read and can be removed.
> 
Noted!
>>   #ifdef CONFIG_NET_XGRESS
>>       /* If the tstamp_type is read,
>>        * the bpf prog is aware the tstamp could have delivery time.
>> @@ -9440,11 +9447,12 @@ static struct bpf_insn *bpf_convert_tstamp_read(const struct bpf_prog *prog,
>>           __u8 tmp_reg = BPF_REG_AX;
>>             *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, SKB_BF_MONO_TC_OFFSET);
>> -        *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg,
>> -                    TC_AT_INGRESS_MASK | SKB_MONO_DELIVERY_TIME_MASK);
>> -        *insn++ = BPF_JMP32_IMM(BPF_JNE, tmp_reg,
>> -                    TC_AT_INGRESS_MASK | SKB_MONO_DELIVERY_TIME_MASK, 2);
>> -        /* skb->tc_at_ingress && skb->tstamp_type:1,
>> +        /* check if ingress mask bits is set */
>> +        *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, TC_AT_INGRESS_MASK, 1);
>> +        *insn++ = BPF_JMP_A(4);
>> +        *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, SKB_TSTAMP_TYPE_MASK, 1);
>> +        *insn++ = BPF_JMP_A(2);
>> +        /* skb->tc_at_ingress && skb->tstamp_type:2,
>>            * read 0 as the (rcv) timestamp.
>>            */
>>           *insn++ = BPF_MOV64_IMM(value_reg, 0);
>> @@ -9469,7 +9477,7 @@ static struct bpf_insn *bpf_convert_tstamp_write(const struct bpf_prog *prog,
>>        * the bpf prog is aware the tstamp could have delivery time.
>>        * Thus, write skb->tstamp as is if tstamp_type_access is true.
>>        * Otherwise, writing at ingress will have to clear the
>> -     * mono_delivery_time (skb->tstamp_type:1)bit also.
>> +     * mono_delivery_time (skb->tstamp_type:2)bit also.
>>        */
>>       if (!prog->tstamp_type_access) {
>>           __u8 tmp_reg = BPF_REG_AX;
>> @@ -9479,8 +9487,8 @@ static struct bpf_insn *bpf_convert_tstamp_write(const struct bpf_prog *prog,
>>           *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, TC_AT_INGRESS_MASK, 1);
>>           /* goto <store> */
>>           *insn++ = BPF_JMP_A(2);
>> -        /* <clear>: mono_delivery_time or (skb->tstamp_type:1) */
>> -        *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, ~SKB_MONO_DELIVERY_TIME_MASK);
>> +        /* <clear>: skb->tstamp_type:2 */
>> +        *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, ~SKB_TSTAMP_TYPE_MASK);
>>           *insn++ = BPF_STX_MEM(BPF_B, skb_reg, tmp_reg, SKB_BF_MONO_TC_OFFSET);
>>       }
>>   #endif
>> diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
>> index 591226dcde26..f195b31d6e75 100644
>> --- a/net/ipv4/ip_output.c
>> +++ b/net/ipv4/ip_output.c
>> @@ -1457,7 +1457,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk,
>>         skb->priority = (cork->tos != -1) ? cork->priority: READ_ONCE(sk->sk_priority);
>>       skb->mark = cork->mark;
>> -    skb->tstamp = cork->transmit_time;
>> +    skb_set_tstamp_type_frm_clkid(skb, cork->transmit_time, sk->sk_clockid);
> 
> hmm... I think this will break for tcp. This sequence in particular:
> 
> tcp_v4_timewait_ack()
>   tcp_v4_send_ack()
>     ip_send_unicast_reply()
>       ip_push_pending_frames()
>         ip_finish_skb()
>           __ip_make_skb()
>             /* sk_clockid is REAL but cork->transmit_time should be in mono */
>             skb_set_tstamp_type_frm_clkid(skb, cork->transmit_time, sk->sk_clockid);;
> 
> I think I hit it from time to time when running the test in this patch set.
> 
do you think i need to check for protocol type here . since tcp uses Mono and the rest according to the new design is based on 
sk->sk_clockid
if (iph->protocol == IPPROTO_TCP)
	skb_set_tstamp_type_frm_clkid(skb, cork->transmit_time, CLOCK_MONOTONIC);
else 
	skb_set_tstamp_type_frm_clkid(skb, cork->transmit_time, sk->sk_clockid);


> [ ... ]
> 
>> diff --git a/tools/testing/selftests/bpf/progs/test_tc_dtime.c b/tools/testing/selftests/bpf/progs/test_tc_dtime.c
>> index 74ec09f040b7..19dba6d88265 100644
>> --- a/tools/testing/selftests/bpf/progs/test_tc_dtime.c
>> +++ b/tools/testing/selftests/bpf/progs/test_tc_dtime.c
> 
> Please separate the selftests/bpf changes into another patch.
> 
I will do that. 
>> @@ -227,6 +227,12 @@ int egress_host(struct __sk_buff *skb)
>>               inc_dtimes(EGRESS_ENDHOST);
>>           else
>>               inc_errs(EGRESS_ENDHOST);
>> +    } else if (skb_proto(skb_type) == IPPROTO_UDP) {
>> +        if (skb->tstamp_type == BPF_SKB_TSTAMP_DELIVERY_TAI &&
>> +            skb->tstamp)
>> +            inc_dtimes(EGRESS_ENDHOST);
>> +        else
>> +            inc_errs(EGRESS_ENDHOST);
>>       } else {
>>           if (skb->tstamp_type == BPF_SKB_TSTAMP_UNSPEC &&
>>               skb->tstamp)
>> @@ -255,6 +261,9 @@ int ingress_host(struct __sk_buff *skb)
>>       if (skb->tstamp_type == BPF_SKB_TSTAMP_DELIVERY_MONO &&
>>           skb->tstamp == EGRESS_FWDNS_MAGIC)
>>           inc_dtimes(INGRESS_ENDHOST);
>> +    else if (skb->tstamp_type == BPF_SKB_TSTAMP_DELIVERY_TAI &&
>> +               skb->tstamp == EGRESS_FWDNS_MAGIC)
>> +        inc_dtimes(INGRESS_ENDHOST);
>>       else
>>           inc_errs(INGRESS_ENDHOST);
>>   @@ -323,12 +332,14 @@ int ingress_fwdns_prio101(struct __sk_buff *skb)
>>           /* Should have handled in prio100 */
>>           return TC_ACT_SHOT;
>>   -    if (skb_proto(skb_type) == IPPROTO_UDP)
>> +    if (skb_proto(skb_type) == IPPROTO_UDP &&
>> +          skb->tstamp_type != BPF_SKB_TSTAMP_DELIVERY_TAI)
>>           expected_dtime = 0;
> 
> The IPPROTO_UDP check and expected_dtime can be removed. The UDP test
> can expect the same EGRESS_ENDHOST_MAGIC in the skb->tstamp since
> the TAI tstamp is also forwarded from egress to ingress now.
> 
>>         if (skb->tstamp_type) {
>>           if (fwdns_clear_dtime() ||
>> -            skb->tstamp_type != BPF_SKB_TSTAMP_DELIVERY_MONO ||
>> +            (skb->tstamp_type != BPF_SKB_TSTAMP_DELIVERY_MONO &&
>> +            skb->tstamp_type != BPF_SKB_TSTAMP_DELIVERY_TAI) ||
>>               skb->tstamp != expected_dtime)
>>               inc_errs(INGRESS_FWDNS_P101);
>>           else
>> @@ -338,7 +349,8 @@ int ingress_fwdns_prio101(struct __sk_buff *skb)
>>               inc_errs(INGRESS_FWDNS_P101);
>>       }
>>   -    if (skb->tstamp_type == BPF_SKB_TSTAMP_DELIVERY_MONO) {
>> +    if (skb->tstamp_type == BPF_SKB_TSTAMP_DELIVERY_MONO ||
>> +          skb->tstamp_type == BPF_SKB_TSTAMP_DELIVERY_TAI) {
> 
> No need to check BPF_SKB_TSTAMP_DELIVERY_TAI such that the
> bpf_skb_set_tstamp() helper can still be tested.
> 
> There are some other minor changes needed for the test_tc_dtime.c and
> the tc_redirect.c. I quickly made the changes and put them here (first patch):
> 
> https://git.kernel.org/pub/scm/linux/kernel/git/martin.lau/bpf-next.git/log/?h=skb.tstamp_type
> 
Thanks for your help Martin. I will check the changes you have made in BPF test framework and see what i have missed. 

> 
> 
>>           skb->tstamp = INGRESS_FWDNS_MAGIC;
>>       } else {
>>           if (bpf_skb_set_tstamp(skb, INGRESS_FWDNS_MAGIC,
>> @@ -370,7 +382,8 @@ int egress_fwdns_prio101(struct __sk_buff *skb)
>>         if (skb->tstamp_type) {
>>           if (fwdns_clear_dtime() ||
>> -            skb->tstamp_type != BPF_SKB_TSTAMP_DELIVERY_MONO ||
>> +            (skb->tstamp_type != BPF_SKB_TSTAMP_DELIVERY_MONO &&
>> +             skb->tstamp_type != BPF_SKB_TSTAMP_DELIVERY_TAI) ||
>>               skb->tstamp != INGRESS_FWDNS_MAGIC)
>>               inc_errs(EGRESS_FWDNS_P101);
>>           else
>> @@ -380,7 +393,8 @@ int egress_fwdns_prio101(struct __sk_buff *skb)
>>               inc_errs(EGRESS_FWDNS_P101);
>>       }
>>   -    if (skb->tstamp_type == BPF_SKB_TSTAMP_DELIVERY_MONO) {
>> +    if (skb->tstamp_type == BPF_SKB_TSTAMP_DELIVERY_MONO ||
>> +          skb->tstamp_type == BPF_SKB_TSTAMP_DELIVERY_TAI) {
>>           skb->tstamp = EGRESS_FWDNS_MAGIC;
>>       } else {
>>           if (bpf_skb_set_tstamp(skb, EGRESS_FWDNS_MAGIC,
> 




[Index of Archives]     [Linux Samsung SoC]     [Linux Rockchip SoC]     [Linux Actions SoC]     [Linux for Synopsys ARC Processors]     [Linux NFS]     [Linux NILFS]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]


  Powered by Linux