On Mon, Dec 23, 2019 at 03:26:50PM -0800, Andrii Nakryiko wrote:
> On Fri, Dec 20, 2019 at 10:26 PM Martin KaFai Lau <kafai@xxxxxx> wrote:
> >
> > This patch adds a bpf_dctcp example. It currently does not do
> > no-ECN fallback but the same could be done through the cgrp2-bpf.
> >
> > Signed-off-by: Martin KaFai Lau <kafai@xxxxxx>
> > ---
> >  tools/testing/selftests/bpf/bpf_tcp_helpers.h | 228 ++++++++++++++++++
> >  .../selftests/bpf/prog_tests/bpf_tcp_ca.c     | 218 +++++++++++++++++
> >  tools/testing/selftests/bpf/progs/bpf_dctcp.c | 210 ++++++++++++++++
> >  3 files changed, 656 insertions(+)
> >  create mode 100644 tools/testing/selftests/bpf/bpf_tcp_helpers.h
> >  create mode 100644 tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c
> >  create mode 100644 tools/testing/selftests/bpf/progs/bpf_dctcp.c
> >
> > diff --git a/tools/testing/selftests/bpf/bpf_tcp_helpers.h b/tools/testing/selftests/bpf/bpf_tcp_helpers.h
> > new file mode 100644
> > index 000000000000..7ba8c1b4157a
> > --- /dev/null
> > +++ b/tools/testing/selftests/bpf/bpf_tcp_helpers.h
> > @@ -0,0 +1,228 @@
> > +/* SPDX-License-Identifier: GPL-2.0 */
> > +#ifndef __BPF_TCP_HELPERS_H
> > +#define __BPF_TCP_HELPERS_H
> > +
> > +#include <stdbool.h>
> > +#include <linux/types.h>
> > +#include <bpf_helpers.h>
> > +#include <bpf_core_read.h>
> > +#include "bpf_trace_helpers.h"
> > +
> > +#define BPF_TCP_OPS_0(fname, ret_type, ...) BPF_TRACE_x(0, #fname"_sec", fname, ret_type, __VA_ARGS__)
> > +#define BPF_TCP_OPS_1(fname, ret_type, ...) BPF_TRACE_x(1, #fname"_sec", fname, ret_type, __VA_ARGS__)
> > +#define BPF_TCP_OPS_2(fname, ret_type, ...) BPF_TRACE_x(2, #fname"_sec", fname, ret_type, __VA_ARGS__)
> > +#define BPF_TCP_OPS_3(fname, ret_type, ...) BPF_TRACE_x(3, #fname"_sec", fname, ret_type, __VA_ARGS__)
> > +#define BPF_TCP_OPS_4(fname, ret_type, ...) BPF_TRACE_x(4, #fname"_sec", fname, ret_type, __VA_ARGS__)
> > +#define BPF_TCP_OPS_5(fname, ret_type, ...) BPF_TRACE_x(5, #fname"_sec", fname, ret_type, __VA_ARGS__)
>
> Should we try to put those BPF programs into some section that would
> indicate they are used with struct ops? libbpf doesn't use or enforce
> that (even though it could, to derive and enforce that they are
> STRUCT_OPS programs). So something like
> SEC("struct_ops/<ideally-operation-name-here>"). I think having this
> convention is very useful for consistency and to do a quick ELF dump
> and see what is where. WDYT?
I did not use it here because I don't want to create the misperception
that it is a convention required by libbpf.

Sure, I can use that prefix here and add a comment saying it is just a
convention, not a libbpf requirement.
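Something like this, I suppose (a sketch only; the "struct_ops/" prefix
would be purely a naming convention, nothing that libbpf parses or
enforces):

#define BPF_TCP_OPS_1(fname, ret_type, ...)	\
	BPF_TRACE_x(1, "struct_ops/"#fname, fname, ret_type, __VA_ARGS__)

so that, e.g., a dctcp_init program would land in the
"struct_ops/dctcp_init" ELF section and be easy to spot in an ELF dump.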
>
> > +
> > +struct sock_common {
> > +	unsigned char	skc_state;
> > +} __attribute__((preserve_access_index));
> > +
> > +struct sock {
> > +	struct sock_common	__sk_common;
> > +} __attribute__((preserve_access_index));
> > +
> > +struct inet_sock {
> > +	struct sock	sk;
> > +} __attribute__((preserve_access_index));
> > +
> > +struct inet_connection_sock {
> > +	struct inet_sock	icsk_inet;
> > +	__u8			icsk_ca_state:6,
> > +				icsk_ca_setsockopt:1,
> > +				icsk_ca_dst_locked:1;
> > +	struct {
> > +		__u8		pending;
> > +	} icsk_ack;
> > +	__u64			icsk_ca_priv[104 / sizeof(__u64)];
> > +} __attribute__((preserve_access_index));
> > +
> > +struct tcp_sock {
> > +	struct inet_connection_sock	inet_conn;
> > +
> > +	__u32	rcv_nxt;
> > +	__u32	snd_nxt;
> > +	__u32	snd_una;
> > +	__u8	ecn_flags;
> > +	__u32	delivered;
> > +	__u32	delivered_ce;
> > +	__u32	snd_cwnd;
> > +	__u32	snd_cwnd_cnt;
> > +	__u32	snd_cwnd_clamp;
> > +	__u32	snd_ssthresh;
> > +	__u8	syn_data:1,	/* SYN includes data */
> > +		syn_fastopen:1,	/* SYN includes Fast Open option */
> > +		syn_fastopen_exp:1,/* SYN includes Fast Open exp. option */
> > +		syn_fastopen_ch:1, /* Active TFO re-enabling probe */
> > +		syn_data_acked:1,/* data in SYN is acked by SYN-ACK */
> > +		save_syn:1,	/* Save headers of SYN packet */
> > +		is_cwnd_limited:1,/* forward progress limited by snd_cwnd? */
> > +		syn_smc:1;	/* SYN includes SMC */
> > +	__u32	max_packets_out;
> > +	__u32	lsndtime;
> > +	__u32	prior_cwnd;
> > +} __attribute__((preserve_access_index));
> > +
> > +static __always_inline struct inet_connection_sock *inet_csk(const struct sock *sk)
> > +{
> > +	return (struct inet_connection_sock *)sk;
> > +}
> > +
> > +static __always_inline void *inet_csk_ca(const struct sock *sk)
> > +{
> > +	return (void *)inet_csk(sk)->icsk_ca_priv;
> > +}
> > +
> > +static __always_inline struct tcp_sock *tcp_sk(const struct sock *sk)
> > +{
> > +	return (struct tcp_sock *)sk;
> > +}
> > +
> > +static __always_inline bool before(__u32 seq1, __u32 seq2)
> > +{
> > +	return (__s32)(seq1-seq2) < 0;
> > +}
> > +#define after(seq2, seq1)	before(seq1, seq2)
> > +
> > +#define TCP_ECN_OK		1
> > +#define TCP_ECN_QUEUE_CWR	2
> > +#define TCP_ECN_DEMAND_CWR	4
> > +#define TCP_ECN_SEEN		8
> > +
> > +enum inet_csk_ack_state_t {
> > +	ICSK_ACK_SCHED	= 1,
> > +	ICSK_ACK_TIMER	= 2,
> > +	ICSK_ACK_PUSHED	= 4,
> > +	ICSK_ACK_PUSHED2 = 8,
> > +	ICSK_ACK_NOW = 16	/* Send the next ACK immediately (once) */
> > +};
> > +
> > +enum tcp_ca_event {
> > +	CA_EVENT_TX_START = 0,
> > +	CA_EVENT_CWND_RESTART = 1,
> > +	CA_EVENT_COMPLETE_CWR = 2,
> > +	CA_EVENT_LOSS = 3,
> > +	CA_EVENT_ECN_NO_CE = 4,
> > +	CA_EVENT_ECN_IS_CE = 5,
> > +};
> > +
> > +enum tcp_ca_state {
> > +	TCP_CA_Open = 0,
> > +	TCP_CA_Disorder = 1,
> > +	TCP_CA_CWR = 2,
> > +	TCP_CA_Recovery = 3,
> > +	TCP_CA_Loss = 4
> > +};
> > +
> > +struct ack_sample {
> > +	__u32 pkts_acked;
> > +	__s32 rtt_us;
> > +	__u32 in_flight;
> > +} __attribute__((preserve_access_index));
> > +
> > +struct rate_sample {
> > +	__u64  prior_mstamp;	/* starting timestamp for interval */
> > +	__u32  prior_delivered;	/* tp->delivered at "prior_mstamp" */
> > +	__s32  delivered;	/* number of packets delivered over interval */
> > +	long interval_us;	/* time for tp->delivered to incr "delivered" */
> > +	__u32 snd_interval_us;	/* snd interval for delivered packets */
> > +	__u32 rcv_interval_us;	/* rcv interval for delivered packets */
> > +	long rtt_us;		/* RTT of last (S)ACKed packet (or -1) */
> > +	int  losses;		/* number of packets marked lost upon ACK */
> > +	__u32  acked_sacked;	/* number of packets newly (S)ACKed upon ACK */
> > +	__u32  prior_in_flight;	/* in flight before this ACK */
> > +	bool is_app_limited;	/* is sample from packet with bubble in pipe? */
> > +	bool is_retrans;	/* is sample from retransmission? */
> > +	bool is_ack_delayed;	/* is this (likely) a delayed ACK? */
> > +} __attribute__((preserve_access_index));
> > +
> > +#define TCP_CA_NAME_MAX		16
> > +#define TCP_CONG_NEEDS_ECN	0x2
> > +
> > +struct tcp_congestion_ops {
> > +	__u32 flags;
> > +
> > +	/* initialize private data (optional) */
> > +	void (*init)(struct sock *sk);
> > +	/* cleanup private data (optional) */
> > +	void (*release)(struct sock *sk);
> > +
> > +	/* return slow start threshold (required) */
> > +	__u32 (*ssthresh)(struct sock *sk);
> > +	/* do new cwnd calculation (required) */
> > +	void (*cong_avoid)(struct sock *sk, __u32 ack, __u32 acked);
> > +	/* call before changing ca_state (optional) */
> > +	void (*set_state)(struct sock *sk, __u8 new_state);
> > +	/* call when cwnd event occurs (optional) */
> > +	void (*cwnd_event)(struct sock *sk, enum tcp_ca_event ev);
> > +	/* call when ack arrives (optional) */
> > +	void (*in_ack_event)(struct sock *sk, __u32 flags);
> > +	/* new value of cwnd after loss (required) */
> > +	__u32 (*undo_cwnd)(struct sock *sk);
> > +	/* hook for packet ack accounting (optional) */
> > +	void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample);
> > +	/* override sysctl_tcp_min_tso_segs */
> > +	__u32 (*min_tso_segs)(struct sock *sk);
> > +	/* returns the multiplier used in tcp_sndbuf_expand (optional) */
> > +	__u32 (*sndbuf_expand)(struct sock *sk);
> > +	/* call when packets are delivered to update cwnd and pacing rate,
> > +	 * after all the ca_state processing. (optional)
> > +	 */
> > +	void (*cong_control)(struct sock *sk, const struct rate_sample *rs);
> > +
> > +	char name[TCP_CA_NAME_MAX];
> > +};
>
> Can all of these types come from vmlinux.h instead of being duplicated here?
It can, but I prefer leaving it as is in bpf_tcp_helpers.h, like the
existing test in kfree_skb.c.  By not reusing the exact struct
definitions from vmlinux.h, I think it also makes a good CO-RE test for
libbpf (a small sketch of what preserve_access_index gives us is at the
end of this mail).

That reminds me: I should shuffle the member ordering a little in
tcp_congestion_ops here.

>
> > +
> > +#define min(a, b) ((a) < (b) ? (a) : (b))
> > +#define max(a, b) ((a) > (b) ? (a) : (b))
> > +#define min_not_zero(x, y) ({			\
> > +	typeof(x) __x = (x);			\
> > +	typeof(y) __y = (y);			\
> > +	__x == 0 ? __y : ((__y == 0) ? __x : min(__x, __y)); })
> > +
>
> [...]
>
> > +static struct bpf_object *load(const char *filename, const char *map_name,
> > +			       struct bpf_link **link)
> > +{
> > +	struct bpf_object *obj;
> > +	struct bpf_map *map;
> > +	struct bpf_link *l;
> > +	int err;
> > +
> > +	obj = bpf_object__open(filename);
> > +	if (CHECK(IS_ERR(obj), "bpf_obj__open_file", "obj:%ld\n",
> > +		  PTR_ERR(obj)))
> > +		return obj;
> > +
> > +	err = bpf_object__load(obj);
> > +	if (CHECK(err, "bpf_object__load", "err:%d\n", err)) {
> > +		bpf_object__close(obj);
> > +		return ERR_PTR(err);
> > +	}
> > +
> > +	map = bpf_object__find_map_by_name(obj, map_name);
> > +	if (CHECK(!map, "bpf_object__find_map_by_name", "%s not found\n",
> > +		  map_name)) {
> > +		bpf_object__close(obj);
> > +		return ERR_PTR(-ENOENT);
> > +	}
> > +
>
> use skeleton instead?
Will give it a spin.
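Something along these lines, I would guess (untested sketch; it assumes
the generated header is bpf_dctcp.skel.h and that the struct_ops map is
named "dctcp" after the tcp_congestion_ops variable in bpf_dctcp.c):

#include "bpf_dctcp.skel.h"

static void test_dctcp(void)
{
	struct bpf_dctcp *skel;
	struct bpf_link *link;

	/* open + load replaces the hand-rolled load() below */
	skel = bpf_dctcp__open_and_load();
	if (CHECK(!skel, "bpf_dctcp__open_and_load", "failed\n"))
		return;

	link = bpf_map__attach_struct_ops(skel->maps.dctcp);
	if (CHECK(IS_ERR(link), "bpf_map__attach_struct_ops", "err:%ld\n",
		  PTR_ERR(link)))
		goto done;

	/* ... run the actual connect/transfer test here ... */

	bpf_link__destroy(link);
done:
	bpf_dctcp__destroy(skel);
}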
>
> > +	l = bpf_map__attach_struct_ops(map);
> > +	if (CHECK(IS_ERR(l), "bpf_struct_ops_map__attach", "err:%ld\n",
> > +		  PTR_ERR(l))) {
> > +		bpf_object__close(obj);
> > +		return (void *)l;
> > +	}
> > +
> > +	*link = l;
> > +
> > +	return obj;
> > +}
> > +
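Re: vmlinux.h vs the local definitions above, here is the sketch I
promised.  With __attribute__((preserve_access_index)), clang records a
CO-RE relocation for every field access, and libbpf rewrites the offset
against the running kernel's types at load time, so the local structs
only need the members the program actually touches.  For example
(get_snd_cwnd is a made-up helper, just for illustration):

static __always_inline __u32 get_snd_cwnd(const struct sock *sk)
{
	/* The local struct tcp_sock declares only a handful of members,
	 * but this access is relocated to the offset of snd_cwnd in the
	 * kernel's own struct tcp_sock.
	 */
	return tcp_sk(sk)->snd_cwnd;
}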