Re: [PATCH bpf-next 1/7] bpf: implement getsockopt and setsockopt hooks

Martin Lau <kafai@xxxxxx> · Wed, 5 Jun 2019 21:41:39 +0000



On Wed, Jun 05, 2019 at 02:16:30PM -0700, Stanislav Fomichev wrote:
> On 06/05, Martin Lau wrote:
> > On Wed, Jun 05, 2019 at 12:17:24PM -0700, Stanislav Fomichev wrote:
> > > On 06/05, Martin Lau wrote:
> > > > On Tue, Jun 04, 2019 at 02:35:18PM -0700, Stanislav Fomichev wrote:
> > > > > Implement new BPF_PROG_TYPE_CGROUP_SOCKOPT program type and
> > > > > BPF_CGROUP_{G,S}ETSOCKOPT cgroup hooks.
> > > > > 
> > > > > > BPF_CGROUP_SETSOCKOPT gets a read-only view of the setsockopt arguments.
> > > > > BPF_CGROUP_GETSOCKOPT can modify the supplied buffer.
> > > > > Both of them reuse existing PTR_TO_PACKET{,_END} infrastructure.
> > > > > 
> > > > > The buffer memory is pre-allocated (because I don't think there is
> > > > > a precedent for working with __user memory from bpf). This might be
> > > > > slow to do for each {s,g}etsockopt call, that's why I've added
> > > > > __cgroup_bpf_has_prog_array that exits early if there is nothing
> > > > > attached to a cgroup. Note, however, that there is a race between
> > > > > __cgroup_bpf_has_prog_array and BPF_PROG_RUN_ARRAY where cgroup
> > > > > program layout might have changed; this should not be a problem
> > > > > because in general there is a race between multiple calls to
> > > > > > {s,g}etsockopt and user adding/removing bpf progs from a cgroup.
> > > > > 
> > > > > By default, kernel code path is executed after the hook (to let
> > > > > > BPF handle only a subset of the options). There is a new
> > > > > bpf_sockopt_handled handler that returns control to the userspace
> > > > > instead (bypassing the kernel handling).
> > > > > 
> > > > > The return code is either 1 (success) or 0 (EPERM).
> > > > > 
> > > > > Signed-off-by: Stanislav Fomichev <sdf@xxxxxxxxxx>
> > > > > ---
> > > > >  include/linux/bpf-cgroup.h |  29 ++++
> > > > >  include/linux/bpf.h        |   2 +
> > > > >  include/linux/bpf_types.h  |   1 +
> > > > >  include/linux/filter.h     |  19 +++
> > > > >  include/uapi/linux/bpf.h   |  17 ++-
> > > > >  kernel/bpf/cgroup.c        | 288 +++++++++++++++++++++++++++++++++++++
> > > > >  kernel/bpf/syscall.c       |  19 +++
> > > > >  kernel/bpf/verifier.c      |  12 ++
> > > > >  net/core/filter.c          |   4 +-
> > > > >  net/socket.c               |  18 +++
> > > > >  10 files changed, 406 insertions(+), 3 deletions(-)
> > > > > 
> > > > > diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
> > > > > index b631ee75762d..406f1ba82531 100644
> > > > > --- a/include/linux/bpf-cgroup.h
> > > > > +++ b/include/linux/bpf-cgroup.h
> > > > > @@ -124,6 +124,13 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
> > > > >  				   loff_t *ppos, void **new_buf,
> > > > >  				   enum bpf_attach_type type);
> > > > >  
> > > > > +int __cgroup_bpf_run_filter_setsockopt(struct sock *sock, int level,
> > > > > +				       int optname, char __user *optval,
> > > > > +				       unsigned int optlen);
> > > > > +int __cgroup_bpf_run_filter_getsockopt(struct sock *sock, int level,
> > > > > +				       int optname, char __user *optval,
> > > > > +				       int __user *optlen);
> > > > > +
> > > > >  static inline enum bpf_cgroup_storage_type cgroup_storage_type(
> > > > >  	struct bpf_map *map)
> > > > >  {
> > > > > @@ -280,6 +287,26 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
> > > > >  	__ret;								       \
> > > > >  })
> > > > >  
> > > > > +#define BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock, level, optname, optval, optlen)   \
> > > > > +({									       \
> > > > > +	int __ret = 0;							       \
> > > > > +	if (cgroup_bpf_enabled)						       \
> > > > > +		__ret = __cgroup_bpf_run_filter_setsockopt(sock, level,	       \
> > > > > +							   optname, optval,    \
> > > > > +							   optlen);	       \
> > > > > +	__ret;								       \
> > > > > +})
> > > > > +
> > > > > +#define BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock, level, optname, optval, optlen)   \
> > > > > +({									       \
> > > > > +	int __ret = 0;							       \
> > > > > +	if (cgroup_bpf_enabled)						       \
> > > > > +		__ret = __cgroup_bpf_run_filter_getsockopt(sock, level,	       \
> > > > > +							   optname, optval,    \
> > > > > +							   optlen);	       \
> > > > > +	__ret;								       \
> > > > > +})
> > > > > +
> > > > >  int cgroup_bpf_prog_attach(const union bpf_attr *attr,
> > > > >  			   enum bpf_prog_type ptype, struct bpf_prog *prog);
> > > > >  int cgroup_bpf_prog_detach(const union bpf_attr *attr,
> > > > > @@ -349,6 +376,8 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map,
> > > > >  #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; })
> > > > >  #define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type,major,minor,access) ({ 0; })
> > > > >  #define BPF_CGROUP_RUN_PROG_SYSCTL(head,table,write,buf,count,pos,nbuf) ({ 0; })
> > > > > +#define BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock, level, optname, optval, optlen) ({ 0; })
> > > > > +#define BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock, level, optname, optval, optlen) ({ 0; })
> > > > >  
> > > > >  #define for_each_cgroup_storage_type(stype) for (; false; )
> > > > >  
> > > > > diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> > > > > index e5a309e6a400..fb4e6ef5a971 100644
> > > > > --- a/include/linux/bpf.h
> > > > > +++ b/include/linux/bpf.h
> > > > > @@ -1054,6 +1054,8 @@ extern const struct bpf_func_proto bpf_spin_unlock_proto;
> > > > >  extern const struct bpf_func_proto bpf_get_local_storage_proto;
> > > > >  extern const struct bpf_func_proto bpf_strtol_proto;
> > > > >  extern const struct bpf_func_proto bpf_strtoul_proto;
> > > > > +extern const struct bpf_func_proto bpf_sk_fullsock_proto;
> > > > > +extern const struct bpf_func_proto bpf_tcp_sock_proto;
> > > > >  
> > > > >  /* Shared helpers among cBPF and eBPF. */
> > > > >  void bpf_user_rnd_init_once(void);
> > > > > diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
> > > > > index 5a9975678d6f..eec5aeeeaf92 100644
> > > > > --- a/include/linux/bpf_types.h
> > > > > +++ b/include/linux/bpf_types.h
> > > > > @@ -30,6 +30,7 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE, raw_tracepoint_writable)
> > > > >  #ifdef CONFIG_CGROUP_BPF
> > > > >  BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_DEVICE, cg_dev)
> > > > >  BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SYSCTL, cg_sysctl)
> > > > > +BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCKOPT, cg_sockopt)
> > > > >  #endif
> > > > >  #ifdef CONFIG_BPF_LIRC_MODE2
> > > > >  BPF_PROG_TYPE(BPF_PROG_TYPE_LIRC_MODE2, lirc_mode2)
> > > > > diff --git a/include/linux/filter.h b/include/linux/filter.h
> > > > > index 43b45d6db36d..7a07fd2e14d3 100644
> > > > > --- a/include/linux/filter.h
> > > > > +++ b/include/linux/filter.h
> > > > > @@ -1199,4 +1199,23 @@ struct bpf_sysctl_kern {
> > > > >  	u64 tmp_reg;
> > > > >  };
> > > > >  
> > > > > +struct bpf_sockopt_kern {
> > > > > +	struct sock	*sk;
> > > > > +	s32		level;
> > > > > +	s32		optname;
> > > > > +	u32		optlen;
> > > > It seems there is a hole.
> > > Ack, will move the pointers up.
> > > 
> > > > > +	u8		*optval;
> > > > > +	u8		*optval_end;
> > > > > +
> > > > > +	/* If true, BPF program had consumed the sockopt request.
> > > > > +	 * Control is returned to the userspace (i.e. kernel doesn't
> > > > > +	 * handle this option).
> > > > > +	 */
> > > > > +	bool		handled;
> > > > > +
> > > > > +	/* Small on-stack optval buffer to avoid small allocations.
> > > > > +	 */
> > > > > +	u8 buf[64];
> > > > Is it better to align to 8 bytes?
> > > Do you mean manually set size to be 64 + x where x is a remainder
> > > to align to 8 bytes? Is there some macro to help with that maybe?
> > I think __attribute__((aligned(8))) should do.
> Ah, you meant to align the buffer itself to avoid unaligned
> access from the bpf progs. Got it, will do.
> 
> > > 
> > > > > +};
> > > > > +
> > > > >  #endif /* __LINUX_FILTER_H__ */
> > > > > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > > > > index 7c6aef253173..b6c3891241ef 100644
> > > > > --- a/include/uapi/linux/bpf.h
> > > > > +++ b/include/uapi/linux/bpf.h
> > > > > @@ -170,6 +170,7 @@ enum bpf_prog_type {
> > > > >  	BPF_PROG_TYPE_FLOW_DISSECTOR,
> > > > >  	BPF_PROG_TYPE_CGROUP_SYSCTL,
> > > > >  	BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE,
> > > > > +	BPF_PROG_TYPE_CGROUP_SOCKOPT,
> > > > >  };
> > > > >  
> > > > >  enum bpf_attach_type {
> > > > > @@ -192,6 +193,8 @@ enum bpf_attach_type {
> > > > >  	BPF_LIRC_MODE2,
> > > > >  	BPF_FLOW_DISSECTOR,
> > > > >  	BPF_CGROUP_SYSCTL,
> > > > > +	BPF_CGROUP_GETSOCKOPT,
> > > > > +	BPF_CGROUP_SETSOCKOPT,
> > > > >  	__MAX_BPF_ATTACH_TYPE
> > > > >  };
> > > > >  
> > > > > @@ -2815,7 +2818,8 @@ union bpf_attr {
> > > > >  	FN(strtoul),			\
> > > > >  	FN(sk_storage_get),		\
> > > > >  	FN(sk_storage_delete),		\
> > > > > -	FN(send_signal),
> > > > > +	FN(send_signal),		\
> > > > > +	FN(sockopt_handled),
> > > > Document.
> > > Ah, totally forgot about that, sure, will do!
> > > 
> > > > >  
> > > > >  /* integer value in 'imm' field of BPF_CALL instruction selects which helper
> > > > >   * function eBPF program intends to call
> > > > > @@ -3533,4 +3537,15 @@ struct bpf_sysctl {
> > > > >  				 */
> > > > >  };
> > > > >  
> > > > > +struct bpf_sockopt {
> > > > > +	__bpf_md_ptr(struct bpf_sock *, sk);
> > > > > +
> > > > > +	__s32	level;
> > > > > +	__s32	optname;
> > > > > +
> > > > > +	__u32	optlen;
> > > > > +	__u32	optval;
> > > > > +	__u32	optval_end;
> > > > > +};
> > > > > +
> > > > >  #endif /* _UAPI__LINUX_BPF_H__ */
> > > > > diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
> > > > > index 1b65ab0df457..4ec99ea97023 100644
> > > > > --- a/kernel/bpf/cgroup.c
> > > > > +++ b/kernel/bpf/cgroup.c
> > > > > @@ -18,6 +18,7 @@
> > > > >  #include <linux/bpf.h>
> > > > >  #include <linux/bpf-cgroup.h>
> > > > >  #include <net/sock.h>
> > > > > +#include <net/bpf_sk_storage.h>
> > > > >  
> > > > >  DEFINE_STATIC_KEY_FALSE(cgroup_bpf_enabled_key);
> > > > >  EXPORT_SYMBOL(cgroup_bpf_enabled_key);
> > > > > @@ -924,6 +925,142 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
> > > > >  }
> > > > >  EXPORT_SYMBOL(__cgroup_bpf_run_filter_sysctl);
> > > > >  
> > > > > +static bool __cgroup_bpf_has_prog_array(struct cgroup *cgrp,
> > > > > +					enum bpf_attach_type attach_type)
> > > > > +{
> > > > > +	struct bpf_prog_array *prog_array;
> > > > > +	int nr;
> > > > > +
> > > > > +	rcu_read_lock();
> > > > > +	prog_array = rcu_dereference(cgrp->bpf.effective[attach_type]);
> > > > > +	nr = bpf_prog_array_length(prog_array);
> > > > Nit. It seems unnecessary to loop through the whole
> > > > array if the only signal needed is non-zero.
> > > Oh, good point. I guess I'd have to add another helper like
> > > bpf_prog_array_is_empty() and return early. Any other suggestions?
> > I was thinking of checking empty_prog_array on top, but that
> > would be overkill, so I didn't mention it.  I think just returning
> > early is good enough.
> [..]
> > I think this non-zero check is good to have before doing lock_sock().
> And not before the allocation? I was trying to optimize for both kmalloc
> and lock_sock (since, I guess, the majority of the cgroups would not
> have any sockopt progs, so there is no point in paying the kmalloc
> cost as well).
+1