Quoting Oren Laadan (orenl@xxxxxxxxxxxxxxx): > A pipe is essentially a double-headed inode with a buffer attached to > it. We checkpoint the pipe buffer only once, as soon as we hit one > side of the pipe, regardless whether it is read- or write- end. > > To checkpoint a file descriptor that refers to a pipe (either end), we > first lookup the inode in the hash table: > > If not found, it is the first encounter of this pipe. Besides the file > descriptor, we also (a) save the pipe data, and (b) register the pipe > inode in the hash. We save the 'objref' of the inode 'in ->fd_objref' > of the file descriptor. The file descriptor type becomes CR_FD_PIPE. > > If found, it is the second encounter of this pipe, namely, as we hit > the other end of the same pipe. In this case we need only record the > reference ('objref') to the inode that we had saved before, and the > file descriptor type is changed to CR_FD_OBJREF. > > The type CR_FD_PIPE will indicate to the kernel to create a new pipe; > since both ends are created at the same time, one end will be used, > and the other end will be deposited in the hash table for later use. > The type CR_FD_OBJREF will indicate that the corresponding file > descriptor is already setup and registered in the hash using the > '->fd_objref' that it had been assigned. > > The format of the pipe data is as follows: > > struct cr_hdr_fd_pipe { > __u32 nr_bufs; > } > > cr_hdr + cr_hdr_fd_ent > cr_hdr + cr_hdr_fd_data > cr_hdr + cr_hdr_fd_pipe -> # buffers > cr_hdr + cr_hdr_buffer -> 1st buffer > cr_hdr + cr_hdr_buffer -> 2nd buffer > cr_hdr + cr_hdr_buffer -> 3rd buffer > ... > > Changelog[v14]: > - Use 'fd_type' instead of 'hh->fd_objref' in cr_write_fd_data() > - Revert change to pr_debug(), back to cr_debug() > - Discard the 'h.parent' field > - Check whether calls to cr_hbuf_get() fail > - Test that a pipe's inode != ctx->file's inode to prevent deadlock > > Signed-off-by: Oren Laadan <orenl@xxxxxxxxxxxxxxx> Acked-by: Serge Hallyn <serue@xxxxxxxxxx> But: > --- > checkpoint/ckpt_file.c | 2 + > fs/pipe.c | 113 ++++++++++++++++++++++++++++++++++++++++ > include/linux/checkpoint_hdr.h | 8 +++- > 3 files changed, 122 insertions(+), 1 deletions(-) > > diff --git a/checkpoint/ckpt_file.c b/checkpoint/ckpt_file.c > index 0fe68bf..dd26b3d 100644 > --- a/checkpoint/ckpt_file.c > +++ b/checkpoint/ckpt_file.c > @@ -12,6 +12,7 @@ > #include <linux/sched.h> > #include <linux/file.h> > #include <linux/fdtable.h> > +#include <linux/pipe_fs_i.h> > #include <linux/checkpoint.h> > #include <linux/checkpoint_hdr.h> > > @@ -72,6 +73,7 @@ int cr_scan_fds(struct files_struct *files, int **fdtable) > return n; > } > > + > static int cr_write_file_generic(struct cr_ctx *ctx, struct file *file, > struct cr_hdr_file *hh) > { > diff --git a/fs/pipe.c b/fs/pipe.c > index 14f502b..0c3f391 100644 > --- a/fs/pipe.c > +++ b/fs/pipe.c > @@ -22,6 +22,9 @@ > #include <asm/uaccess.h> > #include <asm/ioctls.h> > > +#include <linux/checkpoint.h> > +#include <linux/checkpoint_hdr.h> > + > /* > * We use a start+len construction, which provides full use of the > * allocated memory. > @@ -771,6 +774,113 @@ pipe_rdwr_open(struct inode *inode, struct file *filp) > return 0; > } > > +/* cr_write_pipebuf - dump contents of a pipe/fifo (assume i_mutex taken) */ > +static int cr_write_pipebuf(struct cr_ctx *ctx, struct pipe_inode_info *pipe) > +{ > + struct cr_hdr h; > + void *kbuf, *addr; > + int i, ret = 0; > + > + kbuf = (void *) __get_free_page(GFP_KERNEL); > + if (!kbuf) > + return -ENOMEM; > + > + /* this is a simplified fs/pipe.c:read_pipe() */ pipe_read() actually :) > + > + for (i = 0; i < pipe->nrbufs; i++) { > + int nn = (pipe->curbuf + i) & (PIPE_BUFFERS-1); > + struct pipe_buffer *pbuf = pipe->bufs + nn; > + const struct pipe_buf_operations *ops = pbuf->ops; > + > + ret = ops->confirm(pipe, pbuf); > + if (ret < 0) > + break; not that it seems to matter, but pipe_read() returns error also if ret > 0. > + > + addr = ops->map(pipe, pbuf, 1); > + memcpy(kbuf, addr + pbuf->offset, pbuf->len); > + ops->unmap(pipe, pbuf, addr); > + > + h.type = CR_HDR_BUFFER; > + h.len = pbuf->len; > + > + ret = cr_write_obj(ctx, &h, kbuf); > + if (ret < 0) > + break; > + } > + > + free_page((unsigned long) kbuf); > + return ret; > +} > + > +/* cr_write_pipe - dump pipe (assume i_mutex taken) */ > +static int cr_write_pipe(struct cr_ctx *ctx, struct inode *inode) > +{ > + struct cr_hdr h; > + struct cr_hdr_fd_pipe *hh; > + struct pipe_inode_info *pipe = inode->i_pipe; > + int ret; > + > + h.type = CR_HDR_FD_PIPE; > + h.len = sizeof(*hh); > + > + hh = cr_hbuf_get(ctx, sizeof(*hh)); > + if (!hh) > + return -ENOMEM; > + > + hh->nr_bufs = pipe->nrbufs; > + > + ret = cr_write_obj(ctx, &h, hh); > + cr_hbuf_put(ctx, sizeof(*hh)); > + if (ret < 0) > + return ret; > + > + return cr_write_pipebuf(ctx, pipe); > +} > + > +static int pipe_file_checkpoint(struct cr_ctx *ctx, > + struct file *file, struct cr_hdr_file *hh) > +{ > + struct cr_hdr h; > + struct inode *inode = file->f_dentry->d_inode; > + int new, objref; > + int ret; > + > + /* > + * We take the inode's mutex and later will call vfs_write(), > + * which also takes an inode's mutex. To avoid deadlock, make > + * sure that the two inodes are distinct. > + */ > + if (ctx->file->f_dentry->d_inode == inode) { > + pr_warning("c/r: writing to pipe that is checkpointed " > + "may result in a deadlock ... aborting\n"); > + return -EDEADLK; > + } > + > + h.type = CR_HDR_FILE; > + h.len = sizeof(*hh); > + > + new = cr_obj_add_ptr(ctx, inode, &objref, CR_OBJ_INODE, 0); > + cr_debug("objref %d inode %p new %d\n", objref, inode, new); > + if (new < 0) > + return new; > + > + hh->fd_type = (new ? CR_FD_PIPE : CR_FD_OBJREF); The git commit msg has a good explanation, but it's worth a comment in the code as well, that on first instance we call it CR_FD_PIPE and second time CR_FD_OBJREF. > + hh->fd_objref = objref; > + > + ret = cr_write_obj(ctx, &h, hh); > + if (ret < 0) > + return ret; > + > + if (new) { > + mutex_lock(&inode->i_mutex); > + ret = cr_write_pipe(ctx, inode); > + mutex_unlock(&inode->i_mutex); > + } > + > + return ret; > +} > + > + > /* > * The file_operations structs are not static because they > * are also used in linux/fs/fifo.c to do operations on FIFOs. > @@ -787,6 +897,7 @@ const struct file_operations read_pipefifo_fops = { > .open = pipe_read_open, > .release = pipe_read_release, > .fasync = pipe_read_fasync, > + .checkpoint = pipe_file_checkpoint, > }; > > const struct file_operations write_pipefifo_fops = { > @@ -799,6 +910,7 @@ const struct file_operations write_pipefifo_fops = { > .open = pipe_write_open, > .release = pipe_write_release, > .fasync = pipe_write_fasync, > + .checkpoint = pipe_file_checkpoint, > }; > > const struct file_operations rdwr_pipefifo_fops = { > @@ -812,6 +924,7 @@ const struct file_operations rdwr_pipefifo_fops = { > .open = pipe_rdwr_open, > .release = pipe_rdwr_release, > .fasync = pipe_rdwr_fasync, > + .checkpoint = pipe_file_checkpoint, > }; > > struct pipe_inode_info * alloc_pipe_info(struct inode *inode) > diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h > index 9ad845d..ce5d880 100644 > --- a/include/linux/checkpoint_hdr.h > +++ b/include/linux/checkpoint_hdr.h > @@ -57,6 +57,7 @@ enum { > CR_HDR_FD_TABLE = 301, > CR_HDR_FD_ENT, > CR_HDR_FILE, > + CR_HDR_FD_PIPE, > > CR_HDR_TAIL = 5001 > }; > @@ -152,7 +153,8 @@ struct cr_hdr_fd_ent { > /* fd types */ > enum fd_type { > CR_FD_OBJREF = 1, > - CR_FD_GENERIC > + CR_FD_GENERIC, > + CR_FD_PIPE, > }; > > struct cr_hdr_file { > @@ -165,4 +167,8 @@ struct cr_hdr_file { > __u64 f_version; > } __attribute__((aligned(8))); > > +struct cr_hdr_fd_pipe { > + __s32 nr_bufs; > +} __attribute__((aligned(8))); > + > #endif /* _CHECKPOINT_CKPT_HDR_H_ */ > -- > 1.5.4.3 _______________________________________________ Containers mailing list Containers@xxxxxxxxxxxxxxxxxxxxxxxxxx https://lists.linux-foundation.org/mailman/listinfo/containers