On Tue, Sep 9, 2008 at 4:42 PM, Oren Laadan <orenl@xxxxxxxxxxxxxxx> wrote: > Dump the files_struct of a task with 'struct cr_hdr_files', followed by > all open file descriptors. Since FDs can be shared, they are assigned an > objref and registered in the object hash. > > For each open FD there is a 'struct cr_hdr_fd_ent' with the FD, its objref > and its close-on-exec property. If the FD is to be saved (first time) > then this is followed by a 'struct cr_hdr_fd_data' with the FD state. > Then will come the next FD and so on. > > This patch only handles basic FDs - regular files, directories and also > symbolic links. > > Signed-off-by: Oren Laadan <orenl@xxxxxxxxxxxxxxx> > --- > checkpoint/Makefile | 2 +- > checkpoint/checkpoint.c | 4 + > checkpoint/ckpt_file.c | 221 ++++++++++++++++++++++++++++++++++++++++++++++ > checkpoint/ckpt_file.h | 17 ++++ > include/linux/ckpt.h | 7 +- > include/linux/ckpt_hdr.h | 34 +++++++- > 6 files changed, 280 insertions(+), 5 deletions(-) > create mode 100644 checkpoint/ckpt_file.c > create mode 100644 checkpoint/ckpt_file.h > > diff --git a/checkpoint/Makefile b/checkpoint/Makefile > index 9843fb9..7496695 100644 > --- a/checkpoint/Makefile > +++ b/checkpoint/Makefile > @@ -3,4 +3,4 @@ > # > > obj-$(CONFIG_CHECKPOINT_RESTART) += sys.o checkpoint.o restart.o objhash.o \ > - ckpt_mem.o rstr_mem.o > + ckpt_mem.o rstr_mem.o ckpt_file.o > diff --git a/checkpoint/checkpoint.c b/checkpoint/checkpoint.c > index 4dae775..aebbf22 100644 > --- a/checkpoint/checkpoint.c > +++ b/checkpoint/checkpoint.c > @@ -217,6 +217,10 @@ static int cr_write_task(struct cr_ctx *ctx, struct task_struct *t) > cr_debug("memory: ret %d\n", ret); > if (ret < 0) > goto out; > + ret = cr_write_files(ctx, t); > + cr_debug("files: ret %d\n", ret); > + if (ret < 0) > + goto out; > ret = cr_write_thread(ctx, t); > cr_debug("thread: ret %d\n", ret); > if (ret < 0) > diff --git a/checkpoint/ckpt_file.c b/checkpoint/ckpt_file.c > new file mode 100644 > index 
0000000..ca58b28 > --- /dev/null > +++ b/checkpoint/ckpt_file.c > @@ -0,0 +1,221 @@ > +/* > + * Checkpoint file descriptors > + * > + * Copyright (C) 2008 Oren Laadan > + * > + * This file is subject to the terms and conditions of the GNU General Public > + * License. See the file COPYING in the main directory of the Linux > + * distribution for more details. > + */ > + > +#include <linux/kernel.h> > +#include <linux/sched.h> > +#include <linux/file.h> > +#include <linux/fdtable.h> > +#include <linux/ckpt.h> > +#include <linux/ckpt_hdr.h> > + > +#include "ckpt_file.h" > + > +#define CR_DEFAULT_FDTABLE 256 /* an initial guess */ > + > +/** > + * cr_scan_fds - scan file table and construct array of open fds > + * @files: files_struct pointer > + * @fdtable: (output) array of open fds > + * @return: the number of open fds found > + * > + * Allocates the file descriptors array (*fdtable), caller should free > + */ > +int cr_scan_fds(struct files_struct *files, int **fdtable) > +{ > + struct fdtable *fdt; > + int *fdlist; > + int i, n, max; > + > + n = 0; > + max = CR_DEFAULT_FDTABLE; max is a read-only variable, so you don't need to declare a local variable; you can use the macro directly. > + fdlist = kmalloc(max * sizeof(*fdlist), GFP_KERNEL); > + if (!fdlist) > + return -ENOMEM; > + > + spin_lock(&files->file_lock); > + fdt = files_fdtable(files); > + for (i = 0; i < fdt->max_fds; i++) { > + if (!fcheck_files(files, i)) > + continue; > + if (n == max) { > + /* fcheck_files() is safe with drop/re-acquire > + * of the lock, as it tests: fd < max_fds */ > + spin_unlock(&files->file_lock); > + max *= 2; > + if (max < 0) { /* overflow ? 
*/ > + n = -EMFILE; > + goto out; > + } > + fdlist = krealloc(fdlist, max, GFP_KERNEL); > + if (!fdlist) { > + n = -ENOMEM; > + goto out; > + } > + spin_lock(&files->file_lock); > + } > + fdlist[n++] = i; > + } > + spin_unlock(&files->file_lock); > + > + *fdtable = fdlist; > + out: > + return n; > +} > + > +/* cr_write_fd_data - dump the state of a given file pointer */ > +static int cr_write_fd_data(struct cr_ctx *ctx, struct file *file, int parent) > +{ > + struct cr_hdr h; > + struct cr_hdr_fd_data *hh = cr_hbuf_get(ctx, sizeof(*hh)); > + struct dentry *dent = file->f_dentry; > + struct inode *inode = dent->d_inode; > + enum fd_type fd_type; > + int ret; > + > + h.type = CR_HDR_FD_DATA; > + h.len = sizeof(*hh); > + h.parent = parent; > + > + hh->f_flags = file->f_flags; > + hh->f_mode = file->f_mode; > + hh->f_pos = file->f_pos; > + hh->f_uid = file->f_uid; > + hh->f_gid = file->f_gid; > + hh->f_version = file->f_version; > + /* FIX: need also file->f_owner */ > + > + switch (inode->i_mode & S_IFMT) { > + case S_IFREG: > + fd_type = CR_FD_FILE; > + break; > + case S_IFDIR: > + fd_type = CR_FD_DIR; > + break; > + case S_IFLNK: > + fd_type = CR_FD_LINK; > + break; > + default: > + return -EBADF; > + } > + > + /* FIX: check if the file/dir/link is unlinked */ > + hh->fd_type = fd_type; > + > + ret = cr_write_obj(ctx, &h, hh); > + cr_hbuf_put(ctx, sizeof(*hh)); > + if (ret < 0) > + return ret; > + > + return cr_write_fname(ctx, &file->f_path, ctx->vfsroot); > +} > + > +/** > + * cr_write_fd_ent - dump the state of a given file descriptor > + * @ctx: checkpoint context > + * @files: files_struct pointer > + * @fd: file descriptor > + * > + * Save the state of the file descriptor; look up the actual file pointer > + * in the hash table, and if found save the matching objref, otherwise call > + * cr_write_fd_data to dump the file pointer too. 
> + */ > +static int > +cr_write_fd_ent(struct cr_ctx *ctx, struct files_struct *files, int fd) > +{ > + struct cr_hdr h; > + struct cr_hdr_fd_ent *hh = cr_hbuf_get(ctx, sizeof(*hh)); > + struct file *file = NULL; > + struct fdtable *fdt; > + int coe, objref, new, ret; > + > + rcu_read_lock(); > + fdt = files_fdtable(files); > + file = fcheck_files(files, fd); > + if (file) { > + coe = FD_ISSET(fd, fdt->close_on_exec); > + get_file(file); > + } > + rcu_read_unlock(); > + > + /* sanity check (although this shouldn't happen) */ > + if (!file) > + return -EBADF; > + > + new = cr_obj_add_ptr(ctx, (void *) file, &objref, CR_OBJ_FILE, 0); > + cr_debug("fd %d objref %d file %p c-o-e %d)\n", fd, objref, file, coe); > + > + if (new < 0) > + return new; > + > + h.type = CR_HDR_FD_ENT; > + h.len = sizeof(*hh); > + h.parent = 0; > + > + hh->objref = objref; > + hh->fd = fd; > + hh->close_on_exec = coe; > + > + ret = cr_write_obj(ctx, &h, hh); > + cr_hbuf_put(ctx, sizeof(*hh)); > + if (ret < 0) > + return ret; > + > + /* new==1 if-and-only-if file was newly added to hash */ > + if (new) > + ret = cr_write_fd_data(ctx, file, objref); > + > + fput(file); > + return ret; > +} > + > +int cr_write_files(struct cr_ctx *ctx, struct task_struct *t) > +{ > + struct cr_hdr h; > + struct cr_hdr_files *hh = cr_hbuf_get(ctx, sizeof(*hh)); > + struct files_struct *files; > + int *fdtable; > + int nfds, n, ret; > + > + h.type = CR_HDR_FILES; > + h.len = sizeof(*hh); > + h.parent = task_pid_vnr(t); > + > + files = get_files_struct(t); > + > + hh->objref = 0; /* will be meaningful with multiple processes */ > + > + nfds = cr_scan_fds(files, &fdtable); > + if (nfds < 0) { > + ret = nfds; > + goto out; > + } > + > + hh->nfds = nfds; > + > + ret = cr_write_obj(ctx, &h, hh); > + cr_hbuf_put(ctx, sizeof(*hh)); > + if (ret < 0) > + goto clean; > + > + cr_debug("nfds %d\n", nfds); > + for (n = 0; n < nfds; n++) { > + ret = cr_write_fd_ent(ctx, files, n); I think your intention is not 'n' but 
'fdtable[n]' as the argument. > + if (ret < 0) > + break; > + } > + > + clean: > + kfree(fdtable); > + out: > + put_files_struct(files); > + > + return ret; > +} > diff --git a/checkpoint/ckpt_file.h b/checkpoint/ckpt_file.h > new file mode 100644 > index 0000000..9dc3eba > --- /dev/null > +++ b/checkpoint/ckpt_file.h > @@ -0,0 +1,17 @@ > +#ifndef _CHECKPOINT_CKPT_FILE_H_ > +#define _CHECKPOINT_CKPT_FILE_H_ > +/* > + * Checkpoint file descriptors > + * > + * Copyright (C) 2008 Oren Laadan > + * > + * This file is subject to the terms and conditions of the GNU General Public > + * License. See the file COPYING in the main directory of the Linux > + * distribution for more details. > + */ > + > +#include <linux/fdtable.h> > + > +int cr_scan_fds(struct files_struct *files, int **fdtable); > + > +#endif /* _CHECKPOINT_CKPT_FILE_H_ */ > diff --git a/include/linux/ckpt.h b/include/linux/ckpt.h > index d73f79e..ad46baf 100644 > --- a/include/linux/ckpt.h > +++ b/include/linux/ckpt.h > @@ -13,7 +13,7 @@ > #include <linux/path.h> > #include <linux/fs.h> > > -#define CR_VERSION 1 > +#define CR_VERSION 2 > > struct cr_ctx { > pid_t pid; /* container identifier */ > @@ -80,11 +80,12 @@ int cr_read_string(struct cr_ctx *ctx, void *str, int len); > int cr_read_fname(struct cr_ctx *ctx, void *fname, int n); > struct file *cr_read_open_fname(struct cr_ctx *ctx, int flags, int mode); > > +int do_checkpoint(struct cr_ctx *ctx); > int cr_write_mm(struct cr_ctx *ctx, struct task_struct *t); > -int cr_read_mm(struct cr_ctx *ctx); > +int cr_write_files(struct cr_ctx *ctx, struct task_struct *t); > > -int do_checkpoint(struct cr_ctx *ctx); > int do_restart(struct cr_ctx *ctx); > +int cr_read_mm(struct cr_ctx *ctx); > > #define cr_debug(fmt, args...) 
\ > pr_debug("[CR:%s] " fmt, __func__, ## args) > diff --git a/include/linux/ckpt_hdr.h b/include/linux/ckpt_hdr.h > index f064cbb..f868dce 100644 > --- a/include/linux/ckpt_hdr.h > +++ b/include/linux/ckpt_hdr.h > @@ -17,7 +17,7 @@ > /* > * To maintain compatibility between 32-bit and 64-bit architecture flavors, > * keep data 64-bit aligned: use padding for structure members, and use > - * __attribute__ ((aligned (8))) for the entire structure. > + * __attribute__((aligned(8))) for the entire structure. > */ > > /* records: generic header */ > @@ -42,6 +42,10 @@ enum { > CR_HDR_VMA, > CR_HDR_MM_CONTEXT, > > + CR_HDR_FILES = 301, > + CR_HDR_FD_ENT, > + CR_HDR_FD_DATA, > + > CR_HDR_TAIL = 5001 > }; > > @@ -112,4 +116,32 @@ struct cr_hdr_vma { > > } __attribute__((aligned(8))); > > +struct cr_hdr_files { > + __u32 objref; /* identifier for shared objects */ > + __u32 nfds; > +} __attribute__((aligned(8))); > + > +struct cr_hdr_fd_ent { > + __u32 objref; /* identifier for shared objects */ > + __s32 fd; > + __u32 close_on_exec; > +} __attribute__((aligned(8))); > + > +/* fd types */ > +enum fd_type { > + CR_FD_FILE = 1, > + CR_FD_DIR, > + CR_FD_LINK > +}; > + > +struct cr_hdr_fd_data { > + __u16 fd_type; > + __u16 f_mode; > + __u32 f_flags; > + __u32 f_uid; > + __u32 f_gid; > + __u64 f_pos; > + __u64 f_version; > +} __attribute__((aligned(8))); > + > #endif /* _CHECKPOINT_CKPT_HDR_H_ */ > -- > 1.5.4.3 > > -- > To unsubscribe from this list: send the line "unsubscribe linux-kernel" in > the body of a message to majordomo@xxxxxxxxxxxxxxx > More majordomo info at http://vger.kernel.org/majordomo-info.html > Please read the FAQ at http://www.tux.org/lkml/ > -- Kind regards, MinChan Kim _______________________________________________ Containers mailing list Containers@xxxxxxxxxxxxxxxxxxxxxxxxxx https://lists.linux-foundation.org/mailman/listinfo/containers