A few minor comments. Oren Laadan [orenl@xxxxxxxxxxxxxxx] wrote: | From 6f0a1dc1db8fdac766b00f90e04e06a5827af459 Mon Sep 17 00:00:00 2001 | From: Oren Laadan <orenl@xxxxxxxxxxxxxxx> | Date: Mon, 30 Mar 2009 13:59:34 -0400 | Subject: [PATCH 09/29] Dump open file descriptors | | Dump the files_struct of a task with 'struct cr_hdr_files', followed by | all open file descriptors. Because the 'struct file' corresponding to an | FD can be shared, each they are assigned an objref and registered in the | object hash. A reference to the 'file *' is kept for as long as it lives | in the hash (the hash is only cleaned up at the end of the checkpoint). | | For each open FD there is a 'struct cr_hdr_fd_ent' with the FD, its | close-on-exec property, and the objref of the corresponding 'file *'. | If the FD is to be saved (first time) then this is followed by a | 'struct cr_hdr_fd_data' with the FD state. Then will come the next FD | and so on. | | Recall that it is assumed that all tasks possibly sharing the file table | are frozen. If this assumption breaks, then the behavior is *undefined*: | checkpoint may fail, or restart from the resulting image file will fail. | | This patch only handles basic FDs - regular files, directories. | | Changelog[v14]: | - Revert change to pr_debug(), back to cr_debug() | - Use only unsigned fields in checkpoint headers | - Rename: cr_write_files() => cr_write_fd_table() | - Rename: cr_write_fd_data() => cr_write_file() | - Discard field 'h->parent' | - Check whether calls to cr_hbuf_get() fail | - Use one CR_FD_GENERIC for both regular files and dirs | - Put code for generic file descriptors in a separate function | | Changelog[v12]: | - Replace obsolete cr_debug() with pr_debug() | | Changelog[v11]: | - Discard handling of opened symlinks (there is no such thing) | - cr_scan_fds() retries from scratch if hits size limits | | Changelog[v9]: | - Fix a couple of leaks in cr_write_files() | - Drop useless kfree from cr_scan_fds() | | Changelog[v8]: | - initialize 'coe' to workaround gcc false warning | | Changelog[v6]: | - Balance all calls to cr_hbuf_get() with matching cr_hbuf_put() | (even though it's not really needed) | | Signed-off-by: Oren Laadan <orenl@xxxxxxxxxxxxxxx> | Acked-by: Serge Hallyn <serue@xxxxxxxxxx> | Signed-off-by: Dave Hansen <dave@xxxxxxxxxxxxxxxxxx> | --- | arch/x86/include/asm/checkpoint_hdr.h | 2 +- | checkpoint/Makefile | 2 +- | checkpoint/checkpoint.c | 4 + | checkpoint/checkpoint_file.h | 17 +++ | checkpoint/ckpt_file.c | 247 +++++++++++++++++++++++++++++++++ | include/linux/checkpoint.h | 3 +- | include/linux/checkpoint_hdr.h | 30 ++++- | 7 files changed, 301 insertions(+), 4 deletions(-) | create mode 100644 checkpoint/checkpoint_file.h | create mode 100644 checkpoint/ckpt_file.c | | diff --git a/arch/x86/include/asm/checkpoint_hdr.h b/arch/x86/include/asm/checkpoint_hdr.h | index e9eb40c..1efdf24 100644 | --- a/arch/x86/include/asm/checkpoint_hdr.h | +++ b/arch/x86/include/asm/checkpoint_hdr.h | @@ -15,7 +15,7 @@ | /* | * To maintain compatibility between 32-bit and 64-bit architecture flavors, | * keep data 64-bit aligned: use padding for structure members, and use | - * __attribute__ ((aligned (8))) for the entire structure. | + * __attribute__((aligned (8))) for the entire structure. | * | * Quoting Arnd Bergmann: | * "This structure has an odd multiple of 32-bit members, which means | diff --git a/checkpoint/Makefile b/checkpoint/Makefile | index 8368a03..1d92ed2 100644 | --- a/checkpoint/Makefile | +++ b/checkpoint/Makefile | @@ -3,4 +3,4 @@ | # | | obj-$(CONFIG_CHECKPOINT) += sys.o checkpoint.o restart.o objhash.o \ | - ckpt_mem.o rstr_mem.o | + ckpt_mem.o rstr_mem.o ckpt_file.o | diff --git a/checkpoint/checkpoint.c b/checkpoint/checkpoint.c | index 422e1a3..d4e0007 100644 | --- a/checkpoint/checkpoint.c | +++ b/checkpoint/checkpoint.c | @@ -250,6 +250,10 @@ static int cr_write_task(struct cr_ctx *ctx, struct task_struct *t) | cr_debug("memory: ret %d\n", ret); | if (ret < 0) | goto out; | + ret = cr_write_fd_table(ctx, t); | + cr_debug("files: ret %d\n", ret); | + if (ret < 0) | + goto out; | ret = cr_write_thread(ctx, t); | cr_debug("thread: ret %d\n", ret); | if (ret < 0) | diff --git a/checkpoint/checkpoint_file.h b/checkpoint/checkpoint_file.h | new file mode 100644 | index 0000000..9dc3eba | --- /dev/null | +++ b/checkpoint/checkpoint_file.h | @@ -0,0 +1,17 @@ | +#ifndef _CHECKPOINT_CKPT_FILE_H_ | +#define _CHECKPOINT_CKPT_FILE_H_ | +/* | + * Checkpoint file descriptors | + * | + * Copyright (C) 2008 Oren Laadan | + * | + * This file is subject to the terms and conditions of the GNU General Public | + * License. See the file COPYING in the main directory of the Linux | + * distribution for more details. | + */ | + | +#include <linux/fdtable.h> | + | +int cr_scan_fds(struct files_struct *files, int **fdtable); | + | +#endif /* _CHECKPOINT_CKPT_FILE_H_ */ | diff --git a/checkpoint/ckpt_file.c b/checkpoint/ckpt_file.c | new file mode 100644 | index 0000000..9c344c7 | --- /dev/null | +++ b/checkpoint/ckpt_file.c | @@ -0,0 +1,247 @@ | +/* | + * Checkpoint file descriptors | + * | + * Copyright (C) 2008-2009 Oren Laadan | + * | + * This file is subject to the terms and conditions of the GNU General Public | + * License. See the file COPYING in the main directory of the Linux | + * distribution for more details. | + */ | + | +#include <linux/kernel.h> | +#include <linux/sched.h> | +#include <linux/file.h> | +#include <linux/fdtable.h> | +#include <linux/checkpoint.h> | +#include <linux/checkpoint_hdr.h> | + | +#include "checkpoint_file.h" | + | +#define CR_DEFAULT_FDTABLE 256 /* an initial guess */ | + | +/** | + * cr_scan_fds - scan file table and construct array of open fds | + * @files: files_struct pointer | + * @fdtable: (output) array of open fds | + * | + * Returns the number of open fds found, and also the file table | + * array via *fdtable. The caller should free the array. | + * | + * The caller must validate the file descriptors collected in the | + * array before using them, e.g. by using fcheck_files(), in case | + * the task's fdtable changes in the meantime. | + */ | +int cr_scan_fds(struct files_struct *files, int **fdtable) | +{ | + struct fdtable *fdt; | + int *fds = NULL; | + int i, n; | + int tot = CR_DEFAULT_FDTABLE; | + | + /* | + * We assume that all tasks possibly sharing the file table are | + * frozen (or we our a single process and we checkpoint ourselves). Nit: s/our/are/ | + * Therefore, we can safely proceed after krealloc() from where we | + * left off. Otherwise the file table may be modified by another | + * task after we scan it. The behavior is this case is undefined, | + * and either and either checkpoint or restart will likely fail. Nit: s/and either// | + */ | + retry: | + fds = krealloc(fds, tot * sizeof(*fds), GFP_KERNEL); | + if (!fds) | + return -ENOMEM; | + | + spin_lock(&files->file_lock); | + rcu_read_lock(); | + fdt = files_fdtable(files); | + for (n = 0, i = 0; i < fdt->max_fds; i++) { Hmm, if we want to start where we left-off before the krealloc, shouldn't we initialize 'i' and 'n' to 0 before the 'retry:' ? Or maybe I misunderstand what you mean by "where we left-off" comment above. | + if (!fcheck_files(files, i)) | + continue; | + if (n == tot) { | + spin_unlock(&files->file_lock); | + rcu_read_unlock(); | + tot *= 2; /* won't overflow: kmalloc will fail */ | + goto retry; | + } | + fds[n++] = i; | + } | + rcu_read_unlock(); | + spin_unlock(&files->file_lock); | + | + *fdtable = fds; | + return n; | +} | + | +static int cr_write_file_generic(struct cr_ctx *ctx, struct file *file, | + struct cr_hdr_file *hh) | +{ | + struct cr_hdr h; | + int ret; | + | + /* | + * FIX: check if the file/dir/link is unlinked | + * | + * Or, pass up somthing like in hh->flags to tell | + * the higher-level code that it needs to bring | + * along the file contents too. | + */ | + | + h.type = CR_HDR_FILE; | + h.len = sizeof(*hh); | + | + hh->fd_type = CR_FD_GENERIC; | + | + ret = cr_write_obj(ctx, &h, hh); | + if (ret < 0) | + return ret; | + | + return cr_write_fname(ctx, &file->f_path, &ctx->fs_mnt); | +} | + | +/* cr_write_file - dump the state of a given file pointer */ | +static int cr_write_file(struct cr_ctx *ctx, struct file *file) | +{ | + struct cr_hdr_file *hh; | + struct dentry *dent = file->f_dentry; | + struct inode *inode = dent->d_inode; | + int ret; | + | + hh = cr_hbuf_get(ctx, sizeof(*hh)); | + if (!hh) | + return -ENOMEM; | + | + hh->f_flags = file->f_flags; | + hh->f_mode = file->f_mode; | + hh->f_pos = file->f_pos; | + hh->f_version = file->f_version; | + /* FIX: need also file->uid, file->gid, file->f_owner, etc */ | + | + /* | + * FIXME: when we'll add support for unlinked files/dirs, we'll | + * need to distinguish between unlinked filed and unlinked dirs. | + */ | + switch (inode->i_mode & S_IFMT) { | + case S_IFREG: | + case S_IFDIR: | + ret = cr_write_file_generic(ctx, file, hh); | + break; | + default: | + ret = -EBADF; | + break; | + } | + cr_hbuf_put(ctx, sizeof(*hh)); | + | + return ret; | +} | + | +/** | + * cr_write_fd_ent - dump the state of a given file descriptor | + * @ctx: checkpoint context | + * @files: files_struct pointer | + * @fd: file descriptor | + * | + * Saves the state of the file descriptor; looks up the actual file | + * pointer in the hash table, and if found saves the matching objref, | + * otherwise calls cr_write_file to dump the file pointer too. | + */ | +static int | +cr_write_fd_ent(struct cr_ctx *ctx, struct files_struct *files, int fd) | +{ | + struct cr_hdr h; | + struct cr_hdr_fd_ent *hh; | + struct file *file; | + struct fdtable *fdt; | + int objref, new, ret; | + int coe = 0; /* avoid gcc warning */ | + | + rcu_read_lock(); | + fdt = files_fdtable(files); | + file = fcheck_files(files, fd); | + if (file) { | + coe = FD_ISSET(fd, fdt->close_on_exec); | + get_file(file); | + } | + rcu_read_unlock(); | + | + /* sanity check (although this shouldn't happen) */ | + if (!file) | + return -EBADF; | + | + /* adding 'file' to the hash will keep a reference to it */ | + new = cr_obj_add_ptr(ctx, file, &objref, CR_OBJ_FILE, 0); | + cr_debug("fd %d objref %d file %p c-o-e %d)\n", fd, objref, file, coe); | + | + if (new < 0) | + return new; | + | + h.type = CR_HDR_FD_ENT; | + h.len = sizeof(*hh); | + | + hh = cr_hbuf_get(ctx, sizeof(*hh)); | + if (!hh) { | + fput(file); | + return -ENOMEM; | + } | + | + hh->objref = objref; | + hh->fd = fd; | + hh->close_on_exec = coe; | + | + ret = cr_write_obj(ctx, &h, hh); | + if (ret < 0) | + goto out; | + | + /* new==1 if-and-only-if file was newly added to hash */ | + if (new) | + ret = cr_write_file(ctx, file); | + | +out: | + cr_hbuf_put(ctx, sizeof(*hh)); | + if (file) | + fput(file); | + return ret; | +} | + | +int cr_write_fd_table(struct cr_ctx *ctx, struct task_struct *t) | +{ | + struct cr_hdr h; | + struct cr_hdr_fd_table *hh; | + struct files_struct *files; | + int *fdtable = NULL; | + int nfds, n, ret; | + | + h.type = CR_HDR_FD_TABLE; | + h.len = sizeof(*hh); | + | + hh = cr_hbuf_get(ctx, sizeof(*hh)); | + if (!hh) | + return -ENOMEM; | + | + files = get_files_struct(t); | + | + nfds = cr_scan_fds(files, &fdtable); | + if (nfds < 0) { | + ret = nfds; | + goto out; | + } | + | + hh->objref = 0; /* will be meaningful with multiple processes */ | + hh->nfds = nfds; | + | + ret = cr_write_obj(ctx, &h, hh); | + cr_hbuf_put(ctx, sizeof(*hh)); | + if (ret < 0) | + goto out; | + | + cr_debug("nfds %d\n", nfds); | + for (n = 0; n < nfds; n++) { | + ret = cr_write_fd_ent(ctx, files, fdtable[n]); | + if (ret < 0) | + break; | + } | + | + out: | + kfree(fdtable); | + put_files_struct(files); | + return ret; | +} | diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h | index 88854a9..9489ea5 100644 | --- a/include/linux/checkpoint.h | +++ b/include/linux/checkpoint.h | @@ -13,7 +13,7 @@ | #include <linux/path.h> | #include <linux/fs.h> | | -#define CR_VERSION 1 | +#define CR_VERSION 2 | | struct cr_ctx { | int crid; /* unique checkpoint id */ | @@ -85,6 +85,7 @@ extern struct file *cr_read_open_fname(struct cr_ctx *ctx, | | extern int do_checkpoint(struct cr_ctx *ctx, pid_t pid); | extern int cr_write_mm(struct cr_ctx *ctx, struct task_struct *t); | +extern int cr_write_fd_table(struct cr_ctx *ctx, struct task_struct *t); | | extern int do_restart(struct cr_ctx *ctx, pid_t pid); | extern int cr_read_mm(struct cr_ctx *ctx); | diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h | index 2a06a2f..a6b6dce 100644 | --- a/include/linux/checkpoint_hdr.h | +++ b/include/linux/checkpoint_hdr.h | @@ -17,7 +17,7 @@ | /* | * To maintain compatibility between 32-bit and 64-bit architecture flavors, | * keep data 64-bit aligned: use padding for structure members, and use | - * __attribute__ ((aligned (8))) for the entire structure. | + * __attribute__((aligned (8))) for the entire structure. Nit: Fix in earlier patch ? Sukadev _______________________________________________ Containers mailing list Containers@xxxxxxxxxxxxxxxxxxxxxxxxxx https://lists.linux-foundation.org/mailman/listinfo/containers