From: Matt Helsley <matthltc@xxxxxxxxxx> Save/restore epoll items during checkpoint/restart respectively. Output the epoll header and items separately. Chunk the output much like the pid array gets chunked. This ensures that even sub-order 0 allocations will enable checkpoint of large epoll sets. A subsequent patch will do something similar for the restore path. On restart, we grab a piece of memory suitable to store a "chunk" of items for input. Read the input one chunk at a time and add epoll items for each item in the chunk. Changelog [v21]: - Put file_ops->checkpoint under CONFIG_CHECKPOINT Changelog [v19]: - [Oren Laadan] Fix broken compilation for no-c/r architectures Changelog [v19-rc1]: - [Oren Laadan] Return -EBUSY (not BUG_ON) if fd is gone on restart - [Oren Laadan] Fix the chunk size instead of auto-tune Changelog v5: Fix potential recursion during collect. Replace call to ckpt_obj_collect() with ckpt_collect_file(). [Oren] Fix checkpoint leak detection when there are more items than expected. Cleanup/simplify error write paths. (will complicate in a later patch) [Oren] Remove files_deferq bits. [Oren] Remove extra newline. [Oren] Remove aggregate check on number of watches added. [Oren] This is OK since these will be done individually anyway. Remove check for negative objrefs during restart. [Oren] Fixup comment regarding race that indicates checkpoint leaks. [Oren] s/ckpt_read_obj/ckpt_read_buf_type/ [Oren] Patch for lots of epoll items follows. Moved sys_close(epfd) right under fget(). [Oren] Use CKPT_HDR_BUFFER rather than custom ckpt_read/write_* This makes it more similar to the pid array code. [Oren] It also simplifies the error recovery paths. Tested polling a pipe and 50,000 UNIX sockets. Changelog v4: ckpt-v18 Use files_deferq as submitted by Dan Smith Cleanup to only report >= 1 items when debugging. Changelog v3: [unposted] Removed most of the TODOs -- the remainder will be removed by subsequent patches. 
Fixed missing ep_file_collect() [Serge] Rather than include checkpoint_hdr.h declare (but do not define) the two structs needed in eventpoll.h [Oren] Complain with ckpt_write_err() when we detect checkpoint obj leaks. [Oren] Remove redundant is_epoll_file() check in collect. [Oren] Move epfile_objref lookup to simplify error handling. [Oren] Simplify error handling with early return in ep_eventpoll_checkpoint(). [Oren] Cleaned up a comment. [Oren] Shorten CKPT_HDR_FILE_EPOLL_ITEMS (-FILE) [Oren] Renumbered to indicate that it follows the file table. Renamed the epoll struct in checkpoint_hdr.h [Oren] Also renamed substruct. Fixup return of empty ep_file_restore(). [Oren] Changed some error returns. [Oren] Changed some tests to BUG_ON(). [Oren] Factored out watch insert with epoll_ctl() into do_epoll_ctl(). [Cedric, Oren] Cc: Davide Libenzi <davidel@xxxxxxxxxxxxxxx> Cc: linux-fsdevel@xxxxxxxxxxxxxxx Signed-off-by: Matt Helsley <matthltc@xxxxxxxxxx> Acked-by: Oren Laadan <orenl@xxxxxxxxxxxxxxx> Acked-by: Serge Hallyn <serue@xxxxxxxxxx> --- fs/checkpoint.c | 7 + fs/eventpoll.c | 334 ++++++++++++++++++++++++++++++++++++---- include/linux/checkpoint_hdr.h | 18 ++ include/linux/eventpoll.h | 17 ++- 4 files changed, 347 insertions(+), 29 deletions(-) diff --git a/fs/checkpoint.c b/fs/checkpoint.c index 9b29a26..3bfa692 100644 --- a/fs/checkpoint.c +++ b/fs/checkpoint.c @@ -21,6 +21,7 @@ #include <linux/syscalls.h> #include <linux/deferqueue.h> #include <linux/checkpoint.h> +#include <linux/eventpoll.h> #include <net/sock.h> /************************************************************************** @@ -632,6 +633,12 @@ static struct restore_file_ops restore_file_ops[] = { .file_type = CKPT_FILE_TTY, .restore = tty_file_restore, }, + /* epoll */ + { + .file_name = "EPOLL", + .file_type = CKPT_FILE_EPOLL, + .restore = ep_file_restore, + }, }; static void *restore_file(struct ckpt_ctx *ctx) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index bd056a5..99920d2 100644 --- 
a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -39,6 +39,9 @@ #include <asm/mman.h> #include <asm/atomic.h> +#include <linux/checkpoint.h> +#include <linux/deferqueue.h> + /* * LOCKING: * There are three level of locking required by epoll : @@ -671,10 +674,19 @@ static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait) return pollflags != -1 ? pollflags : 0; } +#ifdef CONFIG_CHECKPOINT +static int ep_eventpoll_checkpoint(struct ckpt_ctx *ctx, struct file *file); +static int ep_file_collect(struct ckpt_ctx *ctx, struct file *file); +#endif + /* File callbacks that implement the eventpoll file behaviour */ static const struct file_operations eventpoll_fops = { .release = ep_eventpoll_release, - .poll = ep_eventpoll_poll + .poll = ep_eventpoll_poll, +#ifdef CONFIG_CHECKPOINT + .checkpoint = ep_eventpoll_checkpoint, + .collect = ep_file_collect, +#endif }; /* Fast test to see if the file is an evenpoll file */ @@ -1226,35 +1238,18 @@ SYSCALL_DEFINE1(epoll_create, int, size) * the eventpoll file that enables the insertion/removal/change of * file descriptors inside the interest set. 
*/ -SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, - struct epoll_event __user *, event) +int do_epoll_ctl(int op, int fd, + struct file *file, struct file *tfile, + struct epoll_event *epds) { int error; - struct file *file, *tfile; struct eventpoll *ep; struct epitem *epi; - struct epoll_event epds; - - error = -EFAULT; - if (ep_op_has_event(op) && - copy_from_user(&epds, event, sizeof(struct epoll_event))) - goto error_return; - - /* Get the "struct file *" for the eventpoll file */ - error = -EBADF; - file = fget(epfd); - if (!file) - goto error_return; - - /* Get the "struct file *" for the target file */ - tfile = fget(fd); - if (!tfile) - goto error_fput; /* The target file descriptor must support poll */ error = -EPERM; if (!tfile->f_op || !tfile->f_op->poll) - goto error_tgt_fput; + return error; /* * We have to check that the file structure underneath the file descriptor @@ -1263,7 +1258,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, */ error = -EINVAL; if (file == tfile || !is_file_epoll(file)) - goto error_tgt_fput; + return error; /* * At this point it is safe to assume that the "private_data" contains @@ -1284,8 +1279,8 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, switch (op) { case EPOLL_CTL_ADD: if (!epi) { - epds.events |= POLLERR | POLLHUP; - error = ep_insert(ep, &epds, tfile, fd); + epds->events |= POLLERR | POLLHUP; + error = ep_insert(ep, epds, tfile, fd); } else error = -EEXIST; break; @@ -1297,15 +1292,46 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, break; case EPOLL_CTL_MOD: if (epi) { - epds.events |= POLLERR | POLLHUP; - error = ep_modify(ep, epi, &epds); + epds->events |= POLLERR | POLLHUP; + error = ep_modify(ep, epi, epds); } else error = -ENOENT; break; } mutex_unlock(&ep->mtx); -error_tgt_fput: + return error; +} + +/* + * The following function implements the controller interface for + * the eventpoll file that enables the insertion/removal/change of + * file descriptors inside the 
interest set. + */ +SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, + struct epoll_event __user *, event) +{ + int error; + struct file *file, *tfile; + struct epoll_event epds; + + error = -EFAULT; + if (ep_op_has_event(op) && + copy_from_user(&epds, event, sizeof(struct epoll_event))) + goto error_return; + + /* Get the "struct file *" for the eventpoll file */ + error = -EBADF; + file = fget(epfd); + if (!file) + goto error_return; + + /* Get the "struct file *" for the target file */ + tfile = fget(fd); + if (!tfile) + goto error_fput; + + error = do_epoll_ctl(op, fd, file, tfile, &epds); fput(tfile); error_fput: fput(file); @@ -1413,6 +1439,258 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events, #endif /* HAVE_SET_RESTORE_SIGMASK */ +#ifdef CONFIG_CHECKPOINT +static int ep_file_collect(struct ckpt_ctx *ctx, struct file *file) +{ + struct rb_node *rbp; + struct eventpoll *ep; + int ret = 0; + + ep = file->private_data; + mutex_lock(&ep->mtx); + for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) { + struct epitem *epi; + + epi = rb_entry(rbp, struct epitem, rbn); + if (is_file_epoll(epi->ffd.file)) + continue; /* Don't recurse */ + ret = ckpt_collect_file(ctx, epi->ffd.file); + if (ret < 0) + break; + } + mutex_unlock(&ep->mtx); + return ret; +} + +struct epoll_deferq_entry { + struct ckpt_ctx *ctx; + struct file *epfile; +}; + +#define CKPT_EPOLL_CHUNK (8096 / (int) sizeof(struct ckpt_eventpoll_item)) + +static int ep_items_checkpoint(void *data) +{ + struct epoll_deferq_entry *dq_entry = data; + struct ckpt_ctx *ctx; + struct ckpt_hdr_eventpoll_items *h; + struct ckpt_eventpoll_item *items; + struct rb_node *rbp; + struct eventpoll *ep; + __s32 epfile_objref; + int num_items = 0, ret; + + ctx = dq_entry->ctx; + + epfile_objref = ckpt_obj_lookup(ctx, dq_entry->epfile, CKPT_OBJ_FILE); + BUG_ON(epfile_objref <= 0); + + ep = dq_entry->epfile->private_data; + mutex_lock(&ep->mtx); + for (rbp = rb_first(&ep->rbr); rbp; rbp = 
rb_next(rbp)) + num_items++; + mutex_unlock(&ep->mtx); + + h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_EPOLL_ITEMS); + if (!h) + return -ENOMEM; + h->num_items = num_items; + h->epfile_objref = epfile_objref; + ret = ckpt_write_obj(ctx, &h->h); + ckpt_hdr_put(ctx, h); + if (ret || !num_items) + return ret; + + ret = ckpt_write_obj_type(ctx, NULL, sizeof(*items)*num_items, + CKPT_HDR_BUFFER); + if (ret < 0) + return ret; + + items = kzalloc(sizeof(*items) * CKPT_EPOLL_CHUNK, GFP_KERNEL); + if (!items) + return -ENOMEM; + + /* + * Walk the rbtree copying items into the chunk of memory and then + * writing them to the checkpoint image + */ + ret = 0; + mutex_lock(&ep->mtx); + rbp = rb_first(&ep->rbr); + while ((num_items > 0) && rbp) { + int n = min(num_items, CKPT_EPOLL_CHUNK); + int j; + + for (j = 0; rbp && j < n; j++, rbp = rb_next(rbp)) { + struct epitem *epi; + int objref; + + epi = rb_entry(rbp, struct epitem, rbn); + items[j].fd = epi->ffd.fd; + items[j].events = epi->event.events; + items[j].data = epi->event.data; + objref = ckpt_obj_lookup(ctx, epi->ffd.file, + CKPT_OBJ_FILE); + if (objref <= 0) + goto unlock; + items[j].file_objref = objref; + } + ret = ckpt_kwrite(ctx, items, n*sizeof(*items)); + if (ret < 0) + break; + num_items -= n; + } +unlock: + mutex_unlock(&ep->mtx); + kfree(items); + if (num_items != 0 || (num_items == 0 && rbp)) + ret = -EBUSY; /* extra item(s) -- checkpoint obj leak */ + if (ret) + ckpt_err(ctx, ret, "Checkpointing epoll items.\n"); + return ret; +} + +static int ep_eventpoll_checkpoint(struct ckpt_ctx *ctx, struct file *file) +{ + struct ckpt_hdr_file *h; + struct epoll_deferq_entry dq_entry; + int ret = -ENOMEM; + + h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_FILE); + if (!h) + return -ENOMEM; + h->f_type = CKPT_FILE_EPOLL; + ret = checkpoint_file_common(ctx, file, h); + if (ret < 0) + goto out; + ret = ckpt_write_obj(ctx, &h->h); + if (ret < 0) + goto out; + + /* + * Defer saving the epoll items until all of the 
ffd.file pointers + * have an objref; after the file table has been checkpointed. + */ + dq_entry.ctx = ctx; + dq_entry.epfile = file; + ret = deferqueue_add(ctx->files_deferq, &dq_entry, + sizeof(dq_entry), ep_items_checkpoint, NULL); +out: + ckpt_hdr_put(ctx, h); + return ret; +} + +static int ep_items_restore(void *data) +{ + struct ckpt_ctx *ctx = deferqueue_data_ptr(data); + struct ckpt_hdr_eventpoll_items *h; + struct ckpt_eventpoll_item *items = NULL; + struct eventpoll *ep; + struct file *epfile = NULL; + int ret, num_items; + + h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_EPOLL_ITEMS); + if (IS_ERR(h)) + return PTR_ERR(h); + num_items = h->num_items; + epfile = ckpt_obj_fetch(ctx, h->epfile_objref, CKPT_OBJ_FILE); + ckpt_hdr_put(ctx, h); + + /* Make sure userspace didn't give us a ref to a non-epoll file. */ + if (IS_ERR(epfile)) + return PTR_ERR(epfile); + if (!is_file_epoll(epfile)) + return -EINVAL; + if (!num_items) + return 0; + + ret = _ckpt_read_obj_type(ctx, NULL, 0, CKPT_HDR_BUFFER); + if (ret < 0) + return ret; + /* Make sure the items match the size we expect */ + if (num_items != (ret / sizeof(*items))) + return -EINVAL; + + items = kzalloc(sizeof(*items) * CKPT_EPOLL_CHUNK, GFP_KERNEL); + if (!items) + return -ENOMEM; + + ep = epfile->private_data; + + while (num_items > 0) { + int n = min(num_items, CKPT_EPOLL_CHUNK); + int j; + + ret = ckpt_kread(ctx, items, n*sizeof(*items)); + if (ret < 0) + break; + + /* Restore the epoll items/watches */ + for (j = 0; !ret && j < n; j++) { + struct epoll_event epev; + struct file *tfile; + + tfile = ckpt_obj_fetch(ctx, items[j].file_objref, + CKPT_OBJ_FILE); + if (IS_ERR(tfile)) { + ret = PTR_ERR(tfile); + goto out; + } + epev.events = items[j].events; + epev.data = items[j].data; + ret = do_epoll_ctl(EPOLL_CTL_ADD, items[j].fd, + epfile, tfile, &epev); + } + num_items -= n; + } +out: + kfree(items); + return ret; +} + +struct file *ep_file_restore(struct ckpt_ctx *ctx, + struct ckpt_hdr_file *h) +{ 
+ struct file *epfile; + int epfd, ret; + + if (h->h.type != CKPT_HDR_FILE || + h->h.len != sizeof(*h) || + h->f_type != CKPT_FILE_EPOLL) + return ERR_PTR(-EINVAL); + + epfd = sys_epoll_create1(h->f_flags & EPOLL_CLOEXEC); + if (epfd < 0) + return ERR_PTR(epfd); + epfile = fget(epfd); + sys_close(epfd); /* harmless even if an error occured */ + if (!epfile) /* can happen with a malicious user */ + return ERR_PTR(-EBUSY); + + /* + * Needed before we can properly restore the watches and enforce the + * limit on watch numbers. + */ + ret = restore_file_common(ctx, epfile, h); + if (ret < 0) + goto fput_out; + + /* + * Defer restoring the epoll items until the file table is + * fully restored. Ensures that valid file objrefs will resolve. + */ + ret = deferqueue_add_ptr(ctx->files_deferq, ctx, + ep_items_restore, NULL); + if (ret < 0) { +fput_out: + fput(epfile); + epfile = ERR_PTR(ret); + } + return epfile; +} + +#endif /* CONFIG_CHECKPOINT */ + static int __init eventpoll_init(void) { struct sysinfo si; diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h index 79e8e2d..21540d7 100644 --- a/include/linux/checkpoint_hdr.h +++ b/include/linux/checkpoint_hdr.h @@ -127,6 +127,8 @@ enum { #define CKPT_HDR_TTY CKPT_HDR_TTY CKPT_HDR_TTY_LDISC, #define CKPT_HDR_TTY_LDISC CKPT_HDR_TTY_LDISC + CKPT_HDR_EPOLL_ITEMS, /* must be after file-table */ +#define CKPT_HDR_EPOLL_ITEMS CKPT_HDR_EPOLL_ITEMS CKPT_HDR_MM = 401, #define CKPT_HDR_MM CKPT_HDR_MM @@ -485,6 +487,8 @@ enum file_type { #define CKPT_FILE_SOCKET CKPT_FILE_SOCKET CKPT_FILE_TTY, #define CKPT_FILE_TTY CKPT_FILE_TTY + CKPT_FILE_EPOLL, +#define CKPT_FILE_EPOLL CKPT_FILE_EPOLL CKPT_FILE_MAX #define CKPT_FILE_MAX CKPT_FILE_MAX }; @@ -701,6 +705,20 @@ struct ckpt_hdr_file_socket { __s32 sock_objref; } __attribute__((aligned(8))); +struct ckpt_hdr_eventpoll_items { + struct ckpt_hdr h; + __s32 epfile_objref; + __u32 num_items; +} __attribute__((aligned(8))); + +/* Contained in a CKPT_HDR_BUFFER 
following the ckpt_hdr_eventpoll_items */ +struct ckpt_eventpoll_item { + __u64 data; + __u32 fd; + __s32 file_objref; + __u32 events; +} __attribute__((aligned(8))); + /* memory layout */ struct ckpt_hdr_mm { struct ckpt_hdr h; diff --git a/include/linux/eventpoll.h b/include/linux/eventpoll.h index f6856a5..52282ae 100644 --- a/include/linux/eventpoll.h +++ b/include/linux/eventpoll.h @@ -56,6 +56,9 @@ struct file; #ifdef CONFIG_EPOLL +struct ckpt_ctx; +struct ckpt_hdr_file; + /* Used to initialize the epoll bits inside the "struct file" */ static inline void eventpoll_init_file(struct file *file) @@ -95,11 +98,23 @@ static inline void eventpoll_release(struct file *file) eventpoll_release_file(file); } -#else +#ifdef CONFIG_CHECKPOINT +extern struct file *ep_file_restore(struct ckpt_ctx *ctx, + struct ckpt_hdr_file *h); +#endif +#else +/* !defined(CONFIG_EPOLL) */ static inline void eventpoll_init_file(struct file *file) {} static inline void eventpoll_release(struct file *file) {} +#ifdef CONFIG_CHECKPOINT +static inline struct file *ep_file_restore(struct ckpt_ctx *ctx, + struct ckpt_hdr_file *ptr) +{ + return ERR_PTR(-ENOSYS); +} +#endif #endif #endif /* #ifdef __KERNEL__ */ -- 1.6.3.3 -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html