Save/restore epoll items during checkpoint/restart respectively. kmalloc failures should be dealt with more kindly than just erroring out, because epoll is made to poll many thousands of file descriptors. Subsequent patches will change epoll c/r to "chunk" its output/input respectively. Signed-off-by: Matt Helsley <matthltc@xxxxxxxxxx> Changelog: v5: Fix potential recursion during collect. Replace call to ckpt_obj_collect() with ckpt_collect_file(). [Oren] Fix checkpoint leak detection when there are more items than expected. Cleanup/simplify error write paths. (will complicate in a later patch) [Oren] Remove files_deferq bits. [Oren] Remove extra newline. [Oren] Remove aggregate check on number of watches added. [Oren] This is OK since these will be done individually anyway. Remove check for negative objrefs during restart. [Oren] Fixup comment regarding race that indicates checkpoint leaks. [Oren] s/ckpt_read_obj/ckpt_read_buf_type/ [Oren] Patch for lots of epoll items follows. Moved sys_close(epfd) right under fget(). [Oren] Use CKPT_HDR_BUFFER rather than custom ckpt_read/write_*. This makes it more similar to the pid array code. [Oren] It also simplifies the error recovery paths. Tested polling a pipe and 50,000 UNIX sockets. v4: ckpt-v18 Use files_deferq as submitted by Dan Smith Cleanup to only report >= 1 items when debugging. v3: [unposted] Removed most of the TODOs -- the remainder will be removed by subsequent patches. Fixed missing ep_file_collect() [Serge] Rather than include checkpoint_hdr.h declare (but do not define) the two structs needed in eventpoll.h [Oren] Complain with ckpt_write_err() when we detect checkpoint obj leaks. [Oren] Remove redundant is_epoll_file() check in collect. [Oren] Move epfile_objref lookup to simplify error handling. [Oren] Simplify error handling with early return in ep_eventpoll_checkpoint(). [Oren] Cleaned up a comment. 
[Oren] Shorten CKPT_HDR_FILE_EPOLL_ITEMS (-FILE) [Oren] Renumbered to indicate that it follows the file table. Renamed the epoll struct in checkpoint_hdr.h [Oren] Also renamed substruct. Fixup return of empty ep_file_restore(). [Oren] Changed some error returns. [Oren] Changed some tests to BUG_ON(). [Oren] Factored out watch insert with epoll_ctl() into do_epoll_ctl(). [Cedric, Oren] Signed-off-by: Matt Helsley <matthltc@xxxxxxxxxx> --- checkpoint/files.c | 8 + fs/eventpoll.c | 308 ++++++++++++++++++++++++++++++++++++---- include/linux/checkpoint_hdr.h | 18 +++ include/linux/eventpoll.h | 17 ++- 4 files changed, 322 insertions(+), 29 deletions(-) diff --git a/checkpoint/files.c b/checkpoint/files.c index f6de07e..6ea2389 100644 --- a/checkpoint/files.c +++ b/checkpoint/files.c @@ -22,6 +22,7 @@ #include <linux/deferqueue.h> #include <linux/checkpoint.h> #include <linux/checkpoint_hdr.h> +#include <linux/eventpoll.h> #include <net/sock.h> @@ -607,6 +608,13 @@ static struct restore_file_ops restore_file_ops[] = { .file_type = CKPT_FILE_TTY, .restore = tty_file_restore, }, +#ifdef CONFIG_EPOLL + { + .file_name = "EPOLL", + .file_type = CKPT_FILE_EPOLL, + .restore = ep_file_restore, + }, +#endif }; static struct file *do_restore_file(struct ckpt_ctx *ctx) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 085c5c0..4706ec5 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -39,6 +39,12 @@ #include <asm/mman.h> #include <asm/atomic.h> +#ifdef CONFIG_CHECKPOINT +#include <linux/checkpoint.h> +#include <linux/checkpoint_hdr.h> +#include <linux/deferqueue.h> +#endif + /* * LOCKING: * There are three level of locking required by epoll : @@ -671,10 +677,20 @@ static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait) return pollflags != -1 ? 
pollflags : 0; } +#ifdef CONFIG_CHECKPOINT +static int ep_eventpoll_checkpoint(struct ckpt_ctx *ctx, struct file *file); +static int ep_file_collect(struct ckpt_ctx *ctx, struct file *file); +#else +#define ep_eventpoll_checkpoint NULL +#define ep_file_collect NULL +#endif + /* File callbacks that implement the eventpoll file behaviour */ static const struct file_operations eventpoll_fops = { .release = ep_eventpoll_release, - .poll = ep_eventpoll_poll + .poll = ep_eventpoll_poll, + .checkpoint = ep_eventpoll_checkpoint, + .collect = ep_file_collect, }; /* Fast test to see if the file is an evenpoll file */ @@ -1226,35 +1242,18 @@ SYSCALL_DEFINE1(epoll_create, int, size) * the eventpoll file that enables the insertion/removal/change of * file descriptors inside the interest set. */ -SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, - struct epoll_event __user *, event) +int do_epoll_ctl(int op, int fd, + struct file *file, struct file *tfile, + struct epoll_event *epds) { int error; - struct file *file, *tfile; struct eventpoll *ep; struct epitem *epi; - struct epoll_event epds; - - error = -EFAULT; - if (ep_op_has_event(op) && - copy_from_user(&epds, event, sizeof(struct epoll_event))) - goto error_return; - - /* Get the "struct file *" for the eventpoll file */ - error = -EBADF; - file = fget(epfd); - if (!file) - goto error_return; - - /* Get the "struct file *" for the target file */ - tfile = fget(fd); - if (!tfile) - goto error_fput; /* The target file descriptor must support poll */ error = -EPERM; if (!tfile->f_op || !tfile->f_op->poll) - goto error_tgt_fput; + return error; /* * We have to check that the file structure underneath the file descriptor @@ -1263,7 +1262,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, */ error = -EINVAL; if (file == tfile || !is_file_epoll(file)) - goto error_tgt_fput; + return error; /* * At this point it is safe to assume that the "private_data" contains @@ -1284,8 +1283,8 @@ SYSCALL_DEFINE4(epoll_ctl, int, 
epfd, int, op, int, fd, switch (op) { case EPOLL_CTL_ADD: if (!epi) { - epds.events |= POLLERR | POLLHUP; - error = ep_insert(ep, &epds, tfile, fd); + epds->events |= POLLERR | POLLHUP; + error = ep_insert(ep, epds, tfile, fd); } else error = -EEXIST; break; @@ -1297,15 +1296,46 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, break; case EPOLL_CTL_MOD: if (epi) { - epds.events |= POLLERR | POLLHUP; - error = ep_modify(ep, epi, &epds); + epds->events |= POLLERR | POLLHUP; + error = ep_modify(ep, epi, epds); } else error = -ENOENT; break; } mutex_unlock(&ep->mtx); -error_tgt_fput: + return error; +} + +/* + * The following function implements the controller interface for + * the eventpoll file that enables the insertion/removal/change of + * file descriptors inside the interest set. + */ +SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, + struct epoll_event __user *, event) +{ + int error; + struct file *file, *tfile; + struct epoll_event epds; + + error = -EFAULT; + if (ep_op_has_event(op) && + copy_from_user(&epds, event, sizeof(struct epoll_event))) + goto error_return; + + /* Get the "struct file *" for the eventpoll file */ + error = -EBADF; + file = fget(epfd); + if (!file) + goto error_return; + + /* Get the "struct file *" for the target file */ + tfile = fget(fd); + if (!tfile) + goto error_fput; + + error = do_epoll_ctl(op, fd, file, tfile, &epds); fput(tfile); error_fput: fput(file); @@ -1413,6 +1443,228 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events, #endif /* HAVE_SET_RESTORE_SIGMASK */ +#ifdef CONFIG_CHECKPOINT +static int ep_file_collect(struct ckpt_ctx *ctx, struct file *file) +{ + struct rb_node *rbp; + struct eventpoll *ep; + int ret = 0; + + ep = file->private_data; + mutex_lock(&ep->mtx); + for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) { + struct epitem *epi; + + epi = rb_entry(rbp, struct epitem, rbn); + if (is_file_epoll(epi->ffd.file)) + continue; /* Don't recurse */ + ret = 
ckpt_collect_file(ctx, epi->ffd.file); + if (ret < 0) + break; + } + mutex_unlock(&ep->mtx); + return ret; +} + +struct epoll_deferq_entry { + struct ckpt_ctx *ctx; + struct file *epfile; +}; + +static int ep_items_checkpoint(void *data) +{ + struct epoll_deferq_entry *dq_entry = data; + struct ckpt_ctx *ctx; + struct ckpt_hdr_eventpoll_items *h; + struct ckpt_eventpoll_item *items; + struct rb_node *rbp; + struct eventpoll *ep; + __s32 epfile_objref; + int i, num_items, ret; + + ctx = dq_entry->ctx; + + epfile_objref = ckpt_obj_lookup(ctx, dq_entry->epfile, CKPT_OBJ_FILE); + BUG_ON(epfile_objref <= 0); + + ep = dq_entry->epfile->private_data; + mutex_lock(&ep->mtx); + for (i = 0, rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp), i++) {} + mutex_unlock(&ep->mtx); + num_items = i; + + h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_EPOLL_ITEMS); + if (!h) + return -ENOMEM; + h->num_items = num_items; + h->epfile_objref = epfile_objref; + ret = ckpt_write_obj(ctx, &h->h); + ckpt_hdr_put(ctx, h); + if (ret || !num_items) + return ret; + + items = kzalloc(sizeof(*items)*num_items, GFP_KERNEL); + if (!items) + return -ENOMEM; + ret = 0; + i = 0; + mutex_lock(&ep->mtx); + for (rbp = rb_first(&ep->rbr); i < num_items && rbp; rbp = rb_next(rbp), + i++) { + struct epitem *epi; + int objref; + + epi = rb_entry(rbp, struct epitem, rbn); + items[i].fd = epi->ffd.fd; + items[i].events = epi->event.events; + items[i].data = epi->event.data; + objref = ckpt_obj_lookup(ctx, epi->ffd.file, CKPT_OBJ_FILE); + if (objref <= 0) { + ret = -EBUSY; /* missing item -- checkpoint obj leak */ + break; + } + items[i].file_objref = objref; + } + mutex_unlock(&ep->mtx); + if (i == num_items && rbp) + ret = -EBUSY; /* extra item(s) -- checkpoint obj leak */ + if (!ret) + ret = ckpt_write_buffer(ctx, items, sizeof(*items)*num_items); + else + ckpt_write_err(ctx, "E", "checkpoint leak detected.\n", ret); + kfree(items); + return ret; +} + +static int ep_eventpoll_checkpoint(struct ckpt_ctx *ctx, 
struct file *file) +{ + struct ckpt_hdr_file *h; + struct epoll_deferq_entry dq_entry; + int ret = -ENOMEM; + + h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_FILE); + if (!h) + return -ENOMEM; + h->f_type = CKPT_FILE_EPOLL; + ret = checkpoint_file_common(ctx, file, h); + if (ret < 0) + goto out; + ret = ckpt_write_obj(ctx, &h->h); + if (ret < 0) + goto out; + + /* + * Defer saving the epoll items until all of the ffd.file pointers + * have an objref; after the file table has been checkpointed. + */ + dq_entry.ctx = ctx; + dq_entry.epfile = file; + ret = deferqueue_add(ctx->files_deferq, &dq_entry, + sizeof(dq_entry), ep_items_checkpoint, NULL); +out: + ckpt_hdr_put(ctx, h); + return ret; +} + +static int ep_items_restore(void *data) +{ + struct ckpt_ctx *ctx = deferqueue_data_ptr(data); + struct ckpt_hdr_eventpoll_items *h; + struct ckpt_eventpoll_item *items = NULL; + struct eventpoll *ep; + struct file *epfile = NULL; + int ret, num_items, i = 0; + + h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_EPOLL_ITEMS); + if (IS_ERR(h)) + return PTR_ERR(h); + num_items = h->num_items; + epfile = ckpt_obj_fetch(ctx, h->epfile_objref, CKPT_OBJ_FILE); + ckpt_hdr_put(ctx, h); + + /* Make sure userspace didn't give us a ref to a non-epoll file. 
*/ + if (IS_ERR(epfile)) + return PTR_ERR(epfile); + if (!is_file_epoll(epfile)) + return -EINVAL; + if (!num_items) + return 0; + + ret = ckpt_read_payload(ctx, (void**)&items, num_items*sizeof(*items), + CKPT_HDR_BUFFER); + if (!items) + return -ENOMEM; + + /* Make sure the items match the size we expect */ + if (num_items != (ret / sizeof(*items))) { + ret = -EINVAL; + goto out; + } + + ep = epfile->private_data; + + /* Restore the epoll items/watches */ + for (ret = 0, i = 0; !ret && i < num_items; i++) { + struct epoll_event epev; + struct file *tfile; + + tfile = ckpt_obj_fetch(ctx, items[i].file_objref, + CKPT_OBJ_FILE); + if (IS_ERR(tfile)) { + ret = PTR_ERR(tfile); + break; + } + epev.events = items[i].events; + epev.data = items[i].data; + ret = do_epoll_ctl(EPOLL_CTL_ADD, items[i].fd, + epfile, tfile, &epev); + } +out: + kfree(items); + return ret; +} + +struct file* ep_file_restore(struct ckpt_ctx *ctx, + struct ckpt_hdr_file *h) +{ + struct file *epfile; + int epfd, ret; + + if (h->h.type != CKPT_HDR_FILE || + h->h.len != sizeof(*h) || + h->f_type != CKPT_FILE_EPOLL) + return ERR_PTR(-EINVAL); + + epfd = sys_epoll_create1(h->f_flags & EPOLL_CLOEXEC); + if (epfd < 0) + return ERR_PTR(epfd); + epfile = fget(epfd); + sys_close(epfd); /* harmless even if an error occured */ + BUG_ON(!epfile); + + /* + * Needed before we can properly restore the watches and enforce the + * limit on watch numbers. + */ + ret = restore_file_common(ctx, epfile, h); + if (ret < 0) + goto fput_out; + + /* + * Defer restoring the epoll items until the file table is + * fully restored. Ensures that valid file objrefs will resolve. 
+ */ + ret = deferqueue_add_ptr(ctx->files_deferq, ctx, ep_items_restore, NULL); + if (ret < 0) { +fput_out: + fput(epfile); + epfile = ERR_PTR(ret); + } + return epfile; +} + +#endif /* CONFIG_CHECKPOINT */ + static int __init eventpoll_init(void) { struct sysinfo si; diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h index ca2500d..1a3edab 100644 --- a/include/linux/checkpoint_hdr.h +++ b/include/linux/checkpoint_hdr.h @@ -119,6 +119,8 @@ enum { #define CKPT_HDR_TTY CKPT_HDR_TTY CKPT_HDR_TTY_LDISC, #define CKPT_HDR_TTY_LDISC CKPT_HDR_TTY_LDISC + CKPT_HDR_EPOLL_ITEMS = 391, /* Follows file-table */ +#define CKPT_HDR_EPOLL_ITEMS CKPT_HDR_EPOLL_ITEMS CKPT_HDR_MM = 401, #define CKPT_HDR_MM CKPT_HDR_MM @@ -469,6 +471,8 @@ enum file_type { #define CKPT_FILE_SOCKET CKPT_FILE_SOCKET CKPT_FILE_TTY, #define CKPT_FILE_TTY CKPT_FILE_TTY + CKPT_FILE_EPOLL, +#define CKPT_FILE_EPOLL CKPT_FILE_EPOLL CKPT_FILE_MAX #define CKPT_FILE_MAX CKPT_FILE_MAX }; @@ -573,6 +577,20 @@ struct ckpt_hdr_file_socket { __s32 sock_objref; } __attribute__((aligned(8))); +struct ckpt_hdr_eventpoll_items { + struct ckpt_hdr h; + __s32 epfile_objref; + __u32 num_items; +} __attribute__((aligned(8))); + +/* Contained in a CKPT_HDR_BUFFER following the ckpt_hdr_eventpoll_items */ +struct ckpt_eventpoll_item { + __u64 data; + __u32 fd; + __s32 file_objref; + __u32 events; +} __attribute__((aligned(8))); + /* memory layout */ struct ckpt_hdr_mm { struct ckpt_hdr h; diff --git a/include/linux/eventpoll.h b/include/linux/eventpoll.h index f6856a5..34538be 100644 --- a/include/linux/eventpoll.h +++ b/include/linux/eventpoll.h @@ -56,6 +56,9 @@ struct file; #ifdef CONFIG_EPOLL +struct ckpt_ctx; +struct ckpt_hdr_file; + /* Used to initialize the epoll bits inside the "struct file" */ static inline void eventpoll_init_file(struct file *file) @@ -95,11 +98,23 @@ static inline void eventpoll_release(struct file *file) eventpoll_release_file(file); } -#else +#ifdef CONFIG_CHECKPOINT 
+extern struct file* ep_file_restore(struct ckpt_ctx *ctx, + struct ckpt_hdr_file *h); +#endif +#else +/* !defined(CONFIG_EPOLL) */ static inline void eventpoll_init_file(struct file *file) {} static inline void eventpoll_release(struct file *file) {} +#ifdef CONFIG_CHECKPOINT +static inline struct file* ep_file_restore(struct ckpt_ctx *ctx, + struct ckpt_hdr_file *ptr) +{ + return ERR_PTR(-ENOSYS); +} +#endif #endif #endif /* #ifdef __KERNEL__ */ -- 1.5.6.3 _______________________________________________ Containers mailing list Containers@xxxxxxxxxxxxxxxxxxxxxxxxxx https://lists.linux-foundation.org/mailman/listinfo/containers