Quoting Matt Helsley (matthltc@xxxxxxxxxx):
> Currently we allocate memory to output all of the epoll items in one
> big chunk. At 20 bytes per item, and since epoll was designed to
> support on the order of 10,000 items, we may find ourselves kmalloc'ing
> 200,000 bytes. That's an order 7 allocation whereas the heuristic for
> difficult allocations, PAGE_ALLOC_COST_ORDER, is 3.
>
> Instead, output the epoll header and items separately. Chunk the output
> much like the pid array gets chunked. This ensures that even sub-order 0
> allocations will enable checkpoint of large epoll sets. A subsequent
> patch will do something similar for the restore path.
>
> Signed-off-by: Matt Helsley <matthltc@xxxxxxxxxx>

Feels a bit auto-tune-magic-happy :) but looks good

Acked-by: Serge Hallyn <serue@xxxxxxxxxx>

> ---
>  fs/eventpoll.c |   71 ++++++++++++++++++++++++++++++++++++-------------------
>  1 files changed, 46 insertions(+), 25 deletions(-)
>
> diff --git a/fs/eventpoll.c b/fs/eventpoll.c
> index 4706ec5..2506b40 100644
> --- a/fs/eventpoll.c
> +++ b/fs/eventpoll.c
> @@ -1480,7 +1480,7 @@ static int ep_items_checkpoint(void *data)
>  	struct rb_node *rbp;
>  	struct eventpoll *ep;
>  	__s32 epfile_objref;
> -	int i, num_items, ret;
> +	int num_items = 0, nchunk, ret;
>
>  	ctx = dq_entry->ctx;
>
> @@ -1489,9 +1489,8 @@ static int ep_items_checkpoint(void *data)
>
>  	ep = dq_entry->epfile->private_data;
>  	mutex_lock(&ep->mtx);
> -	for (i = 0, rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp), i++) {}
> +	for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp), num_items++) {}
>  	mutex_unlock(&ep->mtx);
> -	num_items = i;
>
>  	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_EPOLL_ITEMS);
>  	if (!h)
> @@ -1503,36 +1502,58 @@ static int ep_items_checkpoint(void *data)
>  	if (ret || !num_items)
>  		return ret;
>
> -	items = kzalloc(sizeof(*items)*num_items, GFP_KERNEL);
> +	ret = ckpt_write_obj_type(ctx, NULL, sizeof(*items)*num_items,
> +				  CKPT_HDR_BUFFER);
> +	if (ret < 0)
> +		return ret;
> +
> +	nchunk = num_items;
> +	do {
> +		items = kzalloc(sizeof(*items)*nchunk, GFP_KERNEL);
> +		if (items)
> +			break;
> +		nchunk = nchunk >> 1;
> +	} while (nchunk > 0);
>  	if (!items)
>  		return -ENOMEM;
> +
> +	/*
> +	 * Walk the rbtree copying items into the chunk of memory and then
> +	 * writing them to the checkpoint image
> +	 */
>  	ret = 0;
> -	i = 0;
>  	mutex_lock(&ep->mtx);
> -	for (rbp = rb_first(&ep->rbr); i < num_items && rbp; rbp = rb_next(rbp),
> -	     i++) {
> -		struct epitem *epi;
> -		int objref;
> -
> -		epi = rb_entry(rbp, struct epitem, rbn);
> -		items[i].fd = epi->ffd.fd;
> -		items[i].events = epi->event.events;
> -		items[i].data = epi->event.data;
> -		objref = ckpt_obj_lookup(ctx, epi->ffd.file, CKPT_OBJ_FILE);
> -		if (objref <= 0) {
> -			ret = -EBUSY; /* missing item -- checkpoint obj leak */
> -			break;
> +	rbp = rb_first(&ep->rbr);
> +	while ((num_items > 0) && rbp) {
> +		int n = min(num_items, nchunk);
> +		int j;
> +
> +		for (j = 0; rbp && j < n; j++, rbp = rb_next(rbp)) {
> +			struct epitem *epi;
> +			int objref;
> +
> +			epi = rb_entry(rbp, struct epitem, rbn);
> +			items[j].fd = epi->ffd.fd;
> +			items[j].events = epi->event.events;
> +			items[j].data = epi->event.data;
> +			objref = ckpt_obj_lookup(ctx, epi->ffd.file,
> +						 CKPT_OBJ_FILE);
> +			if (objref <= 0)
> +				goto unlock;
> +			items[j].file_objref = objref;
>  		}
> -		items[i].file_objref = objref;
> +		ret = ckpt_kwrite(ctx, items, n*sizeof(*items));
> +		if (ret < 0)
> +			break;
> +		num_items -= n;
>  	}
> +unlock:
>  	mutex_unlock(&ep->mtx);
> -	if (i == num_items && rbp)
> -		ret = -EBUSY; /* extra item(s) -- checkpoint obj leak */
> -	if (!ret)
> -		ret = ckpt_write_buffer(ctx, items, sizeof(*items)*num_items);
> -	else
> -		ckpt_write_err(ctx, "E", "checkpoint leak detected.\n", ret);
>  	kfree(items);
> +	if (num_items != 0 || (num_items == 0 && rbp))
> +		ret = -EBUSY; /* extra item(s) -- checkpoint obj leak */
> +	if (ret)
> +		ckpt_write_err(ctx, "E", " checkpointing epoll items.\n", ret);
>  	return ret;
>  }
>
> --
> 1.5.6.3
>
>
> _______________________________________________
> Containers mailing list
> Containers@xxxxxxxxxxxxxxxxxxxxxxxxxx
> https://lists.linux-foundation.org/mailman/listinfo/containers
_______________________________________________
Containers mailing list
Containers@xxxxxxxxxxxxxxxxxxxxxxxxxx
https://lists.linux-foundation.org/mailman/listinfo/containers