>From a80c5215763f757840214465277e911e46e01219 Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu <sukadev@xxxxxxxxxxxxxxxxxx> Date: Mon, 23 Jun 2008 20:13:57 -0700 Subject: [PATCH] Support Non-consecutive and dup pipe fds PATCH 1/1 provides basic infrastructure to save/restore state of pipes. This patch removes assumptions about order of the pipe-fds and also supports existence of 'dups' of pipe-fds. This logic has been separated from PATCH 1/1 for easier review and the two patches could be combined into a single one. Thanks to Matt Helsley for the optimized logic/code in match_pipe_ends(). TODO: There are few TODO's marked out in the patch. Hopefully these can be addressed without significant impact to the central-logic of saving/restoring pipes. - Temporarily using a regular-file's fd as 'trampoline-fd' when all fds are in use - Maybe read all fdinfo into memory during restart, so we can reduce the information we save into the checkpoint-file (see comments near 'struct fdinfo'). - Check logic of detecting 'dup's of pipe fds (any hidden gotchas ?) See pair_pipe_fds() - Alloc ppi_list[] dynamically (see getfdinfo()). - Use getrlimit() to compute max-open-fds (see near caller of pair_pipe_fds()). - [Oleg Nesterov]: SIGIO/inotify() issues associated with writing-back to pipes (fixing this would require some assitance from kernel ?) Ran several unit-test cases (see test-patches). Additional cases to be developed/executed. Signed-off-by: Sukadev Bhattiprolu <sukadev@xxxxxxxxxx> --- cr.c | 262 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------ 1 files changed, 240 insertions(+), 22 deletions(-) diff --git a/cr.c b/cr.c index 716cc86..f40a4fb 100644 --- a/cr.c +++ b/cr.c @@ -79,8 +79,25 @@ typedef struct isockinfo_t { char tcpstate[TPI_LEN]; } isockinfo_t; +/* + * TODO: restore_fd() processes each fd as it reads it of the checkpoint + * file. To avoid making a second-pass at the file, we store following + * fields during checkpoint (for now). + * + * peer_fdnum, dup_fdnum, create_pipe, tramp_fd' fields can be + * + * We could eliminate this fields by reading all fdinfo into memory + * and then 'computing' the above fields before processing the fds. + * But this would require a non-trivial rewrite of the restore_fd() + * logic. Hopefully that can be done without significant impact to + * rest of the logic associated with saving/restoring pipes. + */ typedef struct fdinfo_t { int fdnum; /* file descriptor number */ + int peer_fdnum; /* peer fd for pipes */ + int dup_fdnum; /* fd, if fd is dup of another pipe fd */ + int create_pipe; /* TRUE if this is the create-end of the pipe */ + int tramp_fd; /* trampoline-fd for use in restoring pipes */ mode_t mode; /* mode as per stat(2) */ off_t offset; /* read/write pointer position for regular files */ int flag; /* open(2) flag */ @@ -117,6 +134,7 @@ typedef struct pinfo_t { int nt; /* number of thread child (0 if no thread lib) */ pid_t *tpid; /* array of thread info */ struct pinfo_t *pmt; /* multithread: pointer to main thread info */ + int tramp_fd; /* trampoline-fd for use in restoring pipes */ } pinfo_t; /* @@ -263,6 +281,89 @@ int getsockinfo(pid_t pid, pinfo_t *pi, int num) return ret; } +typedef struct pipe_peer_info { + fdinfo_t *pipe_fdi; + //fdinfo_t *peer_fdi; + __ino_t pipe_ino; +} pipe_peer_info_t; + +__ino_t get_fd_ino(char *fname) +{ + struct stat sbuf; + + if (stat(fname, &sbuf) < 0) + ERROR("stat() on fd %s failed, errno %d\n", fname, errno); + + return sbuf.st_ino; +} + +static void pair_pipe_fds(pipe_peer_info_t *ppi_list, int npipe_fds) +{ + int i, j; + pipe_peer_info_t *xppi, *yppi; + fdinfo_t *xfdi, *yfdi; + + /* + * TODO: This currently assumes pipefds have not been dup'd. + * Of course, need to kill this assumption soon. + */ + for (i = 0; i < npipe_fds; i++) { + xppi = &ppi_list[i]; + xfdi = xppi->pipe_fdi; + + j = i + 1; + for (j = i+1; j < npipe_fds; j++) { + yppi = &ppi_list[j]; + yfdi = yppi->pipe_fdi; + + if (yppi->pipe_ino != xppi->pipe_ino) + continue; + + DEBUG("Checking flag i %d, j %d\n", i, j); + /* + * i and j refer to same pipe. Check if they are + * peers or aliases (dup'd fds). dup'd fds share + * file-status flags. Peer fds of unnamed pipes + * differ in O_WRONLY bit. + * + * TODO: + * CHECK ABOVE ASSUMPTION + */ + if (xfdi->flag == yfdi->flag) { + yfdi->dup_fdnum = xfdi->fdnum; + DEBUG("Pipe fds %d and %d are dups\n", + xfdi->fdnum, yfdi->fdnum); + } else { + DEBUG("Pipe fds %d and %d are peers\n", + xfdi->fdnum, yfdi->fdnum); + + /* + * If we have already paired it or determined + * it is a dup, ignore + */ + if (xfdi->peer_fdnum != -1 || + xfdi->dup_fdnum != -1) + continue; + + DEBUG("Pipe fd %d not paired yet\n", + xfdi->fdnum); + /* + * Create pipe on first end of the pipe we + * come across. + */ + if (xfdi->fdnum < yfdi->fdnum) + xfdi->create_pipe = 1; + + xfdi->peer_fdnum = yfdi->fdnum; + yfdi->peer_fdnum = xfdi->fdnum; + } + DEBUG("Clearing ino i %d, j %d\n", i, j); + } + } + + DEBUG("Done building pipe list i %d, j %d\n", i, j); +} + /* * getfds() parse the process open file descriptors as found in /proc. */ @@ -275,6 +376,10 @@ int getfdinfo(pinfo_t *pi) int len, n = 0; pid_t syscallpid = pi->syscallpid ? pi->syscallpid : pi->pid; + int npipe_fds = 0; + pipe_peer_info_t ppi_list[256];// TODO: alloc dynamically + pipe_peer_info_t *ppi; + snprintf(dname, sizeof(dname), "/proc/%u/fd", pi->pid); if (! (dir = opendir(dname))) return 0; while ((dent = readdir(dir))) { @@ -288,14 +393,54 @@ int getfdinfo(pinfo_t *pi) stat(dname, &st); pi->fi[n].mode = st.st_mode; pi->fi[n].flag = PT_FCNTL(syscallpid, pi->fi[n].fdnum, F_GETFL, 0); + pi->fi[n].create_pipe = 0; + pi->fi[n].tramp_fd = -1; + pi->fi[n].dup_fdnum = -1; + pi->fi[n].peer_fdnum = -1; if (S_ISREG(st.st_mode)) pi->fi[n].offset = (off_t)PT_LSEEK(syscallpid, pi->fi[n].fdnum, 0, SEEK_CUR); - else if (S_ISFIFO(st.st_mode)) + else if (S_ISFIFO(st.st_mode)) { t_s("fifo"); + ppi = &ppi_list[npipe_fds]; + ppi->pipe_fdi = &pi->fi[n]; + //ppi->peer_fdi = NULL; + ppi->pipe_ino = get_fd_ino(dname); + + DEBUG("Found a pipe: fd %d, flag 0x%x ino %d\n", + ppi->pipe_fdi->fdnum, + ppi->pipe_fdi->flag, ppi->pipe_ino); + npipe_fds++; + + } else if (S_ISSOCK(st.st_mode)) getsockinfo(syscallpid, pi, pi->fi[n].fdnum); n++; } + + if (n) { + pi->tramp_fd = pi->fi[n-1].fdnum + 1; + DEBUG("Using %d as trampoline-fd\n", pi->tramp_fd); + } + + /* + * TODO: replace 1024 with rlim.rlim_cur (but need to execute + * getrlimit() in checkpointed-process) + */ + if (npipe_fds && pi->tramp_fd >= 1024) { + /* + * TODO: + * This restriction can be relaxed a bit. We only + * need to give-up here if all fds are in use as + * pipe-fds. + */ + ERROR("Cannot allocate a 'trampoline_fd' for pipes"); + } + + /* + * Now that we found all fds, pair up any pipe_fds + */ + pair_pipe_fds(ppi_list, npipe_fds); + end: closedir(dir); return n; @@ -550,7 +695,11 @@ static int save_process_fifo_info(pinfo_t *pi, int fd) DEBUG("FIFO fd %d (%s), flag 0x%x\n", fi[i].fdnum, fi[i].name, fi[i].flag); - if (!(fi[i].flag & O_WRONLY)) + /* + * If its read-side fd or a dup of another write-side-fd, + * don't need the data. + */ + if (!(fi[i].flag & O_WRONLY) || fi[i].dup_fdnum != -1) continue; pbuf_size = estimate_fifo_unread_bytes(pi, fd); @@ -742,6 +891,12 @@ static int save_process_data(pid_t pid, int fd, lh_list_t *ptree) write_item(fd, "FD", NULL, 0); t_d(pi->nf); for (i = 0; i < pi->nf; i++) { + /* + * trampoline-fd is common to all fds, so we could write it + * once, as a separate item by itself. Stick it in each + * fdinfo for now. + */ + pi->fi[i].tramp_fd = pi->tramp_fd; write_item(fd, "fdinfo", &pi->fi[i], sizeof(fdinfo_t)); } write_item(fd, "END FD", NULL, 0); @@ -949,6 +1104,74 @@ pid_t restart_thread(pid_t ppid, int exitsig, int addr) return pid; } +static void match_pipe_ends(int pid, int tramp_fd, int expected_fds[], + int actual_fds[]) +{ + int ret; + int expected_read_fd = expected_fds[0]; + int expected_write_fd = expected_fds[1]; + + DEBUG("tramp_fd %d\n", tramp_fd); + /* + * pipe() may have returned one (or both) of the restarted fds + * at the wrong end of the pipe. This could cause dup2() to + * accidentaly close the pipe. Avoid that with an extra dup(). + */ + if (actual_fds[1] == expected_read_fd) { + t_d(ret = PT_DUP2(pid, actual_fds[1], tramp_fd + 1)); + actual_fds[1] = tramp_fd + 1; + } + + if (actual_fds[0] != expected_read_fd) { + t_d(ret = PT_DUP2(pid, actual_fds[0], expected_read_fd)); + t_d(PT_CLOSE(pid, actual_fds[0])); + } + + if (actual_fds[1] != expected_write_fd) { + t_d(ret = PT_DUP2(pid, actual_fds[1], expected_write_fd)); + t_d(PT_CLOSE(pid, actual_fds[1])); + } +} + +static void recreate_pipe(int pid, int tramp_fd, fdinfo_t *fdinfo) +{ + int actual_fds[2] = { 0, 0 }; + int expected_fds[2]; + + DEBUG("Creating pipe for fd %d\n", fdinfo->fdnum); + + t_d(PT_PIPE(pid, actual_fds)); + t_d(actual_fds[0]); + t_d(actual_fds[1]); + + /* + * Find read-end and write-end of the checkpointed pipe + * (i.e don't assume that read-side fd is smaller than + * write-side fd) + */ + if (fdinfo->flag & O_WRONLY) { + expected_fds[0] = fdinfo->peer_fdnum; + expected_fds[1] = fdinfo->fdnum; + } else { + expected_fds[0] = fdinfo->fdnum; + expected_fds[1] = fdinfo->peer_fdnum; + } + + /* + * Match the ends of newly created pipe with the ends of the + * checkpointed pipe. + */ + match_pipe_ends(pid, tramp_fd, expected_fds, actual_fds); + + /* + * for debug, use fcntl() on fdinfo->fdnum and fdinfo->peer_fdnum + * to ensure ends match + */ + + DEBUG("Done creating pipe '{%d, %d}'\n", fdinfo->fdnum, + fdinfo->peer_fdnum); +} + int restore_fd(int fd, pid_t pid) { char item[64]; @@ -1001,34 +1224,29 @@ int restore_fd(int fd, pid_t pid) if (pfd != fdinfo->fdnum) t_d(PT_CLOSE(pid, pfd)); } } else if (S_ISFIFO(fdinfo->mode)) { - int pipefds[2] = { 0, 0 }; - + if (fdinfo->dup_fdnum != -1) { + t_d(ret = PT_DUP2(pid, fdinfo->dup_fdnum, + fdinfo->fdnum)); + } /* - * We create the pipe when we see the pipe's read-fd. - * Just ignore the pipe's write-fd. + * When checkpointing, we arbitrarily mark one end + * of the pipe as the 'create-end'. Create a pipe + * if this fd is the 'create-end' and then restore + * the fcntl-flags. For the other end of the pipe, + * just restore its fcntl-flags. */ - if (fdinfo->flag == O_WRONLY) - continue; - - DEBUG("Creating pipe for fd %d\n", fdinfo->fdnum); - - t_d(PT_PIPE(pid, pipefds)); - t_d(pipefds[0]); - t_d(pipefds[1]); - - if (pipefds[0] != fdinfo->fdnum) { - DEBUG("Hmm, new pipe has fds %d, %d " - "Old pipe had fd %d\n", pipefds[0], - pipefds[1], fdinfo->fdnum); getchar(); - exit(1); - } - DEBUG("Done creating pipefds[0] %d\n", pipefds[0]); + else if (fdinfo->create_pipe) + recreate_pipe(pid, fdinfo->tramp_fd, fdinfo); } /* * Restore any special flags this fd had */ ret = PT_FCNTL(pid, fdinfo->fdnum, F_SETFL, fdinfo->flag); + if (ret < 0) { + ERROR("restore_fd() fd %d setfl flag 0x%x, ret %d\n", + fdinfo->fdnum, fdinfo->flag, ret); + } DEBUG("---- restore_fd() fd %d setfl flag 0x%x, ret %d\n", fdinfo->fdnum, fdinfo->flag, ret); free(fdinfo); -- 1.5.2.5 _______________________________________________ Containers mailing list Containers@xxxxxxxxxxxxxxxxxxxxxxxxxx https://lists.linux-foundation.org/mailman/listinfo/containers