On 04/02/2015 12:21 PM, Michal Privoznik wrote: > https://bugzilla.redhat.com/show_bug.cgi?id=1200149 > > Even though we have a mutex mechanism so that two clients don't spawn > two daemons, it's not strong enough. It can happen that while one > client is spawning the daemon, the other one fails to connect. > Basically two possible errors can happen: > > error: Failed to connect socket to '/home/mprivozn/.cache/libvirt/libvirt-sock': Connection refused > > or: > > error: Failed to connect socket to '/home/mprivozn/.cache/libvirt/libvirt-sock': No such file or directory > > The problem in both cases is, the daemon is only starting up, while we > are trying to connect (and fail). We should postpone the connecting > phase until the daemon is started (by the other thread that is > spawning it). In order to do that, create a file lock 'libvirt-lock' > in the directory where session daemon would create its socket. So even > when called from multiple processes, spawning a daemon will serialize > on the file lock. So only the first to come will spawn the daemon. > > Signed-off-by: Michal Privoznik <mprivozn@xxxxxxxxxx> > --- > > diff to v1: > -hopefully this one is race free > > src/rpc/virnetsocket.c | 148 ++++++++++++------------------------------------- > 1 file changed, 36 insertions(+), 112 deletions(-) > > diff --git a/src/rpc/virnetsocket.c b/src/rpc/virnetsocket.c > index 6b019cc..2b891ae 100644 > --- a/src/rpc/virnetsocket.c > +++ b/src/rpc/virnetsocket.c > @@ -123,7 +123,7 @@ VIR_ONCE_GLOBAL_INIT(virNetSocket) > > > #ifndef WIN32 > -static int virNetSocketForkDaemon(const char *binary, int passfd) > +static int virNetSocketForkDaemon(const char *binary) > { > int ret; > virCommandPtr cmd = virCommandNewArgList(binary, > @@ -136,10 +136,6 @@ static int virNetSocketForkDaemon(const char *binary, int passfd) > virCommandAddEnvPassBlockSUID(cmd, "XDG_RUNTIME_DIR", NULL); > virCommandClearCaps(cmd); > virCommandDaemonize(cmd); > - if (passfd) { > - virCommandPassFD(cmd, passfd, VIR_COMMAND_PASS_FD_CLOSE_PARENT); > - virCommandPassListenFDs(cmd); > - } > ret = virCommandRun(cmd, NULL); > virCommandFree(cmd); > return ret; > @@ -543,10 +539,10 @@ int virNetSocketNewConnectUNIX(const char *path, > const char *binary, > virNetSocketPtr *retsock) > { > - char *binname = NULL; > - char *pidpath = NULL; > - int fd, passfd = -1; > - int pidfd = -1; > + char *lockpath = NULL; Because we assign this to NULL... > + int lockfd = -1; > + int fd = -1; > + int retries = 100; > virSocketAddr localAddr; > virSocketAddr remoteAddr; > > @@ -561,6 +557,22 @@ int virNetSocketNewConnectUNIX(const char *path, > return -1; > } > > + if (spawnDaemon) { > + if (virPidFileConstructPath(false, NULL, "libvirt-lock", &lockpath) < 0) > + goto error; > + > + if ((lockfd = open(lockpath, O_RDWR | O_CREAT, 0600)) < 0 || > + virSetCloseExec(lockfd) < 0) { > + virReportSystemError(errno, _("Unable to create lock '%s'"), lockpath); And can jump to error right here > + goto error; > + } > + > + if (virFileLock(lockfd, false, 0, 1, true) < 0) { > + virReportSystemError(errno, _("Unable to lock '%s'"), lockpath); > + goto error; > + } > + } > + > if ((fd = socket(PF_UNIX, SOCK_STREAM, 0)) < 0) { > virReportSystemError(errno, "%s", _("Failed to create socket")); > goto error; > @@ -574,108 +586,25 @@ int virNetSocketNewConnectUNIX(const char *path, > if (remoteAddr.data.un.sun_path[0] == '@') > remoteAddr.data.un.sun_path[0] = '\0'; > > - retry: > - if (connect(fd, &remoteAddr.data.sa, remoteAddr.len) < 0) { > - int status = 0; > - pid_t pid = 0; > - > - if (!spawnDaemon) { > + while (retries && > + connect(fd, &remoteAddr.data.sa, remoteAddr.len) < 0) { > + if (!(spawnDaemon && errno == ENOENT)) { > virReportSystemError(errno, _("Failed to connect socket to '%s'"), > path); > goto error; > } > > - if (!(binname = last_component(binary)) || binname[0] == '\0') { > - virReportError(VIR_ERR_INTERNAL_ERROR, > - _("Cannot determine basename for binary '%s'"), > - binary); > - goto error; > - } > - > - if (virPidFileConstructPath(false, NULL, binname, &pidpath) < 0) > - goto error; > - > - pidfd = virPidFileAcquirePath(pidpath, false, getpid()); > - if (pidfd < 0) { > - /* > - * This can happen in a very rare case of two clients spawning two > - * daemons at the same time, and the error in the logs that gets > - * reset here can be a clue to some future debugging. > - */ > - virResetLastError(); > - spawnDaemon = false; > - goto retry; > - } > - > - if ((passfd = socket(PF_UNIX, SOCK_STREAM, 0)) < 0) { > - virReportSystemError(errno, "%s", _("Failed to create socket")); > + if (virNetSocketForkDaemon(binary) < 0) > goto error; > - } > > - /* > - * We already even acquired the pidfile, so no one else should be using > - * @path right now. So we're OK to unlink it and paying attention to > - * the return value makes no real sense here. Only if it's not an > - * abstract socket, of course. > - */ > - if (path[0] != '@') > - unlink(path); > - > - /* > - * We have to fork() here, because umask() is set per-process, chmod() > - * is racy and fchmod() has undefined behaviour on sockets according to > - * POSIX, so it doesn't work outside Linux. > - */ > - if ((pid = virFork()) < 0) > - goto error; > - > - if (pid == 0) { > - umask(0077); > - if (bind(passfd, &remoteAddr.data.sa, remoteAddr.len) < 0) > - _exit(EXIT_FAILURE); > - > - _exit(EXIT_SUCCESS); > - } > + retries--; > + usleep(5000); > + } > > - if (virProcessWait(pid, &status, false) < 0) > - goto error; > - > - if (status != EXIT_SUCCESS) { > - /* > - * OK, so the child failed to bind() the socket. This may mean that > - * another daemon was starting at the same time and succeeded with > - * its bind() (even though it should not happen because we using a > - * pidfile for the race). So we'll try connecting again, but this > - * time without spawning the daemon. > - */ > - spawnDaemon = false; > - virPidFileDeletePath(pidpath); > - VIR_FORCE_CLOSE(pidfd); > - VIR_FORCE_CLOSE(passfd); > - goto retry; > - } > - > - if (listen(passfd, 0) < 0) { > - virReportSystemError(errno, "%s", > - _("Failed to listen on socket that's about " > - "to be passed to the daemon")); > - goto error; > - } > - > - if (connect(fd, &remoteAddr.data.sa, remoteAddr.len) < 0) { > - virReportSystemError(errno, _("Failed to connect socket to '%s'"), > - path); > - goto error; > - } > - > - /* > - * Do we need to eliminate the super-rare race here any more? It would > - * need incorporating the following VIR_FORCE_CLOSE() into a > - * virCommandHook inside a virNetSocketForkDaemon(). > - */ > - VIR_FORCE_CLOSE(pidfd); > - if (virNetSocketForkDaemon(binary, passfd) < 0) > - goto error; > + if (lockfd) { > + unlink(lockpath); > + VIR_FORCE_CLOSE(lockfd); > + VIR_FREE(lockpath); > } > > localAddr.len = sizeof(localAddr.data); > @@ -687,19 +616,14 @@ int virNetSocketNewConnectUNIX(const char *path, > if (!(*retsock = virNetSocketNew(&localAddr, &remoteAddr, true, fd, -1, 0))) > goto error; > > - VIR_FREE(pidpath); > - > return 0; > > error: > - if (pidfd >= 0) > - virPidFileDeletePath(pidpath); > - VIR_FREE(pidpath); > + if (lockfd) > + unlink(lockpath); Coverity complains about calling unlink with NULL: (8) Event var_deref_model: Passing null pointer "lockpath" to "unlink", which dereferences it. Also see events: > + VIR_FREE(lockpath); > VIR_FORCE_CLOSE(fd); > - VIR_FORCE_CLOSE(passfd); > - VIR_FORCE_CLOSE(pidfd); > - if (spawnDaemon) > - unlink(path); > + VIR_FORCE_CLOSE(lockfd); > return -1; > } > #else > -- libvir-list mailing list libvir-list@xxxxxxxxxx https://www.redhat.com/mailman/listinfo/libvir-list