On 21.03.12 05.35, Tanu Kaskinen wrote: > Do you have a patch yet? Well, something similar (see below). Unfortunately I build on OS/2 the native OS/2 select implementation has a different interface. So the code is not 1:1 portable. Furthermore it seems that my suggestion will still not always fix the problem. There is still an ABA problem, because the descriptor that just died could already be reused for a different purpose. So I changed the code to retry several times until it fails. This fixed the deadlocks so far. I am no longer able to reproduce the problem. But strictly speaking the ABA problem is still there. If a descriptor id gets reused (at least OS/2 does this immediately after a shutdown call) pa_poll might return with an event for this new descriptor while the mainloop tried to wait for the old, died socket. I think there is no work around possible in pa_poll. I think to avoid the race condition completely one needs to cancel the registration for the read or write events in synchronized context before it invokes shutdown on a socket (likely already implemented) and the retry loop around poll needs to be moved into the mainloop code so that the mainloop enters synchronized context and recreates the descriptor list before poll is invoked the next time. ----- #if defined(OS_IS_OS2) int pa_poll (struct pollfd *fds, unsigned long int nfds, int timeout) { int* selectfds; int nselectfds; struct pollfd* f; struct pollfd* fdsend = fds + nfds; int nreads, nwrites, nexcepts; int* pselectfd; int ready; int nretry = nfds; /* limit the number of retries */ /* First check how many descriptors we need */ nselectfds = 0; for (f = fds; f != fdsend; ++f) { int events = f->events; nselectfds += !!(events & POLLIN) + !!(events & POLLPRI) + !!(events & POLLOUT); f->revents = 0; } selectfds = alloca(nselectfds * sizeof *selectfds); retry: /* retry here, if select fails because of a bad descriptor. */ /* populate selectfds */ nreads = 0; nwrites = 0; nexcepts = 0; pselectfd = selectfds; for (f = fds; f != fdsend; ++f) { if (f->events & POLLIN) { *pselectfd++ = f->fd; ++nreads; } } for (f = fds; f != fdsend; ++f) { if (f->events & POLLOUT) { *pselectfd++ = f->fd; ++nwrites; } } for (f = fds; f != fdsend; ++f) { if (f->events & POLLPRI) { *pselectfd++ = f->fd; ++nexcepts; } } /* execute select */ ready = os2_select(selectfds, nreads, nwrites, nexcepts, timeout); if (ready == -1) { errno = sock_errno(); /* One of the descriptors is bad. Find out which one. */ /* The error code is not checked here, because it may vary. * E.g. ENOTSOCK in case the id is now a file. */ for (f = fds; f != fdsend; ++f) if (f->events != 0) { /* check the descriptor */ if (os2_select(&f->fd, 1, 0, 0, 0) == -1) /* do not use this descriptor any more */ f->events = 0; } if (nretry) { --nretry; goto retry; } } else if (ready) { /* Find out the descriptors that got ready. */ ready = 0; pselectfd = selectfds; for (f = fds; f != fdsend; ++f) { if (f->events & POLLIN) { if (*pselectfd++ != -1) { ++ready; f->revents = POLLIN; } } } for (f = fds; f != fdsend; ++f) { if (f->events & POLLOUT) { if (*pselectfd++ != -1) { if (!f->revents) ++ready; f->revents |= POLLOUT; } } } for (f = fds; f != fdsend; ++f) { if (f->events & POLLPRI) { if (*pselectfd++ != -1) { if (!f->revents) ++ready; f->revents |= POLLPRI; } } } } return ready; } #elif !defined(HAVE_POLL_H) || defined(OS_IS_DARWIN) ... Marcel