On Wed, Jan 22, 2020 at 04:56:59PM +0800, Honggang LI wrote:
> > >
> > > rdma_destroy_event_channel(ch);
> >
> > This call should fail, since there's still a valid id open on it.
>
> Yes, you are right. After switch rdma_destroy_event_channel and
> rdma_destroy_id, the memory leak issue is gone.
>
> But we still observe file descriptor leak.

I managed to reproduce the file descriptor leak without dapl. That means it is a librdmacm issue. Please see the attachment 'rdmacm-fd-leak.txt' for details.
I managed to reproduce this without dapl. That means the resource leak is a librdmacm issue. [root@rdma-dev-00 cm2]$ sh build.sh + rm -f libofa.so libofa.o + gcc -fPIC -g -c -o libofa.o libofa.c + gcc -shared -fPIC -g -Wl,-init,test_init -Wl,-fini,test_fini -lrdmacm -o libofa.so libofa.o + gcc -ldl -g -o test.exe test.c + ip addr show mlx4_ib0 + grep -w inet inet 172.31.0.230/24 brd 172.31.0.255 scope global dynamic noprefixroute mlx4_ib0 + ./test.exe 172.31.0.230 dlopen librdamcm.so done dlopen librdamcm.so done dlopen librdamcm.so done dlopen librdamcm.so done === ls -l /proc/20221/fd total 0 lrwx------. 1 root root 64 Jan 22 10:10 0 -> /dev/pts/1 lrwx------. 1 root root 64 Jan 22 10:10 1 -> /dev/pts/1 lrwx------. 1 root root 64 Jan 22 10:10 10 -> /dev/infiniband/uverbs0 <--- leak lr-x------. 1 root root 64 Jan 22 10:10 11 -> 'anon_inode:[infinibandevent]' lrwx------. 1 root root 64 Jan 22 10:10 2 -> /dev/pts/1 lrwx------. 1 root root 64 Jan 22 10:10 4 -> /dev/infiniband/uverbs0 <-- leak lr-x------. 1 root root 64 Jan 22 10:10 5 -> 'anon_inode:[infinibandevent]' lrwx------. 1 root root 64 Jan 22 10:10 6 -> /dev/infiniband/uverbs0 <-- leak lr-x------. 1 root root 64 Jan 22 10:10 7 -> 'anon_inode:[infinibandevent]' lrwx------. 1 root root 64 Jan 22 10:10 8 -> /dev/infiniband/uverbs0 <-- leak lr-x------. 
1 root root 64 Jan 22 10:10 9 -> 'anon_inode:[infinibandevent]' [root@rdma-dev-00 cm2]$ [root@rdma-dev-00 cm2]$ cat build.sh #/bin/bash set -x rm -f libofa.so libofa.o gcc -fPIC -g -c -o libofa.o libofa.c gcc -shared -fPIC -g -Wl,-init,test_init -Wl,-fini,test_fini -lrdmacm -o libofa.so libofa.o gcc -ldl -g -o test.exe test.c ip addr show mlx4_ib0 | grep -w inet #LD_DEBUG=libs ./test.exe 172.31.0.230 ./test.exe 172.31.0.230 [root@rdma-dev-00 cm2]$ cat libofa.c #include <stdio.h> #include <stdlib.h> #include <dlfcn.h> #include <arpa/inet.h> #include <rdma/rdma_cma.h> #include <limits.h> #include <sys/types.h> #include <unistd.h> static void *handle; void test_init(void) { handle = dlopen("/usr/lib64/librdmacm.so", RTLD_NOW | RTLD_GLOBAL); if (!handle) printf("dlopen /usr/lib64/librdmacm.so failed\n"); else printf("dlopen librdamcm.so done\n"); } void test_fini(void) { if (handle) dlclose(handle); handle = NULL; } void test(char *ipoib_ip) { #if 1 int ret; struct rdma_cm_id *id; struct sockaddr_in ipoib_addr; struct rdma_event_channel *ch; void *handle; memset(&ipoib_addr, 0, sizeof(ipoib_addr)); ipoib_addr.sin_family = AF_INET; ipoib_addr.sin_port = 5555; #if 1 ret = inet_pton(AF_INET, ipoib_ip, (void *)&(ipoib_addr.sin_addr)); if (ret != 1) printf("inet_pton failed\n"); #else ipoib_addr.sin_addr.s_addr=htonl(INADDR_ANY); #endif ch = rdma_create_event_channel(); if (ch == NULL) printf("rdma_create_event_channel failed\n"); ret = rdma_create_id(ch, &id, NULL, RDMA_PS_TCP); if (ret != 0) printf("rdma_create_id failed\n"); ret = rdma_bind_addr(id, (struct sockaddr *) &ipoib_addr); if (ret != 0) printf("rdma_bind_addr failed\n"); #if DEBUG printf("befora call rdma_destroy_id\n"); getchar(); #endif ret = rdma_destroy_id(id); if (ret != 0) printf("rdma_destroy_id failed\n"); #if DEBUG printf("before call rdma_destroy_event_channel\n"); getchar(); #endif rdma_destroy_event_channel(ch); #if DEBUG printf("after call rdma_destroy_event_channle\n"); getchar(); #endif #else 
printf("xxx %s:%s\n", __FILE__, __func__); #endif } [root@rdma-dev-00 cm2]$ cat test.c #include <stdio.h> #include <stdlib.h> #include <dlfcn.h> #include <sys/types.h> #include <sys/wait.h> #include <unistd.h> #include <string.h> typedef void (* DUMMY_TEST_FUNC) (char *); int main(int argc, char **argv) { DUMMY_TEST_FUNC sym; void *handle; int i; pid_t cpid, ppid; int wstatus; char path[128]; if (argc != 2) { printf("usage: %s IPoIB_IP_ADDR\n", argv[0]); return 1; } for (i = 0; i < 4; i++) { handle = dlopen("./libofa.so", RTLD_NOW | RTLD_GLOBAL); sym = dlsym(handle, "test"); sym(argv[1]); dlclose(handle); } cpid = fork(); if (cpid == 0) { /* child */ ppid = getppid(); memset(path, 0, 128); sprintf(path, "/proc/%d/fd", ppid); printf("=== ls -l %s\n", path); execl("/usr/bin/ls", "/usr/bin/ls", "-l", path, (char *)NULL); } else { waitpid(cpid, &wstatus, 0); } return 0; }