I tested the patch with code stressing swap->destroy->create->add 10000 times; the performance regression still happens, and now it is in ip_set_destroy. (I pasted the test code at the end of this mail.) `time` shows that most of the delay is 'off cpu': $ time sudo ./stressipset real 2m45.115s user 0m0.019s sys 0m0.744s Most of the time, the call stack is stuck in rcu_barrier: $ sudo cat /proc/2158/stack [<0>] rcu_barrier+0x1f6/0x2d0 [<0>] ip_set_destroy+0x84/0x1d0 [ip_set] [<0>] nfnetlink_rcv_msg+0x2ac/0x2f0 [nfnetlink] [<0>] netlink_rcv_skb+0x57/0x100 [<0>] netlink_unicast+0x19a/0x280 [<0>] netlink_sendmsg+0x250/0x4d0 [<0>] __sys_sendto+0x1be/0x1d0 [<0>] __x64_sys_sendto+0x20/0x30 [<0>] do_syscall_64+0x42/0xf0 [<0>] entry_SYSCALL_64_after_hwframe+0x6e/0x76 perf_event_open profiling shows a similar call signature for rcu_call and synchronize_rcu: ip_set_destroy(49.651% 2133/4296) rcu_barrier(80.684% 1721/2133) wait_for_completion(79.198% 1363/1721) schedule_timeout(94.864% 1293/1363) schedule(96.520% 1248/1293) __schedule(97.436% 1216/1248) preempt_count_add(0.240% 3/1248) srso_return_thunk(0.160% 2/1248) preempt_count_sub(0.160% 2/1248) srso_return_thunk(0.077% 1/1293) _raw_spin_unlock_irq(1.027% 14/1363) _raw_spin_lock_irq(0.514% 7/1363) __cond_resched(0.220% 3/1363) srso_return_thunk(0.147% 2/1363) ip_set_swap(79.842% 709/888) (this profile was captured when synchronize_rcu was used in ip_set_swap) synchronize_rcu(74.330% 527/709) __wait_rcu_gp(89.184% 470/527) wait_for_completion(86.383% 406/470) schedule_timeout(91.133% 370/406) schedule(95.135% 352/370) _raw_spin_unlock_irq(3.202% 13/406) _raw_spin_lock_irq(0.739% 3/406) srso_return_thunk(0.246% 1/406) _raw_spin_unlock_irq(7.021% 33/470) __call_rcu_common.constprop.0(3.830% 18/470) rcu_gp_is_expedited(3.036% 16/527) __cond_resched(0.569% 3/527) srso_return_thunk(0.190% 1/527) They all call wait_for_completion, which may sleep on something on purpose, I guess... (Maybe RCU is not a good choice here?) 
I made another test with 'synchronize_rcu' removed from ip_set_swap and no 'call_rcu' in ip_set_destroy, the performance is much better: $ time sudo ./stressipset real 0m2.203s user 0m0.037s sys 0m0.188s Here is my test code, it is very ugly...(I have no knowledge of netfilter messaging protocol, and just hack it out with the help of strace.) And I was lazy, before run the test, foo/bar netset is needed: ipset create foo hash:net ipset create bar hash:net Here come the ugly code, should be able to compile with gcc. #include <stdio.h> #include <stdlib.h> #include <sys/types.h> #include <sys/socket.h> #include <unistd.h> #include <arpa/inet.h> #include <linux/netlink.h> #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/ipset/ip_set.h> #include <string.h> unsigned char mbuffer[4096]; unsigned char buf[128]; // cmd buffer for swap unsigned char dbuf[128]; // cmd buffer for destroy unsigned char cbuf[128]; // cmd buffer for create unsigned char abuf[128]; // cmd buffer for add int main() { int err; int sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_NETFILTER); if (sock<0) { perror("Fail to create socket"); return 1; } struct sockaddr_nl addr = { .nl_family = AF_NETLINK, .nl_pad = 0, .nl_pid = 0, .nl_groups = 0 }; struct sockaddr raddr = {0}; socklen_t rsize; int seq = 0x12345678; err = bind(sock, (struct sockaddr*)&addr, sizeof(addr)); if (err) { perror("Fail to bind"); return 1; } err = getsockname(sock, &raddr, &rsize); if (err) { perror("Fail to getsockname"); return 1; } struct nlmsghdr *phdr; struct nfgenmsg *pnfg; struct nlattr *pnla; unsigned int total; ssize_t rz; struct iovec iovs; iovs.iov_base = mbuffer; iovs.iov_len = sizeof(mbuffer); struct msghdr msg = {0}; msg.msg_name = &addr; msg.msg_namelen = sizeof(addr); msg.msg_iov = &iovs; msg.msg_iovlen = 1; memset(buf, 0, sizeof(buf)); total = 0; phdr = (struct nlmsghdr*)(buf+total); total += sizeof(struct nlmsghdr); phdr->nlmsg_type=NFNL_SUBSYS_IPSET<<8|IPSET_CMD_PROTOCOL; phdr->nlmsg_seq = seq; 
phdr->nlmsg_flags = NLM_F_REQUEST; pnfg = (struct nfgenmsg*)(buf+total); total += sizeof(struct nfgenmsg); pnfg->nfgen_family=AF_INET; pnfg->version= NFNETLINK_V0; pnfg->res_id=htons(0); pnla = (struct nlattr *)(buf+total); pnla->nla_len = 5; pnla->nla_type = 1; buf[total+sizeof(struct nlattr)]=0x06; total+=8; phdr->nlmsg_len = total; rz = sendto(sock, buf, total, 0, (struct sockaddr*)&addr, sizeof(addr)); rz = recvmsg(sock, &msg, 0); int sz = total; // build swap pnla = (struct nlattr *)(buf+total); pnla->nla_len = 8; pnla->nla_type = 2; char *p = buf+(total+sizeof(struct nlattr)); p[0]='f'; p[1]='o'; p[2]='o'; p[3]=0; total+=8; pnla = (struct nlattr *)(buf+total); pnla->nla_len = 8; pnla->nla_type = 3; p = buf+(total+sizeof(struct nlattr)); p[0]='b'; p[1]='a'; p[2]='r'; p[3]=0; total+=8; phdr->nlmsg_type = NFNL_SUBSYS_IPSET<<8|IPSET_CMD_SWAP; phdr->nlmsg_flags = NLM_F_REQUEST|NLM_F_ACK; phdr->nlmsg_len = total; // build destroy char bar[] = "\x62\x61\x72\x00"; memcpy(dbuf, buf, sz); total = sz; struct nlmsghdr *phdr_d = (struct nlmsghdr*)dbuf; pnla = (struct nlattr *)(dbuf+total); pnla->nla_len = 8; pnla->nla_type = 2; memcpy(dbuf+(total+sizeof(struct nlattr)), bar, sizeof(bar)); total+=pnla->nla_len; phdr_d->nlmsg_type = NFNL_SUBSYS_IPSET<<8|IPSET_CMD_DESTROY; phdr_d->nlmsg_flags = NLM_F_REQUEST|NLM_F_ACK; phdr_d->nlmsg_len = total; // build create memcpy(cbuf, buf, sz); total = sz; struct nlmsghdr *phdr_c = (struct nlmsghdr*)cbuf; pnla = (struct nlattr *)(cbuf+total); pnla->nla_len = 8; pnla->nla_type = 2; memcpy(cbuf+(total+sizeof(struct nlattr)), bar, sizeof(bar)); total+=(pnla->nla_len+3)/4*4; char hashnet[] = "\x68\x61\x73\x68\x3a\x6e\x65\x74\x00"; pnla = (struct nlattr *)(cbuf+total); pnla->nla_len = 13; pnla->nla_type = 3; memcpy(cbuf+(total+sizeof(struct nlattr)), hashnet, sizeof(hashnet)); total+=(pnla->nla_len+3)/4*4; pnla = (struct nlattr *)(cbuf+total); pnla->nla_len = 5; pnla->nla_type = 4; cbuf[total+sizeof(struct nlattr)]=6; 
total+=(pnla->nla_len+3)/4*4; pnla = (struct nlattr *)(cbuf+total); pnla->nla_len = 5; pnla->nla_type = 5; cbuf[total+sizeof(struct nlattr)]=2; total+=(pnla->nla_len+3)/4*4; pnla = (struct nlattr *)(cbuf+total); pnla->nla_len = 4; pnla->nla_type = NLA_F_NESTED|0x7; total+=(pnla->nla_len+3)/4*4; phdr_c->nlmsg_type = NFNL_SUBSYS_IPSET<<8|IPSET_CMD_CREATE; phdr_c->nlmsg_flags = NLM_F_REQUEST|NLM_F_ACK; phdr_c->nlmsg_len = total; // build add memcpy(abuf, buf, sz); total = sz; struct nlmsghdr *phdr_a = (struct nlmsghdr*)abuf; pnla = (struct nlattr *)(abuf+total); pnla->nla_len = 8; pnla->nla_type = 2; memcpy(abuf+(total+sizeof(struct nlattr)), bar, sizeof(bar)); total+=(pnla->nla_len+3)/4*4; char ip[] = "\x0c\x00\x01\x80\x08\x00\x01\x40\x0a\x01\x00\x00\x05\x00\x03\x00\x10\x00\x00\x00\x08\x00\x09\x40\x00\x00\x00\x00"; pnla = (struct nlattr *)(abuf+total); pnla->nla_len = 32; pnla->nla_type = NLA_F_NESTED|0x7; memcpy(abuf+(total+sizeof(struct nlattr)), ip, sizeof(ip)); total+=(pnla->nla_len+3)/4*4; phdr_a->nlmsg_type = NFNL_SUBSYS_IPSET<<8|IPSET_CMD_ADD; phdr_a->nlmsg_flags = NLM_F_REQUEST|NLM_F_ACK; phdr_a->nlmsg_len = total; for (int i=0; i<10000; i++) { // swap foo bar seq++; phdr->nlmsg_seq = seq; sendto(sock, buf, phdr->nlmsg_len, 0, (struct sockaddr*)&addr, sizeof(addr)); recvmsg(sock, &msg, 0); err=*(short*)(mbuffer+(sizeof(struct nlmsghdr)-2)); if (err) { printf("fail to swap %d\n", err); break; } // destroy bar seq++; phdr_d->nlmsg_seq = seq; sendto(sock, dbuf, phdr_d->nlmsg_len, 0, (struct sockaddr*)&addr, sizeof(addr)); recvmsg(sock, &msg, 0); err=*(short*)(mbuffer+(sizeof(struct nlmsghdr)-2)); if (err) { printf("fail to destroy\n"); break; } // create bar seq++; phdr_c->nlmsg_seq = seq; sendto(sock, cbuf, phdr_c->nlmsg_len, 0, (struct sockaddr*)&addr, sizeof(addr)); recvmsg(sock, &msg, 0); err=*(short*)(mbuffer+(sizeof(struct nlmsghdr)-2)); if (err) { printf("fail to create\n"); break; } // add bar seq++; phdr_a->nlmsg_seq = seq; sendto(sock, abuf, 
phdr_a->nlmsg_len, 0, (struct sockaddr*)&addr, sizeof(addr)); recvmsg(sock, &msg, 0); err=*(short*)(mbuffer+(sizeof(struct nlmsghdr)-2)); if (err) { printf("fail to add\n"); break; } } close(sock); return 0; } Thanks David