Hi,
While debugging a performance issue, I found that thousands of threads
exiting at around the same time can cause severe spinlock contention on the
proc dentry "/proc/$parent_process_pid/task/". This is because each thread
needs to remove its pid entry from that directory when it exits. The
following standalone test case simulates the scenario; the perf top output
below was taken on a v5.7 kernel. Any ideas on how to fix this?
PerfTop: 48891 irqs/sec kernel:95.6% exact: 100.0% lost: 0/0
drop: 0/0 [4000Hz cycles], (all, 72 CPUs)
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
66.10% [kernel] [k]
native_queued_spin_lock_slowpath
1.13% [kernel] [k] _raw_spin_lock
0.84% [kernel] [k] clear_page_erms
0.82% [kernel] [k]
queued_write_lock_slowpath
0.64% [kernel] [k] proc_task_readdir
0.61% [kernel] [k]
find_idlest_group.isra.95
0.61% [kernel] [k]
syscall_return_via_sysret
0.55% [kernel] [k] entry_SYSCALL_64
0.49% [kernel] [k] memcpy_erms
0.46% [kernel] [k] update_cfs_group
0.41% [kernel] [k] get_pid_task
0.39% [kernel] [k]
_raw_spin_lock_irqsave
0.37% [kernel] [k]
__list_del_entry_valid
0.34% [kernel] [k]
get_page_from_freelist
0.34% [kernel] [k] __d_lookup
0.32% [kernel] [k] update_load_avg
0.31% libc-2.17.so [.] get_next_seq
0.27% [kernel] [k] avc_has_perm_noaudit
0.26% [kernel] [k] __sched_text_start
0.25% [kernel] [k]
selinux_inode_permission
0.25% [kernel] [k] __slab_free
0.24% [kernel] [k] detach_entity_cfs_rq
0.23% [kernel] [k] zap_pte_range
0.22% [kernel] [k]
_find_next_bit.constprop.1
0.22% libc-2.17.so [.] vfprintf
0.20% libc-2.17.so [.] _int_malloc
0.19% [kernel] [k] _raw_spin_lock_irq
0.18% [kernel] [k] rb_erase
0.18% [kernel] [k] pid_revalidate
0.18% [kernel] [k] lockref_get_not_dead
0.18% [kernel] [k]
__alloc_pages_nodemask
0.17% [kernel] [k] set_task_cpu
0.17% libc-2.17.so [.] __strcoll_l
0.17% [kernel] [k] do_syscall_64
0.17% [kernel] [k] __vmalloc_node_range
0.17% libc-2.17.so [.] _IO_vfscanf
0.17% [kernel] [k] refcount_dec_not_one
0.15% [kernel] [k] __task_pid_nr_ns
0.15% [kernel] [k]
native_irq_return_iret
0.15% [kernel] [k] free_pcppages_bulk
0.14% [kernel] [k] kmem_cache_alloc
0.14% [kernel] [k] link_path_walk
0.14% libc-2.17.so [.] _int_free
0.14% [kernel] [k]
__update_load_avg_cfs_rq
0.14% perf.5.7.0-master.20200601.ol7.x86_64 [.] 0x00000000000eac29
0.13% [kernel] [k] kmem_cache_free
0.13% [kernel] [k] number
0.13% [kernel] [k] memset_erms
0.12% [kernel] [k] proc_pid_status
0.12% [kernel] [k] __d_lookup_rcu
=========== runme.sh ==========
#!/bin/bash
# Driver for the proc_race reproducer.
#
# Usage: runme.sh [threads]      (threads defaults to 10000)
#
# A background loop keeps restarting ./proc_race; the foreground loop polls
# /proc/<pid>/task and SIGKILLs the process once its task directory listing
# exceeds the requested thread count, so that all threads exit together.
threads=${1:-10000}
prog=proc_race

# Respawn the test program forever in the background.
while true; do "./$prog" "$threads"; done &

while true; do
    # Take only the first matching PID: while one instance is dying and the
    # next is starting, ps can list several, and an unquoted multi-word value
    # would break the tests below.
    pid=$(ps aux | grep "$prog" | grep -v grep | awk '{print $2; exit}')
    if [ -z "$pid" ]; then continue; fi

    # ls -l is kept on purpose: it stats every task entry, which is part of
    # the /proc load being reproduced. Its output includes a leading "total"
    # line, so the count is one more than the number of entries.
    threadnum=$(ls -l "/proc/$pid/task" | wc -l)
    if [ "$threadnum" -gt "$threads" ]; then
        echo kill "$pid"
        kill -9 "$pid"
    fi
done
===========proc_race.c=========
#include <ctype.h>
#include <errno.h>
#include <limits.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
/*
 * handle_error_en: for calls that return an error number instead of setting
 * errno. Stores `en` in errno, reports `msg` through perror() and terminates
 * the process with EXIT_FAILURE.
 */
#define handle_error_en(en, msg)    \
    do {                            \
        errno = (en);               \
        perror(msg);                \
        exit(EXIT_FAILURE);         \
    } while (0)

/*
 * handle_error: reports `msg` through perror() using the current errno and
 * terminates the process with EXIT_FAILURE.
 */
#define handle_error(msg)           \
    do {                            \
        perror(msg);                \
        exit(EXIT_FAILURE);         \
    } while (0)
/* Per-thread bookkeeping record; main() allocates one per created thread. */
struct thread_info {
/* Handle filled in by pthread_create() and later passed to pthread_join(). */
pthread_t thread_id;
/* 1-based creation index assigned by main(); not read elsewhere in this program. */
int thread_num;
};
/*
 * Thread start routine: loops forever, calling sleep(1) once every
 * 1,000,000 iterations (including the very first one). It never returns on
 * its own; the thread only goes away when the whole process is killed.
 *
 * arg: unused; present so the signature matches the start-routine type
 *      expected by pthread_create().
 */
static void *child_thread(void *arg)
{
    /* Explicitly zeroed and kept in [0, 1000000), so it can never overflow. */
    unsigned int spins = 0;

    (void)arg;

    for (;;) {
        if (spins == 0)
            sleep(1);
        spins = (spins + 1) % 1000000;
    }

    return NULL; /* not reached */
}
/*
 * Creates a large number of threads running child_thread() and then joins
 * them one by one.
 *
 * Usage: proc_race [num_threads]
 *   num_threads  positive decimal thread count; 10000 is used when the
 *                program is not invoked with exactly one argument.
 *
 * Exits with EXIT_FAILURE on an invalid argument or if calloc(),
 * pthread_create() or pthread_join() fails.
 */
int main(int argc, char *argv[])
{
    int s, tnum, num_threads;
    struct thread_info *tinfo;

    if (argc == 2) {
        char *end;
        long val;

        /* strtol instead of atoi so garbage and out-of-range input is rejected. */
        errno = 0;
        val = strtol(argv[1], &end, 10);
        if (errno != 0 || end == argv[1] || *end != '\0' ||
            val <= 0 || val > INT_MAX) {
            fprintf(stderr, "usage: %s [num_threads > 0]\n", argv[0]);
            exit(EXIT_FAILURE);
        }
        num_threads = (int)val;
    } else {
        num_threads = 10000;
    }

    tinfo = calloc((size_t)num_threads, sizeof *tinfo);
    if (tinfo == NULL)
        handle_error("calloc");

    for (tnum = 0; tnum < num_threads; tnum++) {
        tinfo[tnum].thread_num = tnum + 1;
        s = pthread_create(&tinfo[tnum].thread_id, NULL,
                           &child_thread, NULL);
        if (s != 0)
            handle_error_en(s, "pthread_create");
    }

    for (tnum = 0; tnum < num_threads; tnum++) {
        /*
         * The thread's return value is not heap-allocated, so it must not be
         * passed to free(); pass NULL to discard it.
         */
        s = pthread_join(tinfo[tnum].thread_id, NULL);
        if (s != 0)
            handle_error_en(s, "pthread_join");
    }

    free(tinfo);
    return EXIT_SUCCESS;
}
==========
Thanks,
Junxiao.