Possible deadlock in __pcpu_freelist_push

Priya Bala Govindasamy <pgovind2@xxxxxxx> · Thu, 5 Sep 2024 11:32:27 -0700

Hello,

We are developing a tool called Spinner to detect locking violations
in the bpf subsystem. Spinner reported a nested bpf locking issue with
the helper function bpf_get_stackid. bpf_get_stackid calls
pcpu_freelist_push which takes a spin_lock. A deadlock will occur if a
bpf program calls bpf_get_stackid, takes the spin lock in
pcpu_freelist_push, and triggers nested execution of another bpf
program that also calls bpf_get_stackid and tries to take the same
lock.

This issue was reported for kernel v6.9. However, we believe this
should still exist in the latest kernel. We tried to validate the
report on v6.10 kernel by running a PoC. Below is the lockdep splat.
The PoC is attached at the end.

Thanks,
Priya

============================================
 WARNING: possible recursive locking detected
 6.10.0-rc7+ #69 Not tainted
 --------------------------------------------
 sshd/1125 is trying to acquire lock:
 ffffe8fffbe1fd80 (&head->lock){....}-{2:2}, at: __pcpu_freelist_pop+0x1c5/0x820

 but task is already holding lock:
 ffffe8fffbe1fd80 (&head->lock){....}-{2:2}, at:
__pcpu_freelist_push+0x2ee/0x4f0

 other info that might help us debug this:
  Possible unsafe locking scenario:

        CPU0
        ----
   lock(&head->lock);
   lock(&head->lock);

  *** DEADLOCK ***

  May be due to missing lock nesting notation

 3 locks held by sshd/1125:
  #0: ffffffff85d5cbc0 (rcu_read_lock){....}-{1:3}, at:
__bpf_prog_enter+0x24/0x190
  #1: ffffe8fffbe1fd80 (&head->lock){....}-{2:2}, at:
__pcpu_freelist_push+0x2ee/0x4f0
  #2: ffffffff85d5cbc0 (rcu_read_lock){....}-{1:3}, at:
trace_call_bpf+0xc3/0x810

 stack backtrace:
 CPU: 0 PID: 1125 Comm: sshd Not tainted 6.10.0-rc7+ #69
 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS
1.13.0-1ubuntu1.1 04/01/2014
 Call Trace:
  <TASK>
  dump_stack_lvl+0x9f/0xf0
  dump_stack+0x14/0x20
  print_deadlock_bug+0x3ca/0x680
  __lock_acquire+0x2ff5/0x6a60
  ? __pfx___lock_acquire+0x10/0x10
  ? kernel_text_address+0x15b/0x180
  ? unwind_next_frame+0x18f/0xa60
  ? __kernel_text_address+0x16/0x50
  lock_acquire+0x1be/0x560
  ? __pcpu_freelist_pop+0x1c5/0x820
  ? __pfx_perf_callchain_kernel+0x10/0x10
  ? __pfx_lock_acquire+0x10/0x10
  _raw_spin_lock+0x3b/0x80
  ? __pcpu_freelist_pop+0x1c5/0x820
  __pcpu_freelist_pop+0x1c5/0x820
  pcpu_freelist_pop+0x31/0x80
  __bpf_get_stackid+0x515/0x960
  ? __pfx_mark_lock+0x10/0x10
  bpf_get_stackid+0x10b/0x180
  bpf_prog_b4f8da3e125c426b_test_prog2+0x42/0x47
  trace_call_bpf+0x24d/0x810
  ? __pfx_trace_call_bpf+0x10/0x10
  ? __pcpu_freelist_push+0x2ef/0x4f0
  kprobe_perf_func+0x108/0x8c0
  ? __pfx_kprobe_perf_func+0x10/0x10
  ? __pfx___lock_acquire+0x10/0x10
  ? __pfx___lock_acquire+0x10/0x10
  ? kernel_text_address+0x15b/0x180
  ? unwind_next_frame+0x18f/0xa60
  kprobe_dispatcher+0xbc/0x160
  opt_pre_handler+0xd7/0x1b0
  ? __pcpu_freelist_push+0x2ef/0x4f0
  optimized_callback+0x200/0x290
  0xffffffffa0958039
 RIP: 0010:__pcpu_freelist_push+0x2ef/0x4f0
 Code: 00 fc ff df 48 89 fa 48 c1 ea 03 80 3c 02 00 0f 85 fe 01 00 00
4a 03 1c e5 40 ed 20 85 4c 8d 63 08 4c 89 e7 e8 12 9a 95 02 e9 <0d> 10
0a 1f 00 00 00 00 00 fc ff df 48 c1 ea 03 80 3c 02 00 0f 85
 RSP: 0018:ffff88811acf77f8 EFLAGS: 00000096
 RAX: 0000000000000000 RBX: ffffe8fffbe1fd60 RCX: 0000000000000000
 RDX: 1ffffd1fff7c3fb5 RSI: ffffffff846be8a0 RDI: ffffffff848fd000
 RBP: ffff88811acf7848 R08: ffff8881f2e319a0 R09: 1ffffffff0a41da8
 R10: ffffffff88096967 R11: 00000000cda74fb5 R12: ffffe8fffbe1fd68
 R13: 00000000000036d7 R14: ffffc90001b90900 R15: ffffc90001b9090c
  ? trace_irq_disable+0xe1/0x130
  pcpu_freelist_push+0x68/0x80
  __bpf_get_stackid+0x3a6/0x960
  ? bpf_trace_printk+0x109/0x160
  bpf_get_stackid+0x10b/0x180
  bpf_get_stackid_raw_tp+0x1a4/0x260
  bpf_prog_bd94480187a43af0_test_prog1+0x42/0x5b
  bpf_trampoline_6442487319+0x5c/0xfd
  bpf_lsm_task_alloc+0x9/0x20
  ? security_task_alloc+0xbf/0x230
  copy_process+0x2027/0x8390
  ? __this_cpu_preempt_check+0x17/0x20
  ? kvm_sched_clock_read+0x15/0x30
  ? __pfx_copy_process+0x10/0x10
  ? debug_smp_processor_id+0x1b/0x30
  ? do_syscall_64+0x97/0x140
  ? __this_cpu_preempt_check+0x17/0x20
  ? lockdep_hardirqs_on+0xcf/0x150
  ? do_syscall_64+0x97/0x140
  ? do_fcntl+0x93e/0x12a0
  kernel_clone+0xd7/0x710
  ? __pfx_kernel_clone+0x10/0x10
  ? lockdep_hardirqs_on+0xcf/0x150
  ? syscall_exit_to_user_mode+0xd5/0x220
  __do_sys_clone+0xbe/0xf0
  ? __pfx___do_sys_clone+0x10/0x10
  ? __pfx___sys_socketpair+0x10/0x10
  __x64_sys_clone+0xc2/0x150
  ? __this_cpu_preempt_check+0x17/0x20
  ? lockdep_hardirqs_on+0xcf/0x150
  x64_sys_call+0x1951/0x1f20
  do_syscall_64+0x8b/0x140
  ? syscall_exit_to_user_mode+0xd5/0x220
  ? do_syscall_64+0x97/0x140
  ? irqentry_exit+0x6f/0xa0
  ? exc_page_fault+0x8d/0x110
  entry_SYSCALL_64_after_hwframe+0x76/0x7e
 RIP: 0033:0x7f19ba8eab57
 Code: ba 04 00 f3 0f 1e fa 64 48 8b 04 25 10 00 00 00 45 31 c0 31 d2
31 f6 bf 11 00 20 01 4c 8d 90 d0 02 00 00 b8 38 00 00 00 0f 05 <48> 3d
00 f0 ff ff 77 41 41 89 c0 85 c0 75 2c 64 48 8b 04 25 10 00
 RSP: 002b:00007ffce70b84e8 EFLAGS: 00000246 ORIG_RAX: 0000000000000038
 RAX: ffffffffffffffda RBX: 00007f19bb37f040 RCX: 00007f19ba8eab57
 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000001200011
 RBP: 0000000000000000 R08: 0000000000000000 R09: 000056121bfe5730
 R10: 00007f19baa39450 R11: 0000000000000246 R12: 0000000000000001
 R13: 000056121bfe5950 R14: 00000000ffffffff R15: 0000561214b5d004
  </TASK>

The deadlock can be triggered using the following bpf and user programs.
============================================================

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <bpf/bpf_endian.h>

char LICENSE[] SEC("license") = "GPL";

struct {
        __uint(type, BPF_MAP_TYPE_STACK_TRACE);
        __uint(key_size, sizeof(u32));
        __uint(value_size, 127 * sizeof(struct bpf_stack_build_id));
        __uint(max_entries, 10000);
        __uint(map_flags, BPF_F_STACK_BUILD_ID);
} stackmap SEC(".maps");

#define KERN_STACKID_FLAGS (0 |  BPF_F_REUSE_STACKID)

SEC("lsm/task_alloc")
int test_prog1(void *ctx){
        bpf_printk("lsm0");
        __u32 stack =  bpf_get_stackid(ctx, &stackmap, KERN_STACKID_FLAGS);
        bpf_printk("lsm1");
        return 0;

}

SEC("kprobe/__pcpu_freelist_push+0x2ee")
int test_prog2(void *ctx){
        bpf_printk("kprobe");
        __u32 stack =  bpf_get_stackid(ctx, &stackmap, KERN_STACKID_FLAGS);
        return 0;

}
============================================================

#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <errno.h>
#include <stdbool.h>
#include <sys/resource.h>
#include <linux/bpf.h>
#include <bpf/libbpf.h>
#include <bpf/bpf.h>
#include <execinfo.h>
#include <signal.h>

static int libbpf_print_fn(enum libbpf_print_level level, const char
*format, va_list args)
{
        return vfprintf(stderr, format, args);
}

static volatile bool exiting = false;

static void sig_handler(int sig)
{
        exiting = true;
}

int main(int argc, char **argv)
{
        int err;

        libbpf_set_print(libbpf_print_fn);

        //handling ctrl+c
        signal(SIGINT, sig_handler);
        signal(SIGTERM, sig_handler);

const char *obj_file1 = "cp_user.bpf.o";
        struct bpf_object *obj1 = bpf_object__open_file(obj_file1, NULL);
        if (!obj1)
                return 1;

        err = bpf_object__load(obj1);
        if (err) {
                fprintf(stderr, "Error loading BPF target object\n");
                return 1;
        }

        struct bpf_program *prog1 =
bpf_object__find_program_by_name(obj1, "test_prog1");
        if (!prog1) {
                fprintf(stderr, "Error finding BPF program by title\n");
                goto cleanup;
        }

        struct bpf_program *prog2 =
bpf_object__find_program_by_name(obj1, "test_prog2");
        if (!prog2) {
                fprintf(stderr, "Error finding BPF program by title\n");
                goto cleanup;
        }

        struct bpf_link *link1 = bpf_program__attach_lsm(prog1);
        if (!link1) {
                fprintf(stderr, "Error attaching lsm\n");
                goto cleanup;
        }

        struct bpf_link *link2 = bpf_program__attach(prog2);
        if (!link2) {
                fprintf(stderr, "Error attaching kprobe\n");
                goto cleanup;
        }

        printf("Started successfully");

        while(!exiting) {
        }
        bpf_link__destroy(link1);
        bpf_link__destroy(link2);

cleanup:

        bpf_object__close(obj1);
        return 0;

}