Re: [RFC] Proposal: Static SECCOMP Policies

Here's the main.c from my prior reply, resent via neomutt (Outlook wraps
plaintext messages at 80 chars), in case you want to check it out without
having to fix up the random newlines:

// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2023 Motorola Mobility, Inc.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * Kernel module that hooks the vmalloc infrastructure to ensure that code
 * pages are not interleaved with data pages unless at a PMD level granularity.
 * Must be loaded prior to other kernel mechanisms leveraging code page
 * allocation, e.g. BPF, EROFS fixmap.
 */
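
/*
 * Rough sketch of the layout this module tries to enforce (the code
 * window bounds come from module_alloc_base, used below; on arm64 with
 * 4K pages a PMD block maps 2MiB):
 *
 * |<--- data --->|<--- code: [module_alloc_base, +2G) --->|<--- data --->|
 *
 * With all code allocations confined to one window, code and data never
 * share a PMD, so block mappings can carry uniform permissions.
 */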


#include <linux/kernel.h>
#include <linux/bpf.h>
#include <linux/mutex.h>
#include <linux/atomic.h>
#include <linux/highmem.h>
#include <linux/kprobes.h>
#include <linux/list.h>
#include <linux/mm_types.h>
#include <linux/module.h>
#include <linux/of.h>
#include <linux/of_platform.h>
#include <linux/pagewalk.h>
#include <linux/types.h>
#include <linux/moduleloader.h>
#include <linux/vmalloc.h>
#include <linux/gfp_types.h>
#include <linux/seccomp.h>
#include <asm/pgalloc.h>
#include <asm/ptrace.h>
#include <asm/patching.h>
#include <asm/module.h>
#include <asm/page.h>
#include <asm/seccomp.h>

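/*
 * struct action_cache and struct seccomp_filter below are mirrored from
 * their private definitions in kernel/seccomp.c (no header exports
 * them), so the debug ptrace hook can reach task->seccomp.filter->prog.
 * They must match the layout of the kernel this module is built against.
 */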
#ifdef SECCOMP_ARCH_NATIVE
/**
 * struct action_cache - per-filter cache of seccomp actions per
 * arch/syscall pair
 *
 * @allow_native: A bitmap where each bit represents whether the
 *                filter will always allow the syscall, for the
 *                native architecture.
 * @allow_compat: A bitmap where each bit represents whether the
 *                filter will always allow the syscall, for the
 *                compat architecture.
 */
struct action_cache {
        DECLARE_BITMAP(allow_native, SECCOMP_ARCH_NATIVE_NR);
#ifdef SECCOMP_ARCH_COMPAT
        DECLARE_BITMAP(allow_compat, SECCOMP_ARCH_COMPAT_NR);
#endif
};
#else
struct action_cache { };
#endif

struct seccomp_filter {
        refcount_t refs;
        refcount_t users;
        bool log;
        bool wait_killable_recv;
        struct action_cache cache;
        struct seccomp_filter *prev;
        struct bpf_prog *prog;
        struct notification *notif;
        struct mutex notify_lock;
        wait_queue_head_t wqh;
};



static void print_bpf_prog_aux(struct bpf_prog_aux *aux)
{
        printk("BPF Program Aux Details:\n");
        printk("Ref Count: %lld\n", atomic64_read(&aux->refcnt));
        printk("Used Map Count: %u\n", aux->used_map_cnt);
        printk("Used BTF Count: %u\n", aux->used_btf_cnt);
        printk("Max Context Offset: %u\n", aux->max_ctx_offset);
        printk("Max Packet Offset: %u\n", aux->max_pkt_offset);
        printk("Max TP Access: %u\n", aux->max_tp_access);
        printk("Stack Depth: %u\n", aux->stack_depth);
        printk("ID: %u\n", aux->id);
        printk("Function Count: %u\n", aux->func_cnt);
        printk("Function Index: %u\n", aux->func_idx);
        printk("Attach BTF ID: %u\n", aux->attach_btf_id);
        printk("Context Arg Info Size: %u\n", aux->ctx_arg_info_size);
        printk("Max Read-Only Access: %u\n", aux->max_rdonly_access);
        printk("Max Read-Write Access: %u\n", aux->max_rdwr_access);
        printk("Attach BTF: %p\n", aux->attach_btf);
        printk("Context Arg Info: %p\n", aux->ctx_arg_info);
        printk("DST Mutex: %p\n", &aux->dst_mutex);
        printk("DST Program: %p\n", aux->dst_prog);
        printk("DST Trampoline: %p\n", aux->dst_trampoline);
        printk("Saved DST Program Type: %d\n", aux->saved_dst_prog_type);
        printk("Saved DST Attach Type: %d\n", aux->saved_dst_attach_type);
        printk("Verifier Zero Extension: %u\n", aux->verifier_zext);
        printk("Attach BTF Trace: %u\n", aux->attach_btf_trace);
        printk("Function Proto Unreliable: %u\n", aux->func_proto_unreliable);
        printk("Sleepable: %u\n", aux->sleepable);
        printk("Tail Call Reachable: %u\n", aux->tail_call_reachable);
        printk("XDP Has Frags: %u\n", aux->xdp_has_frags);
        printk("Attach Func Proto: %p\n", aux->attach_func_proto);
        printk("Attach Func Name: %s\n", aux->attach_func_name);
        printk("Functions: %p\n", aux->func);
        printk("JIT Data: %p\n", aux->jit_data);
        printk("Poke Table: %p\n", aux->poke_tab);
        printk("Kfunc Table: %p\n", aux->kfunc_tab);
        printk("Kfunc BTF Table: %p\n", aux->kfunc_btf_tab);
        printk("Size Poke Table: %u\n", aux->size_poke_tab);
        printk("Ksym: %p\n", &aux->ksym);
        printk("Operations: %p\n", aux->ops);
        printk("Used Maps: %p\n", aux->used_maps);
        printk("Used Maps Mutex: %p\n", &aux->used_maps_mutex);
        printk("Used BTFs: %p\n", aux->used_btfs);
        printk("Program: %p\n", aux->prog);
        printk("User: %p\n", aux->user);
        printk("Load Time: %llu\n", aux->load_time);
        printk("Verified Instructions: %u\n", aux->verified_insns);
        printk("Cgroup Attach Type: %d\n", aux->cgroup_atype);
        printk("Cgroup Storage: %p\n", aux->cgroup_storage);
        printk("Name: %s\n", aux->name);
}

static void print_bpf_prog_insnsi(struct bpf_insn *insns, u32 len)
{
        u32 i;

        for (i = 0; i < len; i++) {
                const struct bpf_insn *insn = &insns[i];

                printk("BPF INSN %016llx\n", *((const u64 *)insn));
        }
}

static void print_bpf_prog(struct bpf_prog *prog)
{
        printk("BPF Program Details:\n");
        printk("Pages: %u\n", prog->pages);
        printk("JITed: %u\n", prog->jited);
        printk("JIT Requested: %u\n", prog->jit_requested);
        printk("GPL Compatible: %u\n", prog->gpl_compatible);
        printk("Control Block Access: %u\n", prog->cb_access);
        printk("DST Needed: %u\n", prog->dst_needed);
        printk("Blinding Requested: %u\n", prog->blinding_requested);
        printk("Blinded: %u\n", prog->blinded);
        printk("Is Function: %u\n", prog->is_func);
        printk("Kprobe Override: %u\n", prog->kprobe_override);
        printk("Has Callchain Buffer: %u\n", prog->has_callchain_buf);
        printk("Enforce Expected Attach Type: %u\n", prog->enforce_expected_attach_type);
        printk("Call Get Stack: %u\n", prog->call_get_stack);
        printk("Call Get Func IP: %u\n", prog->call_get_func_ip);
        printk("Timestamp Type Access: %u\n", prog->tstamp_type_access);
        printk("Type: %d\n", prog->type);
        printk("Expected Attach Type: %d\n", prog->expected_attach_type);
        printk("Length: %u\n", prog->len);
        printk("JITed Length: %u\n", prog->jited_len);
        printk("Tag: ");
        for (int i = 0; i < BPF_TAG_SIZE; i++) {
                printk("%02x", prog->tag[i]);
        }
        printk("\n");
        printk("Stats: %p\n", prog->stats);
        printk("Active: %p\n", prog->active);
        printk("AUX FIELDS:\n");
        print_bpf_prog_aux(prog->aux);
        print_bpf_prog_insnsi(prog->insnsi, prog->len);
}


/* Functions we need for patching dynamic code allocations */
typedef void *(*module_alloc_t)(unsigned long size);
module_alloc_t module_alloc_ind;
typedef void (*module_memfree_t)(void *module_region);
module_memfree_t module_memfree_ind;

/* TODO: actually we could probably just include "net/bpf_jit.h" */
typedef int (*aarch64_insn_patch_text_nosync_t)(void *addr, u32 insn);
aarch64_insn_patch_text_nosync_t aarch64_insn_patch_text_nosync_ind;
typedef u32 (*aarch64_insn_gen_branch_imm_t)(unsigned long pc,
                                             unsigned long addr,
                                             enum aarch64_insn_branch_type type);
aarch64_insn_gen_branch_imm_t aarch64_insn_gen_branch_imm_ind;
typedef u32 (*aarch64_insn_gen_hint_t)(enum aarch64_insn_hint_cr_op op);
aarch64_insn_gen_hint_t aarch64_insn_gen_hint_ind;
typedef u32 (*aarch64_insn_gen_branch_reg_t)(
        enum aarch64_insn_register reg, enum aarch64_insn_branch_type type);
aarch64_insn_gen_branch_reg_t aarch64_insn_gen_branch_reg_ind;
typedef void *(*__vmalloc_node_range_t)(unsigned long size, unsigned long align,
                                        unsigned long start, unsigned long end,
                                        gfp_t gfp_mask, pgprot_t prot,
                                        unsigned long vm_flags, int node,
                                        const void *caller);
__vmalloc_node_range_t __vmalloc_node_range_ind;

/* Used for reworking the kprobe allocator */
typedef int (*collect_garbage_slots_t)(struct kprobe_insn_cache *c);
collect_garbage_slots_t collect_garbage_slots_ind;

static struct kprobe kallsyms_lookup_name_kp = {
        .symbol_name = "kallsyms_lookup_name",
        .addr = NULL,
};
typedef unsigned long (*kallsyms_lookup_name_t)(const char *name);
kallsyms_lookup_name_t kallsyms_lookup_name_ind;

/* Functions we are patching */
static struct kprobe alloc_vmap_area_kp = {
        .symbol_name = "alloc_vmap_area",
        .addr = NULL,
};

/* DEBUG: bpf allocation printing */
// static struct kprobe bpf_int_jit_compile_kp = {
//         .symbol_name = "bpf_int_jit_compile",
//         .addr = NULL,
// };
static struct kprobe ptrace_request_kp = {
        .symbol_name = "ptrace_request",
        .addr = NULL,
};
/* END DEBUG */

/* Static variables that must be manually accessed for definition */
u64 module_alloc_base;
struct kprobe_insn_cache *kprobe_insn_slots_ptr;

/**
 * get_kp_addr - resolve a symbol's address by briefly registering a kprobe
 * @kp: kprobe initialized with the target symbol_name
 *
 * register_kprobe() resolves kp->addr via kallsyms; we grab that address
 * and immediately unregister. This bootstraps access to
 * kallsyms_lookup_name(), which is no longer exported to modules.
 *
 * TODO: comment the rest of the file.
 */
static __always_inline void *get_kp_addr(struct kprobe *kp)
{
        void *res = NULL;

        if (register_kprobe(kp)) {
                pr_err("Error: moto_org_mem failed to get kp addr for %s\n",
                       kp->symbol_name);
                return NULL;
        }
        res = kp->addr;
        unregister_kprobe(kp);
        return res;
}

static void *bpf_jit_alloc_exec_handler(unsigned long size)
{
        return module_alloc_ind(size);
}

static void bpf_jit_free_exec_handler(void *addr)
{
        module_memfree_ind(addr);
}

static u64 bpf_jit_alloc_exec_limit_handler(void)
{
        return MODULES_END - MODULES_VADDR;
}

static void *alloc_insn_page_handler(void)
{
        return __vmalloc_node_range_ind(PAGE_SIZE, 1, module_alloc_base,
                                        module_alloc_base + SZ_2G, GFP_KERNEL,
                                        PAGE_KERNEL_ROX, VM_FLUSH_RESET_PERMS,
                                        NUMA_NO_NODE,
                                        __builtin_return_address(0));
}

static bool allocation_balance = false;

/**
 * alloc_vmap_area_handler - adjusts vstart/vend to not interleave code/data
 *
 * Right now, the vmalloc infrastructure does the following:
 * |<-----data----->||<-----code and data pages----->||<-----data----->|
 * Maintainers are understandably reluctant to touch vmalloc internals for
 * fear of breaking everything, so we provide an open-source workaround in
 * the hope that these fixes eventually make their way into the mainline
 * kernel.
 *
 * We adjust the parameters of the call to avoid the code memory range by
 * selecting the lower half; then, in a separate post handler, we check
 * whether the allocation failed and, if so, rerun the allocation with the
 * upper half.
 *
 * TODO: we need to remove the flip-flopping and properly segment the
 * memory here, but it is not clear how to do this without modifying core
 * vmalloc infrastructure. See upstream patch here:
 * https://lore.kernel.org/all/20240423095843.446565600-1-mbland@xxxxxxxxxxxx/#t
 *
 * Parameters are passed on arm64 Linux following the AAPCS64 calling
 * convention, so from the signature of alloc_vmap_area(size, align,
 * vstart, vend, ...) we can infer which register holds each argument:
 * size arrives in x0, vstart in x2 and vend in x3, which is exactly what
 * the handler below reads and rewrites.
 * https://github.com/ARM-software/abi-aa/blob/main/aapcs64/aapcs64.rst
 */
static int alloc_vmap_area_handler(struct kprobe *kp, struct pt_regs *regs)
{
        unsigned long size;
        unsigned long vstart;
        size = regs->regs[0];
        vstart = regs->regs[2];
        if (vstart == VMALLOC_START) { /* We are attempting to vmalloc data */
                /* Everything is fine, do nothing */
                if (module_alloc_base + SZ_2G <= VMALLOC_START ||
                    module_alloc_base > VMALLOC_END)
                        return 0;

                allocation_balance = !allocation_balance;

                /* Not enough room below, else if not enough room above */
                if (module_alloc_base - VMALLOC_START < size)
                        allocation_balance = true;
                else if (VMALLOC_END - (module_alloc_base + SZ_2G) < size)
                        allocation_balance = false;

                /* Alternate between allocating above and below the code
                 * region; since these are virtual addresses it does not
                 * really matter which side we pick. */
                if (allocation_balance) {
                        regs->regs[2] = module_alloc_base + SZ_2G;
                } else {
                        regs->regs[3] = module_alloc_base;
                }
        }

        return 0;
}

/* DEBUG: Analyze allocated BPF programs */
// static int bpf_int_jit_compile_handler(struct kprobe *kp, struct pt_regs *regs)
// {
//         // struct bpf_prog *prog = (struct bpf_prog *)regs->regs[0];
//         // print_bpf_prog(prog);
//         return 0;
// }
// 
static int ptrace_request_handler(struct kprobe *kp, struct pt_regs *regs)
{
        struct task_struct *task = (struct task_struct *)regs->regs[0];
        long request = regs->regs[1];
        unsigned long addr = regs->regs[2];
        struct seccomp_filter *filter;

        if (request != 0x420c) { /* PTRACE_SECCOMP_GET_FILTER */
                return 0;
        }
        if (addr != 13371337) { /* arbitrary magic cookie to trigger the dump */
                printk("waiting for regs ... %llx\n", regs->regs[1]);
                return 0;
        }

        if (!task) {
                printk("ptrace_request_handler no task\n");
                return 0;
        }

        filter = READ_ONCE(task->seccomp.filter);
        printk("TASK PID %d or %d\n", task->pid, pid_vnr(task_pgrp(task)));
        if (!filter) {
                printk("ptrace_request_handler no filter\n");
                return 0;
        }
        if (filter->prog)
                print_bpf_prog(filter->prog);

        return 0;
}
/* END DEBUG */
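
/*
 * Userspace trigger for the dump above (a sketch: 0x420c is
 * PTRACE_SECCOMP_GET_FILTER, 13371337 is just the cookie the handler
 * checks; the target must already be ptrace-attached and stopped):
 *
 *   ptrace(PTRACE_ATTACH, pid, NULL, NULL);
 *   waitpid(pid, NULL, 0);
 *   ptrace(0x420c, pid, (void *)13371337, NULL);
 *   ptrace(PTRACE_DETACH, pid, NULL, NULL);
 */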


static __always_inline void patch_jump_to_handler(void *faddr, void *helper)
{
        u32 insn;

        insn = aarch64_insn_gen_branch_imm_ind((unsigned long)faddr,
                                               (unsigned long)helper,
                                               AARCH64_INSN_BRANCH_NOLINK);
        aarch64_insn_patch_text_nosync_ind(faddr, insn);
}

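/* Mirror of the private struct kprobe_insn_page in kernel/kprobes.c,
 * needed so free_insn_pages() below can walk and release the slot
 * pages. */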
struct kprobe_insn_page {
        struct list_head list;
        kprobe_opcode_t *insns; /* Page of instruction slots */
        struct kprobe_insn_cache *cache;
        int nused;
        int ngarbage;
        char slot_used[];
};

static void free_insn_pages(struct kprobe_insn_cache *kic)
{
        struct kprobe_insn_page *kip, *tmp;
        unsigned int i;

        /* TODO: Since the slot array is not protected by rcu, we need a
         * mutex, but we should also be the only thing running that is
         * touching the kprobes. Use the _safe iterator because we delete
         * entries as we walk the list. */
        list_for_each_entry_safe(kip, tmp, &kic->pages, list) {
                for (i = 0; i < kip->nused; i++)
                        kip->slot_used[i] = 0;
                kip->nused = 0;
                list_del_rcu(&kip->list);
                synchronize_rcu();
                kip->cache->free(kip->insns);
                kfree(kip);
        }
}

/**
 * mod_init - resolve unexported symbols and patch the code-page allocators
 *
 * TODO: fail module load if any of the lookups or patches below fails.
 */
static int __init mod_init(void)
{
        void *bpf_jit_alloc_exec_addr = 0;
        void *bpf_jit_free_exec_addr = 0;
        void *bpf_jit_alloc_exec_limit_addr = 0;
        void *alloc_insn_page_addr = 0;
        kallsyms_lookup_name_ind =
                (kallsyms_lookup_name_t)get_kp_addr(&kallsyms_lookup_name_kp);

        module_alloc_ind =
                (module_alloc_t)kallsyms_lookup_name_ind("module_alloc");
        module_memfree_ind =
                (module_memfree_t)kallsyms_lookup_name_ind("module_memfree");
        __vmalloc_node_range_ind =
                (__vmalloc_node_range_t)kallsyms_lookup_name_ind(
                        "__vmalloc_node_range");
        aarch64_insn_patch_text_nosync_ind =
                (aarch64_insn_patch_text_nosync_t)kallsyms_lookup_name_ind(
                        "aarch64_insn_patch_text_nosync");
        aarch64_insn_gen_branch_imm_ind =
                (aarch64_insn_gen_branch_imm_t)kallsyms_lookup_name_ind(
                        "aarch64_insn_gen_branch_imm");
        aarch64_insn_gen_hint_ind =
                (aarch64_insn_gen_hint_t)kallsyms_lookup_name_ind(
                        "aarch64_insn_gen_hint");
        aarch64_insn_gen_branch_reg_ind =
                (aarch64_insn_gen_branch_reg_t)kallsyms_lookup_name_ind(
                        "aarch64_insn_gen_branch_reg");

        collect_garbage_slots_ind =
                (collect_garbage_slots_t)kallsyms_lookup_name_ind(
                        "collect_garbage_slots");

        bpf_jit_alloc_exec_addr =
                (void *)kallsyms_lookup_name_ind("bpf_jit_alloc_exec");
        bpf_jit_free_exec_addr =
                (void *)kallsyms_lookup_name_ind("bpf_jit_free_exec");
        bpf_jit_alloc_exec_limit_addr =
                (void *)kallsyms_lookup_name_ind("bpf_jit_alloc_exec_limit");
        alloc_insn_page_addr =
                (void *)kallsyms_lookup_name_ind("alloc_insn_page");

        module_alloc_base =
                *((u64 *)kallsyms_lookup_name_ind("module_alloc_base"));

        patch_jump_to_handler(bpf_jit_alloc_exec_addr,
                              bpf_jit_alloc_exec_handler);
        patch_jump_to_handler(bpf_jit_free_exec_addr,
                              bpf_jit_free_exec_handler);
        patch_jump_to_handler(bpf_jit_alloc_exec_limit_addr,
                              bpf_jit_alloc_exec_limit_handler);
        patch_jump_to_handler(alloc_insn_page_addr, alloc_insn_page_handler);

        /*
         * Under the hood, arm64 calls __get_insn_slot to generate memory pages for
         * kprobes, and these memory pages *supposedly* access an indirect pointer to
         * their allocation function through kprobe_insn_slots. Because we allocated
         * a kprobe in order to access kallsyms_lookup_name, one page is already allocated.
         * However, even kprobe garbage collection cowardly refuses to kill the last page,
         * so we have our own free routine that nixes that last survivor.
         */
        kprobe_insn_slots_ptr =
                (struct kprobe_insn_cache *)kallsyms_lookup_name_ind(
                        "kprobe_insn_slots");
        free_insn_pages(kprobe_insn_slots_ptr);

        alloc_vmap_area_kp.pre_handler = alloc_vmap_area_handler;
        if (register_kprobe(&alloc_vmap_area_kp)) {
                pr_err("moto_org_mem.ko failed to hook alloc_vmap_area!\n");
                return -EACCES;
        }

        /* DEBUG */
        // bpf_int_jit_compile_kp.pre_handler = bpf_int_jit_compile_handler;
        // if (register_kprobe(&bpf_int_jit_compile_kp)) {
        //         pr_err("moto_org_mem.ko failed to hook bpf_int_jit_compile!\n");
        //         return -EACCES;
        // }

        ptrace_request_kp.pre_handler = ptrace_request_handler;
        if (register_kprobe(&ptrace_request_kp)) {
                pr_err("moto_org_mem.ko failed to hook ptrace_request_kp!\n");
                return -EACCES;
        }

        /* END DEBUG */
        pr_info("moto_org_mem loaded!\n");

        return 0;
}

static void __exit mod_exit(void)
{
        /* The branches patched into the allocators are not reverted, so
         * unloading is not really safe; at minimum, drop our kprobes. */
        unregister_kprobe(&alloc_vmap_area_kp);
        unregister_kprobe(&ptrace_request_kp);
}

module_init(mod_init);
module_exit(mod_exit);

MODULE_LICENSE("GPL v2");
MODULE_AUTHOR("Maxwell Bland <mbland@xxxxxxxxxxxx>");
MODULE_DESCRIPTION("Organizes vmalloc memory so that code pages are not "
                   "interleaved with data pages.");




