On 9/7/19 2:40 PM, Alan Maguire wrote:
> packet capture is especially valuable in tracing contexts, so
> extend bpf_pcap helper to take a tracing-derived skb pointer
> as an argument.
>
> In the case of tracing programs, the starting protocol
> (corresponding to libpcap DLT_* values; 1 for Ethernet, 12 for
> IP, etc) needs to be specified and should reflect the protocol
> type which is pointed to by the skb->data pointer; i.e. the
> start of the packet. This can be derived in a limited set of cases,
> but should be specified where possible. For skb and xdp programs
> this protocol will nearly always be 1 (BPF_PCAP_TYPE_ETH).
>
> Example usage for a tracing program, where we use a
> struct bpf_pcap_hdr array map to pass in preferences for
> protocol and max len:
>
> struct bpf_map_def SEC("maps") pcap_conf_map = {
>         .type = BPF_MAP_TYPE_ARRAY,
>         .key_size = sizeof(int),
>         .value_size = sizeof(struct bpf_pcap_hdr),
>         .max_entries = 1,
> };
>
> struct bpf_map_def SEC("maps") pcap_map = {
>         .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
>         .key_size = sizeof(int),
>         .value_size = sizeof(int),
>         .max_entries = 1024,
> };
>
> SEC("kprobe/kfree_skb")
> int probe_kfree_skb(struct pt_regs *ctx)
> {
>         struct bpf_pcap_hdr *conf;
>         int key = 0;
>
>         conf = bpf_map_lookup_elem(&pcap_conf_map, &key);
>         if (!conf)
>                 return 0;
>
>         bpf_pcap((void *)PT_REGS_PARM1(ctx), conf->cap_len, &pcap_map,
>                  conf->protocol, BPF_F_CURRENT_CPU);
>         return 0;
> }
>
> Signed-off-by: Alan Maguire <alan.maguire@xxxxxxxxxx>
> ---

[...]

> @@ -2977,6 +2992,8 @@ enum bpf_func_id {
>
>  /* BPF_FUNC_pcap flags */
>  #define BPF_F_PCAP_ID_MASK      0xffffffffffff
> +#define BPF_F_PCAP_ID_IIFINDEX  (1ULL << 48)
> +#define BPF_F_PCAP_STRICT_TYPE  (1ULL << 56)
>
>  /* Mode for BPF_FUNC_skb_adjust_room helper. */
>  enum bpf_adj_room_mode {
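A note on the flag layout as I read it: the low 48 bits of the flags
argument carry an id value (the incoming ifindex when
BPF_F_PCAP_ID_IIFINDEX is set) and the behaviour bits live above the
mask. So, if I am not mistaken, a caller wanting to capture only
packets that arrived on a given interface and to bail on protocol
mismatches would combine them like this (ifindex here is a hypothetical
variable):

        bpf_pcap(skb, cap_len, &pcap_map, BPF_PCAP_TYPE_ETH,
                 BPF_F_PCAP_ID_IIFINDEX | BPF_F_PCAP_STRICT_TYPE |
                 (u64)ifindex);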
> diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
> index ca1255d..311883b 100644
> --- a/kernel/trace/bpf_trace.c
> +++ b/kernel/trace/bpf_trace.c
> @@ -13,6 +13,8 @@
>  #include <linux/kprobes.h>
>  #include <linux/syscalls.h>
>  #include <linux/error-injection.h>
> +#include <linux/skbuff.h>
> +#include <linux/ip.h>
>
>  #include <asm/tlb.h>
>
> @@ -530,6 +532,216 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
>  	return __bpf_perf_event_output(regs, map, flags, sd);
>  }
>
> +/* Essentially just skb_copy_bits() using probe_kernel_read() where needed.
> + */
> +static unsigned long bpf_trace_skb_copy(void *tobuf, const void *from,
> +					unsigned long offset,
> +					unsigned long len)
> +{
> +	const struct sk_buff *frag_iterp, *skb = from;
> +	struct skb_shared_info *shinfop, shinfo;
> +	struct sk_buff frag_iter;
> +	unsigned long copy, start;
> +	void *to = tobuf;
> +	int i, ret;
> +
> +	start = skb_headlen(skb);
> +
> +	copy = start - offset;
> +	if (copy > 0) {
> +		if (copy > len)
> +			copy = len;
> +		ret = probe_kernel_read(to, skb->data, copy);
> +		if (unlikely(ret < 0))
> +			goto out;
> +		len -= copy;
> +		if (len == 0)
> +			return 0;
> +		offset += copy;
> +		to += copy;
> +	}
> +
> +	if (skb->data_len == 0)
> +		goto out;
> +
> +	shinfop = skb_shinfo(skb);
> +
> +	ret = probe_kernel_read(&shinfo, shinfop, sizeof(shinfo));
> +	if (unlikely(ret < 0))
> +		goto out;
> +
> +	if (shinfo.nr_frags > MAX_SKB_FRAGS) {
> +		ret = -EINVAL;
> +		goto out;
> +	}
> +	for (i = 0; i < shinfo.nr_frags; i++) {
> +		skb_frag_t *f = &shinfo.frags[i];
> +		int end;
> +
> +		if (start > offset + len) {
> +			ret = -E2BIG;
> +			goto out;
> +		}
> +
> +		end = start + skb_frag_size(f);
> +		copy = end - offset;
> +		if (copy > 0) {
> +			u32 poff, p_len, copied;
> +			struct page *p;
> +			u8 *vaddr;
> +
> +			if (copy > len)
> +				copy = len;
> +
> +			skb_frag_foreach_page(f,
> +					      skb_frag_off(f) + offset - start,
> +					      copy, p, poff, p_len, copied) {
> +
> +				vaddr = kmap_atomic(p);
> +				ret = probe_kernel_read(to + copied,
> +							vaddr + poff, p_len);
> +				kunmap_atomic(vaddr);
> +
> +				if (unlikely(ret < 0))
> +					goto out;
> +			}
> +			len -= copy;
> +			if (len == 0)
> +				return 0;
> +			offset += copy;
> +			to += copy;
> +		}
> +		start = end;
> +	}
> +
> +	for (frag_iterp = shinfo.frag_list; frag_iterp;
> +	     frag_iterp = frag_iter.next) {
> +		int end;
> +
> +		if (start > offset + len) {
> +			ret = -E2BIG;
> +			goto out;
> +		}
> +		ret = probe_kernel_read(&frag_iter, frag_iterp,
> +					sizeof(frag_iter));
> +		if (ret)
> +			goto out;
> +
> +		end = start + frag_iter.len;
> +		copy = end - offset;
> +		if (copy > 0) {
> +			if (copy > len)
> +				copy = len;
> +			ret = bpf_trace_skb_copy(to, &frag_iter,
> +						 offset - start,
> +						 copy);
> +			if (ret)
> +				goto out;
> +
> +			len -= copy;
> +			if (len == 0)
> +				return 0;
> +			offset += copy;
> +			to += copy;
> +		}
> +		start = end;
> +	}
> +out:
> +	if (ret)
> +		memset(tobuf, 0, len);
> +
> +	return ret;
> +}

For the net-side bpf_perf_event_output, we have

static unsigned long bpf_skb_copy(void *dst_buff, const void *skb,
                                  unsigned long off, unsigned long len)
{
        void *ptr = skb_header_pointer(skb, off, len, dst_buff);

        if (unlikely(!ptr))
                return len;
        if (ptr != dst_buff)
                memcpy(dst_buff, ptr, len);

        return 0;
}

BPF_CALL_5(bpf_skb_event_output, struct sk_buff *, skb, struct bpf_map *, map,
           u64, flags, void *, meta, u64, meta_size)
{
        u64 skb_size = (flags & BPF_F_CTXLEN_MASK) >> 32;

        if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK)))
                return -EINVAL;
        if (unlikely(skb_size > skb->len))
                return -EFAULT;

        return bpf_event_output(map, flags, meta, meta_size, skb, skb_size,
                                bpf_skb_copy);
}

It does not really consider outputting all the frags. I understand that
to get truly all the packet data, the frags should be considered, but it
seems we did not do that before? I am wondering whether we need to do it
here. If we indeed do not need to handle frags here, I think maybe
bpf_probe_read() in an existing bpf kprobe program should be enough, and
we would not need this helper?
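To illustrate what I mean, here is a rough, untested sketch of the
bpf_probe_read() approach: a fixed snap length, linear skb data only
(no frags), reusing the pcap_map and probe point from the commit
message example. struct pcap_sample and the 256-byte window are made
up here, and building this needs kernel headers for struct sk_buff:

#include <linux/bpf.h>
#include <linux/ptrace.h>
#include <linux/skbuff.h>
#include "bpf_helpers.h"

struct pcap_sample {
        __u32 cap_len;
        __u8 pkt[256];                  /* fixed snap length */
};

struct bpf_map_def SEC("maps") pcap_map = {
        .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
        .key_size = sizeof(int),
        .value_size = sizeof(int),
        .max_entries = 1024,
};

SEC("kprobe/kfree_skb")
int probe_kfree_skb(struct pt_regs *ctx)
{
        struct sk_buff skb;
        struct pcap_sample s = {};

        /* copy the skb struct itself out of kernel memory */
        if (bpf_probe_read(&skb, sizeof(skb), (void *)PT_REGS_PARM1(ctx)))
                return 0;

        s.cap_len = skb.len < sizeof(s.pkt) ? skb.len : sizeof(s.pkt);

        /* linear data only; reads a full window regardless of headlen,
         * so a real tool would clamp to skb_headlen() instead
         */
        if (bpf_probe_read(s.pkt, sizeof(s.pkt), skb.data))
                return 0;

        bpf_perf_event_output(ctx, &pcap_map, BPF_F_CURRENT_CPU,
                              &s, sizeof(s));
        return 0;
}

Of course this loses the protocol heuristics and header preparation the
helper does, but it shows the plumbing is already available.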
> +
> +/* Derive protocol for some of the easier cases. For tracing, a probe point
> + * may be dealing with packets in various states. Common cases are IP
> + * packets prior to adding MAC header (_PCAP_TYPE_IP) and a full packet
> + * (_PCAP_TYPE_ETH). For other cases the caller must specify the
> + * protocol they expect. Other heuristics for packet identification
> + * should be added here as needed, since determining the packet type
> + * ensures we do not capture packets that fail to match the desired
> + * pcap type in BPF_F_PCAP_STRICT_TYPE mode.
> + */
> +static inline int bpf_skb_protocol_get(struct sk_buff *skb)
> +{
> +	switch (htons(skb->protocol)) {
> +	case ETH_P_IP:
> +	case ETH_P_IPV6:
> +		if (skb_network_header(skb) == skb->data)
> +			return BPF_PCAP_TYPE_IP;
> +		else
> +			return BPF_PCAP_TYPE_ETH;
> +	default:
> +		return BPF_PCAP_TYPE_UNSET;
> +	}
> +}
> +
> +BPF_CALL_5(bpf_trace_pcap, void *, data, u32, size, struct bpf_map *, map,
> +	   int, protocol_wanted, u64, flags)

Up to now, for helpers, the verifier has had a way to verify that a
helper is used properly with regard to the context. For example, for
the xdp version of perf_event_output, with the helper prototype

BPF_CALL_5(bpf_xdp_event_output, struct xdp_buff *, xdp,
           struct bpf_map *, map, u64, flags, void *, meta,
           u64, meta_size)

the verifier is able to guarantee that the first parameter has the
correct type xdp_buff, not something produced by a type cast, because
of

        .arg1_type = ARG_PTR_TO_CTX,

For this helper, below we have

        .arg1_type = ARG_ANYTHING,

so the type is not really enforced. Bringing in BTF can help, but
type-name matching is typically fragile.
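For reference, the net-side proto pins the first argument to the
program context; quoting net/core/filter.c from memory, so please
double-check:

static const struct bpf_func_proto bpf_xdp_event_output_proto = {
        .func           = bpf_xdp_event_output,
        .gpl_only       = true,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_PTR_TO_CTX,
        .arg2_type      = ARG_CONST_MAP_PTR,
        .arg3_type      = ARG_ANYTHING,
        .arg4_type      = ARG_PTR_TO_MEM,
        .arg5_type      = ARG_CONST_SIZE_OR_ZERO,
};

With ARG_PTR_TO_CTX the verifier rejects any call where r1 is not the
program's context pointer; with ARG_ANYTHING any initialized value is
accepted, so a buggy program can hand the helper an arbitrary address.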
> +{
> +	struct bpf_pcap_hdr pcap;
> +	struct sk_buff skb;
> +	int protocol;
> +	int ret;
> +
> +	if (unlikely(flags & ~(BPF_F_PCAP_ID_IIFINDEX | BPF_F_PCAP_ID_MASK |
> +			       BPF_F_PCAP_STRICT_TYPE)))
> +		return -EINVAL;
> +
> +	ret = probe_kernel_read(&skb, data, sizeof(skb));
> +	if (unlikely(ret < 0))
> +		return ret;
> +
> +	/* Sanity check skb len in case we get bogus data. */
> +	if (unlikely(!skb.len))
> +		return -ENOENT;
> +	if (unlikely(skb.len > GSO_MAX_SIZE || skb.data_len > skb.len))
> +		return -E2BIG;
> +
> +	protocol = bpf_skb_protocol_get(&skb);
> +
> +	if (protocol_wanted == BPF_PCAP_TYPE_UNSET) {
> +		/* If we cannot determine protocol type, bail. */
> +		if (protocol == BPF_PCAP_TYPE_UNSET)
> +			return -EPROTO;
> +	} else {
> +		/* if we determine protocol type, and it's not what we asked
> +		 * for _and_ we are in strict mode, bail. Otherwise we assume
> +		 * the packet is the requested protocol type and drive on.
> +		 */
> +		if (flags & BPF_F_PCAP_STRICT_TYPE &&
> +		    protocol != BPF_PCAP_TYPE_UNSET &&
> +		    protocol != protocol_wanted)
> +			return -EPROTO;
> +		protocol = protocol_wanted;
> +	}
> +
> +	/* If we specified a matching incoming ifindex, bail if not a match.
> +	 */
> +	if (flags & BPF_F_PCAP_ID_IIFINDEX) {
> +		int iif = flags & BPF_F_PCAP_ID_MASK;
> +
> +		if (iif && skb.skb_iif != iif)
> +			return -ENOENT;
> +	}
> +
> +	ret = bpf_pcap_prepare(protocol, size, skb.len, flags, &pcap);
> +	if (ret)
> +		return ret;
> +
> +	return bpf_event_output(map, BPF_F_CURRENT_CPU, &pcap, sizeof(pcap),
> +				&skb, pcap.cap_len, bpf_trace_skb_copy);
> +}
> +
> +static const struct bpf_func_proto bpf_trace_pcap_proto = {
> +	.func		= bpf_trace_pcap,
> +	.gpl_only	= true,
> +	.ret_type	= RET_INTEGER,
> +	.arg1_type	= ARG_ANYTHING,
> +	.arg2_type	= ARG_ANYTHING,
> +	.arg3_type	= ARG_CONST_MAP_PTR,
> +	.arg4_type	= ARG_ANYTHING,
> +	.arg5_type	= ARG_ANYTHING,
> +};
> +
>  BPF_CALL_0(bpf_get_current_task)
>  {
>  	return (long) current;
> @@ -709,6 +921,8 @@ static void do_bpf_send_signal(struct irq_work *entry)
>  #endif
>  	case BPF_FUNC_send_signal:
>  		return &bpf_send_signal_proto;
> +	case BPF_FUNC_pcap:
> +		return &bpf_trace_pcap_proto;
>  	default:
>  		return NULL;
>  	}
>
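One more thought on the consumer side. Since bpf_event_output() emits
the bpf_pcap_hdr meta followed by cap_len packet bytes, userspace can
drain the perf map with libbpf's perf_buffer. A rough sketch, untested,
with error handling and the actual pcap-file writing omitted; map_fd is
the fd of pcap_map, and struct bpf_pcap_hdr comes from the new uapi
header in this series:

#include <bpf/libbpf.h>

/* raw sample layout: struct bpf_pcap_hdr, then hdr->cap_len bytes */
static void handle_sample(void *ctx, int cpu, void *data, __u32 size)
{
        struct bpf_pcap_hdr *hdr = data;
        const unsigned char *pkt = (unsigned char *)data + sizeof(*hdr);

        /* write hdr->cap_len bytes at pkt as one pcap record here */
}

int drain(int map_fd)
{
        struct perf_buffer_opts opts = { .sample_cb = handle_sample };
        struct perf_buffer *pb = perf_buffer__new(map_fd, 8, &opts);

        if (libbpf_get_error(pb))
                return -1;
        while (perf_buffer__poll(pb, 100) >= 0)
                ;
        perf_buffer__free(pb);
        return 0;
}

If the helper survives the frags/verifier questions above, it would be
good to see something like this in the selftests so the meta+payload
layout is pinned down.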