This change allows CONFIG_SECCOMP to make use of BPF programs for
user-controlled system call filtering (as shown in this patch series).

To minimize the impact on existing BPF evaluation, function pointer
use must be declared at sk_chk_filter-time.  This allows ancillary
load instructions to be generated that use the function pointer rather
than adding _any_ code to the existing LD_* instruction paths.

Crude performance numbers using udpflood -l 10000000 against dummy0:
3 trials for the baseline, 3 with tcpdump attached.  Averaged, then
differenced.  Trials with hard-to-believe results were repeated at
least a couple more times.  (A negative slowdown per call is a
speedup.)

* x86 32-bit (Atom N570 @ 1.66 GHz 2 core HT) [stackprot]:
  - Without: 94.05s - 76.36s = 17.68s
  - With:    86.22s - 73.30s = 12.92s
  - Slowdown per call: -476 nanoseconds

* x86 32-bit (Atom N570 @ 1.66 GHz 2 core HT) [no stackprot]:
  - Without: 92.06s - 77.81s = 14.25s
  - With:    91.77s - 76.91s = 14.86s
  - Slowdown per call: +61 nanoseconds

* x86 64-bit (Atom N570 @ 1.66 GHz 2 core HT) [stackprot]:
  - Without: 122.58s - 99.54s = 23.04s
  - With:    115.52s - 98.99s = 16.53s
  - Slowdown per call: -651 nanoseconds

* x86 64-bit (Atom N570 @ 1.66 GHz 2 core HT) [no stackprot]:
  - Without: 114.95s - 91.92s = 23.03s
  - With:    110.47s - 90.79s = 19.68s
  - Slowdown per call: -335 nanoseconds

Of these, only the x86-32-nossp case shows a slowdown, which makes
sense: added register pressure always makes x86-32 sad.  If this is a
concern, I could change the calling approach for bpf_run_filter to see
if I can alleviate it a bit.

That said, the x86-*-ssp numbers show a marked increase in
performance.  I've tested and retested and I keep getting these
results.  I'm also surprised by the nossp speedup on 64-bit, but I
don't have an explanation for it.

I haven't looked at the full disassembly of the call path.  If that is
needed to explain the performance differences I'm seeing, please let
me know.  Or tell me if there is a preferred CPU to run this against -
Atoms can be a little weird.

v8: - fixed variable positioning and bad cast (eric.dumazet@xxxxxxxxx)
    - no longer passes A as a pointer (inspection of x86 asm shows A is
      %ebx again; thanks eric.dumazet@xxxxxxxxx)
    - cleaned up switch macros and expanded use (joe@xxxxxxxxxxx,
      indan@xxxxxx)
    - added length fn pointer and handled LD_W_LEN/LDX_W_LEN
    - moved from a wrapping struct to a typedef for the function
      pointer. (matches existing function pointer style)
    - added comprehensive comment above the typedef.
    - benchmarks
v7: - first cut

Signed-off-by: Will Drewry <wad@xxxxxxxxxxxx>
---
(Note for reviewers: a rough usage sketch of the new hooks is appended
after the patch.)

 include/linux/filter.h |   69 +++++++++++++++++++++-
 net/core/filter.c      |  152 +++++++++++++++++++++++++++++++++++++----------
 2 files changed, 185 insertions(+), 36 deletions(-)

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 8eeb205..d22ad46 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -110,6 +110,9 @@ struct sock_fprog {	/* Required for SO_ATTACH_FILTER. */
  */
 #define BPF_MEMWORDS 16
 
+/* BPF program (checking) flags */
+#define BPF_CHK_FLAGS_NO_SKB	1
+
 /*
    RATIONALE. Negative offsets are invalid in BPF.
    We use them to reference ancillary data.
    Unlike introduction new instructions, it does not break
@@ -145,17 +148,67 @@ struct sk_filter
 	struct sock_filter	insns[0];
 };
 
+/**
+ * struct bpf_load_fns - callbacks for bpf_run_filter
+ *	These functions are called by bpf_run_filter if bpf_chk_filter
+ *	was invoked with BPF_CHK_FLAGS_NO_SKB.
+ *
+ * pointer:
+ * @data: const pointer to the data passed into bpf_run_filter
+ * @k: offset into @data
+ * @size: the size of the requested data in bytes: 1, 2, or 4.
+ * @buffer: If non-NULL, a 32-bit buffer for staging data.
+ *
+ * Returns a pointer to the requested data.
+ *
+ * This function operates similarly to load_pointer in net/core/filter.c
+ * except that the returned data must already be byteswapped as
+ * appropriate to the source data's endianness.
+ * @buffer may be used if the data needs to be staged.
+ *
+ * length:
+ * @data: const pointer to the data passed into bpf_run_filter
+ *
+ * Returns the length of the data.
+ */
+struct bpf_load_fns {
+	void *(*pointer)(const void *data, int k, unsigned int size,
+			 void *buffer);
+	u32 (*length)(const void *data);
+};
+
 static inline unsigned int sk_filter_len(const struct sk_filter *fp)
 {
 	return fp->len * sizeof(struct sock_filter) + sizeof(*fp);
 }
 
+extern unsigned int bpf_run_filter(const void *data,
+				   const struct sock_filter *filter,
+				   const struct bpf_load_fns *load_fns);
+
+/**
+ * sk_run_filter - run a filter on a socket
+ * @skb: buffer to run the filter on
+ * @filter: filter to apply
+ *
+ * Runs bpf_run_filter with the struct sk_buff-specific data
+ * accessor behavior.
+ */
+static inline unsigned int sk_run_filter(const struct sk_buff *skb,
+					 const struct sock_filter *filter)
+{
+	return bpf_run_filter(skb, filter, NULL);
+}
+
 extern int sk_filter(struct sock *sk, struct sk_buff *skb);
-extern unsigned int sk_run_filter(const struct sk_buff *skb,
-				  const struct sock_filter *filter);
 extern int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk);
 extern int sk_detach_filter(struct sock *sk);
-extern int sk_chk_filter(struct sock_filter *filter, unsigned int flen);
+extern int bpf_chk_filter(struct sock_filter *filter, unsigned int flen, u32 flags);
+
+static inline int sk_chk_filter(struct sock_filter *filter, unsigned int flen)
+{
+	return bpf_chk_filter(filter, flen, 0);
+}
 
 #ifdef CONFIG_BPF_JIT
 extern void bpf_jit_compile(struct sk_filter *fp);
@@ -228,6 +281,16 @@ enum {
 	BPF_S_ANC_HATYPE,
 	BPF_S_ANC_RXHASH,
 	BPF_S_ANC_CPU,
+	/* Used to differentiate SKB data and generic data */
+	BPF_S_ANC_LD_W_ABS,
+	BPF_S_ANC_LD_H_ABS,
+	BPF_S_ANC_LD_B_ABS,
+	BPF_S_ANC_LD_W_LEN,
+	BPF_S_ANC_LD_W_IND,
+	BPF_S_ANC_LD_H_IND,
+	BPF_S_ANC_LD_B_IND,
+	BPF_S_ANC_LDX_W_LEN,
+	BPF_S_ANC_LDX_B_MSH,
 };
 
 #endif /* __KERNEL__ */
diff --git a/net/core/filter.c b/net/core/filter.c
index 5dea452..a5c98a9 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -98,9 +98,10 @@ int sk_filter(struct sock *sk, struct sk_buff *skb)
 EXPORT_SYMBOL(sk_filter);
 
 /**
- * sk_run_filter - run a filter on a socket
- * @skb: buffer to run the filter on
+ * bpf_run_filter - run a BPF filter program on data
+ * @data: buffer to run the filter on
  * @fentry: filter to apply
+ * @load_fns: custom data accessor functions
  *
  * Decode and apply filter instructions to the skb->data.
  * Return length to keep, 0 for none. @skb is the data we are
@@ -108,9 +109,13 @@ EXPORT_SYMBOL(sk_filter);
  * Because all jumps are guaranteed to be before last instruction,
  * and last instruction guaranteed to be a RET, we dont need to check
  * flen. (We used to pass to this function the length of filter)
+ *
+ * load_fns is only used if BPF_CHK_FLAGS_NO_SKB was specified
+ * to bpf_chk_filter.
 */
-unsigned int sk_run_filter(const struct sk_buff *skb,
-			   const struct sock_filter *fentry)
+unsigned int bpf_run_filter(const void *data,
+			    const struct sock_filter *fentry,
+			    const struct bpf_load_fns *load_fns)
 {
 	void *ptr;
 	u32 A = 0;			/* Accumulator */
@@ -128,6 +133,7 @@ unsigned int sk_run_filter(const struct sk_buff *skb,
 #else
 		const u32 K = fentry->k;
 #endif
+#define SKB(_data) ((const struct sk_buff *)(_data))
 
 		switch (fentry->code) {
 		case BPF_S_ALU_ADD_X:
@@ -213,7 +219,7 @@ unsigned int sk_run_filter(const struct sk_buff *skb,
 		case BPF_S_LD_W_ABS:
 			k = K;
 load_w:
-			ptr = load_pointer(skb, k, 4, &tmp);
+			ptr = load_pointer(data, k, 4, &tmp);
 			if (ptr != NULL) {
 				A = get_unaligned_be32(ptr);
 				continue;
@@ -222,7 +228,7 @@ load_w:
 		case BPF_S_LD_H_ABS:
 			k = K;
 load_h:
-			ptr = load_pointer(skb, k, 2, &tmp);
+			ptr = load_pointer(data, k, 2, &tmp);
 			if (ptr != NULL) {
 				A = get_unaligned_be16(ptr);
 				continue;
@@ -231,17 +237,17 @@ load_h:
 		case BPF_S_LD_B_ABS:
 			k = K;
 load_b:
-			ptr = load_pointer(skb, k, 1, &tmp);
+			ptr = load_pointer(data, k, 1, &tmp);
 			if (ptr != NULL) {
 				A = *(u8 *)ptr;
 				continue;
 			}
 			return 0;
 		case BPF_S_LD_W_LEN:
-			A = skb->len;
+			A = SKB(data)->len;
 			continue;
 		case BPF_S_LDX_W_LEN:
-			X = skb->len;
+			X = SKB(data)->len;
 			continue;
 		case BPF_S_LD_W_IND:
 			k = X + K;
@@ -253,7 +259,7 @@ load_b:
 			k = X + K;
 			goto load_b;
 		case BPF_S_LDX_B_MSH:
-			ptr = load_pointer(skb, K, 1, &tmp);
+			ptr = load_pointer(data, K, 1, &tmp);
 			if (ptr != NULL) {
 				X = (*(u8 *)ptr & 0xf) << 2;
 				continue;
@@ -288,29 +294,29 @@ load_b:
 			mem[K] = X;
 			continue;
 		case BPF_S_ANC_PROTOCOL:
-			A = ntohs(skb->protocol);
+			A = ntohs(SKB(data)->protocol);
 			continue;
 		case BPF_S_ANC_PKTTYPE:
-			A = skb->pkt_type;
+			A = SKB(data)->pkt_type;
 			continue;
 		case BPF_S_ANC_IFINDEX:
-			if (!skb->dev)
+			if (!SKB(data)->dev)
 				return 0;
-			A = skb->dev->ifindex;
+			A = SKB(data)->dev->ifindex;
 			continue;
 		case BPF_S_ANC_MARK:
-			A = skb->mark;
+			A = SKB(data)->mark;
 			continue;
 		case BPF_S_ANC_QUEUE:
-			A = skb->queue_mapping;
+			A = SKB(data)->queue_mapping;
 			continue;
 		case BPF_S_ANC_HATYPE:
-			if (!skb->dev)
+			if (!SKB(data)->dev)
 				return 0;
-			A = skb->dev->type;
+			A = SKB(data)->dev->type;
 			continue;
 		case BPF_S_ANC_RXHASH:
-			A = skb->rxhash;
+			A = SKB(data)->rxhash;
 			continue;
 		case BPF_S_ANC_CPU:
 			A = raw_smp_processor_id();
@@ -318,15 +324,15 @@ load_b:
 		case BPF_S_ANC_NLATTR: {
 			struct nlattr *nla;
 
-			if (skb_is_nonlinear(skb))
+			if (skb_is_nonlinear(SKB(data)))
 				return 0;
-			if (A > skb->len - sizeof(struct nlattr))
+			if (A > SKB(data)->len - sizeof(struct nlattr))
 				return 0;
 
-			nla = nla_find((struct nlattr *)&skb->data[A],
-				       skb->len - A, X);
+			nla = nla_find((struct nlattr *)&SKB(data)->data[A],
+				       SKB(data)->len - A, X);
 			if (nla)
-				A = (void *)nla - (void *)skb->data;
+				A = (void *)nla - (void *)SKB(data)->data;
 			else
 				A = 0;
 			continue;
@@ -334,22 +340,71 @@ load_b:
 		case BPF_S_ANC_NLATTR_NEST: {
 			struct nlattr *nla;
 
-			if (skb_is_nonlinear(skb))
+			if (skb_is_nonlinear(SKB(data)))
 				return 0;
-			if (A > skb->len - sizeof(struct nlattr))
+			if (A > SKB(data)->len - sizeof(struct nlattr))
 				return 0;
 
-			nla = (struct nlattr *)&skb->data[A];
-			if (nla->nla_len > A - skb->len)
+			nla = (struct nlattr *)&SKB(data)->data[A];
+			if (nla->nla_len > A - SKB(data)->len)
 				return 0;
 
 			nla = nla_find_nested(nla, X);
 			if (nla)
-				A = (void *)nla - (void *)skb->data;
+				A = (void *)nla - (void *)SKB(data)->data;
 			else
 				A = 0;
 			continue;
 		}
+		case BPF_S_ANC_LD_W_ABS:
+			k = K;
+load_fn_w:
+			ptr = load_fns->pointer(data, k, 4, &tmp);
+			if (ptr) {
+				A = *(u32 *)ptr;
+				continue;
+			}
+			return 0;
+		case BPF_S_ANC_LD_H_ABS:
+			k = K;
+load_fn_h:
+			ptr = load_fns->pointer(data, k, 2, &tmp);
+			if (ptr) {
+				A = *(u16 *)ptr;
+				continue;
+			}
+			return 0;
+		case BPF_S_ANC_LD_B_ABS:
+			k = K;
+load_fn_b:
+			ptr = load_fns->pointer(data, k, 1, &tmp);
+			if (ptr) {
+				A = *(u8 *)ptr;
+				continue;
+			}
+			return 0;
+		case BPF_S_ANC_LDX_B_MSH:
+			ptr = load_fns->pointer(data, K, 1, &tmp);
+			if (ptr) {
+				X = (*(u8 *)ptr & 0xf) << 2;
+				continue;
+			}
+			return 0;
+		case BPF_S_ANC_LD_W_IND:
+			k = X + K;
+			goto load_fn_w;
+		case BPF_S_ANC_LD_H_IND:
+			k = X + K;
+			goto load_fn_h;
+		case BPF_S_ANC_LD_B_IND:
+			k = X + K;
+			goto load_fn_b;
+		case BPF_S_ANC_LD_W_LEN:
+			A = load_fns->length(data);
+			continue;
+		case BPF_S_ANC_LDX_W_LEN:
+			X = load_fns->length(data);
+			continue;
 		default:
 			WARN_RATELIMIT(1, "Unknown code:%u jt:%u tf:%u k:%u\n",
 				       fentry->code, fentry->jt,
@@ -360,7 +415,7 @@ load_b:
 
 	return 0;
 }
-EXPORT_SYMBOL(sk_run_filter);
+EXPORT_SYMBOL(bpf_run_filter);
 
 /*
  * Security :
@@ -423,9 +478,10 @@ error:
 }
 
 /**
- * sk_chk_filter - verify socket filter code
+ * bpf_chk_filter - verify socket filter BPF code
  * @filter: filter to verify
  * @flen: length of filter
+ * @flags: May be BPF_CHK_FLAGS_NO_SKB or 0
  *
  * Check the user's filter code. If we let some ugly
  * filter code slip through kaboom! The filter must contain
@@ -434,9 +490,13 @@ error:
  *
  * All jumps are forward as they are not signed.
  *
+ * If BPF_CHK_FLAGS_NO_SKB is set in flags, any SKB-specific
+ * rules become illegal and a custom set of bpf_load_fns will
+ * be expected by bpf_run_filter.
+ *
  * Returns 0 if the rule set is legal or -EINVAL if not.
  */
-int sk_chk_filter(struct sock_filter *filter, unsigned int flen)
+int bpf_chk_filter(struct sock_filter *filter, unsigned int flen, u32 flags)
 {
 	/*
 	 * Valid instructions are initialized to non-0.
@@ -542,9 +602,35 @@ int sk_chk_filter(struct sock_filter *filter, unsigned int flen)
 			    pc + ftest->jf + 1 >= flen)
 				return -EINVAL;
 			break;
+#define MAYBE_USE_LOAD_FN(CODE)					\
+			if (flags & BPF_CHK_FLAGS_NO_SKB) {	\
+				code = BPF_S_ANC_##CODE;	\
+				break;				\
+			}
+		case BPF_S_LD_W_LEN:
+			MAYBE_USE_LOAD_FN(LD_W_LEN);
+			break;
+		case BPF_S_LDX_W_LEN:
+			MAYBE_USE_LOAD_FN(LDX_W_LEN);
+			break;
+		case BPF_S_LD_W_IND:
+			MAYBE_USE_LOAD_FN(LD_W_IND);
+			break;
+		case BPF_S_LD_H_IND:
+			MAYBE_USE_LOAD_FN(LD_H_IND);
+			break;
+		case BPF_S_LD_B_IND:
+			MAYBE_USE_LOAD_FN(LD_B_IND);
+			break;
+		case BPF_S_LDX_B_MSH:
+			MAYBE_USE_LOAD_FN(LDX_B_MSH);
+			break;
 		case BPF_S_LD_W_ABS:
+			MAYBE_USE_LOAD_FN(LD_W_ABS);
 		case BPF_S_LD_H_ABS:
+			MAYBE_USE_LOAD_FN(LD_H_ABS);
 		case BPF_S_LD_B_ABS:
+			MAYBE_USE_LOAD_FN(LD_B_ABS);
 #define ANCILLARY(CODE) case SKF_AD_OFF + SKF_AD_##CODE:	\
 				code = BPF_S_ANC_##CODE;	\
 				break
@@ -572,7 +658,7 @@ int sk_chk_filter(struct sock_filter *filter, unsigned int flen)
 	}
 	return -EINVAL;
 }
-EXPORT_SYMBOL(sk_chk_filter);
+EXPORT_SYMBOL(bpf_chk_filter);
 
 /**
  * sk_filter_release_rcu - Release a socket filter by rcu_head
-- 
1.7.5.4
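
P.S. For reviewers who want to picture the consumer side of the new
hooks, here is a rough, self-contained sketch of how a non-skb user
might check and run a filter.  Only bpf_chk_filter(),
BPF_CHK_FLAGS_NO_SKB, struct bpf_load_fns, and bpf_run_filter() come
from the patch above; everything prefixed my_ is hypothetical and for
illustration only.

#include <linux/errno.h>
#include <linux/filter.h>
#include <linux/string.h>
#include <linux/types.h>

/* Hypothetical blob of plain memory to filter instead of an skb. */
struct my_blob {
	const u8 *buf;
	u32 len;
};

/*
 * pointer() callback: bounds-check the access and stage the bytes
 * through @buffer.  Per the struct bpf_load_fns comment, the result
 * must already be in the byte order the filter expects, since the
 * BPF_S_ANC_LD_* paths dereference it directly.
 */
static void *my_load_pointer(const void *data, int k, unsigned int size,
			     void *buffer)
{
	const struct my_blob *blob = data;

	if (k < 0 || k + size > blob->len)
		return NULL;	/* out of range: the filter returns 0 */
	memcpy(buffer, blob->buf + k, size);
	return buffer;
}

static u32 my_load_length(const void *data)
{
	return ((const struct my_blob *)data)->len;
}

static const struct bpf_load_fns my_load_fns = {
	.pointer = my_load_pointer,
	.length  = my_load_length,
};

/*
 * Validate once with BPF_CHK_FLAGS_NO_SKB, which (per its kernel-doc)
 * makes skb-specific instructions illegal and rewrites the LD/LDX
 * codes to their BPF_S_ANC_* twins, then evaluate against raw data.
 */
static int my_check_and_run(struct sock_filter *insns, unsigned int len,
			    struct my_blob *blob)
{
	int err = bpf_chk_filter(insns, len, BPF_CHK_FLAGS_NO_SKB);

	if (err)
		return err;
	/* Nonzero return means "keep"; map "drop" to an error here. */
	return bpf_run_filter(blob, insns, &my_load_fns) ? 0 : -EACCES;
}

The check step is done once at attach time, so the per-invocation path
never has to test a flag: the rewritten BPF_S_ANC_* opcodes are what
route a program to load_fns instead of the skb accessors.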