[PATCH] skb destructor enhancement idea

A couple of days ago I posted about the idea of a generic, "chainable" destructor
mechanism for skbs: instead of the single destructor function pointer, each skb
carries a list of (destructor, context) pairs, all of which get run when the skb
is orphaned or freed. The attached patch is a first attempt at implementing such a
mechanism. So far, it doesn't seem to break anything :)
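
To give the flavor (an illustrative fragment only: first_cleanup, second_cleanup,
the ctx pointers and the drop label are made-up names, not code from the patch):

	/* Chain two cleanups onto one skb.  A nonzero return means a
	 * destructor node couldn't be allocated, so nothing was chained. */
	if (skb_add_dest(skb, first_cleanup, ctx1, GFP_ATOMIC))
		goto drop;
	if (skb_add_dest(skb, second_cleanup, ctx2, GFP_ATOMIC))
		goto drop;

	/* Freeing (or orphaning) the skb now runs second_cleanup(ctx2),
	 * then first_cleanup(ctx1); nodes are pushed and popped at the
	 * list head, so destructors run LIFO. */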

Any driver people want to try it out and see if they can make their driver use it?
Our internal use for this isn't for drivers, but for a netfilter-like packet
interceptor mechanism that has to manage its own memory allocation.
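
Sketched out (hypothetical names, assuming a private buffer pool; none of this is
in the patch), the interceptor side looks something like:

	/* Destructor: hand the intercepted buffer back to our pool once
	 * the stack is finished with the skb. */
	static void intercept_buf_free(void *context)
	{
		intercept_pool_put((struct intercept_buf *)context);
	}

	...

	/* Attach our buffer's cleanup to the skb instead of copying the
	 * data into a freshly allocated skb. */
	if (skb_add_dest(skb, intercept_buf_free, ib, GFP_ATOMIC)) {
		intercept_pool_put(ib);	/* couldn't chain; reclaim now */
		kfree_skb(skb);
		return;
	}

The point is that the buffer's lifetime no longer has to end when our hook
returns, which is what forces the copy today.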

I believe the changes are minimal enough not to add much overhead, even for drivers
that can't reduce their memory copying with the additional functionality: when
nothing has been chained, freeing an skb costs essentially one pointer test (plus a
stats counter), much like the old "if (skb->destructor)" check, and destructor
nodes are recycled through a free pool, so the slab allocator is only touched when
the pool runs dry. That matters, since moving a few hundred thousand skbs through
the stack every second isn't an unusual situation.

The patch is against 2.4.2 (because that's what we're frozen on internally at the
moment). Let me know if a patch against a current kernel would help.

-- 
-Will  :: AD6XL :: http://tyranny.egregious.net/~will/
 Orton :: finger will@tyranny.egregious.net for GPG public key
diff -urP linux-242-clean/include/linux/skbuff.h linux/include/linux/skbuff.h
--- linux-242-clean/include/linux/skbuff.h	Wed Feb 21 16:10:12 2001
+++ linux/include/linux/skbuff.h	Thu Jun 21 15:19:27 2001
@@ -22,6 +22,9 @@
 #include <asm/atomic.h>
 #include <asm/types.h>
 #include <linux/spinlock.h>
+#include <linux/cache.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
 
 #define HAVE_ALLOC_SKB		/* For the drivers to know */
 #define HAVE_ALIGNABLE_SKB	/* Ditto 8)		   */
@@ -57,6 +60,20 @@
 	spinlock_t	lock;
 };
 
+
+typedef struct __sk_buff_dest {
+	struct __sk_buff_dest *next;
+	void (*destructor)(void *);	/* called with context at destroy time */
+	void *context;
+} sk_buff_dest;
+
+typedef struct __sk_buff_dest_pool {
+	spinlock_t	lock;		/* guards head of the shared free pool */
+	kmem_cache_t	*cache;
+	sk_buff_dest	*head;
+} sk_buff_dest_pool;
+
+
 struct sk_buff {
 	/* These two members must be first. */
 	struct sk_buff	* next;			/* Next buffer in list 				*/
@@ -122,7 +139,8 @@
 	unsigned char	*data;			/* Data head pointer				*/
 	unsigned char	*tail;			/* Tail pointer					*/
 	unsigned char 	*end;			/* End pointer					*/
-	void 		(*destructor)(struct sk_buff *);	/* Destruct function		*/
+	sk_buff_dest_pool	destructor;	/* Chained destructors		*/
+
 #ifdef CONFIG_NETFILTER
 	/* Can be used for communication between hooks. */
         unsigned long	nfmark;
@@ -150,6 +168,89 @@
 #define SK_RMEM_MAX	65535
 
 #ifdef __KERNEL__
+
+extern unsigned int skbd_stat_add_head;
+extern unsigned int skbd_stat_add_dest;
+extern unsigned int skbd_stat_alloc;
+extern unsigned int skbd_stat_call_dest;
+extern unsigned int skbd_stat_maybe_call_dest;
+
+extern sk_buff_dest_pool sk_buff_dest_free_pool;
+
+static inline void skbd_add_head(sk_buff_dest_pool *pool,
+								 sk_buff_dest *newnode)
+{
+	skbd_stat_add_head++;
+	newnode->next = pool->head;
+	pool->head = newnode;
+}
+
+static inline void skbd_remove_head(sk_buff_dest_pool *pool,
+									sk_buff_dest **rmnode)
+{
+	if ((*rmnode = pool->head) != NULL)
+		pool->head = pool->head->next;
+}
+
+
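+/*
+ * Chain a destructor onto an skb.  The node is taken from the shared
+ * free pool when one is available, otherwise allocated from the slab
+ * cache with gfp_mask.  Returns 0 on success, nonzero on failure.
+ */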
+static inline int skb_add_dest(struct sk_buff *skb, void (*destructor)(void *),
+							   void *context, unsigned long gfp_mask)
+{
+	unsigned long flags;
+	sk_buff_dest *skbd;
+
+	skbd_stat_add_dest++;
+
+	spin_lock_irqsave(&sk_buff_dest_free_pool.lock, flags);
+	skbd_remove_head(&sk_buff_dest_free_pool, &skbd); 
+	spin_unlock_irqrestore(&sk_buff_dest_free_pool.lock, flags);
+
+	if (!skbd) {
+		skbd_stat_alloc++;
+		skbd = kmem_cache_alloc(sk_buff_dest_free_pool.cache,
+					gfp_mask);
+		if (!skbd) {
+			printk(KERN_WARNING "skb_add_dest: out of memory\n");
+			return 1;
+		}
+	}
+
+	skbd->destructor = destructor;
+	skbd->context = context;
+	skbd_add_head(&skb->destructor, skbd);
+	return 0;
+}
+
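+/*
+ * Run and unchain every destructor on the skb, most recently added
+ * first, returning the nodes to the shared free pool.
+ */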
+static inline void skb_call_dest(struct sk_buff *skb)
+{
+	unsigned long flags;
+	sk_buff_dest *skbd;
+
+	skbd_stat_maybe_call_dest++;
+	skbd_remove_head(&skb->destructor, &skbd);
+
+	while (skbd) {
+		skbd_stat_call_dest++;
+		skbd->destructor(skbd->context);
+
+		spin_lock_irqsave(&sk_buff_dest_free_pool.lock, flags);
+		skbd_add_head(&sk_buff_dest_free_pool, skbd);
+		spin_unlock_irqrestore(&sk_buff_dest_free_pool.lock, flags);
+
+		skbd_remove_head(&skb->destructor, &skbd);
+	}
+}
+
+
 /*
  *	Handling routines are only of interest to the kernel
  */
@@ -837,9 +938,7 @@
 
 static inline void skb_orphan(struct sk_buff *skb)
 {
-	if (skb->destructor)
-		skb->destructor(skb);
-	skb->destructor = NULL;
+	skb_call_dest(skb);
 	skb->sk = NULL;
 }
 
diff -urP linux-242-clean/include/net/sock.h linux/include/net/sock.h
--- linux-242-clean/include/net/sock.h	Wed Feb 21 16:10:24 2001
+++ linux/include/net/sock.h	Thu Jun 21 15:19:27 2001
@@ -1131,19 +1131,30 @@
  *	packet ever received.
  */
 
-static inline void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
+static inline int skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
 {
 	sock_hold(sk);
 	skb->sk = sk;
-	skb->destructor = sock_wfree;
+
+	if (skb_add_dest(skb, (void (*)(void *))sock_wfree, skb, GFP_ATOMIC)) {
+		skb->sk = NULL;
+		sock_put(sk);
+		return 1;
+	}
 	atomic_add(skb->truesize, &sk->wmem_alloc);
+	return 0;
 }
 
-static inline void skb_set_owner_r(struct sk_buff *skb, struct sock *sk)
+static inline int skb_set_owner_r(struct sk_buff *skb, struct sock *sk)
 {
 	skb->sk = sk;
-	skb->destructor = sock_rfree;
+
+	if (skb_add_dest(skb, (void (*)(void *))sock_rfree, skb, GFP_ATOMIC)) {
+		skb->sk = NULL;
+		return 1;
+	}
 	atomic_add(skb->truesize, &sk->rmem_alloc);
+	return 0;
 }
 
 static inline int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
@@ -1173,7 +1184,7 @@
 #endif /* CONFIG_FILTER */
 
 	skb->dev = NULL;
-	skb_set_owner_r(skb, sk);
+	if (skb_set_owner_r(skb, sk)) return -ENOMEM;
 	skb_queue_tail(&sk->receive_queue, skb);
 	if (!sk->dead)
 		sk->data_ready(sk,skb->len);
@@ -1187,7 +1198,8 @@
 	 */
 	if (atomic_read(&sk->rmem_alloc) + skb->truesize >= (unsigned)sk->rcvbuf)
 		return -ENOMEM;
-	skb_set_owner_r(skb, sk);
+	if (skb_set_owner_r(skb, sk))
+		return -ENOMEM;
 	skb_queue_tail(&sk->error_queue,skb);
 	if (!sk->dead)
 		sk->data_ready(sk,skb->len);
diff -urP linux-242-clean/include/net/tcp.h linux/include/net/tcp.h
--- linux-242-clean/include/net/tcp.h	Wed Feb 21 16:10:40 2001
+++ linux/include/net/tcp.h	Thu Jun 21 15:19:27 2001
@@ -1704,12 +1704,17 @@
 
 extern void tcp_rfree(struct sk_buff *skb);
 
-static inline void tcp_set_owner_r(struct sk_buff *skb, struct sock *sk)
+static inline int tcp_set_owner_r(struct sk_buff *skb, struct sock *sk)
 {
 	skb->sk = sk;
-	skb->destructor = tcp_rfree;
+
+	if (skb_add_dest(skb, (void (*)(void *))tcp_rfree, skb, GFP_ATOMIC)) {
+		skb->sk = NULL;
+		return 1;
+	}
 	atomic_add(skb->truesize, &sk->rmem_alloc);
 	sk->forward_alloc -= skb->truesize;
+	return 0;
 }
 
 extern void tcp_listen_wlock(void);
diff -urP linux-242-clean/net/core/skbuff.c linux/net/core/skbuff.c
--- linux-242-clean/net/core/skbuff.c	Fri Feb 16 16:06:17 2001
+++ linux/net/core/skbuff.c	Thu Jun 21 15:19:27 2001
@@ -46,6 +46,7 @@
 #include <linux/inet.h>
 #include <linux/slab.h>
 #include <linux/netdevice.h>
+#include <linux/proc_fs.h>
 #include <linux/string.h>
 #include <linux/skbuff.h>
 #include <linux/cache.h>
@@ -70,6 +71,46 @@
 	char			pad[SMP_CACHE_BYTES];
 } skb_head_pool[NR_CPUS];
 
+sk_buff_dest_pool sk_buff_dest_free_pool;
+unsigned int skbd_stat_add_dest;
+unsigned int skbd_stat_alloc;
+unsigned int skbd_stat_add_head;
+unsigned int skbd_stat_maybe_call_dest;
+unsigned int skbd_stat_call_dest;
+
+static int skbd_get_info(char *buffer, char **start, off_t offset, int length)
+{
+	int size = 0;
+
+	size += sprintf(buffer + size, "add_head: %u\n", skbd_stat_add_head);
+	size += sprintf(buffer + size, "add_dest: %u\n", skbd_stat_add_dest);
+	size += sprintf(buffer + size, "alloc: %u\n", skbd_stat_alloc);
+	size += sprintf(buffer + size, "maybe_call_dest: %u\n", skbd_stat_maybe_call_dest);
+	size += sprintf(buffer + size, "call_dest: %u\n", skbd_stat_call_dest);
+	return size;
+}
+
+void __init skb_dest_init(void)
+{
+	sk_buff_dest_free_pool.cache = kmem_cache_create("dest_pool_cache",
+					      sizeof(sk_buff_dest),
+					      0,
+					      SLAB_HWCACHE_ALIGN,
+					      NULL, NULL);
+	if (!sk_buff_dest_free_pool.cache)
+		panic("cannot create sk_buff_dest_pool.cache");
+
+	spin_lock_init(&sk_buff_dest_free_pool.lock);
+	sk_buff_dest_free_pool.head = NULL;
+
+	proc_net_create("skbd", 0, skbd_get_info);
+	skbd_stat_add_head = 0;
+	skbd_stat_add_dest = 0;
+	skbd_stat_alloc = 0;
+	skbd_stat_maybe_call_dest = 0;
+	skbd_stat_call_dest = 0;
+}
+
 /*
  *	Keep out-of-line to prevent kernel bloat.
  *	__builtin_return_address is not used because it is not always
@@ -234,7 +275,7 @@
 	skb->ip_summed = 0;
 	skb->priority = 0;
 	skb->security = 0;	/* By default packets are insecure */
-	skb->destructor = NULL;
+	skb->destructor.head =  NULL;
 
 #ifdef CONFIG_NETFILTER
 	skb->nfmark = skb->nfcache = 0;
@@ -277,13 +318,8 @@
 	}
 
 	dst_release(skb->dst);
-	if(skb->destructor) {
-		if (in_irq()) {
-			printk(KERN_WARNING "Warning: kfree_skb on hard IRQ %p\n",
-				NET_CALLER(skb));
-		}
-		skb->destructor(skb);
-	}
+	skb_call_dest(skb);
+
 #ifdef CONFIG_NETFILTER
 	nf_conntrack_put(skb->nfct);
 #endif
@@ -326,7 +362,7 @@
 	n->list = NULL;
 	n->sk = NULL;
 	atomic_set(&n->users, 1);
-	n->destructor = NULL;
+	n->destructor.head = NULL;
 #ifdef CONFIG_NETFILTER
 	nf_conntrack_get(skb->nfct);
 #endif
@@ -354,7 +390,7 @@
 	atomic_set(&new->users, 1);
 	new->pkt_type=old->pkt_type;
 	new->stamp=old->stamp;
-	new->destructor = NULL;
+	new->destructor.head = NULL;
 	new->security=old->security;
 #ifdef CONFIG_NETFILTER
 	new->nfmark=old->nfmark;
@@ -483,4 +519,6 @@
 
 	for (i=0; i<NR_CPUS; i++)
 		skb_queue_head_init(&skb_head_pool[i].list);
+
+	skb_dest_init();
 }
diff -urP linux-242-clean/net/ipv4/ip_gre.c linux/net/ipv4/ip_gre.c
--- linux-242-clean/net/ipv4/ip_gre.c	Tue Nov 28 21:53:45 2000
+++ linux/net/ipv4/ip_gre.c	Thu Jun 21 15:19:27 2001
@@ -802,8 +802,14 @@
 			tunnel->recursion--;
 			return 0;
 		}
-		if (skb->sk)
-			skb_set_owner_w(new_skb, skb->sk);
+		if (skb->sk && skb_set_owner_w(new_skb, skb->sk)) {
+			ip_rt_put(rt);
+			stats->tx_dropped++;
+			dev_kfree_skb(new_skb);
+			dev_kfree_skb(skb);
+			tunnel->recursion--;
+			return 0;
+		}
 		dev_kfree_skb(skb);
 		skb = new_skb;
 	}
diff -urP linux-242-clean/net/ipv4/ip_output.c linux/net/ipv4/ip_output.c
--- linux-242-clean/net/ipv4/ip_output.c	Fri Oct 27 11:03:14 2000
+++ linux/net/ipv4/ip_output.c	Thu Jun 21 15:19:27 2001
@@ -295,8 +295,12 @@
 		kfree_skb(skb);
 		if (skb2 == NULL)
 			return -ENOMEM;
-		if (sk)
-			skb_set_owner_w(skb2, sk);
+		if (sk) {
+			if (skb_set_owner_w(skb2, sk)) {
+				kfree_skb(skb2);
+				return -ENOMEM;
+			}
+		}
 		skb = skb2;
 		iph = skb->nh.iph;
 	}
@@ -797,8 +801,12 @@
 		 *	it might possess
 		 */
 
-		if (skb->sk)
-			skb_set_owner_w(skb2, skb->sk);
+		if (skb->sk) {
+			if (skb_set_owner_w(skb2, skb->sk)) {
+				kfree_skb(skb2);
+				goto fail;
+			}
+		}
 		skb2->dst = dst_clone(skb->dst);
 		skb2->dev = skb->dev;
 
diff -urP linux-242-clean/net/ipv4/ipip.c linux/net/ipv4/ipip.c
--- linux-242-clean/net/ipv4/ipip.c	Tue Nov 28 21:53:45 2000
+++ linux/net/ipv4/ipip.c	Thu Jun 21 15:19:27 2001
@@ -612,8 +612,16 @@
 			tunnel->recursion--;
 			return 0;
 		}
-		if (skb->sk)
-			skb_set_owner_w(new_skb, skb->sk);
+		if (skb->sk) {
+			if (skb_set_owner_w(new_skb, skb->sk)) {
+				dev_kfree_skb(new_skb);
+				dev_kfree_skb(skb);
+				ip_rt_put(rt);
+				stats->tx_dropped++;
+				tunnel->recursion--;
+				return 0;
+			}
+		}
 		dev_kfree_skb(skb);
 		skb = new_skb;
 	}
diff -urP linux-242-clean/net/ipv4/tcp_input.c linux/net/ipv4/tcp_input.c
--- linux-242-clean/net/ipv4/tcp_input.c	Thu Jun 21 11:44:27 2001
+++ linux/net/ipv4/tcp_input.c	Thu Jun 21 15:19:27 2001
@@ -2547,7 +2547,11 @@
 
 		if (!eaten) {
 queue_and_out:
-			tcp_set_owner_r(skb, sk);
+			if (tcp_set_owner_r(skb, sk)) {
+				printk(KERN_WARNING "tcp_set_owner_r failed\n");
+				__kfree_skb(skb);
+				return;
+			}
 			__skb_queue_tail(&sk->receive_queue, skb);
 		}
 		tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
@@ -2633,7 +2637,11 @@
 	SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
 		   tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
 
-	tcp_set_owner_r(skb, sk);
+	if (tcp_set_owner_r(skb, sk)) {
+		printk(KERN_WARNING "tcp_set_owner_r (2) failed\n");
+		__kfree_skb(skb);
+		return;
+	}
 
 	if (skb_peek(&tp->out_of_order_queue) == NULL) {
 		/* Initial out of order segment, build 1 SACK. */
@@ -2735,13 +2743,17 @@
 
 				nskb = skb_copy_expand(skb, skb_headroom(skb), 0, GFP_ATOMIC);
 				if (nskb) {
-					tcp_set_owner_r(nskb, sk);
-					memcpy(nskb->data-skb_headroom(skb),
-					       skb->data-skb_headroom(skb),
-					       skb_headroom(skb));
-					__skb_append(skb, nskb);
-					__skb_unlink(skb, skb->list);
-					__kfree_skb(skb);
+					if (tcp_set_owner_r(nskb, sk)) {
+						printk(KERN_WARNING "tcp_set_owner_r (3) failed\n");
+						__kfree_skb(nskb);
+					} else {
+						memcpy(nskb->data-skb_headroom(skb),
+						       skb->data-skb_headroom(skb),
+						       skb_headroom(skb));
+						__skb_append(skb, nskb);
+						__skb_unlink(skb, skb->list);
+						__kfree_skb(skb);
+					}
 				}
 			}
 			skb = skb_next;
@@ -3271,7 +3283,8 @@
 				/* Bulk data transfer: receiver */
 				__skb_pull(skb,tcp_header_len);
-				__skb_queue_tail(&sk->receive_queue, skb);
-				tcp_set_owner_r(skb, sk);
+				if (tcp_set_owner_r(skb, sk))
+					goto csum_error;
+				__skb_queue_tail(&sk->receive_queue, skb);
 				tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
 			}
 
diff -urP linux-242-clean/net/ipv4/tcp_output.c linux/net/ipv4/tcp_output.c
--- linux-242-clean/net/ipv4/tcp_output.c	Tue Nov 28 21:53:45 2000
+++ linux/net/ipv4/tcp_output.c	Thu Jun 21 15:19:27 2001
@@ -235,7 +235,10 @@
 		}
 		th = (struct tcphdr *) skb_push(skb, tcp_header_size);
 		skb->h.th = th;
-		skb_set_owner_w(skb, sk);
+		if (skb_set_owner_w(skb, sk)) {
+			kfree_skb(skb);
+			return -ENOMEM;
+		}
 
 		/* Build TCP header and checksum it. */
 		th->source		= sk->sport;
diff -urP linux-242-clean/net/netsyms.c linux/net/netsyms.c
--- linux-242-clean/net/netsyms.c	Fri Feb  9 11:29:44 2001
+++ linux/net/netsyms.c	Thu Jun 21 15:19:27 2001
@@ -92,6 +92,14 @@
 EXPORT_SYMBOL(skb_over_panic);
 EXPORT_SYMBOL(skb_under_panic);
 
+/* Skbuff destructor list and stats */
+EXPORT_SYMBOL(sk_buff_dest_free_pool);
+EXPORT_SYMBOL(skbd_stat_add_dest);
+EXPORT_SYMBOL(skbd_stat_alloc);
+EXPORT_SYMBOL(skbd_stat_add_head);
+EXPORT_SYMBOL(skbd_stat_maybe_call_dest);
+EXPORT_SYMBOL(skbd_stat_call_dest);
+
 /* Socket layer registration */
 EXPORT_SYMBOL(sock_register);
 EXPORT_SYMBOL(sock_unregister);
diff -urP linux-242-clean/net/unix/af_unix.c linux/net/unix/af_unix.c
--- linux-242-clean/net/unix/af_unix.c	Fri Feb  9 11:34:13 2001
+++ linux/net/unix/af_unix.c	Thu Jun 21 15:19:27 2001
@@ -1107,16 +1107,19 @@
 	return err;
 }
 
-static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
+static int unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
 {
 	int i;
 
+	if (skb_add_dest(skb, (void (*)(void *))sock_wfree, skb, GFP_ATOMIC))
+		return 1;
+
 	scm->fp = UNIXCB(skb).fp;
-	skb->destructor = sock_wfree;
 	UNIXCB(skb).fp = NULL;
 
 	for (i=scm->fp->count-1; i>=0; i--)
 		unix_notinflight(scm->fp->fp[i]);
+	return 0;
 }
 
 static void unix_destruct_fds(struct sk_buff *skb)
@@ -1131,14 +1134,16 @@
 	sock_wfree(skb);
 }
 
-static void unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
+static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
 {
 	int i;
+	if (skb_add_dest(skb, (void (*)(void *))unix_destruct_fds, skb, GFP_ATOMIC))
+		return 1;
 	for (i=scm->fp->count-1; i>=0; i--)
 		unix_inflight(scm->fp->fp[i]);
 	UNIXCB(skb).fp = scm->fp;
-	skb->destructor = unix_destruct_fds;
 	scm->fp = NULL;
+	return 0;
 }
 
 /*
@@ -1187,8 +1192,11 @@
 		goto out;
 
 	memcpy(UNIXCREDS(skb), &scm->creds, sizeof(struct ucred));
-	if (scm->fp)
-		unix_attach_fds(scm, skb);
+	if (scm->fp && unix_attach_fds(scm, skb)) {
+		kfree_skb(skb);
+		err = -ENOMEM;
+		goto out;
+	}
 
 	skb->h.raw = skb->data;
 	err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
@@ -1352,8 +1360,11 @@
 		size = min(size, skb_tailroom(skb));
 
 		memcpy(UNIXCREDS(skb), &scm->creds, sizeof(struct ucred));
-		if (scm->fp)
-			unix_attach_fds(scm, skb);
+		if (scm->fp && unix_attach_fds(scm, skb)) {
+			kfree_skb(skb);
+			err = -ENOMEM;
+			goto out_err;
+		}
 
 		if ((err = memcpy_fromiovec(skb_put(skb,size), msg->msg_iov, size)) != 0) {
 			kfree_skb(skb);
@@ -1433,8 +1444,8 @@
 
 	if (!(flags & MSG_PEEK))
 	{
-		if (UNIXCB(skb).fp)
-			unix_detach_fds(scm, skb);
+		if (UNIXCB(skb).fp && unix_detach_fds(scm, skb))
+			goto out_free;
 	}
 	else 
 	{
@@ -1596,8 +1607,11 @@
 		{
 			skb_pull(skb, chunk);
 
-			if (UNIXCB(skb).fp)
-				unix_detach_fds(scm, skb);
+			if (UNIXCB(skb).fp && unix_detach_fds(scm, skb)) {
+				skb_queue_head(&sk->receive_queue, skb);
+				copied = -ENOMEM;
+				break;
+			}
 
 			/* put the skb back if we didn't use it up.. */
 			if (skb->len)
