[LARTC] [PATCH] Prioritized Accept Queues with Preemption Capability

Linux Advanced Routing and Traffic Control

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



The following patch is an enhancement to a mechanism called Prioritized Accept
Queues (PAQ) that can be used to prioritize incoming connection requests on a
socket based on the source/dest ip addresses and ports. We posted the
original patch in July. This enhancement introduces a way to preempt low
priority connections from the accept queue in order to avoid starvation of
higher priority connections when the accept queue is filled with lower priority
connections.

For example, this feature can be used to guarantee low delay and high throughput
to preferred clients on a web server by assigning higher priority to connection
requests whose source ip address matches the ip address of the preferred clients.
It can also be used on a server hosting multiple websites each identified by its
own ip address. In this case the prioritization can be done based on the 
destination ip address of the connection requests.

The documentation on how to use this patch, and the test results which show an
improvement in connection rate for higher priority classes, can be found at our
project website.
        http://oss.software.ibm.com/qos

We would appreciate any comments or suggestions.

Thanks
Sridhar

---------------------------
Sridhar Samudrala
IBM Linux Technology Centre
samudrala@xxxxxxxxxx

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
diff -urN -X dontdiff linux-2.4.9/Documentation/Configure.help linux-2.4.9-ppaq/Documentation/Configure.help
--- linux-2.4.9/Documentation/Configure.help	Sun Aug 12 10:51:41 2001
+++ linux-2.4.9-ppaq/Documentation/Configure.help	Thu Sep 13 18:15:37 2001
@@ -1955,6 +1955,14 @@
   If you want to compile it as a module, say M here and read
   Documentation/modules.txt.  If unsure, say `N'.
 
+Prioritized Accept Queue (EXPERIMENTAL)
+CONFIG_PRIO_ACCEPTQ
+  When enabled, this option allows you to assign priorities to incoming
+  connection requests using the rules created by the iptables MARK target
+  option. The nfmark field set by the rules is used as a priority value
+  when the connection is added to the accept queue. The priority value can
+  range from 0 to 7, with 0 being the highest priority and 7 the lowest.
+  
 Packet filtering
 CONFIG_IP_NF_FILTER
   Packet filtering defines a table `filter', which has a series of
diff -urN -X dontdiff linux-2.4.9/include/net/sock.h linux-2.4.9-ppaq/include/net/sock.h
--- linux-2.4.9/include/net/sock.h	Wed Aug 15 14:21:32 2001
+++ linux-2.4.9-ppaq/include/net/sock.h	Thu Sep 13 18:32:38 2001
@@ -239,6 +239,11 @@
 #define pppoe_relay	proto.pppoe.relay
 #endif
 
+#ifdef CONFIG_PRIO_ACCEPTQ
+/* Lowest accept queue priority; valid priorities are 0..MAX_ACCEPTQ_PRIO (0 is highest) */
+#define MAX_ACCEPTQ_PRIO        7
+#endif
+
 /* This defines a selective acknowledgement block. */
 struct tcp_sack_block {
 	__u32	start_seq;
@@ -409,7 +414,11 @@
 
 	/* FIFO of established children */
 	struct open_request	*accept_queue;
+#ifdef CONFIG_PRIO_ACCEPTQ
+	struct open_request     *accept_queue_tail[MAX_ACCEPTQ_PRIO + 1]; /* one tail per priority 0..MAX_ACCEPTQ_PRIO */
+#else
 	struct open_request	*accept_queue_tail;
+#endif
 
 	int			write_pending;	/* A write to socket waits to start. */
 
diff -urN -X dontdiff linux-2.4.9/include/net/tcp.h linux-2.4.9-ppaq/include/net/tcp.h
--- linux-2.4.9/include/net/tcp.h	Wed Aug 15 14:26:33 2001
+++ linux-2.4.9-ppaq/include/net/tcp.h	Thu Sep 13 18:42:25 2001
@@ -519,6 +519,9 @@
 		struct tcp_v6_open_req v6_req;
 #endif
 	} af;
+#ifdef CONFIG_PRIO_ACCEPTQ
+	int acceptq_prio;
+#endif
 };
 
 /* SLAB cache for open requests. */
@@ -1572,10 +1575,33 @@
 					 struct sock *child)
 {
 	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
+#ifdef CONFIG_PRIO_ACCEPTQ
+	int prio = req->acceptq_prio;
+	int prev_prio;
+#endif
 
 	req->sk = child;
 	tcp_acceptq_added(sk);
 
+#ifdef CONFIG_PRIO_ACCEPTQ
+	if (!tp->accept_queue_tail[prio]) {
+		for (prev_prio = prio - 1; prev_prio >= 0; prev_prio--)
+			if (tp->accept_queue_tail[prev_prio])
+				break;
+		tp->accept_queue_tail[prio] = req;
+		if (prev_prio >= 0) {
+			req->dl_next = tp->accept_queue_tail[prev_prio]->dl_next;
+			tp->accept_queue_tail[prev_prio]->dl_next = req; 
+		} else {
+			req->dl_next = tp->accept_queue;
+			tp->accept_queue = req;
+		}
+	} else {
+		req->dl_next = tp->accept_queue_tail[prio]->dl_next;
+		tp->accept_queue_tail[prio]->dl_next = req;
+		tp->accept_queue_tail[prio] = req;
+	}
+#else
 	if (!tp->accept_queue_tail) {
 		tp->accept_queue = req;
 	} else {
@@ -1583,6 +1609,7 @@
 	}
 	tp->accept_queue_tail = req;
 	req->dl_next = NULL;
+#endif
 }
 
 struct tcp_listen_opt
@@ -1649,6 +1676,10 @@
 					struct tcp_opt *tp,
 					struct sk_buff *skb)
 {
+#ifdef CONFIG_PRIO_ACCEPTQ
+	int nfmark = (int)skb->nfmark;
+#endif
+
 	req->rcv_wnd = 0;		/* So that tcp_send_synack() knows! */
 	req->rcv_isn = TCP_SKB_CB(skb)->seq;
 	req->mss = tp->mss_clamp;
@@ -1660,6 +1691,9 @@
 	req->acked = 0;
 	req->ecn_ok = 0;
 	req->rmt_port = skb->h.th->source;
+#ifdef CONFIG_PRIO_ACCEPTQ
+	req->acceptq_prio = (nfmark < 0) ? 0 : ((nfmark > MAX_ACCEPTQ_PRIO) ? MAX_ACCEPTQ_PRIO : nfmark);
+#endif
 }
 
 #define TCP_MEM_QUANTUM	((int)PAGE_SIZE)
diff -urN -X dontdiff linux-2.4.9/net/ipv4/netfilter/Config.in linux-2.4.9-ppaq/net/ipv4/netfilter/Config.in
--- linux-2.4.9/net/ipv4/netfilter/Config.in	Tue Mar  6 22:44:16 2001
+++ linux-2.4.9-ppaq/net/ipv4/netfilter/Config.in	Thu Sep 13 18:15:38 2001
@@ -27,6 +27,7 @@
   if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then
     dep_tristate '  Unclean match support (EXPERIMENTAL)' CONFIG_IP_NF_MATCH_UNCLEAN $CONFIG_IP_NF_IPTABLES
     dep_tristate '  Owner match support (EXPERIMENTAL)' CONFIG_IP_NF_MATCH_OWNER $CONFIG_IP_NF_IPTABLES
+    bool '  Prioritized Accept Queues (EXPERIMENTAL)' CONFIG_PRIO_ACCEPTQ
   fi
 # The targets
   dep_tristate '  Packet filtering' CONFIG_IP_NF_FILTER $CONFIG_IP_NF_IPTABLES 
diff -urN -X dontdiff linux-2.4.9/net/ipv4/tcp.c linux-2.4.9-ppaq/net/ipv4/tcp.c
--- linux-2.4.9/net/ipv4/tcp.c	Wed Aug 15 01:22:17 2001
+++ linux-2.4.9-ppaq/net/ipv4/tcp.c	Thu Sep 13 18:15:38 2001
@@ -529,7 +529,12 @@
 
 	sk->max_ack_backlog = 0;
 	sk->ack_backlog = 0;
+#ifdef CONFIG_PRIO_ACCEPTQ
+	tp->accept_queue = NULL;
+	memset(tp->accept_queue_tail, 0, (sizeof(struct open_request *) * (MAX_ACCEPTQ_PRIO + 1)));
+#else
 	tp->accept_queue = tp->accept_queue_tail = NULL;
+#endif
 	tp->syn_wait_lock = RW_LOCK_UNLOCKED;
 	tcp_delack_init(tp);
 
@@ -588,7 +593,12 @@
 	write_lock_bh(&tp->syn_wait_lock);
 	tp->listen_opt =NULL;
 	write_unlock_bh(&tp->syn_wait_lock);
+#ifdef CONFIG_PRIO_ACCEPTQ
+	tp->accept_queue = NULL;
+	memset(tp->accept_queue_tail, 0, (sizeof(struct open_request *) * (MAX_ACCEPTQ_PRIO + 1)));
+#else
 	tp->accept_queue = tp->accept_queue_tail = NULL;
+#endif
 
 	if (lopt->qlen) {
 		for (i=0; i<TCP_SYNQ_HSIZE; i++) {
@@ -2109,6 +2119,9 @@
 	struct open_request *req;
 	struct sock *newsk;
 	int error;
+#ifdef CONFIG_PRIO_ACCEPTQ
+	int prio;
+#endif
 
 	lock_sock(sk); 
 
@@ -2134,8 +2147,17 @@
 	}
 
 	req = tp->accept_queue;
+#ifdef CONFIG_PRIO_ACCEPTQ
+	tp->accept_queue = req->dl_next;
+	for (prio = 0; prio <= MAX_ACCEPTQ_PRIO; prio++)
+		if (req == tp->accept_queue_tail[prio]) {
+			tp->accept_queue_tail[prio] = NULL;
+			break;
+		}
+#else
 	if ((tp->accept_queue = req->dl_next) == NULL)
 		tp->accept_queue_tail = NULL;
+#endif
 
  	newsk = req->sk;
 	tcp_acceptq_removed(sk);
diff -urN -X dontdiff linux-2.4.9/net/ipv4/tcp_ipv4.c linux-2.4.9-ppaq/net/ipv4/tcp_ipv4.c
--- linux-2.4.9/net/ipv4/tcp_ipv4.c	Wed Apr 25 14:57:39 2001
+++ linux-2.4.9-ppaq/net/ipv4/tcp_ipv4.c	Mon Sep 17 18:49:01 2001
@@ -1262,6 +1262,21 @@
 	tcp_v4_send_reset
 };
 
+#ifdef CONFIG_PRIO_ACCEPTQ
+/* Tail of the lowest-priority non-empty class below prio; prio may be an unclamped nfmark. */
+static struct open_request *low_prio_req_in_acceptq(struct sock *sk, int prio)
+{
+	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
+	struct open_request *low_prio_req = NULL;
+	int tmp_prio;
+
+	for (tmp_prio = MAX_ACCEPTQ_PRIO; tmp_prio > prio && tmp_prio > 0; tmp_prio--)
+		if ((low_prio_req = tp->accept_queue_tail[tmp_prio])) 
+			break;	
+	return (low_prio_req);
+}
+#endif
+
 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 {
 	struct tcp_opt tp;
@@ -1299,7 +1314,19 @@
 	 * clogging syn queue with openreqs with exponentially increasing
 	 * timeout.
 	 */
+#ifdef CONFIG_PRIO_ACCEPTQ
+	/* With Prioritized Accept Queue, a new condition is added so that an 
+	 * incoming SYN is dropped only if there are no lower priority 
+	 * connection requests in the acceptq. This is to avoid starvation of 
+	 * higher priority connection requests in the presence of persistent low	 
+	 * priority connections filling up the acceptq. 
+	 */
+	if (tcp_acceptq_is_full(sk) && 
+		!(low_prio_req_in_acceptq(sk, (int)skb->nfmark)) && 
+			tcp_synq_young(sk) > 1)
+#else
 	if (tcp_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
+#endif
 		goto drop;
 
 	req = tcp_openreq_alloc();
@@ -1407,6 +1434,62 @@
 	return 0;
 }
 
+#ifdef CONFIG_PRIO_ACCEPTQ
+/* Unlink req, which must be the tail of its priority class, from sk's accept
+ * queue, fixing up the queue head and that class's tail pointer. */
+static void remove_openreq_from_acceptq(struct sock *sk, struct open_request *req)
+{
+	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
+	struct open_request *tmp_req = NULL; 
+	struct open_request *prev_req = NULL;
+	int req_prio = req->acceptq_prio;
+	int prio;
+
+	/* find the last req in the next higher priority class */
+	for (prio = req_prio-1; prio >= 0; prio--)
+		if ((tmp_req = tp->accept_queue_tail[prio]))
+			break;
+
+	/* no higher priority class, start scanning from the start of acceptq */
+	if (!tmp_req)
+		tmp_req = tp->accept_queue;
+
+	/* walk to req; prev_req stays NULL when req is the queue head */
+	for (; tmp_req && tmp_req != req; tmp_req = tmp_req->dl_next)
+		prev_req = tmp_req;
+
+	if (!tmp_req) {
+		/* req is not queued: report it and leave the queue untouched */
+		BUG_TRAP(tmp_req != NULL);
+		return;
+	}
+
+	if (prev_req)
+		prev_req->dl_next = req->dl_next;
+	else
+		tp->accept_queue = req->dl_next;
+	/* the class keeps a tail only if req's predecessor belongs to it */
+	tp->accept_queue_tail[req_prio] =
+		(prev_req && prev_req->acceptq_prio == req_prio) ? prev_req : NULL;
+}
+
+/* Preempt lreq: take it off sk's accept queue, re-add it to the syn table and destroy its child socket. */
+static void preempt_low_prio_req(struct sock *sk, struct open_request *lreq)
+{
+	struct sock *lsk;
+
+	lsk = lreq->sk;	/* remember the child socket so it can be torn down below */
+	lreq->sk = NULL;	/* detach the child from the request before re-queueing it */
+	remove_openreq_from_acceptq(sk, lreq);
+	tcp_acceptq_removed(sk);
+	tcp_v4_synq_add(sk, lreq);
+	tcp_unhash(lsk);
+	tcp_set_state(lsk, TCP_CLOSE);
+	sock_orphan(lsk);
+	atomic_inc(&tcp_orphan_count);
+	tcp_destroy_sock(lsk);	/* NOTE(review): caller still does bh_unlock_sock(lsk) afterwards - confirm lsk outlives this */
+}
+#endif /* CONFIG_PRIO_ACCEPTQ */
 
 /* 
  * The three way handshake has completed - we got a valid synack - 
@@ -1419,8 +1502,35 @@
 	struct tcp_opt *newtp;
 	struct sock *newsk;
 
+#ifdef CONFIG_PRIO_ACCEPTQ
+	if (tcp_acceptq_is_full(sk)) {
+		struct open_request *low_prio_req;
+
+		/* if there is a lower priority req in acceptq and we haven't 
+		 * acked any received data on the associated socket, move it to 
+		 * syn table so that the incoming higher priority req can be 
+		 * accepted. 
+		 */
+		if ((low_prio_req = low_prio_req_in_acceptq(sk, req->acceptq_prio))) {
+			struct sock *lsk = low_prio_req->sk;
+			struct tcp_opt *tp = &lsk->tp_pinfo.af_tcp;
+
+			bh_lock_sock(lsk);
+			/* we haven't acked any data received */
+			if (tp->rcv_wup == (low_prio_req->rcv_isn + 1)) { 
+				preempt_low_prio_req(sk, low_prio_req);	
+				bh_unlock_sock(lsk);
+			} else {
+				bh_unlock_sock(lsk);
+				goto exit_overflow;
+			}
+		} else
+			goto exit_overflow;
+	}
+#else
 	if (tcp_acceptq_is_full(sk))
 		goto exit_overflow;
+#endif
 
 	if (dst == NULL &&
 	    (dst = tcp_v4_route_req(sk, req)) == NULL)
diff -urN -X dontdiff linux-2.4.9/net/ipv4/tcp_minisocks.c linux-2.4.9-ppaq/net/ipv4/tcp_minisocks.c
--- linux-2.4.9/net/ipv4/tcp_minisocks.c	Wed Aug 15 01:22:17 2001
+++ linux-2.4.9-ppaq/net/ipv4/tcp_minisocks.c	Mon Sep 17 18:49:18 2001
@@ -734,7 +734,12 @@
 		newtp->num_sacks = 0;
 		newtp->urg_data = 0;
 		newtp->listen_opt = NULL;
+#ifdef CONFIG_PRIO_ACCEPTQ
+		newtp->accept_queue = NULL;
+		memset(newtp->accept_queue_tail, 0, (sizeof(struct open_request *) * (MAX_ACCEPTQ_PRIO + 1)));
+#else
 		newtp->accept_queue = newtp->accept_queue_tail = NULL;
+#endif
 		/* Deinitialize syn_wait_lock to trap illegal accesses. */
 		memset(&newtp->syn_wait_lock, 0, sizeof(newtp->syn_wait_lock));
 
@@ -802,6 +807,9 @@
 	int paws_reject = 0;
 	struct tcp_opt ttp;
 	struct sock *child;
+#ifdef CONFIG_PRIO_ACCEPTQ
+	struct open_request *r1, *r2;
+#endif
 
 	ttp.saw_tstamp = 0;
 	if (th->doff > (sizeof(struct tcphdr)>>2)) {
@@ -913,10 +921,24 @@
 	 * ESTABLISHED STATE. If it will be dropped after
 	 * socket is created, wait for troubles.
 	 */
+#ifdef CONFIG_PRIO_ACCEPTQ
+	r1 = *prev;
+#endif
 	child = tp->af_specific->syn_recv_sock(sk, skb, req, NULL);
 	if (child == NULL)
 		goto listen_overflow;
+#ifdef CONFIG_PRIO_ACCEPTQ
+	r2 = *prev;
 
+	/* With Prioritized Accept Queues, it is possible that prev pointer can 
+	 * change in the above call to syn_recv_sock(). This can happen if an 
+	 * openreq is preempted and moved from acceptq to syn table and it 
+	 * hashes to the same bucket as 'req' and 'req' is the first entry in 
+	 * the hash bucket. If so, prev needs to be updated.
+	 */ 
+	if ((req == r1) && (r1 != r2))
+		prev = &r2->dl_next;
+#endif
 	tcp_synq_unlink(tp, req, prev);
 	tcp_synq_removed(sk, req);
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++




[Index of Archives]     [LARTC Home Page]     [Netfilter]     [Netfilter Development]     [Network Development]     [Bugtraq]     [GCC Help]     [Yosemite News]     [Linux Kernel]     [Fedora Users]
  Powered by Linux