The following patch is an enhancement to a mechanism called Prioritized Accept Queues (PAQ) that can be used to prioritize incoming connection requests on a socket based on the source/dest ip addresses and ports. We have posted the original patch in July. This enhancement introduces a way to preempt low priority connections from the accept queue in order to avoid starvation of higher priority connections when the accept queue is filled with lower priority connections. For example, this feature can be used to guarantee low delay and high throughput to preferred clients on a web server by assigning higher priority to connection requests whose source ip address matches the ip address of the preferred clients. It can also be used on a server hosting multiple websites each identified by its own ip address. In this case the prioritization can be done based on the destination ip address of the connection requests. The documentation on HOWTO use this patch and the test results which show an improvement in connection rate for higher priority classes can be found at our project website. http://oss.software.ibm.com/qos We would appreciate any comments or suggestions. Thanks Sridhar --------------------------- Sridhar Samudrala IBM Linux Technology Centre samudrala@xxxxxxxxxx ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ diff -urN -X dontdiff linux-2.4.9/Documentation/Configure.help linux-2.4.9-ppaq/Documentation/Configure.help --- linux-2.4.9/Documentation/Configure.help Sun Aug 12 10:51:41 2001 +++ linux-2.4.9-ppaq/Documentation/Configure.help Thu Sep 13 18:15:37 2001 @@ -1955,6 +1955,14 @@ If you want to compile it as a module, say M here and read Documentation/modules.txt. If unsure, say `N'. +Prioritized Accept Queue (EXPERIMENTAL) +CONFIG_PRIO_ACCEPTQ + When enabled, this option allows you to set priorities to incoming + connection requests using the rules created by the iptables MARK target + option. 
The nfmark field set by the rules is used as a priority value + when the connection is added to accept queue. The priority value can + range between 0-7 with 0 being the highest priority and 7 the lowest. + Packet filtering CONFIG_IP_NF_FILTER Packet filtering defines a table `filter', which has a series of diff -urN -X dontdiff linux-2.4.9/include/net/sock.h linux-2.4.9-ppaq/include/net/sock.h --- linux-2.4.9/include/net/sock.h Wed Aug 15 14:21:32 2001 +++ linux-2.4.9-ppaq/include/net/sock.h Thu Sep 13 18:32:38 2001 @@ -239,6 +239,11 @@ #define pppoe_relay proto.pppoe.relay #endif +#ifdef CONFIG_PRIO_ACCEPTQ +/* Priorities range from 0-7 */ +#define MAX_ACCEPTQ_PRIO 7 +#endif + /* This defines a selective acknowledgement block. */ struct tcp_sack_block { __u32 start_seq; @@ -409,7 +414,11 @@ /* FIFO of established children */ struct open_request *accept_queue; +#ifdef CONFIG_PRIO_ACCEPTQ + struct open_request *accept_queue_tail[MAX_ACCEPTQ_PRIO]; +#else struct open_request *accept_queue_tail; +#endif int write_pending; /* A write to socket waits to start. */ diff -urN -X dontdiff linux-2.4.9/include/net/tcp.h linux-2.4.9-ppaq/include/net/tcp.h --- linux-2.4.9/include/net/tcp.h Wed Aug 15 14:26:33 2001 +++ linux-2.4.9-ppaq/include/net/tcp.h Thu Sep 13 18:42:25 2001 @@ -519,6 +519,9 @@ struct tcp_v6_open_req v6_req; #endif } af; +#ifdef CONFIG_PRIO_ACCEPTQ + int acceptq_prio; +#endif }; /* SLAB cache for open requests. 
*/ @@ -1572,10 +1575,33 @@ struct sock *child) { struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; +#ifdef CONFIG_PRIO_ACCEPTQ + int prio = req->acceptq_prio; + int prev_prio; +#endif req->sk = child; tcp_acceptq_added(sk); +#ifdef CONFIG_PRIO_ACCEPTQ + if (!tp->accept_queue_tail[prio]) { + for (prev_prio = prio - 1; prev_prio >= 0; prev_prio--) + if (tp->accept_queue_tail[prev_prio]) + break; + tp->accept_queue_tail[prio] = req; + if (prev_prio >= 0) { + req->dl_next = tp->accept_queue_tail[prev_prio]->dl_next; + tp->accept_queue_tail[prev_prio]->dl_next = req; + } else { + req->dl_next = tp->accept_queue; + tp->accept_queue = req; + } + } else { + req->dl_next = tp->accept_queue_tail[prio]->dl_next; + tp->accept_queue_tail[prio]->dl_next = req; + tp->accept_queue_tail[prio] = req; + } +#else if (!tp->accept_queue_tail) { tp->accept_queue = req; } else { @@ -1583,6 +1609,7 @@ } tp->accept_queue_tail = req; req->dl_next = NULL; +#endif } struct tcp_listen_opt @@ -1649,6 +1676,10 @@ struct tcp_opt *tp, struct sk_buff *skb) { +#ifdef CONFIG_PRIO_ACCEPTQ + int nfmark = (int)skb->nfmark; +#endif + req->rcv_wnd = 0; /* So that tcp_send_synack() knows! */ req->rcv_isn = TCP_SKB_CB(skb)->seq; req->mss = tp->mss_clamp; @@ -1660,6 +1691,9 @@ req->acked = 0; req->ecn_ok = 0; req->rmt_port = skb->h.th->source; +#ifdef CONFIG_PRIO_ACCEPTQ + req->acceptq_prio = (nfmark < 0) ? 0 : ((nfmark > MAX_ACCEPTQ_PRIO) ? 
MAX_ACCEPTQ_PRIO : nfmark); +#endif } #define TCP_MEM_QUANTUM ((int)PAGE_SIZE) diff -urN -X dontdiff linux-2.4.9/net/ipv4/netfilter/Config.in linux-2.4.9-ppaq/net/ipv4/netfilter/Config.in --- linux-2.4.9/net/ipv4/netfilter/Config.in Tue Mar 6 22:44:16 2001 +++ linux-2.4.9-ppaq/net/ipv4/netfilter/Config.in Thu Sep 13 18:15:38 2001 @@ -27,6 +27,7 @@ if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then dep_tristate ' Unclean match support (EXPERIMENTAL)' CONFIG_IP_NF_MATCH_UNCLEAN $CONFIG_IP_NF_IPTABLES dep_tristate ' Owner match support (EXPERIMENTAL)' CONFIG_IP_NF_MATCH_OWNER $CONFIG_IP_NF_IPTABLES + bool ' Prioritized Accept Queues (EXPERIMENTAL)' CONFIG_PRIO_ACCEPTQ fi # The targets dep_tristate ' Packet filtering' CONFIG_IP_NF_FILTER $CONFIG_IP_NF_IPTABLES diff -urN -X dontdiff linux-2.4.9/net/ipv4/tcp.c linux-2.4.9-ppaq/net/ipv4/tcp.c --- linux-2.4.9/net/ipv4/tcp.c Wed Aug 15 01:22:17 2001 +++ linux-2.4.9-ppaq/net/ipv4/tcp.c Thu Sep 13 18:15:38 2001 @@ -529,7 +529,12 @@ sk->max_ack_backlog = 0; sk->ack_backlog = 0; +#ifdef CONFIG_PRIO_ACCEPTQ + tp->accept_queue = NULL; + memset(tp->accept_queue_tail, 0, (sizeof(struct open_request *) * (MAX_ACCEPTQ_PRIO + 1))); +#else tp->accept_queue = tp->accept_queue_tail = NULL; +#endif tp->syn_wait_lock = RW_LOCK_UNLOCKED; tcp_delack_init(tp); @@ -588,7 +593,12 @@ write_lock_bh(&tp->syn_wait_lock); tp->listen_opt =NULL; write_unlock_bh(&tp->syn_wait_lock); +#ifdef CONFIG_PRIO_ACCEPTQ + tp->accept_queue = NULL; + memset(tp->accept_queue_tail, 0, (sizeof(struct open_request *) * (MAX_ACCEPTQ_PRIO + 1))); +#else tp->accept_queue = tp->accept_queue_tail = NULL; +#endif if (lopt->qlen) { for (i=0; i<TCP_SYNQ_HSIZE; i++) { @@ -2109,6 +2119,9 @@ struct open_request *req; struct sock *newsk; int error; +#ifdef CONFIG_PRIO_ACCEPTQ + int prio; +#endif lock_sock(sk); @@ -2134,8 +2147,17 @@ } req = tp->accept_queue; +#ifdef CONFIG_PRIO_ACCEPTQ + tp->accept_queue = req->dl_next; + for (prio = 0; prio <= MAX_ACCEPTQ_PRIO; prio++) + if (req == 
tp->accept_queue_tail[prio]) { + tp->accept_queue_tail[prio] = NULL; + break; + } +#else if ((tp->accept_queue = req->dl_next) == NULL) tp->accept_queue_tail = NULL; +#endif newsk = req->sk; tcp_acceptq_removed(sk); diff -urN -X dontdiff linux-2.4.9/net/ipv4/tcp_ipv4.c linux-2.4.9-ppaq/net/ipv4/tcp_ipv4.c --- linux-2.4.9/net/ipv4/tcp_ipv4.c Wed Apr 25 14:57:39 2001 +++ linux-2.4.9-ppaq/net/ipv4/tcp_ipv4.c Mon Sep 17 18:49:01 2001 @@ -1262,6 +1262,21 @@ tcp_v4_send_reset }; +#ifdef CONFIG_PRIO_ACCEPTQ +static struct open_request *low_prio_req_in_acceptq(struct sock *sk, int prio) +{ + struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; + struct open_request *low_prio_req = NULL; + int tmp_prio; + + for (tmp_prio = MAX_ACCEPTQ_PRIO; tmp_prio > prio; tmp_prio--) + if ((low_prio_req = tp->accept_queue_tail[tmp_prio])) + break; + + return (low_prio_req); +} +#endif + int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) { struct tcp_opt tp; @@ -1299,7 +1314,19 @@ * clogging syn queue with openreqs with exponentially increasing * timeout. */ +#ifdef CONFIG_PRIO_ACCEPTQ + /* With Prioritized Accept Queue, a new condition is added so that an + * incoming SYN is dropped only if there are no lower priority + * connection requests in the acceptq. This is to avoid starvation of + * higher priority connection requests in the presence of persistent low + * priority connections filling up the acceptq. + */ + if (tcp_acceptq_is_full(sk) && + !(low_prio_req_in_acceptq(sk, (int)skb->nfmark)) && + tcp_synq_young(sk) > 1) +#else if (tcp_acceptq_is_full(sk) && tcp_synq_young(sk) > 1) +#endif goto drop; req = tcp_openreq_alloc(); @@ -1407,6 +1434,62 @@ return 0; } +#ifdef CONFIG_PRIO_ACCEPTQ +/* removes the req(last one) from sk's accept queue. 
*/ +static void remove_openreq_from_acceptq(struct sock *sk, struct open_request *req) +{ + struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; + struct open_request *tmp_req = NULL; + struct open_request *prev_req; + int req_prio = req->acceptq_prio; + int prio; + + /* find the last req in the next higher priority class */ + for (prio = req_prio-1; prio >= 0; prio--) + if ((tmp_req = tp->accept_queue_tail[prio])) + break; + + /* no higher priority class, start scanning from the start of acceptq */ + if (!tmp_req) + tmp_req = tp->accept_queue; + + prev_req = tmp_req; + + /* find the prev req */ + for (; tmp_req ; tmp_req = tmp_req->dl_next) { + if (tmp_req == req) + break; + prev_req = tmp_req; + } + + if (prev_req) { + prev_req->dl_next = NULL; + if (prev_req->acceptq_prio == req_prio) + tp->accept_queue_tail[req_prio] = prev_req; + else + tp->accept_queue_tail[req_prio] = NULL; + } else { + BUG_TRAP(prev_req != NULL); + } +} + +/* remove lreq from accept queue and add it to syn table */ +static void preempt_low_prio_req(struct sock *sk, struct open_request *lreq) +{ + struct sock *lsk; + + lsk = lreq->sk; + lreq->sk = NULL; + remove_openreq_from_acceptq(sk, lreq); + tcp_acceptq_removed(sk); + tcp_v4_synq_add(sk, lreq); + tcp_unhash(lsk); + tcp_set_state(lsk, TCP_CLOSE); + sock_orphan(lsk); + atomic_inc(&tcp_orphan_count); + tcp_destroy_sock(lsk); +} +#endif /* CONFIG_PRIO_ACCEPTQ */ /* * The three way handshake has completed - we got a valid synack - @@ -1419,8 +1502,35 @@ struct tcp_opt *newtp; struct sock *newsk; +#ifdef CONFIG_PRIO_ACCEPTQ + if (tcp_acceptq_is_full(sk)) { + struct open_request *low_prio_req; + + /* if there is a lower priority req in acceptq and we haven't + * acked any received data on the associated socket, move it to + * syn table so that the incoming higher priority req can be + * accepted. 
+ */ + if ((low_prio_req = low_prio_req_in_acceptq(sk, req->acceptq_prio))) { + struct sock *lsk = low_prio_req->sk; + struct tcp_opt *tp = &lsk->tp_pinfo.af_tcp; + + bh_lock_sock(lsk); + /* we haven't acked any data received */ + if (tp->rcv_wup == (low_prio_req->rcv_isn + 1)) { + preempt_low_prio_req(sk, low_prio_req); + bh_unlock_sock(lsk); + } else { + bh_unlock_sock(lsk); + goto exit_overflow; + } + } else + goto exit_overflow; + } +#else if (tcp_acceptq_is_full(sk)) goto exit_overflow; +#endif if (dst == NULL && (dst = tcp_v4_route_req(sk, req)) == NULL) diff -urN -X dontdiff linux-2.4.9/net/ipv4/tcp_minisocks.c linux-2.4.9-ppaq/net/ipv4/tcp_minisocks.c --- linux-2.4.9/net/ipv4/tcp_minisocks.c Wed Aug 15 01:22:17 2001 +++ linux-2.4.9-ppaq/net/ipv4/tcp_minisocks.c Mon Sep 17 18:49:18 2001 @@ -734,7 +734,12 @@ newtp->num_sacks = 0; newtp->urg_data = 0; newtp->listen_opt = NULL; +#ifdef CONFIG_PRIO_ACCEPTQ + newtp->accept_queue = NULL; + memset(newtp->accept_queue_tail, 0, (sizeof(struct open_request *) * (MAX_ACCEPTQ_PRIO + 1))); +#else newtp->accept_queue = newtp->accept_queue_tail = NULL; +#endif /* Deinitialize syn_wait_lock to trap illegal accesses. */ memset(&newtp->syn_wait_lock, 0, sizeof(newtp->syn_wait_lock)); @@ -802,6 +807,9 @@ int paws_reject = 0; struct tcp_opt ttp; struct sock *child; +#ifdef CONFIG_PRIO_ACCEPTQ + struct open_request *r1, *r2; +#endif ttp.saw_tstamp = 0; if (th->doff > (sizeof(struct tcphdr)>>2)) { @@ -913,10 +921,24 @@ * ESTABLISHED STATE. If it will be dropped after * socket is created, wait for troubles. */ +#ifdef CONFIG_PRIO_ACCEPTQ + r1 = *prev; +#endif child = tp->af_specific->syn_recv_sock(sk, skb, req, NULL); if (child == NULL) goto listen_overflow; +#ifdef CONFIG_PRIO_ACCEPTQ + r2 = *prev; + /* With Prioritized Accept Queues, it is possible that prev pointer can + * change in the above call to syn_recv_sock(). 
This can happen if an + * openreq is preempted and moved from acceptq to syn table and it + * hashes to the same bucket as 'req' and 'req' is the first entry in + * the hash bucket. If so, prev needs to be updated. + */ + if ((req == r1) && (r1 != r2)) + prev = &r2->dl_next; +#endif tcp_synq_unlink(tp, req, prev); tcp_synq_removed(sk, req); ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++