moving ipvs() to POST/PREROUTING

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Greetings,

Ok, things are mostly working now. The patch is a little messy, in that there 
are old comments remaining and function names are left as is, but hopefully 
it's reviewable. If it's not, I'll split it up and add appropriate comments...

Functions I've tested with the patch:
* LVS-NAT
* LVS-NAT + SNAT
* LVS-DR
* local node

With local node, 127.0.0.1 doesn't work but an IP address on a local interface 
does. When the address is 127.0.0.1, the SYN makes it all the way through 
INPUT, but the SYN/ACK doesn't come into OUTPUT. Something to investigate 
further... Also, null_xmit doesn't work as ipvs_in is being done in 
POSTROUTING, so I've simply aliased LOCAL to MASQ for the time being.

What I haven't tested:
* LVS-TUN
* ICMP for LVS-NAT
* IP_VS_CONN_F_BYPASS - what is this?
* likely others ;)

LVS-TUN should work as LVS-DR didn't require any direct modification, but it's 
a little bit of a pain to set up for testing at this stage. I'm not sure how 
I should go about testing the others.

I realized I haven't explained at all why I chose POST/PRE as the hook points. 
Firstly, here is the cropped output from a LOG target in every mangle table for 
the SYN and SYN/ACK of an LVS-NAT connection:

PREROUTING IN=eth0 OUT= SRC=192.168.0.104 DST=192.168.0.7 SYN
FORWARD IN=eth0 OUT=eth0 SRC=192.168.0.104 DST=192.168.0.7 SYN
POSTROUTING IN= OUT=eth0 SRC=192.168.0.104 DST=192.168.0.7 SYN
POSTROUTING IN= OUT=eth1 SRC=192.168.0.104 DST=192.168.1.3 SYN

PREROUTING IN=eth1 OUT= SRC=192.168.0.7 DST=192.168.0.104 ACK SYN
FORWARD IN=eth1 OUT=eth0 SRC=192.168.0.7 DST=192.168.0.104 ACK SYN
POSTROUTING IN= OUT=eth0 SRC=192.168.0.7 DST=192.168.0.104 ACK SYN

192.168.0.104 is the client, 192.168.0.7 is the VIP and 192.168.1.3 is the 
real server. Other than the second POSTROUTING entry on the SYN side, 
netfilter isn't dealing with the real server's IP at all. This will 
theoretically make writing firewall rules much easier and also limits what 
netfilter's conntracking has to deal with.

Actually, I don't know why the second POSTROUTING entry is there at all. It 
seems that after the packet is injected into the end of POSTROUTING, a 
routing decision is being made again and POSTROUTING is rerun. Preferably, the 
packet would go straight out the appropriate interface after ipvs_in is run.

Similar behaviour happens with a local node:

PREROUTING IN=eth0 OUT= SRC=192.168.0.104 DST=192.168.0.7 SYN
FORWARD IN=eth0 OUT=eth0 SRC=192.168.0.104 DST=192.168.0.7 SYN
POSTROUTING IN= OUT=eth0 SRC=192.168.0.104 DST=192.168.0.7 SYN
POSTROUTING IN= OUT=lo SRC=192.168.0.104 DST=192.168.0.5 SYN
PREROUTING IN=lo OUT= SRC=192.168.0.104 DST=192.168.0.5 SYN
INPUT IN=lo OUT= SRC=192.168.0.104 DST=192.168.0.5 SYN

OUTPUT IN= OUT=eth0 SRC=192.168.0.7 DST=192.168.0.104 ACK SYN
POSTROUTING IN= OUT=eth0 SRC=192.168.0.7 DST=192.168.0.104 ACK SYN

192.168.0.5 is an IP local to the director. I had to add the ipvs_out hooks to 
the beginning of OUTPUT as the local reply never hits PREROUTING. As with 
the above, I'd prefer that the POST/PRE/INPUT entries disappear.

Anyway, that's pretty much my intention. Is there any problem with essentially 
hiding the real servers from netfilter? Is there a way to get the packet out 
of the netfilter loop earlier?

-- 
Jason Stubbs <j.stubbs@xxxxxxxxxxxxxxx>
LINKTHINK INC.
東京都渋谷区桜ヶ丘町22-14 N.E.S S棟 3F
TEL 03-5728-4772  FAX 03-5728-4773
--- linux-2.6.24-gentoo-r4/net/ipv4/ipvs/ip_vs_core.c	2008-01-25 07:58:37.000000000 +0900
+++ linux-2.6.24-gentoo-r4-jason/net/ipv4/ipvs/ip_vs_core.c	2008-04-11 12:56:38.000000000 +0900
@@ -480,25 +480,6 @@
 }
 
 
-/*
- *      It is hooked before NF_IP_PRI_NAT_SRC at the NF_IP_POST_ROUTING
- *      chain, and is used for VS/NAT.
- *      It detects packets for VS/NAT connections and sends the packets
- *      immediately. This can avoid that iptable_nat mangles the packets
- *      for VS/NAT.
- */
-static unsigned int ip_vs_post_routing(unsigned int hooknum,
-				       struct sk_buff *skb,
-				       const struct net_device *in,
-				       const struct net_device *out,
-				       int (*okfn)(struct sk_buff *))
-{
-	if (!skb->ipvs_property)
-		return NF_ACCEPT;
-	/* The packet was sent from IPVS, exit this chain */
-	return NF_STOP;
-}
-
 __sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset)
 {
 	return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0));
@@ -767,15 +748,6 @@
 	ip_hdr(skb)->saddr = cp->vaddr;
 	ip_send_check(ip_hdr(skb));
 
-	/* For policy routing, packets originating from this
-	 * machine itself may be routed differently to packets
-	 * passing through.  We want this packet to be routed as
-	 * if it came from this machine itself.  So re-compute
-	 * the routing information.
-	 */
-	if (ip_route_me_harder(skb, RTN_LOCAL) != 0)
-		goto drop;
-
 	IP_VS_DBG_PKT(10, pp, skb, 0, "After SNAT");
 
 	ip_vs_out_stats(cp, skb);
@@ -905,18 +877,8 @@
 	int ret, restart;
 	int ihl;
 
-	/*
-	 *	Big tappo: only PACKET_HOST (neither loopback nor mcasts)
-	 *	... don't know why 1st test DOES NOT include 2nd (?)
-	 */
-	if (unlikely(skb->pkt_type != PACKET_HOST
-		     || skb->dev->flags & IFF_LOOPBACK || skb->sk)) {
-		IP_VS_DBG(12, "packet type=%d proto=%d daddr=%d.%d.%d.%d ignored\n",
-			  skb->pkt_type,
-			  ip_hdr(skb)->protocol,
-			  NIPQUAD(ip_hdr(skb)->daddr));
+	if (skb->ipvs_property)
 		return NF_ACCEPT;
-	}
 
 	iph = ip_hdr(skb);
 	if (unlikely(iph->protocol == IPPROTO_ICMP)) {
@@ -1032,8 +994,8 @@
 	.hook		= ip_vs_in,
 	.owner		= THIS_MODULE,
 	.pf		= PF_INET,
-	.hooknum        = NF_IP_LOCAL_IN,
-	.priority       = 100,
+	.hooknum        = NF_IP_POST_ROUTING,
+	.priority       = NF_IP_PRI_LAST,
 };
 
 /* After packet filtering, change source only for VS/NAT */
@@ -1041,8 +1003,17 @@
 	.hook		= ip_vs_out,
 	.owner		= THIS_MODULE,
 	.pf		= PF_INET,
-	.hooknum        = NF_IP_FORWARD,
-	.priority       = 100,
+	.hooknum        = NF_IP_PRE_ROUTING,
+	.priority       = NF_IP_PRI_FIRST+1,
+};
+
+/* After packet filtering, change source only for VS/NAT */
+static struct nf_hook_ops ip_vs_local_out_ops = {
+	.hook		= ip_vs_out,
+	.owner		= THIS_MODULE,
+	.pf		= PF_INET,
+	.hooknum        = NF_IP_LOCAL_OUT,
+	.priority       = NF_IP_PRI_FIRST+1,
 };
 
 /* After packet filtering (but before ip_vs_out_icmp), catch icmp
@@ -1051,17 +1022,18 @@
 	.hook		= ip_vs_forward_icmp,
 	.owner		= THIS_MODULE,
 	.pf		= PF_INET,
-	.hooknum        = NF_IP_FORWARD,
-	.priority       = 99,
+	.hooknum        = NF_IP_PRE_ROUTING,
+	.priority       = NF_IP_PRI_FIRST,
 };
 
-/* Before the netfilter connection tracking, exit from POST_ROUTING */
-static struct nf_hook_ops ip_vs_post_routing_ops = {
-	.hook		= ip_vs_post_routing,
+/* After packet filtering (but before ip_vs_out_icmp), catch icmp
+   destined for 0.0.0.0/0, which is for incoming IPVS connections */
+static struct nf_hook_ops ip_vs_local_icmp_ops = {
+	.hook		= ip_vs_forward_icmp,
 	.owner		= THIS_MODULE,
 	.pf		= PF_INET,
-	.hooknum        = NF_IP_POST_ROUTING,
-	.priority       = NF_IP_PRI_NAT_SRC-1,
+	.hooknum        = NF_IP_LOCAL_OUT,
+	.priority       = NF_IP_PRI_FIRST,
 };
 
 
@@ -1103,22 +1075,32 @@
 		IP_VS_ERR("can't register out hook.\n");
 		goto cleanup_inops;
 	}
-	ret = nf_register_hook(&ip_vs_post_routing_ops);
+
+	ret = nf_register_hook(&ip_vs_forward_icmp_ops);
 	if (ret < 0) {
-		IP_VS_ERR("can't register post_routing hook.\n");
+		IP_VS_ERR("can't register forward_icmp hook.\n");
 		goto cleanup_outops;
 	}
-	ret = nf_register_hook(&ip_vs_forward_icmp_ops);
+
+	ret = nf_register_hook(&ip_vs_local_out_ops);
 	if (ret < 0) {
-		IP_VS_ERR("can't register forward_icmp hook.\n");
-		goto cleanup_postroutingops;
+		IP_VS_ERR("can't register local out hook.\n");
+		goto cleanup_icmpops;
+	}
+
+	ret = nf_register_hook(&ip_vs_local_icmp_ops);
+	if (ret < 0) {
+		IP_VS_ERR("can't register local icmp hook.\n");
+		goto cleanup_localout;
 	}
 
 	IP_VS_INFO("ipvs loaded.\n");
 	return ret;
 
-  cleanup_postroutingops:
-	nf_unregister_hook(&ip_vs_post_routing_ops);
+  cleanup_localout:
+    nf_unregister_hook(&ip_vs_local_out_ops);
+  cleanup_icmpops:
+    nf_unregister_hook(&ip_vs_forward_icmp_ops);
   cleanup_outops:
 	nf_unregister_hook(&ip_vs_out_ops);
   cleanup_inops:
@@ -1137,7 +1119,6 @@
 static void __exit ip_vs_cleanup(void)
 {
 	nf_unregister_hook(&ip_vs_forward_icmp_ops);
-	nf_unregister_hook(&ip_vs_post_routing_ops);
 	nf_unregister_hook(&ip_vs_out_ops);
 	nf_unregister_hook(&ip_vs_in_ops);
 	ip_vs_conn_cleanup();
--- linux-2.6.24-gentoo-r4/net/ipv4/ipvs/ip_vs_ctl.c	2008-01-25 07:58:37.000000000 +0900
+++ linux-2.6.24-gentoo-r4-jason/net/ipv4/ipvs/ip_vs_ctl.c	2008-04-11 12:23:59.000000000 +0900
@@ -706,7 +706,7 @@
 	/* check if local node and update the flags */
 	if (inet_addr_type(udest->addr) == RTN_LOCAL) {
 		conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK)
-			| IP_VS_CONN_F_LOCALNODE;
+			| IP_VS_CONN_F_MASQ;
 	}
 
 	/* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
--- linux-2.6.24-gentoo-r4/net/ipv4/ipvs/ip_vs_xmit.c	2008-04-10 18:20:03.000000000 +0900
+++ linux-2.6.24-gentoo-r4-jason/net/ipv4/ipvs/ip_vs_xmit.c	2008-04-11 12:15:14.000000000 +0900
@@ -129,8 +129,8 @@
 do {							\
 	(skb)->ipvs_property = 1;			\
 	skb_forward_csum(skb);				\
-	NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, (skb), NULL,	\
-		(rt)->u.dst.dev, dst_output);		\
+	NF_HOOK_THRESH(PF_INET, NF_IP_POST_ROUTING, (skb), NULL,	\
+		(rt)->u.dst.dev, dst_output, NF_IP_PRI_LAST);		\
 } while (0)
 
 

[Index of Archives]     [Linux Filesystem Devel]     [Linux NFS]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux SCSI]     [X.Org]

  Powered by Linux