Re: More nf_conntrack_sip questions

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Patrick McHardy wrote:
Philip Prindeville wrote:
Patrick McHardy wrote:
Philip Prindeville wrote:
That version is known not to work very well, 2.6.26 includes
a largely rewritten version.

This is a backport to 2.6.25:

git://git.kernel.org/pub/scm/linux/kernel/git/kaber/nf-2.6.25-sip.git

Well, it might be easier to simply jump to 2.6.26.8 (unless there's something horribly wrong with that version).

Just means going through patch hell again as we munge patches for ocf, imq, squashfs, unionfs, multi-routing, etc.

Say, I've been wondering about that: what is the status of dead default gateway detection? Not sure why it requires a patch and isn't just something selectable via CONFIG_xxx...

I'm guessing the patch hasn't been submitted yet.

This might be more of a question for linux-net.

I tried to take the 2.6.25.15 patches (which worked with .19 just fine) and rewrite them for 2.6.26.8, but the changes related to CONFIG_NET_NS seem to be fouling us up... I'm a little confused about that, because the code:

+               if (ip_route_output_key(out->nd_net, &rt, &fl) != 0) {
+                       /* Funky routing can do this. */
+                       if (net_ratelimit())
+                               printk("MASQUERADE:"
+                                      " No route: Rusty's brain broke!\n");
+                       return NF_DROP;
+               }

compiled just fine in 2.6.25.19 as I said...

I haven't groveled through all the changes from 2.6.25.19 to 2.6.26.8...  In 2.6.26 I'm seeing:

#ifdef CONFIG_NET_NS
       /* Network namespace this network device is inside */
       struct net              *nd_net;
#endif

in include/linux/netdevice.h.  In 2.6.25.19 I see:

       /* Network namespace this network device is inside */
       struct net              *nd_net;

(no #ifdef's).

Can someone please look over my patch and confirm its correctness?  Mostly I took:

+                               if (fib_lookup(dev->nd_net, &fl, &res) != 0)
+                                       continue;
+                               if (res.type != RTN_UNICAST &&
+                                   res.type != RTN_LOCAL) {
+                                       fib_res_put(&res);
+                                       continue;
+                               }

and rewrote it as:

+ #ifdef CONFIG_NET_NS
+                               if (fib_lookup(dev->nd_net, &fl, &res) != 0)
+ #else
+                               if (fib_lookup(&init_net, &fl, &res) != 0)
+ #endif
+                                       continue;
+                               if (res.type != RTN_UNICAST &&
+                                   res.type != RTN_LOCAL) {
+                                       fib_res_put(&res);
+                                       continue;
+                               }



Ditto for:

+ #ifdef CONFIG_NET_NS
+               if (ip_route_output_key(out->nd_net, &rt, &fl) != 0) {
+ #else
+               if (ip_route_output_key(&init_net, &rt, &fl) != 0) {
+ #endif
+                       /* Funky routing can do this. */
+                       if (net_ratelimit())
+                               printk("MASQUERADE:"
+                                      " No route: Rusty's brain broke!\n");
+                       return NF_DROP;
+               }



Might be handy to have a convenience wrapper that takes a struct net_device *, and returns its ->nd_net if CONFIG_NET_NS is enabled, and otherwise returns &init_net...

Maybe like nf_forward_net() works, but simpler. Or should I have written the patches above using nf_local_out_net() instead???

It would be nice if the patch (the original, not my tweaked version) became part of the upstream source. One of the things about the patch is that it's not a compile-time option. It has sequences such as:

-#ifdef CONFIG_IP_ROUTE_MULTIPATH
-       if (FIB_RES_DEV(res) == dev || res.fi->fib_nhs > 1)
-#else
       if (FIB_RES_DEV(res) == dev)
-#endif

and:

-#ifdef CONFIG_IP_ROUTE_MULTIPATH
               fib_sync_up(ifa->ifa_dev->dev);
-#endif


which I don't understand. You can't simply turn it off or on via CONFIG_IP_ROUTE_MULTIPATH=y ...

Thanks,

-Philip




diff -urp v2.6.25/linux/include/linux/rtnetlink.h linux/include/linux/rtnetlink.h
--- v2.6.25/linux/include/linux/rtnetlink.h	2008-04-17 09:58:08.000000000 +0300
+++ linux/include/linux/rtnetlink.h	2008-04-19 18:30:04.000000000 +0300
@@ -303,6 +303,8 @@ struct rtnexthop
 #define RTNH_F_DEAD		1	/* Nexthop is dead (used by multipath)	*/
 #define RTNH_F_PERVASIVE	2	/* Do recursive gateway lookup	*/
 #define RTNH_F_ONLINK		4	/* Gateway is forced on link	*/
+#define RTNH_F_SUSPECT		8	/* We don't know the real state	*/
+#define RTNH_F_BADSTATE		(RTNH_F_DEAD | RTNH_F_SUSPECT)
 
 /* Macros to handle hexthops */
 
diff -urp v2.6.25/linux/include/net/flow.h linux/include/net/flow.h
--- v2.6.25/linux/include/net/flow.h	2008-04-17 09:58:08.000000000 +0300
+++ linux/include/net/flow.h	2008-04-19 18:30:17.000000000 +0300
@@ -19,6 +19,8 @@ struct flowi {
 		struct {
 			__be32			daddr;
 			__be32			saddr;
+			__be32			lsrc;
+			__be32			gw;
 			__u8			tos;
 			__u8			scope;
 		} ip4_u;
@@ -43,6 +45,8 @@ struct flowi {
 #define fl6_flowlabel	nl_u.ip6_u.flowlabel
 #define fl4_dst		nl_u.ip4_u.daddr
 #define fl4_src		nl_u.ip4_u.saddr
+#define fl4_lsrc	nl_u.ip4_u.lsrc
+#define fl4_gw		nl_u.ip4_u.gw
 #define fl4_tos		nl_u.ip4_u.tos
 #define fl4_scope	nl_u.ip4_u.scope
 
diff -urp v2.6.25/linux/include/net/ip_fib.h linux/include/net/ip_fib.h
--- v2.6.25/linux/include/net/ip_fib.h	2008-04-17 09:58:08.000000000 +0300
+++ linux/include/net/ip_fib.h	2008-04-19 18:30:04.000000000 +0300
@@ -207,6 +207,8 @@ extern int fib_lookup(struct net *n, str
 extern struct fib_table *fib_new_table(struct net *net, u32 id);
 extern struct fib_table *fib_get_table(struct net *net, u32 id);
 
+extern int fib_result_table(struct fib_result *res);
+
 #endif /* CONFIG_IP_MULTIPLE_TABLES */
 
 /* Exported by fib_frontend.c */
@@ -276,4 +278,6 @@ static inline void fib_proc_exit(struct 
 }
 #endif
 
+extern rwlock_t fib_nhflags_lock;
+
 #endif  /* _NET_FIB_H */
diff -urp v2.6.25/linux/include/net/netfilter/nf_nat.h linux/include/net/netfilter/nf_nat.h
--- v2.6.25/linux/include/net/netfilter/nf_nat.h	2008-04-17 09:58:08.000000000 +0300
+++ linux/include/net/netfilter/nf_nat.h	2008-04-19 18:30:17.000000000 +0300
@@ -77,6 +77,13 @@ struct nf_conn_nat
 #endif
 };
 
+/* Call input routing for SNAT-ed traffic */
+extern unsigned int ip_nat_route_input(unsigned int hooknum,
+				       struct sk_buff *skb,
+				       const struct net_device *in,
+				       const struct net_device *out,
+				       int (*okfn)(struct sk_buff *));
+
 /* Set up the info structure to map into this range. */
 extern unsigned int nf_nat_setup_info(struct nf_conn *ct,
 				      const struct nf_nat_range *range,
diff -urp v2.6.25/linux/include/net/route.h linux/include/net/route.h
--- v2.6.25/linux/include/net/route.h	2008-04-17 09:58:08.000000000 +0300
+++ linux/include/net/route.h	2008-04-19 18:30:17.000000000 +0300
@@ -116,6 +116,7 @@
 extern int		ip_route_output_key(struct net *, struct rtable **, struct flowi *flp);
 extern int		ip_route_output_flow(struct net *, struct rtable **rp, struct flowi *flp, struct sock *sk, int flags);
 extern int		ip_route_input(struct sk_buff*, __be32 dst, __be32 src, u8 tos, struct net_device *devin);
+extern int		ip_route_input_lookup(struct sk_buff*, __be32 dst, __be32 src, u8 tos, struct net_device *devin, __be32 lsrc);
 extern unsigned short	ip_rt_frag_needed(struct net *net, struct iphdr *iph, unsigned short new_mtu, struct net_device *dev);
 extern void		ip_rt_send_redirect(struct sk_buff *skb);
 
diff -urp v2.6.25/linux/net/bridge/br_netfilter.c linux/net/bridge/br_netfilter.c
--- v2.6.25/linux/net/bridge/br_netfilter.c	2008-04-17 09:58:08.000000000 +0300
+++ linux/net/bridge/br_netfilter.c	2008-04-19 18:30:17.000000000 +0300
@@ -325,6 +325,10 @@ static int br_nf_pre_routing_finish(stru
 	struct nf_bridge_info *nf_bridge = skb->nf_bridge;
 	int err;
 
+	/* Old skb->dst is not expected, it is lost in all cases */
+	dst_release(skb->dst);
+	skb->dst = NULL;
+
 	if (nf_bridge->mask & BRNF_PKT_TYPE) {
 		skb->pkt_type = PACKET_OTHERHOST;
 		nf_bridge->mask ^= BRNF_PKT_TYPE;
diff -urp v2.6.25/linux/net/ipv4/fib_frontend.c linux/net/ipv4/fib_frontend.c
--- v2.6.25/linux/net/ipv4/fib_frontend.c	2008-04-17 09:58:09.000000000 +0300
+++ linux/net/ipv4/fib_frontend.c	2008-04-19 18:30:04.000000000 +0300
@@ -49,6 +49,8 @@
 
 #ifndef CONFIG_IP_MULTIPLE_TABLES
 
+#define FIB_RES_TABLE(r) (RT_TABLE_MAIN)
+
 static int __net_init fib4_rules_init(struct net *net)
 {
 	struct fib_table *local_table, *main_table;
@@ -73,6 +75,8 @@ fail:
 }
 #else
 
+#define FIB_RES_TABLE(r) (fib_result_table(r))
+
 struct fib_table *fib_new_table(struct net *net, u32 id)
 {
 	struct fib_table *tb;
@@ -127,7 +131,8 @@ void fib_select_default(struct net *net,
 	table = res->r->table;
 #endif
 	tb = fib_get_table(net, table);
-	if (FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
+	if ((FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) ||
+	    FIB_RES_NH(*res).nh_scope == RT_SCOPE_HOST)
 		tb->tb_select_default(tb, flp, res);
 }
 
@@ -241,6 +246,9 @@ int fib_validate_source(__be32 src, __be
 					.tos = tos } },
 			    .iif = oif };
 	struct fib_result res;
+	int table;
+	unsigned char prefixlen;
+	unsigned char scope;
 	int no_addr, rpf;
 	int ret;
 	struct net *net;
@@ -264,31 +272,35 @@ int fib_validate_source(__be32 src, __be
 		goto e_inval_res;
 	*spec_dst = FIB_RES_PREFSRC(res);
 	fib_combine_itag(itag, &res);
-#ifdef CONFIG_IP_ROUTE_MULTIPATH
-	if (FIB_RES_DEV(res) == dev || res.fi->fib_nhs > 1)
-#else
 	if (FIB_RES_DEV(res) == dev)
-#endif
 	{
 		ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
 		fib_res_put(&res);
 		return ret;
 	}
+	table = FIB_RES_TABLE(&res);
+	prefixlen = res.prefixlen;
+	scope = res.scope;
 	fib_res_put(&res);
 	if (no_addr)
 		goto last_resort;
-	if (rpf)
-		goto e_inval;
 	fl.oif = dev->ifindex;
 
 	ret = 0;
 	if (fib_lookup(net, &fl, &res) == 0) {
-		if (res.type == RTN_UNICAST) {
+		if (res.type == RTN_UNICAST &&
+		    ((table == FIB_RES_TABLE(&res) &&
+		      res.prefixlen >= prefixlen && res.scope >= scope) ||
+		     !rpf)) {
 			*spec_dst = FIB_RES_PREFSRC(res);
 			ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
+			fib_res_put(&res);
+			return ret;
 		}
 		fib_res_put(&res);
 	}
+	if (rpf)
+		goto e_inval;
 	return ret;
 
 last_resort:
@@ -911,9 +923,7 @@ static int fib_inetaddr_event(struct not
 	switch (event) {
 	case NETDEV_UP:
 		fib_add_ifaddr(ifa);
-#ifdef CONFIG_IP_ROUTE_MULTIPATH
 		fib_sync_up(ifa->ifa_dev->dev);
-#endif
 		rt_cache_flush(-1);
 		break;
 	case NETDEV_DOWN:
@@ -949,9 +959,7 @@ static int fib_netdev_event(struct notif
 		for_ifa(in_dev) {
 			fib_add_ifaddr(ifa);
 		} endfor_ifa(in_dev);
-#ifdef CONFIG_IP_ROUTE_MULTIPATH
 		fib_sync_up(dev);
-#endif
 		rt_cache_flush(-1);
 		break;
 	case NETDEV_DOWN:
diff -urp v2.6.25/linux/net/ipv4/fib_hash.c linux/net/ipv4/fib_hash.c
--- v2.6.25/linux/net/ipv4/fib_hash.c	2008-04-17 09:58:09.000000000 +0300
+++ linux/net/ipv4/fib_hash.c	2008-04-19 18:30:04.000000000 +0300
@@ -280,25 +280,35 @@ out:
 static void
 fn_hash_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
 {
-	int order, last_idx;
+	int order, last_idx, last_dflt, last_nhsel;
+	struct fib_alias *first_fa = NULL;
+	struct hlist_head *head;
 	struct hlist_node *node;
 	struct fib_node *f;
 	struct fib_info *fi = NULL;
 	struct fib_info *last_resort;
 	struct fn_hash *t = (struct fn_hash*)tb->tb_data;
-	struct fn_zone *fz = t->fn_zones[0];
+	struct fn_zone *fz = t->fn_zones[res->prefixlen];
+	__be32 k;
 
 	if (fz == NULL)
 		return;
 
+	k = fz_key(flp->fl4_dst, fz);
+	last_dflt = -2;
+	last_nhsel = 0;
 	last_idx = -1;
 	last_resort = NULL;
 	order = -1;
 
 	read_lock(&fib_hash_lock);
-	hlist_for_each_entry(f, node, &fz->fz_hash[0], fn_hash) {
+	head = &fz->fz_hash[fn_hash(k, fz)];
+	hlist_for_each_entry(f, node, head, fn_hash) {
 		struct fib_alias *fa;
 
+		if (f->fn_key != k)
+			continue;
+
 		list_for_each_entry(fa, &f->fn_alias, fa_list) {
 			struct fib_info *next_fi = fa->fa_info;
 
@@ -306,42 +316,56 @@ fn_hash_select_default(struct fib_table 
 			    fa->fa_type != RTN_UNICAST)
 				continue;
 
+			if (fa->fa_tos &&
+			    fa->fa_tos != flp->fl4_tos)
+				continue;
 			if (next_fi->fib_priority > res->fi->fib_priority)
 				break;
-			if (!next_fi->fib_nh[0].nh_gw ||
-			    next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
-				continue;
 			fa->fa_state |= FA_S_ACCESSED;
 
-			if (fi == NULL) {
-				if (next_fi != res->fi)
-					break;
-			} else if (!fib_detect_death(fi, order, &last_resort,
-						&last_idx, tb->tb_default)) {
+			if (!first_fa) {
+				last_dflt = fa->fa_last_dflt;
+				first_fa = fa;
+			}
+			if (fi && !fib_detect_death(fi, order, &last_resort,
+				&last_idx, &last_dflt, &last_nhsel, flp)) {
 				fib_result_assign(res, fi);
-				tb->tb_default = order;
+				first_fa->fa_last_dflt = order;
 				goto out;
 			}
 			fi = next_fi;
 			order++;
 		}
+		break;
 	}
 
 	if (order <= 0 || fi == NULL) {
-		tb->tb_default = -1;
+		if (fi && fi->fib_nhs > 1 &&
+		    fib_detect_death(fi, order, &last_resort, &last_idx,
+			&last_dflt, &last_nhsel, flp) &&
+		    last_resort == fi) {
+			read_lock_bh(&fib_nhflags_lock);
+			fi->fib_nh[last_nhsel].nh_flags &= ~RTNH_F_SUSPECT;
+			read_unlock_bh(&fib_nhflags_lock);
+		}
+		if (first_fa) first_fa->fa_last_dflt = -1;
 		goto out;
 	}
 
 	if (!fib_detect_death(fi, order, &last_resort, &last_idx,
-				tb->tb_default)) {
+			      &last_dflt, &last_nhsel, flp)) {
 		fib_result_assign(res, fi);
-		tb->tb_default = order;
+		first_fa->fa_last_dflt = order;
 		goto out;
 	}
 
-	if (last_idx >= 0)
+	if (last_idx >= 0) {
 		fib_result_assign(res, last_resort);
-	tb->tb_default = last_idx;
+		read_lock_bh(&fib_nhflags_lock);
+		last_resort->fib_nh[last_nhsel].nh_flags &= ~RTNH_F_SUSPECT;
+		read_unlock_bh(&fib_nhflags_lock);
+		first_fa->fa_last_dflt = last_idx;
+	}
 out:
 	read_unlock(&fib_hash_lock);
 }
@@ -465,6 +489,7 @@ static int fn_hash_insert(struct fib_tab
 			write_lock_bh(&fib_hash_lock);
 			fi_drop = fa->fa_info;
 			fa->fa_info = fi;
+			fa->fa_last_dflt = -1;
 			fa->fa_type = cfg->fc_type;
 			fa->fa_scope = cfg->fc_scope;
 			state = fa->fa_state;
@@ -519,6 +544,7 @@ static int fn_hash_insert(struct fib_tab
 	new_fa->fa_type = cfg->fc_type;
 	new_fa->fa_scope = cfg->fc_scope;
 	new_fa->fa_state = 0;
+	new_fa->fa_last_dflt = -1;
 
 	/*
 	 * Insert new entry to the list.
diff -urp v2.6.25/linux/net/ipv4/fib_lookup.h linux/net/ipv4/fib_lookup.h
--- v2.6.25/linux/net/ipv4/fib_lookup.h	2008-04-17 09:58:09.000000000 +0300
+++ linux/net/ipv4/fib_lookup.h	2008-04-19 18:30:04.000000000 +0300
@@ -8,6 +8,7 @@
 struct fib_alias {
 	struct list_head	fa_list;
 	struct fib_info		*fa_info;
+	int			fa_last_dflt;
 	u8			fa_tos;
 	u8			fa_type;
 	u8			fa_scope;
@@ -38,7 +39,8 @@ extern struct fib_alias *fib_find_alias(
 					u8 tos, u32 prio);
 extern int fib_detect_death(struct fib_info *fi, int order,
 			    struct fib_info **last_resort,
-			    int *last_idx, int dflt);
+			    int *last_idx, int *dflt, int *last_nhsel,
+			    const struct flowi *flp);
 
 static inline void fib_result_assign(struct fib_result *res,
 				     struct fib_info *fi)
diff -urp v2.6.25/linux/net/ipv4/fib_rules.c linux/net/ipv4/fib_rules.c
--- v2.6.25/linux/net/ipv4/fib_rules.c	2008-04-17 09:58:09.000000000 +0300
+++ linux/net/ipv4/fib_rules.c	2008-04-19 18:30:04.000000000 +0300
@@ -54,6 +54,11 @@ u32 fib_rules_tclass(struct fib_result *
 }
 #endif
 
+int fib_result_table(struct fib_result *res)
+{
+	return res->r->table;
+}
+
 int fib_lookup(struct net *net, struct flowi *flp, struct fib_result *res)
 {
 	struct fib_lookup_arg arg = {
diff -urp v2.6.25/linux/net/ipv4/fib_semantics.c linux/net/ipv4/fib_semantics.c
--- v2.6.25/linux/net/ipv4/fib_semantics.c	2008-04-17 09:58:09.000000000 +0300
+++ linux/net/ipv4/fib_semantics.c	2008-04-19 18:30:17.000000000 +0300
@@ -52,6 +52,7 @@ static struct hlist_head *fib_info_hash;
 static struct hlist_head *fib_info_laddrhash;
 static unsigned int fib_hash_size;
 static unsigned int fib_info_cnt;
+rwlock_t fib_nhflags_lock = RW_LOCK_UNLOCKED;
 
 #define DEVINDEX_HASHBITS 8
 #define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS)
@@ -187,7 +188,7 @@ static __inline__ int nh_comp(const stru
 #ifdef CONFIG_NET_CLS_ROUTE
 		    nh->nh_tclassid != onh->nh_tclassid ||
 #endif
-		    ((nh->nh_flags^onh->nh_flags)&~RTNH_F_DEAD))
+		    ((nh->nh_flags^onh->nh_flags)&~RTNH_F_BADSTATE))
 			return -1;
 		onh++;
 	} endfor_nexthops(fi);
@@ -238,7 +239,7 @@ static struct fib_info *fib_find_info(co
 		    nfi->fib_priority == fi->fib_priority &&
 		    memcmp(nfi->fib_metrics, fi->fib_metrics,
 			   sizeof(fi->fib_metrics)) == 0 &&
-		    ((nfi->fib_flags^fi->fib_flags)&~RTNH_F_DEAD) == 0 &&
+		    ((nfi->fib_flags^fi->fib_flags)&~RTNH_F_BADSTATE) == 0 &&
 		    (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0))
 			return fi;
 	}
@@ -349,26 +350,70 @@ struct fib_alias *fib_find_alias(struct 
 }
 
 int fib_detect_death(struct fib_info *fi, int order,
-		     struct fib_info **last_resort, int *last_idx, int dflt)
+		     struct fib_info **last_resort, int *last_idx, int *dflt,
+		     int *last_nhsel, const struct flowi *flp)
 {
 	struct neighbour *n;
-	int state = NUD_NONE;
+	int nhsel;
+	int state;
+	struct fib_nh * nh;
+	__be32 dst;
+	int flag, dead = 1;
+
+	/* change_nexthops(fi) { */
+	for (nhsel = 0, nh = fi->fib_nh; nhsel < fi->fib_nhs; nh++, nhsel++) {
+		if (flp->oif && flp->oif != nh->nh_oif)
+			continue;
+		if (flp->fl4_gw && flp->fl4_gw != nh->nh_gw && nh->nh_gw &&
+		    nh->nh_scope == RT_SCOPE_LINK)
+			continue;
+		if (nh->nh_flags & RTNH_F_DEAD)
+			continue;
 
-	n = neigh_lookup(&arp_tbl, &fi->fib_nh[0].nh_gw, fi->fib_dev);
-	if (n) {
-		state = n->nud_state;
-		neigh_release(n);
-	}
-	if (state==NUD_REACHABLE)
-		return 0;
-	if ((state&NUD_VALID) && order != dflt)
-		return 0;
-	if ((state&NUD_VALID) ||
-	    (*last_idx<0 && order > dflt)) {
-		*last_resort = fi;
-		*last_idx = order;
+		flag = 0;
+		if (nh->nh_dev->flags & IFF_NOARP) {
+			dead = 0;
+			goto setfl;
+		}
+
+		dst = nh->nh_gw;
+		if (!nh->nh_gw || nh->nh_scope != RT_SCOPE_LINK)
+			dst = flp->fl4_dst;
+
+		state = NUD_NONE;
+		n = neigh_lookup(&arp_tbl, &dst, nh->nh_dev);
+		if (n) {
+			state = n->nud_state;
+			neigh_release(n);
+		}
+		if (state==NUD_REACHABLE ||
+			((state&NUD_VALID) && order != *dflt)) {
+			dead = 0;
+			goto setfl;
+		}
+		if (!(state&NUD_VALID))
+			flag = 1;
+		if (!dead)
+			goto setfl;
+		if ((state&NUD_VALID) ||
+		    (*last_idx<0 && order >= *dflt)) {
+			*last_resort = fi;
+			*last_idx = order;
+			*last_nhsel = nhsel;
+		}
+
+		setfl:
+
+		read_lock_bh(&fib_nhflags_lock);
+		if (flag)
+			nh->nh_flags |= RTNH_F_SUSPECT;
+		else
+			nh->nh_flags &= ~RTNH_F_SUSPECT;
+		read_unlock_bh(&fib_nhflags_lock);
 	}
-	return 1;
+	/* } endfor_nexthops(fi) */
+
+	return dead;
 }
 
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
@@ -540,8 +585,11 @@ static int fib_check_nh(struct fib_confi
 				return -EINVAL;
 			if ((dev = __dev_get_by_index(net, nh->nh_oif)) == NULL)
 				return -ENODEV;
-			if (!(dev->flags&IFF_UP))
-				return -ENETDOWN;
+			if (!(dev->flags&IFF_UP)) {
+				if (fi->fib_protocol != RTPROT_STATIC)
+					return -ENETDOWN;
+				nh->nh_flags |= RTNH_F_DEAD;
+			}
 			nh->nh_dev = dev;
 			dev_hold(dev);
 			nh->nh_scope = RT_SCOPE_LINK;
@@ -561,24 +609,48 @@ static int fib_check_nh(struct fib_confi
 			/* It is not necessary, but requires a bit of thinking */
 			if (fl.fl4_scope < RT_SCOPE_LINK)
 				fl.fl4_scope = RT_SCOPE_LINK;
-			if ((err = fib_lookup(net, &fl, &res)) != 0)
-				return err;
+			err = fib_lookup(net, &fl, &res);
 		}
-		err = -EINVAL;
-		if (res.type != RTN_UNICAST && res.type != RTN_LOCAL)
-			goto out;
-		nh->nh_scope = res.scope;
-		nh->nh_oif = FIB_RES_OIF(res);
-		if ((nh->nh_dev = FIB_RES_DEV(res)) == NULL)
-			goto out;
-		dev_hold(nh->nh_dev);
-		err = -ENETDOWN;
-		if (!(nh->nh_dev->flags & IFF_UP))
-			goto out;
-		err = 0;
+		if (err) {
+			struct in_device *in_dev;
+
+			if (err != -ENETUNREACH ||
+			    fi->fib_protocol != RTPROT_STATIC)
+				return err;
+
+			in_dev = inetdev_by_index(net, nh->nh_oif);
+			if (in_dev == NULL ||
+			    in_dev->dev->flags & IFF_UP) {
+				if (in_dev)
+					in_dev_put(in_dev);
+				return err;
+			}
+			nh->nh_flags |= RTNH_F_DEAD;
+			nh->nh_scope = RT_SCOPE_LINK;
+			nh->nh_dev = in_dev->dev;
+			dev_hold(nh->nh_dev);
+			in_dev_put(in_dev);
+		} else {
+			err = -EINVAL;
+			if (res.type != RTN_UNICAST && res.type != RTN_LOCAL)
+				goto out;
+			nh->nh_scope = res.scope;
+			nh->nh_oif = FIB_RES_OIF(res);
+			if ((nh->nh_dev = FIB_RES_DEV(res)) == NULL)
+				goto out;
+			dev_hold(nh->nh_dev);
+			if (!(nh->nh_dev->flags & IFF_UP)) {
+				if (fi->fib_protocol != RTPROT_STATIC) {
+					err = -ENETDOWN;
+					goto out;
+				}
+				nh->nh_flags |= RTNH_F_DEAD;
+			}
+			err = 0;
 out:
-		fib_res_put(&res);
-		return err;
+			fib_res_put(&res);
+			return err;
+		}
 	} else {
 		struct in_device *in_dev;
 
@@ -589,8 +661,11 @@ out:
 		if (in_dev == NULL)
 			return -ENODEV;
 		if (!(in_dev->dev->flags&IFF_UP)) {
-			in_dev_put(in_dev);
-			return -ENETDOWN;
+			if (fi->fib_protocol != RTPROT_STATIC) {
+				in_dev_put(in_dev);
+				return -ENETDOWN;
+			}
+			nh->nh_flags |= RTNH_F_DEAD;
 		}
 		nh->nh_dev = in_dev->dev;
 		dev_hold(nh->nh_dev);
@@ -900,8 +975,12 @@ int fib_semantic_match(struct list_head 
 				for_nexthops(fi) {
 					if (nh->nh_flags&RTNH_F_DEAD)
 						continue;
-					if (!flp->oif || flp->oif == nh->nh_oif)
-						break;
+					if (flp->oif && flp->oif != nh->nh_oif)
+						continue;
+					if (flp->fl4_gw && flp->fl4_gw != nh->nh_gw &&
+					    nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK)
+						continue;
+					break;
 				}
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 				if (nhsel < fi->fib_nhs) {
@@ -1078,18 +1157,29 @@ int fib_sync_down_dev(struct net_device 
 		prev_fi = fi;
 		dead = 0;
 		change_nexthops(fi) {
-			if (nh->nh_flags&RTNH_F_DEAD)
-				dead++;
-			else if (nh->nh_dev == dev &&
-					nh->nh_scope != scope) {
-				nh->nh_flags |= RTNH_F_DEAD;
+			if (nh->nh_flags&RTNH_F_DEAD) {
+				if (fi->fib_protocol!=RTPROT_STATIC ||
+				    nh->nh_dev == NULL ||
+				    __in_dev_get_rtnl(nh->nh_dev) == NULL ||
+				    nh->nh_dev->flags&IFF_UP)
+					dead++;
+			} else if (nh->nh_dev == dev &&
+				   nh->nh_scope != scope) {
+				write_lock_bh(&fib_nhflags_lock);
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
-				spin_lock_bh(&fib_multipath_lock);
+				spin_lock(&fib_multipath_lock);
+				nh->nh_flags |= RTNH_F_DEAD;
 				fi->fib_power -= nh->nh_power;
 				nh->nh_power = 0;
-				spin_unlock_bh(&fib_multipath_lock);
+				spin_unlock(&fib_multipath_lock);
+#else
+				nh->nh_flags |= RTNH_F_DEAD;
 #endif
-				dead++;
+				write_unlock_bh(&fib_nhflags_lock);
+				if (fi->fib_protocol!=RTPROT_STATIC ||
+				    force ||
+				    __in_dev_get_rtnl(dev) == NULL)
+					dead++;
 			}
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 			if (force > 1 && nh->nh_dev == dev) {
@@ -1107,11 +1197,8 @@ int fib_sync_down_dev(struct net_device 
 	return ret;
 }
 
-#ifdef CONFIG_IP_ROUTE_MULTIPATH
-
 /*
-   Dead device goes up. We wake up dead nexthops.
-   It takes sense only on multipath routes.
+   Dead device goes up or new address is added. We wake up dead nexthops.
  */
 
 int fib_sync_up(struct net_device *dev)
@@ -1121,8 +1208,10 @@ int fib_sync_up(struct net_device *dev)
 	struct hlist_head *head;
 	struct hlist_node *node;
 	struct fib_nh *nh;
-	int ret;
+	struct fib_result res;
+	int ret, rep;
 
+repeat:
 	if (!(dev->flags&IFF_UP))
 		return 0;
 
@@ -1130,6 +1219,7 @@ int fib_sync_up(struct net_device *dev)
 	hash = fib_devindex_hashfn(dev->ifindex);
 	head = &fib_info_devhash[hash];
 	ret = 0;
+	rep = 0;
 
 	hlist_for_each_entry(nh, node, head, nh_hash) {
 		struct fib_info *fi = nh->nh_parent;
@@ -1142,19 +1232,43 @@ int fib_sync_up(struct net_device *dev)
 		prev_fi = fi;
 		alive = 0;
 		change_nexthops(fi) {
-			if (!(nh->nh_flags&RTNH_F_DEAD)) {
-				alive++;
+			if (!(nh->nh_flags&RTNH_F_DEAD))
 				continue;
-			}
 			if (nh->nh_dev == NULL || !(nh->nh_dev->flags&IFF_UP))
 				continue;
 			if (nh->nh_dev != dev || !__in_dev_get_rtnl(dev))
 				continue;
+			if (nh->nh_gw && fi->fib_protocol == RTPROT_STATIC) {
+				struct flowi fl = {
+					.nl_u = { .ip4_u =
+						  { .daddr = nh->nh_gw,
+						    .scope = nh->nh_scope } },
+					.oif =  nh->nh_oif,
+				};
+ #ifdef CONFIG_NET_NS
+				if (fib_lookup(dev->nd_net, &fl, &res) != 0)
+ #else
+				if (fib_lookup(&init_net, &fl, &res) != 0)
+ #endif
+					continue;
+				if (res.type != RTN_UNICAST &&
+				    res.type != RTN_LOCAL) {
+					fib_res_put(&res);
+					continue;
+				}
+				nh->nh_scope = res.scope;
+				fib_res_put(&res);
+				rep = 1;
+			}
 			alive++;
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
 			spin_lock_bh(&fib_multipath_lock);
 			nh->nh_power = 0;
+#endif
 			nh->nh_flags &= ~RTNH_F_DEAD;
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
 			spin_unlock_bh(&fib_multipath_lock);
+#endif
 		} endfor_nexthops(fi)
 
 		if (alive > 0) {
@@ -1162,10 +1272,14 @@ int fib_sync_up(struct net_device *dev)
 			ret++;
 		}
 	}
+	if (rep)
+		goto repeat;
 
 	return ret;
 }
 
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+
 /*
    The algorithm is suboptimal, but it provides really
    fair weighted route distribution.
@@ -1174,24 +1288,45 @@ int fib_sync_up(struct net_device *dev)
 void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
 {
 	struct fib_info *fi = res->fi;
-	int w;
+	int w, alive;
 
 	spin_lock_bh(&fib_multipath_lock);
+	if (flp->oif) {
+		int sel = -1;
+		w = -1;
+		change_nexthops(fi) {
+			if (flp->oif != nh->nh_oif)
+				continue;
+			if (flp->fl4_gw && flp->fl4_gw != nh->nh_gw &&
+			    nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK)
+				continue;
+			if (!(nh->nh_flags&RTNH_F_BADSTATE)) {
+				if (nh->nh_power > w) {
+					w = nh->nh_power;
+					sel = nhsel;
+				}
+			}
+		} endfor_nexthops(fi);
+		if (sel >= 0) {
+			spin_unlock_bh(&fib_multipath_lock);
+			res->nh_sel = sel;
+			return;
+		}
+		goto last_resort;
+	}
+
+repeat:
 	if (fi->fib_power <= 0) {
 		int power = 0;
 		change_nexthops(fi) {
-			if (!(nh->nh_flags&RTNH_F_DEAD)) {
+			if (!(nh->nh_flags&RTNH_F_BADSTATE)) {
 				power += nh->nh_weight;
 				nh->nh_power = nh->nh_weight;
 			}
 		} endfor_nexthops(fi);
 		fi->fib_power = power;
-		if (power <= 0) {
-			spin_unlock_bh(&fib_multipath_lock);
-			/* Race condition: route has just become dead. */
-			res->nh_sel = 0;
-			return;
-		}
+		if (power <= 0)
+			goto last_resort;
 	}
 
 
@@ -1201,20 +1336,40 @@ void fib_select_multipath(const struct f
 
 	w = jiffies % fi->fib_power;
 
+	alive = 0;
 	change_nexthops(fi) {
-		if (!(nh->nh_flags&RTNH_F_DEAD) && nh->nh_power) {
+		if (!(nh->nh_flags&RTNH_F_BADSTATE) && nh->nh_power) {
 			if ((w -= nh->nh_power) <= 0) {
 				nh->nh_power--;
 				fi->fib_power--;
-				res->nh_sel = nhsel;
 				spin_unlock_bh(&fib_multipath_lock);
+				res->nh_sel = nhsel;
 				return;
 			}
+			alive = 1;
+		}
+	} endfor_nexthops(fi);
+	if (alive) {
+		fi->fib_power = 0;
+		goto repeat;
+	}
+
+last_resort:
+
+	for_nexthops(fi) {
+		if (!(nh->nh_flags&RTNH_F_DEAD)) {
+			if (flp->oif && flp->oif != nh->nh_oif)
+				continue;
+			if (flp->fl4_gw && flp->fl4_gw != nh->nh_gw &&
+			    nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK)
+				continue;
+			spin_unlock_bh(&fib_multipath_lock);
+			res->nh_sel = nhsel;
+			return;
 		}
 	} endfor_nexthops(fi);
 
 	/* Race condition: route has just become dead. */
-	res->nh_sel = 0;
 	spin_unlock_bh(&fib_multipath_lock);
 }
 #endif
diff -urp v2.6.25/linux/net/ipv4/netfilter/ipt_MASQUERADE.c linux/net/ipv4/netfilter/ipt_MASQUERADE.c
--- v2.6.25/linux/net/ipv4/netfilter/ipt_MASQUERADE.c	2008-04-17 09:58:09.000000000 +0300
+++ linux/net/ipv4/netfilter/ipt_MASQUERADE.c	2008-04-19 18:30:17.000000000 +0300
@@ -59,7 +59,7 @@ masquerade_tg(struct sk_buff *skb, const
 	enum ip_conntrack_info ctinfo;
 	struct nf_nat_range newrange;
 	const struct nf_nat_multi_range_compat *mr;
-	const struct rtable *rt;
+	struct rtable *rt;
 	__be32 newsrc;
 
 	NF_CT_ASSERT(hooknum == NF_INET_POST_ROUTING);
@@ -77,13 +77,32 @@ masquerade_tg(struct sk_buff *skb, const
 		return NF_ACCEPT;
 
 	mr = targinfo;
-	rt = skb->rtable;
-	newsrc = inet_select_addr(out, rt->rt_gateway, RT_SCOPE_UNIVERSE);
-	if (!newsrc) {
-		printk("MASQUERADE: %s ate my IP address\n", out->name);
-		return NF_DROP;
+
+	{
+		struct flowi fl = { .nl_u = { .ip4_u =
+					      { .daddr = ip_hdr(skb)->daddr,
+						.tos = (RT_TOS(ip_hdr(skb)->tos) |
+							RTO_CONN),
+						.gw = ((struct rtable *) skb->dst)->rt_gateway,
+					      } },
+				    .mark = skb->mark,
+				    .oif = out->ifindex };
+ #ifdef CONFIG_NET_NS
+		if (ip_route_output_key(out->nd_net, &rt, &fl) != 0) {
+ #else
+		if (ip_route_output_key(&init_net, &rt, &fl) != 0) {
+ #endif
+			/* Funky routing can do this. */
+			if (net_ratelimit())
+				printk("MASQUERADE:"
+				       " No route: Rusty's brain broke!\n");
+			return NF_DROP;
+		}
 	}
 
+	newsrc = rt->rt_src;
+	ip_rt_put(rt);
+
 	write_lock_bh(&masq_lock);
 	nat->masq_index = out->ifindex;
 	write_unlock_bh(&masq_lock);
diff -urp v2.6.25/linux/net/ipv4/netfilter/nf_nat_core.c linux/net/ipv4/netfilter/nf_nat_core.c
--- v2.6.25/linux/net/ipv4/netfilter/nf_nat_core.c	2008-04-17 09:58:09.000000000 +0300
+++ linux/net/ipv4/netfilter/nf_nat_core.c	2008-04-19 18:30:17.000000000 +0300
@@ -624,6 +624,52 @@ static struct nf_ct_ext_type nat_extend 
 	.flags		= NF_CT_EXT_F_PREALLOC,
 };
 
+unsigned int
+ip_nat_route_input(unsigned int hooknum,
+		struct sk_buff *skb,
+		const struct net_device *in,
+		const struct net_device *out,
+		int (*okfn)(struct sk_buff *))
+{
+	struct iphdr *iph;
+	struct nf_conn *conn;
+	enum ip_conntrack_info ctinfo;
+	enum ip_conntrack_dir dir;
+	unsigned long statusbit;
+	__be32 saddr;
+
+	if (!(conn = nf_ct_get(skb, &ctinfo)))
+		return NF_ACCEPT;
+
+	if (!(conn->status & IPS_NAT_DONE_MASK))
+		return NF_ACCEPT;
+	dir = CTINFO2DIR(ctinfo);
+	statusbit = IPS_SRC_NAT;
+	if (dir == IP_CT_DIR_REPLY)
+		statusbit ^= IPS_NAT_MASK;
+	if (!(conn->status & statusbit))
+		return NF_ACCEPT;
+
+	if (skb->dst)
+		return NF_ACCEPT;
+
+	if (skb->len < sizeof(struct iphdr))
+		return NF_ACCEPT;
+
+	/* use daddr in other direction as masquerade address (lsrc) */
+	iph = ip_hdr(skb);
+	saddr = conn->tuplehash[!dir].tuple.dst.u3.ip;
+	if (saddr == iph->saddr)
+		return NF_ACCEPT;
+
+	if (ip_route_input_lookup(skb, iph->daddr, iph->saddr, iph->tos,
+	    skb->dev, saddr))
+		return NF_DROP;
+
+	return NF_ACCEPT;
+}
+EXPORT_SYMBOL_GPL(ip_nat_route_input);
+
 static int __init nf_nat_init(void)
 {
 	size_t i;
diff -urp v2.6.25/linux/net/ipv4/netfilter/nf_nat_standalone.c linux/net/ipv4/netfilter/nf_nat_standalone.c
--- v2.6.25/linux/net/ipv4/netfilter/nf_nat_standalone.c	2008-04-17 09:58:09.000000000 +0300
+++ linux/net/ipv4/netfilter/nf_nat_standalone.c	2008-04-19 18:30:17.000000000 +0300
@@ -282,6 +282,14 @@ static struct nf_hook_ops nf_nat_ops[] _
 		.hooknum	= NF_INET_PRE_ROUTING,
 		.priority	= NF_IP_PRI_NAT_DST,
 	},
+	/* Before routing, route before mangling */
+	{
+		.hook		= ip_nat_route_input,
+		.owner		= THIS_MODULE,
+		.pf		= PF_INET,
+		.hooknum	= NF_INET_PRE_ROUTING,
+		.priority	= NF_IP_PRI_LAST-1,
+	},
 	/* After packet filtering, change source */
 	{
 		.hook		= nf_nat_out,
diff -urp v2.6.25/linux/net/ipv4/route.c linux/net/ipv4/route.c
--- v2.6.25/linux/net/ipv4/route.c	2008-04-17 09:58:09.000000000 +0300
+++ linux/net/ipv4/route.c	2008-04-19 18:30:17.000000000 +0300
@@ -1207,6 +1207,7 @@ void ip_rt_redirect(__be32 old_gw, __be3
 
 				/* Gateway is different ... */
 				rt->rt_gateway		= new_gw;
+				if (rt->fl.fl4_gw) rt->fl.fl4_gw = new_gw;
 
 				/* Redirect received -> path was valid */
 				dst_confirm(&rth->u.dst);
@@ -1647,6 +1648,7 @@ static int ip_route_input_mc(struct sk_b
 	rth->fl.fl4_tos	= tos;
 	rth->fl.mark    = skb->mark;
 	rth->fl.fl4_src	= saddr;
+	rth->fl.fl4_lsrc = 0;
 	rth->rt_src	= saddr;
 #ifdef CONFIG_NET_CLS_ROUTE
 	rth->u.dst.tclassid = itag;
@@ -1657,6 +1659,7 @@ static int ip_route_input_mc(struct sk_b
 	dev_hold(rth->u.dst.dev);
 	rth->idev	= in_dev_get(rth->u.dst.dev);
 	rth->fl.oif	= 0;
+	rth->fl.fl4_gw	= 0;
 	rth->rt_gateway	= daddr;
 	rth->rt_spec_dst= spec_dst;
 	rth->rt_genid	= atomic_read(&rt_genid);
@@ -1762,7 +1762,7 @@ static int __mkroute_input(struct sk_buf
 			   struct fib_result *res,
 			   struct in_device *in_dev,
 			   __be32 daddr, __be32 saddr, u32 tos,
-			   struct rtable **result)
+			   __be32 lsrc, struct rtable **result)
 {
 
 	struct rtable *rth;
@@ -1796,6 +1796,7 @@ static int __mkroute_input(struct sk_buf
 		flags |= RTCF_DIRECTSRC;
 
 	if (out_dev == in_dev && err &&
+	    !lsrc &&
 	    (IN_DEV_SHARED_MEDIA(out_dev) ||
 	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
 		flags |= RTCF_DOREDIRECT;
@@ -1789,6 +1793,7 @@ static inline int __mkroute_input(struct
 	rth->fl.mark    = skb->mark;
 	rth->fl.fl4_src	= saddr;
 	rth->rt_src	= saddr;
+	rth->fl.fl4_lsrc	= lsrc;
 	rth->rt_gateway	= daddr;
 	rth->rt_iif 	=
 		rth->fl.iif	= in_dev->dev->ifindex;
@@ -1796,6 +1801,7 @@ static inline int __mkroute_input(struct
 	dev_hold(rth->u.dst.dev);
 	rth->idev	= in_dev_get(rth->u.dst.dev);
 	rth->fl.oif 	= 0;
+	rth->fl.fl4_gw	= 0;
 	rth->rt_spec_dst= spec_dst;
 
 	rth->u.dst.input = ip_forward;
@@ -1858,21 +1859,23 @@ static int __mkroute_input(struct sk_buf
 
 static int ip_mkroute_input(struct sk_buff *skb,
 			    struct fib_result *res,
+			    struct net *net,
 			    const struct flowi *fl,
 			    struct in_device *in_dev,
-			    __be32 daddr, __be32 saddr, u32 tos)
+			    __be32 daddr, __be32 saddr, u32 tos, __be32 lsrc)
 {
 	struct rtable* rth = NULL;
 	int err;
 	unsigned hash;
 
+	fib_select_default(net, fl, res);
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
-	if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
+	if (res->fi && res->fi->fib_nhs > 1)
 		fib_select_multipath(fl, res);
 #endif
 
 	/* create a routing cache entry */
-	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
+	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, lsrc, &rth);
 	if (err)
 		return err;
 
@@ -1850,18 +1858,24 @@ static inline int ip_mkroute_input(struc
  */
 
 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
-			       u8 tos, struct net_device *dev)
+			       u8 tos, struct net_device *dev, __be32 lsrc)
 {
 	struct fib_result res;
 	struct in_device *in_dev = in_dev_get(dev);
+	/*
+	 * With a non-zero lsrc the FIB lookup uses lsrc as source and the
+	 * loopback device of dev's own network namespace as input interface.
+	 */
 	struct flowi fl = { .nl_u = { .ip4_u =
 				      { .daddr = daddr,
-					.saddr = saddr,
+					.saddr = lsrc? : saddr,
 					.tos = tos,
 					.scope = RT_SCOPE_UNIVERSE,
 				      } },
 			    .mark = skb->mark,
-			    .iif = dev->ifindex };
+			    .iif = lsrc ?
+				dev_net(dev)->loopback_dev->ifindex :
+				dev->ifindex };
 	unsigned	flags = 0;
 	u32		itag = 0;
 	struct rtable * rth;
@@ -1897,6 +1906,12 @@ static int ip_route_input_slow(struct sk
 	    ipv4_is_loopback(daddr))
 		goto martian_destination;
 
+	if (lsrc) {
+		if (ipv4_is_multicast(lsrc) || ipv4_is_lbcast(lsrc) ||
+		    ipv4_is_zeronet(lsrc) || ipv4_is_loopback(lsrc))
+			goto e_inval;
+	}
+
 	/*
 	 *	Now we are ready to route packet.
 	 */
@@ -1906,6 +1921,8 @@ static int ip_route_input_slow(struct sk
 		goto no_route;
 	}
 	free_res = 1;
+	fl.iif = dev->ifindex;
+	fl.fl4_src = saddr;
 
 	RT_CACHE_STAT_INC(in_slow_tot);
 
@@ -1930,7 +1947,7 @@ static int ip_route_input_slow(struct sk
 	if (res.type != RTN_UNICAST)
 		goto martian_destination;
 
-	err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
+	err = ip_mkroute_input(skb, &res, net, &fl, in_dev, daddr, saddr, tos, lsrc);
 done:
 	in_dev_put(in_dev);
 	if (free_res)
@@ -1940,6 +1957,8 @@ out:	return err;
 brd_input:
 	if (skb->protocol != htons(ETH_P_IP))
 		goto e_inval;
+	if (lsrc)
+		goto e_inval;
 
 	if (ipv4_is_zeronet(saddr))
 		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
@@ -1981,6 +2000,7 @@ local_input:
 	rth->u.dst.dev	= net->loopback_dev;
 	dev_hold(rth->u.dst.dev);
 	rth->idev	= in_dev_get(rth->u.dst.dev);
+	rth->fl.fl4_gw	= 0;
 	rth->rt_gateway	= daddr;
 	rth->rt_spec_dst= spec_dst;
 	rth->u.dst.input= ip_local_deliver;
@@ -2032,8 +2052,9 @@ martian_source:
 	goto e_inval;
 }
 
-int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
-		   u8 tos, struct net_device *dev)
+static inline int
+ip_route_input_cached(struct sk_buff *skb, __be32 daddr, __be32 saddr,
+		   u8 tos, struct net_device *dev, __be32 lsrc)
 {
 	struct rtable * rth;
 	unsigned	hash;
@@ -2105,6 +2108,7 @@ ip_route_input_cached(struct sk_buff *sk
 		if (((rth->fl.fl4_dst ^ daddr) |
 		     (rth->fl.fl4_src ^ saddr) |
 		     (rth->fl.iif ^ iif) |
+		     (rth->fl.fl4_lsrc ^ lsrc) |
 		     rth->fl.oif |
 		     (rth->fl.fl4_tos ^ tos)) == 0 &&
 		    rth->fl.mark == skb->mark &&
@@ -2097,7 +2119,30 @@ int ip_route_input(struct sk_buff *skb, 
 		rcu_read_unlock();
 		return -EINVAL;
 	}
-	return ip_route_input_slow(skb, daddr, saddr, tos, dev);
+	return ip_route_input_slow(skb, daddr, saddr, tos, dev, lsrc);
+}
+
+/*
+ * Standard input route lookup: forwards to ip_route_input_cached() with
+ * lsrc == 0, i.e. no alternate source address for the lookup.
+ */
+int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
+		   u8 tos, struct net_device *dev)
+{
+	return ip_route_input_cached(skb, daddr, saddr, tos, dev, 0);
+}
+
+/*
+ * Input route lookup with an alternate source address for the FIB lookup.
+ * A non-zero @lsrc is used instead of @saddr (and a loopback ifindex
+ * instead of @dev's) when the slow path consults the FIB; the cache entry
+ * still records @saddr and is additionally matched on @lsrc.
+ * Returns the result of ip_route_input_cached() unchanged.
+ */
+int ip_route_input_lookup(struct sk_buff *skb, __be32 daddr, __be32 saddr,
+			  u8 tos, struct net_device *dev, __be32 lsrc)
+{
+	return ip_route_input_cached(skb, daddr, saddr, tos, dev, lsrc);
 }
 
 static inline int __mkroute_output(struct rtable **result,
@@ -2169,6 +2203,7 @@ static inline int __mkroute_output(struc
 	rth->fl.fl4_tos	= tos;
 	rth->fl.fl4_src	= oldflp->fl4_src;
 	rth->fl.oif	= oldflp->oif;
+	rth->fl.fl4_gw	= oldflp->fl4_gw;
 	rth->fl.mark    = oldflp->mark;
 	rth->rt_dst	= fl->fl4_dst;
 	rth->rt_src	= fl->fl4_src;
@@ -2249,6 +2284,7 @@ static int ip_route_output_slow(struct n
 	struct flowi fl = { .nl_u = { .ip4_u =
 				      { .daddr = oldflp->fl4_dst,
 					.saddr = oldflp->fl4_src,
+					.gw = oldflp->fl4_gw,
 					.tos = tos & IPTOS_RT_MASK,
 					.scope = ((tos & RTO_ONLINK) ?
 						  RT_SCOPE_LINK :
@@ -2354,6 +2390,7 @@ static int ip_route_output_slow(struct n
 		dev_out = net->loopback_dev;
 		dev_hold(dev_out);
 		fl.oif = net->loopback_dev->ifindex;
+		fl.fl4_gw = 0;
 		res.type = RTN_LOCAL;
 		flags |= RTCF_LOCAL;
 		goto make_route;
@@ -2361,7 +2398,7 @@ static int ip_route_output_slow(struct n
 
 	if (fib_lookup(net, &fl, &res)) {
 		res.fi = NULL;
-		if (oldflp->oif) {
+		if (oldflp->oif && dev_out->flags & IFF_UP) {
 			/* Apparently, routing tables are wrong. Assume,
 			   that the destination is on link.
 
@@ -2401,6 +2438,7 @@ static int ip_route_output_slow(struct n
 		dev_out = net->loopback_dev;
 		dev_hold(dev_out);
 		fl.oif = dev_out->ifindex;
+		fl.fl4_gw = 0;
 		if (res.fi)
 			fib_info_put(res.fi);
 		res.fi = NULL;
@@ -2408,13 +2446,12 @@ static int ip_route_output_slow(struct n
 		goto make_route;
 	}
 
+	if (res.type == RTN_UNICAST)
+		fib_select_default(net, &fl, &res);
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
-	if (res.fi->fib_nhs > 1 && fl.oif == 0)
+	if (res.fi->fib_nhs > 1)
 		fib_select_multipath(&fl, &res);
-	else
 #endif
-	if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
-		fib_select_default(net, &fl, &res);
 
 	if (!fl.fl4_src)
 		fl.fl4_src = FIB_RES_PREFSRC(res);
@@ -2452,6 +2489,7 @@ int __ip_route_output_key(struct net *ne
 		    rth->fl.fl4_src == flp->fl4_src &&
 		    rth->fl.iif == 0 &&
 		    rth->fl.oif == flp->oif &&
+		    rth->fl.fl4_gw == flp->fl4_gw &&
 		    rth->fl.mark == flp->mark &&
 		    !((rth->fl.fl4_tos ^ flp->fl4_tos) &
 			    (IPTOS_RT_MASK | RTO_ONLINK)) &&
@@ -3054,3 +3092,4 @@ int __init ip_rt_init(void)
 EXPORT_SYMBOL(__ip_select_ident);
 EXPORT_SYMBOL(ip_route_input);
 EXPORT_SYMBOL(ip_route_output_key);
+EXPORT_SYMBOL(ip_route_input_lookup);

[Index of Archives]     [Netfilter Users]     [LARTC]     [Bugtraq]     [Yosemite Forum]

  Powered by Linux