This is a note to let you know that I've just added the patch titled inetpeer: get rid of ip_id_count to the 3.10-stable tree which can be found at: http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=summary The filename of the patch is: inetpeer-get-rid-of-ip_id_count.patch and it can be found in the queue-3.10 subdirectory. If you, or anyone else, feels it should not be added to the stable tree, please let <stable@xxxxxxxxxxxxxxx> know about it. >From foo@baz Fri Aug 8 08:48:58 PDT 2014 From: Eric Dumazet <edumazet@xxxxxxxxxx> Date: Mon, 2 Jun 2014 05:26:03 -0700 Subject: inetpeer: get rid of ip_id_count From: Eric Dumazet <edumazet@xxxxxxxxxx> [ Upstream commit 73f156a6e8c1074ac6327e0abd1169e95eb66463 ] Ideally, we would need to generate IP ID using a per destination IP generator. linux kernels used inet_peer cache for this purpose, but this had a huge cost on servers disabling MTU discovery. 1) each inet_peer struct consumes 192 bytes 2) inetpeer cache uses a binary tree of inet_peer structs, with a nominal size of ~66000 elements under load. 3) lookups in this tree are hitting a lot of cache lines, as tree depth is about 20. 4) If server deals with many tcp flows, we have a high probability of not finding the inet_peer, allocating a fresh one, inserting it in the tree with same initial ip_id_count, (cf secure_ip_id()) 5) We garbage collect inet_peer aggressively. IP ID generation do not have to be 'perfect' Goal is trying to avoid duplicates in a short period of time, so that reassembly units have a chance to complete reassembly of fragments belonging to one message before receiving other fragments with a recycled ID. We simply use an array of generators, and a Jenkin hash using the dst IP as a key. ipv6_select_ident() is put back into net/ipv6/ip6_output.c where it belongs (it is only used from this file) secure_ip_id() and secure_ipv6_id() no longer are needed. Rename ip_select_ident_more() to ip_select_ident_segs() to avoid unnecessary decrement/increment of the number of segments. Signed-off-by: Eric Dumazet <edumazet@xxxxxxxxxx> Signed-off-by: David S. Miller <davem@xxxxxxxxxxxxx> Signed-off-by: Greg Kroah-Hartman <gregkh@xxxxxxxxxxxxxxxxxxx> --- drivers/net/ppp/pptp.c | 2 - drivers/net/vxlan.c | 2 - include/net/inetpeer.h | 16 ++----------- include/net/ip.h | 40 +++++++++++++++++++--------------- include/net/ipv6.h | 11 +++++---- include/net/secure_seq.h | 2 - net/core/secure_seq.c | 25 --------------------- net/ipv4/igmp.c | 4 +-- net/ipv4/inetpeer.c | 18 --------------- net/ipv4/ip_output.c | 7 ++--- net/ipv4/ip_tunnel.c | 2 - net/ipv4/ipmr.c | 2 - net/ipv4/raw.c | 2 - net/ipv4/route.c | 47 ++++++++++++++++------------------------ net/ipv4/xfrm4_mode_tunnel.c | 2 - net/ipv6/ip6_output.c | 15 ++++++++++++ net/ipv6/output_core.c | 23 ------------------- net/ipv6/sit.c | 2 - net/netfilter/ipvs/ip_vs_xmit.c | 2 - 19 files changed, 80 insertions(+), 144 deletions(-) --- a/drivers/net/ppp/pptp.c +++ b/drivers/net/ppp/pptp.c @@ -281,7 +281,7 @@ static int pptp_xmit(struct ppp_channel nf_reset(skb); skb->ip_summed = CHECKSUM_NONE; - ip_select_ident(skb, &rt->dst, NULL); + ip_select_ident(skb, NULL); ip_send_check(iph); ip_local_out(skb); --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -1093,7 +1093,7 @@ static netdev_tx_t vxlan_xmit_one(struct iph->daddr = dst; iph->saddr = fl4.saddr; iph->ttl = ttl ? : ip4_dst_hoplimit(&rt->dst); - __ip_select_ident(iph, &rt->dst, (skb_shinfo(skb)->gso_segs ?: 1) - 1); + __ip_select_ident(iph, skb_shinfo(skb)->gso_segs ?: 1); nf_reset(skb); --- a/include/net/inetpeer.h +++ b/include/net/inetpeer.h @@ -41,14 +41,13 @@ struct inet_peer { struct rcu_head gc_rcu; }; /* - * Once inet_peer is queued for deletion (refcnt == -1), following fields - * are not available: rid, ip_id_count + * Once inet_peer is queued for deletion (refcnt == -1), following field + * is not available: rid * We can share memory with rcu_head to help keep inet_peer small. */ union { struct { atomic_t rid; /* Frag reception counter */ - atomic_t ip_id_count; /* IP ID for the next packet */ }; struct rcu_head rcu; struct inet_peer *gc_next; @@ -166,7 +165,7 @@ extern void inetpeer_invalidate_tree(str extern void inetpeer_invalidate_family(int family); /* - * temporary check to make sure we dont access rid, ip_id_count, tcp_ts, + * temporary check to make sure we dont access rid, tcp_ts, * tcp_ts_stamp if no refcount is taken on inet_peer */ static inline void inet_peer_refcheck(const struct inet_peer *p) @@ -174,13 +173,4 @@ static inline void inet_peer_refcheck(co WARN_ON_ONCE(atomic_read(&p->refcnt) <= 0); } - -/* can be called with or without local BH being disabled */ -static inline int inet_getid(struct inet_peer *p, int more) -{ - more++; - inet_peer_refcheck(p); - return atomic_add_return(more, &p->ip_id_count) - more; -} - #endif /* _NET_INETPEER_H */ --- a/include/net/ip.h +++ b/include/net/ip.h @@ -252,9 +252,19 @@ int ip_dont_fragment(struct sock *sk, st !(dst_metric_locked(dst, RTAX_MTU))); } -extern void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more); +#define IP_IDENTS_SZ 2048u +extern atomic_t *ip_idents; -static inline void ip_select_ident(struct sk_buff *skb, struct dst_entry *dst, struct sock *sk) +static inline u32 ip_idents_reserve(u32 hash, int segs) +{ + atomic_t *id_ptr = ip_idents + hash % IP_IDENTS_SZ; + + return atomic_add_return(segs, id_ptr) - segs; +} + +void __ip_select_ident(struct iphdr *iph, int segs); + +static inline void ip_select_ident_segs(struct sk_buff *skb, struct sock *sk, int segs) { struct iphdr *iph = ip_hdr(skb); @@ -264,24 +274,20 @@ static inline void ip_select_ident(struc * does not change, they drop every other packet in * a TCP stream using header compression. */ - iph->id = (sk && inet_sk(sk)->inet_daddr) ? - htons(inet_sk(sk)->inet_id++) : 0; - } else - __ip_select_ident(iph, dst, 0); -} - -static inline void ip_select_ident_more(struct sk_buff *skb, struct dst_entry *dst, struct sock *sk, int more) -{ - struct iphdr *iph = ip_hdr(skb); - - if ((iph->frag_off & htons(IP_DF)) && !skb->local_df) { if (sk && inet_sk(sk)->inet_daddr) { iph->id = htons(inet_sk(sk)->inet_id); - inet_sk(sk)->inet_id += 1 + more; - } else + inet_sk(sk)->inet_id += segs; + } else { iph->id = 0; - } else - __ip_select_ident(iph, dst, more); + } + } else { + __ip_select_ident(iph, segs); + } +} + +static inline void ip_select_ident(struct sk_buff *skb, struct sock *sk) +{ + ip_select_ident_segs(skb, sk, 1); } /* --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -530,14 +530,19 @@ static inline u32 ipv6_addr_hash(const s } /* more secured version of ipv6_addr_hash() */ -static inline u32 ipv6_addr_jhash(const struct in6_addr *a) +static inline u32 __ipv6_addr_jhash(const struct in6_addr *a, const u32 initval) { u32 v = (__force u32)a->s6_addr32[0] ^ (__force u32)a->s6_addr32[1]; return jhash_3words(v, (__force u32)a->s6_addr32[2], (__force u32)a->s6_addr32[3], - ipv6_hash_secret); + initval); +} + +static inline u32 ipv6_addr_jhash(const struct in6_addr *a) +{ + return __ipv6_addr_jhash(a, ipv6_hash_secret); } static inline bool ipv6_addr_loopback(const struct in6_addr *a) @@ -649,8 +654,6 @@ static inline int ipv6_addr_diff(const s return __ipv6_addr_diff(a1, a2, sizeof(struct in6_addr)); } -extern void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt); - /* * Header manipulation */ --- a/include/net/secure_seq.h +++ b/include/net/secure_seq.h @@ -3,8 +3,6 @@ #include <linux/types.h> -extern __u32 secure_ip_id(__be32 daddr); -extern __u32 secure_ipv6_id(const __be32 daddr[4]); extern u32 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport); extern u32 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr, __be16 dport); --- a/net/core/secure_seq.c +++ b/net/core/secure_seq.c @@ -95,31 +95,6 @@ EXPORT_SYMBOL(secure_ipv6_port_ephemeral #endif #ifdef CONFIG_INET -__u32 secure_ip_id(__be32 daddr) -{ - u32 hash[MD5_DIGEST_WORDS]; - - net_secret_init(); - hash[0] = (__force __u32) daddr; - hash[1] = net_secret[13]; - hash[2] = net_secret[14]; - hash[3] = net_secret[15]; - - md5_transform(hash, net_secret); - - return hash[0]; -} - -__u32 secure_ipv6_id(const __be32 daddr[4]) -{ - __u32 hash[4]; - - net_secret_init(); - memcpy(hash, daddr, 16); - md5_transform(hash, net_secret); - - return hash[0]; -} __u32 secure_tcp_sequence_number(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport) --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -343,7 +343,7 @@ static struct sk_buff *igmpv3_newpack(st pip->saddr = fl4.saddr; pip->protocol = IPPROTO_IGMP; pip->tot_len = 0; /* filled in later */ - ip_select_ident(skb, &rt->dst, NULL); + ip_select_ident(skb, NULL); ((u8 *)&pip[1])[0] = IPOPT_RA; ((u8 *)&pip[1])[1] = 4; ((u8 *)&pip[1])[2] = 0; @@ -687,7 +687,7 @@ static int igmp_send_report(struct in_de iph->daddr = dst; iph->saddr = fl4.saddr; iph->protocol = IPPROTO_IGMP; - ip_select_ident(skb, &rt->dst, NULL); + ip_select_ident(skb, NULL); ((u8 *)&iph[1])[0] = IPOPT_RA; ((u8 *)&iph[1])[1] = 4; ((u8 *)&iph[1])[2] = 0; --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -26,20 +26,7 @@ * Theory of operations. * We keep one entry for each peer IP address. The nodes contains long-living * information about the peer which doesn't depend on routes. - * At this moment this information consists only of ID field for the next - * outgoing IP packet. This field is incremented with each packet as encoded - * in inet_getid() function (include/net/inetpeer.h). - * At the moment of writing this notes identifier of IP packets is generated - * to be unpredictable using this code only for packets subjected - * (actually or potentially) to defragmentation. I.e. DF packets less than - * PMTU in size when local fragmentation is disabled use a constant ID and do - * not use this code (see ip_select_ident() in include/net/ip.h). * - * Route cache entries hold references to our nodes. - * New cache entries get references via lookup by destination IP address in - * the avl tree. The reference is grabbed only when it's needed i.e. only - * when we try to output IP packet which needs an unpredictable ID (see - * __ip_select_ident() in net/ipv4/route.c). * Nodes are removed only when reference counter goes to 0. * When it's happened the node may be removed when a sufficient amount of * time has been passed since its last use. The less-recently-used entry can @@ -62,7 +49,6 @@ * refcnt: atomically against modifications on other CPU; * usually under some other lock to prevent node disappearing * daddr: unchangeable - * ip_id_count: atomic value (no lock needed) */ static struct kmem_cache *peer_cachep __read_mostly; @@ -504,10 +490,6 @@ relookup: p->daddr = *daddr; atomic_set(&p->refcnt, 1); atomic_set(&p->rid, 0); - atomic_set(&p->ip_id_count, - (daddr->family == AF_INET) ? - secure_ip_id(daddr->addr.a4) : - secure_ipv6_id(daddr->addr.a6)); p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW; p->rate_tokens = 0; /* 60*HZ is arbitrary, but chosen enough high so that the first --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -148,7 +148,7 @@ int ip_build_and_send_pkt(struct sk_buff iph->daddr = (opt && opt->opt.srr ? opt->opt.faddr : daddr); iph->saddr = saddr; iph->protocol = sk->sk_protocol; - ip_select_ident(skb, &rt->dst, sk); + ip_select_ident(skb, sk); if (opt && opt->opt.optlen) { iph->ihl += opt->opt.optlen>>2; @@ -394,8 +394,7 @@ packet_routed: ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0); } - ip_select_ident_more(skb, &rt->dst, sk, - (skb_shinfo(skb)->gso_segs ?: 1) - 1); + ip_select_ident_segs(skb, sk, skb_shinfo(skb)->gso_segs ?: 1); skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; @@ -1332,7 +1331,7 @@ struct sk_buff *__ip_make_skb(struct soc iph->ttl = ttl; iph->protocol = sk->sk_protocol; ip_copy_addrs(iph, fl4); - ip_select_ident(skb, &rt->dst, sk); + ip_select_ident(skb, sk); if (opt) { iph->ihl += opt->optlen>>2; --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -691,7 +691,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, iph->daddr = fl4.daddr; iph->saddr = fl4.saddr; iph->ttl = ttl; - __ip_select_ident(iph, &rt->dst, (skb_shinfo(skb)->gso_segs ?: 1) - 1); + __ip_select_ident(iph, skb_shinfo(skb)->gso_segs ?: 1); iptunnel_xmit(skb, dev); return; --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1661,7 +1661,7 @@ static void ip_encap(struct sk_buff *skb iph->protocol = IPPROTO_IPIP; iph->ihl = 5; iph->tot_len = htons(skb->len); - ip_select_ident(skb, skb_dst(skb), NULL); + ip_select_ident(skb, NULL); ip_send_check(iph); memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -387,7 +387,7 @@ static int raw_send_hdrinc(struct sock * iph->check = 0; iph->tot_len = htons(length); if (!iph->id) - ip_select_ident(skb, &rt->dst, NULL); + ip_select_ident(skb, NULL); iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); } --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -89,6 +89,7 @@ #include <linux/rcupdate.h> #include <linux/times.h> #include <linux/slab.h> +#include <linux/jhash.h> #include <net/dst.h> #include <net/net_namespace.h> #include <net/protocol.h> @@ -464,39 +465,23 @@ static struct neighbour *ipv4_neigh_look return neigh_create(&arp_tbl, pkey, dev); } -/* - * Peer allocation may fail only in serious out-of-memory conditions. However - * we still can generate some output. - * Random ID selection looks a bit dangerous because we have no chances to - * select ID being unique in a reasonable period of time. - * But broken packet identifier may be better than no packet at all. - */ -static void ip_select_fb_ident(struct iphdr *iph) -{ - static DEFINE_SPINLOCK(ip_fb_id_lock); - static u32 ip_fallback_id; - u32 salt; +atomic_t *ip_idents __read_mostly; +EXPORT_SYMBOL(ip_idents); - spin_lock_bh(&ip_fb_id_lock); - salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr); - iph->id = htons(salt & 0xFFFF); - ip_fallback_id = salt; - spin_unlock_bh(&ip_fb_id_lock); -} - -void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more) +void __ip_select_ident(struct iphdr *iph, int segs) { - struct net *net = dev_net(dst->dev); - struct inet_peer *peer; + static u32 ip_idents_hashrnd __read_mostly; + static bool hashrnd_initialized = false; + u32 hash, id; - peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1); - if (peer) { - iph->id = htons(inet_getid(peer, more)); - inet_putpeer(peer); - return; + if (unlikely(!hashrnd_initialized)) { + hashrnd_initialized = true; + get_random_bytes(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd)); } - ip_select_fb_ident(iph); + hash = jhash_1word((__force u32)iph->daddr, ip_idents_hashrnd); + id = ip_idents_reserve(hash, segs); + iph->id = htons(id); } EXPORT_SYMBOL(__ip_select_ident); @@ -2656,6 +2641,12 @@ int __init ip_rt_init(void) { int rc = 0; + ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL); + if (!ip_idents) + panic("IP: failed to allocate ip_idents\n"); + + prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents)); + #ifdef CONFIG_IP_ROUTE_CLASSID ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct)); if (!ip_rt_acct) --- a/net/ipv4/xfrm4_mode_tunnel.c +++ b/net/ipv4/xfrm4_mode_tunnel.c @@ -117,12 +117,12 @@ static int xfrm4_mode_tunnel_output(stru top_iph->frag_off = (flags & XFRM_STATE_NOPMTUDISC) ? 0 : (XFRM_MODE_SKB_CB(skb)->frag_off & htons(IP_DF)); - ip_select_ident(skb, dst->child, NULL); top_iph->ttl = ip4_dst_hoplimit(dst->child); top_iph->saddr = x->props.saddr.a4; top_iph->daddr = x->id.daddr.a4; + ip_select_ident(skb, NULL); return 0; } --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -540,6 +540,21 @@ static void ip6_copy_metadata(struct sk_ skb_copy_secmark(to, from); } +static void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt) +{ + static u32 ip6_idents_hashrnd __read_mostly; + static bool hashrnd_initialized = false; + u32 hash, id; + + if (unlikely(!hashrnd_initialized)) { + hashrnd_initialized = true; + get_random_bytes(&ip6_idents_hashrnd, sizeof(ip6_idents_hashrnd)); + } + hash = __ipv6_addr_jhash(&rt->rt6i_dst.addr, ip6_idents_hashrnd); + id = ip_idents_reserve(hash, 1); + fhdr->identification = htonl(id); +} + int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) { struct sk_buff *frag; --- a/net/ipv6/output_core.c +++ b/net/ipv6/output_core.c @@ -6,29 +6,6 @@ #include <net/ipv6.h> #include <net/ip6_fib.h> -void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt) -{ - static atomic_t ipv6_fragmentation_id; - int ident; - -#if IS_ENABLED(CONFIG_IPV6) - if (rt && !(rt->dst.flags & DST_NOPEER)) { - struct inet_peer *peer; - struct net *net; - - net = dev_net(rt->dst.dev); - peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1); - if (peer) { - fhdr->identification = htonl(inet_getid(peer, 0)); - inet_putpeer(peer); - return; - } - } -#endif - ident = atomic_inc_return(&ipv6_fragmentation_id); - fhdr->identification = htonl(ident); -} -EXPORT_SYMBOL(ipv6_select_ident); int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr) { --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -919,7 +919,7 @@ static netdev_tx_t ipip6_tunnel_xmit(str iph->ttl = iph6->hop_limit; skb->ip_summed = CHECKSUM_NONE; - ip_select_ident(skb, skb_dst(skb), NULL); + ip_select_ident(skb, NULL); iptunnel_xmit(skb, dev); return NETDEV_TX_OK; --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -883,7 +883,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, s iph->daddr = cp->daddr.ip; iph->saddr = saddr; iph->ttl = old_iph->ttl; - ip_select_ident(skb, &rt->dst, NULL); + ip_select_ident(skb, NULL); /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; Patches currently in stable-queue which might be from edumazet@xxxxxxxxxx are queue-3.10/sctp-fix-possible-seqlock-seadlock-in-sctp_packet_transmit.patch queue-3.10/net-sendmsg-fix-null-pointer-dereference.patch queue-3.10/net-correctly-set-segment-mac_len-in-skb_segment.patch queue-3.10/ip-make-ip-identifiers-less-predictable.patch queue-3.10/inetpeer-get-rid-of-ip_id_count.patch -- To unsubscribe from this list: send the line "unsubscribe stable" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html