On Wed, 7 Sep 2011 01:23:16 -0300
Glauber Costa <glommer@xxxxxxxxxxxxx> wrote:

> With all the infrastructure in place, this patch implements
> per-cgroup control for tcp memory pressure handling.
>
> Signed-off-by: Glauber Costa <glommer@xxxxxxxxxxxxx>
> CC: David S. Miller <davem@xxxxxxxxxxxxx>
> CC: Hiroyouki Kamezawa <kamezawa.hiroyu@xxxxxxxxxxxxxx>
> CC: Eric W. Biederman <ebiederm@xxxxxxxxxxxx>

Hmm, then, kmem_cgroup.c is just a caller of plugins implemented by
other components ?

> ---
>  include/linux/kmem_cgroup.h |    7 ++++
>  include/net/sock.h          |   10 ++++++-
>  mm/kmem_cgroup.c            |   10 ++++++-
>  net/core/sock.c             |   18 +++++++++++
>  net/ipv4/tcp.c              |   67 +++++++++++++++++++++++++++++++++++++-----
>  5 files changed, 102 insertions(+), 10 deletions(-)
>
> diff --git a/include/linux/kmem_cgroup.h b/include/linux/kmem_cgroup.h
> index d983ba8..89ad0a1 100644
> --- a/include/linux/kmem_cgroup.h
> +++ b/include/linux/kmem_cgroup.h
> @@ -23,6 +23,13 @@
>  struct kmem_cgroup {
>  	struct cgroup_subsys_state css;
>  	struct kmem_cgroup *parent;
> +
> +#ifdef CONFIG_INET
> +	int tcp_memory_pressure;
> +	atomic_long_t tcp_memory_allocated;
> +	struct percpu_counter tcp_sockets_allocated;
> +	long tcp_prot_mem[3];
> +#endif
>  };

I think you should place 'read-mostly' values carefully.

>
> diff --git a/include/net/sock.h b/include/net/sock.h
> index ab65640..91424e3 100644
> --- a/include/net/sock.h
> +++ b/include/net/sock.h
> @@ -64,6 +64,7 @@
>  #include <net/dst.h>
>  #include <net/checksum.h>
>
> +int sockets_populate(struct cgroup_subsys *ss, struct cgroup *cgrp);
>  /*
>   * This structure really needs to be cleaned up.
>   * Most of it is for TCP, and not used by any of
> @@ -814,7 +815,14 @@ struct proto {
>  	int *(*memory_pressure)(struct kmem_cgroup *sg);
>  	/* Pointer to the per-cgroup version of the the sysctl_mem field */
>  	long *(*prot_mem)(struct kmem_cgroup *sg);
> -
> +	/*
> +	 * cgroup specific initialization function. Called once for all
> +	 * protocols that implement it, from cgroups populate function.
> +	 * This function has to setup any files the protocol want to
> +	 * appear in the kmem cgroup filesystem.
> +	 */
> +	int (*init_cgroup)(struct cgroup *cgrp,
> +			   struct cgroup_subsys *ss);
>  	int *sysctl_wmem;
>  	int *sysctl_rmem;
>  	int max_header;
> diff --git a/mm/kmem_cgroup.c b/mm/kmem_cgroup.c
> index 7950e69..5e53d66 100644
> --- a/mm/kmem_cgroup.c
> +++ b/mm/kmem_cgroup.c
> @@ -17,16 +17,24 @@
>  #include <linux/cgroup.h>
>  #include <linux/slab.h>
>  #include <linux/kmem_cgroup.h>
> +#include <net/sock.h>
>
>  static int kmem_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
>  {
> -	return 0;
> +	int ret = 0;
> +#ifdef CONFIG_NET
> +	ret = sockets_populate(ss, cgrp);
> +#endif

CONFIG_INET ?
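
I.e. shouldn't the guard in kmem_populate() match the CONFIG_INET guard
used around the new fields in struct kmem_cgroup? Untested sketch, using
only the names from this patch:

static int kmem_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
{
	int ret = 0;
#ifdef CONFIG_INET
	/* let each protocol register its per-cgroup files */
	ret = sockets_populate(ss, cgrp);
#endif
	return ret;
}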
> +	return ret;
>  }
>
>  static void
>  kmem_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
>  {
>  	struct kmem_cgroup *cg = kcg_from_cgroup(cgrp);
> +#ifdef CONFIG_INET
> +	percpu_counter_destroy(&cg->tcp_sockets_allocated);
> +#endif
>  	kfree(cg);
>  }
>
> diff --git a/net/core/sock.c b/net/core/sock.c
> index ead9c02..9d833cf 100644
> --- a/net/core/sock.c
> +++ b/net/core/sock.c
> @@ -134,6 +134,24 @@
>  #include <net/tcp.h>
>  #endif
>
> +static DEFINE_RWLOCK(proto_list_lock);
> +static LIST_HEAD(proto_list);
> +
> +int sockets_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
> +{
> +	struct proto *proto;
> +	int ret = 0;
> +
> +	read_lock(&proto_list_lock);
> +	list_for_each_entry(proto, &proto_list, node) {
> +		if (proto->init_cgroup)
> +			ret |= proto->init_cgroup(cgrp, ss);
> +	}
> +	read_unlock(&proto_list_lock);
> +
> +	return ret;
> +}

Hmm, I don't understand this part, but... ret |= ... and no 'undo' ?
If there is no 'undo', ->init_cgroup() should always succeed and no
return value is needed (see the sketch further down).

> +
>  /*
>   * Each address family might have different locking rules, so we have
>   * one slock key per address family:
> diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
> index 76f03ed..0725dc4 100644
> --- a/net/ipv4/tcp.c
> +++ b/net/ipv4/tcp.c
> @@ -289,13 +289,6 @@ int sysctl_tcp_rmem[3] __read_mostly;
>  EXPORT_SYMBOL(sysctl_tcp_rmem);
>  EXPORT_SYMBOL(sysctl_tcp_wmem);
>
> -atomic_long_t tcp_memory_allocated;	/* Current allocated memory. */
> -
> -/*
> - * Current number of TCP sockets.
> - */
> -struct percpu_counter tcp_sockets_allocated;
> -
>  /*
>   * TCP splice context
>   */
> @@ -305,13 +298,68 @@ struct tcp_splice_state {
>  	unsigned int flags;
>  };
>
> +#ifdef CONFIG_CGROUP_KMEM
>  /*
>   * Pressure flag: try to collapse.
>   * Technical note: it is used by multiple contexts non atomically.
>   * All the __sk_mem_schedule() is of this nature: accounting
>   * is strict, actions are advisory and have some latency.
>   */
> -int tcp_memory_pressure __read_mostly;
> +void tcp_enter_memory_pressure(struct sock *sk)
> +{
> +	struct kmem_cgroup *sg = sk->sk_cgrp;
> +	if (!sg->tcp_memory_pressure) {
> +		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURES);
> +		sg->tcp_memory_pressure = 1;
> +	}
> +}
> +
> +long *tcp_sysctl_mem(struct kmem_cgroup *sg)
> +{
> +	return sg->tcp_prot_mem;
> +}
> +
> +atomic_long_t *memory_allocated_tcp(struct kmem_cgroup *sg)
> +{
> +	return &(sg->tcp_memory_allocated);
> +}
> +
> +int tcp_init_cgroup(struct cgroup *cgrp, struct cgroup_subsys *ss)
> +{
> +	struct kmem_cgroup *sg = kcg_from_cgroup(cgrp);
> +	unsigned long limit;
> +	struct net *net = current->nsproxy->net_ns;
> +
> +	sg->tcp_memory_pressure = 0;
> +	atomic_long_set(&sg->tcp_memory_allocated, 0);
> +	percpu_counter_init(&sg->tcp_sockets_allocated, 0);
> +
> +	limit = nr_free_buffer_pages() / 8;
> +	limit = max(limit, 128UL);
> +
> +	sg->tcp_prot_mem[0] = net->ipv4.sysctl_tcp_mem[0];
> +	sg->tcp_prot_mem[1] = net->ipv4.sysctl_tcp_mem[1];
> +	sg->tcp_prot_mem[2] = net->ipv4.sysctl_tcp_mem[2];
> +
> +	return 0;
> +}
> +EXPORT_SYMBOL(tcp_init_cgroup);
> +
> +int *memory_pressure_tcp(struct kmem_cgroup *sg)
> +{
> +	return &sg->tcp_memory_pressure;
> +}
> +
> +struct percpu_counter *sockets_allocated_tcp(struct kmem_cgroup *sg)
> +{
> +	return &sg->tcp_sockets_allocated;
> +}
> +#else
> +
> +/* Current number of TCP sockets. */
> +struct percpu_counter tcp_sockets_allocated;
> +atomic_long_t tcp_memory_allocated;	/* Current allocated memory. */
> +int tcp_memory_pressure;
>

you dropped __read_mostly.
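
I.e. keep for the !CONFIG_CGROUP_KMEM fallback the annotation the old
global had (sketch, declarations taken straight from this hunk):

/* Current number of TCP sockets. */
struct percpu_counter tcp_sockets_allocated;
atomic_long_t tcp_memory_allocated;	/* Current allocated memory. */
int tcp_memory_pressure __read_mostly;

And on the 'undo' question about sockets_populate() above: if
->init_cgroup() is allowed to fail (e.g. tcp_init_cgroup() calls
percpu_counter_init(), which can return -ENOMEM), the loop would need
to roll back the protocols that were already set up. Rough sketch only;
the ->destroy_cgroup() callback is hypothetical and does not exist in
this patch:

int sockets_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
{
	struct proto *proto, *done;
	int ret = 0;

	read_lock(&proto_list_lock);
	list_for_each_entry(proto, &proto_list, node) {
		if (!proto->init_cgroup)
			continue;
		ret = proto->init_cgroup(cgrp, ss);
		if (ret)
			goto undo;
	}
	read_unlock(&proto_list_lock);
	return 0;

undo:
	/* undo only the protocols initialized before the failing one */
	list_for_each_entry(done, &proto_list, node) {
		if (done == proto)
			break;
		if (done->destroy_cgroup)	/* hypothetical callback */
			done->destroy_cgroup(cgrp, ss);
	}
	read_unlock(&proto_list_lock);
	return ret;
}

Otherwise, as said above, ->init_cgroup() would have to be a void
function that cannot fail.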
Thanks,
-kame