From: Marco Stornelli <marco.stornelli@xxxxxxxxx> This patch makes it possible for an application to receive statistics information only for processes belonging to a cgroup. The mechanism is the same as the one used for the per-cpu exit data statistics: with this patch, instead of waiting on a specific cpumask, an application can wait for exit data on a specific container. This makes a simple death-notification mechanism possible: we can select the processes to watch and wait for their death. A death-notification mechanism is especially useful for embedded systems. Signed-off-by: Marco Stornelli <marco.stornelli@xxxxxxxxx> --- diff -uprN linux-2.6.29-orig/Documentation/accounting/getdelays.c linux-2.6.29/Documentation/accounting/getdelays.c --- linux-2.6.29-orig/Documentation/accounting/getdelays.c 2009-03-24 00:12:14.000000000 +0100 +++ linux-2.6.29/Documentation/accounting/getdelays.c 2009-06-02 15:47:01.000000000 +0200 @@ -77,9 +77,11 @@ static void usage(void) "[-m cpumask] [-t tgid] [-p pid]\n"); fprintf(stderr, " -d: print delayacct stats\n"); fprintf(stderr, " -i: print IO accounting (works only with -p)\n"); + fprintf(stderr, " -q: print context switch accounting\n"); fprintf(stderr, " -l: listen forever\n"); fprintf(stderr, " -v: debug on\n"); - fprintf(stderr, " -C: container path\n"); + fprintf(stderr, " -C: container path (container statistics)\n"); + fprintf(stderr, " -N: container path (death notify)\n"); } /* @@ -263,13 +265,14 @@ int main(int argc, char *argv[]) char *logfile = NULL; int loop = 0; int containerset = 0; + int containernotify = 0; char containerpath[1024]; int cfd = 0; struct msgtemplate msg; while (1) { - c = getopt(argc, argv, "qdiw:r:m:t:p:vlC:"); + c = getopt(argc, argv, "qdiw:r:m:t:p:vlC:N:"); if (c < 0) break; @@ -290,6 +293,10 @@ int main(int argc, char *argv[]) containerset = 1; strncpy(containerpath, optarg, strlen(optarg) + 1); break; + case 'N': + containernotify = 1; + strncpy(containerpath, optarg, strlen(optarg) + 1); + 
break; case 'w': logfile = strdup(optarg); printf("write to file %s\n", logfile); @@ -364,8 +371,13 @@ int main(int argc, char *argv[]) } } - if (tid && containerset) { - fprintf(stderr, "Select either -t or -C, not both\n"); + if (tid && (containerset || containernotify)) { + fprintf(stderr, "Select either -t or -C or -N\n"); + goto err; + } + + if (containerset && containernotify) { + fprintf(stderr, "Select either -C or -N, not both\n"); goto err; } @@ -392,7 +404,23 @@ int main(int argc, char *argv[]) goto err; } } - if (!maskset && !tid && !containerset) { + + if (containernotify) { + cfd = open(containerpath, O_RDONLY); + if (cfd < 0) { + perror("error opening container file"); + goto err; + } + rc = send_cmd(nl_sd, id, mypid, CGROUPSTATS_CMD_GET, + CGROUPSTATS_CMD_ATTR_REGISTER_FD, + &cfd, sizeof(__u32)); + if (rc < 0) { + perror("error sending cgroupstats command"); + goto err; + } + } + + if (!maskset && !tid && !containerset && !containernotify) { usage(); goto err; } @@ -400,6 +428,7 @@ int main(int argc, char *argv[]) do { int i; + PRINTF("Recv...\n"); rep_len = recv(nl_sd, &msg, sizeof(msg), 0); PRINTF("received %d bytes\n", rep_len); @@ -495,6 +524,14 @@ done: if (rc < 0) err(rc, "error sending deregister cpumask\n"); } + if (containernotify) { + rc = send_cmd(nl_sd, id, mypid, CGROUPSTATS_CMD_GET, + CGROUPSTATS_CMD_ATTR_DEREGISTER_FD, + &cfd, sizeof(__u32)); + printf("Sent deregister container, retval %d\n", rc); + if (rc < 0) + err(rc, "error sending deregister container\n"); + } err: close(nl_sd); if (fd) --- linux-2.6.29-orig/kernel/taskstats.c 2009-03-24 00:12:14.000000000 +0100 +++ linux-2.6.29/kernel/taskstats.c 2009-06-02 15:54:37.000000000 +0200 @@ -56,6 +56,8 @@ __read_mostly = { static struct nla_policy cgroupstats_cmd_get_policy[CGROUPSTATS_CMD_ATTR_MAX+1] __read_mostly = { [CGROUPSTATS_CMD_ATTR_FD] = { .type = NLA_U32 }, + [CGROUPSTATS_CMD_ATTR_REGISTER_FD] = { .type = NLA_U32 }, + [CGROUPSTATS_CMD_ATTR_DEREGISTER_FD] = { .type = NLA_U32 
}, }; struct listener { @@ -70,6 +72,16 @@ struct listener_list { }; static DEFINE_PER_CPU(struct listener_list, listener_array); +struct cgroup_listener { + struct list_head list; + pid_t pid; + char valid; + struct dentry *d_cgroup; + int ready_to_send; +}; + +static struct listener_list cgroup_listener_array; + enum actions { REGISTER, DEREGISTER, @@ -124,6 +136,63 @@ static int send_reply(struct sk_buff *sk } /* + * Send taskstats data in @skb to listeners registered for cgroup members exit + * data + */ +static void send_cgroup_listeners(struct sk_buff *skb, + struct listener_list *listeners) +{ + struct genlmsghdr *genlhdr = nlmsg_data(nlmsg_hdr(skb)); + struct cgroup_listener *s, *tmp; + struct sk_buff *skb_next, *skb_cur = skb; + void *reply = genlmsg_data(genlhdr); + int rc, delcount = 0; + + rc = genlmsg_end(skb, reply); + if (rc < 0) { + nlmsg_free(skb); + return; + } + + rc = 0; + down_read(&listeners->sem); + list_for_each_entry(s, &listeners->list, list) { + if (!s->ready_to_send) + continue; + skb_next = NULL; + if (!list_is_last(&s->list, &listeners->list)) { + skb_next = skb_clone(skb_cur, GFP_KERNEL); + if (!skb_next) + break; + } + rc = genlmsg_unicast(skb_cur, s->pid); + if (rc == -ECONNREFUSED) { + s->valid = 0; + delcount++; + } + s->ready_to_send = 0; + skb_cur = skb_next; + } + up_read(&listeners->sem); + + if (skb_cur) + nlmsg_free(skb_cur); + + if (!delcount) + return; + + /* Delete invalidated entries */ + down_write(&listeners->sem); + list_for_each_entry_safe(s, tmp, &listeners->list, list) { + if (!s->valid) { + list_del(&s->list); + kfree(s); + } + } + up_write(&listeners->sem); +} + +/* * Send taskstats data in @skb to listeners registered for @cpu's exit data */ static void send_cpu_listeners(struct sk_buff *skb, @@ -290,6 +359,43 @@ ret: return; } + +static int add_cgroup_del_listener(pid_t pid, struct dentry *d_cgroup, + int isadd) +{ + struct listener_list *listeners = &cgroup_listener_array; + struct cgroup_listener *s, *tmp; + 
+ if (isadd == REGISTER) { + s = kmalloc(sizeof(struct cgroup_listener), GFP_KERNEL); + if (!s) + goto cleanup; + s->pid = pid; + INIT_LIST_HEAD(&s->list); + s->valid = 1; + s->d_cgroup = d_cgroup; + s->ready_to_send = 0; + + down_write(&listeners->sem); + list_add(&s->list, &listeners->list); + up_write(&listeners->sem); + return 0; + } + + /* Deregister or cleanup */ +cleanup: + down_write(&listeners->sem); + list_for_each_entry_safe(s, tmp, &listeners->list, list) { + if (s->pid == pid) { + list_del(&s->list); + kfree(s); + break; + } + } + up_write(&listeners->sem); + return 0; +} + static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd) { struct listener_list *listeners; @@ -391,6 +497,32 @@ static int cgroupstats_user_cmd(struct s struct file *file; int fput_needed; + na = info->attrs[CGROUPSTATS_CMD_ATTR_REGISTER_FD]; + if (na) { + fd = nla_get_u32(info->attrs[CGROUPSTATS_CMD_ATTR_REGISTER_FD]); + file = fget_light(fd, &fput_needed); + if (!file) + return 0; + + rc = add_cgroup_del_listener(info->snd_pid, file->f_dentry, + REGISTER); + fput_light(file, fput_needed); + return rc; + } + + na = info->attrs[CGROUPSTATS_CMD_ATTR_DEREGISTER_FD]; + if (na) { + fd = nla_get_u32(info->attrs[CGROUPSTATS_CMD_ATTR_DEREGISTER_FD]); + file = fget_light(fd, &fput_needed); + if (!file) + return 0; + + rc = add_cgroup_del_listener(info->snd_pid, file->f_dentry, + DEREGISTER); + fput_light(file, fput_needed); + return rc; + } + na = info->attrs[CGROUPSTATS_CMD_ATTR_FD]; if (!na) return -EINVAL; @@ -517,15 +649,32 @@ ret: return sig->stats; } +int check_ready_to_send(pid_t pid, struct listener_list *cgroup_list) +{ + struct listener_list *listeners = cgroup_list; + struct cgroup_listener *s, *tmp; + int ready = 0; + + list_for_each_entry_safe(s, tmp, &listeners->list, list) { + if (cgroup_verify_pid(pid, s->d_cgroup) > 0) { + s->ready_to_send = 1; + ready = 1; + } + } + + return ready; +} + /* Send pid data out on exit */ void taskstats_exit(struct 
task_struct *tsk, int group_dead) { int rc; struct listener_list *listeners; + struct listener_list *cgroup_listeners = &cgroup_listener_array; struct taskstats *stats; struct sk_buff *rep_skb; size_t size; - int is_thread_group; + int is_thread_group, target = 0; if (!family_registered) return; @@ -545,7 +694,16 @@ void taskstats_exit(struct task_struct * } listeners = &__raw_get_cpu_var(listener_array); - if (list_empty(&listeners->list)) + if (!list_empty(&listeners->list)) + target |= CPU_TARGET; + + down_write(&cgroup_listeners->sem); + if (!list_empty(&cgroup_listeners->list)) + if (check_ready_to_send(tsk->pid, cgroup_listeners)) + target |= CGROUP_TARGET; + up_write(&cgroup_listeners->sem); + + if (!target) return; rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, size); @@ -573,7 +731,10 @@ void taskstats_exit(struct task_struct * memcpy(stats, tsk->signal->stats, sizeof(*stats)); send: - send_cpu_listeners(rep_skb, listeners); + if (target & CPU_TARGET) + send_cpu_listeners(rep_skb, listeners); + if (target & CGROUP_TARGET) + send_cgroup_listeners(rep_skb, cgroup_listeners); return; err: nlmsg_free(rep_skb); @@ -595,12 +756,15 @@ static struct genl_ops cgroupstats_ops = void __init taskstats_init_early(void) { unsigned int i; + struct listener_list *listeners = &cgroup_listener_array; taskstats_cache = KMEM_CACHE(taskstats, SLAB_PANIC); for_each_possible_cpu(i) { INIT_LIST_HEAD(&(per_cpu(listener_array, i).list)); init_rwsem(&(per_cpu(listener_array, i).sem)); } + INIT_LIST_HEAD(&listeners->list); + init_rwsem(&listeners->sem); } static int __init taskstats_init(void) --- linux-2.6.29-orig/kernel/cgroup.c 2009-03-24 00:12:14.000000000 +0100 +++ linux-2.6.29/kernel/cgroup.c 2009-06-02 15:50:57.000000000 +0200 @@ -2040,6 +2040,44 @@ static int pid_array_load(pid_t *pidarra } /** + * cgroup_verify_pid - it verifies if a pid is in a cgroup + * @dentry: A dentry entry belonging to the cgroup for which stats have + * been requested. 
+ * + * Return value can be < 0 for error, 0 not pid not found, > 0 pid found + */ +int cgroup_verify_pid(pid_t pid, struct dentry *dentry) +{ + int ret = -EINVAL; + struct cgroup *cgrp; + struct cgroup_iter it; + struct task_struct *tsk; + + /* + * Validate dentry by checking the superblock operations, + * and make sure it's a directory. + */ + if (dentry->d_sb->s_op != &cgroup_ops || + !S_ISDIR(dentry->d_inode->i_mode)) + goto err; + + ret = 0; + cgrp = dentry->d_fsdata; + + cgroup_iter_start(cgrp, &it); + while ((tsk = cgroup_iter_next(cgrp, &it))) { + if (tsk->pid == pid) { + cgroup_iter_end(cgrp, &it); + return 1; + } + } + cgroup_iter_end(cgrp, &it); + +err: + return ret; +} + +/** * cgroupstats_build - build and fill cgroupstats * @stats: cgroupstats to fill information into * @dentry: A dentry entry belonging to the cgroup for which stats have --- linux-2.6.29-orig/include/linux/cgroup.h 2009-03-24 00:12:14.000000000 +0100 +++ linux-2.6.29/include/linux/cgroup.h 2009-06-02 15:55:11.000000000 +0200 @@ -32,6 +32,7 @@ extern void cgroup_fork(struct task_stru extern void cgroup_fork_callbacks(struct task_struct *p); extern void cgroup_post_fork(struct task_struct *p); extern void cgroup_exit(struct task_struct *p, int run_callbacks); +extern int cgroup_verify_pid(pid_t pid, struct dentry *dentry); extern int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry); @@ -450,6 +451,10 @@ static inline void cgroup_exit(struct ta static inline void cgroup_lock(void) {} static inline void cgroup_unlock(void) {} +static inline int cgroup_verify_pid(pid_t pid, struct dentry *dentry) +{ + return -EINVAL; +} static inline int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) { --- linux-2.6.29-orig/include/linux/cgroupstats.h 2009-03-24 00:12:14.000000000 +0100 +++ linux-2.6.29/include/linux/cgroupstats.h 2009-06-01 11:37:46.000000000 +0200 @@ -63,6 +63,8 @@ enum { enum { CGROUPSTATS_CMD_ATTR_UNSPEC = 0, CGROUPSTATS_CMD_ATTR_FD, + 
CGROUPSTATS_CMD_ATTR_REGISTER_FD, + CGROUPSTATS_CMD_ATTR_DEREGISTER_FD, __CGROUPSTATS_CMD_ATTR_MAX, }; --- linux-2.6.29-orig/include/linux/taskstats.h 2009-03-24 00:12:14.000000000 +0100 +++ linux-2.6.29/include/linux/taskstats.h 2009-06-02 15:35:24.000000000 +0200 @@ -37,6 +37,9 @@ #define TS_COMM_LEN 32 /* should be >= TASK_COMM_LEN * in linux/sched.h */ +#define CPU_TARGET 0x1 +#define CGROUP_TARGET 0x2 + struct taskstats { /* The version number of this struct. This field is always set to -- To unsubscribe from this list: send the line "unsubscribe linux-embedded" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html