The patch titled per-task delay accounting taskstats interface: control exit data through cpumasks has been added to the -mm tree. Its filename is per-task-delay-accounting-taskstats-interface-control-exit-data-through-cpumasks.patch See http://www.zip.com.au/~akpm/linux/patches/stuff/added-to-mm.txt to find out what to do about this ------------------------------------------------------ Subject: per-task delay accounting taskstats interface: control exit data through cpumasks From: Shailabh Nagar <nagar@xxxxxxxxxxxxxx> On systems with a large number of cpus, with even a modest rate of tasks exiting per cpu, the volume of taskstats data sent on thread exit can overflow a userspace listener's buffers. One approach to avoiding overflow is to allow listeners to get data for a limited and specific set of cpus. By scaling the number of listeners and/or the cpus they monitor, userspace can handle the statistical data overload more gracefully. In this patch, each listener registers to listen to a specific set of cpus by specifying a cpumask. The interest is recorded per-cpu. When a task exits on a cpu, its taskstats data is unicast to each listener interested in that cpu. Thanks to Andrew Morton for pointing out the various scalability and general concerns of previous attempts and for suggesting this design. Signed-off-by: Shailabh Nagar <nagar@xxxxxxxxxxxxxx> Signed-off-by: Andrew Morton <akpm@xxxxxxxx> --- include/linux/taskstats.h | 4 include/linux/taskstats_kern.h | 22 ---- kernel/exit.c | 5 kernel/taskstats.c | 168 +++++++++++++++++++++++++++++-- 4 files changed, 168 insertions(+), 31 deletions(-) diff -puN include/linux/taskstats.h~per-task-delay-accounting-taskstats-interface-control-exit-data-through-cpumasks include/linux/taskstats.h --- a/include/linux/taskstats.h~per-task-delay-accounting-taskstats-interface-control-exit-data-through-cpumasks +++ a/include/linux/taskstats.h @@ -87,8 +87,6 @@ struct taskstats { }; -#define TASKSTATS_LISTEN_GROUP 0x1 - /* * Commands sent from userspace * Not versioned. New commands should only be inserted at the enum's end @@ -120,6 +118,8 @@ enum { TASKSTATS_CMD_ATTR_UNSPEC = 0, TASKSTATS_CMD_ATTR_PID, TASKSTATS_CMD_ATTR_TGID, + TASKSTATS_CMD_ATTR_REGISTER_CPUMASK, + TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK, __TASKSTATS_CMD_ATTR_MAX, }; diff -puN include/linux/taskstats_kern.h~per-task-delay-accounting-taskstats-interface-control-exit-data-through-cpumasks include/linux/taskstats_kern.h --- a/include/linux/taskstats_kern.h~per-task-delay-accounting-taskstats-interface-control-exit-data-through-cpumasks +++ a/include/linux/taskstats_kern.h @@ -20,21 +20,6 @@ enum { extern kmem_cache_t *taskstats_cache; extern struct mutex taskstats_exit_mutex; -static inline int taskstats_has_listeners(void) -{ - if (!genl_sock) - return 0; - return netlink_has_listeners(genl_sock, TASKSTATS_LISTEN_GROUP); -} - - -static inline void taskstats_exit_alloc(struct taskstats **ptidstats) -{ - *ptidstats = NULL; - if (taskstats_has_listeners()) - *ptidstats = kmem_cache_zalloc(taskstats_cache, SLAB_KERNEL); -} - static inline void taskstats_exit_free(struct taskstats *tidstats) { if (tidstats) @@ -82,17 +67,18 @@ static inline void taskstats_tgid_free(s kmem_cache_free(taskstats_cache, stats); } -extern void taskstats_exit_send(struct task_struct *, struct taskstats *, int); +extern void taskstats_exit_alloc(struct taskstats **, unsigned int *); +extern void taskstats_exit_send(struct task_struct *, struct taskstats *, int, unsigned int); extern void taskstats_init_early(void); extern void taskstats_tgid_alloc(struct signal_struct *); #else -static inline void taskstats_exit_alloc(struct taskstats **ptidstats) +static inline void taskstats_exit_alloc(struct taskstats **ptidstats, unsigned int *mycpu) {} static inline void taskstats_exit_free(struct taskstats *ptidstats) {} static inline void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats, - int group_dead) + int group_dead, unsigned int cpu) {} static inline void taskstats_tgid_init(struct signal_struct *sig) {} diff -puN kernel/exit.c~per-task-delay-accounting-taskstats-interface-control-exit-data-through-cpumasks kernel/exit.c --- a/kernel/exit.c~per-task-delay-accounting-taskstats-interface-control-exit-data-through-cpumasks +++ a/kernel/exit.c @@ -847,6 +847,7 @@ fastcall NORET_TYPE void do_exit(long co struct task_struct *tsk = current; struct taskstats *tidstats; int group_dead; + unsigned int mycpu; profile_task_exit(tsk); @@ -884,7 +885,7 @@ fastcall NORET_TYPE void do_exit(long co current->comm, current->pid, preempt_count()); - taskstats_exit_alloc(&tidstats); + taskstats_exit_alloc(&tidstats, &mycpu); acct_update_integrals(tsk); if (tsk->mm) { @@ -905,7 +906,7 @@ fastcall NORET_TYPE void do_exit(long co #endif if (unlikely(tsk->audit_context)) audit_free(tsk); - taskstats_exit_send(tsk, tidstats, group_dead); + taskstats_exit_send(tsk, tidstats, group_dead, mycpu); taskstats_exit_free(tidstats); delayacct_tsk_exit(tsk); diff -puN kernel/taskstats.c~per-task-delay-accounting-taskstats-interface-control-exit-data-through-cpumasks kernel/taskstats.c --- a/kernel/taskstats.c~per-task-delay-accounting-taskstats-interface-control-exit-data-through-cpumasks +++ a/kernel/taskstats.c @@ -19,9 +19,17 @@ #include <linux/kernel.h> #include <linux/taskstats_kern.h> #include <linux/delayacct.h> +#include <linux/cpumask.h> +#include <linux/percpu.h> #include <net/genetlink.h> #include <asm/atomic.h> +/* + * Maximum length of a cpumask that can be specified in + * the TASKSTATS_CMD_ATTR_REGISTER/DEREGISTER_CPUMASK attribute + */ +#define TASKSTATS_CPUMASK_MAXLEN (100+6*NR_CPUS) + static DEFINE_PER_CPU(__u32, taskstats_seqnum) = { 0 }; static int family_registered; kmem_cache_t *taskstats_cache; @@ -37,8 +45,25 @@ static struct nla_policy taskstats_cmd_g __read_mostly = { [TASKSTATS_CMD_ATTR_PID] = { .type = NLA_U32 }, [TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 }, + [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING }, + [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },}; + +struct listener { + struct list_head list; + pid_t pid; +}; + +struct listener_list { + struct rw_semaphore sem; + struct list_head list; }; +static DEFINE_PER_CPU(struct listener_list, listener_array); +enum actions { + REGISTER, + DEREGISTER, + CPU_DONT_CARE +}; static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp, void **replyp, size_t size) @@ -74,9 +99,11 @@ static int prepare_reply(struct genl_inf return 0; } -static int send_reply(struct sk_buff *skb, pid_t pid, int event) +static int send_reply(struct sk_buff *skb, pid_t pid, int event, unsigned int cpu) { struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data); + struct listener_list *listeners; + struct list_head *p, *tmp; void *reply; int rc; @@ -88,9 +115,29 @@ static int send_reply(struct sk_buff *sk return rc; } - if (event == TASKSTATS_MSG_MULTICAST) - return genlmsg_multicast(skb, pid, TASKSTATS_LISTEN_GROUP); - return genlmsg_unicast(skb, pid); + if (event == TASKSTATS_MSG_UNICAST) + return genlmsg_unicast(skb, pid); + + /* + * Taskstats multicast is unicasts to listeners who have registered + * interest in cpu + */ + + listeners = &per_cpu(listener_array, cpu); + down_write(&listeners->sem); + list_for_each_safe(p, tmp, &listeners->list) { + int ret; + struct listener *s = list_entry(p, struct listener, list); + ret = genlmsg_unicast(skb, s->pid); + if (ret) { + list_del(&s->list); + kfree(s); + rc = ret; + } + } + up_write(&listeners->sem); + + return rc; } static int fill_pid(pid_t pid, struct task_struct *pidtsk, @@ -204,8 +251,55 @@ ret: return; } +static int add_del_listener(pid_t pid, cpumask_t *maskp, int isadd) +{ + struct listener *s; + struct listener_list *listeners; + unsigned int cpu; + cpumask_t mask = *maskp; + struct list_head *p; + + if (!cpus_subset(mask, cpu_possible_map)) + return -EINVAL; + + if (isadd == REGISTER) { + for_each_cpu_mask(cpu, mask) { + s = kmalloc_node(sizeof(struct listener), GFP_KERNEL, + cpu_to_node(cpu)); + if (!s) + goto cleanup; + s->pid = pid; + INIT_LIST_HEAD(&s->list); + + listeners = &per_cpu(listener_array, cpu); + down_write(&listeners->sem); + list_add(&s->list, &listeners->list); + up_write(&listeners->sem); + } + return 0; + } -static int taskstats_send_stats(struct sk_buff *skb, struct genl_info *info) + /* Deregister or cleanup */ +cleanup: + for_each_cpu_mask(cpu, mask) { + struct list_head *tmp; + + listeners = &per_cpu(listener_array, cpu); + down_write(&listeners->sem); + list_for_each_safe(p, tmp, &listeners->list) { + s = list_entry(p, struct listener, list); + if (s->pid == pid) { + list_del(&s->list); + kfree(s); + break; + } + } + up_write(&listeners->sem); + } + return 0; +} + +static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) { int rc = 0; struct sk_buff *rep_skb; @@ -213,6 +307,29 @@ static int taskstats_send_stats(struct s void *reply; size_t size; struct nlattr *na; + cpumask_t mask; + + if (info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK]) { + na = info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK]; + if (nla_len(na) > TASKSTATS_CPUMASK_MAXLEN) + return -E2BIG; + rc = cpulist_parse((char *)nla_data(na), mask); + if (rc) + return rc; + rc = add_del_listener(info->snd_pid, &mask, REGISTER); + return rc; + } + + if (info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK]) { + na = info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK]; + if (nla_len(na) > TASKSTATS_CPUMASK_MAXLEN) + return -E2BIG; + rc = cpulist_parse((char *)nla_data(na), mask); + if (rc) + return rc; + rc = add_del_listener(info->snd_pid, &mask, DEREGISTER); + return rc; + } /* * Size includes space for nested attributes @@ -252,7 +369,8 @@ static int taskstats_send_stats(struct s nla_nest_end(rep_skb, na); - return send_reply(rep_skb, info->snd_pid, TASKSTATS_MSG_UNICAST); + return send_reply(rep_skb, info->snd_pid, TASKSTATS_MSG_UNICAST, + CPU_DONT_CARE); nla_put_failure: return genlmsg_cancel(rep_skb, reply); @@ -261,9 +379,35 @@ err: return rc; } +void taskstats_exit_alloc(struct taskstats **ptidstats, unsigned int *mycpu) +{ + struct listener_list *listeners; + struct taskstats *tmp; + /* + * This is the cpu on which the task is exiting currently and will + * be the one for which the exit event is sent, even if the cpu + * on which this function is running changes later. + */ + *mycpu = raw_smp_processor_id(); + + *ptidstats = NULL; + tmp = kmem_cache_zalloc(taskstats_cache, SLAB_KERNEL); + if (!tmp) + return; + + listeners = &per_cpu(listener_array, *mycpu); + down_read(&listeners->sem); + if (!list_empty(&listeners->list)) { + *ptidstats = tmp; + tmp = NULL; + } + up_read(&listeners->sem); + kfree(tmp); +} + /* Send pid data out on exit */ void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats, - int group_dead) + int group_dead, unsigned int mycpu) { int rc; struct sk_buff *rep_skb; @@ -324,7 +468,7 @@ void taskstats_exit_send(struct task_str nla_nest_end(rep_skb, na); send: - send_reply(rep_skb, 0, TASKSTATS_MSG_MULTICAST); + send_reply(rep_skb, 0, TASKSTATS_MSG_MULTICAST, mycpu); return; nla_put_failure: @@ -338,7 +482,7 @@ ret: static struct genl_ops taskstats_ops = { .cmd = TASKSTATS_CMD_GET, - .doit = taskstats_send_stats, + .doit = taskstats_user_cmd, .policy = taskstats_cmd_get_policy, }; @@ -353,6 +497,7 @@ void __init taskstats_init_early(void) static int __init taskstats_init(void) { int rc; + unsigned int i; rc = genl_register_family(&family); if (rc) @@ -362,6 +507,11 @@ static int __init taskstats_init(void) if (rc < 0) goto err; + for_each_possible_cpu(i) { + INIT_LIST_HEAD(&(per_cpu(listener_array, i).list)); + init_rwsem(&(per_cpu(listener_array, i).sem)); + } + family_registered = 1; return 0; err: _ Patches currently in -mm which might be from nagar@xxxxxxxxxxxxxx are per-task-delay-accounting-setup.patch per-task-delay-accounting-sync-block-i-o-and-swapin-delay-collection.patch per-task-delay-accounting-cpu-delay-collection-via-schedstats.patch per-task-delay-accounting-utilities-for-genetlink-usage.patch per-task-delay-accounting-taskstats-interface.patch per-task-delay-accounting-delay-accounting-usage-of-taskstats-interface.patch per-task-delay-accounting-documentation.patch per-task-delay-accounting-proc-export-of-aggregated-block-i-o-delays.patch delay-accounting-taskstats-interface-send-tgid-once.patch per-task-delay-accounting-avoid-send-without-listeners.patch per-task-delay-accounting-taskstats-interface-control-exit-data-through-cpumasks.patch task-watchers-register-per-task-delay-accounting.patch - To unsubscribe from this list: send the line "unsubscribe mm-commits" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html