When I run "cat /proc/stat" in a container, container will access host's file directly which is a security risk. LXCFS is a good way to strengthen the isolation among containers. However, I can not get a container's correct status because LXCFS just transfer host's status to container. So I track status of a task group and record it on cgroup.procs_stat so that LXCFS can read correct status of a cont- ainer. cgroup.procs_stat record context switches, boot time, the number of running processes, blocked processes, fork and softirq of a task group. Actually, container is just a process for linux kernel and container's children processes belong to a same task group so I can get container's status as long as I find task group to which the container belongs. Add two data structures in CPU accounting group to save the status of a task grooup. For each task, find CPU accounting group to which this task belongs, then update the corresponding data. So, I can get the co- rrect status data of a container in the cgroup to which the container belongs. Signed-off-by: zhangq95 <qiangzh.hust@xxxxxxxxx> --- include/linux/cgroup.h | 31 ++++++++++++ include/linux/cpuset.h | 1 + include/linux/pid_namespace.h | 6 +++ kernel/cgroup/cgroup-v1.c | 108 ++++++++++++++++++++++++++++++++++++++++++ kernel/cgroup/cgroup.c | 8 ++++ kernel/cgroup/cpuset.c | 26 ++++++++++ kernel/fork.c | 3 ++ kernel/sched/core.c | 14 ++++++ kernel/sched/cpuacct.c | 103 ++++++++++++++++++++++++++++++++++++++++ kernel/sched/fair.c | 23 +++++++++ kernel/sched/rt.c | 2 + kernel/softirq.c | 5 ++ 12 files changed, 330 insertions(+) diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 473e0c0..63aa652 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -113,6 +113,8 @@ int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen); int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry); int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *tsk); +extern struct cgroup_subsys_state *global_cgroup_css(struct cgroup *cgrp, + int ssid); void cgroup_fork(struct task_struct *p); extern int cgroup_can_fork(struct task_struct *p); @@ -696,12 +698,41 @@ static inline void cgroup_path_from_kernfs_id(const union kernfs_node_id *id, #ifdef CONFIG_CGROUPS #ifdef CONFIG_CGROUP_CPUACCT +enum { + CPUACCT_PROCS_RUNNING = 0, + CPUACCT_PROCS_IOWAIT, + CPUACCT_PROCS_FORKS, + CPUACCT_PROCS_SWITCHES, + + CPUACCT_PROCS_STAT_NSTATS, +}; void cpuacct_charge(struct task_struct *tsk, u64 cputime); void cpuacct_account_field(struct task_struct *tsk, int index, u64 val); +unsigned long task_ca_procs_stat(struct task_struct *tsk, int cpu, + int index, int m_index); +void update_cpuacct_procs_stat(struct task_struct *tsk, int cpu, + int index, int inc, int m_index); +bool task_in_nonroot_cpuacct(struct task_struct *tsk); +void update_cpuacct_running_from_tg(struct task_group *tg, + int cpu, int inc); #else static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} static inline void cpuacct_account_field(struct task_struct *tsk, int index, u64 val) {} +static inline unsigned long +task_ca_procs_stat(struct task_struct *tsk, int cpu, + int index, int m_index) { return 0; } + +static inline void +update_cpuacct_procs_stat(struct task_struct *tsk, int cpu, + int index, int inc, int m_index) {} + +static inline bool +task_in_nonroot_cpuacct(struct task_struct *tsk) { return false; } + +static inline void +update_cpuacct_running_from_tg(struct task_group *tg, + 

 include/linux/cgroup.h        |  31 ++++++++++++
 include/linux/cpuset.h        |   1 +
 include/linux/pid_namespace.h |   6 +++
 kernel/cgroup/cgroup-v1.c     | 108 ++++++++++++++++++++++++++++++++++++++++++
 kernel/cgroup/cgroup.c        |   8 ++++
 kernel/cgroup/cpuset.c        |  26 ++++++++++
 kernel/fork.c                 |   3 ++
 kernel/sched/core.c           |  14 ++++++
 kernel/sched/cpuacct.c        | 103 ++++++++++++++++++++++++++++++++++++++++
 kernel/sched/fair.c           |  23 +++++++++
 kernel/sched/rt.c             |   2 +
 kernel/softirq.c              |   5 ++
 12 files changed, 330 insertions(+)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 473e0c0..63aa652 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -113,6 +113,8 @@ int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen);
 int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry);
 int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
 		     struct pid *pid, struct task_struct *tsk);
+extern struct cgroup_subsys_state *global_cgroup_css(struct cgroup *cgrp,
+						     int ssid);
 
 void cgroup_fork(struct task_struct *p);
 extern int cgroup_can_fork(struct task_struct *p);
@@ -696,12 +698,41 @@ static inline void cgroup_path_from_kernfs_id(const union kernfs_node_id *id,
 #ifdef CONFIG_CGROUPS
 
 #ifdef CONFIG_CGROUP_CPUACCT
+enum {
+	CPUACCT_PROCS_RUNNING = 0,
+	CPUACCT_PROCS_IOWAIT,
+	CPUACCT_PROCS_FORKS,
+	CPUACCT_PROCS_SWITCHES,
+
+	CPUACCT_PROCS_STAT_NSTATS,
+};
 void cpuacct_charge(struct task_struct *tsk, u64 cputime);
 void cpuacct_account_field(struct task_struct *tsk, int index, u64 val);
+unsigned long task_ca_procs_stat(struct task_struct *tsk, int cpu,
+				 int index, int m_index);
+void update_cpuacct_procs_stat(struct task_struct *tsk, int cpu,
+			       int index, int inc, int m_index);
+bool task_in_nonroot_cpuacct(struct task_struct *tsk);
+void update_cpuacct_running_from_tg(struct task_group *tg,
+				    int cpu, int inc);
 #else
 static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
 static inline void cpuacct_account_field(struct task_struct *tsk, int index,
 					 u64 val) {}
+static inline unsigned long
+task_ca_procs_stat(struct task_struct *tsk, int cpu,
+		   int index, int m_index) { return 0; }
+
+static inline void
+update_cpuacct_procs_stat(struct task_struct *tsk, int cpu,
+			  int index, int inc, int m_index) {}
+
+static inline bool
+task_in_nonroot_cpuacct(struct task_struct *tsk) { return false; }
+
+static inline void
+update_cpuacct_running_from_tg(struct task_group *tg,
+			       int cpu, int inc) {}
 #endif
 
 void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec);
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 934633a..4ce5372 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -56,6 +56,7 @@ extern void cpuset_force_rebuild(void);
 extern void cpuset_update_active_cpus(void);
 extern void cpuset_wait_for_hotplug(void);
 extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask);
+extern void get_tsk_cpu_allowed(struct task_struct *tsk, struct cpumask *pmask);
 extern void cpuset_cpus_allowed_fallback(struct task_struct *p);
 extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
 #define cpuset_current_mems_allowed (current->mems_allowed)
diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
index 49538b1..2d84f7b 100644
--- a/include/linux/pid_namespace.h
+++ b/include/linux/pid_namespace.h
@@ -100,4 +100,10 @@ extern struct pid_namespace *task_active_pid_ns(struct task_struct *tsk);
 void pidhash_init(void);
 void pid_idr_init(void);
 
+/* Return true if the task is outside the root (init) pid namespace */
+static inline bool in_noninit_pid_ns(struct task_struct *tsk)
+{
+	return task_active_pid_ns(tsk) != &init_pid_ns;
+}
+
 #endif /* _LINUX_PID_NS_H */
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index a2c05d2..95cafc2 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -7,12 +7,15 @@
 #include <linux/mm.h>
 #include <linux/sched/signal.h>
 #include <linux/sched/task.h>
+#include <linux/sched/stat.h>
 #include <linux/magic.h>
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/delayacct.h>
 #include <linux/pid_namespace.h>
+#include <linux/kernel_stat.h>
 #include <linux/cgroupstats.h>
+#include <linux/cpuset.h>
 
 #include <trace/events/cgroup.h>
 
@@ -604,6 +607,108 @@ static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
 	return 0;
 }
 
+static int cgroup_procs_stat_show(struct seq_file *s, void *v)
+{
+	struct kernfs_open_file *of = s->private;
+	struct cgroup *cgrp = seq_css(s)->cgroup;
+	struct cgroup_pidlist *l;
+	enum cgroup_filetype type = seq_cft(s)->private;
+	struct task_struct *tsk;
+	int ret, i = 0, j = 0, tmp = 0;
+	unsigned long forks = 0, iowait = 0, nr_runnable = 0;
+	pid_t *start;
+	struct timespec64 boottime;
+	unsigned long long start_time, switches = 0;
+	unsigned long per_softirq_nums[NR_SOFTIRQS] = {0};
+	unsigned long sum_softirq = 0;
+	struct cpumask cpus_allowed;
+
+	mutex_lock(&cgrp->pidlist_mutex);
+	if (of->priv)
+		of->priv = cgroup_pidlist_find(cgrp, type);
+
+	if (!of->priv) {
+		ret = pidlist_array_load(cgrp, type,
+					 (struct cgroup_pidlist **)&of->priv);
+		if (ret) {
+			mutex_unlock(&cgrp->pidlist_mutex);
+			return ret;
+		}
+	}
+	l = of->priv;
+
+	start = l->list;
+
+	tsk = find_task_by_pid_ns(*start, &init_pid_ns);
+	getboottime64(&boottime);
+
+	if (in_noninit_pid_ns(tsk) &&
+	    task_in_nonroot_cpuacct(tsk)) {
+		if (task_css(tsk, cpuset_cgrp_id)) {
+			memset(&cpus_allowed, 0, sizeof(cpus_allowed));
+			get_tsk_cpu_allowed(tsk, &cpus_allowed);
+		}
+
+		start_time = tsk->real_start_time / NSEC_PER_SEC;
+		start_time += (unsigned long long)boottime.tv_sec;
+
+		for_each_cpu_and(i, cpu_possible_mask, &cpus_allowed) {
+			switches += task_ca_procs_stat(tsk, i,
+					CPUACCT_PROCS_SWITCHES, 0);
+			forks += task_ca_procs_stat(tsk, i,
+					CPUACCT_PROCS_FORKS, 0);
+			nr_runnable += task_ca_procs_stat(tsk, i,
+					CPUACCT_PROCS_RUNNING, 0);
+			iowait += task_ca_procs_stat(tsk, i,
+					CPUACCT_PROCS_IOWAIT, 0);
+
+			for (j = 0; j < NR_SOFTIRQS; j++) {
+				tmp = task_ca_procs_stat(tsk, i, j,
+						1);
+				per_softirq_nums[j] += tmp;
+				sum_softirq += tmp;
+			}
+		}
+
+	} else {
+		cpumask_copy(&cpus_allowed, cpu_possible_mask);
+		nr_runnable = nr_running();
+		forks = total_forks;
+		iowait = nr_iowait();
+		switches = nr_context_switches();
+		start_time = (unsigned long long)boottime.tv_sec;
+
+		for (j = 0; j < NR_SOFTIRQS; j++) {
+			unsigned long softirq_stat = kstat_softirqs_cpu(j, i);
+
+			per_softirq_nums[j] += softirq_stat;
+			sum_softirq += softirq_stat;
+		}
+	}
+
+	seq_printf(s, "softirq %lu ", sum_softirq);
+	for (j = 0; j < NR_SOFTIRQS; j++)
+		seq_printf(s, "%lu ", per_softirq_nums[j]);
+
+	seq_puts(s, "\n");
+	seq_printf(s,
+		   "ctxt %llu\n"
+		   "btime %llu\n"
+		   "processes %lu\n"
+		   "procs_running %lu\n"
+		   "procs_blocked %lu\n",
+		   switches,
+		   start_time,
+		   forks,
+		   nr_runnable,
+		   iowait);
+
+	mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork,
+			 CGROUP_PIDLIST_DESTROY_DELAY);
+	mutex_unlock(&cgrp->pidlist_mutex);
+
+	return 0;
+}
+
 static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
 					 struct cftype *cft)
 {
@@ -678,6 +781,11 @@ struct cftype cgroup1_base_files[] = {
 		.write = cgroup_release_agent_write,
 		.max_write_len = PATH_MAX - 1,
 	},
+	{
+		.name = "cgroup.procs_stat",
+		.seq_show = cgroup_procs_stat_show,
+		.write = cgroup1_procs_write,
+	},
 	{ }	/* terminate */
 };
 
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index a662bfc..ec0f181 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -54,6 +54,8 @@
 #include <linux/proc_ns.h>
 #include <linux/nsproxy.h>
 #include <linux/file.h>
+#include <linux/cgroup.h>
+#include <linux/pid_namespace.h>
 #include <net/sock.h>
 
 #define CREATE_TRACE_POINTS
@@ -485,6 +487,12 @@ static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp,
 	return css;
 }
 
+struct cgroup_subsys_state *global_cgroup_css(struct cgroup *cgrp,
+					      int ssid)
+{
+	return cgroup_tryget_css(cgrp, cgroup_subsys[(ssid)]);
+}
+
 /**
  * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem
  * @cgrp: the cgroup of interest
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index b42037e..52c4c71 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -2431,6 +2431,32 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
 	spin_unlock_irqrestore(&callback_lock, flags);
 }
 
+/**
+ * get_tsk_cpu_allowed - get cpus_allowed mask of a tsk.
+ * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
+ * @pmask: pointer to struct cpumask variable to receive cpus_allowed set.
+ *
+ * Description: Returns the cpumask_var_t cpus_allowed of the cpuset
+ * according to the specified @tsk.
+ **/
+void get_tsk_cpu_allowed(struct task_struct *tsk, struct cpumask *pmask)
+{
+	unsigned long flags;
+	struct cpuset *cs = NULL;
+
+	spin_lock_irqsave(&callback_lock, flags);
+	rcu_read_lock();
+
+	cs = task_cs(tsk);
+	if (cs)
+		cpumask_and(pmask, cs->cpus_allowed, cpu_possible_mask);
+	else
+		cpumask_copy(pmask, cpu_possible_mask);
+
+	rcu_read_unlock();
+	spin_unlock_irqrestore(&callback_lock, flags);
+}
+
 void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
 {
 	rcu_read_lock();
diff --git a/kernel/fork.c b/kernel/fork.c
index a5d21c4..72449b0 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1958,6 +1958,9 @@ static __latent_entropy struct task_struct *copy_process(
 	}
 
 	total_forks++;
+	update_cpuacct_procs_stat(task_active_pid_ns(p)->child_reaper,
+			task_active_pid_ns(p)->child_reaper->cpu,
+			CPUACCT_PROCS_FORKS, 1, 0);
 	spin_unlock(&current->sighand->siglock);
 	syscall_tracepoint_update(p);
 	write_unlock_irq(&tasklist_lock);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 5e10aae..ba969af 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3404,11 +3404,19 @@ static void __sched notrace __schedule(bool preempt)
 	struct rq_flags rf;
 	struct rq *rq;
 	int cpu;
+	struct task_struct *prev_root = NULL, *next_root = NULL;
 
 	cpu = smp_processor_id();
 	rq = cpu_rq(cpu);
 	prev = rq->curr;
 
+	if (task_active_pid_ns(prev)) {
+		prev_root = task_active_pid_ns(prev)->child_reaper;
+		if (prev_root != init_pid_ns.child_reaper)
+			update_cpuacct_procs_stat(prev, prev->cpu,
+					CPUACCT_PROCS_SWITCHES, 1, 0);
+	}
+
 	schedule_debug(prev);
 
 	if (sched_feat(HRTICK))
@@ -3462,6 +3470,12 @@ static void __sched notrace __schedule(bool preempt)
 	}
 
 	next = pick_next_task(rq, prev, &rf);
+	if (task_active_pid_ns(next)) {
+		next_root = task_active_pid_ns(next)->child_reaper;
+		if (prev_root && prev_root != next_root)
+			update_cpuacct_procs_stat(next, next->cpu,
+					CPUACCT_PROCS_SWITCHES, 1, 0);
+	}
 	clear_tsk_need_resched(prev);
 	clear_preempt_need_resched();
 
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index 9fbb103..a822eb9 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -24,12 +24,20 @@ struct cpuacct_usage {
 	u64	usages[CPUACCT_STAT_NSTATS];
 };
 
+/* Process status of a group of tasks and its child cgroups */
+struct cpuacct_procs_stat {
+	unsigned long procs_stat[CPUACCT_PROCS_STAT_NSTATS];
+	unsigned long irq[NR_SOFTIRQS];
+};
+
 /* track CPU usage of a group of tasks and its child groups */
 struct cpuacct {
 	struct cgroup_subsys_state	css;
 	/* cpuusage holds pointer to a u64-type object on every CPU */
 	struct cpuacct_usage __percpu	*cpuusage;
 	struct kernel_cpustat __percpu	*cpustat;
+	struct cpuacct_procs_stat __percpu *procs_stat;
+	struct cpuacct_softirq *softirq;
 };
 
 static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css)
@@ -37,6 +45,12 @@ static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css)
 	return css ? container_of(css, struct cpuacct, css) : NULL;
 }
 
+/* Return the CPU accounting group corresponding to this cgroup */
+static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
+{
+	return container_of(global_cgroup_css(cgrp, cpuacct_cgrp_id),
+			    struct cpuacct, css);
+}
 /* Return CPU accounting group to which this task belongs */
 static inline struct cpuacct *task_ca(struct task_struct *tsk)
 {
@@ -49,11 +63,94 @@ static inline struct cpuacct *parent_ca(struct cpuacct *ca)
 }
 
 static DEFINE_PER_CPU(struct cpuacct_usage, root_cpuacct_cpuusage);
+static DEFINE_PER_CPU(struct cpuacct_procs_stat, root_cpuacct_procs_stat);
 static struct cpuacct root_cpuacct = {
 	.cpustat	= &kernel_cpustat,
 	.cpuusage	= &root_cpuacct_cpuusage,
+	.procs_stat	= &root_cpuacct_procs_stat,
 };
 
+/* Return true if the task is in a non-root cpuacct group */
+bool task_in_nonroot_cpuacct(struct task_struct *tsk)
+{
+	struct cpuacct *ca = task_ca(tsk);
+
+	if (ca && (ca != &root_cpuacct))
+		return true;
+	else
+		return false;
+}
+
+/* return processes stat of a group to which this task belongs */
+unsigned long task_ca_procs_stat(struct task_struct *tsk, int cpu,
+				 int index, int m_index)
+{
+	struct cpuacct *ca;
+	unsigned long res = 0;
+
+	if (!tsk)
+		return 0;
+
+	ca = task_ca(tsk);
+	if (ca) {
+		if (m_index == 0)
+			res = per_cpu_ptr(ca->procs_stat,
+					cpu)->procs_stat[index];
+		else
+			res = per_cpu_ptr(ca->procs_stat,
+					cpu)->irq[index];
+	}
+
+	return res;
+}
+
+/* update processes stat of a group to which this task belongs */
+void update_cpuacct_procs_stat(struct task_struct *tsk, int cpu, int index,
+			       int inc, int m_index)
+{
+	struct cpuacct *ca;
+	unsigned long *res;
+
+	if (!tsk)
+		return;
+
+	ca = task_ca(tsk);
+	if (ca) {
+		if (m_index == 0) {
+			res = &(per_cpu_ptr(ca->procs_stat,
+					cpu)->procs_stat[index]);
+			*res += inc;
+		} else {
+			res = &(per_cpu_ptr(ca->procs_stat,
+					cpu)->irq[index]);
+			*res += inc;
+		}
+	}
+}
+
+/* update cpuacct of a group to which this task belongs from a task_group */
+void update_cpuacct_running_from_tg(struct task_group *tg, int cpu, int inc)
+{
+	struct cgroup *cgrp;
+	struct cpuacct *ca;
+	unsigned long *nr_running;
+	struct cpuacct_procs_stat *procs_stat;
+
+	if (!tg)
+		return;
+
+	cgrp = tg->css.cgroup;
+	if (!cgrp)
+		return;
+
+	ca = cgroup_ca(cgrp);
+	if (ca && (ca != &root_cpuacct)) {
+		procs_stat = per_cpu_ptr(ca->procs_stat, cpu);
+		nr_running = &(procs_stat->procs_stat[CPUACCT_PROCS_RUNNING]);
+		*nr_running += inc;
+	}
+}
+
 /* Create a new CPU accounting group */
 static struct cgroup_subsys_state *
 cpuacct_css_alloc(struct cgroup_subsys_state *parent_css)
@@ -74,9 +171,14 @@ cpuacct_css_alloc(struct cgroup_subsys_state *parent_css)
 	ca->cpustat = alloc_percpu(struct kernel_cpustat);
 	if (!ca->cpustat)
 		goto out_free_cpuusage;
+	ca->procs_stat = alloc_percpu(struct cpuacct_procs_stat);
+	if (!ca->procs_stat)
+		goto out_free_cpustat;
 
 	return &ca->css;
 
+out_free_cpustat:
+	free_percpu(ca->cpustat);
 out_free_cpuusage:
 	free_percpu(ca->cpuusage);
 out_free_ca:
@@ -92,6 +194,7 @@ static void cpuacct_css_free(struct cgroup_subsys_state *css)
 	struct cpuacct *ca = css_ca(css);
 
 	free_percpu(ca->cpustat);
 	free_percpu(ca->cpuusage);
+	free_percpu(ca->procs_stat);
 	kfree(ca);
 }
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 54dc31e..46adf63 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -23,6 +23,7 @@
 #include "sched.h"
 
 #include <trace/events/sched.h>
+#include <linux/cgroup.h>
 
 /*
  * Targeted preemption latency for CPU-bound tasks:
@@ -4732,6 +4733,22 @@ static int tg_throttle_down(struct task_group *tg, void *data)
 	return 0;
 }
 
+void update_cpuacct_running_from_cfs(struct cfs_rq *cfs_rq, int inc)
+{
+	struct rq *rq;
+	int cpu = 0;
+
+	if (!cfs_rq)
+		return;
+
+	rq = rq_of(cfs_rq);
+	if (!rq)
+		return;
+
+	cpu = cpu_of(rq);
+	update_cpuacct_running_from_tg(cfs_rq->tg, cpu, inc);
+}
+
 static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
 {
 	struct rq *rq = rq_of(cfs_rq);
@@ -4757,6 +4774,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
 		if (dequeue)
 			dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
 		qcfs_rq->h_nr_running -= task_delta;
+		update_cpuacct_running_from_cfs(qcfs_rq, -task_delta);
 
 		if (qcfs_rq->load.weight)
 			dequeue = 0;
@@ -4820,6 +4838,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
 		if (enqueue)
 			enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
 		cfs_rq->h_nr_running += task_delta;
+		update_cpuacct_running_from_cfs(cfs_rq, task_delta);
 
 		if (cfs_rq_throttled(cfs_rq))
 			break;
@@ -5379,6 +5398,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		if (cfs_rq_throttled(cfs_rq))
 			break;
 		cfs_rq->h_nr_running++;
+		update_cpuacct_running_from_cfs(cfs_rq, 1);
 
 		flags = ENQUEUE_WAKEUP;
 	}
@@ -5386,6 +5406,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	for_each_sched_entity(se) {
 		cfs_rq = cfs_rq_of(se);
 		cfs_rq->h_nr_running++;
+		update_cpuacct_running_from_cfs(cfs_rq, 1);
 
 		if (cfs_rq_throttled(cfs_rq))
 			break;
@@ -5427,6 +5448,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		if (cfs_rq_throttled(cfs_rq))
 			break;
 		cfs_rq->h_nr_running--;
+		update_cpuacct_running_from_cfs(cfs_rq, -1);
 
 		/* Don't dequeue parent if it has other entities besides us */
 		if (cfs_rq->load.weight) {
@@ -5446,6 +5468,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	for_each_sched_entity(se) {
 		cfs_rq = cfs_rq_of(se);
 		cfs_rq->h_nr_running--;
+		update_cpuacct_running_from_cfs(cfs_rq, -1);
 
 		if (cfs_rq_throttled(cfs_rq))
 			break;
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 7aef6b4..766ec16 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1327,6 +1327,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
 		rt_se->timeout = 0;
 
 	enqueue_rt_entity(rt_se, flags);
+	update_cpuacct_procs_stat(p, cpu_of(rq), CPUACCT_PROCS_RUNNING, 1, 0);
 
 	if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
 		enqueue_pushable_task(rq, p);
@@ -1338,6 +1339,7 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
 
 	update_curr_rt(rq);
 	dequeue_rt_entity(rt_se, flags);
+	update_cpuacct_procs_stat(p, cpu_of(rq), CPUACCT_PROCS_RUNNING, -1, 0);
 
 	dequeue_pushable_task(rq, p);
 }
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 177de36..9fa1995 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -26,6 +26,7 @@
 #include <linux/smpboot.h>
 #include <linux/tick.h>
 #include <linux/irq.h>
+#include <linux/cgroup.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/irq.h>
@@ -248,6 +249,7 @@ asmlinkage __visible void __softirq_entry __do_softirq(void)
 	bool in_hardirq;
 	__u32 pending;
 	int softirq_bit;
+	struct task_struct *p = current;
 
 	/*
 	 * Mask out PF_MEMALLOC s current task context is borrowed for the
@@ -280,6 +282,9 @@ asmlinkage __visible void __softirq_entry __do_softirq(void)
 		prev_count = preempt_count();
 
 		kstat_incr_softirqs_this_cpu(vec_nr);
+		if (task_active_pid_ns(p))
+			update_cpuacct_procs_stat(p, p->cpu,
+					vec_nr, 1, 1);
 
 		trace_softirq_entry(vec_nr);
 		h->action(h);
-- 
2.7.4

--
To unsubscribe from this list: send the line "unsubscribe cgroups" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at
http://vger.kernel.org/majordomo-info.html