Happy new year guys.
I need to make /proc cgroups-aware, because I want LXC containers to see only the resources that are given to them.
In order to do that I had to patch the kernel. I decided to start with cpuinfo, stat and interrupts and then continue
with meminfo and loadavg.
I managed to patch the kernel (Linux 3.12.0) and make /proc/cpuinfo, /proc/stat and /proc/interrupts cgroups-aware.
Attached are the patches that make the necessary changes.
The change for /proc/cpuinfo and /proc/interrupts is currently done only for x86 arch, but I will patch the rest of the
architectures if the style of the patches is acceptable.
Tomorrow I will check if the patches apply and build with the latest kernel.
Best regards,
Marian
>From 94891538f4a6a6b57aab0a2b917589ba73adfad9 Mon Sep 17 00:00:00 2001
From: Marian Marinov <mm@xxxxxxxx>
Date: Sat, 4 Jan 2014 05:45:42 +0200
Subject: [PATCH 1/2] arch/x86/kernel/cpu/proc.c: Make /proc/cpuinfo display
cpu information relative only to the current cgroup - added linux/cgroup.h
include because it is needed for the cpumask_test_cpu() - added a
task_struct to c_start() - and added a loop that will skip all CPUs that are
not part of the current cgroup by using the cpus_allowed mask from the
task_struct
Signed-off-by: Marian Marinov <mm@xxxxxxxx>
---
arch/x86/kernel/cpu/proc.c | 13 ++++++++++++-
1 file changed, 12 insertions(+), 1 deletion(-)
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
index aee6317..d9e9fb6 100644
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -3,6 +3,7 @@
#include <linux/string.h>
#include <linux/seq_file.h>
#include <linux/cpufreq.h>
+#include <linux/cgroup.h>
/*
* Get CPU information for use by the procfs.
@@ -133,9 +134,19 @@ static int show_cpuinfo(struct seq_file *m, void *v)
static void *c_start(struct seq_file *m, loff_t *pos)
{
+ struct task_struct *tsk;
*pos = cpumask_next(*pos - 1, cpu_online_mask);
- if ((*pos) < nr_cpu_ids)
+ tsk = current_thread_info()->task;
+ if ((*pos) < nr_cpu_ids) {
+ if (tsk != NULL) {
+ while (cpumask_test_cpu((*pos), &tsk->cpus_allowed) == 0) {
+ (*pos)++;
+ if ((*pos) >= nr_cpu_ids)
+ return NULL;
+ }
+ }
return &cpu_data(*pos);
+ }
return NULL;
}
--
1.8.4
>From ff68f073cb90316baa78936ff219a155788e29c2 Mon Sep 17 00:00:00 2001
From: Marian Marinov <mm@xxxxxxxx>
Date: Sat, 4 Jan 2014 06:10:24 +0200
Subject: [PATCH 1/1] arch/x86/kernel/irq.c: Make /proc/interrupts
cgroups aware - print only the CPUs that are part of the current cgroup -
Added code to handle Kconfig options - Added code to skip all CPUs that are
not part of the current cgroup using the task_struct's cpus_allowed mask
Signed-off-by: Marian Marinov <mm@xxxxxxxx>
---
arch/x86/kernel/irq.c | 73 +++++++++++++++++++++++++++++++++++++++++----------
1 file changed, 59 insertions(+), 14 deletions(-)
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 22d0687..b0a17c0 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -54,75 +54,120 @@ void ack_bad_irq(unsigned int irq)
int arch_show_interrupts(struct seq_file *p, int prec)
{
int j;
+#ifdef CONFIG_CPUSETS
+ struct task_struct *tsk;
+ tsk = current_thread_info()->task;
+#endif
seq_printf(p, "%*s: ", prec, "NMI");
for_each_online_cpu(j)
- seq_printf(p, "%10u ", irq_stats(j)->__nmi_count);
+#ifdef CONFIG_CPUSETS
+ if (tsk != NULL && cpumask_test_cpu(j, &tsk->cpus_allowed))
+#endif
+ seq_printf(p, "%10u ", irq_stats(j)->__nmi_count);
seq_printf(p, " Non-maskable interrupts\n");
#ifdef CONFIG_X86_LOCAL_APIC
seq_printf(p, "%*s: ", prec, "LOC");
for_each_online_cpu(j)
- seq_printf(p, "%10u ", irq_stats(j)->apic_timer_irqs);
+#ifdef CONFIG_CPUSETS
+ if (tsk != NULL && cpumask_test_cpu(j, &tsk->cpus_allowed))
+#endif
+ seq_printf(p, "%10u ", irq_stats(j)->apic_timer_irqs);
seq_printf(p, " Local timer interrupts\n");
seq_printf(p, "%*s: ", prec, "SPU");
for_each_online_cpu(j)
- seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count);
+#ifdef CONFIG_CPUSETS
+ if (tsk != NULL && cpumask_test_cpu(j, &tsk->cpus_allowed))
+#endif
+ seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count);
seq_printf(p, " Spurious interrupts\n");
seq_printf(p, "%*s: ", prec, "PMI");
for_each_online_cpu(j)
- seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs);
+#ifdef CONFIG_CPUSETS
+ if (tsk != NULL && cpumask_test_cpu(j, &tsk->cpus_allowed))
+#endif
+ seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs);
seq_printf(p, " Performance monitoring interrupts\n");
seq_printf(p, "%*s: ", prec, "IWI");
for_each_online_cpu(j)
- seq_printf(p, "%10u ", irq_stats(j)->apic_irq_work_irqs);
+#ifdef CONFIG_CPUSETS
+ if (tsk != NULL && cpumask_test_cpu(j, &tsk->cpus_allowed))
+#endif
+ seq_printf(p, "%10u ", irq_stats(j)->apic_irq_work_irqs);
seq_printf(p, " IRQ work interrupts\n");
seq_printf(p, "%*s: ", prec, "RTR");
for_each_online_cpu(j)
- seq_printf(p, "%10u ", irq_stats(j)->icr_read_retry_count);
+#ifdef CONFIG_CPUSETS
+ if (tsk != NULL && cpumask_test_cpu(j, &tsk->cpus_allowed))
+#endif
+ seq_printf(p, "%10u ", irq_stats(j)->icr_read_retry_count);
seq_printf(p, " APIC ICR read retries\n");
#endif
if (x86_platform_ipi_callback) {
seq_printf(p, "%*s: ", prec, "PLT");
for_each_online_cpu(j)
- seq_printf(p, "%10u ", irq_stats(j)->x86_platform_ipis);
+#ifdef CONFIG_CPUSETS
+ if (tsk != NULL && cpumask_test_cpu(j, &tsk->cpus_allowed))
+#endif
+ seq_printf(p, "%10u ", irq_stats(j)->x86_platform_ipis);
seq_printf(p, " Platform interrupts\n");
}
#ifdef CONFIG_SMP
seq_printf(p, "%*s: ", prec, "RES");
for_each_online_cpu(j)
- seq_printf(p, "%10u ", irq_stats(j)->irq_resched_count);
+ if (tsk != NULL && cpumask_test_cpu(j, &tsk->cpus_allowed))
+#endif
+ seq_printf(p, "%10u ", irq_stats(j)->irq_resched_count);
seq_printf(p, " Rescheduling interrupts\n");
seq_printf(p, "%*s: ", prec, "CAL");
for_each_online_cpu(j)
- seq_printf(p, "%10u ", irq_stats(j)->irq_call_count -
+#ifdef CONFIG_CPUSETS
+ if (tsk != NULL && cpumask_test_cpu(j, &tsk->cpus_allowed))
+#endif
+ seq_printf(p, "%10u ", irq_stats(j)->irq_call_count -
irq_stats(j)->irq_tlb_count);
seq_printf(p, " Function call interrupts\n");
seq_printf(p, "%*s: ", prec, "TLB");
for_each_online_cpu(j)
- seq_printf(p, "%10u ", irq_stats(j)->irq_tlb_count);
+#ifdef CONFIG_CPUSETS
+ if (tsk != NULL && cpumask_test_cpu(j, &tsk->cpus_allowed))
+#endif
+ seq_printf(p, "%10u ", irq_stats(j)->irq_tlb_count);
seq_printf(p, " TLB shootdowns\n");
#endif
#ifdef CONFIG_X86_THERMAL_VECTOR
seq_printf(p, "%*s: ", prec, "TRM");
for_each_online_cpu(j)
- seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count);
+#ifdef CONFIG_CPUSETS
+ if (tsk != NULL && cpumask_test_cpu(j, &tsk->cpus_allowed))
+#endif
+ seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count);
seq_printf(p, " Thermal event interrupts\n");
#endif
#ifdef CONFIG_X86_MCE_THRESHOLD
seq_printf(p, "%*s: ", prec, "THR");
for_each_online_cpu(j)
- seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count);
+#ifdef CONFIG_CPUSETS
+ if (tsk != NULL && cpumask_test_cpu(j, &tsk->cpus_allowed))
+#endif
+ seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count);
seq_printf(p, " Threshold APIC interrupts\n");
#endif
#ifdef CONFIG_X86_MCE
seq_printf(p, "%*s: ", prec, "MCE");
for_each_online_cpu(j)
- seq_printf(p, "%10u ", per_cpu(mce_exception_count, j));
+#ifdef CONFIG_CPUSETS
+ if (tsk != NULL && cpumask_test_cpu(j, &tsk->cpus_allowed))
+#endif
+ seq_printf(p, "%10u ", per_cpu(mce_exception_count, j));
seq_printf(p, " Machine check exceptions\n");
seq_printf(p, "%*s: ", prec, "MCP");
for_each_online_cpu(j)
- seq_printf(p, "%10u ", per_cpu(mce_poll_count, j));
+#ifdef CONFIG_CPUSETS
+ if (tsk != NULL && cpumask_test_cpu(j, &tsk->cpus_allowed))
+#endif
+ seq_printf(p, "%10u ", per_cpu(mce_poll_count, j));
seq_printf(p, " Machine check polls\n");
#endif
seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count));
--
1.8.4
>From 00af9f7b5eeef770d0da240a6bf2064a2ba11e47 Mon Sep 17 00:00:00 2001
From: Marian Marinov <mm@xxxxxxxx>
Date: Sat, 4 Jan 2014 06:03:11 +0200
Subject: [PATCH 1/1] fs/proc/stat.c & kernel/sched/stats.c: List only the CPUs
that are in the current cpuset
- Added a check to allow the display of cpu information only if the cpu is part of the current cpuset, using the task_struct's cpus_allowed mask
Signed-off-by: Marian Marinov <mm@xxxxxxxx>
---
fs/proc/stat.c | 14 ++++++++++++++
kernel/sched/stats.c | 9 +++++++++
2 files changed, 23 insertions(+)
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 1cf86c0..e5ca3ef 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -87,6 +87,11 @@ static int show_stat(struct seq_file *p, void *v)
u64 sum_softirq = 0;
unsigned int per_softirq_sums[NR_SOFTIRQS] = {0};
struct timespec boottime;
+#ifdef CONFIG_CPUSETS
+ struct task_struct *tsk;
+
+ tsk = current_thread_info()->task;
+#endif
user = nice = system = idle = iowait =
irq = softirq = steal = 0;
@@ -94,7 +99,12 @@ static int show_stat(struct seq_file *p, void *v)
getboottime(&boottime);
jif = boottime.tv_sec;
+
for_each_possible_cpu(i) {
+#ifdef CONFIG_CPUSETS
+ if (tsk != NULL && cpumask_test_cpu(i, &tsk->cpus_allowed) == 0)
+ continue;
+#endif
user += kcpustat_cpu(i).cpustat[CPUTIME_USER];
nice += kcpustat_cpu(i).cpustat[CPUTIME_NICE];
system += kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM];
@@ -142,6 +152,10 @@ static int show_stat(struct seq_file *p, void *v)
steal = kcpustat_cpu(i).cpustat[CPUTIME_STEAL];
guest = kcpustat_cpu(i).cpustat[CPUTIME_GUEST];
guest_nice = kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE];
+#ifdef CONFIG_CPUSETS
+ if (tsk != NULL && cpumask_test_cpu(i, &tsk->cpus_allowed) == 0)
+ continue;
+#endif
seq_printf(p, "cpu%d", i);
seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(user));
seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(nice));
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
index da98af3..5897358 100644
--- a/kernel/sched/stats.c
+++ b/kernel/sched/stats.c
@@ -17,6 +17,10 @@ static int show_schedstat(struct seq_file *seq, void *v)
int cpu;
int mask_len = DIV_ROUND_UP(NR_CPUS, 32) * 9;
char *mask_str = kmalloc(mask_len, GFP_KERNEL);
+#ifdef CONFIG_CPUSETS
+ struct task_struct *tsk;
+ tsk = current_thread_info()->task;
+#endif
if (mask_str == NULL)
return -ENOMEM;
@@ -33,6 +37,11 @@ static int show_schedstat(struct seq_file *seq, void *v)
cpu = (unsigned long)(v - 2);
rq = cpu_rq(cpu);
+#ifdef CONFIG_CPUSETS
+ if (tsk != NULL && cpumask_test_cpu(cpu, &tsk->cpus_allowed) == 0)
+ return 0;
+#endif
+
/* runqueue-specific stats */
seq_printf(seq,
"cpu%d %u 0 %u %u %u %u %llu %llu %lu",
--
1.8.4
>From dec97e6141f92109c0cd02883cff20e3f1429564 Mon Sep 17 00:00:00 2001
From: Marian Marinov <mm@xxxxxxxx>
Date: Sat, 4 Jan 2014 05:50:03 +0200
Subject: [PATCH 2/2] arch/x86/kernel/cpu/proc.c: Added Kconfig option handling
Signed-off-by: Marian Marinov <mm@xxxxxxxx>
---
arch/x86/kernel/cpu/proc.c | 9 ++++++++-
1 file changed, 8 insertions(+), 1 deletion(-)
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
index d9e9fb6..114fd95 100644
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -3,7 +3,10 @@
#include <linux/string.h>
#include <linux/seq_file.h>
#include <linux/cpufreq.h>
+
+#ifdef CONFIG_CPUSETS
#include <linux/cgroup.h>
+#endif
/*
* Get CPU information for use by the procfs.
@@ -134,10 +137,13 @@ static int show_cpuinfo(struct seq_file *m, void *v)
static void *c_start(struct seq_file *m, loff_t *pos)
{
+#ifdef CONFIG_CPUSETS
struct task_struct *tsk;
+#endif
*pos = cpumask_next(*pos - 1, cpu_online_mask);
- tsk = current_thread_info()->task;
if ((*pos) < nr_cpu_ids) {
+#ifdef CONFIG_CPUSETS
+ tsk = current_thread_info()->task;
if (tsk != NULL) {
while (cpumask_test_cpu((*pos), &tsk->cpus_allowed) == 0) {
(*pos)++;
@@ -145,6 +151,7 @@ static void *c_start(struct seq_file *m, loff_t *pos)
return NULL;
}
}
+#endif
return &cpu_data(*pos);
}
return NULL;
--
1.8.4