From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@xxxxxxxxxxxxxx> Currently, /proc/stat uses single_open() for showing information. This means all the data is gathered and buffered at once into a single (big) buffer. /proc/stat shows per-cpu stats and per-IRQ stats. To get the information in one shot, it allocates a big buffer (up to KMALLOC_MAX_SIZE). Eric Dumazet reported that the bufsize calculation doesn't take the number of IRQs into account, so the information cannot be obtained in one shot. (Because of this, seq_read() will allocate a buffer again and read the whole data again...) This patch changes /proc/stat to use seq_open() rather than single_open() and provides ->start(), ->next(), ->stop(), ->show(). With this, /proc/stat no longer needs to take care of the buffer size. [heiko.carstens@xxxxxxxxxx]: This is the forward port of a patch from KAMEZAWA Hiroyuki (https://lkml.org/lkml/2012/1/23/41). I added a couple of simple changes, e.g. the cpu iterator handles 32 cpus in a batch to avoid lots of iterations. With this patch it should not happen anymore that reading /proc/stat fails because of a failing high-order memory allocation. Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@xxxxxxxxxxxxxx> Signed-off-by: Heiko Carstens <heiko.carstens@xxxxxxxxxx> --- fs/proc/stat.c | 278 +++++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 203 insertions(+), 75 deletions(-) diff --git a/fs/proc/stat.c b/fs/proc/stat.c index 3898ca5f1e92..652e255fee90 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -77,22 +77,109 @@ static u64 get_iowait_time(int cpu) #endif -static int show_stat(struct seq_file *p, void *v) +enum proc_stat_stage /* The numbers are used as *pos and iter->stage */ +{ + SHOW_TOTAL_CPU_STAT, + SHOW_PERCPU_STAT, + SHOW_TOTAL_IRQS, + SHOW_IRQ_DETAILS, + SHOW_TIMES, + SHOW_TOTAL_SOFTIRQ, + SHOW_SOFTIRQ_DETAILS, + SHOW_EOL, + END_STATS, +}; + +/* + * To reduce the number of ->next(), ->show() calls IRQ numbers are + * handled in batch. 
+ */ +struct seq_stat_iter { + int stage; + unsigned long jiffies; + int cpu_iter; + int irq_iter; + int softirq_iter; + /* cached information */ + u64 irq_sum; + u64 softirq_sum; + u32 per_softirq_sums[NR_SOFTIRQS]; +}; + +static void *proc_stat_start(struct seq_file *p, loff_t *pos) +{ + struct seq_stat_iter *iter = p->private; + + /* At lseek(), *pos==0 is passed.(see travers() in seq_file.c */ + if (!*pos) { + struct timespec boottime; + + memset(iter, 0, sizeof(*iter)); + iter->stage = SHOW_TOTAL_CPU_STAT; + getboottime(&boottime); + iter->jiffies = boottime.tv_sec; + } + if (iter->stage == END_STATS) + return NULL; + return iter; +} + +static void proc_stat_stop(struct seq_file *p, void *v) +{ +} + +static void *proc_stat_next(struct seq_file *p, void *v, loff_t *pos) +{ + struct seq_stat_iter *iter = p->private; + int index; + + switch (iter->stage) { + case SHOW_TOTAL_CPU_STAT: + iter->stage = SHOW_PERCPU_STAT; + iter->cpu_iter = cpumask_first(cpu_online_mask); + break; + case SHOW_PERCPU_STAT: + index = cpumask_next(iter->cpu_iter, cpu_online_mask); + if (index >= nr_cpu_ids) + iter->stage = SHOW_TOTAL_IRQS; + else + iter->cpu_iter = index; + break; + case SHOW_TOTAL_IRQS: + iter->stage = SHOW_IRQ_DETAILS; + iter->irq_iter = 0; + break; + case SHOW_IRQ_DETAILS: + if (iter->irq_iter >= nr_irqs) + iter->stage = SHOW_TIMES; + break; + case SHOW_TIMES: + iter->stage = SHOW_TOTAL_SOFTIRQ; + break; + case SHOW_TOTAL_SOFTIRQ: + iter->stage = SHOW_SOFTIRQ_DETAILS; + break; + case SHOW_SOFTIRQ_DETAILS: + iter->stage = SHOW_EOL; + break; + case SHOW_EOL: + iter->stage = END_STATS; + return NULL; + default: + break; + } + return iter; +} + +static int show_total_cpu_stat(struct seq_file *p, struct seq_stat_iter *iter) { - int i, j; - unsigned long jif; u64 user, nice, system, idle, iowait, irq, softirq, steal; u64 guest, guest_nice; - u64 sum = 0; - u64 sum_softirq = 0; - unsigned int per_softirq_sums[NR_SOFTIRQS] = {0}; - struct timespec boottime; + int i, j; - user 
= nice = system = idle = iowait = - irq = softirq = steal = 0; + user = nice = system = idle = iowait = 0; + irq = softirq = steal = 0; guest = guest_nice = 0; - getboottime(&boottime); - jif = boottime.tv_sec; for_each_possible_cpu(i) { user += kcpustat_cpu(i).cpustat[CPUTIME_USER]; @@ -105,17 +192,17 @@ static int show_stat(struct seq_file *p, void *v) steal += kcpustat_cpu(i).cpustat[CPUTIME_STEAL]; guest += kcpustat_cpu(i).cpustat[CPUTIME_GUEST]; guest_nice += kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE]; - sum += kstat_cpu_irqs_sum(i); - sum += arch_irq_stat_cpu(i); + iter->irq_sum += kstat_cpu_irqs_sum(i); + iter->irq_sum += arch_irq_stat_cpu(i); for (j = 0; j < NR_SOFTIRQS; j++) { unsigned int softirq_stat = kstat_softirqs_cpu(j, i); - per_softirq_sums[j] += softirq_stat; - sum_softirq += softirq_stat; + iter->per_softirq_sums[j] += softirq_stat; + iter->softirq_sum += softirq_stat; } } - sum += arch_irq_stat(); + iter->irq_sum += arch_irq_stat(); seq_puts(p, "cpu "); seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(user)); @@ -129,20 +216,31 @@ static int show_stat(struct seq_file *p, void *v) seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(guest)); seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(guest_nice)); seq_putc(p, '\n'); + return 0; +} + +static int show_online_cpu_stat(struct seq_file *p, struct seq_stat_iter *iter) +{ + u64 user, nice, system, idle, iowait, irq, softirq, steal; + u64 guest, guest_nice; + int i, cpu, index; - for_each_online_cpu(i) { + /* Handle 32 cpus at a time, to avoid lots of seqfile iterations. 
*/ + cpu = index = iter->cpu_iter; + for (i = 0; i < 32 && index < nr_cpu_ids; i++) { + cpu = index; /* Copy values here to work around gcc-2.95.3, gcc-2.96 */ - user = kcpustat_cpu(i).cpustat[CPUTIME_USER]; - nice = kcpustat_cpu(i).cpustat[CPUTIME_NICE]; - system = kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM]; - idle = get_idle_time(i); - iowait = get_iowait_time(i); - irq = kcpustat_cpu(i).cpustat[CPUTIME_IRQ]; - softirq = kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ]; - steal = kcpustat_cpu(i).cpustat[CPUTIME_STEAL]; - guest = kcpustat_cpu(i).cpustat[CPUTIME_GUEST]; - guest_nice = kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE]; - seq_printf(p, "cpu%d", i); + user = kcpustat_cpu(cpu).cpustat[CPUTIME_USER]; + nice = kcpustat_cpu(cpu).cpustat[CPUTIME_NICE]; + system = kcpustat_cpu(cpu).cpustat[CPUTIME_SYSTEM]; + idle = get_idle_time(cpu); + iowait = get_iowait_time(cpu); + irq = kcpustat_cpu(cpu).cpustat[CPUTIME_IRQ]; + softirq = kcpustat_cpu(cpu).cpustat[CPUTIME_SOFTIRQ]; + steal = kcpustat_cpu(cpu).cpustat[CPUTIME_STEAL]; + guest = kcpustat_cpu(cpu).cpustat[CPUTIME_GUEST]; + guest_nice = kcpustat_cpu(cpu).cpustat[CPUTIME_GUEST_NICE]; + seq_printf(p, "cpu%d", cpu); seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(user)); seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(nice)); seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(system)); @@ -154,66 +252,96 @@ static int show_stat(struct seq_file *p, void *v) seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(guest)); seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(guest_nice)); seq_putc(p, '\n'); + index = cpumask_next(cpu, cpu_online_mask); } - seq_printf(p, "intr %llu", (unsigned long long)sum); - - /* sum again ? it could be updated? 
*/ - for_each_irq_nr(j) - seq_put_decimal_ull(p, ' ', kstat_irqs(j)); - - seq_printf(p, - "\nctxt %llu\n" - "btime %lu\n" - "processes %lu\n" - "procs_running %lu\n" - "procs_blocked %lu\n", - nr_context_switches(), - (unsigned long)jif, - total_forks, - nr_running(), - nr_iowait()); - - seq_printf(p, "softirq %llu", (unsigned long long)sum_softirq); - - for (i = 0; i < NR_SOFTIRQS; i++) - seq_put_decimal_ull(p, ' ', per_softirq_sums[i]); - seq_putc(p, '\n'); + iter->cpu_iter = cpu; + return 0; +} + +static int show_irq_details(struct seq_file *p, struct seq_stat_iter *iter) +{ + int ret; + + /* + * we update iterater in ->show()...seems ugly but for avoiding + * tons of function calls, print out here as much as possible + */ + do { + ret = seq_put_decimal_ull(p, ' ', kstat_irqs(iter->irq_iter)); + if (!ret) + iter->irq_iter += 1; + } while (!ret && iter->irq_iter < nr_irqs); return 0; } +static int show_softirq_details(struct seq_file *p, struct seq_stat_iter *iter) +{ + int ret; + + do { + ret = seq_put_decimal_ull(p, ' ', + iter->per_softirq_sums[iter->softirq_iter]); + if (!ret) + iter->softirq_iter += 1; + } while (!ret && iter->softirq_iter < NR_SOFTIRQS); + return 0; +} + +static int proc_stat_show(struct seq_file *p, void *v) +{ + struct seq_stat_iter *iter = v; + + switch (iter->stage) { + case SHOW_TOTAL_CPU_STAT: + return show_total_cpu_stat(p, iter); + case SHOW_PERCPU_STAT: + return show_online_cpu_stat(p, iter); + case SHOW_TOTAL_IRQS: + return seq_printf(p, "intr %llu", + (unsigned long long)iter->irq_sum); + case SHOW_IRQ_DETAILS: + return show_irq_details(p, iter); + case SHOW_TIMES: + return seq_printf(p, + "\nctxt %llu\n" + "btime %lu\n" + "processes %lu\n" + "procs_running %lu\n" + "procs_blocked %lu\n", + nr_context_switches(), + (unsigned long)iter->jiffies, + total_forks, + nr_running(), + nr_iowait()); + case SHOW_TOTAL_SOFTIRQ: + return seq_printf(p, "softirq %llu", + (unsigned long long)iter->softirq_sum); + case SHOW_SOFTIRQ_DETAILS: + 
return show_softirq_details(p, iter); + case SHOW_EOL: + return seq_putc(p, '\n'); + } + return 0; +} + +static const struct seq_operations show_stat_op = { + .start = proc_stat_start, + .next = proc_stat_next, + .stop = proc_stat_stop, + .show = proc_stat_show, +}; + static int stat_open(struct inode *inode, struct file *file) { - size_t size = 1024 + 128 * num_online_cpus(); - char *buf; - struct seq_file *m; - int res; - - /* minimum size to display an interrupt count : 2 bytes */ - size += 2 * nr_irqs; - - /* don't ask for more than the kmalloc() max size */ - if (size > KMALLOC_MAX_SIZE) - size = KMALLOC_MAX_SIZE; - buf = kmalloc(size, GFP_KERNEL); - if (!buf) - return -ENOMEM; - - res = single_open(file, show_stat, NULL); - if (!res) { - m = file->private_data; - m->buf = buf; - m->size = ksize(buf); - } else - kfree(buf); - return res; + return seq_open_private(file, &show_stat_op, sizeof(struct seq_stat_iter)); } static const struct file_operations proc_stat_operations = { .open = stat_open, .read = seq_read, .llseek = seq_lseek, - .release = single_release, + .release = seq_release_private, }; static int __init proc_stat_init(void) -- 1.8.5.5 -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html