On Tue, May 24, 2016 at 10:09 AM, Peter Zijlstra <peterz@xxxxxxxxxxxxx> wrote: > On Tue, May 24, 2016 at 08:43:57AM -0700, Andy Lutomirski wrote: >> > So this has implications for code like >> > kernel/events/internal.h:get_recursion_context() and >> > kernel/trace/trace.c:get_trace_buf(). >> > >> > Which use a sequence of: in_nmi(), in_irq(), in_softirq() to pick 1 out >> > of 4 possible contexts. >> > >> > I would really like the Changelog to reflect on this. The current code >> > will have ISTs land in in_irq(), with this change, not so much. >> >> I can change the changelog. > > This is basically all I'm asking. > >> > Now ISTs as a whole invalidate the whole 'we have only 4 contexts' and >> > the mapping back to those 4 is going to be somewhat arbitrary I suspect, >> > but changes like this should be very much aware of these things. And >> > make an explicit choice. >> >> I'm not so comfortable with trying to make any particular guarantees >> about what all the in_xyz() things will return for different entry >> types and how they nest. For example, debug can nest inside itself >> quite easily (at one point I even had a user program to force it to >> happen) -- this can trigger when a watchpoint nests inside a >> single-step trap, and it can also happen when a watchpoint handler is >> interrupted by an NMI that then recursively triggers the watchpoint. >> The latter could easily result in nested NMIs that are directly >> visible to the trace or event code. >> >> On x86, there's also MCE, which is NMI-ish, and NMI and MCE can freely >> nest inside each other. (Blech.) > > Right; all the nesting possibilities are endless and insane. And I don't > think it makes sense to try and enumerate them all and worse; expose > this to generic code. > > I think having the 4: task, softirq, hardirq, nmi is fine. We just need > to be a little careful with how we map to them, that we don't wreck some > common situation and suddenly start losing trace data. 
> >> Would it make more sense to adjust the trace code to have a percpu >> nesting count and to match up get_trace_buf with put_trace_buf to >> decrement the count? The event code looks like the same thing could >> happen. > > We have, but it's coupled with static storage, such as to avoid having to > do it on stack or *horror* dynamically allocate. > > So in order to protect these resources we have the per context nesting > count. I gave it a shot and it looks straightforward and is a net deletion of code. It's attached and compile-tested only. Is there a reason you don't like this approach? --Andy
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8a4b49c99f22..acd366e4ab1a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1986,83 +1986,41 @@ static void __trace_userstack(struct trace_array *tr, unsigned long flags) /* created for use with alloc_percpu */ struct trace_buffer_struct { - char buffer[TRACE_BUF_SIZE]; + int nesting; + char buffer[4][TRACE_BUF_SIZE]; }; static struct trace_buffer_struct *trace_percpu_buffer; -static struct trace_buffer_struct *trace_percpu_sirq_buffer; -static struct trace_buffer_struct *trace_percpu_irq_buffer; -static struct trace_buffer_struct *trace_percpu_nmi_buffer; /* - * The buffer used is dependent on the context. There is a per cpu - * buffer for normal context, softirq contex, hard irq context and - * for NMI context. Thise allows for lockless recording. - * - * Note, if the buffers failed to be allocated, then this returns NULL + * Thise allows for lockless recording. If we're nested too deeply, then + * this returns NULL. */ static char *get_trace_buf(void) { - struct trace_buffer_struct *percpu_buffer; - - /* - * If we have allocated per cpu buffers, then we do not - * need to do any locking. 
- */ - if (in_nmi()) - percpu_buffer = trace_percpu_nmi_buffer; - else if (in_irq()) - percpu_buffer = trace_percpu_irq_buffer; - else if (in_softirq()) - percpu_buffer = trace_percpu_sirq_buffer; - else - percpu_buffer = trace_percpu_buffer; + struct trace_buffer_struct *buffer = this_cpu_ptr(trace_percpu_buffer); - if (!percpu_buffer) + if (!buffer || buffer->nesting >= 4) return NULL; - return this_cpu_ptr(&percpu_buffer->buffer[0]); + return &buffer->buffer[buffer->nesting++][0]; +} + +static void put_trace_buf(void) +{ + this_cpu_dec(trace_percpu_buffer->nesting); } static int alloc_percpu_trace_buffer(void) { struct trace_buffer_struct *buffers; - struct trace_buffer_struct *sirq_buffers; - struct trace_buffer_struct *irq_buffers; - struct trace_buffer_struct *nmi_buffers; buffers = alloc_percpu(struct trace_buffer_struct); - if (!buffers) - goto err_warn; - - sirq_buffers = alloc_percpu(struct trace_buffer_struct); - if (!sirq_buffers) - goto err_sirq; - - irq_buffers = alloc_percpu(struct trace_buffer_struct); - if (!irq_buffers) - goto err_irq; - - nmi_buffers = alloc_percpu(struct trace_buffer_struct); - if (!nmi_buffers) - goto err_nmi; + if (WARN(!buffers, "Could not allocate percpu trace_printk buffer")) + return -ENOMEM; trace_percpu_buffer = buffers; - trace_percpu_sirq_buffer = sirq_buffers; - trace_percpu_irq_buffer = irq_buffers; - trace_percpu_nmi_buffer = nmi_buffers; - return 0; - - err_nmi: - free_percpu(irq_buffers); - err_irq: - free_percpu(sirq_buffers); - err_sirq: - free_percpu(buffers); - err_warn: - WARN(1, "Could not allocate percpu trace_printk buffer"); - return -ENOMEM; } static int buffers_allocated; @@ -2153,7 +2111,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) tbuffer = get_trace_buf(); if (!tbuffer) { len = 0; - goto out; + goto out_nobuffer; } len = vbin_printf((u32 *)tbuffer, TRACE_BUF_SIZE/sizeof(int), fmt, args); @@ -2179,6 +2137,9 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list 
args) } out: + put_trace_buf(); + +out_nobuffer: preempt_enable_notrace(); unpause_graph_tracing(); @@ -2210,7 +2171,7 @@ __trace_array_vprintk(struct ring_buffer *buffer, tbuffer = get_trace_buf(); if (!tbuffer) { len = 0; - goto out; + goto out_nobuffer; } len = vscnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args); @@ -2229,7 +2190,11 @@ __trace_array_vprintk(struct ring_buffer *buffer, __buffer_unlock_commit(buffer, event); ftrace_trace_stack(&global_trace, buffer, flags, 6, pc, NULL); } - out: + +out_nobuffer: + put_trace_buf(); + +out: preempt_enable_notrace(); unpause_graph_tracing();